Example 1
def raph_alignment_report(ja_smk, letter_ja):
    csv_lst = []
    lst_raph = []
    smk_siman = 0
    smk_pages = map_semak_page_siman(ja_smk, to_print=False)
    for seg in traverse_ja(ja_smk):
        for raph_l_in_smk in re.finditer(u'@55([\u05d0-\u05ea]{1,3})', seg['data']):
            lst_raph.append((raph_l_in_smk.group(1),
                             seg['data'][raph_l_in_smk.span()[0] - 20: raph_l_in_smk.span()[1] + 20],
                             (seg['indices'][0] + 1)))
    raph_11 = []
    for raph in traverse_ja(letter_ja):
        raph_11.append(raph)  # re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1))
    page = 21
    prob = 0
    for raph, smk_l in zip(raph_11, lst_raph):

        print re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1), smk_l[0], numToHeb(smk_l[2])
        csv_dict = {u'smk letter': smk_l[0], u'raph letter': re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1),
                    u'smk words': smk_l[1], u'raph line': raph['data'], u'siman': numToHeb(smk_l[2]), u'aprx page in scan': smk_pages[numToHeb(smk_l[2])]}
        if re.search(u'@77', smk_l[1]):
            page += 1
        if re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1) != smk_l[0]:
            prob += 1
            print "*"
            csv_dict['problem'] = True
            # break
        csv_lst.append(csv_dict)
    print 'prob', prob
    print 'done'
    toCSV(u'testcsvreport', csv_lst, [u'smk letter', u'raph letter', u'smk words',
                                      u'raph line', u'siman', u'aprx page in scan', u'problem'])
    return csv_lst
Example 2
def link_semak_raph(smk_ja, raph_ja):
    # if a segment in smk_ja has a @55[\u05d0-\u05ea]{0,3} marker, extract the letter
    # and match it to the corresponding segment by iterating over the raph_ja segments
    smk_raph = []
    raph_letter = []
    for seg in traverse_ja(smk_ja):
        if re.search(u'@55[\u05d0-\u05ea]{0,3}', seg['data']):
            for letter in re.findall(u'@55([\u05d0-\u05ea]{0,3})', seg['data']):
                # smk_raph.append([seg['indices'][:], letter])
                smk_raph.append([letter, seg['indices']])
    last = [-1, -1]
    for seg in traverse_ja(raph_ja):
        if seg['indices'][0:2] == last[0:2]:
            continue
        else:
            raph_letter.append(seg)
        last = seg['indices']

    problem_count = 0
    for smk, raph in zip(smk_raph, raph_letter):
        if getGematria(smk[0]) == (raph['indices'][1] + 1):
            print getGematria(smk[0]), raph['indices'][1] + 1, \
                [item + 1 for item in smk[1]], [item + 1 for item in raph['indices']]
        else:
            problem_count += 1
            print 'problem:', getGematria(smk[0]), raph['indices'][1] + 1, \
                [item + 1 for item in smk[1]], [item + 1 for item in raph['indices']]
    print problem_count
Example 3
def link_semak_raph(smk_ja, raph_ja):
    # if a segment in smk_ja has a @55[\u05d0-\u05ea]{0,3} marker, extract the letter
    # and match it to the corresponding segment by iterating over the raph_ja segments
    smk_raph = []
    raph_letter = []
    for seg in traverse_ja(smk_ja):
        if re.search(u'@55[\u05d0-\u05ea]{0,3}', seg['data']):
            for letter in re.findall(u'@55([\u05d0-\u05ea]{0,3})',
                                     seg['data']):
                # smk_raph.append([seg['indices'][:], letter])
                smk_raph.append([letter, seg['indices']])
    last = [-1, -1]
    for seg in traverse_ja(raph_ja):
        if seg['indices'][0:2] == last[0:2]:
            continue
        else:
            raph_letter.append(seg)
        last = seg['indices']

    problem_count = 0
    for smk, raph in zip(smk_raph, raph_letter):
        if getGematria(smk[0]) == (raph['indices'][1] + 1):
            print getGematria(smk[0]), raph['indices'][1] + 1, \
                [item + 1 for item in smk[1]], [item + 1 for item in raph['indices']]
        else:
            problem_count += 1
            print 'problem:', getGematria(smk[0]), raph['indices'][1] + 1, \
                [item + 1 for item in smk[1]], [item + 1 for item in raph['indices']]
    print problem_count
Example 4
def link_raph(ja_smk, ja_raph_simanim):  # TODO: check where this information comes from.
    # ja_raph_simanim = siman, letter
    links = []
    i = 0
    prev_siman = 1
    for seg in traverse_ja(ja_smk):
        for x in re.findall(u'@55', seg['data']): # if re.search(u'@55', seg['data']):
            siman = seg['indices'][0] + 1

            if siman != prev_siman:
                i = 0
            prev_siman = siman

            segment = seg['indices'][1] + 1
            i += 1
            link = (
                {
                    "refs": [
                        "Sefer Mitzvot Katan {}:{}".format(siman, segment),
                        "Haggahot Rabbeinu Peretz on Sefer Mitzvot Katan {}:{}".format(siman, i),  # really should be a ref link to the whole raph
                    ],
                    "type": "commentary",
                    'inline_reference': {
                        'data-commentator': 'Haggahot Rabbeinu Peretz on Sefer Mitzvot Katan',
                        'data-order': i
                    },
                    "auto": True,
                    "generated_by": "semak_parser"

                })
            # dh_text = dh['data']
            # append to links list
            links.append(link)
    return links
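For a concrete sense of the output: a @55 marker found in, say, the third segment of siman 5 (hypothetical coordinates), when it is the first such marker in that siman, produces a link of this shape:

{
    "refs": [
        "Sefer Mitzvot Katan 5:3",
        "Haggahot Rabbeinu Peretz on Sefer Mitzvot Katan 5:1",
    ],
    "type": "commentary",
    'inline_reference': {
        'data-commentator': 'Haggahot Rabbeinu Peretz on Sefer Mitzvot Katan',
        'data-order': 1
    },
    "auto": True,
    "generated_by": "semak_parser"
}

Note that data-order counts @55 markers within the current siman and resets when the siman changes.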
Example 5
def link_raavad(text_ja):
    # create the link objects between the dibur HaMatchil and the main text
    links = []
    # use a generator to go over the text and find the 3 level indices
    for dh in traverse_ja(text_ja):
        link = ({
            "refs": [
                "Raavad on Sefer Yetzirah " +
                '%d:%d:%d' % tuple(x + 1 for x in dh['indices']),
                "Sefer Yetzirah " +
                '%d:%d' % tuple(x + 1 for x in dh['indices'][:2]),
            ],
            "type":
            "commentary",
            "auto":
            True,
            "generated_by":
            "raavad_parse"
        })
        dh_text = dh['data']
        # append to links list
        links.append(link)
    # drop the last link: the closing "slik" shouldn't be linked
    links.pop()
    return links
Example 6
def linker(parsed_commentary, commentator_name):
    """
    Build up a list of links for a text where the commentator follows the base text exactly
    :param parsed_commentary: parsed text to link
    :param commentator_name: name of the commentator as it appears on the JaggedArrayNode to be linked
    :return: list of links
    """

    links = []
    for comment in traverse_ja(parsed_commentary):
        indices = [i + 1 for i in comment['indices']]
        links.append({
            'refs': [
                'Sefer Yetzirah {}:{}'.format(*indices[:-1]),
                '{} {}:{}:{}'.format(commentator_name, *indices)
            ],
            'type': 'commentary',
            'auto': True,
            'generated_by': 'Sefer Yetzirah Parse Script'
        })

    return links
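As a usage sketch (the three-level input and the commentator name below are made up), the generated refs line up one comment per base segment:

parsed = [[['comment a', 'comment b'], ['comment c']]]
for l in linker(parsed, 'Some Commentator'):
    print l['refs']
# ['Sefer Yetzirah 1:1', 'Some Commentator 1:1:1']
# ['Sefer Yetzirah 1:1', 'Some Commentator 1:1:2']
# ['Sefer Yetzirah 1:2', 'Some Commentator 1:2:1']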
Example 7
def test_traverse_ja():
    test_ja = [['foo', 'bar'], ['hello', 'world']]
    explicit_data = [
        {'data': 'foo', 'indices': [0, 0]},
        {'data': 'bar', 'indices': [0, 1]},
        {'data': 'hello', 'indices': [1, 0]},
        {'data': 'world', 'indices': [1, 1]}
    ]
    for test_item, explicit_item in zip(util.traverse_ja(test_ja), explicit_data):
        assert test_item == explicit_item
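The test above pins down the contract of traverse_ja: walk a jagged array depth-first and yield one dict per leaf, with the leaf string under 'data' and its coordinates under 'indices'. A minimal sketch of a generator consistent with that contract (the real util.traverse_ja may differ; Example 24 shows it also accepting a bottom keyword):

def traverse_ja_sketch(ja, indices=None, bottom=basestring):
    # walk the nested lists depth-first, tracking the path taken so far
    if indices is None:
        indices = []
    if isinstance(ja, bottom):
        yield {'data': ja, 'indices': indices}
    else:
        for i, sub in enumerate(ja):
            for leaf in traverse_ja_sketch(sub, indices + [i], bottom):
                yield leaf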
Example 8
def build_table(old_ja):
    # one row per segment: a running counter followed by the three index levels
    ind_list = []
    for j, x in enumerate(traverse_ja(old_ja)):
        k = x['indices']
        ind_list.append([j, k[0], k[1], k[2]])
    return ind_list
Example 9
def build_table(old_ja):
    # one row per segment: a running counter followed by the three index levels
    ind_list = []
    for j, x in enumerate(traverse_ja(old_ja)):
        k = x['indices']
        ind_list.append([j, k[0], k[1], k[2]])
    return ind_list
Example 10
def build_links(parsed_data):

    link_bases = []

    for book in library.get_indexes_in_category('Torah'):
        for segment in traverse_ja(parsed_data[book]):
            link_bases.append('{} {}:{}'.format(book, *[i+1 for i in segment['indices']]))

    return [{
        'refs': [base, 'Tafsir Rasag, {}'.format(base)],
        'type': 'targum',
        'auto': False,
        'generated_by': 'Tafsir Rasag Parse script'
    } for base in link_bases]
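Each returned object pairs a base segment with its Tafsir ref; assuming Genesis is the first index in the Torah category and the parse is verse-level, the first entry would look like:

{
    'refs': ['Genesis 1:1', 'Tafsir Rasag, Genesis 1:1'],
    'type': 'targum',
    'auto': False,
    'generated_by': 'Tafsir Rasag Parse script'
}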
Example 11
def build_links(parsed):

    bases = []
    for book_num, book in enumerate(parsed):
        for line in traverse_ja(book):
            bases.append('{} Chronicles {}:{}'.format('I'*(book_num+1), *[i+1 for i in line['indices']]))

    links = [{
        'refs': [base, 'Aramaic Targum to {}'.format(base)],
        'type': 'targum',
        'auto': False,
        'generated_by': 'Chronicles parse script'
    } for base in bases]

    return links
Example 12
def raph_alignment_report(ja_smk, letter_ja):
    csv_lst = []
    lst_raph = []
    smk_siman = 0
    smk_pages = map_semak_page_siman(ja_smk, to_print=False)
    for seg in traverse_ja(ja_smk):
        for raph_l_in_smk in re.finditer(u'@55([\u05d0-\u05ea]{1,3})', seg['data']):
            lst_raph.append((raph_l_in_smk.group(1),
                             seg['data'][raph_l_in_smk.span()[0] - 20: raph_l_in_smk.span()[1] + 20],
                             (seg['indices'][0] + 1)))
    raph_11 = []
    for raph in traverse_ja(letter_ja):
        raph_11.append(raph)  # re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1))
    page = 21
    prob = 0
    i = 0
    for raph, smk_l in zip(letter_ja, lst_raph):  # zip(raph_11, lst_raph):

        # print re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1), smk_l[0], numToHeb(smk_l[2])
        csv_dict = {u'smk letter': smk_l[0],  u'raph': raph[i], u'siman': numToHeb(smk_l[2]), u'aprx page in scan': smk_pages[numToHeb(smk_l[2])]}
        # u'raph letter': re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1), u'raph line': raph['data']
        # u'smk words': smk_l[1],
        i += 1
        if re.search(u'@77', smk_l[1]):
            page += 1
        # if re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1) != smk_l[0]:
        #     prob += 1
        #     print "*"
        #     csv_dict['problem'] = True
        #     # break
        csv_lst.append(csv_dict)
    print 'prob', prob
    print 'done'
    toCSV(u'testcsvreport', csv_lst, [u'smk letter', u'raph',
                                      u'siman', u'aprx page in scan'])  # , u'problem', u'smk words', u'raph line'
    return csv_lst
Example 13
def build_links(parsed_data):

    link_bases = []

    for book in library.get_indexes_in_category('Torah'):
        for segment in traverse_ja(parsed_data[book]):
            link_bases.append('{} {}:{}'.format(
                book, *[i + 1 for i in segment['indices']]))

    return [{
        'refs': [base, 'Tafsir Rasag, {}'.format(base)],
        'type': 'targum',
        'auto': False,
        'generated_by': 'Tafsir Rasag Parse script'
    } for base in link_bases]
Example 14
def linker(dict_of_ja):
    links = []

    for book in library.get_indexes_in_category('Torah'):
        for segment in util.traverse_ja(dict_of_ja[book]):

            refs = [u'{}.{}.{}'.format(book, *[x+1 for x in segment['indices'][:-1]]),
                    u'Baal HaTurim, {}.{}.{}.{}'.format(book, *[x+1 for x in segment['indices']])]

            links.append(
                {
                    'refs': refs,
                    'type': 'commentary',
                    'auto': False,
                    'generated_by': 'Baal HaTurim parse script'
                }
            )
    return links
Example 15
def map_semak_page_siman(smk_ja, to_print=True):
    '''
    create a dictionary mapping each siman to the page(s) it appears on
    :param smk_ja: smk ja parsed into simanim according to the @22 marker
    :return: dictionary. keys: siman (Hebrew letter); values: list of pages the siman spans
    (page numbers follow the scan, which starts on p. 21)
    '''
    siman_page = OrderedDict()
    page_count = 21
    start_page = False
    lst_seg = {'data': '', 'indices': []}  # the previously seen segment
    for seg in traverse_ja(smk_ja):
        for i, page in enumerate(re.finditer(u'@77', seg['data'])):
            page_count += 1
            try:
                siman_page[numToHeb(seg['indices'][0] + 1)].append(page_count)
            except KeyError:
                if not start_page:
                    siman_page[numToHeb(seg['indices'][0] +
                                        1)] = [page_count - 1, page_count]
                    start_page = False
                else:
                    siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count]
            if re.search(u'@77 ?$', lst_seg['data']):
                start_page = True
                siman_page[numToHeb(lst_seg['indices'][0] +
                                    1)].remove(page_count)
        if not list(re.finditer(u'@77', seg['data'])):
            try:
                siman_page[numToHeb(seg['indices'][0] + 1)]
            except KeyError:
                siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count]
            if re.search(u'@77 ?$', lst_seg['data']):
                start_page = True
                try:
                    siman_page[numToHeb(lst_seg['indices'][0] +
                                        1)].remove(page_count)
                except ValueError:
                    pass
        lst_seg = seg
    if to_print:
        for k in siman_page.keys():
            print k, siman_page[k]
    return siman_page
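Roughly, for a scan where the first siman sits entirely on page 21, the second crosses onto page 22, and the third stays on page 22, the returned mapping would look like (a sketch; keys are the Hebrew letters produced by numToHeb):

OrderedDict([(u'\u05d0', [21]), (u'\u05d1', [21, 22]), (u'\u05d2', [22])])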
Example 16
def linker(parsed_commentary, commentator_name):
    """
    Build up a list of links for a text where the commentator follows the base text exactly
    :param parsed_commentary: parsed text to link
    :param commentator_name: name of the commentator as it appears on the JaggedArrayNode to be linked
    :return: list of links
    """

    links = []
    for comment in traverse_ja(parsed_commentary):
        indices = [i + 1 for i in comment['indices']]
        links.append({
            'refs': ['Sefer Yetzirah {}:{}'.format(*indices[:-1]),
                     '{} {}:{}:{}'.format(commentator_name, *indices)],
            'type': 'commentary',
            'auto': True,
            'generated_by': 'Sefer Yetzirah Parse Script'
        })

    return links
Example 17
def map_semak_page_siman(smk_ja, to_print=True):
    '''
    create a dictionary mapping each siman to the page(s) it appears on
    :param smk_ja: smk ja parsed into simanim according to the @22 marker
    :return: dictionary. keys: siman (Hebrew letter); values: list of pages the siman spans
    (page numbers follow the scan, which starts on p. 21)
    '''
    siman_page = OrderedDict()
    page_count = 21
    start_page = False
    lst_seg = {'data': '', 'indices': []}  # the previously seen segment
    for seg in traverse_ja(smk_ja):
        for i, page in enumerate(re.finditer(u'@77', seg['data'])):
            page_count += 1
            try:
                siman_page[numToHeb(seg['indices'][0]+1)].append(page_count)
            except KeyError:
                if not start_page:
                    siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count - 1, page_count]
                    start_page = False
                else:
                    siman_page[numToHeb(seg['indices'][0]+1)] = [page_count]
            if re.search(u'@77 ?$', lst_seg['data']):
                start_page = True
                siman_page[numToHeb(lst_seg['indices'][0] + 1)].remove(page_count)
        if not list(re.finditer(u'@77', seg['data'])):
            try:
                siman_page[numToHeb(seg['indices'][0]+1)]
            except KeyError:
                siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count]
            if re.search(u'@77 ?$', lst_seg['data']):
                start_page = True
                try:
                    siman_page[numToHeb(lst_seg['indices'][0] + 1)].remove(page_count)
                except ValueError:
                    pass
        lst_seg = seg
    if to_print:
        for k in siman_page.keys():
            print k, siman_page[k]
    return siman_page
Example 18
def link_bs(text_dict):
    links = []
    for text in text_dict.keys():
        book = re.match(u'(.*?)\s', text).group().strip()
        for dh in traverse_ja(text_dict[text]):
            perek = (dh['indices'][0] + 1)
            pasuk = (dh['indices'][1] + 1)
            comment = (dh['indices'][2]+1)
            link = (
                {
                    "refs": [
                        'Bekhor Shor, {} {}:{}:{}'.format(book, perek, pasuk, comment),
                        '{} {}:{}'.format(book, perek, pasuk),
                    ],
                    "type": "commentary",
                    "auto": True,
                    "generated_by": "bekhor_shor_parser"
                })
            # append to links list
            links.append(link)
    return links
Example 20
def link_raavad(text_ja):
    # create the link objects between the dibur HaMatchil and the main text
    links = []
    # use a generator to go over the text and find the 3 level indices
    for dh in traverse_ja(text_ja):
        link = (
            {
                "refs": [
                    "Raavad on Sefer Yetzirah " + '%d:%d:%d' % tuple(x + 1 for x in dh['indices']),
                    "Sefer Yetzirah " + '%d:%d' % tuple(x + 1 for x in dh['indices'][:2]),
                ],
                "type": "commentary",
                "auto": True,
                "generated_by": "raavad_parse"
            })
        dh_text = dh['data']
        # append to links list
        links.append(link)
    # drop the last link: the closing "slik" shouldn't be linked
    links.pop()
    return links
Example 21
def link_raph(ja_smk, ja_raph_simanim):  # TODO: check where this information comes from.
    # ja_raph_simanim = siman, letter
    links = []
    i = 0
    prev_siman = 1
    for seg in traverse_ja(ja_smk):
        for x in re.findall(u'@55', seg['data']):  # if re.search(u'@55', seg['data']):
            siman = seg['indices'][0] + 1

            if siman != prev_siman:
                i = 0
            prev_siman = siman

            segment = seg['indices'][1] + 1
            i += 1
            link = ({
                "refs": [
                    "Sefer Mitzvot Katan {}:{}".format(siman, segment),
                    "Haggahot Rabbeinu Peretz on Sefer Mitzvot Katan {}:{}".format(siman, i),  # really should be a ref link to the whole raph
                ],
                "type": "commentary",
                'inline_reference': {
                    'data-commentator': 'Haggahot Rabbeinu Peretz on Sefer Mitzvot Katan',
                    'data-order': i
                },
                "auto": True,
                "generated_by": "semak_parser"
            })
            # dh_text = dh['data']
            # append to links list
            links.append(link)
    return links
Example 22
def links(text_dict):
    links = []
    for book in text_dict.keys():
        for dh in traverse_ja(text_dict[book]):
            perek = (dh['indices'][0] + 1)
            pasuk = (dh['indices'][1] + 1)
            comment = (dh['indices'][2] + 1)
            link = ({
                "refs": [
                    'Tur HaAroch, {} {}:{}:{}'.format(book, perek, pasuk, comment),
                    '{} {}:{}'.format(book, perek, pasuk),
                ],
                "type": "commentary",
                "auto": True,
                "generated_by": "tur_torah_parser"
            })
            # append to links list
            links.append(link)
    return links
Example 23
    index_dict = {
        'title': 'HaGra on Sefer Yetzirah Gra Version',
        'categories': ['Commentary2','Kabbalah','Gra'],
        'schema': schema.serialize() # This line converts the schema into json
    }
    post_index(index_dict)

    post_text('HaGra on Sefer Yetzirah Gra Version', text_version, index_count='on')

# post with the post function
post_this()

# create the link objects between the dibur HaMatchil of the GRA and the main text
gra_links = []
# use a generator to go over the text and find the 3 level indices
for dh in traverse_ja(gra):
    link = ({
        "refs": [
            "HaGra on Sefer Yetzirah Gra Version " + '%d:%d:%d' % tuple(x + 1 for x in dh['indices']),
            "Sefer Yetzirah Gra Version " + '%d:%d' % tuple(x + 1 for x in dh['indices'][:2]),
        ],
        "type": "commentary",
        "auto": True,
        "generated_by": "gra_parse"
    })
    dh_text = dh['data']
    # append to links list
    gra_links.append(link)

# drop the last link: the closing "slik" shouldn't be linked
gra_links.pop()
Example 24
def generate_links(parsed_data,
                   link_filename='fixed_links.xml',
                   error_file='errors.csv'):
    """
    Using an xml of data from daat and parsed text, generate all links
    :param parsed_data: Dictionary keys are books of Torah, values are parsed text into ja.
    :param link_filename: Filename of xml file that holds link data.
    :param error_file: Filename of csv file which contains all comments that could not be linked.
    :return: List of link objects
    """
    links, errors = [], []
    root = ET.parse(link_filename).getroot()

    for book in library.get_indexes_in_category('Torah'):
        book_element = root.find(book)
        for comment in util.traverse_ja(parsed_data[book], bottom=basestring):

            good_verse = True
            chapter, verse = comment['indices'][0], comment['indices'][1]

            # get the verse from the xml
            verse_element = book_element.find(
                "./chapter[@chap_index='{}']/verse[@verse_index='{}']".format(
                    chapter + 1, verse + 1))
            rashis = Ref('Rashi on {}.{}.{}'.format(book, chapter + 1,
                                                    verse + 1))
            total_rashis = len(rashis.all_subrefs())

            if verse_element is None:
                good_verse = False

            # compare number of Rashis on daat and sefaria. If only one Rashi link can be made
            elif total_rashis != int(verse_element.find(
                    'total_rashis').text) and total_rashis != 1:
                good_verse = False

            # compare number of siftei chakhmim on daat and Torat Emet
            elif len(parsed_data[book][chapter][verse]) != len(
                    verse_element.findall('comment')):
                good_verse = False

            if good_verse:

                # grab the exact Rashi comment number to link to
                comment_number = comment['indices'][2]
                comment_element = verse_element.findall(
                    'comment')[comment_number]
                if total_rashis == 1:
                    rashi_value = 1
                else:
                    rashi_value = int(comment_element.attrib['rashi_comment'])

                refs = [
                    u'Siftei Hakhamim, {}.{}.{}.{}'.format(
                        book, *[x + 1 for x in comment['indices']]),
                    u'Rashi on {}.{}.{}.{}'.format(book, chapter + 1,
                                                   verse + 1, rashi_value)
                ]

                # build the link object
                links.append({
                    'refs': refs,
                    'type': 'commentary',
                    'auto': False,
                    'generated_by': 'Siftei Hakhamim parse script'
                })

            else:
                bad_link = [book]
                bad_link.extend([x + 1 for x in comment['indices']])
                url = 'draft.sefaria.org/Siftei_Hakhamim,_{}.{}.{}.{}'\
                    .format(book, *[x+1 for x in comment['indices']])
                bad_link.append(url)
                errors.append(bad_link)

    # write errors to csv file
    with open(error_file, 'w') as outfile:
        writer = csv.writer(outfile, delimiter=';')
        writer.writerow(['Book', 'Chapter', 'Verse', 'Comment', 'url'])
        writer.writerows(errors)

    return links
Example 25
    soup = soupAndOpen("./pages/%s" % (filename))

    if siman_num in (3, 4, 7):  # siman numbers whose layout did not conform to the regular parse
        print "outlier", siman_num
        outlierParse(soup, siman_num)

    else:
        print "regular", siman_num
        regularParse(soup, siman_num)

ja_to_xml(simanim_ja.array(), ["siman", "seif", "comment"])

links = []

for comment in traverse_ja(simanim_ja.array()):
    links.append({
        'refs': [
            'Shulchan_Arukh, Orach_Chayim.{}.{}'.format(
                comment['indices'][0] + 1, comment['indices'][1] + 1),
            'Biur Halacha.{}.{}.{}'.format(
                *[i + 1 for i in comment['indices']])
        ],
        'type': 'commentary',
        'auto': True,
        'generated_by': 'Biur Halacha linker'
    })
def generate_links(parsed_data, link_filename='fixed_links.xml', error_file='errors.csv'):
    """
    Using an xml of data from daat and parsed text, generate all links
    :param parsed_data: Dictionary keys are books of Torah, values are parsed text into ja.
    :param link_filename: Filename of xml file that holds link data.
    :param error_file: Filename of csv file which contains all comments that could not be linked.
    :return: List of link objects
    """
    links, errors = [], []
    root = ET.parse(link_filename).getroot()

    for book in library.get_indexes_in_category('Torah'):
        book_element = root.find(book)
        for comment in util.traverse_ja(parsed_data[book], bottom=basestring):

            good_verse = True
            chapter, verse = comment['indices'][0], comment['indices'][1]

            # get the verse from the xml
            verse_element = book_element.find("./chapter[@chap_index='{}']/verse[@verse_index='{}']"
                                              .format(chapter+1, verse+1))
            rashis = Ref('Rashi on {}.{}.{}'.format(book, chapter+1, verse+1))
            total_rashis = len(rashis.all_subrefs())

            if verse_element is None:
                good_verse = False

            # compare number of Rashis on daat and sefaria. If only one Rashi link can be made
            elif total_rashis != int(verse_element.find('total_rashis').text) and total_rashis != 1:
                good_verse = False

            # compare number of siftei chakhmim on daat and Torat Emet
            elif len(parsed_data[book][chapter][verse]) != len(verse_element.findall('comment')):
                good_verse = False

            if good_verse:

                # grab the exact Rashi comment number to link to
                comment_number = comment['indices'][2]
                comment_element = verse_element.findall('comment')[comment_number]
                if total_rashis == 1:
                    rashi_value = 1
                else:
                    rashi_value = int(comment_element.attrib['rashi_comment'])

                refs = [u'Siftei Hakhamim, {}.{}.{}.{}'.format(book, *[x+1 for x in comment['indices']]),
                        u'Rashi on {}.{}.{}.{}'.format(book, chapter+1, verse+1, rashi_value)]

                # build the link object
                links.append({
                    'refs': refs,
                    'type': 'commentary',
                    'auto': False,
                    'generated_by': 'Siftei Hakhamim parse script'
                })

            else:
                bad_link = [book]
                bad_link.extend([x+1 for x in comment['indices']])
                url = 'draft.sefaria.org/Siftei_Hakhamim,_{}.{}.{}.{}'\
                    .format(book, *[x+1 for x in comment['indices']])
                bad_link.append(url)
                errors.append(bad_link)

    # write errors to csv file
    with open(error_file, 'w') as outfile:
        writer = csv.writer(outfile, delimiter=';')
        writer.writerow(['Book', 'Chapter', 'Verse', 'Comment', 'url'])
        writer.writerows(errors)

    return links
Example 27
        'schema': schema.serialize()  # This line converts the schema into json
    }
    post_index(index_dict)

    post_text('HaGra on Sefer Yetzirah Gra Version',
              text_version,
              index_count='on')


# post with the post function
post_this()

# create the link objects between the dibur HaMatchil of the GRA and the main text
gra_links = []
# use a generator to go over the text and find the 3 level indices
for dh in traverse_ja(gra):
    link = ({
        "refs": [
            "HaGra on Sefer Yetzirah Gra Version " +
            '%d:%d:%d' % tuple(x + 1 for x in dh['indices']),
            "Sefer Yetzirah Gra Version " +
            '%d:%d' % tuple(x + 1 for x in dh['indices'][:2]),
        ],
        "type":
        "commentary",
        "auto":
        True,
        "generated_by":
        "gra_parse"
    })
    dh_text = dh['data']