Beispiel #1
0
def parse_Raph_simanim(alinged_list):
    '''
    Regroup aligned raph letters into one list per siman.

    note: although there is (not often) a differentiation in the original txt file,
    raph letters can be divided into smaller segments. In this code we combined those segments.
    returning, every raph letter as a line.

    :param alinged_list: list of dicts, each with keys 'siman' (Hebrew-letter
        siman label) and 'raph' (the raph text for that letter)
    :return: list of simanim, each a list of raph letters; simanim with no
        raph content appear as empty lists
    '''
    ja = []
    siman = []
    i = 1  # gap counter: how many simanim ahead of prev_siman we have padded
    prev_siman = u'א'
    for obj in alinged_list:
        if obj['siman'] == prev_siman:
            # same siman as before: keep accumulating its letters
            siman.append(obj['raph'])
            continue
        else:
            # new siman encountered: close out the previous one
            ja.append(siman)
            # pad with empty lists for any simanim skipped in the source,
            # so list index keeps matching the gematria of the siman label
            while getGematria(obj['siman']) != (getGematria(prev_siman) + i):
                ja.append([])
                i += 1
            i = 1
            siman = []
            siman.append(obj['raph'])
        prev_siman = obj['siman']
    ja.append(siman)  # flush the last open siman
    ja_to_xml(ja, ['siman', 'letter'], 'raph_simanim.xml')
    return ja
Beispiel #2
0
def parse_hagahot_by_letter(filename):
    """Parse the hagahot source file keyed by its @11(letter) markers.

    :param filename: hagahot source txt file (utf-8)
    :return: nested list [siman, letter, segments] built by regs_devide
    """
    def cleaner(my_text):
        # strip the @11(letter) markers once they have served as split keys
        replace_dict = {u'@11\([\u05d0-\u05ea]{0,3}\)': u''}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11\((?P<gim>[\u05d0-\u05ea]{1,3})\)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean all lines of days start with @00
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            # first blank line ends the legend; content starts after it
            starting = line_num + 1
            break
    # skip $-marked lines and blanks; split the rest on @11(letter) markers,
    # keeping the markers themselves as entries (capturing group in re.split)
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'\$', line) and not line.isspace():
            line = re.split(u'(@11\([\u05d0-\u05ea]{0,3}\))', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    try:
        ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True,  grab_all=[True, True], group_name='gim').array()
    except AttributeError:
        print 'there are more regs then levels...'
    # NOTE(review): the file_to_ja_g result above is never used; the returned
    # structure comes from regs_devide (presumably splitting at the closing
    # phrase) -- confirm whether the try block is still needed
    new_ja = regs_devide(cleaned, regs, u'(נשלם מלאכת שבעת הימים)')
    ja_to_xml(new_ja, ['siman', 'letter', 'segments'], 'hagahot_letters.xml')

    return new_ja
def fifty_parse(lines, replace_dict):
    # start the parsing of part fifty
    arr = []
    perek = []
    peska = []
    new_lines = []
    for line in lines:
        line = split_lines(line)
        new_lines.extend(line)

    for line in new_lines:
        if line.find(ur'@05') is not -1:
            if perek:

                perek.append(peska)
                peska = []
                arr.append(perek)
                perek = []
        else:
            if (line.find(u'@13') is not -1) and (peska):
                perek.append(peska)
                peska = []
            line = multiple_replace(line, replace_dict, using_regex=True)
            peska.append(line)
    perek.append(peska)
    arr.append(perek)
    ja_to_xml(arr,['perek', 'piska', 'break'], 'raavad_50.xml')

    return arr
Beispiel #4
0
def parse_he(filename):
    """
    Parse the Hebrew Tur source file into one jagged array per Pentateuch book.

    :param filename: source txt file (utf-8)
    :returns a dictionary, key: name of book, value: JaggadArray obj of the ja for the book
    """
    # OCR-tag cleanup: bold open/close markers and siman markers to drop
    replace_dict = {
        u'@(11|44|99)': u'<b>',
        u'@(33|55)': u'</b>',
        ur'@22\(([\u05d0-\u05ea]{1,3})\)': u'',
        ur'@(22|77)': u''
    }

    def cleaner(my_text):
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)

        return new

    # level markers: @00 = book, @02 = perek (gematria), @22(..) = pasuk (gematria)
    regs = [
        ur'@00(?P<gim>)', ur'@02(?P<gim>[\u05d0-\u05ea]{1,3})',
        ur'@22\((?P<gim>[\u05d0-\u05ea]{1,3})\)'
    ]  # ,ur'@77'
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean all lines of Parasha start with @01
    cleaned = []
    dh_list = []  # collected dibur-hamatchil headings (for review only)
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
        if starting and not re.search(u'@01', line) and not line.isspace():
            # @11...@33 wraps a dibur hamatchil; record it, bold it, and
            # split the line at the heading (via the temporary '#' sentinel)
            dh_recognize = re.compile(ur'@11(.*?)@33')
            if dh_recognize.search(line):
                dh_list.append(dh_recognize.search(line).group(1))
            line = re.sub(dh_recognize, ur'#<b>\1</b>', line)
            line = re.split(ur'#', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                cleaned.extend(line)

    tt_ja = file_to_ja_g(4,
                         cleaned,
                         regs,
                         cleaner,
                         gimatria=True,
                         group_name='gim',
                         grab_all=[False, False, False]).array()
    Pentateuch = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']
    parsed_texts = dict({book: ja for book, ja in zip(Pentateuch, tt_ja)})

    for book, ja in zip(Pentateuch, tt_ja):
        ja_to_xml(ja, ['perek', 'pasuk', 'comment'], 'tur_{}.xml'.format(book))

    # for str in  dh_list:
    #     print str
    return parsed_texts
def parse_hagahot_by_letter(filename):
    """Parse the hagahot source file keyed by its @11(letter) markers.

    :param filename: hagahot source txt file (utf-8)
    :return: nested list [siman, letter, segments] built by regs_devide
    """
    def cleaner(my_text):
        # strip the @11(letter) markers and stray @77 tags after splitting
        replace_dict = {u'@11\([\u05d0-\u05ea]{0,3}\)': u'', u'@77': u''}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11\((?P<gim>[\u05d0-\u05ea]{1,3})\)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean all lines of days start with @00
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            # first blank line ends the legend; content starts after it
            starting = line_num + 1
            break
    # skip $-marked lines and blanks; split the rest on @11(letter) markers,
    # keeping the markers themselves as entries (capturing group in re.split)
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'\$', line) and not line.isspace():
            line = re.split(u'(@11\([\u05d0-\u05ea]{0,3}\))', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    try:
        ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True,  grab_all=[True, True], group_name='gim').array()
    except AttributeError:
        print 'there are more regs then levels...'
    # NOTE(review): the file_to_ja_g result above is never used; the returned
    # structure comes from regs_devide -- confirm the try block is still needed
    new_ja = regs_devide(cleaned, regs, u'(נשלם מלאכת שבעת הימים)')
    ja_to_xml(new_ja, ['siman', 'letter', 'segments'], 'hagahot_letters.xml')

    return new_ja
def parse_Raph_simanim(alinged_list):
    '''
    Regroup aligned raph letters into one list per siman.

    note: although there is (not often) a differentiation in the original txt file,
    raph letters can be divided into smaller segments. In this code we combined those segments.
    returning, every raph letter as a line.

    :param alinged_list: list of dicts with keys 'siman' and 'raph'
    :return: list of simanim, each a list of raph letters; skipped simanim
        appear as empty lists
    '''
    ja = []
    siman = []
    i = 1  # gap counter for simanim skipped in the source
    prev_siman = u'א'
    for obj in alinged_list:
        if obj['siman'] == prev_siman:
          siman.append(obj['raph'])
          continue
        else:
            # new siman: close out the previous one
            ja.append(siman)
            # pad with empty lists until the gematria gap is closed, so the
            # list index keeps matching the siman's gematria value
            while getGematria(obj['siman']) != (getGematria(prev_siman) + i):
                ja.append([])
                i += 1
            i = 1
            siman = []
            siman.append(obj['raph'])
        prev_siman = obj['siman']
    ja.append(siman)  # flush the last open siman
    ja_to_xml(ja, ['siman', 'letter'], 'raph_simanim.xml')
    return ja
Beispiel #7
0
def raavad_perush_parse(lines, replace_dict):
    """Parse the Raavad commentary into a nested [perek, mishna, dibur] list.

    Markers (assumed to sit on lines of their own): @00 opens a perek,
    @22 opens a mishna, @31/@98 open a new dibur hamatchil.

    :param lines: unicode lines of the commentary section
    :param replace_dict: regex -> replacement mapping for OCR tag cleanup
    :return: nested list [perek][mishna][dibur string]
    """
    arr = []
    # first_* flags suppress flushing before the very first marker of each kind
    first_p = True
    first_m = True
    first_d = True
    perek = []
    mishna = []
    dibur = []
    for line in lines:
        # BUGFIX: `!= -1` replaces the original `is not -1` identity test,
        # which only worked because of CPython small-int caching.
        if line.find(u'@00') != -1:
            # perek marker: flush the open dibur, mishna, and perek
            if first_p:
                first_p = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                arr.append(perek)
                perek = []
                first_m = True  # since this is opening a new perek
                first_d = True
        elif line.find(u'@22') != -1:
            # mishna marker: flush the open dibur and mishna
            if first_m:
                first_m = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                first_d = True  # since this is opening a new mishna
        else:
            # regular text line: belongs to the current dibur
            if re.search(u'@(31|98)', line) and not first_d:
                # probably starts a new dibur; flush the previous one
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
            elif first_d:
                first_d = False
            # segment ocr tag fixing
            line = multiple_replace(line, replace_dict, using_regex=True)
            dibur.append(line)
    # flush whatever is still open at end of input
    dibur = ' '.join(dibur)
    mishna.append(dibur)
    perek.append(mishna)
    arr.append(perek)
    ja_to_xml(arr, ['perek', 'mishna', 'dibur'], 'raavad_text.xml')

    return arr
Beispiel #8
0
def parse_raph(filename, smk_ja):
    '''
    Parse the raph file into letters, then align the letters to smk simanim
    by counting @55 citation markers inside each smk siman.

    :param filename: raph source txt file
    :param smk_ja: JA obj smk parsed [siman,segment]
    :return: JA obj parsed [siman, letter] some simanim will be empty
    '''

    def cleaner(my_text):
        # strip @11/@77 letter markers and @33/@22 tags after splitting
        replace_dict = {u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'',
                        u'@(33|22)': u''}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11(?P<gim>[\u05d0-\u05ea]{1,3})']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean all lines of days start with @00
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            # first blank line ends the legend; content starts after it
            starting = line_num + 1
            break
    # drop @00 lines and blanks; split the rest on @11<letter> markers,
    # keeping the markers as entries (capturing group in re.split)
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line) and not line.isspace():
            line = re.split(u'(@11[\u05d0-\u05ea]{0,3})', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    try:
        ja = file_to_ja(2, cleaned, regs, cleaner, grab_all=False).array()
    except AttributeError:
        # NOTE(review): if this fires, `ja` stays unbound -> NameError below
        print 'there are more regs then levels...'

    ja_to_xml(ja, ['letter', 'segments'], 'raph_letters.xml')

    # align raph letters to smk simanim: each @55 marker in a smk segment
    # consumes the next raph letter in order
    d1 = 0  # running index into the flat raph letter list
    aligned = []
    siman = []
    segment = []
    for letter in smk_ja.array():
        for seg in letter:
            for ff in re.finditer(u'@55[\u05d0-\u05ea]{0,3}', seg):
                # segment.append(ja[d1])
                siman.append(ja[d1])
                d1 += 1
            # NOTE(review): nothing ever appends to `segment` (only the
            # commented-out line above would), so this branch is dead code
            if segment != []:
                siman.extend(segment) #rather then append
                # segment = []
        aligned.append(siman)
        siman = []
    ja_to_xml(aligned, ['siman', 'letter', 'segment'], 'raph_simanim_24.xml')
    return JaggedArray(aligned)
Beispiel #9
0
def test_jagged_array_to_xml():
    """Round-trip a trivial jagged array through ja_to_xml and check the XML."""
    buf = StringIO()
    util.ja_to_xml(['foo'], ['foo'], buf)
    buf.seek(0)
    expected = '<root><foo index="1">foo</foo></root>'
    assert buf.read() == expected

    # a set is not a legal jagged-array node, so a TypeError is expected
    with pytest.raises(TypeError):
        util.ja_to_xml(['foo', {'bar'}], ['foo'], buf)
    buf.close()
Beispiel #10
0
def raavad_parse():
    """Parse yitzira_raavad.txt into its component texts.

    The file's sections are separated by blank lines; each blank line's
    position is recorded in ja_sp and used to slice the file into the
    fifty-gates section, the thirty-two-paths sections, and the perush.

    :return: dict mapping Sefaria-style book titles to their parsed JAs
    """
    with codecs.open('yitzira_raavad.txt', 'r', 'utf-8') as fp:
        lines = fp.readlines()
    # init JA and there depths
    ja_sp = []  # JA starting points in the txt

    # dictionary for line ocr tag fixing
    replace_dict = {
        u'@(44|13|31|41)': u'<b>',
        u'@(45|14|32|42)': u'</b>',  # bold in text
        u'@(03|04|10|11|98|99|56)': u'',  #
        u'@55': ur'<img src = " " height = "100" width = "100">',  # image tag
        ur'(\*\[.*?\])': ur'<small>\1</small>'  # notes in the text
    }
    # check if we got to the end of the legend and change to started
    startJA = None
    for line_num, line in enumerate(lines):
        if line == u'\n':
            startJA = line_num + 1  # ignoring the book name from text
            ja_sp.append(startJA)

    # slice the file by section boundaries and hand each slice to its parser
    ja_fifty = fifty_parse(lines[ja_sp[0] + 1:ja_sp[1]], replace_dict)
    ja_32_hakdama_n = threty_two_parse(lines[ja_sp[1]:ja_sp[2]], replace_dict,
                                       'hakdama_n')
    ja_32_netivot = threty_two_parse(lines[ja_sp[2]:ja_sp[3]], replace_dict,
                                     'netivot')
    ja_32_hakdama_p = threty_two_parse(lines[ja_sp[3]:ja_sp[4]], replace_dict,
                                       'hakdama_p')
    ja_32_perush = threty_two_parse(lines[ja_sp[4]:ja_sp[5]], replace_dict,
                                    'perush')
    ja_old_parse = raavad_perush_parse(lines[ja_sp[5]:], replace_dict)
    ja_raavad_perush = raavad_new_parse(ja_old_parse)

    #  not nice fixing of segments into break tags
    # fold the 33rd netiv into the 32nd as a <small> appendix, then truncate
    ja_32_netivot[31] = ja_32_netivot[31] + '<br><small>' + ja_32_netivot[
        32] + '</small>'
    ja_32_netivot = ja_32_netivot[:32]
    # prepend both hakdama paragraphs to the first perush netiv
    ja_32_perush[0] = ja_32_hakdama_p[0] + '<br>' + ja_32_hakdama_p[
        1] + '<br><br>' + ja_32_perush[0]

    ja_to_xml(ja_32_perush, ['netiv'], 'test.xml')
    return {
        'Raavad on Sefer Yetzirah':
        ja_raavad_perush,
        'Raavad on Sefer Yetzirah, Introduction, The Fifty Gates of Understanding':
        ja_fifty,
        'Raavad on Sefer Yetzirah, Introduction, The Thirty Two Paths of Wisdom, Introduction':
        ja_32_hakdama_n,
        'Raavad on Sefer Yetzirah, Introduction, The Thirty Two Paths of Wisdom':
        ja_32_netivot,
        'Raavad on Sefer Yetzirah, Introduction, The Thirty Two Paths of Wisdom, The Thirty Two Paths Explained':
        ja_32_perush,
        'old_parsing_of_perush':
        ja_old_parse  # outputing the old parse since it is used for the linking and linking is an outer function
    }
Beispiel #11
0
def parse_semak(filename):
    """Parse the semak source file into a depth-2 array [siman, segments].

    Siman boundaries are the @22<gematria> markers; @00 day-header line
    indices are collected in alt_day and the lines themselves excluded.

    :param filename: semak source txt file (utf-8)
    :return: nested list [siman][segment]
    """
    def cleaner(my_text):
        # bold @11..@12 and @33..@34 spans, unwrap @66..@67, drop @44
        replace_dict = {
            u'@11(.*?)@12': ur'<b>\1</b>',
            u'@33(.*?)@34': ur'<b>\1</b>',
            u'@66(.*?)@67': ur'\1',
            u"@44": u""
        }

        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@22(?P<gim>[\u05d0-\u05ea]{1,3})']  #, u'@(11|23|33)(?P<gim>)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean all lines of days start with @00
    cleaned = []
    letter_section = []
    alt_day = []  # line indices of @00 day headers (alt structure candidates)
    for line_num, line in enumerate(lines):
        if line == u'\n':
            # first blank line ends the legend; content starts after it
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if re.search(u'@00', line):
            alt_day.append(line_num)
        if not re.search(u'@00', line) and not line.isspace():
            if re.search(u'@22', line):
                # keep the @22<letter> marker as its own entry (capturing group)
                line = re.split(u'(@22[\u05d0-\u05ea]{1,3})', line)
                if isinstance(line, basestring):
                    cleaned.append(line)
                else:
                    [cleaned.append(st) for st in line if st]
            else:
                cleaned.append(line)
    alt_day.append(len(lines))
    print alt_day
    try:
        smk_ja = file_to_ja_g(2,
                              cleaned,
                              regs,
                              cleaner,
                              gimatria=True,
                              grab_all=[False, True, True],
                              group_name='gim').array()
    except AttributeError:
        # NOTE(review): if this fires, smk_ja stays unbound -> NameError below
        print 'there are more regs then levels...'
    ja_to_xml(smk_ja, ['letter', 'segments'], 'smk.xml')

    return smk_ja
def raavad_perush_parse(lines, replace_dict):
    """Parse the Raavad commentary into a nested [perek, mishna, dibur] list.

    Markers (assumed to sit on lines of their own): @00 opens a perek,
    @22 opens a mishna, @31/@98 open a new dibur hamatchil.

    :param lines: unicode lines of the commentary section
    :param replace_dict: regex -> replacement mapping for OCR tag cleanup
    :return: nested list [perek][mishna][dibur string]
    """
    arr = []
    # first_* flags suppress flushing before the very first marker of each kind
    first_p = True
    first_m = True
    first_d = True
    perek = []
    mishna = []
    dibur = []
    for line in lines:
        # BUGFIX: `!= -1` replaces the original `is not -1` identity test,
        # which only worked because of CPython small-int caching.
        if line.find(u'@00') != -1:
            # perek marker: flush the open dibur, mishna, and perek
            if first_p:
                first_p = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                arr.append(perek)
                perek = []
                first_m = True  # since this is opening a new perek
                first_d = True
        elif line.find(u'@22') != -1:
            # mishna marker: flush the open dibur and mishna
            if first_m:
                first_m = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                first_d = True  # since this is opening a new mishna
        else:
            # regular text line: belongs to the current dibur
            if re.search(u'@(31|98)', line) and not first_d:
                # probably starts a new dibur; flush the previous one
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
            elif first_d:
                first_d = False
            # segment ocr tag fixing
            line = multiple_replace(line, replace_dict, using_regex=True)
            dibur.append(line)
    # flush whatever is still open at end of input
    dibur = ' '.join(dibur)
    mishna.append(dibur)
    perek.append(mishna)
    arr.append(perek)
    ja_to_xml(arr, ['perek', 'mishna', 'dibur'], 'raavad_text.xml')

    return arr
def parse_general(filename):

    def cleaner(my_text):
        return my_text

    regs = [u'@22 ?(\u05e4\u05e8\u05e7)? ?(?P<gim>[\u05d0-\u05ea]{1,3})', u'\{(?P<gim>[\u05d0-\u05ea]{1,3})\}']
    with codecs.open(filename, 'r', 'utf-8') as infile:
        bs_ja = file_to_ja_g(3, infile, regs, cleaner).array()
        print bs_ja
        ja_to_xml(bs_ja,['perek','pasuk','comment'], '{}xml'.format(re.search(u'.*\.',filename).group()))
        return bs_ja
def raavad_new_parse(ja):
    """Flatten [perek][mishna][dibur] into [perek][dibur][paragraph].

    Each dibur string is split into paragraphs via split_lines and the
    mishna level is dropped. (The original also declared an unused local
    `m1`, removed here.)

    :param ja: nested list [perek][mishna][dibur string]
    :return: nested list [perek][dibur][paragraph]
    """
    newJa = []
    for p in ja:
        # collect the split diburim of all mishnayot in this perek
        p1 = []
        for m in p:
            p1.extend(split_lines(d) for d in m)
        newJa.append(p1)
    ja_to_xml(newJa, ['perek', 'dibur', 'paragraph'], 'new_parse.xml')
    return newJa
Beispiel #15
0
def parse_general(filename):
    """Generic parser: @22-perek headers and {letter} pasuk markers -> depth-3 JA."""

    def cleaner(my_text):
        # this source needs no tag cleanup
        return my_text

    # level 1: '@22 [perek] <gematria>'; level 2: '{<gematria>}'
    regs = [u'@22 ?(\u05e4\u05e8\u05e7)? ?(?P<gim>[\u05d0-\u05ea]{1,3})', u'\{(?P<gim>[\u05d0-\u05ea]{1,3})\}']
    with codecs.open(filename, 'r', 'utf-8') as infile:
        parsed = file_to_ja_g(3, infile, regs, cleaner).array()
        print(parsed)
        out_name = '{}xml'.format(re.search(u'.*\.', filename).group())
        ja_to_xml(parsed, ['perek', 'pasuk', 'comment'], out_name)
        return parsed
Beispiel #16
0
def raavad_new_parse(ja):
    """Flatten [perek][mishna][dibur] into [perek][dibur][paragraph].

    Each dibur string is split into paragraphs via split_lines and the
    mishna level is dropped. (The original also declared an unused local
    `m1`, removed here.)

    :param ja: nested list [perek][mishna][dibur string]
    :return: nested list [perek][dibur][paragraph]
    """
    newJa = []
    for p in ja:
        # collect the split diburim of all mishnayot in this perek
        p1 = []
        for m in p:
            p1.extend(split_lines(d) for d in m)
        newJa.append(p1)
    ja_to_xml(newJa, ['perek', 'dibur', 'paragraph'], 'new_parse.xml')
    return newJa
Beispiel #17
0
def basic_test_suite():
    """Smoke test: dump the base text, then the first linked commentary with a known author."""
    root = DCXMLsubs.parse("XML/tractate-avot_drabi_natan-xml.xml", silence=True)
    ja_to_xml(root.getBaseTextArray(), ['Section', 'Segment'], 'base_text.xml')
    commentaries = root.body.commentaries
    for commentary in commentaries.get_commentary():
        usable = (commentaries.is_linked_commentary(commentary)
                  and commentary.get_author() != 'UNKNOWN')
        if not usable:
            continue
        ja_to_xml(commentary.parse_linked(),
                  ['Chapter', 'Verse', 'Comment'], 'commentary.xml')
        break
Beispiel #18
0
def basic_test_suite():
    """Smoke test: parse the tractate XML, dump its base text, then dump the
    first commentary that is both linked and has a known author."""
    root = DCXMLsubs.parse("XML/tractate-avot_drabi_natan-xml.xml", silence=True)
    basetext = root.getBaseTextArray()
    ja_to_xml(basetext, ['Section', 'Segment'], 'base_text.xml')
    # root.review_commentaries()
    # root.check_commentary_chapters()
    comms = root.body.commentaries
    for c in comms.get_commentary():
        # only the first linked commentary with a known author is dumped
        if comms.is_linked_commentary(c) and c.get_author() != 'UNKNOWN':
            parsed = c.parse_linked()
            ja_to_xml(parsed, ['Chapter', 'Verse', 'Comment'], 'commentary.xml')
            break
Beispiel #19
0
def parse_Raph(filename):
    """Parse the raph source file into a depth-3 array [page, letter, segments].

    @77 markers delimit pages, @11<letter> markers delimit letters within a page.

    :param filename: raph source txt file (utf-8)
    :return: nested list [page][letter][segment]
    """
    def cleaner(my_text):
        # strip @11/@77 letter markers and @33 tags after splitting
        replace_dict = {
            u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'',
            u'@33': u''
        }  #{u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@66(.*?)@67': ur'\1'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    # level markers: @77 = page (gematria optional), @11 = letter (gematria)
    regs = [
        ur'@77(?P<gim>[\u05d0-\u05ea]{0,3})',
        ur'@11(?P<gim>[\u05d0-\u05ea]{1,3})'
    ]  # (?P<gim>[\u05d0-\u05ea]{1,3})
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean all lines of days start with @00
    cleaned = []
    letter_section = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            # first blank line ends the legend; content starts after it
            starting = line_num + 1
            break
    # drop @00 lines and blanks; split the rest on @77/@11 markers,
    # keeping the markers as entries (capturing group in re.split)
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line) and not line.isspace():
            line = re.split(u'(@(?:77|11)[\u05d0-\u05ea]{0,3})', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line
                 if st]  #(st and not re.search(u'@(77)', st))]
            # else:
            #     cleaned.append(line)
    try:
        ja = file_to_ja_g(3,
                          cleaned,
                          regs,
                          cleaner,
                          gimatria=True,
                          grab_all=[False, False, True],
                          group_name='gim').array()
    except AttributeError:
        # NOTE(review): if this fires, `ja` stays unbound -> NameError below
        print 'there are more regs then levels...'

    ja_to_xml(ja, ['page', 'letter', 'segments'], 'raph.xml')

    return ja
Beispiel #20
0
def parse_smk(filename):
    '''
    Parse the smk source file into a depth-2 JA keyed by @22<gematria> simanim.

    :param filename: smk source txt file
    :return: JA obj smk parsed to depth 2 [siman, segment] (including a citation segment at the top of each siman)
    '''

    def cleaner(my_text):
        # bold @11..@12 and @33..@34 spans, unwrap @66..@67
        replace_dict = {u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>',
                        u'@66(.*?)@67': ur'\1'}  # , u'@55[\u05d0-\u05ea]{1,3}' : u'<i-tags = >'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@22(?P<gim>[\u05d0-\u05ea]{1,3})']  # , u'@(11|23|33)(?P<gim>)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean all lines of days start with @00
    cleaned = []
    letter_section = []
    alt_day = []  # line indices of @00 day headers (alt structure candidates)
    for line_num, line in enumerate(lines):
        if line == u'\n':
            # first blank line ends the legend; content starts after it
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if re.search(u'@00', line):
            alt_day.append(line_num)
        if not re.search(u'@00', line) and not line.isspace():
            if re.search(u'@22', line):
                # keep the @22<letter> marker as its own entry (capturing group)
                line = re.split(u'(@22[\u05d0-\u05ea]{1,3})', line)
                if isinstance(line, basestring):
                    cleaned.append(line)
                else:
                    [cleaned.append(st) for st in line if st]
            else:
                cleaned.append(line)
    alt_day.append(len(lines))
    print alt_day
    try:
        smk_ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True, grab_all=[False, True, True],
                              group_name='gim').array()
    except AttributeError:
        # NOTE(review): if this fires, smk_ja stays unbound -> NameError below
        print 'there are more regs then levels...'

    ja_to_xml(smk_ja, ['letter', 'segments'], 'smk.xml')

    return JaggedArray(smk_ja)
Beispiel #21
0
def parse_hagahot(filename, smk_ja, raph_ja):
    '''

    :param filename: hagahot source txt file
    :param smk_ja: smk JA obj [siman, segment]
    :param raph_ja: raph JA obj [siman, letter]
    :return: JA obj
    '''

    ja_hagahot = []
    def cleaner(my_text):
        #todo: deal with @44 and @00 (@00 maybe should be only in smk base text? - ask Shmuel)
        replace_dict = {u'@11\([\u05d0-\u05ea]{0,3}\)': u'', u'@(33|77|88|99)': u'', u'@55(.*?)@66': u'<b>\1</b>'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11\((?P<gim>[\u05d0-\u05ea]{1,3})\)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean all lines of days start with @00
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line):
            line = re.split(u'(@11\([\u05d0-\u05ea]{0,3}\))', line)
            if isinstance(line, basestring) and line != u'':
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    try:
        ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True, grab_all=[False], group_name='gim').array()
    except AttributeError:
        print 'there are more regs then levels...'
    ja_to_xml(ja, ['siman', 'letter'], 'hagahot_letters_25.xml') #, 'segments'

    # for hghds in
    return JaggedArray(ja_hagahot)
Beispiel #22
0
def hagahot_parse(ja_hagahot, hagahot_dict_lst):
    """Split the flat hagahot list into per-siman chunks.

    :param ja_hagahot: jagged array of hagahot segments
    :param hagahot_dict_lst: per-siman dicts with 'smk' and 'raph' hagahot
        reference lists; their combined lengths drive the chunk sizes
    :return: list of lists, one list of hagahot per siman
    """
    def num_haghot_in_siman(siman_dict):
        # total hagahot expected for this siman (smk ones + raph ones)
        return len(siman_dict['smk']) + len(siman_dict['raph'])

    ja_hagahot = JaggedArray(ja_hagahot)
    ja_hagahot = ja_hagahot.flatten_to_array()
    hg_ja = []
    p_hg = 0
    # FIX: loop name `siman_dict` replaces the original `dict`, which shadowed the builtin
    for siman_dict in hagahot_dict_lst:
        # skip a stray non-@11 tag segment at the chunk boundary
        if re.search(u"^@[^1]", ja_hagahot[p_hg]):
            p_hg += 1
        p_hg_end = p_hg + num_haghot_in_siman(siman_dict)
        hg_ja.append(ja_hagahot[p_hg:p_hg_end])
        p_hg = p_hg_end
    # whatever remains belongs after the last counted siman
    hg_ja.append(ja_hagahot[p_hg:])

    ja_to_xml(hg_ja, ['siman', 'letter'], 'haghot_by_smk_simanim.xml')
    return hg_ja
Beispiel #23
0
def test(book):
    qa_issues = open('Ibn Ezra on {} misalignments.txt'.format(book), 'w')
    levi = parse(file_data[book])
    vtitle = 'Devarim' if book == 'Deuteronomy' else book
    torat_emet = Ref("Ibn Ezra on {}".format(book)).text('he', 'Ibn Ezra on {} -- Torat Emet'.format(vtitle)).ja().array()
    count = 0
    for c_index, (my_chapter, thier_chapter) in enumerate(zip(levi, torat_emet)):
        for v_index, (my_verse, their_verse) in enumerate(zip(my_chapter, thier_chapter)):
            if len(my_verse) != len(their_verse):
                    qa_issues.write('issue found at {}:{}\n'.format(c_index+1, v_index+1))
                    count += 1
        if len(my_chapter) != len(thier_chapter):
            by_length = sorted((my_chapter, thier_chapter), key=lambda x:len(x))
            for i in range(len(by_length[0]), len(by_length[1])):
                qa_issues.write('issue found at {}:{}\n'.format(c_index+1, i+1))
                count += 1
    qa_issues.close()
    print '{} issues found'.format(count)
    ja_to_xml(levi, ['Chapter', 'Verse', 'Comment'])
def hagahot_parse(ja_hagahot, hagahot_dict_lst):
    """Split the flat hagahot list into per-siman chunks.

    :param ja_hagahot: jagged array of hagahot segments
    :param hagahot_dict_lst: per-siman dicts with 'smk' and 'raph' hagahot
        reference lists; their combined lengths drive the chunk sizes
    :return: list of lists, one list of hagahot per siman
    """
    def num_haghot_in_siman(siman_dict):
        # total hagahot expected for this siman (smk ones + raph ones)
        return len(siman_dict['smk']) + len(siman_dict['raph'])

    ja_hagahot = JaggedArray(ja_hagahot)
    ja_hagahot = ja_hagahot.flatten_to_array()
    hg_ja = []
    p_hg = 0
    # FIX: loop name `siman_dict` replaces the original `dict`, which shadowed the builtin
    for siman_dict in hagahot_dict_lst:
        # skip a stray non-@11 tag segment at the chunk boundary
        if re.search(u"^@[^1]", ja_hagahot[p_hg]):
            p_hg += 1
        p_hg_end = p_hg + num_haghot_in_siman(siman_dict)
        hg_ja.append(ja_hagahot[p_hg:p_hg_end])
        p_hg = p_hg_end
    # whatever remains belongs after the last counted siman
    hg_ja.append(ja_hagahot[p_hg:])

    ja_to_xml(hg_ja, ['siman', 'letter'], 'haghot_by_smk_simanim.xml')
    return hg_ja
Beispiel #25
0
def parse_Raph_by_letter(filename):
    '''parsing according to the letters, is the main ja, to post for the raph

    :param filename: raph source txt file (utf-8)
    :return: nested list [letter][segment]
    '''
    def cleaner(my_text):
        # strip @11/@77 letter markers and @33 tags after splitting
        replace_dict = {
            u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'',
            u'@33': u''
        }  #{u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@66(.*?)@67': ur'\1'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11(?P<gim>[\u05d0-\u05ea]{1,3})']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean all lines of days start with @00
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            # first blank line ends the legend; content starts after it
            starting = line_num + 1
            break
    # drop @00 lines and blanks; split the rest on @11<letter> markers,
    # keeping the markers as entries (capturing group in re.split)
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line) and not line.isspace():
            line = re.split(u'(@11[\u05d0-\u05ea]{0,3})', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    # NOTE(review): new_ja is computed but never used or returned -- dead code?
    new_ja = regs_devide(cleaned, regs)
    try:
        # ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True,  grab_all=[True, True], group_name='gim').array()
        ja = file_to_ja(2, cleaned, regs, cleaner, grab_all=False).array()
    except AttributeError:
        # NOTE(review): if this fires, `ja` stays unbound -> NameError below
        print 'there are more regs then levels...'

    # ja_to_xml(new_ja, ['Alef', 'letter', 'segments'], 'raph_letters.xml')
    ja_to_xml(ja, ['letter', 'segments'], 'raph_letters.xml')

    return ja
Beispiel #26
0
def clean(JA, replace_dict):
    '''
    Apply OCR-tag replacements to every segment of a depth-2 JA.

    :param JA: JA obj of the text to be cleaned
    :param replace_dict: a dictionary of what to replace
    :return: cleaned JA
    '''
    # replace each segment in place, preserving the two-level structure
    cleaned = [
        [multiple_replace(segment, replace_dict, using_regex=True) for segment in letter]
        for letter in JA.array()
    ]
    ja_to_xml(cleaned, ['letter', 'segments'], 'clean_smk.xml')
    return JaggedArray(cleaned)
Beispiel #27
0
def threty_two_parse(lines, replace_dict, str):
    """Parse one 'thirty-two paths' section: each @13/@03 marker opens a netiv.

    :param lines: unicode lines of the section
    :param replace_dict: regex -> replacement mapping for OCR tag cleanup
    :param str: label used to name the output xml file
    :return: list of netiv strings
    """
    arr = []
    netiv = []
    seen_marker = False
    for line in lines:
        if re.search(u'@(13|03)', line):
            if seen_marker:
                # close the previous netiv before starting the new one
                arr.append(' '.join(netiv))
                netiv = []
            else:
                seen_marker = True
        cleaned_line = multiple_replace(line, replace_dict, using_regex=True)
        netiv.append(cleaned_line.strip())
    # flush the last open netiv
    arr.append(' '.join(netiv))
    ja_to_xml(arr, ['netiv'], '{}{}'.format(str, '_32.xml'))

    return arr
def threty_two_parse(lines, replace_dict, str):
    # NOTE(review): byte-for-byte duplicate of the definition directly above;
    # being defined later, this copy is the one that takes effect. One of the
    # two should probably be removed.
    # start the parsing of 32 netivot
    arr = []
    netiv = []
    first = True
    for line in lines:
        if re.search(u'@(13|03)', line):# and (netiv):
            # a netiv tag closes the previous netiv (except for the first one)
            if first:
                first = False
            else:
                netiv = ' '.join(netiv)
                arr.append(netiv)
                netiv = []
        line = multiple_replace(line, replace_dict, using_regex=True)
        netiv.append(line.strip())
    # close the last netiv and dump a debug XML named '<str>_32.xml'
    netiv = ' '.join(netiv)
    arr.append(netiv)
    ja_to_xml(arr, ['netiv'], '{}{}'.format(str,'_32.xml'))

    return arr
def raavad_parse():
    """Parse 'yitzira_raavad.txt' into its sections and return them keyed
    by Sefaria index titles.

    Blank lines in the source file delimit the sections; ``ja_sp`` records
    the line index right after each blank line, and slices between
    consecutive entries are handed to the per-section parsers.

    :return: dict mapping index title -> parsed jagged array, plus the old
        perush parse under 'old_parsing_of_perush' (used by outer linking code)
    """
    with codecs.open('yitzira_raavad.txt', 'r', 'utf-8') as fp:
        lines = fp.readlines()
    # init JA and there depths
    ja_sp = [] # JA starting points in the txt

    # dictionary for line ocr tag fixing
    replace_dict = {u'@(44|13|31|41)': u'<b>', u'@(45|14|32|42)': u'</b>',# bold in text
                    u'@(03|04|10|11|98|99|56)' : u'', #
                    u'@55' : ur'<img src = " " height = "100" width = "100">',  # image tag
                    ur'(\*\[.*?\])': ur'<small>\1</small>'  # notes in the text
                    }
    # check if we got to the end of the legend and change to started
    # NOTE(review): every blank line is recorded (no break), so the file is
    # assumed to contain at least six blank-line-delimited sections -- the
    # indexing of ja_sp below fails otherwise.
    startJA = None
    for line_num, line in enumerate(lines):
        if line == u'\n':
            startJA = line_num + 1  # ignoring the book name from text
            ja_sp.append(startJA)

    # hand each section slice to its dedicated parser
    ja_fifty = fifty_parse(lines[ja_sp[0]+1:ja_sp[1]],replace_dict)
    ja_32_hakdama_n = threty_two_parse(lines[ja_sp[1]:ja_sp[2]], replace_dict, 'hakdama_n')
    ja_32_netivot = threty_two_parse(lines[ja_sp[2]:ja_sp[3]], replace_dict, 'netivot')
    ja_32_hakdama_p = threty_two_parse(lines[ja_sp[3]:ja_sp[4]], replace_dict, 'hakdama_p')
    ja_32_perush = threty_two_parse(lines[ja_sp[4]:ja_sp[5]], replace_dict, 'perush')
    ja_old_parse = raavad_perush_parse(lines[ja_sp[5]:], replace_dict)
    ja_raavad_perush = raavad_new_parse(ja_old_parse)

    #  not nice fixing of segments into break tags
    ja_32_netivot[31] = ja_32_netivot[31] + '<br><small>' + ja_32_netivot[32] + '</small>'
    ja_32_netivot = ja_32_netivot[:32]
    ja_32_perush[0] = ja_32_hakdama_p[0] + '<br>' + ja_32_hakdama_p[1] + '<br><br>' + ja_32_perush[0]

    ja_to_xml(ja_32_perush, ['netiv'], 'test.xml')
    return {'Raavad on Sefer Yetzirah': ja_raavad_perush,
                'Raavad on Sefer Yetzirah, Introduction, The Fifty Gates of Understanding': ja_fifty,
                'Raavad on Sefer Yetzirah, Introduction, The Thirty Two Paths of Wisdom, Introduction': ja_32_hakdama_n,
                'Raavad on Sefer Yetzirah, Introduction, The Thirty Two Paths of Wisdom': ja_32_netivot,
                'Raavad on Sefer Yetzirah, Introduction, The Thirty Two Paths of Wisdom, The Thirty Two Paths Explained': ja_32_perush,
                'old_parsing_of_perush' : ja_old_parse # outputing the old parse since it is used for the linking and linking is an outer function
            }
def parse_Raph_by_letter(filename):
    '''parsing according to the letters, is the main ja, to post for the raph'''
    def cleaner(my_text):
        replace_dict = {u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'', u'@33': u''}#{u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@66(.*?)@67': ur'\1'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11(?P<gim>[\u05d0-\u05ea]{1,3})']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean all lines of days start with @00
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line) and not line.isspace():
            line = re.split(u'(@11[\u05d0-\u05ea]{0,3})', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    new_ja = regs_devide(cleaned, regs)
    try:
        # ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True,  grab_all=[True, True], group_name='gim').array()
        ja = file_to_ja(2, cleaned, regs, cleaner, grab_all=False).array()
    except AttributeError:
        print 'there are more regs then levels...'

    # ja_to_xml(new_ja, ['Alef', 'letter', 'segments'], 'raph_letters.xml')
    ja_to_xml(ja, ['letter', 'segments'], 'raph_letters.xml')

    return ja
def parse_Raph(filename):
    def cleaner(my_text):
        replace_dict = {u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'', u'@33': u''}#{u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@66(.*?)@67': ur'\1'}
        new = []
        for line in my_text:
            line = multiple_replace(line,replace_dict,using_regex=True)
            new.append(line)
        return new

    regs = [ur'@77(?P<gim>[\u05d0-\u05ea]{0,3})', ur'@11(?P<gim>[\u05d0-\u05ea]{1,3})']  # (?P<gim>[\u05d0-\u05ea]{1,3})
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean all lines of days start with @00
    cleaned = []
    letter_section = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line) and not line.isspace():
            line = re.split(u'(@(?:77|11)[\u05d0-\u05ea]{0,3})', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]#(st and not re.search(u'@(77)', st))]
            # else:
            #     cleaned.append(line)
    try:
        ja = file_to_ja_g(3, cleaned, regs, cleaner, gimatria=True,  grab_all=[False, False, True], group_name='gim').array()
    except AttributeError:
        print 'there are more regs then levels...'

    ja_to_xml(ja, ['page', 'letter', 'segments'], 'raph.xml')

    return ja
# -*- coding: utf-8 -*-
import codecs
from sefaria.model import *
import regex
from sources import functions
from data_utilities import util
from sources.Rif_on_Nedarim import rif_nedarim_functions

"""
index record
parse text
text record
link
clean
"""

# create and post the index record for Rif on Nedarim
index = rif_nedarim_functions.create_index()
functions.post_index(index)

# parse the source text into a jagged array (Daf x Line)
rif_nedarim = rif_nedarim_functions.parse()

# post the parsed text under its ref
ref = 'Rif_Nedarim'
text = rif_nedarim_functions.create_text(rif_nedarim)
functions.post_text(ref, text)

# dump the parsed structure to a text file for manual QA
testing_file = codecs.open("testing_file.txt", 'w', 'utf-8')
util.jagged_array_to_file(testing_file, rif_nedarim, ['Daf', 'Line'])
testing_file.close()

# dump the same structure as XML for inspection
util.ja_to_xml(rif_nedarim, ['Daf', 'Line'])
Beispiel #33
0
# NOTE(review): fragment of the Rasag-commentaries upload script. Most steps
# are commented out so that only the "Communal Laws" section is (re)posted;
# `communal`, `util` and `rasag_commentaries_functions` must be defined or
# imported in the part of the script not shown here -- confirm before running.
# text = rasag_commentaries_functions.create_text(positive_commandments)
# functions.post_text(ref, text)
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Negative Commandments'
# text = rasag_commentaries_functions.create_text(negative_commandments)
# functions.post_text(ref, text)
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Laws of the Courts, Introduction'
# text = rasag_commentaries_functions.create_text(punishments[0])
# functions.post_text(ref, text)
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Laws of the Courts'
# text = rasag_commentaries_functions.create_text(punishments[1])
# functions.post_text(ref, text)
#
# debug dump of the Communal Laws structure
util.ja_to_xml(communal[1], ['FIRST', 'SECOND', 'THIRD', 'FOURTH', 'FIFTH'])
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Communal Laws, Introduction'
# text = rasag_commentaries_functions.create_text(communal[0])
# functions.post_text(ref, text)
#
# the only live upload in this fragment
ref = 'Commentary on Sefer Hamitzvot of Rasag, Communal Laws'
text = rasag_commentaries_functions.create_text(communal[1])
functions.post_text(ref, text)
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Appendix, Introduction'
# text = rasag_commentaries_functions.create_text(miluim[0])
# functions.post_text(ref, text)
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Appendix'
# text = rasag_commentaries_functions.create_text(miluim[1])
# NOTE(review): fragment of a Rambam-hilchot processing script. The names
# `chapter_regex`, `halacha_regex`, `os`, `folder`, `sefarim`, `processed`,
# `file_to_ja_g`, `clean_segments` and `ja_to_xml` come from a part of the
# script not shown here -- confirm before running in isolation.
regexes = [chapter_regex, halacha_regex]
for the_file in [x for x in os.listdir(folder) if "xml" not in x]:
    # filenames look like '<something>-<sefer>-<name>.txt'
    name = the_file.replace(".txt", "").split("-")[2]
    sefer = the_file.split("-")[1]

    sefarim.add(sefer)
    if name in processed:
        # Skip second version of Nashim
        continue
    file_path = os.path.join(folder, the_file)
    with codecs.open(file_path, "r", "utf-8") as infile:
        j = file_to_ja_g(3, infile, regexes, clean_segments, gimatria=True)
        processed[name] = {"cat": sefer, "text": j.array()}


        ja_to_xml(j.array(), ["Chapter","Halacha","Comment"], file_path.replace("txt","xml"))

# merge Shofar, Sukkah and Lulav into a single combined text
processed[u"הלכות שופר וסוכה ולולב"] = {
    "cat": processed[u"הלכות שופר"]["cat"],
    "text": processed[u"הלכות שופר"]["text"][:3] + processed[u"הלכות סוכה"]["text"][3:6] + processed[u"הלכות לולב"]["text"][6:]
}
del processed[u"הלכות שופר"]
del processed[u"הלכות סוכה"]
del processed[u"הלכות לולב"]

# merge Megillah and Chanukah into a single combined text
processed[u"הלכות מגילה וחנוכה"] = {
    "cat": processed[u"הלכות מגילה"]["cat"],
    "text": processed[u"הלכות מגילה"]["text"][:2] + processed[u"הלכות חנוכה"]["text"][2:]
}
del processed[u"הלכות מגילה"]
del processed[u"הלכות חנוכה"]
from sources import functions
from data_utilities import util
from sources.Ralbag_on_Ruth import ralbag_ruth_functions

"""
index record
parse text
text record
link
clean
"""

# create and post the index record for Ralbag on Ruth
index = ralbag_ruth_functions.create_index()
functions.post_index(index)

ralbag_ruth_dict = ralbag_ruth_functions.parse()

# post each section; the 'Benefits' section lives under its own sub-ref
for key in ralbag_ruth_dict:
    ref = 'Ralbag Ruth'
    if key == 'Benefits':
        ref += ',_Benefits'
    text = ralbag_ruth_functions.create_text(ralbag_ruth_dict[key])
    functions.post_text(ref, text)

# build and post links from the commentary to the base text
list_of_links = ralbag_ruth_functions.create_links(ralbag_ruth_dict['Commentary'])
functions.post_link(list_of_links)

ralbag_ruth = [ralbag_ruth_dict['Commentary'], ralbag_ruth_dict['Benefits']]

# debug dump of the combined structure
util.ja_to_xml(ralbag_ruth, ['FIRST', 'SECOND', 'THIRD', 'FOURTH'])
# NOTE(review): near-identical copy of the Rasag upload fragment that also
# appears earlier in this file. Only the "Communal Laws" section is live;
# `communal`, `util` and `rasag_commentaries_functions` come from a part of
# the script not shown here -- confirm before running.
# text = rasag_commentaries_functions.create_text(positive_commandments)
# functions.post_text(ref, text)
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Negative Commandments'
# text = rasag_commentaries_functions.create_text(negative_commandments)
# functions.post_text(ref, text)
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Laws of the Courts, Introduction'
# text = rasag_commentaries_functions.create_text(punishments[0])
# functions.post_text(ref, text)
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Laws of the Courts'
# text = rasag_commentaries_functions.create_text(punishments[1])
# functions.post_text(ref, text)
#
# debug dump of the Communal Laws structure
util.ja_to_xml(communal[1], ['FIRST', 'SECOND', 'THIRD', 'FOURTH', 'FIFTH'])
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Communal Laws, Introduction'
# text = rasag_commentaries_functions.create_text(communal[0])
# functions.post_text(ref, text)
#
# the only live upload in this fragment
ref = 'Commentary on Sefer Hamitzvot of Rasag, Communal Laws'
text = rasag_commentaries_functions.create_text(communal[1])
functions.post_text(ref, text)
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Appendix, Introduction'
# text = rasag_commentaries_functions.create_text(miluim[0])
# functions.post_text(ref, text)
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Appendix'
# text = rasag_commentaries_functions.create_text(miluim[1])
Beispiel #37
0
from sources import functions
from data_utilities import util
from sources.Ralbag_on_Esther import ralbag_esther_functions

"""
index record
parse text
text record
link
clean
"""

# create and post the index record for Ralbag on Esther
index = ralbag_esther_functions.create_index()
functions.post_index(index)

ralbag_esther_dict = ralbag_esther_functions.parse()

# post each section; the commentary itself lives at the bare ref,
# other sections get their key appended to the ref
for key in ralbag_esther_dict:
    ref = 'Ralbag Esther,_{}'.format(key)
    if key == 'Commentary':
        ref = 'Ralbag Esther'
    text = ralbag_esther_functions.create_text(ralbag_esther_dict[key])
    functions.post_text(ref, text)

# build and post links from the commentary to the base text
list_of_links = ralbag_esther_functions.create_links(ralbag_esther_dict['Commentary'])
functions.post_link(list_of_links)

ralbag_esther = [ralbag_esther_dict['Introduction'], ralbag_esther_dict['Commentary'], ralbag_esther_dict['Benefits']]

# debug dump of the combined structure
util.ja_to_xml(ralbag_esther, ['FIRST', 'SECOND', 'THIRD', 'FOURTH'])
Beispiel #38
0
        if line.find(ur'@05') is not -1:
            if perek:

                perek.append(peska)
                peska = []
                arr.append(perek)
                perek = []
        else:
            if (line.find(u'@13') is not -1) and (peska):
                perek.append(peska)
                peska = []
            line = multiple_replace(line, replace_dict, using_regex=True)
            peska.append(line)
    perek.append(peska)
    arr.append(perek)
    ja_to_xml(arr, ['perek', 'piska', 'break'], 'raavad_50.xml')

    return arr


# Split a long line into a list of smaller lines using key words like "VeHineh".
#   Note: if you use this splitting method, be sure not to re-join the pieces with "".join().


def split_lines(line):
    """Split *line* into a list of shorter lines at connective key words.

    A ``~`` marker is injected wherever a period is followed by the Hebrew
    connectives "ve-hineh" / "(ve-)od", and the line is then split on that
    marker. Do not re-join the pieces with ``"".join()`` afterwards.
    """
    marked = re.sub(u'\\. (\u05d5?(\u05d4\u05e0\u05d4|\u05e2\u05d5\u05d3))',
                    u'. ~\\1', line)
    return marked.split(u'~')
Beispiel #39
0
            ja.set_element(indices, temp, [])
            temp = []
            if pasuk_dh:
                indices = [
                    int(pasuk_dh.group(1)) - 1,
                    int(pasuk_dh.group(2)) - 1, indices[2]
                ]
                indices[2] = 0
            elif reg_dh:
                indices[2] += 1
        if not line.isspace() and not re.match(
                ur' *Parshat *(\S+) *(\S+)? *',
                line):  # don't put into array names of Parasha or empty lines
            temp.append(line)

    ja_to_xml(ja.array(), ['perek', 'pasuk', 'comment'],
              '{}.xml'.format(re.match('(.*)\.', filename).group(1)))
    return ja


def parse_all_en():
    """Parse the English Tur text for every book of the Pentateuch.

    :return: dict mapping book name -> parsed jagged array from parse_en
    """
    pentateuch = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']
    return {book: parse_en('en_tur_{}.txt'.format(book.lower()))
            for book in pentateuch}


def tt_schema():
    # NOTE(review): this definition is truncated/corrupted -- after the two
    # SchemaNode lines the body continues with lines pasted from an unrelated
    # Rambam-processing loop (`sefer`, `name`, `processed`, and a `continue`
    # with no enclosing loop), so it cannot run as written. Needs to be
    # restored from the original source.
    record_root = SchemaNode()
    record_root.add_title('Tur HaAroch', 'en', True)
    if sefer == u"ספר קרבנות":
        sefer = u"ספר קורבנות"
    if sefer == u"ספר קנין":
        sefer = u"ספר קניין"

    sefarim.add(sefer)
    if name in processed:
        # Skip second version of Nashim
        continue
    file_path = os.path.join(folder, the_file)
    with codecs.open(file_path, "r", "utf-8") as infile:
        j = file_to_ja_g(3, infile, regexes, clean_segments, gimatria=True)
        processed[name] = {"cat": sefer, "text": j.array()}


        ja_to_xml(j.array(), ["Chapter","Halacha","Comment"], file_path.replace("txt","xml"))


# merge Hilchot Tefillah with Birkat Kohanim into one combined text
processed[u"הלכות תפילה וברכת כהנים"] = {
    "cat": processed[u"הלכות תפלה"]["cat"],
    "text": processed[u"הלכות תפלה"]["text"][:13] + processed[u"הלכות נשיאת כפים"]["text"][13:]
}
del processed[u"הלכות תפלה"]
del processed[u"הלכות נשיאת כפים"]


# merge Tefillin, Mezuzah and Sefer Torah into one combined text
processed[u"הלכות תפילין ומזוזה וספר תורה"] = {
    "cat": processed[u"הלכות תפילין"]["cat"],
    "text": processed[u"הלכות תפילין"]["text"][:4] + processed[u"הלכות מזוזה"]["text"][4:6] + processed[u"הלכות ספר תורה"]["text"][6:]
}
# NOTE(review): only the Tefillin entry is deleted here; the matching del
# lines for Mezuzah and Sefer Torah are presumably in the part of the script
# not shown -- confirm.
del processed[u"הלכות תפילין"]
def text_parse():
    """Parse 'yitzira_pri_yitzhak.txt' into a 3-level jagged array:
    perek (@00) -> mishna (@22) -> dibur (opened by an @03/@31/@98 tag).

    :return: nested list ``parsed[perek][mishna][dibur]`` of joined strings
    """
    # open, read, close the original txt file
    with codecs.open('yitzira_pri_yitzhak.txt', 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    # init section lists and flags
    parsed = []
    perek = []
    mishna = []
    dibur = []
    first_p = True  # first perek flag
    first_m = True  # first mishna flag
    first_d = True  # first dibur flag
    ofen = False # 'ofen' flag

    # dictionary for line ocr tag fixing
    replace_dict = {
                    u'@11': u'',  # not necessary ocr tag
                    u'@31': u'<b>', u'@32': u'</b>',  # bold dibur hamatchil
                    u'@44': u'<b>', u'@45': u'</b>',  # was bold in text
                    u'@98': u'<small>', u'@99': u'</small>',  # the slik at the end
                    ur'\[\*(.*?)\]': ur'<small>[\1]</small>'  # footnotes
                    }
    # loop on lines and creat the jagged array
    # NOTE(review): the `is not -1` comparisons below rely on CPython's
    # small-int caching; `!= -1` would be the safe spelling.
    for line in lines[starting:]:
        if line.find(u'@00') is not -1:
            # perek
            if first_p:
                first_p = False
            else:
                # close the open dibur, mishna and perek before the new perek
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                parsed.append(perek)
                perek = []
                first_m = True  # since this is opening a new perek

        elif line.find(u'@22') == 0:  # notice that this parsing is given that there is no text on same line with @22 and @00
            # mishna
            if first_m:
                first_m = False
            else:
                # close the open dibur and mishna before the new mishna
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                first_d = True  # since this is opening a new mishna
        else: # this line is going to be part of the dibur
            # Dibur Hamatchil
            if regex.search(u'@(03|31|98)', line):  # probably start a new dibur
                 if (not ofen) and (not first_d): # prob close prev dibur
                    dibur = ' '.join(dibur)
                    mishna.append(dibur)
                    dibur = []
                 else:
                    if ofen:
                        ofen = False
                    if first_d:
                        first_d = False
            if regex.search(u'@03', line):
                # an @03 ('ofen') tag keeps the next tagged line in the same dibur
                ofen = True
            # segment ocr tag fixing
            line = multiple_replace(line, replace_dict, using_regex = True)
            dibur.append(line)

    # once reached the end close all what was opened
    dibur = ' '.join(dibur)
    mishna.append(dibur)
    perek.append(mishna)
    parsed.append(perek)
    ja_to_xml(parsed,['perek','mishna','dibur'],filename = 'pri.xml')
    return parsed
Beispiel #42
0
    if siman_num is 5 or siman_num is 6:
        print "not real", siman_num
        continue  # 5 & 6 are simanim with no text but a page

    soup = soupAndOpen("./pages/%s" % (filename))

    if siman_num is 3 or siman_num is 4 or siman_num is 7:  #siman numbers that did not conform to be able to parse
        print "outlier", siman_num
        outlierParse(soup, siman_num)

    else:
        print "regular", siman_num
        regularParse(soup, siman_num)

ja_to_xml(simanim_ja.array(), ["siman", "seif", "comment"])

links = []

for comment in traverse_ja(simanim_ja.array()):
    links.append({
        'refs': [
            'Shulchan_Arukh, Orach_Chayim.{}.{}'.format(
                comment['indices'][0] - 1, comment['indices'][1] - 1),
            'Biur Halacha.{}.{}.{}'.format(
                *[i - 1 for i in comment['indices']])
        ],
        'type':
        'commentary',
        'auto':
        True,
from sources import functions
from data_utilities import util
from sources.Ralbag_on_Shir_HaShirim import ralbag_shir_hashirim_functions

"""
index record
parse text
text record
link
clean
"""

# create and post the index record for Ralbag on Song of Songs
index = ralbag_shir_hashirim_functions.create_index()
functions.post_index(index)

ralbag_shir_hashirim_dict = ralbag_shir_hashirim_functions.parse()

# post each section; the 'Introduction' lives under its own sub-ref
for key in ralbag_shir_hashirim_dict:
    ref = 'Ralbag Song of Songs'
    if key == 'Introduction':
        ref += ',_Introduction'
    text = ralbag_shir_hashirim_functions.create_text(ralbag_shir_hashirim_dict[key])
    functions.post_text(ref, text)

# build and post links from the commentary to the base text
list_of_links = ralbag_shir_hashirim_functions.create_links(ralbag_shir_hashirim_dict['Commentary'])
functions.post_link(list_of_links)

# renamed from the copy-pasted `ralbag_ruth` -- this is the Shir HaShirim data
ralbag_shir_hashirim = [ralbag_shir_hashirim_dict['Introduction'], ralbag_shir_hashirim_dict['Commentary']]

# debug dump of the combined structure
util.ja_to_xml(ralbag_shir_hashirim, ['FIRST', 'SECOND', 'THIRD', 'FOURTH'])
# -*- coding: utf-8 -*-
# NOTE(review): near-identical duplicate of the Rif-on-Nedarim upload snippet
# that also appears earlier in this file; re-running it re-posts the same data.
import codecs
from sefaria.model import *
import regex
from sources import functions
from data_utilities import util
from sources.Rif_on_Nedarim import rif_nedarim_functions
"""
index record
parse text
text record
link
clean
"""

# create and post the index record for Rif on Nedarim
index = rif_nedarim_functions.create_index()
functions.post_index(index)

# parse the source text into a jagged array (Daf x Line)
rif_nedarim = rif_nedarim_functions.parse()

# post the parsed text under its ref
ref = 'Rif_Nedarim'
text = rif_nedarim_functions.create_text(rif_nedarim)
functions.post_text(ref, text)

# dump the parsed structure to a text file for manual QA
testing_file = codecs.open("testing_file.txt", 'w', 'utf-8')
util.jagged_array_to_file(testing_file, rif_nedarim, ['Daf', 'Line'])
testing_file.close()

# dump the same structure as XML for inspection
util.ja_to_xml(rif_nedarim, ['Daf', 'Line'])