def jaggedarray_from_file(input_file, perek_tag, mishna_tag, skip_tag):
    """
    :param input_file: File to parse
    :param perek_tag: Used to identify the start of a new perek.
    :param mishna_tag: Identify next mishna.
    :return: A 2D jaggedArray to match Sefaria's format. Rough, will require more processing.
    """

    chapters, mishnayot, current = [], [], []
    found_first_chapter = False

    for line in input_file:

        # look for skip_tag
        if re.search(skip_tag, line):
            continue

        # look for tags
        new_chapter, new_mishna = re.search(perek_tag, line), re.search(mishna_tag, line)

        # make sure perek and mishna don't appear on the same line
        if new_chapter and new_mishna:
            print 'Mishna starts on same line as chapter\n'
            print '{}\n\n'.format(new_chapter.group())
            input_file.close()
            sys.exit(1)

        # found chapter tag.
        if new_chapter:
            if found_first_chapter:
                if current != []:
                    mishnayot.append(u' '.join(current).lstrip())
                    current = []
                chapters.append(mishnayot)
                mishnayot = []
            else:
                found_first_chapter = True
            continue

        if found_first_chapter:
            if new_mishna:
                if current != []:
                    mishnayot.append(u' '.join(current).lstrip())
                current = [util.multiple_replace(line, {u'\n': u'', u'\r': u'', new_mishna.group(): u''})]

            else:
                current.append(util.multiple_replace(line, {u'\n': u'', }))
            # add next line

    else:
        mishnayot.append(u''.join(current).lstrip())
        chapters.append(mishnayot)

    return chapters
Beispiel #2
0
def clean_line(line):
    """Normalize one line of Hebrew source text.

    Strips nikkud and punctuation, normalizes quote characters, removes
    bracketed references containing the word "Alfas", abbreviates
    "Tur ve-Shulchan Aruch", truncates at an "ayyen ba-Tur" citation,
    removes a matched "lo manu" clause, and finally resolves parentheses
    vs. brackets: bracketed text is kept (brackets mark the corrected
    reading), while parenthesized text is dropped when brackets are also
    present, or unwrapped otherwise.
    """
    line = strip_nikkud(line)
    replace_dict = {u'[.:\?]': u'', u'[”״]': u'"', u'[’׳]': u"'"} #note put \. in the file/ how can i check if it is right?
    line = multiple_replace(line, replace_dict, using_regex=True)
    # line = re.sub(u'[:\?]', '', line)
    # line = re.sub(u'”', u'"', line)
    reg_parentheses = re.compile(u'\((.*?)\)')
    reg_brackets = re.compile(u'\[(.*?)\]')
    # NOTE: presence of parens/brackets is tested BEFORE the substitutions
    # below, so the branch choice reflects the original line's content
    in_per = reg_parentheses.search(line)
    in_bra = reg_brackets.search(line)
    reg_ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    reg_lo_manu = re.compile(u'''(?P<a>(\u05d0\u05da )?\u05dc\u05d0 \u05de\u05e0(.*?))(\u05e1\u05de"?\u05d2|\u05e8\u05de\u05d1"?\u05dd|\u05d8\u05d5\u05e8|\n)''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)
    f_ayyen = re.search(reg_ayyen_tur, line)
    f_lo_manu = re.search(reg_lo_manu, line)

    if f_ayyen:
        # cut the line at the start of the citation
        line = line[:f_ayyen.start()]
    if f_lo_manu:
        # NOTE(review): the matched text is re-used as a regex pattern here;
        # any regex metacharacters in it would misfire — confirm inputs.
        line = re.sub(f_lo_manu.group('a'), u"", line)
    if in_per:
        if in_bra:
            clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
            clean = re.sub(reg_parentheses, '', clean)
        else:
            clean = re.sub(reg_parentheses, ur'\1', line)
    elif in_bra:
        clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
    else:
        clean = line
    return clean
Beispiel #3
0
 def cleaner(my_text):
     replace_dict = {u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@66(.*?)@67': ur'\1'}#, u'@55[\u05d0-\u05ea]{1,3}' : u'<i-tags = >'}
     new = []
     for line in my_text:
         line = multiple_replace(line, replace_dict, using_regex=True)
         new.append(line)
     return new
Beispiel #4
0
def parse_en(filename):
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    ja = JaggedArray([[[[]]]])
    placing = u'(\s*[0-9]{1,2}),([0-9]{1,2})-?[0-9]*\.'  # the regex to find the indexing on Monk
    # q1, q2 = ur'“', ur'”' # Rabbi Monk uses these to enclose translation of a pasuk
    # dh_reg = ur'([\u05d0 - \u05ea]*), *({}.*?{})'.format(q1, q2)
    replace_dict = {placing: u'', u'@': ''}
    temp = []
    indices = [0] * 3
    for line in lines:
        pasuk_dh = re.match(placing, line)
        reg_dh = re.search(
            ur'@([\u05d0-\u05ea|\\s]*)',
            line)  #  reg_dh = re.search(ur'([\u05d0-\u05ea]+, *“.*?”)',line)
        line = multiple_replace(line, replace_dict, using_regex=True)
        if pasuk_dh or reg_dh:
            temp = ' '.join(temp)
            ja.set_element(indices, temp, [])
            temp = []
            if pasuk_dh:
                indices = [
                    int(pasuk_dh.group(1)) - 1,
                    int(pasuk_dh.group(2)) - 1, indices[2]
                ]
                indices[2] = 0
            elif reg_dh:
                indices[2] += 1
        if not line.isspace() and not re.match(
                ur' *Parshat *(\S+) *(\S+)? *',
                line):  # don't put into array names of Parasha or empty lines
            temp.append(line)
Beispiel #5
0
    def cleaner(my_text):
        """Run the enclosing scope's replace_dict over every line of my_text."""
        return [multiple_replace(segment, replace_dict, using_regex=True)
                for segment in my_text]
Beispiel #6
0
 def cleaner(my_text):
     """Delete @11(<Hebrew siman>) markers and bare @77 tags from each line."""
     drop_tags = {u'@11\([\u05d0-\u05ea]{0,3}\)': u'', u'@77': u''}
     return [multiple_replace(segment, drop_tags, using_regex=True) for segment in my_text]
 def cleaner(my_text):
     """Turn @31/@32 markers into bold tags, then scrub any remaining @NN tags."""
     bold_map = {u'@31': u'<b>', u'@32': u'</b>'}
     cleaned = []
     for segment in my_text:
         cleaned.append(re.sub(u'@[0-9]{2}', u'', multiple_replace(segment, bold_map)))
     return cleaned
 def cleaner(my_text):
     """Delete @11(<Hebrew siman>) markers and bare @77 tags from each line."""
     replace_dict = {u'@11\([\u05d0-\u05ea]{0,3}\)': u'', u'@77': u''}
     new = []
     for line in my_text:
         line = multiple_replace(line, replace_dict, using_regex=True)
         new.append(line)
     return new
Beispiel #9
0
def join_singlet_tags(infile, infile_name, tag):
    """
    Certain tags may appear on their own line when they need to be inline with the text. This function
    fixes this.
    :param infile: Input file to be edited
    :param infile_name: Path to file to be edited
    :param tag: tag to search for
    :return: The updated file
    """

    infile.seek(0)
    scratch_path = '{}.tmp'.format(infile_name)
    scratch = codecs.open(scratch_path, 'w', 'utf-8')

    for line in infile:
        # a "singlet" is a line holding nothing but the tag itself; turn its
        # line break into a space so it joins the following line
        is_singlet = re.match(tag, line) and len(line.split()) == 1
        if is_singlet:
            line = util.multiple_replace(line, {u'\r': u' ', u'\n': u' '})
            line = re.sub(u' +', u' ', line)
        scratch.write(line)

    infile.close()
    scratch.close()
    # swap the cleaned copy into place and hand back a fresh reader
    os.remove(infile_name)
    os.rename(scratch_path, infile_name)

    return codecs.open(infile_name, 'r', 'utf-8')
Beispiel #10
0
 def cleaner(my_text):
     """Strip @11/@77 markers (with up to three trailing Hebrew letters)
     and bare @33/@22 tags from every line."""
     scrub = {u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'', u'@(33|22)': u''}
     return [multiple_replace(segment, scrub, using_regex=True) for segment in my_text]
 def cleaner(my_text):
     """Strip @11/@77 markers (with up to three trailing Hebrew letters)
     and bare @33 tags from every line."""
     scrub = {u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'', u'@33': u''}
     cleaned = []
     for segment in my_text:
         cleaned.append(multiple_replace(segment, scrub, using_regex=True))
     return cleaned
def join_singlet_tags(infile, infile_name, tag):
    """
    Certain tags may appear on their own line when they need to be inline with the text. This function
    fixes this.
    :param infile: Input file to be edited
    :param infile_name: Path to file to be edited
    :param tag: tag to search for
    :return: The updated file
    """

    infile.seek(0)  # re-read from the top regardless of prior consumption
    temp_file_name = '{}.tmp'.format(infile_name)
    temp_file = codecs.open(temp_file_name, 'w', 'utf-8')
    replacements = {u'\r': u' ', u'\n': u' '}

    # clean up problematic lines then write them to temp file
    for line in infile:
        # a tag alone on its line: turn its line break into a space so it
        # joins the following line, then collapse repeated spaces
        if re.match(tag, line) and len(line.split()) == 1:
            line = util.multiple_replace(line, replacements)
            line = re.sub(u' +', u' ', line)
        temp_file.write(line)

    infile.close(), temp_file.close()
    # swap the cleaned copy into place and hand back a fresh reader
    os.remove(infile_name)
    os.rename(temp_file_name, infile_name)

    return codecs.open(infile_name, 'r', 'utf-8')
Beispiel #13
0
def clean_line(line):
    """Normalize one line of Hebrew source text: strip nikkud and
    punctuation, normalize quotes, drop "Alfas" bracket references,
    abbreviate "Tur ve-Shulchan Aruch", truncate at an "ayyen ba-Tur"
    citation, remove a matched "lo manu" clause, and resolve parentheses
    vs. brackets (brackets are treated as the corrected reading)."""
    line = strip_nikkud(line)
    replace_dict = {u'[.:\?]': u'', u'[”״]': u'"', u'[’׳]': u"'"} #note put \. in the file/ how can i check if it is right?
    line = multiple_replace(line, replace_dict, using_regex=True)
    # line = re.sub(u'[:\?]', '', line)
    # line = re.sub(u'”', u'"', line)
    reg_parentheses = re.compile(u'\((.*?)\)')
    reg_brackets = re.compile(u'\[(.*?)\]')
    # branch choice below reflects the line BEFORE the substitutions that follow
    in_per = reg_parentheses.search(line)
    in_bra = reg_brackets.search(line)
    reg_ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    reg_lo_manu = re.compile(u'''(?P<a>(\u05d0\u05da )?\u05dc\u05d0 \u05de\u05e0(.*?))(\u05e1\u05de"?\u05d2|\u05e8\u05de\u05d1"?\u05dd|\u05d8\u05d5\u05e8|\n)''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)
    f_ayyen = re.search(reg_ayyen_tur, line)
    f_lo_manu = re.search(reg_lo_manu, line)

    if f_ayyen:
        # cut the line at the start of the citation
        line = line[:f_ayyen.start()]
    if f_lo_manu:
        # NOTE(review): matched text is reused as a regex pattern; regex
        # metacharacters inside it would misfire — confirm inputs.
        line = re.sub(f_lo_manu.group('a'), u"", line)
    if in_per:
        if in_bra:
            clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
            clean = re.sub(reg_parentheses, '', clean)
        else:
            clean = re.sub(reg_parentheses, ur'\1', line)
    elif in_bra:
        clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
    else:
        clean = line
    return clean
def fifty_parse(lines, replace_dict):
    """Parse the "fifty" section into a perek -> piska jagged array.

    @05 opens a new perek; @13 opens a new piska within the current perek.
    Also dumps a debug XML tree to raavad_50.xml via ja_to_xml.
    :param lines: iterable of raw text lines (further split by split_lines).
    :param replace_dict: regex replacement map applied to every kept line.
    :return: nested list [perek][piska][line].
    """
    arr = []
    perek = []
    peska = []
    new_lines = []
    for line in lines:
        line = split_lines(line)
        new_lines.extend(line)

    for line in new_lines:
        # BUG FIX: use != -1 instead of "is not -1" — identity comparison
        # with an int literal only worked via CPython's small-int cache.
        if line.find(u'@05') != -1:
            if perek:
                # close the open piska and perek
                perek.append(peska)
                peska = []
                arr.append(perek)
                perek = []
        else:
            if (line.find(u'@13') != -1) and peska:
                perek.append(peska)
                peska = []
            line = multiple_replace(line, replace_dict, using_regex=True)
            peska.append(line)
    # flush whatever was still open at EOF
    perek.append(peska)
    arr.append(perek)
    ja_to_xml(arr, ['perek', 'piska', 'break'], 'raavad_50.xml')

    return arr
Beispiel #15
0
def raavad_perush_parse(lines, replace_dict):
    """Parse the Raavad commentary into perek -> mishna -> dibur lists.

    @00 opens a new perek and @22 a new mishna (both assumed to be alone on
    their lines); @31/@98 inside a mishna marks the start of a new dibur.
    Dumps a debug XML tree to raavad_text.xml.
    :param lines: iterable of raw text lines.
    :param replace_dict: regex replacement map applied to every dibur line.
    :return: nested list [perek][mishna][dibur].
    """
    arr = []
    first_p = True
    first_m = True
    first_d = True
    perek = []
    mishna = []
    dibur = []
    for line in lines:
        # BUG FIX: use != -1 instead of "is not -1" — identity comparison
        # with an int literal only worked via CPython's small-int cache.
        if line.find(u'@00') != -1:
            # perek
            if first_p:
                first_p = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                arr.append(perek)
                perek = []
                first_m = True  # since this is opening a new perek
                first_d = True
        elif line.find(u'@22') != -1:  # assumes no text shares a line with @22 or @00
            # mishna
            if first_m:
                first_m = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                first_d = True  # since this is opening a new mishna
        else:
            # this line belongs to the current dibur
            if re.search(u'@(31|98)', line) and not first_d:
                # probably starts a new dibur: close out the previous one
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
            else:
                if first_d:
                    first_d = False
            # segment ocr tag fixing
            line = multiple_replace(line, replace_dict, using_regex=True)
            dibur.append(line)
    # close whatever was still open at EOF
    dibur = ' '.join(dibur)
    mishna.append(dibur)
    perek.append(mishna)
    arr.append(perek)
    ja_to_xml(arr, ['perek', 'mishna', 'dibur'], 'raavad_text.xml')

    return arr
Beispiel #16
0
 def cleaner(my_text):
     """Clean OCR tags from each line: drop @11(<siman>) and bare
     @33/@77/@88/@99 tags, and bold the text between @55 and @66."""
     #todo: deal with @44 and @00 (@00 maybe should be only in smk base text? - ask Shmuel)
     # BUG FIX: the @55..@66 replacement was u'<b>\1</b>', in which \1 is
     # the control character \x01, not a backreference; it must be escaped
     # (sibling cleaners use ur'<b>\1</b>') to re-insert the captured text.
     replace_dict = {u'@11\([\u05d0-\u05ea]{0,3}\)': u'', u'@(33|77|88|99)': u'', u'@55(.*?)@66': u'<b>\\1</b>'}
     new = []
     for line in my_text:
         line = multiple_replace(line, replace_dict, using_regex=True)
         new.append(line)
     return new
Beispiel #17
0
def rewrtie_csv(fromcsv, newcsv, readColumnHeader, toWriteHeaders=None):
    """Rewrite a citation CSV: expand Tur abbreviations, extract Rambam and
    Tur references from each row's full text, merge in SMG links, and write
    the result to newcsv.

    :param fromcsv: path of the CSV to read (via fromCSV).
    :param newcsv: path of the CSV to write (via toCSV).
    :param readColumnHeader: column holding the citation text.
    :param toWriteHeaders: output headers; defaults to the input's headers.
    """
    headerNames, lines = fromCSV(fromcsv, u'fixed_{}'.format(readColumnHeader),
                                 readColumnHeader)
    if not toWriteHeaders:
        toWriteHeaders = headerNames
    # citation-extraction patterns keyed by source work
    regs = {
        u'rambam':
        re.compile(
            u'(\u05e8\u05de\u05d1"\u05dd.*?)(?:\.|\u05d5?\u05d8\u05d5\u05e8|\u05d5?\u05e1\u05de"?\u05d2|\n)'
        ),
        u'smg':
        re.compile(
            u'(\u05e1\u05de"?\u05d2.*?)(?:\.|\u05d5?\u05d8\u05d5\u05e8|\u05d5?\u05e8\u05de\u05d1"\u05dd|\n)'
        ),
        u'tur':
        re.compile(u'\u05d8\u05d5\u05e8(.*?)(?:\.|:|\n|@)')
    }
    # rows = OrderedDict()
    rows = []
    siman_cit_lines = 1
    prv_siman = 1
    for line_dict in lines:
        # expand abbreviated Tur section names to their full forms
        repdict = {
            u'טוא"ח': u'טור אורח חיים',
            u'טא"ח': u'טור אורח חיים',
            u'טי"ד': u'טור יורה דעה',
            u'טוי"ד': u'טור יורה דעה',
            u'טח"מ': u'טור חושן משפט',
            u'טוח"מ': u'טור חושן משפט'
        }
        line = multiple_replace(line_dict[u'full'], repdict)
        # line = line_dict[u'full']
        row_dict = {u'siman': line_dict[u'siman'], u'full': line}  # +u'.'}
        # count consecutive citation lines within the same siman
        if line_dict[u'siman'] == prv_siman:
            siman_cit_lines += 1
        else:
            siman_cit_lines = 1
        rambam = re.search(regs[u'rambam'], line)
        if rambam:
            rambam = sarsehu(rambam.group(1).strip())
            rambam = get_a_Ref_from_chopped_txt(rambam, VERBOSE=False)
            row_dict[u'rambam'] = rambam
        tur = re.search(regs[u'tur'], line)
        if tur:
            tur = tur.group(1).strip()
            tur = get_a_Ref_from_chopped_txt(u'טור, {}'.format(tur))
            row_dict[u'tur'] = tur
        rows.append(row_dict)
        # rows[(row_dict[u'siman'], siman_cit_lines)] = row_dict
        prv_siman = line_dict[u'siman']
    links, smgs = link_smg(
        u'fixed_{}'.format(readColumnHeader))  #link_smg(u'smg_smk_test')

    for i, (smk_siman, seg, smg) in enumerate(smgs):
        if smg:
            # SECURITY NOTE(review): eval() on CSV-sourced text executes
            # arbitrary code if the file is untrusted — consider
            # ast.literal_eval instead.
            rows[i][u'smg'] = eval(smg)
            rows[i][u'smk_segment'] = seg  #int(smk_siman)-1
    toCSV(newcsv, rows, toWriteHeaders)
    # NOTE(review): this nested cleaner is defined after the final toCSV and
    # never called here — presumably a paste leftover; confirm before removal.
    def cleaner(my_text):
        # bold @11..@12 and @33..@34 spans, unwrap @66..@67, drop @44
        replace_dict = {u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@66(.*?)@67': ur'\1',
                        u"@44": u""}

        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new
Beispiel #19
0
 def cleaner(my_text):
     """Turn @31/@32 markers into bold tags, then drop any other @NN markers."""
     bold_tags = {u'@31': u'<b>', u'@32': u'</b>'}
     return [re.sub(u'@[0-9]{2}', u'', multiple_replace(segment, bold_tags))
             for segment in my_text]
def raavad_perush_parse(lines, replace_dict):
    """Parse the Raavad commentary into perek -> mishna -> dibur lists.

    @00 opens a new perek and @22 a new mishna (both assumed to be alone on
    their lines); @31/@98 inside a mishna marks a new dibur. Dumps a debug
    XML tree to raavad_text.xml.
    :param lines: iterable of raw text lines.
    :param replace_dict: regex replacement map applied to every dibur line.
    :return: nested list [perek][mishna][dibur].
    """
    arr = []
    first_p = True
    first_m = True
    first_d = True
    perek = []
    mishna = []
    dibur = []
    for line in lines:
        # BUG FIX: use != -1 instead of "is not -1"; identity comparison
        # with an int literal only worked via CPython's small-int cache.
        if line.find(u'@00') != -1:
            # perek
            if first_p:
                first_p = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                arr.append(perek)
                perek = []
                first_m = True  # since this is opening a new perek
                first_d = True
        elif line.find(u'@22') != -1:  # assumes no text shares a line with @22 or @00
            # mishna
            if first_m:
                first_m = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                first_d = True  # since this is opening a new mishna
        else:
            # this line belongs to the current dibur
            if re.search(u'@(31|98)', line) and not first_d:
                # probably starts a new dibur: close out the previous one
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
            else:
                if first_d:
                    first_d = False
            # segment ocr tag fixing
            line = multiple_replace(line, replace_dict, using_regex=True)
            dibur.append(line)
    # close whatever was still open at EOF
    dibur = ' '.join(dibur)
    mishna.append(dibur)
    perek.append(mishna)
    arr.append(perek)
    ja_to_xml(arr, ['perek', 'mishna', 'dibur'], 'raavad_text.xml')

    return arr
Beispiel #21
0
def before_post_cleaner(ja, replace_dict):
    """Apply replace_dict to every segment of a 2-level jagged array,
    dropping any segment that still contains an empty <small></small> pair."""
    new_ja = []
    for siman in ja:
        kept = []
        for seg in siman:
            cleaned = multiple_replace(seg, replace_dict, using_regex=True)
            if re.search(u'<small></small>', cleaned):
                continue
            kept.append(cleaned)
        new_ja.append(kept)
    return new_ja
def before_post_cleaner(ja, replace_dict):
    """Apply replace_dict to every segment of a 2-level jagged array,
    dropping any segment that still contains an empty <small></small> pair."""
    new_ja = []
    new_siman = []
    for i, siman in enumerate(ja):
        for seg_number, seg in enumerate(siman):
            seg = multiple_replace(seg, replace_dict, using_regex=True)
            # discard segments left with an empty <small></small>
            if re.search(u'<small></small>', seg):
                continue
            new_siman.append(seg)
        new_ja.append(new_siman)
        new_siman = []
    return new_ja
Beispiel #23
0
def convert_smg(smg_str):
    """Rewrite legacy "Volume One/Two" Sefer Mitzvot Gadol citations into the
    current Negative/Positive/Rabbinic Commandments naming scheme."""
    conv_table = {
        u'Sefer Mitzvot Gadol, Volume One ': u'Sefer Mitzvot Gadol, Negative Commandments ',
        u'Sefer Mitzvot Gadol, Volume Two ': u'Sefer Mitzvot Gadol, Positive Commandments ',
        u'Sefer Mitzvot Gadol, Volume Two, Laws of Eruvin ': u'Sefer Mitzvot Gadol, Rabbinic Commandments, Laws of Eruvin ',
        u'Sefer Mitzvot Gadol, Volume Two, Laws of Mourning ': u'Sefer Mitzvot Gadol, Rabbinic Commandments, Laws of Mourning ',
        u"Sefer Mitzvot Gadol, Volume Two, Laws of Tisha B'Av ": u"Sefer Mitzvot Gadol, Rabbinic Commandments, Laws of Tisha B'Av ",
        u'Sefer Mitzvot Gadol, Volume Two, Laws of Megillah ': u'Sefer Mitzvot Gadol, Rabbinic Commandments, Laws of Megillah ',
        u'Sefer Mitzvot Gadol, Volume Two, Laws of Chanukah ': u'Sefer Mitzvot Gadol, Rabbinic Commandments, Laws of Chanukah ',
    }
    converted = multiple_replace(smg_str, conv_table, using_regex=True)
    return converted
Beispiel #24
0
def convert_smg(smg_str):
    """Rewrite legacy "Volume One/Two" Sefer Mitzvot Gadol citations into the
    current Negative/Positive/Rabbinic Commandments naming scheme."""
    conv_table = {
    u'Sefer Mitzvot Gadol, Volume One ' : u'Sefer Mitzvot Gadol, Negative Commandments ',
    u'Sefer Mitzvot Gadol, Volume Two ':u'Sefer Mitzvot Gadol, Positive Commandments ',
    u'Sefer Mitzvot Gadol, Volume Two, Laws of Eruvin ': u'Sefer Mitzvot Gadol, Rabbinic Commandments, Laws of Eruvin ',
    u'Sefer Mitzvot Gadol, Volume Two, Laws of Mourning ': u'Sefer Mitzvot Gadol, Rabbinic Commandments, Laws of Mourning ',
    u"Sefer Mitzvot Gadol, Volume Two, Laws of Tisha B'Av ": u"Sefer Mitzvot Gadol, Rabbinic Commandments, Laws of Tisha B'Av ",
    u'Sefer Mitzvot Gadol, Volume Two, Laws of Megillah ': u'Sefer Mitzvot Gadol, Rabbinic Commandments, Laws of Megillah ',
    u'Sefer Mitzvot Gadol, Volume Two, Laws of Chanukah ': u'Sefer Mitzvot Gadol, Rabbinic Commandments, Laws of Chanukah '
    }

    return multiple_replace(smg_str, conv_table, using_regex=True)
Beispiel #25
0
    def cleaner(my_text):
        replace_dict = {
            u'@11(.*?)@12': ur'<b>\1</b>',
            u'@33(.*?)@34': ur'<b>\1</b>',
            u'@66(.*?)@67': ur'\1',
            u"@44": u""
        }

        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new
Beispiel #26
0
def clean_and_align(section):
    """
    Take a section of a raw parse, clean out tags and align segments.
    :param section: List of strings representing a raw text segment.
    :return: List of strings, cleaned and properly structured.
    """

    def _scrub(raw):
        # drop @NN ocr tags and stray !/* markers, collapse space runs,
        # then remove line endings
        stripped = re.sub(u'@[0-9]{2}', u'', raw)
        stripped = re.sub(u'[!*]', u'', stripped)
        stripped = re.sub(u' +', u' ', stripped)
        return util.multiple_replace(stripped, {u'\n': u'', u'\r': u''})

    return [_scrub(line) for line in section]
def clean_and_align(section):
    """
    Take a section of a raw parse, clean out tags and align segments.
    :param section: List of strings representing a raw text segment.
    :return: List of strings, cleaned and properly structured.
    """

    cleaned = []

    for line in section:
        # drop @NN ocr tags and stray !/* markers, collapse runs of
        # spaces, then strip line endings
        line = re.sub(u'@[0-9]{2}', u'', line)
        line = re.sub(u'[!*]', u'', line)
        line = re.sub(u' +', u' ', line)
        line = util.multiple_replace(line, {u'\n': u'', u'\r': u''})
        cleaned.append(line)

    return cleaned
def rewrtie_csv(fromcsv, newcsv, readColumnHeader, toWriteHeaders=None):
    """Rewrite a citation CSV: expand Tur abbreviations, extract Rambam and
    Tur references from each row's full text, merge in SMG links, and write
    the result to newcsv.

    :param fromcsv: path of the CSV to read (via fromCSV).
    :param newcsv: path of the CSV to write (via toCSV).
    :param readColumnHeader: column holding the citation text.
    :param toWriteHeaders: output headers; defaults to the input's headers.
    """
    headerNames, lines = fromCSV(fromcsv, u'fixed_{}'.format(readColumnHeader), readColumnHeader)
    if not toWriteHeaders:
        toWriteHeaders = headerNames
    # citation-extraction patterns keyed by source work
    regs = {u'rambam': re.compile(
        u'(\u05e8\u05de\u05d1"\u05dd.*?)(?:\.|\u05d5?\u05d8\u05d5\u05e8|\u05d5?\u05e1\u05de"?\u05d2|\n)'),
            u'smg': re.compile(
                u'(\u05e1\u05de"?\u05d2.*?)(?:\.|\u05d5?\u05d8\u05d5\u05e8|\u05d5?\u05e8\u05de\u05d1"\u05dd|\n)'),
            u'tur': re.compile(u'\u05d8\u05d5\u05e8(.*?)(?:\.|:|\n|@)')}
    # rows = OrderedDict()
    rows = []
    siman_cit_lines = 1
    prv_siman = 1
    for line_dict in lines:
        # expand abbreviated Tur section names to their full forms
        repdict = {u'טוא"ח': u'טור אורח חיים', u'טא"ח': u'טור אורח חיים',u'טי"ד':u'טור יורה דעה', u'טוי"ד':u'טור יורה דעה',u'טח"מ': u'טור חושן משפט',u'טוח"מ': u'טור חושן משפט'}
        line = multiple_replace(line_dict[u'full'], repdict)
        # line = line_dict[u'full']
        row_dict = {u'siman': line_dict[u'siman'], u'full': line}# +u'.'}
        # count consecutive citation lines within the same siman
        if line_dict[u'siman'] == prv_siman:
            siman_cit_lines += 1
        else:
            siman_cit_lines = 1
        rambam = re.search(regs[u'rambam'], line)
        if rambam:
            rambam = sarsehu(rambam.group(1).strip())
            rambam = get_a_Ref_from_chopped_txt(rambam, VERBOSE=False)
            row_dict[u'rambam'] = rambam
        tur = re.search(regs[u'tur'], line)
        if tur:
            tur = tur.group(1).strip()
            tur = get_a_Ref_from_chopped_txt(u'טור, {}'.format(tur))
            row_dict[u'tur'] = tur
        rows.append(row_dict)
        # rows[(row_dict[u'siman'], siman_cit_lines)] = row_dict
        prv_siman = line_dict[u'siman']
    links, smgs = link_smg(u'fixed_{}'.format(readColumnHeader)) #link_smg(u'smg_smk_test')

    for i, (smk_siman, seg, smg) in enumerate(smgs):
        if smg:
            # SECURITY NOTE(review): eval() on CSV-sourced text executes
            # arbitrary code if the file is untrusted — consider
            # ast.literal_eval instead.
            rows[i][u'smg'] = eval(smg)
            rows[i][u'smk_segment'] = seg #int(smk_siman)-1
    toCSV(newcsv, rows, toWriteHeaders)
Beispiel #29
0
def clean(JA, replace_dict):
    """Apply replace_dict to every segment of a 2-deep JaggedArray.

    :param JA: JaggedArray object holding the text to be cleaned.
    :param replace_dict: mapping of regex pattern -> replacement.
    :return: a new JaggedArray with cleaned segments; also dumps
        clean_smk.xml for inspection.
    """
    cleaned = [
        [multiple_replace(segment, replace_dict, using_regex=True) for segment in letter]
        for letter in JA.array()
    ]
    ja_to_xml(cleaned, ['letter', 'segments'], 'clean_smk.xml')
    return JaggedArray(cleaned)
def threty_two_parse(lines, replace_dict, str):
    """Parse the 32-netivot section: each @13/@03 tag closes the previous
    netiv and opens a new one; every line is cleaned with replace_dict.
    Dumps a debug XML list to <str>_32.xml.

    NOTE(review): the third parameter shadows the builtin ``str`` — renaming
    it would break keyword callers, so it is left as-is.
    """
    # start the parsing of 32 netivot
    arr = []
    netiv = []
    first = True
    for line in lines:
        if re.search(u'@(13|03)', line):# and (netiv):
            if first:
                first = False
            else:
                # close out the previous netiv
                netiv = ' '.join(netiv)
                arr.append(netiv)
                netiv = []
        line = multiple_replace(line, replace_dict, using_regex=True)
        netiv.append(line.strip())
    # flush the final netiv at EOF
    netiv = ' '.join(netiv)
    arr.append(netiv)
    ja_to_xml(arr, ['netiv'], '{}{}'.format(str,'_32.xml'))

    return arr
Beispiel #31
0
def threty_two_parse(lines, replace_dict, str):
    """Parse the 32-netivot section: each @13/@03 tag closes the previous
    netiv and opens a new one; every line is cleaned with replace_dict.
    Dumps a debug XML list to <str>_32.xml.

    NOTE(review): the third parameter shadows the builtin ``str`` — renaming
    it would break keyword callers, so it is left as-is.
    """
    # start the parsing of 32 netivot
    arr = []
    netiv = []
    first = True
    for line in lines:
        if re.search(u'@(13|03)', line):  # and (netiv):
            if first:
                first = False
            else:
                # close out the previous netiv
                netiv = ' '.join(netiv)
                arr.append(netiv)
                netiv = []
        line = multiple_replace(line, replace_dict, using_regex=True)
        netiv.append(line.strip())
    # flush the final netiv at EOF
    netiv = ' '.join(netiv)
    arr.append(netiv)
    ja_to_xml(arr, ['netiv'], '{}{}'.format(str, '_32.xml'))

    return arr
Beispiel #32
0
def ari_parse():
    """Parse yitzira_mishna.txt into a perek -> mishna jagged array.

    The file opens with a legend terminated by a blank line; after that,
    @00 at line start opens a new perek and @22 a new mishna. All other
    lines are OCR-cleaned and accumulated into the current mishna.
    :return: nested list [perek][mishna].
    """
    with codecs.open("yitzira_mishna.txt", "r", "utf-8") as fp:
        lines = fp.readlines()
    parsed = []
    perek = []
    mishna = []
    starting = None
    # dictionary for line ocr tag fixing
    replace_dict = {
        u"@(44)": u"<small>",
        u"@(45)": u"</small>",  # bava in parenthesis
        ur"(@(11|12|66|67)|\[\*.*?\])": u"",  # ocr tags that are not relevant (including erasing footnotes)
    }
    # check if we got to the end of the legend and change to started
    for line_num, line in enumerate(lines):
        if line == u"\n":
            starting = line_num + 1
            break

    for line in lines[starting:]:
        if line.find(u"@00") == 0:
            if perek:
                mishna = " ".join(mishna)
                perek.append(mishna)
                mishna = []
                parsed.append(perek)
                perek = []
        elif line.find(u"@22") == 0:
            if mishna:
                # NOTE(review): this flush joins with "" while the perek
                # flush and final flush join with " " — looks inconsistent;
                # confirm whether it is intentional.
                mishna = "".join(mishna)
                perek.append(mishna)
                mishna = []
        else:
            line = multiple_replace(line, replace_dict, using_regex=True)
            mishna.append(line.strip())

    # close whatever was still open at EOF
    mishna = " ".join(mishna)
    perek.append(mishna)
    parsed.append(perek)
    # ja_to_xml(parsed,['perek', 'mishna'])
    return parsed
Beispiel #33
0
def ari_parse():
    """Parse yitzira_mishna.txt into a perek -> mishna jagged array.

    The file opens with a legend terminated by a blank line; after that,
    @00 at line start opens a new perek and @22 a new mishna. All other
    lines are OCR-cleaned and accumulated into the current mishna.
    :return: nested list [perek][mishna].
    """
    with codecs.open('yitzira_mishna.txt', 'r', 'utf-8') as fp:
        lines = fp.readlines()
    parsed = []
    perek = []
    mishna = []
    starting = None
    # dictionary for line ocr tag fixing
    replace_dict = {u'@(44)': u'<small>', u'@(45)':u'</small>',  # bava in parenthesis
                    ur'(@(11|12|66|67)|\[\*.*?\])': u''  # ocr tags that are not relevant (including erasing footnotes)
                    }
    # check if we got to the end of the legend and change to started
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break

    for line in lines[starting:]:
        if line.find(u'@00') == 0:
            if perek:
                mishna = ' '.join(mishna)
                perek.append(mishna)
                mishna = []
                parsed.append(perek)
                perek = []
        elif line.find(u'@22') == 0:
            if mishna:
                # NOTE(review): this flush joins with '' while the perek
                # flush and final flush join with ' ' — looks inconsistent;
                # confirm whether it is intentional.
                mishna = ''.join(mishna)
                perek.append(mishna)
                mishna = []
        else:
            line = multiple_replace(line, replace_dict, using_regex=True)
            mishna.append(line.strip())

    # close whatever was still open at EOF
    mishna = ' '.join(mishna)
    perek.append(mishna)
    parsed.append(perek)
    # ja_to_xml(parsed,['perek', 'mishna'])
    return parsed
Beispiel #34
0
def structure_boaz(chapter):
    """Group a chapter's lines into comments.

    Each @22 line starts a new comment; @23 lines are appended with their
    tag rewritten to <br>; @99 lines and empty lines are dropped; any other
    line is appended to the current comment with a separating space.

    NOTE(review): if the first kept line is not an @22 comment, parsed[-1]
    raises IndexError — confirm the input always opens with @22.
    """

    new_comment = re.compile(u'@22')
    break_tag = re.compile(u'@23')
    skip_tag = re.compile(u'@99')
    parsed = []

    for line in chapter:
        line = util.multiple_replace(line, {u'\n': u'', u'\r': u''})

        if new_comment.match(line):
            parsed.append(line)

        elif break_tag.match(line):
            # replaces the literal tag text '@23' with an html line break
            line = line.replace(break_tag.pattern, u'<br>')
            parsed[-1] += line

        elif skip_tag.match(line) or line == u'':
            continue

        else:
            parsed[-1] += u' {}'.format(line)

    return parsed
Beispiel #35
0
def fifty_parse(lines, replace_dict):
    """Parse the "fifty" section into a perek -> piska jagged array.

    @05 opens a new perek; @13 opens a new piska within the current perek.
    :param lines: iterable of raw text lines (further split by split_lines).
    :param replace_dict: regex replacement map applied to every kept line.
    :return: nested list [perek][piska][line].
    """
    arr = []
    perek = []
    peska = []
    new_lines = []
    for line in lines:
        line = split_lines(line)
        new_lines.extend(line)

    for line in new_lines:
        # BUG FIX: use != -1 instead of "is not -1" — identity comparison
        # with an int literal only worked via CPython's small-int cache.
        if line.find(u'@05') != -1:
            if perek:
                # close the open piska and perek
                perek.append(peska)
                peska = []
                arr.append(perek)
                perek = []
        else:
            if (line.find(u'@13') != -1) and peska:
                perek.append(peska)
                peska = []
            line = multiple_replace(line, replace_dict, using_regex=True)
            peska.append(line)
    # BUG FIX: flush the trailing piska/perek and return the result — this
    # copy fell off the end, dropping the last perek and returning None
    # (compare the complete version of this function elsewhere in the file).
    perek.append(peska)
    arr.append(perek)
    return arr
Beispiel #36
0
def text_parse():
    # open, read, close the original txt file
    with codecs.open('yitzira_gra.txt', 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    # init section lists and flags
    parsed = []
    perek = []
    mishna = []
    dibur = []
    first_p = True  # first perek flag
    first_m = True  # first mishna flag
    first_d = True  # first dibur flag
    ofen = False # 'ofen' flag

    # dictionary for line ocr tag fixing
    replace_dict = {u'@03': u'<b>', u'@04': u'</b><br>',  # title 'Ofen' in the gra's commentary
                    u'@11': u'',  # not necessary ocr tag
                    u'@31': u'<b>', u'@32': u'</b>',  # bold dibur hamatchil
                    u'@44': u'<b>', u'@45': u'</b>',  # was bold in text
                    u'@98': u'<small>', u'@99': u'</small>',  # the slik at the end
                    ur'\*\[(.*?)\]': ur'<small>[\1]</small>'  # footnotes
                    }
    # loop on lines and creat the jagged array
    for line in lines[starting:]:
        if line.find(u'@00') is not -1:
            # perek
            if first_p:
                first_p = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                parsed.append(perek)
                perek = []
                first_m = True  # since this is opening a new perek

        elif line.find(u'@22') == 0:  # notice that this parsing is given that there is no text on same line with @22 and @00
            # mishna
            if first_m:
                first_m = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                first_d = True  # since this is opening a new mishna
        else: # this line is going to be part of the dibur
            # Dibur Hamatchil
            if re.search(u'@(03|31|98)', line):  # probably start a new dibur
                 if (not ofen) and (not first_d): # prob close prev dibur
                    dibur = ' '.join(dibur)
                    mishna.append(dibur)
                    dibur = []
                 else:
                    if ofen:
                        ofen = False
                    if first_d:
                        first_d = False
            if re.search(u'@03', line):
                ofen = True
            # segment ocr tag fixing
            line = multiple_replace(line, replace_dict, using_regex = True)
            dibur.append(line)

    # once reached the end close all what was opened
    dibur = ' '.join(dibur)
    mishna.append(dibur)
    perek.append(mishna)
    parsed.append(perek)
    # ja_to_xml(parsed,['perek','mishna','dibur'],filename = 'gra.xml')
    return parsed
Beispiel #37
0
def jaggedarray_from_file(input_file, perek_tag, mishna_tag, skip_tag):
    """
    Parse a text file into a 2D jagged array (chapters -> mishnayot).

    :param input_file: Open file (or any iterable of unicode lines) to parse.
    :param perek_tag: Regex used to identify the start of a new perek (chapter).
    :param mishna_tag: Regex used to identify the start of the next mishna.
    :param skip_tag: Regex; any line matching it is skipped entirely.
    :return: A 2D jaggedArray to match Sefaria's format. Rough, will require
        more processing.
    """

    chapters, mishnayot, current = [], [], []
    found_first_chapter = False

    for line in input_file:

        # Lines matching skip_tag carry no content for the jagged array.
        if re.search(skip_tag, line):
            continue

        new_chapter = re.search(perek_tag, line)
        new_mishna = re.search(mishna_tag, line)

        # A perek and a mishna tag on the same line is malformed input we
        # cannot disambiguate -- report and bail out.
        if new_chapter and new_mishna:
            print('Mishna starts on same line as chapter\n')
            print('{}\n\n'.format(new_chapter.group()))
            input_file.close()
            sys.exit(1)

        # Chapter tag: flush the pending mishna and chapter (nothing is
        # pending yet when this is the very first chapter).
        if new_chapter:
            if found_first_chapter:
                if current:
                    mishnayot.append(u' '.join(current).lstrip())
                    current = []
                chapters.append(mishnayot)
                mishnayot = []
            else:
                found_first_chapter = True
            continue

        # Text before the first chapter tag is ignored.
        if found_first_chapter:
            if new_mishna:
                # New mishna: flush the previous one, then start accumulating
                # with the tag itself stripped out of the line.
                if current:
                    mishnayot.append(u' '.join(current).lstrip())
                current = [
                    util.multiple_replace(line, {
                        u'\n': u'',
                        u'\r': u'',
                        new_mishna.group(): u''
                    })
                ]
            else:
                # Strip both \n and \r for consistency with the branch above
                # (original stripped only \n here, leaving stray \r on CRLF
                # input).
                current.append(
                    util.multiple_replace(line, {u'\n': u'', u'\r': u''}))

    # End of input: flush whatever is still pending. Join with spaces like
    # every other flush (the original used u''.join here, gluing the last
    # mishna's lines together), and only flush if a chapter was actually
    # opened (the original appended an empty u'' chapter for tagless input).
    if found_first_chapter:
        if current:
            mishnayot.append(u' '.join(current).lstrip())
        chapters.append(mishnayot)

    return chapters
Beispiel #38
0
def text_parse():
    """
    Parse 'yitzira_gra.txt' into a jagged array parsed[perek][mishna][dibur].

    The file starts with a legend terminated by a blank line; parsing begins
    after it. OCR tags mark structure: @00 opens a perek, @22 opens a mishna
    (at line start only), and @03/@31/@98 usually open a new dibur. Remaining
    OCR tags on content lines are rewritten to HTML via replace_dict.

    :return: Nested lists of parsed text segments.
    """
    # Read the whole source file (UTF-8) into memory.
    with codecs.open('yitzira_gra.txt', 'r', 'utf-8') as fp:
        lines = fp.readlines()

    # Find the first blank line: everything before it is the legend.
    # NOTE(review): if no blank line exists, starting stays None and the
    # slice below silently falls back to the whole file (lines[None:]).
    starting = None
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break

    # Section accumulators and "first occurrence" flags.
    parsed = []
    perek = []
    mishna = []
    dibur = []
    first_p = True  # first perek flag
    first_m = True  # first mishna flag
    first_d = True  # first dibur flag
    ofen = False  # 'ofen' flag

    # Dictionary for line ocr tag fixing (applied with using_regex=True).
    replace_dict = {
        u'@03': u'<b>',
        u'@04': u'</b><br>',  # title 'Ofen' in the gra's commentary
        u'@11': u'',  # not necessary ocr tag
        u'@31': u'<b>',
        u'@32': u'</b>',  # bold dibur hamatchil
        u'@44': u'<b>',
        u'@45': u'</b>',  # was bold in text
        u'@98': u'<small>',
        u'@99': u'</small>',  # the slik at the end
        # footnotes -- escaped-u form of ur'\*\[(.*?)\]': ur'<small>[\1]</small>'
        u'\\*\\[(.*?)\\]': u'<small>[\\1]</small>'
    }
    # Loop on lines and create the jagged array.
    for line in lines[starting:]:
        # '!= -1' -- the original used 'is not -1', an identity comparison
        # on ints that only worked via CPython small-int caching.
        if line.find(u'@00') != -1:
            # perek: close the previous one unless this is the first.
            if first_p:
                first_p = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                parsed.append(perek)
                perek = []
                first_m = True  # since this is opening a new perek

        elif line.find(u'@22') == 0:
            # mishna -- parsing assumes no text shares a line with @22 or @00.
            if first_m:
                first_m = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                first_d = True  # since this is opening a new mishna
        else:  # this line is going to be part of the dibur
            # Dibur Hamatchil
            if re.search(u'@(03|31|98)', line):  # probably start a new dibur
                if (not ofen) and (not first_d):  # prob close prev dibur
                    dibur = ' '.join(dibur)
                    mishna.append(dibur)
                    dibur = []
                else:
                    if ofen:
                        ofen = False
                    if first_d:
                        first_d = False
            if re.search(u'@03', line):
                ofen = True
            # segment ocr tag fixing
            line = multiple_replace(line, replace_dict, using_regex=True)
            dibur.append(line)

    # Once the end is reached, close all that was opened.
    dibur = ' '.join(dibur)
    mishna.append(dibur)
    perek.append(mishna)
    parsed.append(perek)
    # ja_to_xml(parsed,['perek','mishna','dibur'],filename = 'gra.xml')
    return parsed
Beispiel #39
0
    def _extract_important_data(self):
        """Scan self.lines (HTML snippets) and build a jagged structure
        books -> parashot -> sections -> segments, plus the book and parsha
        names encountered along the way.

        :return: dict with keys 'book names', 'parsha names', 'full_text'.
        """

        book_names, parsha_names = [], []
        # None means "this nesting level has not been opened yet", which is
        # distinguished from [] (opened but still empty) by the flush checks.
        books, parashot, sections, segments = [], None, None, None

        def start_condition(html_fragment):
            # True once the underlined Genesis header is seen; everything
            # before it is preamble and gets skipped.

            soup = html_fragment
            if soup.u is not None:
                if soup.u.text == u'ספר בראשית':
                    return True
            return False

        def text_quote(html_fragment):
            # A <div> in the fragment marks a quoted text segment.
            soup = html_fragment
            if soup.div is None:
                return False
            else:
                return True

        def new_parsha(html_frament):
            # An underlined element containing u'פרשת ' opens a new parsha.
            soup = html_frament
            if soup.u is None:
                return False
            else:
                if re.search(u'פרשת ', soup.u.text):
                    return True
                else:
                    return False

        def new_book(html_frament):
            # An underlined element containing u'ספר' opens a new book.
            # NOTE(review): the main loop checks new_book before new_parsha,
            # so a parsha header containing this word would be misclassified
            # -- presumably the input never does; confirm against the data.
            soup = html_frament
            if soup.u is None:
                return False
            else:
                if re.search(u'ספר', soup.u.text):
                    return True
                else:
                    return False

        text_started = False
        for line in self.lines:
            # Strip line endings; only lines that start with '<B' are parsed.
            line = multiple_replace(line, {u'\n': u'', u'\r': u''})
            if re.match(u'<B', line) is None:
                continue

            soup = BeautifulSoup(line, 'html5lib')
            if text_started:
                if new_book(soup):
                    # Record the book name, then flush the finished book
                    # (skipped for the very first book, where parashot is
                    # still None only before start_condition fires).
                    book_names.append(soup.u.text)
                    if parashot is not None:
                        sections.append(segments)
                        parashot.append(sections)
                        books.append(parashot)

                    parashot, sections, segments = [], None, None

                elif new_parsha(soup):
                    # New parsha: flush the finished section, then reset.
                    parsha_names.append(soup.u.text)
                    if sections is not None:
                        sections.append(segments)
                        parashot.append(sections)

                    sections, segments = [], None

                    # A parsha header may carry a quote on the same line.
                    # NOTE(review): segments was just reset to None, so the
                    # inner flush below can never fire here -- looks dead
                    # code; confirm before relying on it.
                    if text_quote(soup):
                        if segments is not None:
                            sections.append(segments)
                        segments = [soup.div.text]

                elif text_quote(soup):
                    # A quote opens a new segment list; flush the previous.
                    if segments is not None:
                        sections.append(segments)
                    segments = [soup.div.text]

                else:
                    # Plain text: append to the currently open segment list.
                    if soup.text == u'':
                        continue
                    else:
                        segments.append(soup.text)

            else:
                # Still in the preamble: wait for the Genesis header.
                text_started = start_condition(soup)
                if text_started:
                    book_names.append(u'ספר בראשית')
                    parashot = []
        else:
            # The loop never breaks, so this for-else always runs: flush
            # the last open book. NOTE(review): raises AttributeError on
            # None if the start condition (or any quote) never appeared --
            # presumably well-formed input guarantees both; confirm.
            sections.append(segments)
            parashot.append(sections)
            books.append(parashot)
        return {
            'book names': book_names,
            'parsha names': parsha_names,
            'full_text': books
        }