Example #1
0
def parse():
    """Parse pardes_rimonim.html into a JaggedArray indexed [gate][chapter].

    Scans the file for gate/chapter header lines, accumulating each
    chapter's text and storing it at the 0-based [gate, chapter] slot of
    the root JaggedArray, warning when gate/chapter numbers skip.

    NOTE(review): as visible here the loop body appears truncated — lines
    that are neither headers nor images are never appended to whole_text
    and the final chapter is never flushed; confirm against the full file.
    """
    # Source file is encoded in windows-1255 (Hebrew code page).
    with codecs.open('pardes_rimonim.html', 'r', 'windows-1255') as infile:
        lines = infile.readlines()
    gate, chapter, whole_text = -1, -1, []
    root = JaggedArray([[]])
    found_beginning = False
    # Header: "<b>שער <numeral> פרק <numeral>" — captures gate and chapter
    # as 1-2 letter Hebrew numerals.
    beginning = re.compile(
        ur'^<b>\u05e9\u05e2\u05e8 ([\u05d0-\u05ea]{1,2}) \u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})'
    )

    for line in lines:
        match = beginning.search(line)
        if match:
            if found_beginning:
                # Flush the previous chapter before starting a new one.
                if re.search(ur'^\u05e4\u05e8\u05e7',
                             whole_text[0]):  # strip out some unnecessary text
                    root.set_element([gate, chapter], whole_text[1:], pad=[])
                else:
                    root.set_element([gate, chapter], whole_text, pad=[])
                whole_text = []
            else:
                found_beginning = True
            # Hebrew numerals -> 0-based indices.
            new_gate, new_chapter = getGematria(
                match.group(1)) - 1, getGematria(match.group(2)) - 1
            # Warn when a gate or chapter number was skipped in the source.
            if new_gate - gate > 1 or new_chapter - chapter > 1:
                print 'skip found at Gate {} Chapter {}'.format(
                    new_gate + 1, new_chapter + 1)
            gate, chapter = new_gate, new_chapter

        elif found_beginning:
            if re.search(ur'<img', line):
                # Attach image markup to the most recent text segment.
                whole_text[-1] = add_image(line, whole_text[-1])
                continue
Example #2
0
def link_semak_raph(smk_ja, raph_ja):
    """Print the alignment between @55 letter markers in the SMK jagged
    array and the opening segment of each letter in the Raph jagged array,
    followed by a count of mismatches.
    """
    #if segment in smak_ja has a @55[\u05d0-\u05ea]{0,3} extract the letter and match it to the segment in the ja_raph
    #by running on the ja_raph segments
    smk_raph = []
    raph_letter = []
    # Collect a [letter, segment-indices] pair for every @55 marker.
    for seg in traverse_ja(smk_ja):
        if re.search(u'@55[\u05d0-\u05ea]{0,3}', seg['data']):
            for letter in re.findall(u'@55([\u05d0-\u05ea]{0,3})', seg['data']):
                # smk_raph.append([seg['indices'][:], letter])
                smk_raph.append([letter, seg['indices']])
    last = [-1, -1]
    # Keep only the first Raph segment of each (siman, letter) pair.
    for seg in traverse_ja(raph_ja):
        if seg['indices'][0:2] == last[0:2]:
            continue
        else:
            raph_letter.append(seg)
        last = seg['indices']

    problem_count = 0
    # A marker matches when its gematria equals the 1-based letter index.
    for smk, raph in zip(smk_raph, raph_letter):
        if getGematria(smk[0]) == (raph['indices'][1]+1):
            print getGematria(smk[0]), raph['indices'][1]+1, \
                [item+1 for item in smk[1]], [item +1 for item in raph['indices']]
        else:
            problem_count +=1
            print 'problem:', getGematria(smk[0]), raph['indices'][1]+1,\
                [item+1 for item in smk[1]], [item +1 for item in raph['indices']]
    print problem_count
Example #3
0
def parse():
    """Parse pardes_rimonim.html into a JaggedArray indexed [gate][chapter].

    Scans the file for gate/chapter header lines, accumulating each
    chapter's text and storing it at the 0-based [gate, chapter] slot of
    the root JaggedArray, warning when gate/chapter numbers skip.

    NOTE(review): as visible here the loop body appears truncated — lines
    that are neither headers nor images are never appended to whole_text
    and the final chapter is never flushed; confirm against the full file.
    """
    # Source file is encoded in windows-1255 (Hebrew code page).
    with codecs.open('pardes_rimonim.html', 'r', 'windows-1255') as infile:
        lines = infile.readlines()
    gate, chapter, whole_text = -1, -1, []
    root = JaggedArray([[]])
    found_beginning = False
    # Header: "<b>שער <numeral> פרק <numeral>" with 1-2 letter Hebrew numerals.
    beginning = re.compile(ur'^<b>\u05e9\u05e2\u05e8 ([\u05d0-\u05ea]{1,2}) \u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})')

    for line in lines:
        match = beginning.search(line)
        if match:
            if found_beginning:
                # Flush the previous chapter before starting a new one.
                if re.search(ur'^\u05e4\u05e8\u05e7', whole_text[0]):  # strip out some unnecessary text
                    root.set_element([gate, chapter], whole_text[1:], pad=[])
                else:
                    root.set_element([gate, chapter], whole_text, pad=[])
                whole_text = []
            else:
                found_beginning = True
            # Hebrew numerals -> 0-based indices.
            new_gate, new_chapter = getGematria(match.group(1))-1, getGematria(match.group(2))-1
            # Warn when a gate or chapter number was skipped in the source.
            if new_gate - gate > 1 or new_chapter - chapter > 1:
                print 'skip found at Gate {} Chapter {}'.format(new_gate+1, new_chapter+1)
            gate, chapter = new_gate, new_chapter

        elif found_beginning:
            if re.search(ur'<img', line):
                # Attach image markup to the most recent text segment.
                whole_text[-1] = add_image(line, whole_text[-1])
                continue
def scrape_wiki():
    url = u"https://he.wikipedia.org/wiki/%D7%9E%D7%A0%D7%99%D7%99%D7%9F_%D7%94%D7%9E%D7%A6%D7%95%D7%95%D7%AA_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%A1%D7%A4%D7%A8_%D7%94%D7%97%D7%99%D7%A0%D7%95%D7%9A"

    page = requests.get(url)
    soup_body = BeautifulSoup(page.text, "lxml")
    tables = soup_body.select(".mw-parser-output > table")

    pairs = []
    links = []

    for table in tables:
        table_tr = table.select("tr")
        for col in table_tr:
            pairs.append((col.contents[1].text.strip(), re.sub(u'</?td>', u'', col.contents[-1].text).strip()))

    for pair in pairs:
        if re.search(u'ספר|מספר', pair[0]):
            continue
        neg_pos = u"Negative Mitzvot" if re.search(u"לאו", pair[1]) else u'Positive Mitzvot'
        rambam = getGematria(re.sub(u'עשה|לאו', u'', pair[1]).strip())
        chinukh = getGematria(pair[0])
        print chinukh, rambam
        chinukh_simanlen = len(Ref(u'Sefer HaChinukh.{}'.format(chinukh)).all_segment_refs())
        print neg_pos
        link = ({"refs": [
            u'Sefer HaChinukh.{}.{}-{}'.format(chinukh, 1, chinukh_simanlen),
            u'Mishneh Torah, {}.{}'.format(neg_pos, rambam)
        ],
            "type": "Sifrei Mitzvot",
            "auto": True,
            "generated_by": "chinukh_rambam_sfm_linker"  # _sfm_linker what is this parametor intended to be?
        })
        print link['refs']
        links.append(link)
        return links
Example #5
0
def parse_Raph_simanim(alinged_list):
    '''
    note: although there is (not often) a differentiation in the original txt file,
    raph letters can be divided into smaller segments. In this code we combined those segments.
    returning, every raph letter as a line.

    Groups consecutive entries sharing the same 'siman' numeral into one
    list, padding skipped simanim (per gematria value) with empty lists,
    and dumps the resulting jagged array to raph_simanim.xml.
    '''
    ja = []
    siman = []
    # `i` counts how many simanim were skipped between groups.
    i = 1
    prev_siman = u'א'
    for obj in alinged_list:
        if obj['siman'] == prev_siman:
          siman.append(obj['raph'])
          continue
        else:
            # Siman changed: flush the accumulated group.
            ja.append(siman)
            # Pad any skipped siman numbers with empty lists.
            while getGematria(obj['siman']) != (getGematria(prev_siman) + i):
                ja.append([])
                i += 1
            i = 1
            siman = []
            siman.append(obj['raph'])
        prev_siman = obj['siman']
    # Flush the final group.
    ja.append(siman)
    ja_to_xml(ja, ['siman', 'letter'], 'raph_simanim.xml')
    return ja
def xmlify(filename):
    """
    create an xml representation of the text files

    Chapters are delimited by "@00פרק <numeral>" tags and verses by
    "@22<numeral>" tags; within a verse, each line after the first becomes
    a numbered <comment> element. Output goes to ./xml/<name>.xml.
    :param filename: str name of file
    """
    with codecs.open(filename, 'r', 'utf-8') as infile:
        raw_rambam = infile.read()

    # Chapter numbers (from the @00 headers) and the text between headers
    # must pair up one to one.
    chap_index = [getGematria(i.group(1)) for i in re.finditer(ur'@00\u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})', raw_rambam)]
    chapters = re.split(ur'@00\u05e4\u05e8\u05e7 [\u05d0-\u05ea]{1,2}', raw_rambam)[1:]
    assert len(chap_index) == len(chapters)

    soup = BeautifulSoup(u'<root></root>', 'xml')
    for index, chapter in zip(chap_index, chapters):
        x_chapter = soup.new_tag('chapter', num=unicode(index))
        soup.root.append(x_chapter)

        # Same pairing for verse numbers within the chapter.
        v_indices = [getGematria(i.group(1)) for i in re.finditer(ur'@22([\u05d0-\u05ea]{1,2})', chapter)]
        verses = re.split(ur'@22[\u05d0-\u05ea]{1,2}', chapter)[1:]
        assert len(v_indices) == len(verses)

        for v_index, verse in zip(v_indices, verses):
            x_verse = soup.new_tag('verse', num=unicode(v_index))
            comments = verse.splitlines()
            # Skip the first line (the verse header remnant); number the rest.
            for i, comment in enumerate(comments[1:]):
                x_comment = soup.new_tag('comment', num=unicode(i+1))
                x_comment.append(comment)
                x_verse.append(x_comment)

            x_chapter.append(x_verse)
    with codecs.open('./xml/{}'.format(filename.replace('.txt', '.xml')), 'w', 'utf-8') as outfile:
        outfile.write(unicode(soup.prettify()))
Example #7
0
def parse_Raph_simanim(alinged_list):
    '''
    Combine consecutive entries that share the same 'siman' numeral into a
    single row, so every raph letter comes out as one line. (The original
    txt file occasionally splits a letter into smaller segments.)

    Skipped simanim are padded with empty lists according to their
    gematria values. The resulting jagged array is also dumped to
    raph_simanim.xml before being returned.
    '''
    result = []
    current = []
    gap = 1
    previous = u'א'
    for entry in alinged_list:
        # Same siman as before: just extend the current group.
        if entry['siman'] == previous:
            current.append(entry['raph'])
            continue
        # Siman changed: flush the group, then pad any skipped simanim.
        result.append(current)
        while getGematria(entry['siman']) != (getGematria(previous) + gap):
            result.append([])
            gap += 1
        gap = 1
        current = [entry['raph']]
        previous = entry['siman']
    # Flush the final group.
    result.append(current)
    ja_to_xml(result, ['siman', 'letter'], 'raph_simanim.xml')
    return result
def siman_smk_exctractor(smk_text):
    """Extract the list of siman numbers referenced in *smk_text*.

    Handles single Hebrew numerals, hyphenated ranges (expanded
    inclusively), and vav-prefixed numerals (via check_vav). Returns early
    with whatever was collected so far when a word is neither a Hebrew
    number nor a recognized vav form.
    """
    split = re.split(u'\s', smk_text)
    simanim = []
    for word in split:
        # Skip empty tokens and the literal words 'siman' / 'seif'.
        if not word or word == u'סימן' or word == u'סעיף':
            continue
        word = re.sub(u"[;.,']", u"", word)
        if re.search(u'-', word):
            # Range form: expand start-end inclusively.
            borders = re.search(u"(.*?)-(.*)", word)
            start = getGematria(borders.group(1))
            end = getGematria(borders.group(2))
            for siman in range(start, end+1):
                simanim.append(siman)
        # NOTE(review): a range word falls through to the checks below and,
        # not being a Hebrew number itself, may trigger the early return —
        # a `continue` after the range loop looks intended; confirm.
        if not is_hebrew_number(word):
            if not check_vav(word):
                # print smk_text, simanim
                return simanim
            else:
                simanim.append(check_vav(word))
        else:
            smk_siman = getGematria(word)
            simanim.append(smk_siman)
    # print smk_text, simanim
    return simanim
Example #9
0
def link_semak_raph(smk_ja, raph_ja):
    #if segment in smak_ja has a @55[\u05d0-\u05ea]{0,3} extract the letter and match it to the segment in the ja_raph
    #by running on the ja_raph segments
    smk_raph = []
    raph_letter = []
    for seg in traverse_ja(smk_ja):
        if re.search(u'@55[\u05d0-\u05ea]{0,3}', seg['data']):
            for letter in re.findall(u'@55([\u05d0-\u05ea]{0,3})',
                                     seg['data']):
                # smk_raph.append([seg['indices'][:], letter])
                smk_raph.append([letter, seg['indices']])
    last = [-1, -1]
    for seg in traverse_ja(raph_ja):
        if seg['indices'][0:2] == last[0:2]:
            continue
        else:
            raph_letter.append(seg)
        last = seg['indices']

    problem_count = 0
    for smk, raph in zip(smk_raph, raph_letter):
        if getGematria(smk[0]) == (raph['indices'][1] + 1):
            print getGematria(smk[0]), raph['indices'][1]+1, \
                [item+1 for item in smk[1]], [item +1 for item in raph['indices']]
        else:
            problem_count += 1
            print 'problem:', getGematria(smk[0]), raph['indices'][1]+1,\
                [item+1 for item in smk[1]], [item +1 for item in raph['indices']]
    print problem_count
Example #10
0
 def chapter_verse(text_fragment):
     """Pull 1-based chapter and verse numbers out of a marked-up fragment.

     The fragment is expected to contain a "B...-<chapter>-{<verse>}"
     marker where both numerals are 1-2 Hebrew letters.

     :param text_fragment: text containing the marker.
     :return: dict with integer 'chapter' and 'verse' values.
     """
     pattern = re.compile(u'.*B.*-([\u05d0-\u05ea]{1,2})-\{([\u05d0-\u05ea]{1,2})\}')
     found = pattern.search(text_fragment)
     chapter_numeral = found.group(1)
     verse_numeral = found.group(2)
     return {
         'chapter': util.getGematria(chapter_numeral),
         'verse': util.getGematria(verse_numeral)
     }
Example #11
0
def getMishnah(line):
    """Return the mishnah number encoded at the start of *line*, or None.

    A line opening with the @22 tag carries the numeral in its first word;
    otherwise a single-letter first word (after stripping @11) is taken as
    the numeral. Anything else yields None.
    """
    if line.startswith("@22"):
        token = line.split(" ")[0].replace("@22", "")
        return getGematria(token)
    first_word = line.replace("@11", "").split(" ")[0]
    if len(first_word) == 1:
        return getGematria(first_word)
    return None
def getGematriaVav(str):
    """Return the gematria of *str*, tolerating a leading vav ('ו') prefix.

    Accepts plain Hebrew numerals plus a fixed whitelist of known
    letter-transposition values; prints an error (and implicitly returns
    None) for anything else.

    NOTE(review): the parameter shadows the builtin `str`, and an empty
    input raises IndexError at str[0] — confirm callers never pass ''.
    """
    str = str.strip()
    # Strips '"' characters; the alternation with the empty pattern is a
    # no-op — presumably gershayim removal was intended. Confirm.
    str = re.sub(u'''"|''', u'', str)
    case_set = {270,272,274,275,298,304,344,670,672,698,744} # from trello card 'Letter transpositions'
    # Leading vav acts as "and": drop it and evaluate the remainder.
    if str[0] == u'ו' and (is_hebrew_number(str[1:]) or (getGematria(str[1:]) in case_set)):
        return getGematria(str[1:])
    elif is_hebrew_number(str) or getGematria(str) in case_set: # and not re.search(u'''מד"ס'''): or re.search(u'''('|")''', str)
        return getGematria(str)
    else:
        # mass.ErrorFile.write('error in pointer, not Gimatria...')
        print 'error in pointer, not Gimatria...', str
Example #13
0
    def walk_through_file(self, filename):
        """
        Derive and store references from a single file.

        Walks the file line by line: an @22 tag opens a new siman
        (resetting the seif counter and the running default reference);
        each line opening with "@00(" is a seif-level reference whose
        commentator and remote seif are parsed from the stripped text,
        inheriting from the previous reference when absent.

        NOTE(review): `siman` would be unbound if an "@00(" line appeared
        before the first @22 tag — confirm the input always opens with @22.

        :param filename: path of the utf-8 text file to scan.
        :return: None; appends reference dicts to self.record_list.
        """
        tester = Tester()
        previous_reference, seif = None, 0
        with codecs.open(filename, 'r', 'utf-8') as fp:
            lines = fp.readlines()
        for line in lines:
            # New siman header: reset seif counter and default reference.
            if tester(re.search(u'@22([\u05d0-\u05ea]{1,3})', line)):
                siman = getGematria(tester.match.group(1))
                seif = 0
                previous_reference = self.get_default_reference(siman)

            if re.match(u'^@00\(', line):
                seif += 1
                reference = {
                    u'siman': siman,
                    u'local-seif': seif,
                    u'remote-seif': None,
                    u'comments-on': None,
                    u'raw-text': line
                }
                # Keep only Hebrew letters and spaces, then drop a leading
                # "סי(מן) <numeral>" prefix before matching the reference.
                stripped = re.sub(u'[^\u05d0-\u05ea ]', u'', line)
                stripped = re.sub(
                    u'^\u05e1\u05d9(?:\u05de\u05df)?\s([\u05d0-\u05ea]{1,3})\s?',
                    u'', stripped)
                ref_match = self.reference_regex.match(stripped)
                if not ref_match:
                    print u"No match found for:"
                    print line
                    continue
                reference[u'comments-on'] = self.get_commentator(ref_match)
                reference[u'remote-seif'] = \
                    None if ref_match.group(u'seif') is None else getGematria(ref_match.group(u'seif'))
                # Inherit missing fields from the previous reference;
                # otherwise update the running previous reference.
                if reference[u'comments-on'] is None:
                    reference[u'comments-on'] = previous_reference[
                        u'comments-on']
                else:
                    previous_reference[u'comments-on'] = reference[
                        u'comments-on']
                if reference[u'remote-seif'] is None:
                    reference[u'remote-seif'] = previous_reference[
                        u'remote-seif']
                else:
                    previous_reference[u'remote-seif'] = reference[
                        u'remote-seif']
                if reference[u'remote-seif'] is None:
                    print u'No remote seif for {} {}'.format(
                        reference[u'siman'], reference[u'local-seif'])
                self.record_list.append(reference)
Example #14
0
def getGematriaVav(str, mass):
    """Return the gematria of *str*, tolerating a leading vav ('ו') prefix.

    Accepts plain Hebrew numerals plus a fixed whitelist of known
    letter-transposition values; silently ignores 'בהגהה' glosses and
    reports any other unrecognized input through mass.write_shgia
    (implicitly returning None in both cases).

    NOTE(review): the parameter shadows the builtin `str`, and an empty
    input raises IndexError at str[0] — confirm callers never pass ''.
    """
    str = str.strip()
    # Strips '"' characters; the alternation with the empty pattern is a
    # no-op — presumably gershayim removal was intended. Confirm.
    str = re.sub(u'''"|''', u'', str)
    case_set = {270,272,274,275,298,304,344,670,672,698,744} # from trello card 'Letter transpositions'
    # Leading vav acts as "and": drop it and evaluate the remainder.
    if str[0] == u'ו' and (is_hebrew_number(str[1:]) or (getGematria(str[1:]) in case_set)):
        return getGematria(str[1:])
    elif is_hebrew_number(str) or getGematria(str) in case_set: # and not re.search(u'''מד"ס'''): or re.search(u'''('|")''', str)
        return getGematria(str)
    elif re.search(u'בהגהה?', str): # this is not gimatria but there is no need to send an error about it each time...
        return
    else:
        mass.write_shgia('error in pointer, not Gimatria...'+ str)
Example #15
0
def getGematriaVav(str, mass):
    """Return the gematria of *str*, tolerating a leading vav ('ו') prefix.

    Accepts plain Hebrew numerals plus a fixed whitelist of known
    letter-transposition values; silently ignores 'בהגהה' glosses and
    reports any other unrecognized input through mass.write_shgia
    (implicitly returning None in both cases).

    NOTE(review): the parameter shadows the builtin `str`, and an empty
    input raises IndexError at str[0] — confirm callers never pass ''.
    """
    str = str.strip()
    # Strips '"' characters; the alternation with the empty pattern is a
    # no-op — presumably gershayim removal was intended. Confirm.
    str = re.sub(u'''"|''', u'', str)
    case_set = {270,272,274,275,298,304,344,670,672,698,744} # from trello card 'Letter transpositions'
    # Leading vav acts as "and": drop it and evaluate the remainder.
    if str[0] == u'ו' and (is_hebrew_number(str[1:]) or (getGematria(str[1:]) in case_set)):
        return getGematria(str[1:])
    elif is_hebrew_number(str) or getGematria(str) in case_set: # and not re.search(u'''מד"ס'''): or re.search(u'''('|")''', str)
        return getGematria(str)
    elif re.search(u'בהגהה?', str): # this is not gimatria but there is no need to send an error about it each time...
        return
    else:
        mass.write_shgia('error in pointer, not Gimatria...'+ str)
def create_alt_struct_dict(rabbeinu_bahya_text_file, the_regex):
    """Build a list of 'perek.pasuk.comment-perek.pasuk.comment' range
    strings marking where each section of the tagged Rabbeinu Bahya text
    begins and ends.

    Tag lines drive the state machine: @99 starts a new section, @00 and
    @77 close the current one (emitting a range), @01 carries a perek
    numeral and @22 a pasuk numeral (extracted via *the_regex*); any other
    line counts as one more comment in the current pasuk.

    NOTE(review): `right_after_99` is set but never read, and `new_comment`
    is never reset to False after its first use — confirm intended.

    :param rabbeinu_bahya_text_file: path to the utf-8 tagged source file.
    :param the_regex: compiled regex whose group(1) captures the Hebrew
        numeral on @01/@22 lines.
    :return: list of range strings.
    """
    first_perek, first_pasuk, current_perek, current_pasuk = 0, 0, 0, 0
    second_to_last_pasuk, second_to_last_comment_number = 0, 0
    first_comment_number, current_comment_number = 0, 0
    new_first_perek, new_first_pasuk, new_comment = True, True, True
    list_of_ranges = []

    with codecs.open(rabbeinu_bahya_text_file, 'r', 'utf-8') as the_file:
        for each_line in the_file:


            # @99: a new section begins; reset the start-of-range trackers.
            if "@99" in each_line:
                #list_of_ranges.append('{}.{}.{}-{}.{}.{}'.format(first_perek, first_pasuk, first_comment_number, current_perek, current_pasuk, current_comment_number))
                new_first_perek, new_first_pasuk, right_after_99 = True, True, True
                first_perek = 0

            # @00: close the current section and emit its range.
            elif "@00" in each_line:
                list_of_ranges.append('{}.{}.{}-{}.{}.{}'.format(first_perek, first_pasuk, first_comment_number, current_perek, current_pasuk, second_to_last_comment_number))
                new_first_perek, new_first_pasuk = True, True

            # @77: emit a range without resetting the start trackers.
            elif "@77" in each_line:
                list_of_ranges.append('{}.{}.{}-{}.{}.{}'.format(first_perek, first_pasuk, first_comment_number, current_perek, current_pasuk, second_to_last_comment_number))
                #new_first_perek, new_first_pasuk = True, True

            # @01: perek marker — record the section's first perek once.
            elif "@01" in each_line:

                matchObject = the_regex.search(each_line)
                if new_first_perek:
                    matchObject = the_regex.search(each_line)
                    first_perek = util.getGematria(matchObject.group(1))
                    new_first_perek = False
                current_perek = util.getGematria(matchObject.group(1))

            # @22: pasuk marker — record first pasuk/comment of the section
            # and reset the per-pasuk comment counter.
            elif "@22" in each_line:

                matchObject = the_regex.search(each_line)
                if new_first_pasuk:
                    matchObject = the_regex.search(each_line)
                    first_pasuk = util.getGematria(matchObject.group(1))
                    new_first_pasuk = False
                    new_comment = True
                if new_comment:
                    first_comment_number = current_comment_number
                second_to_last_pasuk = current_pasuk
                current_pasuk = util.getGematria(matchObject.group(1))
                second_to_last_comment_number = current_comment_number
                current_comment_number = 0

            # Any other line is one more comment in the current pasuk.
            else:
                current_comment_number += 1

    return list_of_ranges
Example #17
0
def getGematriaVav(str):
    """Return the gematria of *str*, tolerating a leading vav ('ו') prefix.

    Accepts plain Hebrew numerals plus a fixed whitelist of known
    letter-transposition values; prints an error (and implicitly returns
    None) for anything else.

    NOTE(review): the parameter shadows the builtin `str`, and an empty
    input raises IndexError at str[0] — confirm callers never pass ''.
    """
    str = str.strip()
    # Strips '"' characters; the alternation with the empty pattern is a
    # no-op — presumably gershayim removal was intended. Confirm.
    str = re.sub(u'''"|''', u'', str)
    case_set = {270, 272, 274, 275, 298, 304, 344, 670, 672, 698,
                744}  # from trello card 'Letter transpositions'
    # Leading vav acts as "and": drop it and evaluate the remainder.
    if str[0] == u'ו' and (is_hebrew_number(str[1:]) or
                           (getGematria(str[1:]) in case_set)):
        return getGematria(str[1:])
    elif is_hebrew_number(str) or getGematria(
            str
    ) in case_set:  # and not re.search(u'''מד"ס'''): or re.search(u'''('|")''', str)
        return getGematria(str)
    else:
        # mass.ErrorFile.write('error in pointer, not Gimatria...')
        print 'error in pointer, not Gimatria...', str
    def grab_rashis(self):
        """Collect Siftei Hakhamim letter markers per Rashi comment for
        each verse span in the parsed HTML.

        Scans every span with id 'katom'; per verse, extracts the verse
        number (a parenthesized 1-2 letter Hebrew numeral, or '<unknown>'
        when absent) and, for every non-empty Rashi comment line, the list
        of bracketed reference letters.

        :return: list of dicts with keys 'verse_number', 'comments' and
            'total_rashis'.
        """
        rashis = []
        for span in self.parsed_html.find_all('span', id='katom'):
            if span.text == u'\n':
                continue

            verse = {'comments': []}

            # grab the verse number
            match = re.search(u'\(([\u05d0-\u05ea]{1,2})\)', span.text)

            if match is None:
                verse['verse_number'] = '<unknown>'

            else:
                verse['verse_number'] = util.getGematria(match.group(1))

            structured_rashi = self.structure_rashi(span.text)
            for line in structured_rashi:
                # BUG FIX: was `line is not u''` — an identity comparison
                # that only worked because CPython interns the empty
                # string; compare by value instead.
                if line != u'':
                    # add all Siftei Hakhamim in an array according to each Rashi comment.
                    verse['comments'].append(re.findall(u'\[([\u05d0-\u05ea])\]', line))

            verse['total_rashis'] = len(structured_rashi)

            rashis.append(verse)
        return rashis
Example #19
0
    def grab_rashis(self):
        """Collect Siftei Hakhamim letter markers per Rashi comment for
        each verse span in the parsed HTML.

        Scans every span with id 'katom'; per verse, extracts the verse
        number (a parenthesized 1-2 letter Hebrew numeral, or '<unknown>'
        when absent) and, for every non-empty Rashi comment line, the list
        of bracketed reference letters.

        :return: list of dicts with keys 'verse_number', 'comments' and
            'total_rashis'.
        """
        rashis = []
        for span in self.parsed_html.find_all('span', id='katom'):
            if span.text == u'\n':
                continue

            verse = {'comments': []}

            # grab the verse number
            match = re.search(u'\(([\u05d0-\u05ea]{1,2})\)', span.text)

            if match is None:
                verse['verse_number'] = '<unknown>'

            else:
                verse['verse_number'] = util.getGematria(match.group(1))

            structured_rashi = self.structure_rashi(span.text)
            for line in structured_rashi:
                # BUG FIX: was `line is not u''` — an identity comparison
                # that only worked because CPython interns the empty
                # string; compare by value instead.
                if line != u'':
                    # add all Siftei Hakhamim in an array according to each Rashi comment.
                    verse['comments'].append(
                        re.findall(u'\[([\u05d0-\u05ea])\]', line))

            verse['total_rashis'] = len(structured_rashi)

            rashis.append(verse)
        return rashis
Example #20
0
def find_skips(filename):
    """
    Looks for skipped comments.

    Converts each chapter's comment letters to numeric values and checks
    them against the expected modulo-22 alphabet sequence, printing every
    out-of-order comment and a final error total.
    :param filename: File to scan
    """

    parser = TextParser(filename)
    offset = 0
    total_errors = 0
    for chapter in parser.chapter_strings:
        chap_number = util.getGematria(
            parser.chap_reg.search(chapter).group(1))
        if chap_number == 1:
            # A first chapter restarts the letter sequence.
            offset = 0
        comments = parser.comment_reg.findall(chapter)
        comment_values = [letters[comment[1]] for comment in comments]

        # Comment letters cycle through the 22-letter Hebrew alphabet.
        sequence = modulo_sequence(comment_values, 22, offset)
        # NOTE(review): raises IndexError when a chapter has no comments —
        # presumably never happens in practice; confirm.
        offset = comment_values[-1] + 1

        if sequence['in_order']:
            continue
        else:
            print 'error in chapter {}'.format(chap_number)
            for error in sequence['errors']:
                print 'previous: {} expected: {} found: {}'.format(
                    error['previous'], error['expected'], error['found'])
            total_errors += len(sequence['errors'])
    print 'total errors: {}'.format(total_errors)
def align_comments(text_array):
    """Group the words of *text_array* into sections keyed by @11 headers.

    Section headers look like '@11<hebrew-numeral>)' (optionally starred)
    and are converted to integers with getGematria; every word (with @NN
    tags stripped) is appended to the current section's list. Words seen
    before any header land under the initial key '' (empty string); a
    section seen a second time gets a u'\\n' separator appended first.

    :param text_array: list of text lines; lines tagged @99 are removed
        from it in place.
    :return: dict mapping section key -> list of words.
    """
    # strip out unnecessary lines.
    # BUG FIX: the original deleted elements while iterating with
    # enumerate, which skips the line following each removed @99 line.
    # Rebuild via slice assignment so the caller's list is still mutated.
    remove = re.compile(u'@99')
    text_array[:] = [line for line in text_array if not remove.search(line)]

    section_name, result = '', {}
    t = u''.join(text_array)
    t = t.replace(u'\n', u'')
    t = t.replace(u'\r', u'')
    t = t.split(u' ')
    for word in t:
        # BUG FIX: the quantifier belongs inside the capture group — with
        # ([...]){1,4} group(1) held only the LAST letter of the numeral.
        search = re.search(u'@11([\u05d0-\u05ea"]{1,4})\*?\)', word)
        if search:
            section_name = getGematria(search.group(1).replace(u'"', u''))
            if section_name in result.keys():
                result[section_name].append(u'\n')
        if section_name not in result.keys():
            result[section_name] = []

        result[section_name].append(re.sub(u'@[0-9]{2}', u'', word))

    return result
def find_skips(filename):
    """
    Looks for skipped comments.

    Each chapter's comment letters are mapped to numeric values and
    compared against the expected modulo-22 alphabet sequence; every
    out-of-order comment is printed along with a final error total.
    :param filename: File to scan
    """

    parser = TextParser(filename)
    offset, total_errors = 0, 0
    for chapter in parser.chapter_strings:
        chap_number = util.getGematria(parser.chap_reg.search(chapter).group(1))
        # A first chapter restarts the letter sequence.
        if chap_number == 1:
            offset = 0
        found_comments = parser.comment_reg.findall(chapter)
        comment_values = [letters[c[1]] for c in found_comments]

        # Comment letters cycle through the 22-letter Hebrew alphabet.
        sequence = modulo_sequence(comment_values, 22, offset)
        offset = comment_values[-1] + 1

        if not sequence['in_order']:
            print 'error in chapter {}'.format(chap_number)
            for error in sequence['errors']:
                print 'previous: {} expected: {} found: {}'.format(
                    error['previous'], error['expected'], error['found'])
            total_errors += len(sequence['errors'])
    print 'total errors: {}'.format(total_errors)
Example #23
0
def get_civil_year(year_line, book):
    """
    JN are named by year. The he_title can be lifted directly from the text, this function converts them to English
    equivalent. The conversion is not exact, as an exact mapping of Parsha - Date is not available at this time.
    Therefore, each book will get a "typical" Hebrew data which is used to extract the standard civil date. This may
    contain several errors, which will be corrected down the road.
    :param year_line: A line of text from which year data is extracted. May contain multiple years (i.e. תרל"ז-תרל"ח)
    :param book: What book is this taken from (i.e. Genesis, Exodus etc.).
    :return: civil year(s)
    """

    # NOTE(review): currently unused — presumably intended for the per-book
    # "typical" date mapping described above; confirm before removing.
    typical_dates = {
        'Genesis': [7, 1],
        'Exodus': [10, 20],
        'Leviticus': [1, 1],
        'Numbers': [3, 20],
        'Deuteronomy': [5, 1]
    }

    # Each 4-5 character Hebrew numeral is a year of the sixth millennium,
    # so add 5000 to get the full Hebrew year.
    hebrew_years = [util.getGematria(numeral) + 5000
                    for numeral in re.findall(u'[\u05d0-\u05ea"]{4,5}', year_line)]

    return '; '.join(str(year) for year in hebrew_years)
Example #24
0
def check_segments():
    """Check that the @44 segment letters inside each @30 mitzvah section
    run sequentially, printing the section header and the expected numeral
    wherever the sequence breaks.

    Relies on a module-level `filename`. Both TagTester instances share
    the same open file handle, so the second reads from where the first
    stopped.
    """

    segments = []

    infile = codecs.open(filename, 'r', 'utf-8')

    headers = TagTester(u'@30', infile,
                        u'@30מצוה ([\u05d0-\u05ea"]{1,5})').grab_each_header()
    tester = TagTester(u'@44', infile, u'@44\(([\u05d0-\u05ea]{1,2})\)')

    # Collect the @44 segment letters of each @30 section until EOF.
    while not tester.eof:

        segments.append(
            tester.grab_each_header(u'@30מצוה ([\u05d0-\u05ea"]{1,5})', 1))

    infile.close()

    for sec_number, section in enumerate(segments):

        index = 1

        for title in section:

            title = title.replace(u'"', u'')
            count = util.getGematria(title)

            if count != index:

                print headers[sec_number - 1]
                print util.numToHeb(index)
                # Resynchronize the expected counter to the value found.
                index = count
            index += 1
Example #25
0
def check_segments():
    """Check that the @44 segment letters inside each @30 mitzvah section
    run sequentially, printing the section header and the expected numeral
    wherever the sequence breaks.

    Relies on a module-level `filename`. Both TagTester instances share
    the same open file handle, so the second reads from where the first
    stopped.
    """

    segments = []

    infile = codecs.open(filename, 'r', 'utf-8')

    headers = TagTester(u'@30', infile, u'@30מצוה ([\u05d0-\u05ea"]{1,5})').grab_each_header()
    tester = TagTester(u'@44', infile, u'@44\(([\u05d0-\u05ea]{1,2})\)')

    # Collect the @44 segment letters of each @30 section until EOF.
    while not tester.eof:

        segments.append(tester.grab_each_header(u'@30מצוה ([\u05d0-\u05ea"]{1,5})', 1))

    infile.close()

    for sec_number, section in enumerate(segments):

        index = 1

        for title in section:

            title = title.replace(u'"', u'')
            count = util.getGematria(title)

            if count != index:

                print headers[sec_number-1]
                print util.numToHeb(index)
                # Resynchronize the expected counter to the value found.
                index = count
            index += 1
def file_to_ja_g(depth, infile, expressions, cleaner,grab_all=False):
    """
    Designed to be the first stage of a reusable parsing tool. Adds lines of text to the Jagged
    Array in the desired structure (Chapter, verse, etc.)
    This function is a modulation of the origanal file_to_ja because it deals with gimatria letters
    so to place the correct chapters and segments in the currect places according to the hebrew letter numbering.
    Ofcourse it also puts in the padding where needed. (_g stands for Gimatria.
    :param depth: depth of the JaggedArray.
    :param infile: Text file to read from
    :param expressions: A list of regular expressions with which to identify section (chapter) level. Do
    not include an expression with which to break up the segment levels.
    Each expression must define a named group 'gim' capturing the numeral.
    :param cleaner: A function that takes a list of strings and returns an array with the text parsed
    correctly. Should also break up and remove unnecessary tagging data.
    :param grab_all: If set to true, will grab the lines indicating new sections.
    :return: A jagged_array with the text properly structured.
    """

    # instantiate ja
    # structure = reduce(lambda x,y: [x], range(depth-1), [])
    # ja = JaggedArray(structure)
    ja = JaggedArray([])
    # ensure there is a regex for every level except the lowest
    if depth - len(expressions) != 1:
        raise AttributeError('Not enough data to parse. Need {} expressions, '
                             'received {}'.format(depth-1, len(expressions)))

    # compile regexes, instantiate index list
    regexes, indices = [re.compile(ex) for ex in expressions], [-1]*len(expressions)
    temp = []

    # loop through file
    for line in infile:

        # check for matches to the regexes
        for i, reg in enumerate(regexes):
            found = reg.search(line)
            if found:
                # check that we've hit the first chapter and verse
                if indices.count(-1) == 0:
                    # Flush the accumulated segment at the current position.
                    ja.set_element(indices, cleaner(temp), [])
                    temp = []

                    if grab_all:
                        temp.append(line)
                # A non-zero gematria jumps directly to that 0-based slot
                # (padding skipped sections); zero advances sequentially.
                gimt = getGematria(found.group('gim'))
                if gimt != 0:
                    indices[i] = gimt - 1
                else:
                    indices[i] += 1
                # Reset all lower levels so they restart from the beginning.
                indices[i+1:] = [-1 if x >= 0 else x for x in indices[i+1:]]
                break

        else:
            # No section marker on this line: accumulate text once every
            # level has been seen at least once.
            if indices.count(-1) == 0:
                temp.append(line)
    else:
        # The `else` of the outer for runs when the loop ends (it has no
        # break at this level); this flushes the final accumulated segment.
        ja.set_element(indices, cleaner(temp), [])

    return ja
Example #27
0
def file_to_ja_g(depth, infile, expressions, cleaner,grab_all=False):
    """
    Designed to be the first stage of a reusable parsing tool. Adds lines of text to the Jagged
    Array in the desired structure (Chapter, verse, etc.)
    This function is a modulation of the origanal file_to_ja because it deals with gimatria letters
    so to place the correct chapters and segments in the currect places according to the hebrew letter numbering.
    Ofcourse it also puts in the padding where needed. (_g stands for Gimatria.
    :param depth: depth of the JaggedArray.
    :param infile: Text file to read from
    :param expressions: A list of regular expressions with which to identify section (chapter) level. Do
    not include an expression with which to break up the segment levels.
    Each expression must define a named group 'gim' capturing the numeral.
    :param cleaner: A function that takes a list of strings and returns an array with the text parsed
    correctly. Should also break up and remove unnecessary tagging data.
    :param grab_all: If set to true, will grab the lines indicating new sections.
    :return: A jagged_array with the text properly structured.
    """

    # instantiate ja
    # structure = reduce(lambda x,y: [x], range(depth-1), [])
    # ja = JaggedArray(structure)
    ja = JaggedArray([])
    # ensure there is a regex for every level except the lowest
    if depth - len(expressions) != 1:
        raise AttributeError('Not enough data to parse. Need {} expressions, '
                             'received {}'.format(depth-1, len(expressions)))

    # compile regexes, instantiate index list
    regexes, indices = [re.compile(ex) for ex in expressions], [-1]*len(expressions)
    temp = []

    # loop through file
    for line in infile:

        # check for matches to the regexes
        for i, reg in enumerate(regexes):
            found = reg.search(line)
            if found:
                # check that we've hit the first chapter and verse
                if indices.count(-1) == 0:
                    # Flush the accumulated segment at the current position.
                    ja.set_element(indices, cleaner(temp), [])
                    temp = []

                    if grab_all:
                        temp.append(line)
                # A non-zero gematria jumps directly to that 0-based slot
                # (padding skipped sections); zero advances sequentially.
                gimt = getGematria(found.group('gim'))
                if gimt != 0:
                    indices[i] = gimt - 1
                else:
                    indices[i] += 1
                # Reset all lower levels so they restart from the beginning.
                indices[i+1:] = [-1 if x >= 0 else x for x in indices[i+1:]]
                break

        else:
            # No section marker on this line: accumulate text once every
            # level has been seen at least once.
            if indices.count(-1) == 0:
                temp.append(line)
    else:
        # The `else` of the outer for runs when the loop ends (it has no
        # break at this level); this flushes the final accumulated segment.
        ja.set_element(indices, cleaner(temp), [])

    return ja
Example #28
0
def identify_star_locations(filename):
    """Locate every @11(*) star tag relative to its surrounding seifim.

    Scans an @-tagged file and returns one dict per run of consecutive
    stars, recording the siman number, the seif immediately before the
    run, the seif immediately after it, and the run length.

    :param filename: path to a UTF-8 @-tagged text file.
    :return: list of dicts with keys u'siman_num', u'preceding_index',
        u'preceding_letter', u'star_count', u'following_index',
        u'following_letter'.
    """
    def get_regex():
        # One alternation of three named groups, each wrapping one inner
        # capture: @12<siman numeral>, @11<seif letter>, @11* (a star).
        partial_regexes = [
            u'@12([\u05d0-\u05ea]{1,3})', u'@11([\u05d0-\u05ea])', u'@11(\*)'
        ]
        names = [u'siman', u'seif', u'star']
        my_full_regexes = [
            u'(?P<{}>{})'.format(*i) for i in zip(names, partial_regexes)
        ]
        return re.compile(u'|'.join(my_full_regexes))

    with codecs.open(filename, 'r', 'utf-8') as infile:
        lines = infile.readlines()

    # seif_index counts seifim within the current siman; -1 = none seen yet
    siman, seif_index, seif_letter, num_stars = -1, -1, None, 0
    star_locations, current_star = [], {}
    line_regex = get_regex()

    for line in lines:
        line_data = line_regex.search(line)
        if line_data is None:
            continue

        elif line_data.lastgroup == u'star':
            # Open/refresh the current star run; the "following" fields are
            # filled in when the next seif or siman tag arrives.
            num_stars += 1
            current_star = {
                u'siman_num': siman,
                u'preceding_index': seif_index,
                u'preceding_letter': seif_letter
            }

        else:
            if line_data.lastgroup == u'seif':
                seif_index += 1
                # lastindex points at the named (outer) group, so +1 is the
                # inner capture holding the actual letter
                seif_letter = line_data.group(line_data.lastindex + 1)

            elif line_data.lastgroup == u'siman':
                siman = getGematria(line_data.group(line_data.lastindex + 1))
                seif_index = -1
                seif_letter = None

            else:
                raise LookupError(u"Expecting seif or siman, got {}".format(
                    line_data.lastgroup))

            if num_stars >= 1:
                # a tag followed a star run: close the run out
                current_star[u'star_count'] = num_stars
                current_star[u'following_index'] = seif_index
                current_star[u'following_letter'] = seif_letter
                star_locations.append(current_star)
                num_stars = 0
    else:
        # The loop has no break, so this always runs: flush a star run that
        # reached end-of-file with no tag after it.
        if num_stars >= 1:
            current_star[u'star_count'] = num_stars
            current_star[u'following_index'] = 0
            current_star[u'following_letter'] = None
            star_locations.append(current_star)

    return star_locations
Example #29
0
def fill_in_missing_sections_and_update_last(each_line, base_list, this_regex, filler, last_index):
    """Pad `base_list` for skipped sections and return the current index.

    The section number (a gematria numeral) is read from `each_line` via
    group 1 of `this_regex`; one `filler` is appended per section missing
    between `last_index` and that number.
    """
    found = this_regex.search(each_line)
    current = util.getGematria(found.group(1))
    for _ in range(current - last_index - 1):
        base_list.append(filler)
    return current
Example #30
0
def fill_in_missing_sections_and_updated_last(each_line, base_list, this_regex, filler, last_index):
    """Append `filler` once per missing section, then return the new index.

    Parses the gematria numeral captured by group 1 of `this_regex`; any
    gap relative to `last_index` is filled with `filler` entries.
    """
    m = this_regex.search(each_line)
    current_index = util.getGematria(m.group(1))
    gap = current_index - last_index - 1
    if gap > 0:
        base_list.extend([filler] * gap)
    return current_index
def seferHamitzvot_from_rasag_comm(rasagCsvName, with_orig = False):
        """Harvest citations of Sefer HaMitzvot, Semag and Semak from the
        Commentary on Sefer Hamitzvot of Rasag and write them to a CSV.

        :param rasagCsvName: output CSV file name (passed to toCsv).
        :param with_orig: when True, a row holding the raw matched text is
            written alongside each parsed row.
        """
        # ind_rasag_comm = library.get_index("Commentary on Sefer Hamitzvot of Rasag")
        segments = Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Positive_Commandments').all_segment_refs()
        segments.extend(Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Negative_Commandments').all_segment_refs())
        segments.extend(Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Laws_of_the_Courts').all_segment_refs())
        segments.extend(Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Communal_Laws').all_segment_refs())

        # per-work tally of how many segments cited each work
        cnt = {"Rasag":0, "Sefer HaMitzvot":0, "Semag":0, "Semak":0}
        dict_list = []
        for seg in segments:
            # look for parenthesized citations of each work in the Hebrew text
            # sfHmtzvot = re.search(u'(?:ספר המצו?ות|סה"מ).{1,4}(עשין|לאוין|עשה|לא תעשה).{0,20}', seg.text('he').text)
            sfHmtzvot = re.search(u'(?:ספר המצוות|סה"מ)\s{1,4}\((.*?)\)', seg.text('he').text)
            smg = re.search(u'סמ"ג \((.*?)\)', seg.text('he').text)
            smk = re.search(u'סמ"ק (\(.*?\))', seg.text('he').text)
            row_dict = {}
            row_orig = {}
            if sfHmtzvot:
                # row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
                # row_orig["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
                kind, simanim = rasag_exctractor(sfHmtzvot.group(1))
                # row_dict["Sefer HaMitzvot"] = ['Sefer HaMitzvot, {}.{}'.format(kind, siman) for siman in simanim]
                if kind:
                    # only the first siman of a multi-siman citation is kept
                    row_dict["Sefer HaMitzvot"] = 'Sefer HaMitzvot, {}.{}'.format(kind, simanim[0])
                else:
                    print "no kind", sfHmtzvot.group(1)
                row_orig["Sefer HaMitzvot"] = sfHmtzvot.group()
                cnt["Sefer HaMitzvot"] += 1
            if smg:
                # row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
                kind, simanim = rasag_exctractor(smg.group(1))
                # row_dict["Semag"] = ['Sefer Mitzvot Gadol, {}.{}'.format(kind, siman) for siman in simanim]
                if kind:
                    row_dict["Semag"] = 'Sefer Mitzvot Gadol, {}.{}'.format(kind, simanim[0])
                else:
                    print "no kind", smg.group(1)
                row_orig["Semag"] = smg.group()
                cnt["Semag"] += 1
            if smk:
                # row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
                # simanim = siman_smk_exctractor(smk.group(1))
                # Semak citations carry an explicit siman marker inside the parens
                smki = re.search(u"ב?סי'\s+(.*?)(?:\s*\))", smk.group(1))
                if smki:
                    siman = getGematria(smki.group(1))
                    row_dict["Semak"] = "Sefer Mitzvot Katan.{}".format(siman)
                    row_orig["Semak"] = smk.group()
                    cnt["Semak"] += 1
                else:
                    print u'***siman***' + smk.group()

            if row_dict:
                # at least one citation parsed: record the source segment too
                cnt["Rasag"] += 1
                row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
                row_orig["Rasag"] = seg.normal()
                if with_orig:
                    dict_list.append(row_orig)
                dict_list.append(row_dict)
        toCsv(rasagCsvName, ["Rasag", "Sefer HaMitzvot", "Semag", "Semak"], dict_list)
        print cnt
Example #32
0
def identify_errors(siman, pattern, sequence_code):
    """Scan one siman for numbering errors among tags matched by `pattern`.

    Consecutive gematria values (group 1 of each match) are checked against
    the expected +1 progression. Problems are classified as u'missing' (a
    value was skipped) or u'out_of_place' (a value interrupts an otherwise
    intact sequence).

    :param siman: text of one siman.
    :param pattern: regex whose group(1) captures a gematria numeral.
    :param sequence_code: opaque code copied into each error record.
    :return: list of error dicts.
    """
    errors = []
    matches = list(re.finditer(pattern, siman))
    previous = 0
    jump_ahead = False  # set when the next match was already consumed below
    for i, match in enumerate(matches):
        if jump_ahead:
            jump_ahead = False
            continue
        try:
            current, following = getGematria(match.group(1)), getGematria(
                matches[i + 1].group(1))
        except IndexError:
            # last match has no successor; the checks below need both values
            break
        if current - previous == 0:  # double tag
            previous = current
            continue

        elif current - previous == 2 and following - current == 1:  # missing tag
            error = {
                u'type': u'missing',
                u'from_sequence': sequence_code,
                u'value': current - 1,
            }
            # the missing tag lies somewhere between the previous match and
            # this one (or before the first match when i == 0)
            if i == 0:
                error[u'range'] = (0, match.start())
            else:
                error[u'range'] = (matches[i - 1].end(), match.start())
            errors.append(error)
            previous = current
            continue

        elif following - previous == 1 and current - previous != 1:  # out of place
            errors.append({
                u'type': u'out_of_place',
                u'from_sequence': sequence_code,
                u'value': current,
                u'tag': match.group(),
                u'loc': match.start()
            })
            # `following` becomes the new baseline and is skipped next turn
            previous = following
            jump_ahead = True
        else:
            previous = current
    return errors
def check_vav(st):
    """If `st` is a conjunctive vav followed by a Hebrew number, return that
    number's gematria value; otherwise return False.

    Empty input and strings not starting with vav both yield False.
    """
    if st and st[0] == u'ו' and is_hebrew_number(st[1:]):
        return getGematria(st[1:])
    return False
Example #34
0
def scrape_wiki():
    url = u"https://he.wikipedia.org/wiki/%D7%9E%D7%A0%D7%99%D7%99%D7%9F_%D7%94%D7%9E%D7%A6%D7%95%D7%95%D7%AA_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%A1%D7%A4%D7%A8_%D7%94%D7%97%D7%99%D7%A0%D7%95%D7%9A"

    page = requests.get(url)
    soup_body = BeautifulSoup(page.text, "lxml")
    tables = soup_body.select(".mw-parser-output > table")

    pairs = []
    links = []

    for table in tables:
        table_tr = table.select("tr")
        for col in table_tr:
            pairs.append((col.contents[1].text.strip(),
                          re.sub(u'</?td>', u'',
                                 col.contents[-1].text).strip()))

    for pair in pairs:
        if re.search(u'ספר|מספר', pair[0]):
            continue
        neg_pos = u"Negative Mitzvot" if re.search(
            u"לאו", pair[1]) else u'Positive Mitzvot'
        rambam = getGematria(re.sub(u'עשה|לאו', u'', pair[1]).strip())
        chinukh = getGematria(pair[0])
        print chinukh, rambam
        chinukh_simanlen = len(
            Ref(u'Sefer HaChinukh.{}'.format(chinukh)).all_segment_refs())
        print neg_pos
        link = ({
            "refs": [
                u'Sefer HaChinukh.{}.{}-{}'.format(chinukh, 1,
                                                   chinukh_simanlen),
                u'Mishneh Torah, {}.{}'.format(neg_pos, rambam)
            ],
            "type":
            "Sifrei Mitzvot",
            "auto":
            True,
            "generated_by":
            "chinukh_rambam_sfm_linker"  # _sfm_linker what is this parametor intended to be?
        })
        print link['refs']
        links.append(link)
        return links
Example #35
0
def parse(file_name):
    """Parse an @-tagged file into a nested chapter structure.

    Most chapters are flat lists of cleaned comment lines; chapters 7 and 9
    carry one extra nesting level delimited by @01 markers (chapter 7 also
    separates its introduction from the "shorashim" that follow).

    :param file_name: path to a UTF-8 text file with @00/@01 markers.
    :return: list of chapters (chapters 7 and 9 are one level deeper).
    """
    # FIX: the pattern must be a unicode literal — under Python 2 a plain
    # byte string leaves the \uXXXX escapes uninterpreted, so the Hebrew
    # range would not be built (sibling parsers here all use u'...').
    chapter_number = regex.compile(u'@00([\u05d0-\u05ea]{1,2})')
    chapter_index = 1
    section, comment = [], []

    seven, shorashim, nine = [], [], []
    chapter_seven_intro = True

    with codecs.open(file_name, 'r', 'utf-8') as the_file:
        for each_line in the_file:

            if "@00" in each_line:
                # flush the chapter just finished before starting the new one
                if chapter_index != 7 and chapter_index != 9:
                    section.append(comment)
                    comment = []

                elif chapter_index == 7:
                    shorashim.append(comment)
                    seven.append(shorashim)
                    section.append(seven)
                    comment = []

                elif chapter_index == 9:
                    nine.append(comment)
                    section.append(nine)
                    comment = []

                match_object = chapter_number.search(each_line)
                chapter_index = util.getGematria(match_object.group(1))

            elif chapter_index != 7 and chapter_index != 9:
                each_line = clean_up(each_line)
                comment.append(each_line)

            elif chapter_index == 7:
                if "@01" in each_line:
                    if chapter_seven_intro:
                        # first @01 in chapter 7 closes the introduction
                        seven.append(comment)
                        comment = []
                        chapter_seven_intro = False
                    else:
                        shorashim.append(comment)
                        comment = []
                else:
                    comment.append(each_line)

            elif chapter_index == 9:
                if "@01" in each_line:
                    nine.append(comment)
                    comment = []

                else:
                    comment.append(each_line)

    # flush the final chapter's trailing comments
    section.append(comment)
    return section
Example #36
0
def link_hg(hg_ja, hagahot_dict_lst, ja_raph):
    """Build commentary links from Haggahot Chadashot segments to the Semak
    or Raph segments they annotate.

    :param hg_ja: jagged array of haggahot text, indexed by siman.
    :param hagahot_dict_lst: per-siman dicts with 'siman', 'smk' and 'raph'
        entries, each pairing a marker letter with a segment index.
    :param ja_raph: unused in this function (kept for interface parity).
    :return: list of Sefaria link dicts.
    """

    def link_hg_smk_or_raph(siman, smk_seg, hg, place_smk_hg, base_text):
        # Build one inline-reference link from a base-text segment to the
        # hg-th haggahah of the given siman.
        link = (
            {
                "refs": [
                    u"{} {}:{}".format(base_text, siman, smk_seg),
                    "Haggahot Chadashot on Sefer Mitzvot Katan {}:{}".format(siman, hg),  # really should be a ref link to the whole raph
                ],
                "type": "commentary",
                'inline_reference': {
                    'data-commentator': 'Haggahot Chadashot on Sefer Mitzvot Katan',
                    'data-order': place_smk_hg
                },
                "auto": True,
                "generated_by": "semak_parser"

            })
        return link

    # linking
    links = []
    smks = []
    raphs = []
    # flatten all (letter, segment-index) pairs across simanim
    for dict in hagahot_dict_lst:
        smks += dict["smk"]
        raphs += dict["raph"]
    # pts/ptr walk the flattened smk/raph pair lists across all simanim;
    # pts_0/ptr_0 restart per siman and feed the data-order attribute
    pts = 0
    ptr = 0
    link = None
    for dict in hagahot_dict_lst:
        # link all the haghot in a siman to the correct Semak segment
        pts_0 = 0
        ptr_0 = 0
        sim = getGematria(dict["siman"])
        # print sim
        for j, hgha in enumerate(hg_ja[sim-1]):
            smk_first = True
            # when the same letter heads both queues, prefer the raph match
            # if this siman actually has that raph letter
            if ptr < len(raphs) and smks[pts][0] == raphs[ptr][0]:
                if dict["raph"] and any([re.search(raphs[ptr][0], letter[0]) for letter in dict["raph"]]):
                    smk_first = False
            if smk_first and re.search(u"@11\({}\)".format(smks[pts][0]), hgha):  # pts < len(smks)
                link = link_hg_smk_or_raph(sim, smks[pts][1], j+1, pts_0+1, "Sefer Mitzvot Katan")
                pts += 1
                pts_0 += 1
            elif ptr < len(raphs) and re.search(u"@11\({}\)".format(raphs[ptr][0]), hgha):
                link = link_hg_smk_or_raph(sim, raphs[ptr][1], j+1, ptr_0+1, 'Haggahot Rabbeinu Peretz on Sefer Mitzvot Katan')
                ptr += 1
                ptr_0 += 1
            else:
                print u"error {}: something with the numbering is wrong...".format(dict["siman"])

            # NOTE(review): on a failed match the previous link is appended
            # again (link keeps its old value) — confirm this is intended
            if link:
                links.append(link)
    return links
def fill_in_missing_sections_and_updated_last(each_line, base_list, this_regex, filler, last_index):
    """Pad `base_list` for skipped mitzvot and return the current number.

    Group 1 of `this_regex` may hold several space-separated numerals; only
    the first determines the current index.
    """
    captured = this_regex.search(each_line).group(1)
    first_numeral = captured.strip().split()[0]
    current_index = util.getGematria(first_numeral)
    for _ in range(current_index - last_index - 1):
        base_list.append(filler)
    return current_index
def fill_in_missing_sections_and_updated_last(each_line, base_list, this_regex, filler, last_index):
    """Append `filler` once per missing section, then return the new index.

    Parses the first space-separated numeral captured by `this_regex` as a
    gematria value; gaps relative to `last_index` get `filler` entries.
    """
    tokens = this_regex.search(each_line).group(1).strip().split()
    current = util.getGematria(tokens[0])
    gap = current - last_index - 1
    if gap > 0:
        base_list.extend([filler] * gap)
    return current
def identify_star_locations(filename):
    """Locate every @11(*) star tag relative to the seifim around it.

    Returns one dict per run of consecutive stars, recording the siman,
    the seif before the run, the seif after it, and the run length.
    """
    tag_names = [u'siman', u'seif', u'star']
    tag_parts = [u'@12([\u05d0-\u05ea]{1,3})', u'@11([\u05d0-\u05ea])', u'@11(\*)']
    tag_reg = re.compile(u'|'.join(
        u'(?P<{}>{})'.format(name, part) for name, part in zip(tag_names, tag_parts)))

    with codecs.open(filename, 'r', 'utf-8') as fp:
        all_lines = fp.readlines()

    siman_num, seif_idx, seif_let = -1, -1, None
    pending_stars = 0
    results, star_info = [], {}

    for text_line in all_lines:
        m = tag_reg.search(text_line)
        if m is None:
            continue

        if m.lastgroup == u'star':
            # open/refresh the current star run
            pending_stars += 1
            star_info = {
                u'siman_num': siman_num,
                u'preceding_index': seif_idx,
                u'preceding_letter': seif_let
            }
            continue

        if m.lastgroup == u'seif':
            seif_idx += 1
            # lastindex is the named group; +1 is its inner capture
            seif_let = m.group(m.lastindex + 1)
        elif m.lastgroup == u'siman':
            siman_num = getGematria(m.group(m.lastindex + 1))
            seif_idx, seif_let = -1, None
        else:
            raise LookupError(u"Expecting seif or siman, got {}".format(m.lastgroup))

        if pending_stars >= 1:
            # a tag followed a star run: close the run out
            star_info[u'star_count'] = pending_stars
            star_info[u'following_index'] = seif_idx
            star_info[u'following_letter'] = seif_let
            results.append(star_info)
            pending_stars = 0

    # flush a star run that hit end-of-file with nothing after it
    if pending_stars >= 1:
        star_info[u'star_count'] = pending_stars
        star_info[u'following_index'] = 0
        star_info[u'following_letter'] = None
        results.append(star_info)

    return results
def rasag_exctractor(text):
    """Split a citation like u'עשין יב' into a commandment kind and simanim.

    :param text: space-separated citation; the first token names the kind
        (positive/negative commandments), the rest are gematria numerals.
    :return: (kind, simanim) — kind is u'Positive Commandments',
        u'Negative Commandments' or None; simanim is a list of ints.
    """
    split = re.split(u"\s", text)
    simanim = []
    kind = None
    # FIX: the original patterns read u"(:?...)" — a capturing group whose
    # first element is an optional colon; the intended non-capturing form is
    # u"(?:...)". Behavior under re.search is unchanged (presence test only).
    if re.search(u"(?:לאוין|לא תעשה)", split[0]):
        kind = u'Negative Commandments'
    elif re.search(u"(?:עשין|עשה)", split[0]):
        kind = u'Positive Commandments'
    for word in split[1:]:
        siman = getGematria(word)
        simanim.append(siman)
    return kind, simanim
Example #41
0
def xmlify(filename):
    """
    Create an XML representation of a text file.

    Chapters are delimited by '@00' + Hebrew 'perek' headers and verses by
    '@22' + gematria tags; each verse's remaining lines (after the first)
    become numbered <comment> elements. The result is written to
    ./xml/<filename with .txt replaced by .xml>.

    :param filename: str name of file
    """
    with codecs.open(filename, 'r', 'utf-8') as infile:
        raw_rambam = infile.read()

    # chapter numbers come from the @00 headers...
    chap_index = [
        getGematria(i.group(1)) for i in re.finditer(
            ur'@00\u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})', raw_rambam)
    ]
    # ...and the same pattern splits the text into the chapters themselves
    # (the [1:] drops whatever precedes the first header)
    chapters = re.split(ur'@00\u05e4\u05e8\u05e7 [\u05d0-\u05ea]{1,2}',
                        raw_rambam)[1:]
    assert len(chap_index) == len(chapters)

    soup = BeautifulSoup(u'<root></root>', 'xml')
    for index, chapter in zip(chap_index, chapters):
        x_chapter = soup.new_tag('chapter', num=unicode(index))
        soup.root.append(x_chapter)

        v_indices = [
            getGematria(i.group(1))
            for i in re.finditer(ur'@22([\u05d0-\u05ea]{1,2})', chapter)
        ]
        verses = re.split(ur'@22[\u05d0-\u05ea]{1,2}', chapter)[1:]
        assert len(v_indices) == len(verses)

        for v_index, verse in zip(v_indices, verses):
            x_verse = soup.new_tag('verse', num=unicode(v_index))
            # comments[0] is the remainder of the tag line and is skipped
            comments = verse.splitlines()
            for i, comment in enumerate(comments[1:]):
                x_comment = soup.new_tag('comment', num=unicode(i + 1))
                x_comment.append(comment)
                x_verse.append(x_comment)

            x_chapter.append(x_verse)
    with codecs.open('./xml/{}'.format(filename.replace('.txt', '.xml')), 'w',
                     'utf-8') as outfile:
        outfile.write(unicode(soup.prettify()))
Example #42
0
def produce_parsed_data(filename):
    """Parse `filename` into a complex-keyed jagged array.

    Two passes over the file: the first builds a depth-3 jagged array from
    the marker patterns, the second grabs the section names whose gematria
    values key the restructuring of that array.
    """
    with codecs.open(filename, 'r', 'utf-8') as datafile:
        simple_ja = util.file_to_ja(3, datafile, (m_pattern, comment_pattern), nothing)
        datafile.seek(0)  # rewind for the second pass
        section_names = util.grab_section_names(m_pattern, datafile, 1)

    numbers = [int(util.getGematria(name)) for name in section_names]
    as_dict = util.simple_to_complex(numbers, simple_ja.array())
    return util.convert_dict_to_array(as_dict)
Example #43
0
    def _collect_hebrew_segments(self, soup):
        """Collect the Hebrew <p> segments belonging to this chapter.

        Walks paragraphs whose class starts with 'LMH'. A title paragraph
        whose chapter numeral equals self.number starts collection; the
        next title paragraph ends it. Segments that end on a Hebrew letter
        (mid-sentence) are merged into their successor, and empty segments
        are dropped. The cleaned strings are stored on self.hebrew_segments.

        :param soup: parsed document (BeautifulSoup instance).
        """
        assert isinstance(soup, BeautifulSoup)
        he_reg = re.compile(u'^LMH')
        all_he_ps = soup.find_all('p', attrs={'class': he_reg})

        segments = []
        started = False
        # matches "Likutei Moharan [Tinyana] Siman <numeral>" title lines
        chapter_reg = re.compile(ur'''\u05dc\u05d9\u05e7\u05d5\u05d8\u05d9 \u05de\u05d5\u05d4\u05e8[\u05f4"]\u05df\s(\u05ea\u05e0\u05d9\u05e0\u05d0\s)?\u05e1\u05d9\u05de\u05df\s(?P<chapter>[\u05d0-\u05ea"]{1,4})''')

        for he_p in all_he_ps:
            # NOTE(review): comparing he_p['class'] to a string assumes the
            # parser exposes 'class' as a flat string; bs4's default
            # multi-valued list would never equal it — confirm parser setup
            if he_p['class'] == u'LMH-styles_LMH-title':
                if not he_p.string:
                    raise AssertionError

                chapter_match = chapter_reg.match(he_p.string)
                if chapter_match:
                    if getGematria(chapter_match.group('chapter')) == self.number:
                        started = True

                    elif started:
                        # a new chapter's title ends this chapter's run
                        break

            elif started:
                if re.search(u'Rashbam', he_p['class']):
                    continue
                segments.append(he_p)
            else:
                continue

        # if current segment ends on a Hebrew char, combine with the next segment
        bad_indices = []
        for i, (cur_segment, next_segment) in enumerate(zip(segments, segments[1:])):

            segment_text = cur_segment.text
            stripped_text = re.sub(u"[\u05b0-\u05C7]", u'', segment_text)  # strip nikkud
            if re.search(u'[\u05d0-\u05ea]\s*$', stripped_text):
                # merge this segment into this one
                bad_indices.append(i)
                for child in cur_segment.find_all(True):
                    child.unwrap()
                cur_segment.string = u'{} '.format(u' '.join(cur_segment.contents))
                next_segment.insert(0, cur_segment)
                cur_segment.unwrap()
            elif not segment_text:
                bad_indices.append(i)
        # pop from the end so earlier indices stay valid
        for i in reversed(bad_indices):
            segments.pop(i)

        assert len(segments) > 0
        self.hebrew_segments = [bleach.clean(s, tags=[], attributes={}, strip=True) for s in segments]
Example #44
0
def test_expression(pattern):
    """Test how well a regular expression grabs all "springs" and "rivers".

    Group 1 of each match is the spring numeral and group 3 the river
    numeral (both gematria). Any gap in the river sequence within a
    spring is reported as a miss.

    :param pattern: regular expression string
    :return: List of missed "rivers", expressed as a tuple: (spring, river)
    """
    regex = re.compile(pattern)
    split = get_text().splitlines()
    # keep only the lines the pattern actually matched
    matches = filter(None, [regex.search(match) for match in split])
    issues = []
    print u'last_match: {}'.format(matches[-1].group())

    expected_spring, expected_river = 1, 1
    for match in matches:
        spring, river = getGematria(match.group(1)), getGematria(match.group(3))
        if spring > expected_spring:
            # a new spring restarts the river numbering
            expected_river = 1
            expected_spring = spring
        if river > expected_river:
            # every skipped river number counts as one issue
            while river > expected_river:
                issues.append((expected_spring, expected_river))
                expected_river += 1
        expected_river += 1
    return issues
Example #45
0
def check_chapters():
    with codecs.open('Minchat_Chinuch.txt', 'r', 'utf-8') as chinuch:
        test = TagTester(u'@30', chinuch, u'@30מצוה ([\u05d0-\u05ea"]{1,5})')

        index = 1

        for header in test.grab_each_header(capture_group=1):

            header = header.replace(u'"', u'')
            count = util.getGematria(header)

            if count != index:
                print util.numToHeb(index)
                index = count
            index += 1
Example #46
0
def produce_parsed_data(filename):
    """Parse `filename` into its final complex structure.

    The first pass builds a jagged array shaped like [[[]]] from the marker
    patterns; the second pass collects section names whose gematria values
    key the restructuring of that array.
    """
    with codecs.open(filename, 'r', 'utf-8') as datafile:
        raw_ja = util.file_to_ja([[[]]], datafile,
                                 (m_pattern, comment_pattern), nothing)
        datafile.seek(0)  # second pass over the same file
        raw_names = util.grab_section_names(m_pattern, datafile, 1)

    section_numbers = [int(util.getGematria(n)) for n in raw_names]
    structured = util.simple_to_complex(section_numbers, raw_ja.array())
    return util.convert_dict_to_array(structured)
Example #47
0
def check_chapters():
    with codecs.open('Minchat_Chinuch.txt', 'r', 'utf-8') as chinuch:
        test = TagTester(u'@30', chinuch, u'@30מצוה ([\u05d0-\u05ea"]{1,5})')

        index = 1

        for header in test.grab_each_header(capture_group=1):

            header = header.replace(u'"', u'')
            count = util.getGematria(header)

            if count != index:
                print util.numToHeb(index)
                index = count
            index += 1
Example #48
0
def fix_file(filepath, start_siman, test_mode=False):
    output_list = []
    with codecs.open(filepath, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    counter = 0
    for line in lines:
        match = re.match(u'^@11([\u05d0-\u05ea]{1,3})$', line)
        if match and getGematria(match.group(1)) == 1:
            output_list.append(u'@00{}\n'.format(
                numToHeb(counter + start_siman)))
            counter += 1
        output_list.append(line)
    if test_mode:
        filepath = re.sub(ur'\.txt$', u'_test.txt', filepath)
    with codecs.open(filepath, 'w', 'utf-8') as fp:
        fp.writelines(output_list)
Example #49
0
def align_boaz_chapters(source_file, simple_array):
    """
    Boaz does not guarantee text for every chapter. Using the util library, this method will pad the parsed text with
     empty sections as necessary to accurately represent the data.
    :param source_file: File from which to derive chapter numbers
    :param simple_array: A "naive" parse of the data structured as a nested list.
    :return: Nested array, with proper padding to account for empty chapters.
    """
    # chapter numbers, read straight from the @00 headers of the source file
    chapter_names = util.grab_section_names(u'@00פרק ([\u05d0-\u05ea]{1,2})', source_file, 1)
    chapter_numbers = [util.getGematria(name) for name in chapter_names]
    padded = util.simple_to_complex(chapter_numbers, simple_array)
    return util.convert_dict_to_array(padded)
Example #50
0
def parser(name):
    """Parse '<name>.txt' into a chapter/mishnah/comment jagged array.

    NOTE(review): this snippet appears truncated — `parsed_text` and
    `links` are initialized and the counters are tracked, but the loop
    body that fills them is not visible here.
    """
    with codecs.open('{}.txt'.format(name), 'r', 'utf-8') as infile:
        lines = infile.readlines()
    parsed_text = JaggedArray([[[]]])
    links = []
    chapter, mishnah, comment = -1, -1, -1
    for line in lines:
        if re.match(ur'@00\u05e4\u05e8\u05e7', line) is not None:
            # new chapter header; restart the comment counter
            chapter += 1
            comment = -1
            continue

        elif re.match(ur'@22', line) is not None:
            # mishnah number parsed from the gematria after @22 (0-based)
            mishnah = getGematria(re.match(ur'@22([\u05d0-\u05ea]{1,2})', line).group(1)) - 1
            comment = -1
            continue
Example #51
0
def parse_and_post(file_name):
    """Parse a Rabbeinu Yonah on Avot file and post the parsed text.

    @00 opens a new perek and @22 a new mishnah (gematria-numbered, with
    empty lists padding any skipped numbers); all other lines are split on
    '~' into individual comments.

    :param file_name: path to the @-tagged UTF-8 source file.
    :return: the parsed jagged array (also posted via post_the_text).
    """
    mishna_number_regex = regex.compile(u'([\u05d0-\u05ea]{1,3})')
    rb_yonah_on_avot, perek_level_list, mishna_level_list = [], [], []
    new_perek, first_perek = True, True
    last_mishna = 0
    with codecs.open(file_name, 'r', 'utf-8') as the_file:
        for each_line in the_file:

            if "@00" in each_line:
                if not first_perek:
                    # close the previous perek before starting the new one
                    perek_level_list.append(mishna_level_list)
                    rb_yonah_on_avot.append(perek_level_list)
                    perek_level_list, mishna_level_list = [], []
                    new_perek = True

                else:
                    first_perek = False

            elif "@22" in each_line:
                if not new_perek:
                    perek_level_list.append(mishna_level_list)
                    mishna_level_list = []

                    match_object = mishna_number_regex.search(each_line)
                    mishna_number = util.getGematria(match_object.group(1))
                    # pad with empty lists for skipped mishnah numbers
                    diff = mishna_number - last_mishna
                    while diff > 1:
                        perek_level_list.append([])
                        diff -= 1

                    last_mishna = mishna_number

                else:
                    # first @22 of a perek: nothing to close yet
                    new_perek = False
                    last_mishna = 1

            else:
                divided_string = each_line.split(u'~')
                for line in divided_string:
                    line = line.strip()
                    if line:
                        line = clean_up_string(line)
                        mishna_level_list.append(line)

        # flush the final perek and post the result
        rb_yonah_on_avot.append(perek_level_list)
        post_the_text(rb_yonah_on_avot)
    return rb_yonah_on_avot
def parse_and_post(file_name):
    """Parse a Rabbeinu Yonah on Avot file and post the parsed text.

    Duplicate of the preceding snippet: @00 opens a perek, @22 a mishnah
    (gematria-numbered, skipped numbers padded with empty lists), and all
    other lines split on '~' into comments.

    :param file_name: path to the @-tagged UTF-8 source file.
    :return: the parsed jagged array (also posted via post_the_text).
    """
    mishna_number_regex = regex.compile(u'([\u05d0-\u05ea]{1,3})')
    rb_yonah_on_avot, perek_level_list, mishna_level_list = [], [], []
    new_perek, first_perek = True, True
    last_mishna = 0
    with codecs.open(file_name, 'r', 'utf-8') as the_file:
        for each_line in the_file:

            if "@00" in each_line:
                if not first_perek:
                    # close the previous perek before starting the new one
                    perek_level_list.append(mishna_level_list)
                    rb_yonah_on_avot.append(perek_level_list)
                    perek_level_list, mishna_level_list = [], []
                    new_perek = True

                else:
                    first_perek = False

            elif "@22" in each_line:
                if not new_perek:
                    perek_level_list.append(mishna_level_list)
                    mishna_level_list = []

                    match_object = mishna_number_regex.search(each_line)
                    mishna_number = util.getGematria(match_object.group(1))
                    # pad with empty lists for skipped mishnah numbers
                    diff = mishna_number - last_mishna
                    while diff > 1:
                        perek_level_list.append([])
                        diff -= 1

                    last_mishna = mishna_number

                else:
                    # first @22 of a perek: nothing to close yet
                    new_perek = False
                    last_mishna = 1

            else:
                divided_string = each_line.split(u'~')
                for line in divided_string:
                    line = line.strip()
                    if line:
                        line = clean_up_string(line)
                        mishna_level_list.append(line)

        # flush the final perek and post the result
        rb_yonah_on_avot.append(perek_level_list)
        post_the_text(rb_yonah_on_avot)
    return rb_yonah_on_avot
Example #53
0
def parse():
    """Parse gra_on_avot.txt into a perek/mishnah/comment jagged array.

    @00 opens a new perek and @22 a new mishnah (gematria-numbered, with
    empty lists padding any skipped numbers); every other line is cleaned
    and appended to the current mishnah.

    :return: nested list: perek -> mishnah -> cleaned comment lines.
    """
    mishna_number_regex = regex.compile(u'([\u05d0-\u05ea]{1,3})')
    gra_on_avot, perek_level_list, mishna_level_list = [], [], []
    new_perek, first_perek = True, True
    last_mishna = 0
    with codecs.open('gra_on_avot.txt', 'r', 'utf-8') as the_file:
        for each_line in the_file:

            if "@00" in each_line:
                if not first_perek:
                    # close the previous perek before starting the new one
                    perek_level_list.append(mishna_level_list)
                    gra_on_avot.append(perek_level_list)
                    perek_level_list, mishna_level_list = [], []
                    new_perek = True

                else:
                    first_perek = False

            elif "@22" in each_line:
                if not new_perek:
                    perek_level_list.append(mishna_level_list)
                    mishna_level_list = []

                    match_object = mishna_number_regex.search(each_line)
                    mishna_number = util.getGematria(match_object.group(1))
                    # pad with empty lists for skipped mishnah numbers
                    diff = mishna_number - last_mishna
                    while diff > 1:
                        perek_level_list.append([])
                        diff -= 1

                    last_mishna = mishna_number

                else:
                    # first @22 of a perek: nothing to close yet
                    new_perek = False
                    last_mishna = 1

            else:
                each_line = clean_up_string(each_line)
                mishna_level_list.append(each_line)


        # flush the final perek
        gra_on_avot.append(perek_level_list)

    return gra_on_avot
Example #54
0
def parse():
    """Parse gra_on_avot.txt into a perek/mishnah/comment jagged array.

    Duplicate of the preceding snippet: @00 opens a perek, @22 a mishnah
    (gematria-numbered, skipped numbers padded with empty lists); other
    lines are cleaned and appended to the current mishnah.

    :return: nested list: perek -> mishnah -> cleaned comment lines.
    """
    mishna_number_regex = regex.compile(u'([\u05d0-\u05ea]{1,3})')
    gra_on_avot, perek_level_list, mishna_level_list = [], [], []
    new_perek, first_perek = True, True
    last_mishna = 0
    with codecs.open('gra_on_avot.txt', 'r', 'utf-8') as the_file:
        for each_line in the_file:

            if "@00" in each_line:
                if not first_perek:
                    # close the previous perek before starting the new one
                    perek_level_list.append(mishna_level_list)
                    gra_on_avot.append(perek_level_list)
                    perek_level_list, mishna_level_list = [], []
                    new_perek = True

                else:
                    first_perek = False

            elif "@22" in each_line:
                if not new_perek:
                    perek_level_list.append(mishna_level_list)
                    mishna_level_list = []

                    match_object = mishna_number_regex.search(each_line)
                    mishna_number = util.getGematria(match_object.group(1))
                    # pad with empty lists for skipped mishnah numbers
                    diff = mishna_number - last_mishna
                    while diff > 1:
                        perek_level_list.append([])
                        diff -= 1

                    last_mishna = mishna_number

                else:
                    # first @22 of a perek: nothing to close yet
                    new_perek = False
                    last_mishna = 1

            else:
                each_line = clean_up_string(each_line)
                mishna_level_list.append(each_line)

        # flush the final perek
        gra_on_avot.append(perek_level_list)

    return gra_on_avot
Example #55
0
def regs_devide(lines, regs, eof=None):
    """Divide `lines` into a two-level jagged array of simanim.

    A line matching `regs[0]` (whose group 1 is a gematria numeral) or the
    optional `eof` pattern closes the current letter; a numeral of 1 (or an
    eof hit) additionally closes the current siman. Empty simanim are
    discarded.

    :param lines: iterable of text lines.
    :param regs: list of patterns; only regs[0] is used here.
    :param eof: optional end-of-data marker pattern.
    :return: list of simanim, each a list of joined letter strings.
    """
    reg = regs[0]
    ja = []
    letter = []
    siman = []
    # BUG FIX: `gim` was unbound until the first reg match, so a leading
    # eof-only boundary raised NameError; initialize it explicitly.
    gim = None
    for line in lines:
        boundary = re.search(reg, line)  # hoisted: was evaluated three times
        at_eof = bool(eof and re.search(eof, line))
        if boundary or at_eof:
            siman.append(' '.join(letter))
            letter = []
            if boundary:
                gim = getGematria(boundary.group(1))
            if gim == 1 or at_eof:
                ja.append(siman)
                if siman == ['']:
                    ja.pop()
                siman = []
        letter.append(line)
    return ja
Example #56
0
    def _set_he_section_transitions(self):
        transition_list = []
        current_segment = 1

        for seg_num, segment in enumerate(self._hebrew_segments):
            match = re.match(u'^([\u05d0-\u05d8]|[\u05d9-\u05dc][\u05d0-\u05d8]?|\u05d8[\u05d5\u05d6])\.\s', segment)
            if not match:
                continue
            next_segment = getGematria(match.group(1))

            if next_segment == 1:
                pass
            elif next_segment - current_segment != 1:
                print "Bad hebrew section transition found in chapter {}".format(self.number)
                raise AssertionError
            else:
                transition_list.append(seg_num)
                current_segment = next_segment

        self._he_section_transitions = tuple(transition_list)