def set_current_perek_pasuk(self, text):
    """Parse a perek/pasuk header out of *text* and update parser state.

    Matches an optional "perek <value>" clause followed by a "pasuk <value>"
    clause, converts the Hebrew numerals via getGematria, and pushes the
    resulting "<sefer> <perek>:<pasuk>" reference onto self.quotation_stack.

    :param text: header line expected to contain pasuk (and maybe perek) info.
    :return: True if the header matched and state was updated, else False.
    """
    perek_pasuk = re.compile(u"(פרק .{1,8})?.{0,3}פסוק .{1,8}")
    match = perek_pasuk.match(text)
    if not match:
        return False
    words = text.split()  # tokenize once instead of re-splitting per lookup
    if match.group(1):  # optional leading perek clause was present
        self.current_perek = getGematria(words[words.index(u"פרק") + 1])
    # The original asserted `pasuk_pos != -1`, but list.index() raises
    # ValueError instead of returning -1, so that assert was dead code.
    # Check membership explicitly so the intended AssertionError fires.
    assert u"פסוק" in words, "Assumed that pasuk info was here but there isn't any."
    self.current_pasuk = getGematria(words[words.index(u"פסוק") + 1])
    self.quotation_stack.append(u"{} {}:{}".format(self.current_sefer, self.current_perek, self.current_pasuk))
    return True
Exemple #2
0
def get_pasuk_chapter(en, he, curr_pasuk, curr_chapter):
    """Return the new (pasuk, chapter) pair for the next verse.

    Verifies that the Arabic-numeral pasuk *en* agrees with the gematria of
    the Hebrew pasuk *he*; a pasuk number lower than the previous one
    signals that a new chapter has begun.
    """
    he_value = getGematria(he)
    pasuk = int(en)
    assert pasuk == he_value
    # A drop in pasuk number means we rolled over into the next chapter.
    chapter = curr_chapter + 1 if pasuk < curr_pasuk else curr_chapter
    return pasuk, chapter
Exemple #3
0
    def __init__(self, filepath, regex):
        """Read *filepath* (UTF-8) and split its lines into sections.

        A line matching *regex* starts a new section; the captured group is
        converted with getGematria and used to map that section number to
        its positional index.  Results land in self._sections (list of
        section texts) and self._section_mapping (number -> index).
        """
        with codecs.open(filepath, 'r', 'utf-8') as infile:
            lines = infile.readlines()

        sections, section_mapping = [], {}
        current_section, section_num, section_index = [], None, 0

        for line in lines:
            match = re.search(regex, line)
            if match:
                # A header line closes out the section accumulated so far.
                if len(current_section) > 0:
                    sections.append(u''.join(current_section))
                    if section_num:
                        section_mapping[section_num] = section_index
                    section_index += 1
                    current_section = []
                section_num = getGematria(match.group(1))

            current_section.append(line)
        else:
            # for/else with no `break` always runs: flush the final section.
            # NOTE(review): if no header ever matched, this maps key None.
            sections.append(u''.join(current_section))
            section_mapping[section_num] = section_index

        self._sections = sections
        self._section_mapping = section_mapping
def determine_match(commentary_name, commentary_regex):
    """Fuzzy-check commentary citations against their dibur hamatchil.

    For every seif of Shulchan Arukh, Even HaEzer, finds citation markers
    matching *commentary_regex*, resolves the cited comment, and compares
    its first word to the marker's dh text with fuzz.ratio.  Prints any
    pair scoring below 75 and returns a Counter of the mismatched pairs.
    (Python 2: print statements, `unicode`.)
    """
    issues = 0
    # NOTE(review): (?>...) is an atomic group -- needs the third-party
    # `regex` module (used below via regex.finditer), not stdlib `re`.
    full_pattern = u'{} (?>[@!/*][^ ]* )*(?P<dh>[^ ]+)'.format(commentary_regex)
    full_mechaber = Root('../../Even_HaEzer.xml').get_base_text()
    error_counter = Counter()

    for siman_num, siman in enumerate(full_mechaber.get_simanim()):
        for seif_num, seif in enumerate(siman.get_child()):
            matches = regex.finditer(full_pattern, unicode(seif))

            for regex_match in matches:
                c_ref = Ref(u'{} {}:{}'.format(commentary_name, siman_num+1, getGematria(regex_match.group('ref'))))
                try:
                    # First word of the cited comment; empty text -> skip.
                    c_text = c_ref.text('he').text.split()[0]
                except IndexError:
                    continue
                # Strip everything except Hebrew letters (U+05D0..U+05EA).
                c_text = re.sub(u'[^\u05d0-\u05ea]', u'', c_text)
                dh_text = re.sub(u'[^\u05d0-\u05ea]', u'',regex_match.group('dh'))

                ratio = fuzz.ratio(dh_text, c_text)

                if ratio < 75.0:
                    issues += 1
                    print u"Potential mismatch:"
                    print u"Shulchan Arukh, Even HaEzer {}:{}   {}".format(siman_num+1, seif_num+1, dh_text)
                    print u"{}   {}".format(c_ref.normal(), c_text)
                    print u"Score: {}".format(ratio)
                    error_counter[(dh_text, c_text)] += 1
    print u"Total issues: {}".format(issues)
    return error_counter
    def __init__(self, filepath, regex):
        """Read *filepath* (UTF-8) and split its lines into sections.

        A line matching *regex* opens a new section; the captured group is
        fed through getGematria to map each section number to its index.
        Stores self._sections and self._section_mapping.
        """
        with codecs.open(filepath, 'r', 'utf-8') as infile:
            all_lines = infile.readlines()

        sections, section_mapping = [], {}
        pending, section_num, section_index = [], None, 0

        for text_line in all_lines:
            header = re.search(regex, text_line)
            if header:
                # Header line: close out whatever was accumulated so far.
                if pending:
                    sections.append(u''.join(pending))
                    if section_num:
                        section_mapping[section_num] = section_index
                    section_index += 1
                    pending = []
                section_num = getGematria(header.group(1))
            pending.append(text_line)

        # Flush the last section (the original's for/else ran unconditionally).
        sections.append(u''.join(pending))
        section_mapping[section_num] = section_index

        self._sections = sections
        self._section_mapping = section_mapping
Exemple #6
0
def determine_match(commentary_name, commentary_regex):
    issues = 0
    full_pattern = u'{} (?>[@!/*][^ ]* )*(?P<dh>[^ ]+)'.format(
        commentary_regex)
    full_mechaber = Root('../../Even_HaEzer.xml').get_base_text()
    error_counter = Counter()

    for siman_num, siman in enumerate(full_mechaber.get_simanim()):
        for seif_num, seif in enumerate(siman.get_child()):
            matches = regex.finditer(full_pattern, unicode(seif))

            for regex_match in matches:
                c_ref = Ref(u'{} {}:{}'.format(
                    commentary_name, siman_num + 1,
                    getGematria(regex_match.group('ref'))))
                try:
                    c_text = c_ref.text('he').text.split()[0]
                except IndexError:
                    continue
                c_text = re.sub(u'[^\u05d0-\u05ea]', u'', c_text)
                dh_text = re.sub(u'[^\u05d0-\u05ea]', u'',
                                 regex_match.group('dh'))

                ratio = fuzz.ratio(dh_text, c_text)

                if ratio < 75.0:
                    issues += 1
                    print u"Potential mismatch:"
                    print u"Shulchan Arukh, Even HaEzer {}:{}   {}".format(
                        siman_num + 1, seif_num + 1, dh_text)
                    print u"{}   {}".format(c_ref.normal(), c_text)
                    print u"Score: {}".format(ratio)
                    error_counter[(dh_text, c_text)] += 1
    print u"Total issues: {}".format(issues)
    return error_counter
def check_links(seif, pattern, commentary):
    """Audit existing links from *seif* to *commentary* comments.

    :param Seif seif: seif whose reference markers are examined.
    :param pattern: regex used to grab reference markers from the seif.
    :param commentary: commentary title used to build comment Refs.
    :return: dict with 'add' (missing/corrected link pairs) and 'remove'
        (links pointing at the wrong seif).
    :raises AssertionError: if a comment already has multiple links.
    """
    add, remove = [], []
    siman = seif.get_parent().num
    for ref_match in seif.grab_references(pattern):
        comment_num = getGematria(ref_match.group(1))
        comment_ref = Ref(u'{} {}:{}'.format(commentary, siman, comment_num))
        comment_links = LinkSet(comment_ref).filter(u"Shulchan Arukh, Orach Chayim")
        link_total = len(comment_links)

        if link_total > 1:
            raise AssertionError("{} has {} comments".format(comment_ref.normal(), link_total))
        if link_total == 1:
            prod_seif = comment_links[0].ref_opposite(comment_ref)
            if prod_seif.sections[-1] == seif.num:
                continue  # existing link already points at the right seif
            remove.append(comment_links[0])
        # Either no link existed, or the wrong one was queued for removal.
        add.append((u'Shulchan Arukh, Orach Chayim {}:{}'.format(siman, seif.num), comment_ref.normal()))
    return {'add': add, 'remove': remove}
Exemple #8
0
def createStringForReference(everySelfReference, theSource):
    """Build a normalized daf reference string from a self-reference match.

    Converts the Hebrew daf numeral (group 1) via gematria, maps '.' to
    amud 'a' and anything else to 'b', and prefixes the tractate name
    taken from *theSource* (two words when the fourth token is alphabetic).
    """
    daf = functions.getGematria(everySelfReference[1])
    amud = 'a' if everySelfReference[2] == '.' else 'b'
    parts = theSource.split(' ')
    if parts[3].isalpha():
        # Two-word tractate name (e.g. "Bava Batra").
        return u'{} {} {}{}'.format(parts[2], parts[3], daf, amud)
    return u'{} {}{}'.format(parts[2], daf, amud)
Exemple #9
0
 def __init__(self, html, parasha, title, year, ref=None):
     """Initialize a sheet wrapper from scraped HTML and metadata.

     :param html: raw HTML of the sheet.
     :param parasha: parasha object; its haftarah is cached here.
     :param title: sheet title.
     :param year: Hebrew year string, possibly prefixed with "shnat".
     :param ref: optional ref string handed to self.get_ref.
     """
     self.html = html
     self.title = title
     self.parasha = parasha
     self.haftarah = parasha.haftarah
     # Strip the Hebrew word for "year of" to leave just the year numeral.
     self.he_year = re.sub(u"שנת", u"", year).strip()
     # Gematria of the short year + 5000 gives the full Hebrew year
     # (e.g. 778 -> 5778).  The old "+1240" note referred to converting
     # to a Gregorian year instead -- TODO confirm which callers expect.
     self.year = getGematria(self.he_year)+5000  # +1240, jewish year is more accurate
     self.sections = []
     self.pesukim = self.get_ref(ref)  # (re.sub(u"(פרק(ים)?|פסוק(ים)?)", u"", ref).strip())
     self.sheet_remark = u""
     self.header_links = None  # this will link to other nechama sheets (if referred)
def parse_boaz(input_file):
    """Parse a Boaz commentary file into a chapter-keyed array.

    Runs a simple JaggedArray parse keyed on @00 chapter tags, then
    re-reads the file to grab each chapter's Hebrew numeral header and
    re-keys the parse by its gematria value.
    """

    # Matches "@00perek <heb>" or '@00p"<heb>' and captures the numeral.
    expression = u'@00(?:\u05e4\u05e8\u05e7 |\u05e4")([\u05d0-\u05ea"]{1,3})'

    simple_parse = file_to_ja([[]], input_file, [expression], boaz_align)

    # reset file so the header scan below starts from the beginning
    input_file.seek(0)

    headers = [functions.getGematria(x) for x in grab_section_names(expression, input_file, 1)]

    comp_parse = simple_to_complex(headers, simple_parse.array())

    full_parse = functions.convertDictToArray(comp_parse)

    return full_parse
Exemple #11
0
def parse_boaz(input_file):
    """Parse a Boaz commentary file into a chapter-keyed array.

    First builds a simple JaggedArray parse keyed on @00 chapter tags,
    then rescans the file for the Hebrew chapter numerals and re-keys
    the parse by their gematria values.
    """
    # Matches "@00perek <heb>" or '@00p"<heb>' and captures the numeral.
    chapter_pattern = u'@00(?:\u05e4\u05e8\u05e7 |\u05e4")([\u05d0-\u05ea"]{1,3})'

    ja = file_to_ja([[]], input_file, [chapter_pattern], boaz_align)

    input_file.seek(0)  # rewind so the header scan starts from the top

    section_names = grab_section_names(chapter_pattern, input_file, 1)
    headers = [functions.getGematria(name) for name in section_names]

    return functions.convertDictToArray(simple_to_complex(headers, ja.array()))
Exemple #12
0
def parse_main():
    """Parse main.txt into per-daf gemara/rashi/tosafot text buckets.

    Returns (full_text, text_dicts, map_full_text_to_commentary), each a
    dict keyed by a running amud counter (`daf` here counts amudim: a new
    "daf X" header maps to X*2-1, and an amud-bet marker bumps it by one).
    Python 2 code (`line.decode`).
    """
    with open("main.txt") as f:
        # Every run of digits seen; collected but never returned --
        # presumably a leftover debugging aid. TODO confirm.
        numbers = set()
        daf = 0
        full_text = {}
        text_dicts = {}
        text_dicts["gemara"] = {}
        text_dicts["rashi"] = {}
        text_dicts["tosafot"] = {}
        prev_dict = "gemara"
        map_full_text_to_commentary = {}
        # Drop whitespace-only lines up front.
        lines = [line for line in list(f) if line.split()]
        for line_n, line in enumerate(lines):
            line = line.decode('utf-8')
            for num in re.findall(u"\d+", line):
                numbers.add(num)
            # NOTE(review): "40" in the first token appears to mark header
            # lines in this export format -- confirm against the source file.
            if line.split()[0].find("40") >= 0:
                if u"""ע"ב""" in line:
                    # Amud-bet marker: advance to the next amud.
                    daf += 1
                    full_text[daf] = []
                    text_dicts["gemara"][daf] = []
                    text_dicts["tosafot"][daf] = []
                    text_dicts["rashi"][daf] = []
                    map_full_text_to_commentary[daf] = []
                elif u"דף" in line:
                    # "daf <heb>" header: jump to amud-aleph of that daf.
                    assert len(line.split()) == 2
                    new_daf = getGematria(line.split()[1]) * 2 - 1
                    assert new_daf > daf
                    daf = new_daf
                    full_text[daf] = []
                    text_dicts["gemara"][daf] = []
                    text_dicts["tosafot"][daf] = []
                    text_dicts["rashi"][daf] = []
                    map_full_text_to_commentary[daf] = []
            else:
                # Content line: strip line endings and hand off to the parser.
                line = line.replace("\r", "").replace("\n", "")
                if len(line) > 2:
                    line = parse(line)
                    prev_dict = add_line(line, daf, prev_dict, full_text,
                                         text_dicts,
                                         map_full_text_to_commentary)
    return full_text, text_dicts, map_full_text_to_commentary
Exemple #13
0
def parse_main():
    """Parse main.txt into per-amud gemara/rashi/tosafot text buckets.

    Returns (full_text, text_dicts, map_full_text_to_commentary), each
    keyed by a running amud counter: "daf X" headers map to X*2-1 and an
    amud-bet marker increments by one.  Python 2 code (`line.decode`).
    """
    with open("main.txt") as f:
        seen_numbers = set()  # collected but unused beyond accumulation
        amud = 0
        full_text = {}
        text_dicts = {key: {} for key in ("gemara", "rashi", "tosafot")}
        prev_dict = "gemara"
        map_full_text_to_commentary = {}

        def _open_amud(n):
            # Start empty buckets for a newly encountered amud.
            full_text[n] = []
            text_dicts["gemara"][n] = []
            text_dicts["tosafot"][n] = []
            text_dicts["rashi"][n] = []
            map_full_text_to_commentary[n] = []

        content = [l for l in list(f) if l.split()]
        for idx, raw in enumerate(content):
            raw = raw.decode('utf-8')
            for digits in re.findall(u"\d+", raw):
                seen_numbers.add(digits)
            if raw.split()[0].find("40") >= 0:
                # Header line (first token contains the "40" marker code).
                if u"""ע"ב""" in raw:
                    amud += 1
                    _open_amud(amud)
                elif u"דף" in raw:
                    assert len(raw.split()) == 2
                    next_amud = getGematria(raw.split()[1]) * 2 - 1
                    assert next_amud > amud
                    amud = next_amud
                    _open_amud(amud)
            else:
                cleaned = raw.replace("\r", "").replace("\n", "")
                if len(cleaned) > 2:
                    prev_dict = add_line(parse(cleaned), amud, prev_dict,
                                         full_text, text_dicts,
                                         map_full_text_to_commentary)
    return full_text, text_dicts, map_full_text_to_commentary
Exemple #14
0
 def load_sheets(self):
     """Load each locally saved sheet HTML file and index it by parsha/year.

     Skips error pages, derives the Gregorian year from the Hebrew year's
     gematria (+1240), and stores the parsed sheet plus perek metadata in
     self.sheets[parsha][year].  Python 2 code (print statement).
     """
     # Hebrew marker text shown on the site's error/"page missing" page.
     page_missing = u'דף שגיאות'
     for i in self.bereshit_parshiot:
         content = BeautifulSoup(open("{}.html".format(i)), "lxml")
         header = content.find('div', {'id': 'contentTop'})
         if page_missing in header.text:
             continue
         # Strip the "shnat" prefix, then convert gematria -> Gregorian year.
         hebrew_year = content.find("div", {"id": "year"}).text.replace(u"שנת", u"")
         roman_year = getGematria(hebrew_year) + 1240
         parsha = content.find("div", {"id": "paging"}).text
         self.current_sefer, self.current_perakim = self.extract_perek_info(content)
         print "Sheet {}".format(i)
         text = content.find("div", {"id": "contentBody"})
         if parsha not in self.sheets:
             self.sheets[parsha] = {}
         # Each (parsha, year) pair must be unique across the corpus.
         assert roman_year not in self.sheets[parsha].keys()
         self.parsha_and_year_to_url[parsha+" "+str(roman_year)] = i
         self.current_url = i
         self.current_perek = self.current_perakim[0]
         self.quotation_stack.append(u"{} {}".format(self.current_sefer, self.current_perek))
         self.sheets[parsha][roman_year] = (hebrew_year, self.current_sefer, self.current_perakim, self.parse_as_text(text))
         pass
def chapter_in_order(infile, tag, tag_reg, group=0):
    """
    Check that the chapters run in order
    :param infile: input file to examine
    :param tag: Exact form of tag
    :param tag_reg: A regular expression to use to find chapters
    :param group: Capture group for regex if necessary
    :return: None. Out-of-order chapters are printed, not returned.
    """

    # grab all chapter headers and convert to numbers
    tester = tests.TagTester(tag, infile, tag_reg)
    tester.skip_to_next_segment(u'@00')
    all_chapters = []
    # Each pass of grab_each_header consumes one book's worth of headers;
    # the loop ends when the tester reaches end of file.
    while not tester.eof:
        titles = tester.grab_each_header(u'@00', group)
        chap_numbers = [functions.getGematria(txt) for txt in titles]
        all_chapters.append(chap_numbers)

    # check that chapters match index (chapter N should sit at index N-1)
    for book_num, book in enumerate(all_chapters):
        for index, chapter in enumerate(book):
            if chapter - index != 1:
                print 'error in {} chapter {}'.format(book_num+1, chapter)
Exemple #16
0
def chapter_in_order(infile, tag, tag_reg, group=0):
    """
    Check that the chapters run in order
    :param infile: input file to examine
    :param tag: Exact form of tag
    :param tag_reg: A regular expression to use to find chapters
    :param group: Capture group for regex if necessary
    :return: A list of lines where order is broken
    """

    # grab all chapter headers and convert to numbers
    tester = tests.TagTester(tag, infile, tag_reg)
    tester.skip_to_next_segment(u'@00')
    all_chapters = []
    while not tester.eof:
        titles = tester.grab_each_header(u'@00', group)
        chap_numbers = [functions.getGematria(txt) for txt in titles]
        all_chapters.append(chap_numbers)

    # check that chapters match index
    for book_num, book in enumerate(all_chapters):
        for index, chapter in enumerate(book):
            if chapter - index != 1:
                print 'error in {} chapter {}'.format(book_num + 1, chapter)
Exemple #17
0
def getSeifNumber(txt):
    """Return the gematria value of the numeral in a "seif <heb>" header."""
    assert u"סעיף" in txt
    return getGematria(txt.split(' ')[1])
Exemple #18
0
 def extract_perek_info(self, content):
     """Extract the sefer name and perek numbers from a sheet's header.

     :param content: BeautifulSoup document; its <p id="pasuk"> element is
         presumed to read "<sefer> perek <heb> [perek <heb> ...]".
     :return: (sefer, [perek numbers]) with perakim converted via gematria.
     """
     perek_info = content.find("p", {"id": "pasuk"}).text
     # First word of the header is the sefer name.
     sefer = perek_info.split()[0]
     # Grab every numeral that follows the word "perek".
     pereks = re.findall(u"פרק\s+(.*?)\s+", perek_info)
     return (sefer, [getGematria(perek) for perek in pereks])