def set_current_perek_pasuk(self, text):
    """
    Update the current perek/pasuk from a reference string and record the reference.

    The text is matched (from its start) against a pattern made of an optional perek
    marker followed by a pasuk marker. On a match, the word following each marker is
    converted with getGematria and stored on the instance, and a
    "<sefer> <perek>:<pasuk>" string is appended to self.quotation_stack.

    :param text: string to inspect
    :return: True if the pattern matched and the state was updated, False otherwise
    """
    found = re.compile(u"(פרק .{1,8})?.{0,3}פסוק .{1,8}").match(text)
    if not found:
        return False

    words = text.split()

    # group 1 is the optional perek part of the pattern
    if found.group(1):
        perek_at = words.index(u"פרק")
        self.current_perek = getGematria(words[perek_at + 1])

    # NOTE: list.index raises ValueError when the word is missing; it never returns -1,
    # so the assertion below cannot fail on its own.
    pasuk_at = words.index(u"פסוק")
    assert pasuk_at != -1, "Assumed that pasuk info was here but there isn't any."
    self.current_pasuk = getGematria(words[pasuk_at + 1])

    reference = u"{} {}:{}".format(self.current_sefer, self.current_perek, self.current_pasuk)
    self.quotation_stack.append(reference)
    return True
def get_pasuk_chapter(en, he, curr_pasuk, curr_chapter):
    """
    Work out the pasuk number and the chapter it belongs to.

    :param en: pasuk number in a form accepted by int()
    :param he: pasuk number in a form accepted by getGematria
    :param curr_pasuk: pasuk number seen before this one
    :param curr_chapter: chapter number in effect before this pasuk
    :return: tuple (new pasuk number, chapter number); the chapter is advanced by one
             when the new pasuk number is lower than curr_pasuk
    :raises AssertionError: if the two forms of the pasuk number disagree
    """
    he_value = getGematria(he)
    new_pasuk = int(en)
    assert new_pasuk == he_value

    # a pasuk number that drops means a new chapter has started
    chapter = curr_chapter + 1 if new_pasuk < curr_pasuk else curr_chapter
    return new_pasuk, chapter
def __init__(self, filepath, regex):
    """
    Read a utf-8 file and split it into sections at every line matching regex.

    Each section is the concatenation of a matching line and the lines that follow it,
    up to the next matching line. Lines before the first match form a section of their
    own, which is not entered in the mapping. Results are stored as:
        self._sections         list of section strings, in file order
        self._section_mapping  dict of section number -> index into self._sections

    :param filepath: path of the file to read
    :param regex: pattern searched in every line; group 1 is passed to getGematria to
                  obtain the section number
    """
    with codecs.open(filepath, 'r', 'utf-8') as infile:
        lines = infile.readlines()

    sections = []
    section_mapping = {}
    pending_lines = []
    section_num = None
    section_index = 0

    for line in lines:
        header = re.search(regex, line)
        if header:
            if pending_lines:
                # a new header closes whatever was collected so far
                sections.append(u''.join(pending_lines))
                if section_num:
                    section_mapping[section_num] = section_index
                section_index += 1
                pending_lines = []
            section_num = getGematria(header.group(1))
        pending_lines.append(line)

    # Flush the last section. This step is unconditional, so when no line matched at all
    # the mapping ends up with the key None.
    sections.append(u''.join(pending_lines))
    section_mapping[section_num] = section_index

    self._sections = sections
    self._section_mapping = section_mapping
def determine_match(commentary_name, commentary_regex):
    """
    Compare the word following each commentary marker in the base text with the first
    word of the commentary segment the marker refers to, and report weak matches.

    The base text is loaded from '../../Even_HaEzer.xml'. For every marker found in a
    seif, both words are reduced to the characters \\u05d0-\\u05ea and scored with
    fuzz.ratio; scores below 75.0 are printed and counted.

    :param commentary_name: title used to build the commentary Ref
    :param commentary_regex: pattern locating the marker; it has to define a named
                             group 'ref', which is read for the comment number
    :return: Counter keyed by (word from base text, word from commentary) for every
             reported mismatch
    """
    full_pattern = u'{} (?>[@!/*][^ ]* )*(?P<dh>[^ ]+)'.format(commentary_regex)
    base_text = Root('../../Even_HaEzer.xml').get_base_text()
    error_counter = Counter()
    issues = 0

    for siman_idx, siman in enumerate(base_text.get_simanim()):
        siman_number = siman_idx + 1
        for seif_idx, seif in enumerate(siman.get_child()):
            for found in regex.finditer(full_pattern, unicode(seif)):
                comment_number = getGematria(found.group('ref'))
                c_ref = Ref(u'{} {}:{}'.format(commentary_name, siman_number, comment_number))

                try:
                    c_text = c_ref.text('he').text.split()[0]
                except IndexError:
                    # the commentary segment has no words to compare against
                    continue

                c_text = re.sub(u'[^\u05d0-\u05ea]', u'', c_text)
                dh_text = re.sub(u'[^\u05d0-\u05ea]', u'', found.group('dh'))

                ratio = fuzz.ratio(dh_text, c_text)
                if not ratio < 75.0:
                    continue

                issues += 1
                print(u"Potential mismatch:")
                print(u"Shulchan Arukh, Even HaEzer {}:{} {}".format(siman_number, seif_idx + 1, dh_text))
                print(u"{} {}".format(c_ref.normal(), c_text))
                print(u"Score: {}".format(ratio))
                error_counter[(dh_text, c_text)] += 1

    print(u"Total issues: {}".format(issues))
    return error_counter
def determine_match(commentary_name, commentary_regex):
    """
    Report commentary markers in the base text whose following word does not resemble
    the opening word of the referenced commentary segment.

    :param commentary_name: title used when building each commentary Ref
    :param commentary_regex: marker pattern; must contain a named group 'ref' holding
                             the comment number
    :return: Counter of (base text word, commentary word) pairs that scored below 75.0
             with fuzz.ratio
    """
    def hebrew_letters_only(value):
        # drop every character outside the range \u05d0-\u05ea
        return re.sub(u'[^\u05d0-\u05ea]', u'', value)

    issues = 0
    full_pattern = u'{} (?>[@!/*][^ ]* )*(?P<dh>[^ ]+)'.format(commentary_regex)
    full_mechaber = Root('../../Even_HaEzer.xml').get_base_text()
    error_counter = Counter()

    siman_num = 0
    for siman in full_mechaber.get_simanim():
        siman_num += 1
        seif_num = 0
        for seif in siman.get_child():
            seif_num += 1
            for marker in regex.finditer(full_pattern, unicode(seif)):
                c_ref = Ref(u'{} {}:{}'.format(
                    commentary_name, siman_num, getGematria(marker.group('ref'))))

                try:
                    opening_word = c_ref.text('he').text.split()[0]
                except IndexError:
                    # nothing to compare when the commentary text is empty
                    continue

                c_text = hebrew_letters_only(opening_word)
                dh_text = hebrew_letters_only(marker.group('dh'))
                ratio = fuzz.ratio(dh_text, c_text)

                if ratio < 75.0:
                    issues += 1
                    report = [
                        u"Potential mismatch:",
                        u"Shulchan Arukh, Even HaEzer {}:{} {}".format(siman_num, seif_num, dh_text),
                        u"{} {}".format(c_ref.normal(), c_text),
                        u"Score: {}".format(ratio),
                    ]
                    for report_line in report:
                        print(report_line)
                    error_counter[(dh_text, c_text)] += 1

    print(u"Total issues: {}".format(issues))
    return error_counter
def check_links(seif, pattern, commentary):
    """
    Work out which links between a seif and a commentary need to be added or removed.

    Every reference found in the seif with the given pattern is resolved to a commentary
    Ref, and the existing links from that Ref to "Shulchan Arukh, Orach Chayim" are
    inspected:
      * no link: a link to this seif is scheduled for addition
      * one link pointing at a different seif: it is scheduled for removal and a link
        to this seif is scheduled for addition
      * more than one link: an AssertionError is raised

    :param Seif seif: seif whose references are checked
    :param pattern: passed to seif.grab_references; group 1 of each result is the
                    comment number
    :param commentary: commentary title used to build the Refs
    :return: dict with keys 'add' (list of (seif ref, commentary ref) string pairs) and
             'remove' (list of existing link objects)
    :raises AssertionError: if a commentary Ref has more than one matching link
    """
    def link_to_this_seif(comment_ref):
        # pair of (base text ref, commentary ref) describing the wanted link
        base_ref = u'Shulchan Arukh, Orach Chayim {}:{}'.format(siman, seif.num)
        return (base_ref, comment_ref.normal())

    to_add = []
    to_remove = []
    siman = seif.get_parent().num

    for reference in seif.grab_references(pattern):
        comment_num = getGematria(reference.group(1))
        comment_ref = Ref(u'{} {}:{}'.format(commentary, siman, comment_num))
        comment_links = LinkSet(comment_ref).filter(u"Shulchan Arukh, Orach Chayim")
        link_count = len(comment_links)

        if link_count == 0:
            to_add.append(link_to_this_seif(comment_ref))
            continue

        if link_count != 1:
            raise AssertionError("{} has {} comments".format(comment_ref.normal(), len(comment_links)))

        existing = comment_links[0]
        linked_seif = existing.ref_opposite(comment_ref)
        if linked_seif.sections[-1] != seif.num:
            # the existing link points at another seif: replace it
            to_remove.append(existing)
            to_add.append(link_to_this_seif(comment_ref))

    return {'add': to_add, 'remove': to_remove}
def createStringForReference(everySelfReference, theSource):
    """
    Build a "<title> <daf><amud>" reference string.

    :param everySelfReference: indexable; item 1 is converted with functions.getGematria
                               to the daf number, item 2 selects the amud ('.' gives 'a',
                               anything else gives 'b')
    :param theSource: space separated string; word 2 (and word 3, when it is purely
                      alphabetic) supplies the title
    :return: unicode reference string
    """
    daf_number = functions.getGematria(everySelfReference[1])
    amud = 'a' if everySelfReference[2] == '.' else 'b'
    words = theSource.split(' ')

    # the title takes up two words when the fourth word is alphabetic
    if not words[3].isalpha():
        return u'{} {}{}'.format(words[2], daf_number, amud)
    return u'{} {} {}{}'.format(words[2], words[3], daf_number, amud)
def __init__(self, html, parasha, title, year, ref=None):
    """
    Store the sheet's source data and derive its year and referenced pesukim.

    :param html: stored unchanged as self.html
    :param parasha: object with a `haftarah` attribute, which is copied to self.haftarah
    :param title: stored unchanged as self.title
    :param year: string holding the Hebrew year; the prefix word is removed before the
                 remainder is converted with getGematria
    :param ref: passed to self.get_ref to obtain self.pesukim
    """
    self.html = html
    self.title = title
    self.parasha = parasha
    self.haftarah = parasha.haftarah

    # remove the literal prefix word and any surrounding whitespace
    stripped_year = re.sub(u"שנת", u"", year).strip()
    self.he_year = stripped_year
    self.year = getGematria(stripped_year) + 5000  # +1240, jewish year is more accurate

    self.sections = []
    self.pesukim = self.get_ref(ref)  # (re.sub(u"(פרק(ים)?|פסוק(ים)?)", u"", ref).strip())
    self.sheet_remark = u""
    # this will link to other nechama sheets (if referred).
    self.header_links = None
def parse_boaz(input_file):
    """
    Parse the input file into a nested array, positioned by chapter number.

    The file is read twice: once by file_to_ja to build the simple structure and, after
    rewinding, once by grab_section_names to collect the chapter headers.

    :param input_file: open, seekable file object
    :return: result of functions.convertDictToArray applied to the complex parse
    """
    expression = u'@00(?:\u05e4\u05e8\u05e7 |\u05e4")([\u05d0-\u05ea"]{1,3})'
    simple_parse = file_to_ja([[]], input_file, [expression], boaz_align)

    # rewind so the headers can be collected in a second pass
    input_file.seek(0)

    headers = []
    for section_name in grab_section_names(expression, input_file, 1):
        headers.append(functions.getGematria(section_name))

    comp_parse = simple_to_complex(headers, simple_parse.array())
    return functions.convertDictToArray(comp_parse)
def parse_boaz(input_file):
    """
    Build the full parse of the input file.

    :param input_file: open file object; it is consumed by file_to_ja, rewound with
                       seek(0) and then consumed again by grab_section_names
    :return: array produced by functions.convertDictToArray
    """
    header_pattern = u'@00(?:\u05e4\u05e8\u05e7 |\u05e4")([\u05d0-\u05ea"]{1,3})'

    # first pass: text split according to the header pattern
    simple_parse = file_to_ja([[]], input_file, [header_pattern], boaz_align)

    # second pass: numeric value of every header
    input_file.seek(0)
    section_names = grab_section_names(header_pattern, input_file, 1)
    header_numbers = [functions.getGematria(name) for name in section_names]

    parsed_sections = simple_parse.array()
    by_header = simple_to_complex(header_numbers, parsed_sections)
    return functions.convertDictToArray(by_header)
def parse_main():
    """
    Parse "main.txt" into per-daf containers for the full text and its commentaries.

    Blank lines are ignored. A line whose first word contains "40" is a page marker:
    it either advances to the next page (when it contains the second-side marker) or
    jumps to the page computed from the gematria of its second word. Every other line
    is stripped of line breaks and, when longer than two characters, run through
    parse() and handed to add_line().

    :return: tuple (full_text, text_dicts, map_full_text_to_commentary); each maps page
             numbers to lists, text_dicts doing so separately under the keys "gemara",
             "rashi" and "tosafot"
    :raises AssertionError: if a page header does not consist of exactly two words or
                            does not move forward
    """
    full_text = {}
    text_dicts = {"gemara": {}, "rashi": {}, "tosafot": {}}
    map_full_text_to_commentary = {}
    prev_dict = "gemara"
    daf = 0

    def start_daf(number):
        # every container gets a fresh empty list for the new page
        full_text[number] = []
        text_dicts["gemara"][number] = []
        text_dicts["tosafot"][number] = []
        text_dicts["rashi"][number] = []
        map_full_text_to_commentary[number] = []

    with open("main.txt") as f:
        for raw_line in f:
            if not raw_line.split():
                continue  # blank line

            line = raw_line.decode('utf-8')
            words = line.split()

            if words[0].find("40") >= 0:
                if u"""ע"ב""" in line:
                    daf += 1
                    start_daf(daf)
                elif u"דף" in line:
                    assert len(words) == 2
                    new_daf = getGematria(words[1]) * 2 - 1
                    assert new_daf > daf
                    daf = new_daf
                    start_daf(daf)
                continue

            line = line.replace("\r", "").replace("\n", "")
            if len(line) > 2:
                line = parse(line)
                prev_dict = add_line(line, daf, prev_dict, full_text, text_dicts,
                                     map_full_text_to_commentary)

    return full_text, text_dicts, map_full_text_to_commentary
def load_sheets(self):
    """
    Load and parse the html sheet of every entry in self.bereshit_parshiot.

    For each entry "<entry>.html" is read. Sheets whose 'contentTop' div contains the
    missing-page text are skipped. For the others the year, parsha name and perek
    information are extracted, the instance's current_* attributes are updated, and the
    parsed body is stored in self.sheets[parsha][year] as a tuple of
    (hebrew year text, sefer, perakim, parsed text). The entry is also recorded in
    self.parsha_and_year_to_url under "<parsha> <year>".

    :raises AssertionError: if a sheet for the same parsha and year was already stored
    """
    page_missing = u'דף שגיאות'

    for sheet_id in self.bereshit_parshiot:
        # BeautifulSoup parses the file while being constructed, so the handle can be
        # closed right away instead of being left open for every sheet.
        with open("{}.html".format(sheet_id)) as sheet_file:
            content = BeautifulSoup(sheet_file, "lxml")

        header = content.find('div', {'id': 'contentTop'})
        if page_missing in header.text:
            continue

        hebrew_year = content.find("div", {"id": "year"}).text.replace(u"שנת", u"")
        roman_year = getGematria(hebrew_year) + 1240
        parsha = content.find("div", {"id": "paging"}).text
        self.current_sefer, self.current_perakim = self.extract_perek_info(content)
        print("Sheet {}".format(sheet_id))
        text = content.find("div", {"id": "contentBody"})

        if parsha not in self.sheets:
            self.sheets[parsha] = {}
        assert roman_year not in self.sheets[parsha]

        self.parsha_and_year_to_url[parsha + " " + str(roman_year)] = sheet_id
        self.current_url = sheet_id
        self.current_perek = self.current_perakim[0]
        self.quotation_stack.append(u"{} {}".format(self.current_sefer, self.current_perek))
        self.sheets[parsha][roman_year] = (hebrew_year, self.current_sefer, self.current_perakim,
                                           self.parse_as_text(text))
def chapter_in_order(infile, tag, tag_reg, group=0):
    """
    Check that the chapters of every book run 1, 2, 3, ... and print those that do not.
    Nothing is returned; problems are only reported on standard output.

    :param infile: input file to examine
    :param tag: Exact form of tag
    :param tag_reg: A regular expression to use to find chapters
    :param group: Capture group for regex if necessary
    """
    tester = tests.TagTester(tag, infile, tag_reg)
    tester.skip_to_next_segment(u'@00')

    # one list of chapter numbers per '@00' segment
    all_chapters = []
    while not tester.eof:
        all_chapters.append(
            [functions.getGematria(title) for title in tester.grab_each_header(u'@00', group)])

    # a chapter is in order when its number equals its 1-based position in the book
    for book_number, book in enumerate(all_chapters, 1):
        for expected, chapter in enumerate(book, 1):
            if chapter != expected:
                print('error in {} chapter {}'.format(book_number, chapter))
def chapter_in_order(infile, tag, tag_reg, group=0):
    """
    Print every chapter whose number does not match its position within its book.
    The function reports through print only and has no return value.

    :param infile: input file to examine
    :param tag: Exact form of tag
    :param tag_reg: A regular expression to use to find chapters
    :param group: Capture group for regex if necessary
    """
    # collect the chapter headers of each book and convert them to numbers
    tester = tests.TagTester(tag, infile, tag_reg)
    tester.skip_to_next_segment(u'@00')
    books = []
    while not tester.eof:
        chapter_numbers = []
        for header_text in tester.grab_each_header(u'@00', group):
            chapter_numbers.append(functions.getGematria(header_text))
        books.append(chapter_numbers)

    # compare each chapter number with its zero-based index
    book_label = 0
    for book in books:
        book_label += 1
        position = 0
        for chapter in book:
            out_of_order = (chapter - position != 1)
            position += 1
            if out_of_order:
                print('error in {} chapter {}'.format(book_label, chapter))
def getSeifNumber(txt):
    """
    Return the numeric value of the second space-separated word of a seif header.

    :param txt: header text; it must contain the seif marker word
    :return: result of getGematria on the second word
    :raises AssertionError: if the marker word is not present in txt
    """
    assert u"סעיף" in txt
    words = txt.split(' ')
    return getGematria(words[1])
def extract_perek_info(self, content):
    """
    Read the sefer name and the perek numbers from the sheet's 'pasuk' paragraph.

    :param content: parsed document offering find(); the text of its <p id="pasuk">
                    element is examined
    :return: tuple (first word of that text, list of perek numbers obtained by running
             getGematria on every perek found in it)
    """
    perek_info = content.find("p", {"id": "pasuk"}).text
    sefer = perek_info.split()[0]

    perakim = []
    for perek in re.findall(u"פרק\s+(.*?)\s+", perek_info):
        perakim.append(getGematria(perek))

    return (sefer, perakim)