def parse_Raph_simanim(aligned_list):
    '''
    note: although there is (not often) a differentiation in the original txt file,
    raph letters can be divided into smaller segments. In this code we combine those
    segments, returning every raph letter as a line.
    '''
    ja = []
    siman = []
    i = 1
    prev_siman = u'א'
    for obj in aligned_list:
        if obj['siman'] == prev_siman:
            siman.append(obj['raph'])
            continue
        else:
            ja.append(siman)
            # pad empty simanim until the gematria of the new siman is reached
            while getGematria(obj['siman']) != (getGematria(prev_siman) + i):
                ja.append([])
                i += 1
            i = 1
            siman = []
            siman.append(obj['raph'])
            prev_siman = obj['siman']
    ja.append(siman)
    ja_to_xml(ja, ['siman', 'letter'], 'raph_simanim.xml')
    return ja
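# Hypothetical usage sketch (not from the source): parse_Raph_simanim expects a list of
# dicts, each holding a siman letter ('siman') and the raph letter text aligned to it
# ('raph'); simanim with no raph are padded with empty lists. The values below are
# illustrative only.
def _example_parse_raph_simanim():  # illustrative only
    example_aligned = [
        {'siman': u'א', 'raph': u'first raph letter of siman alef'},
        {'siman': u'א', 'raph': u'second raph letter of siman alef'},
        {'siman': u'ג', 'raph': u'siman bet has no raph, so an empty list is padded'},
    ]
    return parse_Raph_simanim(example_aligned)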
def fifty_parse(lines, replace_dict):
    # start the parsing of the 'fifty' part
    arr = []
    perek = []
    peska = []
    new_lines = []
    for line in lines:
        line = split_lines(line)
        new_lines.extend(line)
    for line in new_lines:
        if line.find(ur'@05') != -1:
            if perek:
                perek.append(peska)
                peska = []
                arr.append(perek)
                perek = []
        else:
            if (line.find(u'@13') != -1) and peska:
                perek.append(peska)
                peska = []
            line = multiple_replace(line, replace_dict, using_regex=True)
            peska.append(line)
    perek.append(peska)
    arr.append(perek)
    ja_to_xml(arr, ['perek', 'piska', 'break'], 'raavad_50.xml')
    return arr
def parse_he(filename):
    """
    :returns a dictionary, key: name of book, value: JaggedArray obj of the ja for the book
    """
    replace_dict = {
        u'@(11|44|99)': u'<b>',
        u'@(33|55)': u'</b>',
        ur'@22\(([\u05d0-\u05ea]{1,3})\)': u'',
        ur'@(22|77)': u''
    }

    def cleaner(my_text):
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [
        ur'@00(?P<gim>)',
        ur'@02(?P<gim>[\u05d0-\u05ea]{1,3})',
        ur'@22\((?P<gim>[\u05d0-\u05ea]{1,3})\)'
    ]  # ,ur'@77'
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean out all Parasha lines that start with @01
    cleaned = []
    dh_list = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
        if starting and not re.search(u'@01', line) and not line.isspace():
            dh_recognize = re.compile(ur'@11(.*?)@33')
            if dh_recognize.search(line):
                dh_list.append(dh_recognize.search(line).group(1))
                line = re.sub(dh_recognize, ur'#<b>\1</b>', line)
                line = re.split(ur'#', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                cleaned.extend(line)
    tt_ja = file_to_ja_g(4, cleaned, regs, cleaner, gimatria=True, group_name='gim',
                         grab_all=[False, False, False]).array()
    Pentateuch = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']
    parsed_texts = dict({book: ja for book, ja in zip(Pentateuch, tt_ja)})
    for book, ja in zip(Pentateuch, tt_ja):
        ja_to_xml(ja, ['perek', 'pasuk', 'comment'], 'tur_{}.xml'.format(book))
    # for str in dh_list:
    #     print str
    return parsed_texts
def parse_hagahot_by_letter(filename):
    def cleaner(my_text):
        replace_dict = {u'@11\([\u05d0-\u05ea]{0,3}\)': u'', u'@77': u''}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11\((?P<gim>[\u05d0-\u05ea]{1,3})\)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean out all lines that start with @00
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'\$', line) and not line.isspace():
            line = re.split(u'(@11\([\u05d0-\u05ea]{0,3}\))', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    try:
        ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True,
                          grab_all=[True, True], group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'
    new_ja = regs_devide(cleaned, regs, u'(נשלם מלאכת שבעת הימים)')
    ja_to_xml(new_ja, ['siman', 'letter', 'segments'], 'hagahot_letters.xml')
    return new_ja
def raavad_perush_parse(lines, replace_dict):
    # start the parsing of the Raavad text itself
    arr = []
    first_p = True
    first_m = True
    first_d = True
    perek = []
    mishna = []
    dibur = []
    for line in lines:
        if line.find(u'@00') != -1:
            # perek
            if first_p:
                first_p = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                arr.append(perek)
                perek = []
            first_m = True  # since this is opening a new perek
            first_d = True
        elif line.find(u'@22') != -1:
            # mishna
            # note: this parsing assumes there is no text on the same line as @22 and @00
            if first_m:
                first_m = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
            first_d = True  # since this is opening a new mishna
        else:
            # this line is going to be part of the dibur
            # Dibur Hamatchil
            if re.search(u'@(31|98)', line) and (not first_d):
                # probably start a new dibur
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
            else:
                if first_d:
                    first_d = False
            # segment ocr tag fixing
            line = multiple_replace(line, replace_dict, using_regex=True)
            dibur.append(line)
    dibur = ' '.join(dibur)
    mishna.append(dibur)
    perek.append(mishna)
    arr.append(perek)
    ja_to_xml(arr, ['perek', 'mishna', 'dibur'], 'raavad_text.xml')
    return arr
def parse_raph(filename, smk_ja):
    '''
    :param filename: raph source txt file
    :param smk_ja: JA obj smk parsed [siman, segment]
    :return: JA obj parsed [siman, letter]; some simanim will be empty
    '''
    def cleaner(my_text):
        replace_dict = {u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'', u'@(33|22)': u''}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11(?P<gim>[\u05d0-\u05ea]{1,3})']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean out all lines that start with @00
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line) and not line.isspace():
            line = re.split(u'(@11[\u05d0-\u05ea]{0,3})', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    try:
        ja = file_to_ja(2, cleaned, regs, cleaner, grab_all=False).array()
    except AttributeError:
        print 'there are more regs than levels...'
    ja_to_xml(ja, ['letter', 'segments'], 'raph_letters.xml')

    # align the raph letters to the smk simanim, following the @55 markers in the smk text
    d1 = 0
    aligned = []
    siman = []
    segment = []
    for letter in smk_ja.array():
        for seg in letter:
            for ff in re.finditer(u'@55[\u05d0-\u05ea]{0,3}', seg):
                # segment.append(ja[d1])
                siman.append(ja[d1])
                d1 += 1
        if segment != []:
            siman.extend(segment)  # rather than append
            # segment = []
        aligned.append(siman)
        siman = []
    ja_to_xml(aligned, ['siman', 'letter', 'segment'], 'raph_simanim_24.xml')
    return JaggedArray(aligned)
def test_jagged_array_to_xml():
    xml_buffer = StringIO()
    util.ja_to_xml(['foo'], ['foo'], xml_buffer)
    xml_buffer.seek(0)
    assert xml_buffer.read() == '<root><foo index="1">foo</foo></root>'
    with pytest.raises(TypeError):
        util.ja_to_xml(['foo', {'bar'}], ['foo'], xml_buffer)
    xml_buffer.close()
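# A minimal sketch (an assumption, not the actual data_utilities.util implementation) of
# what ja_to_xml is taken to do throughout these scripts: walk a jagged (nested) list,
# wrap depth i in the i-th tag name with a 1-based index attribute, and write the result
# either to a file path or to an open file-like object (as the test above does with a
# StringIO). Names here are illustrative only.
import codecs

def ja_to_xml_sketch(ja, depth_names, out):
    def node(element, names, index):
        tag = names[0]
        if isinstance(element, basestring):
            return u'<{0} index="{1}">{2}</{0}>'.format(tag, index, element)
        if isinstance(element, list):
            inner = u''.join(node(child, names[1:] or names, i + 1)
                             for i, child in enumerate(element))
            return u'<{0} index="{1}">{2}</{0}>'.format(tag, index, inner)
        raise TypeError('cannot serialize element of type {}'.format(type(element)))

    xml = u'<root>{}</root>'.format(
        u''.join(node(child, depth_names, i + 1) for i, child in enumerate(ja)))
    if hasattr(out, 'write'):
        out.write(xml)
    else:
        with codecs.open(out, 'w', 'utf-8') as fp:
            fp.write(xml)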
def raavad_parse():
    with codecs.open('yitzira_raavad.txt', 'r', 'utf-8') as fp:
        lines = fp.readlines()
    # init JAs and their depths
    ja_sp = []  # JA starting points in the txt
    # dictionary for line ocr tag fixing
    replace_dict = {
        u'@(44|13|31|41)': u'<b>', u'@(45|14|32|42)': u'</b>',  # bold in text
        u'@(03|04|10|11|98|99|56)': u'',
        # u'@55': ur'<img src = " " height = "100" width = "100">',  # image tag
        ur'(\*\[.*?\])': ur'<small>\1</small>'  # notes in the text
    }
    # check if we got to the end of the legend and change to started
    startJA = None
    for line_num, line in enumerate(lines):
        if line == u'\n':
            startJA = line_num + 1  # ignoring the book name from text
            ja_sp.append(startJA)
    ja_fifty = fifty_parse(lines[ja_sp[0] + 1:ja_sp[1]], replace_dict)
    ja_32_hakdama_n = threty_two_parse(lines[ja_sp[1]:ja_sp[2]], replace_dict, 'hakdama_n')
    ja_32_netivot = threty_two_parse(lines[ja_sp[2]:ja_sp[3]], replace_dict, 'netivot')
    ja_32_hakdama_p = threty_two_parse(lines[ja_sp[3]:ja_sp[4]], replace_dict, 'hakdama_p')
    ja_32_perush = threty_two_parse(lines[ja_sp[4]:ja_sp[5]], replace_dict, 'perush')
    ja_old_parse = raavad_perush_parse(lines[ja_sp[5]:], replace_dict)
    ja_raavad_perush = raavad_new_parse(ja_old_parse)
    # not nice fixing of segments into break tags
    ja_32_netivot[31] = ja_32_netivot[31] + '<br><small>' + ja_32_netivot[32] + '</small>'
    ja_32_netivot = ja_32_netivot[:32]
    ja_32_perush[0] = ja_32_hakdama_p[0] + '<br>' + ja_32_hakdama_p[1] + '<br><br>' + ja_32_perush[0]
    ja_to_xml(ja_32_perush, ['netiv'], 'test.xml')
    return {
        'Raavad on Sefer Yetzirah': ja_raavad_perush,
        'Raavad on Sefer Yetzirah, Introduction, The Fifty Gates of Understanding': ja_fifty,
        'Raavad on Sefer Yetzirah, Introduction, The Thirty Two Paths of Wisdom, Introduction': ja_32_hakdama_n,
        'Raavad on Sefer Yetzirah, Introduction, The Thirty Two Paths of Wisdom': ja_32_netivot,
        'Raavad on Sefer Yetzirah, Introduction, The Thirty Two Paths of Wisdom, The Thirty Two Paths Explained': ja_32_perush,
        'old_parsing_of_perush': ja_old_parse  # outputting the old parse since it is used for the linking, and linking is an outer function
    }
def parse_semak(filename):
    def cleaner(my_text):
        replace_dict = {
            u'@11(.*?)@12': ur'<b>\1</b>',
            u'@33(.*?)@34': ur'<b>\1</b>',
            u'@66(.*?)@67': ur'\1',
            u"@44": u""
        }
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@22(?P<gim>[\u05d0-\u05ea]{1,3})']  # , u'@(11|23|33)(?P<gim>)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean out all lines that start with @00
    cleaned = []
    letter_section = []
    alt_day = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if re.search(u'@00', line):
            alt_day.append(line_num)
        if not re.search(u'@00', line) and not line.isspace():
            if re.search(u'@22', line):
                line = re.split(u'(@22[\u05d0-\u05ea]{1,3})', line)
                if isinstance(line, basestring):
                    cleaned.append(line)
                else:
                    [cleaned.append(st) for st in line if st]
            else:
                cleaned.append(line)
    alt_day.append(len(lines))
    print alt_day
    try:
        smk_ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True,
                              grab_all=[False, True, True], group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'
    ja_to_xml(smk_ja, ['letter', 'segments'], 'smk.xml')
    return smk_ja
def parse_general(filename):
    def cleaner(my_text):
        return my_text

    regs = [u'@22 ?(\u05e4\u05e8\u05e7)? ?(?P<gim>[\u05d0-\u05ea]{1,3})',
            u'\{(?P<gim>[\u05d0-\u05ea]{1,3})\}']
    with codecs.open(filename, 'r', 'utf-8') as infile:
        bs_ja = file_to_ja_g(3, infile, regs, cleaner).array()
    print bs_ja
    ja_to_xml(bs_ja, ['perek', 'pasuk', 'comment'],
              '{}xml'.format(re.search(u'.*\.', filename).group()))
    return bs_ja
def raavad_new_parse(ja):
    newJa, m1, p1 = [], [], []
    for p in ja:
        for m in p:
            d1 = [split_lines(d) for d in m]
            p1.extend(d1)
        newJa.append(p1)
        p1 = []
    ja_to_xml(newJa, ['perek', 'dibur', 'paragraph'], 'new_parse.xml')
    return newJa
def basic_test_suite():
    root = DCXMLsubs.parse("XML/tractate-avot_drabi_natan-xml.xml", silence=True)
    basetext = root.getBaseTextArray()
    ja_to_xml(basetext, ['Section', 'Segment'], 'base_text.xml')
    # root.review_commentaries()
    # root.check_commentary_chapters()
    comms = root.body.commentaries
    for c in comms.get_commentary():
        if comms.is_linked_commentary(c) and c.get_author() != 'UNKNOWN':
            parsed = c.parse_linked()
            ja_to_xml(parsed, ['Chapter', 'Verse', 'Comment'], 'commentary.xml')
            break
def parse_Raph(filename):
    def cleaner(my_text):
        replace_dict = {u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'', u'@33': u''}
        # {u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@66(.*?)@67': ur'\1'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@77(?P<gim>[\u05d0-\u05ea]{0,3})',
            ur'@11(?P<gim>[\u05d0-\u05ea]{1,3})']  # (?P<gim>[\u05d0-\u05ea]{1,3})
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean out all lines that start with @00
    cleaned = []
    letter_section = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line) and not line.isspace():
            line = re.split(u'(@(?:77|11)[\u05d0-\u05ea]{0,3})', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]  # (st and not re.search(u'@(77)', st))]
        # else:
        #     cleaned.append(line)
    try:
        ja = file_to_ja_g(3, cleaned, regs, cleaner, gimatria=True,
                          grab_all=[False, False, True], group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'
    ja_to_xml(ja, ['page', 'letter', 'segments'], 'raph.xml')
    return ja
def parse_smk(filename):
    '''
    :param filename: smk source txt file
    :return: JA obj smk parsed to depth 2 [siman, segment]
             (including a citation segment at the top of each siman)
    '''
    def cleaner(my_text):
        replace_dict = {u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>',
                        u'@66(.*?)@67': ur'\1'}  # , u'@55[\u05d0-\u05ea]{1,3}': u'<i-tags = >'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@22(?P<gim>[\u05d0-\u05ea]{1,3})']  # , u'@(11|23|33)(?P<gim>)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean out all lines that start with @00
    cleaned = []
    letter_section = []
    alt_day = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if re.search(u'@00', line):
            alt_day.append(line_num)
        if not re.search(u'@00', line) and not line.isspace():
            if re.search(u'@22', line):
                line = re.split(u'(@22[\u05d0-\u05ea]{1,3})', line)
                if isinstance(line, basestring):
                    cleaned.append(line)
                else:
                    [cleaned.append(st) for st in line if st]
            else:
                cleaned.append(line)
    alt_day.append(len(lines))
    print alt_day
    try:
        smk_ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True,
                              grab_all=[False, True, True], group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'
    ja_to_xml(smk_ja, ['letter', 'segments'], 'smk.xml')
    return JaggedArray(smk_ja)
def parse_hagahot(filename, smk_ja, raph_ja):
    '''
    :param filename: hagahot source txt file
    :param smk_ja: smk JA obj [siman, segment]
    :param raph_ja: raph JA obj [siman, letter]
    :return: JA obj
    '''
    ja_hagahot = []

    def cleaner(my_text):
        # todo: deal with @44 and @00 (@00 maybe should be only in smk base text? - ask Shmuel)
        replace_dict = {u'@11\([\u05d0-\u05ea]{0,3}\)': u'', u'@(33|77|88|99)': u'',
                        u'@55(.*?)@66': ur'<b>\1</b>'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11\((?P<gim>[\u05d0-\u05ea]{1,3})\)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean out all lines that start with @00
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line):
            line = re.split(u'(@11\([\u05d0-\u05ea]{0,3}\))', line)
            if isinstance(line, basestring) and line != u'':
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    try:
        ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True,
                          grab_all=[False], group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'
    ja_to_xml(ja, ['siman', 'letter'], 'hagahot_letters_25.xml')  # , 'segments'
    # for hghds in
    return JaggedArray(ja_hagahot)
def hagahot_parse(ja_hagahot, hagahot_dict_lst):
    def num_haghot_in_siman(siman_dict):
        return len(siman_dict['smk']) + len(siman_dict['raph'])

    ja_hagahot = JaggedArray(ja_hagahot)
    ja_hagahot = ja_hagahot.flatten_to_array()
    hg_ja = []
    p_hg = 0
    for dict in hagahot_dict_lst:
        if re.search(u"^@[^1]", ja_hagahot[p_hg]):
            p_hg += 1
        p_hg_end = p_hg + num_haghot_in_siman(dict)
        hg_ja.append(ja_hagahot[p_hg:p_hg_end])
        p_hg = p_hg_end
    hg_ja.append(ja_hagahot[p_hg::])
    ja_to_xml(hg_ja, ['siman', 'letter'], 'haghot_by_smk_simanim.xml')
    return hg_ja
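# Hypothetical illustration (not from the source): hagahot_parse only relies on each entry
# of hagahot_dict_lst exposing 'smk' and 'raph' lists, whose combined length is the number
# of hagahot belonging to that siman. A minimal input could therefore look like this.
def _example_hagahot_parse():  # illustrative only
    flat_hagahot = [u'hagahah 1', u'hagahah 2', u'hagahah 3']
    per_siman_counts = [
        {'smk': [u'@55א'], 'raph': [u'@11א']},  # two hagahot in the first siman
        {'smk': [u'@55ב'], 'raph': []},         # one hagahah in the second siman
    ]
    return hagahot_parse(flat_hagahot, per_siman_counts)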
def test(book):
    qa_issues = open('Ibn Ezra on {} misalignments.txt'.format(book), 'w')
    levi = parse(file_data[book])
    vtitle = 'Devarim' if book == 'Deuteronomy' else book
    torat_emet = Ref("Ibn Ezra on {}".format(book)).text('he', 'Ibn Ezra on {} -- Torat Emet'.format(vtitle)).ja().array()
    count = 0
    for c_index, (my_chapter, their_chapter) in enumerate(zip(levi, torat_emet)):
        for v_index, (my_verse, their_verse) in enumerate(zip(my_chapter, their_chapter)):
            if len(my_verse) != len(their_verse):
                qa_issues.write('issue found at {}:{}\n'.format(c_index + 1, v_index + 1))
                count += 1
        if len(my_chapter) != len(their_chapter):
            by_length = sorted((my_chapter, their_chapter), key=lambda x: len(x))
            for i in range(len(by_length[0]), len(by_length[1])):
                qa_issues.write('issue found at {}:{}\n'.format(c_index + 1, i + 1))
                count += 1
    qa_issues.close()
    print '{} issues found'.format(count)
    ja_to_xml(levi, ['Chapter', 'Verse', 'Comment'])
def parse_Raph_by_letter(filename):
    '''parsing according to the letters; this is the main ja to post for the raph'''
    def cleaner(my_text):
        replace_dict = {u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'', u'@33': u''}
        # {u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@66(.*?)@67': ur'\1'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11(?P<gim>[\u05d0-\u05ea]{1,3})']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean out all lines that start with @00
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line) and not line.isspace():
            line = re.split(u'(@11[\u05d0-\u05ea]{0,3})', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    new_ja = regs_devide(cleaned, regs)
    try:
        # ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True, grab_all=[True, True], group_name='gim').array()
        ja = file_to_ja(2, cleaned, regs, cleaner, grab_all=False).array()
    except AttributeError:
        print 'there are more regs than levels...'
    # ja_to_xml(new_ja, ['Alef', 'letter', 'segments'], 'raph_letters.xml')
    ja_to_xml(ja, ['letter', 'segments'], 'raph_letters.xml')
    return ja
def clean(JA, replace_dict):
    '''
    :param JA: JA obj of the text to be cleaned
    :param replace_dict: a dictionary of what to replace
    :return: cleaned JA
    '''
    # replace_dict = {u'@23': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>',
    #                 u'@66(.*?)@67': ur'\1'}  # , u'@55[\u05d0-\u05ea]{1,3}': u'<i-tags = >'}
    lstlst = JA.array()
    new = []
    nd1 = []
    for d1 in lstlst:
        for d2 in d1:
            nd2 = multiple_replace(d2, replace_dict, using_regex=True)
            nd1.append(nd2)
        new.append(nd1)
        nd1 = []
    ja_to_xml(new, ['letter', 'segments'], 'clean_smk.xml')
    return JaggedArray(new)
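# Hypothetical usage sketch (names and values are illustrative, not from the source):
# clean() walks a depth-2 JaggedArray and runs multiple_replace on every segment, so a
# typical call strips or rewraps the remaining OCR tags after the structural parse.
def _example_clean_smk():  # illustrative only
    example_smk = JaggedArray([[u'@44 some siman text', u'another segment']])
    strip_tags = {u'@44': u''}
    return clean(example_smk, strip_tags)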
def threty_two_parse(lines, replace_dict, str):
    # start the parsing of the 32 netivot
    arr = []
    netiv = []
    first = True
    for line in lines:
        if re.search(u'@(13|03)', line):  # and (netiv):
            if first:
                first = False
            else:
                netiv = ' '.join(netiv)
                arr.append(netiv)
                netiv = []
        line = multiple_replace(line, replace_dict, using_regex=True)
        netiv.append(line.strip())
    netiv = ' '.join(netiv)
    arr.append(netiv)
    ja_to_xml(arr, ['netiv'], '{}{}'.format(str, '_32.xml'))
    return arr
# -*- coding: utf-8 -*-
import codecs

from sefaria.model import *
import regex
from sources import functions
from data_utilities import util
from sources.Rif_on_Nedarim import rif_nedarim_functions

"""
index record
parse text
text record
link
clean
"""

index = rif_nedarim_functions.create_index()
functions.post_index(index)

rif_nedarim = rif_nedarim_functions.parse()
ref = 'Rif_Nedarim'
text = rif_nedarim_functions.create_text(rif_nedarim)
functions.post_text(ref, text)

testing_file = codecs.open("testing_file.txt", 'w', 'utf-8')
util.jagged_array_to_file(testing_file, rif_nedarim, ['Daf', 'Line'])
testing_file.close()

util.ja_to_xml(rif_nedarim, ['Daf', 'Line'])
# text = rasag_commentaries_functions.create_text(positive_commandments)
# functions.post_text(ref, text)
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Negative Commandments'
# text = rasag_commentaries_functions.create_text(negative_commandments)
# functions.post_text(ref, text)
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Laws of the Courts, Introduction'
# text = rasag_commentaries_functions.create_text(punishments[0])
# functions.post_text(ref, text)
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Laws of the Courts'
# text = rasag_commentaries_functions.create_text(punishments[1])
# functions.post_text(ref, text)
#
util.ja_to_xml(communal[1], ['FIRST', 'SECOND', 'THIRD', 'FOURTH', 'FIFTH'])
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Communal Laws, Introduction'
# text = rasag_commentaries_functions.create_text(communal[0])
# functions.post_text(ref, text)
#
ref = 'Commentary on Sefer Hamitzvot of Rasag, Communal Laws'
text = rasag_commentaries_functions.create_text(communal[1])
functions.post_text(ref, text)
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Appendix, Introduction'
# text = rasag_commentaries_functions.create_text(miluim[0])
# functions.post_text(ref, text)
#
# ref = 'Commentary on Sefer Hamitzvot of Rasag, Appendix'
# text = rasag_commentaries_functions.create_text(miluim[1])
regexes = [chapter_regex, halacha_regex]
for the_file in [x for x in os.listdir(folder) if "xml" not in x]:
    name = the_file.replace(".txt", "").split("-")[2]
    sefer = the_file.split("-")[1]
    sefarim.add(sefer)
    if name in processed:  # Skip second version of Nashim
        continue
    file_path = os.path.join(folder, the_file)
    with codecs.open(file_path, "r", "utf-8") as infile:
        j = file_to_ja_g(3, infile, regexes, clean_segments, gimatria=True)
    processed[name] = {"cat": sefer, "text": j.array()}
    ja_to_xml(j.array(), ["Chapter", "Halacha", "Comment"], file_path.replace("txt", "xml"))

processed[u"הלכות שופר וסוכה ולולב"] = {
    "cat": processed[u"הלכות שופר"]["cat"],
    "text": processed[u"הלכות שופר"]["text"][:3]
            + processed[u"הלכות סוכה"]["text"][3:6]
            + processed[u"הלכות לולב"]["text"][6:]
}
del processed[u"הלכות שופר"]
del processed[u"הלכות סוכה"]
del processed[u"הלכות לולב"]

processed[u"הלכות מגילה וחנוכה"] = {
    "cat": processed[u"הלכות מגילה"]["cat"],
    "text": processed[u"הלכות מגילה"]["text"][:2] + processed[u"הלכות חנוכה"]["text"][2:]
}
del processed[u"הלכות מגילה"]
del processed[u"הלכות חנוכה"]
from sources import functions
from data_utilities import util
from sources.Ralbag_on_Ruth import ralbag_ruth_functions

"""
index record
parse text
text record
link
clean
"""

index = ralbag_ruth_functions.create_index()
functions.post_index(index)

ralbag_ruth_dict = ralbag_ruth_functions.parse()
for key in ralbag_ruth_dict:
    ref = 'Ralbag Ruth'
    if key == 'Benefits':
        ref += ',_Benefits'
    text = ralbag_ruth_functions.create_text(ralbag_ruth_dict[key])
    functions.post_text(ref, text)

list_of_links = ralbag_ruth_functions.create_links(ralbag_ruth_dict['Commentary'])
functions.post_link(list_of_links)

ralbag_ruth = [ralbag_ruth_dict['Commentary'], ralbag_ruth_dict['Benefits']]
util.ja_to_xml(ralbag_ruth, ['FIRST', 'SECOND', 'THIRD', 'FOURTH'])
from sources import functions
from data_utilities import util
from sources.Ralbag_on_Esther import ralbag_esther_functions

"""
index record
parse text
text record
link
clean
"""

index = ralbag_esther_functions.create_index()
functions.post_index(index)

ralbag_esther_dict = ralbag_esther_functions.parse()
for key in ralbag_esther_dict:
    ref = 'Ralbag Esther,_{}'.format(key)
    if key == 'Commentary':
        ref = 'Ralbag Esther'
    text = ralbag_esther_functions.create_text(ralbag_esther_dict[key])
    functions.post_text(ref, text)

list_of_links = ralbag_esther_functions.create_links(ralbag_esther_dict['Commentary'])
functions.post_link(list_of_links)

ralbag_esther = [ralbag_esther_dict['Introduction'], ralbag_esther_dict['Commentary'],
                 ralbag_esther_dict['Benefits']]
util.ja_to_xml(ralbag_esther, ['FIRST', 'SECOND', 'THIRD', 'FOURTH'])
# split a long line into a list of smaller lines using key words like "VeHene".
# note: if you use this splitting method you want to be sure not to use "".join()
def split_lines(line):
    # for line in lines:
    line = re.sub(u'\. (\u05d5?(\u05d4\u05e0\u05d4|\u05e2\u05d5\u05d3))', ur'. ~\1', line)
    line_list = re.split(ur'~', line)
    return line_list
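# Quick illustration (hypothetical input, not from the source): a period followed by
# והנה or ועוד is marked with '~' and the line is then split, so one long dibur becomes
# several paragraph-sized lines.
def _example_split_lines():  # illustrative only
    parts = split_lines(u'טקסט ראשון. והנה טקסט שני. ועוד טקסט שלישי')
    assert len(parts) == 3
    return parts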
            ja.set_element(indices, temp, [])
            temp = []
            if pasuk_dh:
                indices = [int(pasuk_dh.group(1)) - 1, int(pasuk_dh.group(2)) - 1, indices[2]]
                indices[2] = 0
            elif reg_dh:
                indices[2] += 1
        if not line.isspace() and not re.match(ur' *Parshat *(\S+) *(\S+)? *', line):
            # don't put names of a Parasha or empty lines into the array
            temp.append(line)
    ja_to_xml(ja.array(), ['perek', 'pasuk', 'comment'],
              '{}.xml'.format(re.match('(.*)\.', filename).group(1)))
    return ja


def parse_all_en():
    en_texts = {}
    pentateuch = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']
    for book in pentateuch:
        parsed_book = parse_en('en_tur_{}.txt'.format(book.lower()))
        en_texts[book] = parsed_book
    return en_texts


def tt_schema():
    record_root = SchemaNode()
    record_root.add_title('Tur HaAroch', 'en', True)
    if sefer == u"ספר קרבנות":
        sefer = u"ספר קורבנות"
    if sefer == u"ספר קנין":
        sefer = u"ספר קניין"
    sefarim.add(sefer)
    if name in processed:  # Skip second version of Nashim
        continue
    file_path = os.path.join(folder, the_file)
    with codecs.open(file_path, "r", "utf-8") as infile:
        j = file_to_ja_g(3, infile, regexes, clean_segments, gimatria=True)
    processed[name] = {"cat": sefer, "text": j.array()}
    ja_to_xml(j.array(), ["Chapter", "Halacha", "Comment"], file_path.replace("txt", "xml"))

processed[u"הלכות תפילה וברכת כהנים"] = {
    "cat": processed[u"הלכות תפלה"]["cat"],
    "text": processed[u"הלכות תפלה"]["text"][:13] + processed[u"הלכות נשיאת כפים"]["text"][13:]
}
del processed[u"הלכות תפלה"]
del processed[u"הלכות נשיאת כפים"]

processed[u"הלכות תפילין ומזוזה וספר תורה"] = {
    "cat": processed[u"הלכות תפילין"]["cat"],
    "text": processed[u"הלכות תפילין"]["text"][:4]
            + processed[u"הלכות מזוזה"]["text"][4:6]
            + processed[u"הלכות ספר תורה"]["text"][6:]
}
del processed[u"הלכות תפילין"]
def text_parse():
    # open, read, close the original txt file
    with codecs.open('yitzira_pri_yitzhak.txt', 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    # init section lists and flags
    parsed = []
    perek = []
    mishna = []
    dibur = []
    first_p = True  # first perek flag
    first_m = True  # first mishna flag
    first_d = True  # first dibur flag
    ofen = False    # 'ofen' flag
    # dictionary for line ocr tag fixing
    replace_dict = {
        u'@11': u'',  # unnecessary ocr tag
        u'@31': u'<b>', u'@32': u'</b>',  # bold dibur hamatchil
        u'@44': u'<b>', u'@45': u'</b>',  # was bold in text
        u'@98': u'<small>', u'@99': u'</small>',  # the slik at the end
        ur'\[\*(.*?)\]': ur'<small>[\1]</small>'  # footnotes
    }
    # loop on the lines and create the jagged array
    for line in lines[starting:]:
        if line.find(u'@00') != -1:
            # perek
            if first_p:
                first_p = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                parsed.append(perek)
                perek = []
            first_m = True  # since this is opening a new perek
        elif line.find(u'@22') == 0:
            # mishna
            # note: this parsing assumes there is no text on the same line as @22 and @00
            if first_m:
                first_m = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
            first_d = True  # since this is opening a new mishna
        else:
            # this line is going to be part of the dibur
            # Dibur Hamatchil
            if regex.search(u'@(03|31|98)', line):
                # probably start a new dibur
                if (not ofen) and (not first_d):
                    # prob close prev dibur
                    dibur = ' '.join(dibur)
                    mishna.append(dibur)
                    dibur = []
                else:
                    if ofen:
                        ofen = False
                    if first_d:
                        first_d = False
                if regex.search(u'@03', line):
                    ofen = True
            # segment ocr tag fixing
            line = multiple_replace(line, replace_dict, using_regex=True)
            dibur.append(line)
    # once the end is reached, close all that was opened
    dibur = ' '.join(dibur)
    mishna.append(dibur)
    perek.append(mishna)
    parsed.append(perek)
    ja_to_xml(parsed, ['perek', 'mishna', 'dibur'], filename='pri.xml')
    return parsed
    if siman_num == 5 or siman_num == 6:
        print "not real", siman_num
        continue  # 5 & 6 are simanim with no text but a page
    soup = soupAndOpen("./pages/%s" % (filename))
    if siman_num == 3 or siman_num == 4 or siman_num == 7:
        # siman numbers that did not conform to be able to parse
        print "outlier", siman_num
        outlierParse(soup, siman_num)
    else:
        print "regular", siman_num
        regularParse(soup, siman_num)

ja_to_xml(simanim_ja.array(), ["siman", "seif", "comment"])

links = []
for comment in traverse_ja(simanim_ja.array()):
    links.append({
        'refs': [
            'Shulchan_Arukh, Orach_Chayim.{}.{}'.format(comment['indices'][0] - 1,
                                                        comment['indices'][1] - 1),
            'Biur Halacha.{}.{}.{}'.format(*[i - 1 for i in comment['indices']])
        ],
        'type': 'commentary',
        'auto': True,
from sources import functions
from data_utilities import util
from sources.Ralbag_on_Shir_HaShirim import ralbag_shir_hashirim_functions

"""
index record
parse text
text record
link
clean
"""

index = ralbag_shir_hashirim_functions.create_index()
functions.post_index(index)

ralbag_shir_hashirim_dict = ralbag_shir_hashirim_functions.parse()
for key in ralbag_shir_hashirim_dict:
    ref = 'Ralbag Song of Songs'
    if key == 'Introduction':
        ref += ',_Introduction'
    text = ralbag_shir_hashirim_functions.create_text(ralbag_shir_hashirim_dict[key])
    functions.post_text(ref, text)

list_of_links = ralbag_shir_hashirim_functions.create_links(ralbag_shir_hashirim_dict['Commentary'])
functions.post_link(list_of_links)

ralbag_shir_hashirim = [ralbag_shir_hashirim_dict['Introduction'],
                        ralbag_shir_hashirim_dict['Commentary']]
util.ja_to_xml(ralbag_shir_hashirim, ['FIRST', 'SECOND', 'THIRD', 'FOURTH'])