def run_parser(text):
    """Parse tagged commentary text into nested chapter/verse lists.

    Markers handled here: ``@20`` plus the six characters after it are
    removed; ``@10`` introduces a chapter label, ``@11`` a verse label,
    ``@12`` a dibur hamatchil and ``@33`` a comment inside it.  Within a
    comment, the text before ``@00`` is wrapped in ``<b>`` tags.

    Chapter and verse labels are converted with
    ``hebrew.heb_string_to_int``; the resulting 1-based number selects the
    slot handed to ``expand_list_assign`` (presumably it pads the list up
    to that index -- confirm against the helper).

    Returns the chapter list.  Every verse is a list holding one
    concatenated string per dibur hamatchil.
    """
    parsed_text = []
    cur_chapter = 1
    cur_verse = 1
    label_noise = r"[\(,\)]"
    text = re.sub(u'@20.{6}', '', text)
    # The capture group keeps the labels: odd slots are labels, even slots
    # are the bodies that follow them.
    chapter_parts = re.split(u'@10([^@]*)', text)
    for chapter_num, chapter in zip(chapter_parts[1::2], chapter_parts[2::2]):
        if chapter_num.strip() == '':
            continue
        chapter_num = re.sub(label_noise, "", chapter_num)
        cur_chapter = hebrew.heb_string_to_int(chapter_num.strip())
        parsed_chapter = []
        expand_list_assign(parsed_text, cur_chapter - 1, parsed_chapter)
        verse_parts = re.split(u'@11([^@]*)', chapter)
        for pasuk_num, pasuk in zip(verse_parts[1::2], verse_parts[2::2]):
            if pasuk.strip() == '':
                continue
            parsed_verse = []
            pasuk_num = re.sub(label_noise, "", pasuk_num)
            cur_verse = hebrew.heb_string_to_int(pasuk_num.strip())
            expand_list_assign(parsed_chapter, cur_verse - 1, parsed_verse)
            # Text before the first "@12" is not part of any dibur hamatchil.
            for dibur_hamatchil in pasuk.split(u'@12')[1:]:
                pieces = []
                for comment in dibur_hamatchil.split('@33')[1:]:
                    if comment.strip() == '':
                        continue
                    key = re.split(u'@00', comment)
                    if len(key) > 1:
                        comment = '<b>' + key[0] + '</b>' + key[1]
                    pieces.append(comment)
                parsed_verse.append("".join(pieces))
    return parsed_text
def run_parser(text):
    """Parse a five-book commentary into one chapter/verse list per book.

    The text is split on ``@09`` chapter labels.  Whenever a chapter label
    converts to 1 the parser moves on to the next book, so the books must
    appear in order.  Inside a chapter ``@97`` introduces a verse label and
    ``@98`` a comment; within a comment the text before ``@87`` is wrapped
    in ``<b>`` tags.

    Labels are converted with ``hebrew.heb_string_to_int`` and placed with
    ``expand_list_assign`` (presumably padding the target list -- confirm).

    Returns a list of five lists, in the order of ``names`` below.
    """
    names = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']
    parashot = [[], [], [], [], []]
    parasha_num = 0
    cur_chapter = 1
    cur_verse = 1
    chapter_parts = re.split(u'@09([^@]*)', text)
    for chapter_num, chapter in zip(chapter_parts[1::2], chapter_parts[2::2]):
        if chapter_num.strip() == '':
            continue
        cur_chapter = hebrew.heb_string_to_int(chapter_num.strip())
        parsed_chapter = []
        if cur_chapter == 1:
            # Numbering restarted, so this chapter opens the next book.
            parasha_num += 1
            parasha_name = names[parasha_num - 1]
        book = parashot[parasha_num - 1]
        expand_list_assign(book, cur_chapter - 1, parsed_chapter)
        verse_parts = re.split(u'@97([^@]*)', chapter)
        for pasuk_num, pasuk in zip(verse_parts[1::2], verse_parts[2::2]):
            if pasuk.strip() == '':
                continue
            parsed_verse = []
            pasuk_num = re.sub(r"[\(,\)]", "", pasuk_num)
            cur_verse = hebrew.heb_string_to_int(pasuk_num.strip())
            expand_list_assign(parsed_chapter, cur_verse - 1, parsed_verse)
            for comment in pasuk.split('@98')[1:]:
                if comment.strip() == '':
                    continue
                key = re.split(u'@87', comment)
                if len(key) > 1:
                    comment = '<b>' + key[0] + '</b>' + key[1]
                parsed_verse.append(comment)
    return parashot
def run_parser(text):
    """Parse tagged commentary text into chapter/verse/comment lists.

    ``@20`` plus the six characters after it are removed first.  ``@10``
    introduces a chapter label, ``@11`` a verse label and ``@33`` a
    comment; within a comment the text before ``@00`` is wrapped in
    ``<b>`` tags.

    Labels are converted with ``hebrew.heb_string_to_int`` and placed with
    ``expand_list_assign`` (presumably padding the target list -- confirm).

    Returns the chapter list; every verse is a list of comment strings.
    """
    parsed_text = []
    cur_chapter = 1
    cur_verse = 1
    label_noise = r"[\(,\)]"
    text = re.sub(u"@20.{6}", "", text)
    # Odd slots of the split are the captured labels, even slots the bodies.
    chapter_parts = re.split(u"@10([^@]*)", text)
    for chapter_num, chapter in zip(chapter_parts[1::2], chapter_parts[2::2]):
        if chapter_num.strip() == "":
            continue
        chapter_num = re.sub(label_noise, "", chapter_num)
        cur_chapter = hebrew.heb_string_to_int(chapter_num.strip())
        parsed_chapter = []
        expand_list_assign(parsed_text, cur_chapter - 1, parsed_chapter)
        verse_parts = re.split(u"@11([^@]*)", chapter)
        for pasuk_num, pasuk in zip(verse_parts[1::2], verse_parts[2::2]):
            if pasuk.strip() == "":
                continue
            parsed_verse = []
            pasuk_num = re.sub(label_noise, "", pasuk_num)
            cur_verse = hebrew.heb_string_to_int(pasuk_num.strip())
            expand_list_assign(parsed_chapter, cur_verse - 1, parsed_verse)
            for comment in pasuk.split("@33")[1:]:
                if comment.strip() == "":
                    continue
                key = re.split(u"@00", comment)
                if len(key) > 1:
                    comment = "<b>" + key[0] + "</b>" + key[1]
                parsed_verse.append(comment)
    return parsed_text
def run_parser():
    """Parse ``source/Radak_on_Genesis.txt`` and pass the result on.

    The file is read and decoded as UTF-8, then normalised: each
    ``@11...@33`` span becomes ``@55...``, each ``@00`` tag is removed
    together with the text after it up to a newline, and the tagged
    ``(שם)`` reference loses its tags.  The text is then split on ``@22``
    chapter labels and ``@44(<one or two letters>)`` verse labels, and
    every non-blank ``@55`` segment is stored as a comment.

    The nested list is given to ``pretty_print`` and ``save_parsed_text``;
    nothing is returned.
    """
    print("running parser")
    parsed_text = []
    cur_chapter = 1
    cur_verse = 1
    with open("source/Radak_on_Genesis.txt", 'r') as filep:
        raw = filep.read()
    ucd_text = unicode(raw, 'utf-8').strip()

    # Get rid of some unhelpful markup.
    ucd_text = re.sub(u'@11(.*?)@33', u'@55\\1', ucd_text)
    ucd_text = re.sub(u'@00([^@]*)\\n', '', ucd_text)
    ucd_text = ucd_text.replace(u'@44(שם)@55', u'(שם)')

    # Split by chapter; the capture group keeps the chapter letters.
    chapter_parts = re.split(u'@22([^@]*)', ucd_text)
    for chapter_num, chapter in zip(chapter_parts[1::2], chapter_parts[2::2]):
        if chapter_num.strip() == '':
            continue
        cur_chapter = hebrew.heb_string_to_int(chapter_num.strip())
        parsed_chapter = []
        expand_list_assign(parsed_text, cur_chapter - 1, parsed_chapter)

        # Split on verse numbers, capturing the numbers as well.
        verse_parts = re.split(u'@44\\(([\u0590-\u05ea]{1,2})\\)', chapter)
        for verse_num, verse in zip(verse_parts[1::2], verse_parts[2::2]):
            if verse_num.strip() == '':
                continue
            parsed_verse = []
            cur_verse = hebrew.heb_string_to_int(verse_num.strip())
            expand_list_assign(parsed_chapter, cur_verse - 1, parsed_verse)
            for comment in verse.split('@55')[1:]:
                if comment.strip() != '':
                    parsed_verse.append(comment)

    pretty_print(parsed_text)
    save_parsed_text(parsed_text)
def run_parser(text):
    """Parse a five-book commentary into one chapter/verse list per book.

    The text is split on ``@09`` chapter labels.  Whenever a chapter label
    converts to 1 the parser moves on to the next book, so the books must
    appear in order.  Inside a chapter ``@97`` introduces a verse label and
    ``@98`` a comment; within a comment the text before ``@87`` is wrapped
    in ``<b>`` tags.

    Labels are converted with ``hebrew.heb_string_to_int`` and placed with
    ``expand_list_assign`` (presumably padding the target list -- confirm).

    Returns a list of five lists, in the order of ``names`` below.
    """
    names = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']
    parashot = [[], [], [], [], []]
    parasha_num = 0
    cur_chapter = 1
    cur_verse = 1
    chapter_parts = re.split(u'@09([^@]*)', text)
    for chapter_num, chapter in zip(chapter_parts[1::2], chapter_parts[2::2]):
        if chapter_num.strip() == '':
            continue
        cur_chapter = hebrew.heb_string_to_int(chapter_num.strip())
        parsed_chapter = []
        if cur_chapter == 1:
            # Numbering restarted, so this chapter opens the next book.
            parasha_num += 1
            parasha_name = names[parasha_num - 1]
        book = parashot[parasha_num - 1]
        expand_list_assign(book, cur_chapter - 1, parsed_chapter)
        verse_parts = re.split(u'@97([^@]*)', chapter)
        for pasuk_num, pasuk in zip(verse_parts[1::2], verse_parts[2::2]):
            if pasuk.strip() == '':
                continue
            parsed_verse = []
            pasuk_num = re.sub(r"[\(,\)]", "", pasuk_num)
            cur_verse = hebrew.heb_string_to_int(pasuk_num.strip())
            expand_list_assign(parsed_chapter, cur_verse - 1, parsed_verse)
            for comment in pasuk.split('@98')[1:]:
                if comment.strip() == '':
                    continue
                key = re.split(u'@87', comment)
                if len(key) > 1:
                    comment = '<b>' + key[0] + '</b>' + key[1]
                parsed_verse.append(comment)
    return parashot
def search2(parsed, part):
    """Link ``@44[...]@55`` page references in a two-level text to the Talmud.

    For every reference tag in ``parsed[k][i]`` the page label and side
    marker (``.`` for side a, ``:`` for side b) are read when the tag
    starts with the tractate name or with the page word.  Tags in any
    other form leave ``daf_num`` and ``amud`` as they were after the
    previous reference.  The quoted text up to the next full stop is given
    to ``matchobj``; when it reports a positive line number, a link built
    by ``makeLink`` is appended to the module-level ``links`` list.

    Any exception raised while matching is printed and the scan goes on.
    Nothing is returned.
    """
    tag_pattern = u'@44[\\[\\(](.*?)[\\]\\)]@55(.*?)\\.'
    for k, seif in enumerate(parsed):
        for i, pasuk in enumerate(seif):
            for find in re.finditer(tag_pattern, pasuk):
                daf = find.group(1)
                words = daf.strip().split(' ')
                if words[0] == u"מנחות" and len(words) < 6:
                    if len(words) == 3:
                        daf = words[2]
                    elif len(words) == 2:
                        daf = words[1]
                    has_page = True
                elif words[0] == u"דף":
                    daf = words[1]
                    has_page = True
                else:
                    # Other forms keep whatever page was parsed last.
                    has_page = False
                if has_page:
                    side_marker = daf[-1]
                    if side_marker == ".":
                        amud = "a"
                    elif side_marker == ":":
                        amud = "b"
                    daf_num = hebrew.heb_string_to_int(daf[0:-1])
                text = find.group(2)
                try:
                    print("%s %s %s %s" % (str(k + 1), str(i + 1), daf_num, amud))
                    hit = matchobj(daf_num, amud, text)
                    line = hit[1][0]
                    if line > 0:
                        talmud = ("{}".format(masechet) + "." + str(daf_num)
                                  + amud + "." + str(line))
                        roash = ("Rosh on {}".format(masechet) + ", " + part
                                 + "." + str(k + 1) + "." + str(1))
                        links.append(makeLink(talmud, roash))
                except Exception as e:
                    print(e)
def parse(text):
    """Group ``@77``-separated entries into per-siman lists.

    Every ``@11<head>@33<body>`` match inside an entry is rendered as
    ``(<n>)<b>head</b>body``, where ``n`` counts matches since the last
    siman header; only the last rendering of an entry is kept.  An entry
    without ``@22`` is added to the current siman.  An entry with ``@22``
    closes the current siman, pads the result with empty lists for any
    skipped siman numbers, and starts a new siman with its own rendering.

    Returns the list of simanim without the first collected list.
    """
    old_num = 0
    dibbur = ""
    bayit_chadash = []
    perek = []
    i = 1
    for siman in re.split("@77", text):
        for s in re.finditer("@11(.*)@33(.*)", siman):
            dibbur = "(" + str(i) + ")" + "<b>" + s.group(1) + '</b>' + s.group(2)
            print(i)
            i += 1
        if "@22" not in siman:
            perek.append(dibbur)
            continue
        num = re.findall("@22(.*)", siman)[0]
        new_num = hebrew.heb_string_to_int(num.strip())
        # One empty placeholder for every siman number that was skipped.
        gap = new_num - old_num
        if gap != 1:
            bayit_chadash.extend([] for _ in range(1, gap))
        old_num = new_num
        bayit_chadash.append(perek)
        perek = [dibbur]
        i = 1
    bayit_chadash.append(perek)
    return bayit_chadash[1:]
def search(text, shas):
    """Match every siman of a two-level text against the Talmud pages.

    ``text[i][j]`` is scanned for ``@44<reference>@55|@11<quote>`` tags.
    The reference is split on spaces: its second token is the page label
    (converted with ``hebrew.heb_string_to_int``) and the third character
    of its third token selects the side.  The page index is
    ``(daf - 2) * 2`` for side a and one more for side b.  Quotes of five
    characters or more are passed, split into words, to ``matching``; a
    shorter quote is reported and ends the scan of that siman.

    A siman that does not carry a tag in its first ten characters is also
    matched, by its opening words, against the page found most recently.
    That lookup is skipped until a first reference has been parsed; the
    original raised a NameError in that situation.

    Nothing is returned; results are whatever ``matching`` records.
    """
    # Most recent reference; stays None until the first "@44" tag is parsed.
    index = daf = amud = None
    for i, seif in enumerate(text):
        for j, siman in enumerate(seif):
            if siman.endswith(u'5 '):
                print("yes")
            linked = re.finditer(u'@44(.*?)@(?:55|11)(.*?)(?=(@44|$))', siman)
            if '@44' not in siman[0:10] and len(siman) > 8:
                start = re.sub(r'([\[\*\]]|@..|#)', "", siman)
                start_of_siman = re.split(" ", start)
                if index is not None:
                    matching(start_of_siman, shas, i, j, index, daf, amud)
            for match in linked:
                lookfor = match.group(2)
                tagged = re.split(" ", lookfor.strip())
                daf_amud = re.split(u' ', match.group(1).strip())
                daf = hebrew.heb_string_to_int(daf_amud[1])
                amud = daf_amud[2]
                index = ((daf - 2) * 2) + 1
                if amud[2].strip() == u'א':
                    amud = 'a'
                    index = index - 1
                else:
                    amud = 'b'
                if len(lookfor) < 5:
                    print("short %s %s" % (daf, amud))
                    break
                matching(tagged, shas, i, j, index, daf, amud)
def search2(parsed):
    """Link ``@44[דף ...]`` page references in a three-level text.

    For each tag in ``parsed[i][j][k]`` the bracket content is split on
    spaces: the first token is the page label and the second the side
    (``ע"א`` -> ``a``, ``ע"ב`` -> ``b``).  The text between the tag and the
    next full stop is passed to ``matchobj``; when it reports a positive
    line number, a link built by ``link`` is appended to the module-level
    ``links`` list.

    Tags with a missing or unrecognised side token are skipped.  The
    original either crashed on them or silently reused the side of the
    previous reference.  A ``KeyError`` raised during conversion or
    matching also skips the tag.

    Nothing is returned.
    """
    sides = {u'ע"א': 'a', u'ע"ב': 'b'}
    for i, perek in enumerate(parsed):
        for j, pasuk in enumerate(perek):
            for k, seif in enumerate(pasuk):
                for find in re.finditer(u'@44\\[דף(.*?)\\](.*?)\\.', seif):
                    ref_parts = find.group(1).strip().split(" ")
                    text = find.group(2)
                    if len(ref_parts) < 2 or ref_parts[1] not in sides:
                        # Malformed reference: no usable page side.
                        continue
                    amud = sides[ref_parts[1]]
                    new_daf = ref_parts[0]
                    try:
                        daf_num = hebrew.heb_string_to_int(new_daf)
                        matched = matchobj(daf_num, amud, text)
                        line = matched[1][0]
                        if line > 0:
                            talmud = ("{}".format(masechet) + "." + str(daf_num)
                                      + amud + "." + str(line))
                            roash = ("Rosh on {}".format(masechet) + "."
                                     + str(i + 1) + "." + str(j + 1) + "."
                                     + str(k + 1))
                            links.append(link(talmud, roash))
                    except KeyError:
                        # Best effort, as in the original: skip this tag.
                        pass
def parse(text):
    """Split a commentary into simanim and number their bold headings.

    A first pass walks the ``@22<label>@11`` siman headers, converts each
    label with ``hebrew.heb_string_to_int`` and prints every label that
    does not follow its predecessor by exactly one, then the last number
    seen.  Nothing is printed for the last number when the text has no
    header at all; the original failed on an unbound name there.

    The second pass splits the text on ``@22`` and each siman on ``@66``.
    Every ``@11<head>@33<body>`` match becomes ``[<n>]<b>head</b>body``,
    with ``n`` counting the matches inside the siman.

    Returns the list of simanim.  When the first piece holds fewer than
    two entries, the first and the last pieces are dropped.
    """
    old_num = 0
    new_num = None
    for header in re.finditer(u'@22\\n*(.*)\\n*@11(\\n?.*)', text):
        new_num = hebrew.heb_string_to_int(header.group(1).strip())
        if new_num - old_num != 1:
            print(header.group(1))
            print(new_num)
        old_num = new_num
    if new_num is not None:
        print(new_num)

    bet_yosef = []
    for siman in re.split("@22", text):
        seif = []
        i = 1
        # "segment" rather than "text": do not shadow the parameter.
        for segment in re.split("@66", siman):
            for bold in re.finditer(u'@11(.*)@33(.*)', segment):
                seif.append("[" + str(i) + "]" + "<b>" + bold.group(1)
                            + '</b>' + bold.group(2))
                i += 1
        bet_yosef.append(seif)
    print("length bet yosef %s" % len(bet_yosef))
    if len(bet_yosef[0]) < 2:
        return bet_yosef[1:len(bet_yosef) - 1]
    return bet_yosef
def parse(text):
    """Split a commentary into simanim and render their bold headings.

    A first pass walks the ``@22<label>@11`` siman headers.  The second
    space-separated word of each label, without apostrophes, is converted
    with ``hebrew.heb_string_to_int``; a number that does not follow its
    predecessor by exactly one is printed.

    The second pass splits the text on ``@22`` and each siman on markers
    of the form ``(x)]`` or ``(xx)]``.  The first piece of every siman is
    printed, and every ``@11<head>@33<body>`` match is stored as
    ``<b>head</b>body``.

    Returns the list of simanim.  When the first piece holds fewer than
    two entries, the first and the last pieces are dropped.
    """
    old_num = 0
    for header in re.finditer(u'@22\\n*(.*)\\n*@11(\\n?.*)', text):
        si = header.group(1)
        num = re.split(" ", si.strip())[1]
        num = re.sub("'", "", num)
        new_num = hebrew.heb_string_to_int(num.strip())
        if new_num - old_num != 1:
            print(new_num)
        old_num = new_num

    prisha = []
    for siman in re.split("@22", text):
        dh = re.split(u"\\(\\S\\S?\\)]", siman)
        print(dh[0])
        seif = []
        for piece in dh:
            for bold in re.finditer(u'@11(.*)@33(.*)', piece):
                seif.append("<b>" + bold.group(1) + '</b>' + bold.group(2))
        prisha.append(seif)
    print("length prisha %s" % len(prisha))
    if len(prisha[0]) < 2:
        return prisha[1:len(prisha) - 1]
    return prisha
def search(text, shas):
    """Match every siman of a two-level text against the Talmud pages.

    ``text[i][j]`` is scanned for ``@44<reference>@55|@11<quote>`` tags.
    The reference is split on spaces: its second token is the page label
    (converted with ``hebrew.heb_string_to_int``) and the third character
    of its third token selects the side.  The page index is
    ``(daf - 2) * 2`` for side a and one more for side b.  Quotes of five
    characters or more are passed, split into words, to ``matching``; a
    shorter quote is reported and ends the scan of that siman.

    A siman that does not carry a tag in its first ten characters is also
    matched, by its opening words, against the page found most recently.
    That lookup is skipped until a first reference has been parsed; the
    original raised a NameError in that situation.

    Nothing is returned; results are whatever ``matching`` records.
    """
    # Most recent reference; stays None until the first "@44" tag is parsed.
    index = daf = amud = None
    for i, seif in enumerate(text):
        for j, siman in enumerate(seif):
            if siman.endswith(u'5 '):
                print("yes")
            linked = re.finditer(u'@44(.*?)@(?:55|11)(.*?)(?=(@44|$))', siman)
            if '@44' not in siman[0:10] and len(siman) > 8:
                start = re.sub(r'([\[\*\]]|@..|#)', "", siman)
                start_of_siman = re.split(" ", start)
                if index is not None:
                    matching(start_of_siman, shas, i, j, index, daf, amud)
            for match in linked:
                lookfor = match.group(2)
                tagged = re.split(" ", lookfor.strip())
                daf_amud = re.split(u' ', match.group(1).strip())
                daf = hebrew.heb_string_to_int(daf_amud[1])
                amud = daf_amud[2]
                index = ((daf - 2) * 2) + 1
                if amud[2].strip() == u'א':
                    amud = 'a'
                    index = index - 1
                else:
                    amud = 'b'
                if len(lookfor) < 5:
                    print("short %s %s" % (daf, amud))
                    break
                matching(tagged, shas, i, j, index, daf, amud)
def search1(text, shas):
    # Three-level variant of the siman matcher: text[k][i][j] is scanned for
    # "@44<reference>@55|@11<quote>" tags and each quote is handed to
    # matching1 together with the page index derived from the reference.
    # Returns nothing.
    # NOTE(review): the source was collapsed onto one line; the nesting below
    # is a reconstruction and should be checked against the original file.
    for k, perek in enumerate(text):
        for i, seif in enumerate(perek):
            for j, siman in enumerate(seif):
                if siman.endswith(ur'5 '):
                    print "yes"
                linked = re.finditer(ur'@44(.*?)@(?:55|11)(.*?)(?=(@44|$))', siman)
                # A siman with no tag near its start is matched by its opening
                # words against the page left over from the previous reference.
                if '@44' not in siman[0:10] and len(siman) > 8:
                    start = re.sub('([\[\*\]]|@..|#)', "", siman)
                    start_of_siman = re.split(" ", start)
                    # 'index' only exists once a first reference was parsed.
                    if 'index' in locals():
                        if index >= len(shas):
                            break
                        # print "line number: 203", daf, amud
                        matching1(start_of_siman, shas, i, j, k, index, daf, amud)
                for match in linked:
                    lookfor = match.group(2)
                    # print "lookfor is", lookfor
                    tagged = re.split(" ", lookfor.strip())
                    # A quote of at most one word stops the scan of this siman.
                    if len(tagged) <= 1:
                        break
                    daf_amud = re.split(ur' ', match.group(1).strip())
                    #print daf_amud[0]
                    if len(daf_amud) >= 2:
                        # Keep only Hebrew letters of the page label.
                        daf = re.sub(ur'[^א-ת]', "", daf_amud[1])
                    else:
                        # print "no daf "
                        break
                    # print daf
                    daf = hebrew.heb_string_to_int(daf)
                    if daf > len(shas) or len(daf_amud) < 3:
                        break
                    else:
                        # print len(daf_amud)
                        # print daf_amud
                        amud = daf_amud[2]
                        # Side b index; one is subtracted below for side a.
                        index = ((daf - 2) * 2) + 1
                        #if index > len(shas):
                        #    break
                        if len(amud) < 3:
                            #print "short amud"
                            break
                        else:
                            #print amud
                            # The third character of the side token picks the side.
                            if amud[2].strip() == ur'א':
                                amud = 'a'
                                index = index - 1
                            else:
                                amud = 'b'
                            #print daf, amud
                        if index >= len(shas):
                            #print "short", daf, amud, lookfor
                            #return
                            pass
                        else:
                            #print "else"
                            # print "tagged is", tagged
                            matching1(tagged, shas, i, j, k, index, daf, amud)
def read_rashi():
    """Match the opening words of every commentary line to its Talmud page.

    Each file in the ``commentator`` directory whose name contains
    ``masechet_he`` is processed.  The file name is split on ``_``; the
    page label and the first letter of the side token are taken from
    positions shifted by one when ``masechet_he`` itself contains an
    underscore.  The page index comes from ``convert_daf_to_index``.

    Every line containing a hyphen or an en dash is split there, and the
    part before it is passed, decoded from UTF-8, to ``match`` together
    with the page text ``shas[index]``.

    A file whose side letter is neither alef nor bet is reported and
    skipped.  The original went on with an unset or stale side.

    Nothing is returned.
    """
    for f in os.listdir(u'%s' % commentator):
        if masechet_he not in f:
            continue
        pf = os.path.join(u'%s' % commentator, f)
        print(pf)
        split = re.split("_", f.strip())
        # An underscore inside the tractate name shifts the tokens by one.
        offset = 2 if "_" in masechet_he else 1
        daf_he = split[offset]
        amud_he = split[offset + 1][0]
        daf = hebrew.heb_string_to_int(daf_he)
        if amud_he == u'א':
            amud = "a"
        elif amud_he == u'ב':
            amud = "b"
        else:
            print("we have a problam")
            continue
        index = convert_daf_to_index(daf, amud)
        print(index)
        print(daf)
        print(amud)
        with open(pf, 'r') as filep:
            file_text = filep.read()
        for liner in re.split("\n", file_text):
            if "-" in liner or "–" in liner:
                dh = re.split("(?:-|–)", liner)[0]
                match(dh.decode('utf-8'), shas[index], index,
                      liner.decode('utf-8'))
def parse(text):
    """Split a commentary into simanim and number their bold headings.

    A first pass walks the ``@22<label>@11`` siman headers, converts each
    label with ``hebrew.heb_string_to_int`` and prints every label that
    does not follow its predecessor by exactly one, then the last number
    seen.  Nothing is printed for the last number when the text has no
    header at all; the original failed on an unbound name there.

    The second pass splits the text on ``@22`` and each siman on ``@66``.
    Every ``@11<head>@33<body>`` match becomes ``[<n>]<b>head</b>body``,
    with ``n`` counting the matches inside the siman.

    Returns the list of simanim.  When the first piece holds fewer than
    two entries, the first and the last pieces are dropped.
    """
    old_num = 0
    new_num = None
    for header in re.finditer(u'@22\\n*(.*)\\n*@11(\\n?.*)', text):
        new_num = hebrew.heb_string_to_int(header.group(1).strip())
        if new_num - old_num != 1:
            print(header.group(1))
            print(new_num)
        old_num = new_num
    if new_num is not None:
        print(new_num)

    bet_yosef = []
    for siman in re.split("@22", text):
        seif = []
        i = 1
        # "segment" rather than "text": do not shadow the parameter.
        for segment in re.split("@66", siman):
            for bold in re.finditer(u'@11(.*)@33(.*)', segment):
                seif.append("[" + str(i) + "]" + "<b>" + bold.group(1)
                            + '</b>' + bold.group(2))
                i += 1
        bet_yosef.append(seif)
    print("length bet yosef %s" % len(bet_yosef))
    if len(bet_yosef[0]) < 2:
        return bet_yosef[1:len(bet_yosef) - 1]
    return bet_yosef
def search1(parsed, part):
    """Link ``@44[דף ...]`` page references in a two-level text.

    For each tag in ``parsed[i][k]`` the bracket content is split on
    spaces: the first token is the page label and the second the side
    (``ע"א`` -> ``a``, ``ע"ב`` -> ``b``).  The text between the tag and the
    next full stop is passed to ``matchobj``; when it reports a positive
    line number, a link built by ``link`` is appended to the module-level
    ``links`` list, with ``part`` naming the section of the commentary.

    Tags with a missing or unrecognised side token are skipped.  The
    original either crashed on them or silently reused the side of the
    previous reference.  A ``KeyError`` raised during conversion or
    matching also skips the tag.

    Nothing is returned.
    """
    sides = {u'ע"א': 'a', u'ע"ב': 'b'}
    for i, perek in enumerate(parsed):
        for k, seif in enumerate(perek):
            for find in re.finditer(u'@44\\[דף(.*?)\\](.*?)\\.', seif):
                ref_parts = find.group(1).strip().split(" ")
                text = find.group(2)
                if len(ref_parts) < 2 or ref_parts[1] not in sides:
                    # Malformed reference: no usable page side.
                    continue
                amud = sides[ref_parts[1]]
                new_daf = ref_parts[0]
                try:
                    daf_num = hebrew.heb_string_to_int(new_daf)
                    matched = matchobj(daf_num, amud, text)
                    line = matched[1][0]
                    if line > 0:
                        talmud = ("{}".format(masechet) + "." + str(daf_num)
                                  + amud + "." + str(line))
                        roash = ("Rosh on {}".format(masechet) + ", " + part
                                 + "." + str(i + 1) + "." + str(k + 1))
                        links.append(link(talmud, roash))
                except KeyError:
                    # Best effort, as in the original: skip this tag.
                    pass
def search(parsed):
    """Link ``(דף ...)`` page references to the Talmud page they name.

    Every parenthesised reference in ``parsed[i][j]`` must end with ``.``
    (side a) or ``:`` (side b); the text before that marker is the page
    label, converted with ``hebrew.heb_string_to_int``.  A link from the
    Talmud page to the commentary position is built with ``link`` and
    appended to the module-level ``links`` list.

    References without a trailing side marker are skipped.  The original
    crashed on them or reused the side of the previous reference, and it
    only removed the marker correctly when the label had exactly one
    leading space.  A ``KeyError`` from the conversion skips the reference.

    Nothing is returned.
    """
    for i, perek in enumerate(parsed):
        for j, pasuk in enumerate(perek):
            for find in re.finditer(u'\\(דף(.*?)\\)', pasuk):
                ref = find.group(1).strip()
                marker = ref[-1:]
                if marker == '.':
                    amud = 'a'
                elif marker == ':':
                    amud = 'b'
                else:
                    continue
                new_daf = ref[:-1].strip()
                try:
                    daf_num = hebrew.heb_string_to_int(new_daf)
                    # NOTE(review): the first index is emitted zero-based
                    # (str(i)) while the second is one-based; kept as found,
                    # confirm this is intended.
                    links.append(
                        link(
                            "{}".format(masechet) + "." + str(daf_num) + amud,
                            "Rosh on {}, Hilchot Seder Avodat Yom HaKippurim".
                            format(masechet) + "." + str(i) + "." + str(j + 1)))
                except KeyError:
                    # Best effort, as in the original: skip this reference.
                    pass
def read_rashi():
    """Match the opening words of every commentary line to its Talmud page.

    Each file in the ``commentator`` directory whose name contains
    ``masechet_he`` is processed.  The file name is split on ``_``; the
    page label and the first letter of the side token are taken from
    positions shifted by one when ``masechet_he`` itself contains an
    underscore.  The page index comes from ``convert_daf_to_index``.

    For every line containing a hyphen or an en dash, the opening words
    are the part before the dash for Rashi and the part before the first
    full stop for Tosafot.  They are passed, decoded from UTF-8, to
    ``match`` together with the page text ``shas[index]``.

    Fixes over the original: a file whose side letter is neither alef nor
    bet is reported and skipped instead of reusing an unset or stale side;
    the Rashi test is ``"Rashi" in commentator`` (it was reversed); and
    lines are skipped when the commentator is neither of the two.

    Nothing is returned.
    """
    for f in os.listdir(u'%s' % commentator):
        if masechet_he not in f:
            continue
        pf = os.path.join(u'%s' % commentator, f)
        print(pf)
        split = re.split("_", f.strip())
        # An underscore inside the tractate name shifts the tokens by one.
        offset = 2 if "_" in masechet_he else 1
        daf_he = split[offset]
        amud_he = split[offset + 1][0]
        daf = hebrew.heb_string_to_int(daf_he)
        if amud_he == u'א':
            amud = "a"
        elif amud_he == u'ב':
            amud = "b"
        else:
            print("we have a problam")
            continue
        index = convert_daf_to_index(daf, amud)
        print(index)
        print(daf)
        print(amud)
        with open(pf, 'r') as filep:
            file_text = filep.read()
        for liner in re.split("\n", file_text):
            if not ("-" in liner or "–" in liner):
                continue
            if "Rashi" in commentator:
                dh = re.split("(?:-|–)", liner)[0]
            elif "Tosafot" in commentator:
                dh = re.split(u"\\.", liner)[0]
            else:
                # Unknown commentator: no rule for finding the opening words.
                continue
            match(dh.decode('utf-8'), shas[index], index,
                  liner.decode('utf-8'))
def search1(text, shas):
    # Three-level variant of the siman matcher: text[k][i][j] is scanned for
    # "@44<reference>@55|@11<quote>" tags and each quote is handed to
    # matching1 together with the page index derived from the reference.
    # Returns nothing.
    # NOTE(review): the source was collapsed onto one line; the nesting below
    # is a reconstruction and should be checked against the original file.
    for k, perek in enumerate(text):
        for i, seif in enumerate(perek):
            for j, siman in enumerate(seif):
                if siman.endswith(ur'5 '):
                    print "yes"
                linked = re.finditer(ur'@44(.*?)@(?:55|11)(.*?)(?=(@44|$))', siman)
                # A siman with no tag near its start is matched by its opening
                # words against the page left over from the previous reference.
                if '@44' not in siman[0:10] and len(siman) > 8:
                    start = re.sub('([\[\*\]]|@..|#)',"",siman)
                    start_of_siman = re.split(" ", start)
                    # 'index' only exists once a first reference was parsed.
                    if 'index' in locals():
                        if index >= len(shas):
                            break
                        # print "line number: 203", daf, amud
                        matching1(start_of_siman, shas, i, j, k, index, daf, amud)
                for match in linked:
                    lookfor = match.group(2)
                    # print "lookfor is", lookfor
                    tagged = re.split(" ", lookfor.strip())
                    # A quote of at most one word stops the scan of this siman.
                    if len(tagged)<=1:
                        break
                    daf_amud = re.split(ur' ', match.group(1).strip())
                    #print daf_amud[0]
                    if len(daf_amud) >=2:
                        # Keep only Hebrew letters of the page label.
                        daf = re.sub(ur'[^א-ת]',"",daf_amud[1])
                    else:
                        # print "no daf "
                        break
                    # print daf
                    daf = hebrew.heb_string_to_int(daf)
                    if daf > len(shas) or len(daf_amud) < 3:
                        break
                    else:
                        # print len(daf_amud)
                        # print daf_amud
                        amud = daf_amud[2]
                        # Side b index; one is subtracted below for side a.
                        index = ((daf-2)*2)+1
                        #if index > len(shas):
                        #    break
                        if len(amud) < 3:
                            #print "short amud"
                            break
                        else:
                            #print amud
                            # The third character of the side token picks the side.
                            if amud[2].strip() == ur'א':
                                amud = 'a'
                                index = index - 1
                            else:
                                amud = 'b'
                            #print daf, amud
                        if index >= len(shas):
                            #print "short", daf, amud, lookfor
                            #return
                            pass
                        else:
                            #print "else"
                            # print "tagged is", tagged
                            matching1(tagged, shas, i, j, k, index, daf, amud)
def parse(text):
    # Parse the Tur text into a list of one-element lists (one per siman) and
    # record commentary links in the module-level `links` list on the way.
    # NOTE(review): the source was collapsed onto one line; the nesting below
    # is a reconstruction and should be checked against the original file.
    older_siman = 0
    arbaturim=[]
    tur = []
    hilchos = re.split(ur'@00', text) #split to names of parts
    for halacha in hilchos:
        if len(halacha) >0:
            halacha_name = halacha.splitlines()[0]
            #print halacha_name
            #get the name of the part
            # group(1): optional numeric tags before "@22"; group(2): the siman
            # label; group(3): the siman text after "@11".
            simanim = re.finditer(ur'(@?[0-9]?[0-9]?@?[0-9]?[0-9]?)@22(.*)@11(.*)',halacha) #cut the text to simanim, get letter of siman and tags to commentary
            i = 1
            for simans in simanim:
                localbet_yosef = 0
                siman = simans.group(2)
                # Drop bracketed or parenthesised notes, then keep Hebrew
                # letters only, before converting the label to a number.
                siman = re.sub(ur'[\(\[].*?[\)\]]',"", siman)
                siman = re.sub(ur'[^\u05d0-\u05ea]',"", siman)
                if len(siman)> 4:
                    # print simans.group(2)
                    #print simans.group(3)
                    pass
                roman_siman = hebrew.heb_string_to_int(siman.strip())
                # Text before a single "@33" is the bold opening of the siman.
                bold = re.split(ur'@33',simans.group(3))
                if len(bold) ==2:
                    text = simans.group(1) +"<b>" + bold[0] + "</b>" + bold[1]
                else:
                    text =simans.group(1) + simans.group(3)
                #text1 = re.split(u"(.*?[.:])", text)
                #text1 = filter(None, text1)
                #taking care of links
                try:
                    # karo is defined outside this function; indexing it with
                    # len(tur) raises the IndexError handled below once the
                    # two texts run out of step.
                    for k in range(0,len(karo[len(tur)])):
                        #print len(tur)+1,k
                        #for k in range(1,len(re.findall("@66",simans.group(0)))):
                        #print simans.group(0)
                        if "@66" in simans.group(1):
                            links.append(addlink(len(tur)+1,1, k+1 ))
                        localbet_yosef += len(re.findall("@66",simans.group(2)))
                        if "@66" in simans.group(2):
                            links.append(addlink(len(tur)+1,1, k+1 ))
                        # NOTE(review): `text` is a string here, so this walks
                        # it one character at a time and the inner range is
                        # always empty -- confirm what was intended.
                        for sif_num,sifs in enumerate(text, start =1):
                            for a in range(1,len(re.findall("@66", sifs))):
                                links.append(addlink(len(tur)+1,sif_num , a+k+1 ))
                    #if localbet_yosef - len(karo[len(tur)+1]) != -1:
                        #print simans.group(2),roman_siman, localbet_yosef, len(karo[len(tur)+1])
                    #    pass
                    # Report any siman that does not follow its predecessor.
                    if roman_siman - older_siman != 1:
                        print siman
                        print roman_siman
                    older_siman = roman_siman
                    # Replace each "@66" with a running "[n]" marker; the
                    # counter is created once per call through the default
                    # argument (count is presumably itertools.count -- confirm).
                    text = re.sub(ur'@66', lambda m, c=count(1): '[{}]'.format(next(c)), text)
                    tur.append([text])
                except IndexError:
                    print "out of index"
            arbaturim.append(tur)
    # Recursive nesting depth of a list, printed as a sanity check.
    depth = lambda L: isinstance(L, list) and max(map(depth, L))+1
    print depth(tur)
    # NOTE(review): arbaturim is filled but tur is what gets returned.
    return tur
def parse(text):
    # Walk the "@22"-labelled seifim of the Rosh text and, for every "[*]"
    # marker, append a link (via `link`) between the commentary entry and the
    # Rosh position to the module-level `links` list.
    # NOTE(review): the source was collapsed onto one line; the nesting below
    # is a reconstruction.  The function ends without returning `rosh` or
    # using `si`, so it may be incomplete -- check the original file.
    if os.path.isfile("../source/Korban_Netanel_on_{}.txt".format(masechet)) or os.path.isfile(
        "../source/Pilpula_Charifta_on_{}.txt".format(masechet)
    ):
        # print "has korban netanel 2"
        nose_kelim = nosekelim.open_file()
        fixed = nosekelim.parse(nose_kelim)
    links_netanel = []
    netanel = 0
    rosh = []
    # Odd slots of the split are seif labels, even slots their content.
    a = re.split(ur"@22([^@]*)", text)
    for seif, cont in zip(a[1::2], a[2::2]):
        si = []
        korban = []
        # NOTE(review): the closing parenthesis of the first isfile() call sits
        # after the `or`, so only the Korban Netanel path is ever tested here.
        # NOTE(review): `netanel <= len(fixed)` lets fixed[netanel] below run
        # one past the end; the inner check uses `<`.
        # NOTE(review): the file names differ in letter case between checks
        # ("PilPula"/"Pilpula", "Korban_Netanel"/"Korban_netanel").
        if ur"[*]" in seif and (
            os.path.isfile(
                "../source/Korban_Netanel_on_{}.txt".format(masechet)
                or os.path.isfile("../source/PilPula_Charifta_on_{}.txt".format(masechet))
            )
            and netanel <= len(fixed)
        ):
            if os.path.isfile("../source/Korban_Netanel_on_{}.txt".format(masechet)):
                commentator = "Korban Netanel on "
            if os.path.isfile("../source/PilPula_Charifta_on_{}.txt".format(masechet)):
                commentator = "Pilpula Charifta on "
            korban.append(fixed[netanel])
            # print len(links_netanel)
            roash = "Rosh on %s." % masechet + str(len(links_netanel) + 1) + ".1"
            netanelink = commentator + masechet + "." + str(len(links_netanel) + 1) + ".1"
            links.append(link(netanelink, roash))
            netanel += 1
            # print "netanel one seif", seif, netanel
            # print fixed[netanel]
        content = re.split("@66", cont)
        # Keep only Hebrew letters of the label before converting it.
        seif = re.sub(ur"[^א-ת]", "", seif)
        seif = hebrew.heb_string_to_int(seif.strip())
        for num, co in enumerate(content):
            if ur"[*]" in co:
                # print co
                # One link per "[*]" marker that is followed by six characters.
                a = re.findall("\[\*\](.{6})", co)
                for b in a:
                    if (
                        os.path.isfile("../source/Korban_netanel_on_{}.txt".format(masechet))
                        or os.path.isfile("../source/Pilpula_Charifta_on_{}.txt".format(masechet))
                    ) and netanel < len(fixed):
                        if os.path.isfile("../source/Korban_netanel_on_{}.txt".format(masechet)):
                            commentator = "Korban Netanel "
                        if os.path.isfile("../source/Pilpula_Charifta_on_{}.txt".format(masechet)):
                            commentator = "Pilpula Charifta "
                        korban.append(fixed[netanel])
                        roash = "Rosh on %s." % masechet + str(len(links_netanel) + 1) + "." + str(num + 1)
                        netanelink = (
                            commentator
                            + "on "
                            + masechet
                            + "."
                            + str(len(links_netanel) + 1)
                            + "."
                            + str(len(korban))
                        )
                        links.append(link(netanelink, roash))
                        netanel += 1
            si.append(co)
def parse1(text):
    # Chaptered variant of the Rosh parser: the text is split on "@00" into
    # chapters and on "@22" into seifim, and for every "[*]" marker a link
    # (via `link`) between the commentary entry and the Rosh position is
    # appended to the module-level `links` list.
    # NOTE(review): the source was collapsed onto one line; the nesting below
    # is a reconstruction.  `rosh` is built but not returned here -- check the
    # original file.
    if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)):
        nose_kelim = nosekelim.open_file()
        fixed = nosekelim.parse(nose_kelim)
    links_netanel = []
    netanel = 0
    rosh = []
    chapters = re.split(ur'@00', text)
    for chapter_num, chapter in enumerate(chapters):
        print chapter_num, chapter[0:10]
        if len(chapter)<=1:
            pass
        else:
            perek = []
            # Odd slots of the split are seif labels, even slots their content.
            a = re.split(ur'@22([^@]*)', chapter)
            for seif, cont in zip(a[1::2], a[2::2]):
                si = []
                korban =[]
                print seif
                if ur'[*]' in seif:
                    print "hello1"
                # NOTE(review): `netanel <= len(fixed)` lets fixed[netanel]
                # below run one past the end; the inner check uses `<`.
                # NOTE(review): the file names differ in letter case between
                # checks ("PilPula"/"Pilpula", "Korban_Netanel"/"Korban_netanel").
                if ur'[*]' in seif and (os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet))) and netanel <= len(fixed):
                    print "hello", seif, netanel, len(fixed)
                    if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)):
                        commentator = "Korban Netanel"
                    if os.path.isfile('source/PilPula_Charifta_on_{}.txt'.format(masechet)):
                        commentator = "Pilpula Charifta"
                    korban.append(fixed[netanel])
                    # NOTE(review): the chapter number is len(rosh)+2 --
                    # confirm the offset against the target index.
                    roash = "Rosh on %s." % masechet +str(len(rosh)+2) + "." + str(len(perek)+1) + ".1"
                    netanelink = commentator + " on " + masechet +"."+ str(len(links_netanel)+1) + ".1"
                    print roash, netanelink
                    links.append(link(netanelink, roash))
                    netanel += 1
                content = re.split('@66', cont)
                # Keep only Hebrew letters of the label before converting it.
                seif = re.sub(ur'[^א-ת]',"", seif)
                seif = hebrew.heb_string_to_int(seif.strip())
                for num, co in enumerate(content):
                    # One link per "[*]" marker in this piece.
                    a = re.findall('\[\*\]', co)
                    for b in a:
                        print b, seif
                        if (os.path.isfile('source/Korban_netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet))) and netanel < len(fixed):
                            if os.path.isfile('source/Korban_netanel_on_{}.txt'.format(masechet)):
                                commentator = "Korban Netanel "
                            if os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)):
                                commentator = "Pilpula Charifta "
                            korban.append(fixed[netanel])
                            roash = "Rosh on %s." % masechet + str(len(rosh)+2) + "." + str(len(perek)+1) + "." + str(num+1)
                            netanelink = commentator + "on " + masechet + "." + str(len(links_netanel)+1)+ "."+ str(len(korban))
                            print roash, netanelink
                            links.append(link(netanelink, roash))
                            netanel +=1
                    si.append(co)
                # Commentary entries are collected per seif when a source file exists.
                if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)):
                    links_netanel.append(korban)
                perek.append(si)
            rosh.append(perek)
def parse_dapim(text):
    """Walk a page-by-page text and send each passage to its matcher.

    The text is split on page headers (two-digit tag followed by the page
    word and a label).  The label after the first three characters of the
    header, without apostrophes, is converted with
    ``hebrew.heb_string_to_int``.  Only the content before the first
    ``@66`` tag is used.  It is split at the side-b marker: the first part
    is side ``a``, everything after a marker is side ``b``.  Each side is
    split on ``@44`` into passages, numbered from 1.

    A passage mentioning Rashi in its first ten characters goes to
    ``search_rashi``; one mentioning Tosafot is skipped (its matcher is
    disabled); every other passage goes to ``search_gemara``.

    Fixes over the original: the page number is computed before the
    "longer than 3" diagnostic prints it (it was read before assignment),
    and sides are labelled by position (the old toggle labelled every
    passage ``a`` after the first page without a side b).  Locals that
    were never read were removed.  Diagnostics are printed once per page.

    Nothing is returned.
    """
    old_num = 1
    daf = re.split(u'@[0-9][0-9](דף[^@]*)', text)
    print("length daf %s" % len(daf))
    for daf_num, content in zip(daf[1::2], daf[2::2]):
        cut_books = re.split(u'@66([^@]*)', content)
        amudim = re.split(u'(?:@44|@11)ע"ב', cut_books[0])
        if len(amudim) < 2:
            print("%s %s" % (len(amudim), daf_num))

        label = daf_num[3:]
        number = hebrew.heb_string_to_int(re.sub("'", "", label.strip()))
        if len(label) > 3:
            print("longer than 3 %s %s" % (label, number))
        if (number - old_num) < 0 or (number - old_num) > 1:
            print("diff %s %s" % (number - old_num, daf_num))
        old_num = number

        for position, amud in enumerate(amudim):
            # Part 0 precedes the side-b marker, so it is side a.
            amud_num = 'a' if position == 0 else 'b'
            for i, verse in enumerate(re.split(u'@44', amud)):
                opening = verse[0:10]
                if u'רש"י' in opening:
                    search_rashi(verse, number, amud_num, i + 1)
                elif u'תוס' in opening:
                    # search_tosafot(verse, number, amud_num, i + 1) is disabled.
                    pass
                else:
                    search_gemara(verse, number, amud_num, i + 1)
def search2(parsed, part):
    """Link ``@44[...]@55`` page references in a two-level text to the Talmud.

    For every reference tag in ``parsed[k][i]`` the page label and side
    marker (``.`` for side a, ``:`` for side b) are read when the tag
    starts with the tractate name or with the page word.  Tags in any
    other form leave ``daf_num`` and ``amud`` as they were after the
    previous reference.  The quoted text up to the next full stop is given
    to ``matchobj``; when it reports a positive line number, a link built
    by ``makeLink`` is appended to the module-level ``links`` list.

    Any exception raised while matching is printed and the scan goes on.
    Nothing is returned.
    """
    tag_pattern = u'@44[\\[\\(](.*?)[\\]\\)]@55(.*?)\\.'
    for k, seif in enumerate(parsed):
        for i, pasuk in enumerate(seif):
            for find in re.finditer(tag_pattern, pasuk):
                daf = find.group(1)
                words = daf.strip().split(' ')
                if words[0] == u"מנחות" and len(words) < 6:
                    if len(words) == 3:
                        daf = words[2]
                    elif len(words) == 2:
                        daf = words[1]
                    has_page = True
                elif words[0] == u"דף":
                    daf = words[1]
                    has_page = True
                else:
                    # Other forms keep whatever page was parsed last.
                    has_page = False
                if has_page:
                    side_marker = daf[-1]
                    if side_marker == ".":
                        amud = "a"
                    elif side_marker == ":":
                        amud = "b"
                    daf_num = hebrew.heb_string_to_int(daf[0:-1])
                text = find.group(2)
                try:
                    print("%s %s %s %s" % (str(k + 1), str(i + 1), daf_num, amud))
                    hit = matchobj(daf_num, amud, text)
                    line = hit[1][0]
                    if line > 0:
                        talmud = ("{}".format(masechet) + "." + str(daf_num)
                                  + amud + "." + str(line))
                        roash = ("Rosh on {}".format(masechet) + ", " + part
                                 + "." + str(k + 1) + "." + str(1))
                        links.append(makeLink(talmud, roash))
                except Exception as e:
                    print(e)
def parse_dapim(text):
    """Walk a page-by-page text and send each passage to its matcher.

    The text is split on page headers (two-digit tag followed by the page
    word and a label).  The label after the first three characters of the
    header, without apostrophes, is converted with
    ``hebrew.heb_string_to_int``.  Only the content before the first
    ``@66`` tag is used.  It is split at the side-b marker: the first part
    is side ``a``, everything after a marker is side ``b``.  Each side is
    split on ``@44`` into passages, numbered from 1.

    A passage mentioning Rashi in its first ten characters goes to
    ``search_rashi``; one mentioning Tosafot is skipped (its matcher is
    disabled); every other passage goes to ``search_gemara``.

    Fixes over the original: the page number is computed before the
    "longer than 3" diagnostic prints it (it was read before assignment),
    and sides are labelled by position (the old toggle labelled every
    passage ``a`` after the first page without a side b).  Locals that
    were never read were removed.  Diagnostics are printed once per page.

    Nothing is returned.
    """
    old_num = 1
    daf = re.split(u'@[0-9][0-9](דף[^@]*)', text)
    print("length daf %s" % len(daf))
    for daf_num, content in zip(daf[1::2], daf[2::2]):
        cut_books = re.split(u'@66([^@]*)', content)
        amudim = re.split(u'(?:@44|@11)ע"ב', cut_books[0])
        if len(amudim) < 2:
            print("%s %s" % (len(amudim), daf_num))

        label = daf_num[3:]
        number = hebrew.heb_string_to_int(re.sub("'", "", label.strip()))
        if len(label) > 3:
            print("longer than 3 %s %s" % (label, number))
        if (number - old_num) < 0 or (number - old_num) > 1:
            print("diff %s %s" % (number - old_num, daf_num))
        old_num = number

        for position, amud in enumerate(amudim):
            # Part 0 precedes the side-b marker, so it is side a.
            amud_num = 'a' if position == 0 else 'b'
            for i, verse in enumerate(re.split(u'@44', amud)):
                opening = verse[0:10]
                if u'רש"י' in opening:
                    search_rashi(verse, number, amud_num, i + 1)
                elif u'תוס' in opening:
                    # search_tosafot(verse, number, amud_num, i + 1) is disabled.
                    pass
                else:
                    search_gemara(verse, number, amud_num, i + 1)
def divrey_chamuot2(text):
    """Collect links between the Rosh text and the Divrey Chamudot notes.

    The commentary itself is first loaded, parsed, registered, saved and
    posted through the ``tiferet_shmuel`` and ``Helper`` calls below, all
    with the record name ``"chamudot"``.

    ``text`` is then split on ``@00``/``@99`` and consumed in pairs (a
    heading piece followed by a chapter piece), each chapter on ``@22``
    seif labels and each seif on ``@66``.  Every ``(*)`` marker, in a seif
    label or in a piece of its content, advances the running note counter
    and adds one link built by ``link`` to a local list.

    Nothing is returned.
    """
    chamudotlinks = []
    count = 0
    source = tiferet_shmuel.open_file(record="chamudot")
    parsed = tiferet_shmuel.parse(source)
    Helper.createBookRecord(tiferet_shmuel.book_record(record="chamudot"))
    tiferet_shmuel.save_parsed_text(parsed, record="chamudot")
    tiferet_shmuel.run_post_to_api(record="chamudot")
    commentator = "Divrey Chamudot"
    rosh = []
    chapters = re.split(u"(?:@00|@99)", text)
    for chapter_num, chapter in zip(chapters[1::2], chapters[2::2]):
        print(chapter_num)
        if len(chapter) <= 1:
            continue
        perek = []
        seif_parts = re.split(u"@22([^@]*)", chapter)
        for seif, cont in zip(seif_parts[1::2], seif_parts[2::2]):
            si = []
            print(seif)
            rosh_prefix = ("Rosh on %s." % masechet + str(len(rosh) + 1)
                           + "." + str(len(perek) + 1) + ".")
            if u"(*)" in seif:
                print("hello1")
                count += 1
                roash = rosh_prefix + "1"
                shmuel = commentator + " on " + masechet + "." + str(count)
                chamudotlinks.append(link(roash, shmuel))
                print("%s %s" % (roash, shmuel))
            content = re.split("@66", cont)
            # The label is converted even though the number is not used.
            seif = re.sub(u"[^א-ת]", "", seif)
            seif = hebrew.heb_string_to_int(seif.strip())
            for num, co in enumerate(content):
                for marker in re.findall(r"\(\*\)", co):
                    count += 1
                    roash = rosh_prefix + str(num + 1)
                    shmuel = commentator + " on " + masechet + "." + str(count)
                    print("%s %s" % (roash, shmuel))
                    chamudotlinks.append(link(shmuel, roash))
                si.append(co)
            perek.append(si)
        rosh.append(perek)
def divrey_chamuot2(text):
    """Collect links between the Rosh text and the Divrey Chamudot notes.

    The commentary itself is first loaded, parsed, registered, saved and
    posted through the ``tiferet_shmuel`` and ``Helper`` calls below, all
    with the record name ``"chamudot"``.

    ``text`` is then split on ``@00``/``@99`` and consumed in pairs (a
    heading piece followed by a chapter piece), each chapter on ``@22``
    seif labels and each seif on ``@66``.  Every ``(*)`` marker, in a seif
    label or in a piece of its content, advances the running note counter
    and adds one link built by ``link`` to a local list.

    Nothing is returned.
    """
    chamudotlinks = []
    count = 0
    source = tiferet_shmuel.open_file(record="chamudot")
    parsed = tiferet_shmuel.parse(source)
    Helper.createBookRecord(tiferet_shmuel.book_record(record="chamudot"))
    tiferet_shmuel.save_parsed_text(parsed, record="chamudot")
    tiferet_shmuel.run_post_to_api(record="chamudot")
    commentator = "Divrey Chamudot"
    rosh = []
    chapters = re.split(u'(?:@00|@99)', text)
    for chapter_num, chapter in zip(chapters[1::2], chapters[2::2]):
        print(chapter_num)
        if len(chapter) <= 1:
            continue
        perek = []
        seif_parts = re.split(u'@22([^@]*)', chapter)
        for seif, cont in zip(seif_parts[1::2], seif_parts[2::2]):
            si = []
            print(seif)
            rosh_prefix = ("Rosh on %s." % masechet + str(len(rosh) + 1)
                           + "." + str(len(perek) + 1) + ".")
            if u'(*)' in seif:
                print("hello1")
                count += 1
                roash = rosh_prefix + "1"
                shmuel = commentator + " on " + masechet + "." + str(count)
                chamudotlinks.append(link(roash, shmuel))
                print("%s %s" % (roash, shmuel))
            content = re.split('@66', cont)
            # The label is converted even though the number is not used.
            seif = re.sub(u'[^א-ת]', "", seif)
            seif = hebrew.heb_string_to_int(seif.strip())
            for num, co in enumerate(content):
                for marker in re.findall(r'\(\*\)', co):
                    count += 1
                    roash = rosh_prefix + str(num + 1)
                    shmuel = commentator + " on " + masechet + "." + str(count)
                    print("%s %s" % (roash, shmuel))
                    chamudotlinks.append(link(shmuel, roash))
                si.append(co)
            perek.append(si)
        rosh.append(perek)
def links(clean_text, shas):
    """Hand every ``@88<reference>@77<quote>`` tag to ``matching``.

    ``clean_text[i][j]`` is scanned for tags.  The reference is split on
    single spaces: the second token is the page label (converted with
    ``hebrew.heb_string_to_int``) and the third character of the third
    token selects side ``a`` or ``b``.  The quote, split into words, is
    passed to ``matching`` with the positions ``i`` and ``j``, the word
    count and a ratio of 70.

    Nothing is returned.
    """
    tag = u"@88(.+?)@77(.+)"
    for i, page in enumerate(clean_text):
        for j, chapter in enumerate(page):
            for found in re.finditer(tag, chapter):
                heb_links = re.split(" ", found.group(1))
                page_label = heb_links[1]
                side_letter = heb_links[2][2]
                if side_letter == u'א':
                    eng_amud = 'a'
                elif side_letter == u'ב':
                    eng_amud = u'b'
                daf = hebrew.heb_string_to_int(page_label)
                quote = re.split(" ", found.group(2).strip())
                matching(quote, daf, eng_amud, i, j, shas,
                         words=len(quote), ratio=70)
def run_parser():
    """Parse ``source/Radak_on_Genesis.txt`` and pass the result on.

    The file is read and decoded as UTF-8, then normalised: each
    ``@11...@33`` span becomes ``@55...``, each ``@00`` tag is removed
    together with the text after it up to a newline, and the tagged
    ``(שם)`` reference loses its tags.  The text is then split on ``@22``
    chapter labels and ``@44(<one or two letters>)`` verse labels, and
    every non-blank ``@55`` segment is stored as a comment.

    The nested list is given to ``pretty_print`` and ``save_parsed_text``;
    nothing is returned.
    """
    print("running parser")
    parsed_text = []
    cur_chapter = 1
    cur_verse = 1
    with open("source/Radak_on_Genesis.txt", 'r') as filep:
        raw = filep.read()
    ucd_text = unicode(raw, 'utf-8').strip()

    # Get rid of some unhelpful markup.
    ucd_text = re.sub(u'@11(.*?)@33', u'@55\\1', ucd_text)
    ucd_text = re.sub(u'@00([^@]*)\\n', '', ucd_text)
    ucd_text = ucd_text.replace(u'@44(שם)@55', u'(שם)')

    # Split by chapter; the capture group keeps the chapter letters.
    chapter_parts = re.split(u'@22([^@]*)', ucd_text)
    for chapter_num, chapter in zip(chapter_parts[1::2], chapter_parts[2::2]):
        if chapter_num.strip() == '':
            continue
        cur_chapter = hebrew.heb_string_to_int(chapter_num.strip())
        parsed_chapter = []
        expand_list_assign(parsed_text, cur_chapter - 1, parsed_chapter)

        # Split on verse numbers, capturing the numbers as well.
        verse_parts = re.split(u'@44\\(([\u0590-\u05ea]{1,2})\\)', chapter)
        for verse_num, verse in zip(verse_parts[1::2], verse_parts[2::2]):
            if verse_num.strip() == '':
                continue
            parsed_verse = []
            cur_verse = hebrew.heb_string_to_int(verse_num.strip())
            expand_list_assign(parsed_chapter, cur_verse - 1, parsed_verse)
            for comment in verse.split('@55')[1:]:
                if comment.strip() != '':
                    parsed_verse.append(comment)

    pretty_print(parsed_text)
    save_parsed_text(parsed_text)
def link_to_link(link): if len(link.strip().split(" ")) ==2 and link[0]==ur'ד': dafamud = link.strip().split(" ")[1] amods = dafamud[len(dafamud)-1] if amods == ":": amod = "b" elif amods ==".": amod ="a" dap = dafamud[0:len(dafamud)-1] roman_daf = hebrew.heb_string_to_int(dap.strip()) return masechet + "." + str(roman_daf) + amod elif link[0]==ur'ד': pass #print link elif '.' in link or ":" in link: pass
def link_to_link(link): if len(link.strip().split(" ")) == 2 and link[0] == ur'ד': dafamud = link.strip().split(" ")[1] amods = dafamud[len(dafamud) - 1] if amods == ":": amod = "b" elif amods == ".": amod = "a" dap = dafamud[0:len(dafamud) - 1] roman_daf = hebrew.heb_string_to_int(dap.strip()) return masechet + "." + str(roman_daf) + amod elif link[0] == ur'ד': pass #print link elif '.' in link or ":" in link: pass
def search(text, shas):
    # Walk every siman of every seif in `text`, find the `@44<citation>@55|@11<quote>`
    # tags in it and hand each quote to `matching` together with the daf/amud and
    # the computed index.
    # Returns None; it returns early the first time a computed index is
    # >= len(shas).
    # NOTE(review): block structure was reconstructed from a whitespace-mangled
    # source - confirm nesting against the original file.
    for i, seif in enumerate(text):
        for j, siman in enumerate(seif):
            if siman.endswith(ur'5 '):
                print "yes"
            # Lazy iterator over (citation, quote) tags; the lookahead ends a
            # quote at the next @44 or at end of string.
            linked = re.finditer(ur'@44(.*?)@(?:55|11)(.*?)(?=(@44|$))', siman)
            # A siman with no tag in its first 10 characters: match its opening
            # words against the location left over from the previous tag.
            if '@44' not in siman[0:10] and len(siman) > 8:
                start = re.sub('([\[\*\]]|@..|#)',"",siman)
                start_of_siman = re.split(" ", start)
                # print "start of siman", start_of_siman[0]
                # `index`, `daf` and `amud` only exist once some earlier tag has
                # assigned them, hence the locals() probe. Do not rename `index`.
                if 'index' in locals():
                    # print "matching", index
                    if index > len(shas):
                        break
                    matching(start_of_siman, shas, i, j, index, daf, amud)
            for match in linked:
                lookfor = match.group(2)
                tagged = re.split(" ", lookfor.strip())
                daf_amud = re.split(ur' ', match.group(1).strip())
                # Word 1 of the citation is the daf numeral; keep Hebrew letters only.
                daf = re.sub(ur'[^א-ת]',"",daf_amud[1])
                daf = hebrew.heb_string_to_int(daf)
                if daf > len(shas) or len(daf_amud) <= 2:
                    # print "daf", daf, "is longer than needs to"
                    break
                else:
                    print daf_amud[0]
                    amud = daf_amud[2]
                    # print "amud", amud
                    # Index of amud b for this daf; amud a is one less (see below).
                    index = ((daf-2)*2)+1
                    #if index > len(shas):
                    # break
                    if amud[2].strip() == ur'א':
                        amud = 'a'
                        index = index - 1
                    else:
                        amud = 'b'
                    #print "daf", daf, amud
                    if index >= len(shas):
                        # print "short", daf, amud, lookfor
                        return
                        #break
                    else:
                        # print "else"
                        matching(tagged, shas, i, j, index, daf, amud)
def search(text, shas):
    # Walk every siman of every seif in `text`, find the `@44<citation>@55|@11<quote>`
    # tags in it and hand each quote to `matching` together with the daf/amud and
    # the computed index.
    # Returns None. A tag whose index is >= len(shas) is simply not matched.
    # NOTE(review): block structure was reconstructed from a whitespace-mangled
    # source - confirm nesting against the original file.
    for i, seif in enumerate(text):
        for j, siman in enumerate(seif):
            if siman.endswith(ur'5 '):
                print "yes"
            # Lazy iterator over (citation, quote) tags; the lookahead ends a
            # quote at the next @44 or at end of string.
            linked = re.finditer(ur'@44(.*?)@(?:55|11)(.*?)(?=(@44|$))', siman)
            # A siman with no tag in its first 10 characters: match its opening
            # words against the location left over from the previous tag.
            if '@44' not in siman[0:10] and len(siman) > 8:
                start = re.sub('([\[\*\]]|@..|#)', "", siman)
                start_of_siman = re.split(" ", start)
                # print "start of siman", start_of_siman[0]
                # `index`, `daf` and `amud` only exist once some earlier tag has
                # assigned them, hence the locals() probe. Do not rename `index`.
                if 'index' in locals():
                    # print "matching", index
                    if index > len(shas):
                        break
                    matching(start_of_siman, shas, i, j, index, daf, amud)
            for match in linked:
                lookfor = match.group(2)
                tagged = re.split(" ", lookfor.strip())
                daf_amud = re.split(ur' ', match.group(1).strip())
                # Word 1 of the citation is the daf numeral; keep Hebrew letters only.
                daf = re.sub(ur'[^א-ת]', "", daf_amud[1])
                daf = hebrew.heb_string_to_int(daf)
                if daf > len(shas) or len(daf_amud) <= 2:
                    # print "daf", daf, "is longer than needs to"
                    break
                else:
                    print daf_amud[0]
                    amud = daf_amud[2]
                    # print "amud", amud
                    # Index of amud b for this daf; amud a is one less (see below).
                    index = ((daf - 2) * 2) + 1
                    #if index > len(shas):
                    # break
                    if amud[2].strip() == ur'א':
                        amud = 'a'
                        index = index - 1
                    else:
                        amud = 'b'
                    #print "daf", daf, amud
                    if index >= len(shas):
                        # print "short", daf, amud, lookfor
                        pass
                        #break
                    else:
                        # print "else"
                        matching(tagged, shas, i, j, index, daf, amud)
def yomtov2(text): chamudotlinks = [] count = 0 file = tiferet_shmuel.open_file(record = "yomtov") parsed = tiferet_shmuel.parse(file) Helper.createBookRecord(tiferet_shmuel.book_record(record = "yomtov")) tiferet_shmuel.save_parsed_text(parsed, record = "yomtov") tiferet_shmuel.run_post_to_api(record = "yomtov") commentator = "Maadaney Yom Tov" rosh = [] chapters = re.split(ur'(?:@00|@99)', text) for chapter_num, chapter in enumerate(chapters): if len(chapter)<=1: pass else: perek = [] a = re.split(ur'@22([^@]*)', chapter) for seif, cont in zip(a[1::2], a[2::2]): si = [] print seif if ur'[*]' in seif: print "hello1" if ur'[*]' in seif: count+=1 roash = "Rosh on %s." % masechet +str(len(rosh)+1) + "." + str(len(perek)+1) + ".1" shmuel = commentator + " on " + masechet +"."+ str(count) chamudotlinks.append(link(roash, shmuel)) print roash, shmuel content = re.split('@66', cont) seif = re.sub(ur'[^א-ת]',"", seif) seif = hebrew.heb_string_to_int(seif.strip()) for num, co in enumerate(content): a = re.findall('\[\*\]', co) for b in a: count+=1 roash = "Rosh on %s." % masechet + str(len(rosh)+1) + "." + str(len(perek)+1) + "." + str(num+1) shmuel = commentator + " on " + masechet + "." + str(count) print roash, shmuel chamudotlinks.append(link(shmuel, roash)) #print parsed[count] si.append(co) perek.append(si) rosh.append(perek)
def parse(text): if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)): # print "has korban netanel 2" nose_kelim = nosekelim.open_file() fixed = nosekelim.parse(nose_kelim) links_netanel = [] netanel = 0 rosh = [] a = re.split(ur'@22([^@]*)', text) for seif, cont in zip(a[1::2], a[2::2]): si = [] korban =[] if ur'[*]' in seif and (os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet) or os.path.isfile('source/PilPula_Charifta_on_{}.txt'.format(masechet))) and netanel <= len(fixed)): if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)): commentator = "Korban Netanel on " if os.path.isfile('source/PilPula_Charifta_on_{}.txt'.format(masechet)): commentator = "Pilpula Charifta on " korban.append(fixed[netanel]) #print len(links_netanel) roash = "Rosh on %s." % masechet + str(len(links_netanel)+1) + ".1" netanelink = commentator + masechet +"."+ str(len(links_netanel)+1) + ".1" links.append(link(netanelink, roash)) netanel += 1 #print "netanel one seif", seif, netanel #print fixed[netanel] content = re.split('@66', cont) seif = re.sub(ur'[^א-ת]',"", seif) seif = hebrew.heb_string_to_int(seif.strip()) for num, co in enumerate(content): if ur'[*]' in co: print co a = re.findall('\[\*\](.{6})', co) for b in a: if (os.path.isfile('source/Korban_netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet))) and netanel < len(fixed): if os.path.isfile('source/Korban_netanel_on_{}.txt'.format(masechet)): commentator = "Korban Netanel " if os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)): commentator = "Pilpula Charifta " korban.append(fixed[netanel]) roash = "Rosh on %s." % masechet + str(len(links_netanel)+1) + "." + str(num+1) netanelink = commentator + "on " +masechet + "." + str(len(links_netanel)+1)+ "."+ str(len(korban)) links.append(link(netanelink, roash)) netanel +=1 si.append(co)
def search(parsed): for i in parsed: for j in i: found = re.finditer(ur'\(דף(.*?)\)(.*?)\(', j) for find in found: daf = find.group(1) text = find.group(2) if daf[len(daf)-1] == '.': #print daf amud = 'a' elif daf[len(daf)-1] == ':': amud = 'b' new_daf = daf[0:len(daf.strip())].strip() #print new_daf try: daf_num = hebrew.heb_string_to_int(new_daf) #print str(daf_num) + amud match(daf_num, amud, text) except KeyError: pass
def search1(parsed): for i in parsed: for k in i: for j in k: found = re.finditer(ur'@44\[דף(.*?)\](.*?)\(', j) for find in found: daf = find.group(1) text = find.group(2) if daf.strip().split(" ")[1] == u'ע"א': amud = 'a' elif daf.strip().split(" ")[1] == u'ע"ב': amud = 'b' new_daf = daf.strip().split(" ")[0] try: daf_num = hebrew.heb_string_to_int(new_daf) #print str(daf_num) + amud match(daf_num, amud, text) except KeyError: pass
def links(clean_text, shas): for i, page in enumerate(clean_text): for j, chapter in enumerate(page): a = re.finditer(ur"@88(.+?)@77(.+)", chapter) for link in a: heb_links = re.split(" ", link.group(1)) daf = heb_links[1] amud = heb_links[2] if amud[2] == ur'א': eng_amud = 'a' elif amud[2] == ur'ב': eng_amud = ur'b' daf = hebrew.heb_string_to_int(daf) quote = re.split(" ", link.group(2).strip()) matching(quote, daf, eng_amud, i, j, shas, words=len(quote), ratio=70)
def search(parsed): for i,perek in enumerate(parsed): for j,pasuk in enumerate(perek): found = re.finditer(ur'\(דף(.*?)\)', pasuk) for find in found: daf = find.group(1) #text = find.group(2) if daf[len(daf)-1] == '.': #print daf amud = 'a' elif daf[len(daf)-1] == ':': amud = 'b' new_daf = daf[0:len(daf.strip())].strip() #print new_daf try: daf_num = hebrew.heb_string_to_int(new_daf) #print str(daf_num) + amud + " " + str(i) + " " + str(j+1) links.append(link("{}".format(masechet) + "." + str(daf_num) + amud, "Rosh on {}, Hilchot Seder Avodat Yom HaKippurim".format(masechet) + "." + str(i) + "." + str(j+1))) #links.append(link( #match(daf_num, amud, text) except KeyError: pass
def parse(text): links_netanel = [] netanel = 0 rosh = [] a = re.split(ur'@22([^@]*)', text) for seif, cont in zip(a[1::2], a[2::2]): si = [] if ur'[*]' in seif: print seif netanel += 1 #si.append(seif) content = re.split('@66', cont) seif = re.sub(ur'[\s\[\*\]]',"", seif) seif = hebrew.heb_string_to_int(seif.strip()) for num, co in enumerate(content): a = re.findall('\[\*\](.{6})', co) #for b in a: #print b netanel +=len(a) #print seif, num, netanel - len(a), netanel #print len(a) si.append(co) rosh.append(si)
def parse(text): links_netanel = [] netanel = 0 rosh = [] a = re.split(ur'@22([^@]*)', text) for seif, cont in zip(a[1::2], a[2::2]): si = [] if ur'[*]' in seif: print seif netanel += 1 #si.append(seif) content = re.split('@66', cont) seif = re.sub(ur'[\s\[\*\]]', "", seif) seif = hebrew.heb_string_to_int(seif.strip()) for num, co in enumerate(content): a = re.findall('\[\*\](.{6})', co) #for b in a: #print b netanel += len(a) #print seif, num, netanel - len(a), netanel #print len(a) si.append(co) rosh.append(si)
def parse(text): old_num = 0 dibbur = "" #simanim = re.finditer(ur'(@[0-9][0-9])\n?(@[0-9][0-9])(.*\n*)', text) simanim = re.split("@77", text) bayit_chadash = [] perek = [] i = 1 for siman in simanim: simans = re.finditer("@11(.*)@33(.*)", siman) for s in simans: dibbur = "(" + str(i) + ")" + "<b>" + s.group( 1) + '</b>' + s.group(2) print i i = i + 1 if "@22" not in siman: perek.append(dibbur) elif "@22" in siman: #i = 1 num = re.findall("@22(.*)", siman)[0] new_num = hebrew.heb_string_to_int(num.strip()) #print new_num if new_num - old_num != 1: for k in range(1, new_num - old_num): bayit_chadash.append([]) old_num = new_num bayit_chadash.append(perek) perek = [] perek.append(dibbur) i = 1 bayit_chadash.append(perek) #print len(bayit_chadash) return bayit_chadash[1:len(bayit_chadash)]
def parse(text):
    # Split `text` into daf sections on a two-digit @-tag followed by a fixed
    # Hebrew abbreviation, then split each section into bold-headed comments
    # collected per amud in `agadot`.
    # NOTE(review): block structure was reconstructed from a whitespace-mangled
    # source and the function has no visible return - confirm against the
    # original file.
    agadot=[[],[]]
    old_number = 1
    # The capturing group keeps each section header; headers and bodies alternate.
    dappim = re.split(ur'@[0-9][0-9]ח"א([^@]*)', text)
    for daf, content in zip(dappim[1::2],dappim[2::2]):
        same = False
        ab = False
        seifim =[]
        # Headers are read positionally after splitting on single spaces:
        # word 2 is the daf numeral, word 3 the amud label, word 4 the first
        # heading word. Shorter headers are skipped.
        if len(daf.split(" ")) > 4:
            if len(daf.split(" "))>=5:
                string= daf.split(" ")[4]
                #print string
            daf_n = daf.split(" ")[2]
            print daf_n
            amud = daf.split(" ")[3].strip()
            # The third character of the amud label selects a/b.
            if amud[2].strip()== ur"א":
                amuds = 'a'
            elif amud[2].strip() == ur"ב":
                amuds ='b'
            else:
                print "did it get here", amud
        else:
            continue
        number = hebrew.heb_string_to_int(daf_n)
        if number - old_number==0:
            same =True
        # Pad `agadot` for daf numbers that were skipped.
        # NOTE(review): two lists per skipped daf, presumably one per amud - confirm.
        if number - old_number>1:
            for i in range(1,number-old_number):
                agadot.append([])
                agadot.append([])
        old_number = number
        simanim = re.finditer(ur'(?:@[0-9][0-9]|[0-9])(.*)',content)
        for match in simanim:
            # Stop at a line that contains a digit directly followed by the
            # Hebrew word for "daf".
            if re.search(ur'[0-9]דף', match.group(0)) is not None:
                print match.group(0)
                break
            siman = match.group(0)
            siman = re.split("@77([^(?:@|[0-9]]*)",siman)
            for simans in siman:
                if simans != "":
                    # NOTE: rebinds `simanim`; the enclosing for-loop already
                    # holds its own iterator, so iteration is unaffected.
                    simanim = re.split('(?:@[0-9][0-9]|[0-9])',simans)
                    if len(simanim[0])>1:
                        simanim_string = "<b>" +string + " " + '</b>'+ simanim[0]
                        seifim.append(simanim_string)
                        # A heading starting with the amud-b label closes the
                        # current list and opens a new one.
                        if ur'<b>ע"ב' in simanim_string:
                            amuds="b"
                            print simanim_string
                            agadot.append(seifim)
                            seifim=[]
                            seifim.append(simanim_string)
                            ab = True
                    if len(simanim) > 1:
                        # Remaining pieces are consumed pairwise as (heading, body).
                        for i in range(1,len(simanim)-1,2):
                            simanim_string = ur'<b>' + simanim[i] + " " + ur'</b>' + simanim[i+1]
                            if u'<b>ע"ב' in simanim_string:
                                print daf_n
                                amuds="b"
                                print simanim_string
                                agadot.append(seifim)
                                seifim=[]
                                ab = True
                            if len(simanim_string) > 1:
                                seifim.append(simanim_string)
def parse(text): i = 0 kb = re.split(ur"@00הלכות כלאי בגדים(.*?)@00פרק תשיעי", text) begadim = kb[1] ending = re.split(ur"@00הלכות מקוואות", kb[2]) bdy = kb[0] + ending[0] mikva = ending[1] old_numeri = 0 rosh = [] kileiggadim = [] hilchotmikvaot = [] chapters = re.split(ur'(?:@00|@99)([^@]*)', bdy) for chapter_num, chapter in zip(chapters[1::2], chapters[2::2]): mispar = chapter_num.strip().split(" ")[1] if mispar.encode('utf-8') in misparim.keys(): mispar_numeri = misparim[mispar.encode('utf-8')] print mispar_numeri if mispar_numeri - old_numeri > 1: for i in range(1, mispar_numeri - old_numeri): rosh.append([]) old_numeri = mispar_numeri print mispar #if len(chapter)<=1: # pass #else: perek = [] a = re.split(ur'@22([^@]*)', chapter) for seif, cont in zip(a[1::2], a[2::2]): si = [] content = re.split('@66', cont) seif = re.sub(ur'[^א-ת]', "", seif) seif = hebrew.heb_string_to_int(seif.strip()) for num, co in enumerate(content): a = re.findall('\[\*\]', co) #for b in a: #print b, seif si.append(co) perek.append(si) if len(perek) is not 0: rosh.append(perek) # print len(rosh) search2(rosh) #take care of begadim b = re.split(ur'@22([^@]*)', begadim) for sei, con in zip(b[1::2], b[2::2]): si = [] conten = re.split('@66', con) sei = re.sub(ur'[^א-ת]', "", sei) sei = hebrew.heb_string_to_int(sei.strip()) for num, co in enumerate(conten): b = re.findall('\[\*\]', co) #for c in b: #print c, sei si.append(co) kileiggadim.append(si) b = re.split(ur'@22([^@]*)', mikva) for sei, con in zip(b[1::2], b[2::2]): si = [] conten = re.split('@66', con) sei = re.sub(ur'[^א-ת]', "", sei) sei = hebrew.heb_string_to_int(sei.strip()) for num, co in enumerate(conten): b = re.findall('\[\*\]', co) for c in b: print c, sei si.append(co) hilchotmikvaot.append(si) #take care of mikva return rosh, kileiggadim, hilchotmikvaot
def parse(text): i=0 kb = re.split(ur"@00הלכות כלאי בגדים(.*?)@00פרק תשיעי", text) begadim = kb[1] ending = re.split(ur"@00הלכות מקוואות",kb[2]) bdy = kb[0] + ending[0] mikva = ending[1] old_numeri = 0 rosh = [] kileiggadim =[] hilchotmikvaot =[] chapters = re.split(ur'(?:@00|@99)([^@]*)', bdy) for chapter_num, chapter in zip(chapters[1::2], chapters[2::2]): mispar = chapter_num.strip().split(" ")[1] if mispar.encode('utf-8') in misparim.keys(): mispar_numeri = misparim[mispar.encode('utf-8')] print mispar_numeri if mispar_numeri - old_numeri > 1: for i in range(1,mispar_numeri-old_numeri): rosh.append([]) old_numeri = mispar_numeri print mispar #if len(chapter)<=1: # pass #else: perek = [] a = re.split(ur'@22([^@]*)', chapter) for seif, cont in zip(a[1::2], a[2::2]): si = [] content = re.split('@66', cont) seif = re.sub(ur'[^א-ת]',"", seif) seif = hebrew.heb_string_to_int(seif.strip()) for num, co in enumerate(content): a = re.findall('\[\*\]', co) #for b in a: #print b, seif si.append(co) perek.append(si) if len(perek) is not 0: rosh.append(perek) # print len(rosh) search2(rosh) #take care of begadim b = re.split(ur'@22([^@]*)',begadim ) for sei, con in zip(b[1::2], b[2::2]): si = [] conten = re.split('@66', con) sei = re.sub(ur'[^א-ת]',"", sei) sei = hebrew.heb_string_to_int(sei.strip()) for num, co in enumerate(conten): b = re.findall('\[\*\]', co) #for c in b: #print c, sei si.append(co) kileiggadim.append(si) b = re.split(ur'@22([^@]*)', mikva) for sei, con in zip(b[1::2], b[2::2]): si = [] conten = re.split('@66', con) sei = re.sub(ur'[^א-ת]',"", sei) sei = hebrew.heb_string_to_int(sei.strip()) for num, co in enumerate(conten): b = re.findall('\[\*\]', co) for c in b: print c, sei si.append(co) hilchotmikvaot.append(si) #take care of mikva return rosh , kileiggadim,hilchotmikvaot
def parse1(text): old_numeri = 0 if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)): nose_kelim = nosekelim.open_file() fixed = nosekelim.parse(nose_kelim) links_netanel = [] netanel = 0 rosh = [] chapters = re.split(ur'(?:@00|@99)([^@]*)', text) for chapter_num, chapter in zip(chapters[1::2], chapters[2::2]): mispar = chapter_num.strip().split(" ")[1] if mispar.encode('utf-8') in misparim.keys(): mispar_numeri = misparim[mispar.encode('utf-8')] print mispar_numeri if mispar_numeri - old_numeri > 1: for i in range(1,mispar_numeri-old_numeri): rosh.append([]) #print "length of rosh", len(rosh) old_numeri = mispar_numeri print mispar #if len(chapter)<=1: # pass #else: perek = [] a = re.split(ur'@22([^@]*)', chapter) for seif, cont in zip(a[1::2], a[2::2]): si = [] korban =[] #print seif if ur'[*]' in seif and (os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet))) and netanel < len(fixed): # print "hello", seif, netanel, len(fixed) if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)): commentator = "Korban Netanel" if os.path.isfile('source/PilPula_Charifta_on_{}.txt'.format(masechet)): commentator = "Pilpula Charifta" korban.append(fixed[netanel]) roash = "Rosh on %s." % masechet +str(len(rosh)+1) + "." 
+ str(len(perek)+1) + ".1" netanelink = commentator + " on " + masechet +"."+ str(len(links_netanel)+1) + ".1" #print roash, netanelink links.append(link(netanelink, roash)) netanel += 1 content = re.split('@66', cont) seif = re.sub(ur'[^א-ת]',"", seif) seif = hebrew.heb_string_to_int(seif.strip()) for num, co in enumerate(content): a = re.findall('\[\*\]', co) for b in a: # print b, seif if (os.path.isfile('source/Korban_netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet))) and netanel < len(fixed): if os.path.isfile('source/Korban_netanel_on_{}.txt'.format(masechet)): commentator = "Korban Netanel " if os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)): commentator = "Pilpula Charifta " korban.append(fixed[netanel]) roash = "Rosh on %s." % masechet + str(len(rosh)+1) + "." + str(len(perek)+1) + "." + str(num+1) netanelink = commentator + "on " + masechet + "." + str(len(links_netanel)+1)+ "."+ str(len(korban)) #print roash, netanelink links.append(link(netanelink, roash)) netanel +=1 si.append(co) if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)): links_netanel.append(korban) perek.append(si) if len(perek) is not 0: rosh.append(perek)
def parse(text):
    # Split `text` on @00 into parts, find the `@22<siman>@11<body>` entries in
    # each part, render every entry as a one-element list and append it to
    # `tur`; link objects built with `addlink` go to the module-level `links`.
    # Returns `tur`.
    # NOTE(review): block structure was reconstructed from a whitespace-mangled
    # source - confirm nesting against the original file.
    older_siman = 0
    arbaturim = []
    tur = []
    hilchos = re.split(ur'@00', text)
    #split to names of parts
    for halacha in hilchos:
        if len(halacha) > 0:
            halacha_name = halacha.splitlines()[0]
            #print halacha_name
            #get the name of the part
            simanim = re.finditer(
                ur'(@?[0-9]?[0-9]?@?[0-9]?[0-9]?)@22(.*)@11(.*)', halacha
            )
            #cut the text to simanim, get the letter of the siman and tags to commentary
            i = 1
            for simans in simanim:
                localbet_yosef = 0
                siman = simans.group(2)
                # Drop bracketed/parenthesised notes, then everything that is
                # not a Hebrew letter, leaving only the siman numeral.
                siman = re.sub(ur'[\(\[].*?[\)\]]', "", siman)
                siman = re.sub(ur'[^\u05d0-\u05ea]', "", siman)
                if len(siman) > 4:
                    # print simans.group(2)
                    #print simans.group(3)
                    pass
                roman_siman = hebrew.heb_string_to_int(siman.strip())
                # Text before a single @33 is rendered bold.
                # NOTE: this rebinds the `text` parameter.
                bold = re.split(ur'@33', simans.group(3))
                if len(bold) == 2:
                    text = simans.group(1) + "<b>" + bold[0] + "</b>" + bold[1]
                else:
                    text = simans.group(1) + simans.group(3)
                #text1 = re.split(u"(.*?[.:])", text)
                #text1 = filter(None, text1)
                #taking care of links
                try:
                    # `karo` is indexed by the number of simanim stored so far;
                    # an IndexError here is caught below and the siman is then
                    # not appended to `tur`.
                    for k in range(0, len(karo[len(tur)])):
                        #print len(tur)+1,k
                        #for k in range(1,len(re.findall("@66",simans.group(0)))):
                        #print simans.group(0)
                        if "@66" in simans.group(1):
                            links.append(addlink(len(tur) + 1, 1, k + 1))
                        localbet_yosef += len(re.findall("@66", simans.group(2)))
                        if "@66" in simans.group(2):
                            links.append(addlink(len(tur) + 1, 1, k + 1))
                        # NOTE(review): `text` is a string here, so this walks
                        # single characters and "@66" can never match one -
                        # the inner loop looks like dead code; confirm.
                        for sif_num, sifs in enumerate(text, start=1):
                            for a in range(1, len(re.findall("@66", sifs))):
                                links.append(
                                    addlink(len(tur) + 1, sif_num, a + k + 1))
                    #if localbet_yosef - len(karo[len(tur)+1]) != -1:
                    #print simans.group(2),roman_siman, localbet_yosef, len(karo[len(tur)+1])
                    # pass
                    # Report simanim that do not follow the previous one by exactly 1.
                    if roman_siman - older_siman != 1:
                        print siman
                        print roman_siman
                    older_siman = roman_siman
                    # Replace each @66 with a running "[n]" tag. The counter is
                    # created once per call as a default argument.
                    # presumably `count` is itertools.count - verify the import.
                    text = re.sub(ur'@66', lambda m, c=count(1): '[{}]'.format(next(c)), text)
                    tur.append([text])
                except IndexError:
                    print "out of index"
            arbaturim.append(tur)
    # Recursive nesting depth of a list, printed as a debugging aid.
    depth = lambda L: isinstance(L, list) and max(map(depth, L)) + 1
    print depth(tur)
    return tur
def parse1(text): old_numeri = 0 if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format( masechet)) or os.path.isfile( 'source/Pilpula_Charifta_on_{}.txt'.format(masechet)): nose_kelim = nosekelim.open_file() fixed = nosekelim.parse(nose_kelim) links_netanel = [] netanel = 0 rosh = [] chapters = re.split(ur'(?:@00|@99)([^@]*)', text) for chapter_num, chapter in zip(chapters[1::2], chapters[2::2]): mispar = chapter_num.strip().split(" ")[1] if mispar.encode('utf-8') in misparim.keys(): mispar_numeri = misparim[mispar.encode('utf-8')] print mispar_numeri if mispar_numeri - old_numeri > 1: for i in range(1, mispar_numeri - old_numeri): rosh.append([]) #print "length of rosh", len(rosh) old_numeri = mispar_numeri print mispar #if len(chapter)<=1: # pass #else: perek = [] a = re.split(ur'@22([^@]*)', chapter) for seif, cont in zip(a[1::2], a[2::2]): si = [] korban = [] #print seif if ur'[*]' in seif and ( os.path.isfile( 'source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile( 'source/Pilpula_Charifta_on_{}.txt'.format(masechet)) ) and netanel < len(fixed): # print "hello", seif, netanel, len(fixed) if os.path.isfile( 'source/Korban_Netanel_on_{}.txt'.format(masechet)): commentator = "Korban Netanel" if os.path.isfile( 'source/PilPula_Charifta_on_{}.txt'.format(masechet)): commentator = "Pilpula Charifta" korban.append(fixed[netanel]) roash = "Rosh on %s." % masechet + str( len(rosh) + 1) + "." + str(len(perek) + 1) + ".1" netanelink = commentator + " on " + masechet + "." 
+ str( len(links_netanel) + 1) + ".1" #print roash, netanelink links.append(link(netanelink, roash)) netanel += 1 content = re.split('@66', cont) seif = re.sub(ur'[^א-ת]', "", seif) seif = hebrew.heb_string_to_int(seif.strip()) for num, co in enumerate(content): a = re.findall('\[\*\]', co) for b in a: # print b, seif if (os.path.isfile( 'source/Korban_netanel_on_{}.txt'.format(masechet)) or os.path.isfile( 'source/Pilpula_Charifta_on_{}.txt'.format( masechet))) and netanel < len(fixed): if os.path.isfile( 'source/Korban_netanel_on_{}.txt'.format( masechet)): commentator = "Korban Netanel " if os.path.isfile( 'source/Pilpula_Charifta_on_{}.txt'.format( masechet)): commentator = "Pilpula Charifta " korban.append(fixed[netanel]) roash = "Rosh on %s." % masechet + str( len(rosh) + 1) + "." + str(len(perek) + 1) + "." + str(num + 1) netanelink = commentator + "on " + masechet + "." + str( len(links_netanel) + 1) + "." + str(len(korban)) #print roash, netanelink links.append(link(netanelink, roash)) netanel += 1 si.append(co) if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format( masechet)) or os.path.isfile( 'source/Pilpula_Charifta_on_{}.txt'.format(masechet)): links_netanel.append(korban) perek.append(si) if len(perek) is not 0: rosh.append(perek)