def do_book_comparison(book_index_xml):
    """Compare one book of the parsed Leningrad text with Sefaria's copy.

    Three things are checked:

    * the number of chapters listed in the index against the chapters
      actually present in ``preprocess_json/<name>.json`` and in the Sefaria
      "Tanach with Ta'amei Hamikra" Hebrew text;
    * for every chapter listed in the index, the listed verse count against
      the verses actually present in both texts;
    * the texts themselves, as an HTML diff of both after
      ``make_consonantal_text`` and ``flatten_text``.

    Count mismatches are appended to ``results/length_comparison.txt``; the
    diff is written to ``results/<name>_wlc_koren.html``.

    Args:
        book_index_xml: index element for one book. It is queried for
            ``./names/name``, ``./cs`` and for ``c`` children that carry an
            ``n`` attribute and a ``vs`` child.
    """
    canonical_name = book_index_xml.find('./names/name').text
    print(canonical_name)

    def verses_in(text, ch_num):
        # A chapter listed in the index can be missing from a text (that is
        # the very mismatch being reported), so count it as empty instead
        # of letting an IndexError abort the whole comparison.
        if 0 < ch_num <= len(text):
            return len(text[ch_num - 1])
        return 0

    # ``with`` guarantees both result files are closed even when loading,
    # fetching or diffing fails part-way through.
    with open("results/%s_wlc_koren.html" % canonical_name, 'wb+') as diff_file, \
            open("results/length_comparison.txt", 'ab+') as length_results:
        # Chapter count as listed in the Leningrad index.
        wlc_chapter_count = int(book_index_xml.find('./cs').text)
        with open("preprocess_json/%s.json" % canonical_name, 'r') as filep:
            wlc_text = json.load(filep)['text']
        # Chapter count physically present in the parsed Leningrad text.
        wlc_real_chapter_count = len(wlc_text)

        sefaria_book = Helper.getKnownTexts(canonical_name)
        sefaria_chapter_count = sefaria_book['lengths'][0]
        sefaria_text = Helper.api_get_text(
            "%s 1-%s" % (sefaria_book['title'], sefaria_chapter_count),
            'he', "Tanach with Ta'amei Hamikra")['he']
        if sefaria_chapter_count == 1:
            # Wrap a one-chapter book so it is indexed by chapter like the
            # multi-chapter case.
            sefaria_text = [sefaria_text]
        sefaria_real_chapter_count = len(sefaria_text)

        chapter_counts = [
            wlc_chapter_count, wlc_real_chapter_count,
            sefaria_chapter_count, sefaria_real_chapter_count
        ]
        if not all_same(chapter_counts):
            length_results.write(
                "%s: Leningrad has %s chapters listed and %s chapters in text. Sefaria version has %s chapters listed and %s chapters in text\n"
                % (canonical_name.encode('utf-8'), wlc_chapter_count,
                   wlc_real_chapter_count, sefaria_chapter_count,
                   sefaria_real_chapter_count))

        for chapter in book_index_xml.findall('c'):
            ch_num = int(chapter.get('n'))
            wlc_verse_count = int(chapter.find('vs').text)
            wlc_real_verse_count = verses_in(wlc_text, ch_num)
            sefaria_real_verse_count = verses_in(sefaria_text, ch_num)
            if not all_same([wlc_verse_count, wlc_real_verse_count,
                             sefaria_real_verse_count]):
                # One report entry per line: the message must not contain
                # an embedded line break.
                length_results.write(
                    "%s:%s Leningrad has %s verses listed and %s verses in text. Sefaria version has %s verses\n"
                    % (canonical_name.encode('utf-8'), ch_num,
                       wlc_verse_count, wlc_real_verse_count,
                       sefaria_real_verse_count))

        html_diff = difflib.HtmlDiff().make_file(
            flatten_text(make_consonantal_text(wlc_text)),
            flatten_text(make_consonantal_text(sefaria_text)),
            'Leningrad Codex', 'Sefaria/Koren')
        # The generated page declares ISO-8859-1; the file is written as
        # UTF-8, so the declared charset has to match.
        html_diff = html_diff.replace('charset=ISO-8859-1', 'charset=utf-8')
        diff_file.write(html_diff.encode('utf-8'))
def get_biblical_text(ref):
    """Fetch ``ref`` via ``Helper.api_get_text`` and return it as a portion dict.

    The result has the keys ``book``, ``text``, ``sections``,
    ``searchBeginSections`` and ``toSections``. ``text`` is normalised to a
    list of chapters in which every chapter is itself a list.
    """
    response = Helper.api_get_text(ref)
    book = response['book']
    chapters = response['he']
    start = response['sections']
    end = response['toSections']

    if start[0] == end[0]:
        # A single-chapter result arrives as a flat list of strings rather
        # than a 2d array; wrap it so the structure is uniform.
        chapters = [chapters]

    for idx, chapter in enumerate(chapters):
        if isinstance(chapter, list):
            continue
        # A chapter holding only one verse is not delivered as an array.
        chapters[idx] = [chapter]

    print("Ref: %s sections: %s to sections: %s" % (ref, start, end))
    return {
        'book': book,
        'text': chapters,
        'sections': start,
        'searchBeginSections': start,
        'toSections': end,
    }
def get_biblical_text(ref):
    """Return the Hebrew text for ``ref`` as a uniformly shaped portion dict.

    The data comes from ``Helper.api_get_text``. The returned dict carries
    ``book``, ``text``, ``sections``, ``searchBeginSections`` and
    ``toSections``; ``text`` is always a list of chapters whose items are
    lists.
    """
    api_res = Helper.api_get_text(ref)
    book_name = api_res['book']
    text = api_res['he']
    first_sections = api_res['sections']
    last_sections = api_res['toSections']

    same_chapter = first_sections[0] == last_sections[0]
    if same_chapter:
        # For one chapter the api returns a list of strings, not a 2d
        # array; adjust to unify the structure.
        text = [text]

    position = 0
    for chapter in text:
        if not isinstance(chapter, list):
            # A chapter with only one verse is not an array; wrap it.
            text[position] = [chapter]
        position += 1

    print("Ref: %s sections: %s to sections: %s" %
          (ref, first_sections, last_sections))

    portion = dict(book=book_name, text=text, sections=first_sections)
    portion['searchBeginSections'] = first_sections
    portion['toSections'] = last_sections
    return portion
def do_book_comparison(book_index_xml):
    """Compare one book of the parsed Leningrad text with Sefaria's copy.

    Three things are checked:

    * the number of chapters listed in the index against the chapters
      actually present in ``preprocess_json/<name>.json`` and in the Sefaria
      "Tanach with Ta'amei Hamikra" Hebrew text;
    * for every chapter listed in the index, the listed verse count against
      the verses actually present in both texts;
    * the texts themselves, as an HTML diff of both after
      ``make_consonantal_text`` and ``flatten_text``.

    Count mismatches are appended to ``results/length_comparison.txt``; the
    diff is written to ``results/<name>_wlc_koren.html``.

    Args:
        book_index_xml: index element for one book. It is queried for
            ``./names/name``, ``./cs`` and for ``c`` children that carry an
            ``n`` attribute and a ``vs`` child.
    """
    canonical_name = book_index_xml.find("./names/name").text
    print(canonical_name)

    def verses_in(text, ch_num):
        # A chapter listed in the index can be missing from a text (that is
        # the very mismatch being reported), so count it as empty instead
        # of letting an IndexError abort the whole comparison.
        if 0 < ch_num <= len(text):
            return len(text[ch_num - 1])
        return 0

    # ``with`` guarantees both result files are closed even when loading,
    # fetching or diffing fails part-way through.
    with open("results/%s_wlc_koren.html" % canonical_name, "wb+") as diff_file, \
            open("results/length_comparison.txt", "ab+") as length_results:
        # Chapter count as listed in the Leningrad index.
        wlc_chapter_count = int(book_index_xml.find("./cs").text)
        with open("preprocess_json/%s.json" % canonical_name, "r") as filep:
            wlc_text = json.load(filep)["text"]
        # Chapter count physically present in the parsed Leningrad text.
        wlc_real_chapter_count = len(wlc_text)

        sefaria_book = Helper.getKnownTexts(canonical_name)
        sefaria_chapter_count = sefaria_book["lengths"][0]
        sefaria_text = Helper.api_get_text(
            "%s 1-%s" % (sefaria_book["title"], sefaria_chapter_count),
            "he",
            "Tanach with Ta'amei Hamikra",
        )["he"]
        if sefaria_chapter_count == 1:
            # Wrap a one-chapter book so it is indexed by chapter like the
            # multi-chapter case.
            sefaria_text = [sefaria_text]
        sefaria_real_chapter_count = len(sefaria_text)

        chapter_counts = [
            wlc_chapter_count,
            wlc_real_chapter_count,
            sefaria_chapter_count,
            sefaria_real_chapter_count,
        ]
        if not all_same(chapter_counts):
            length_results.write(
                "%s: Leningrad has %s chapters listed and %s chapters in text. Sefaria version has %s chapters listed and %s chapters in text\n"
                % (
                    canonical_name.encode("utf-8"),
                    wlc_chapter_count,
                    wlc_real_chapter_count,
                    sefaria_chapter_count,
                    sefaria_real_chapter_count,
                )
            )

        for chapter in book_index_xml.findall("c"):
            ch_num = int(chapter.get("n"))
            wlc_verse_count = int(chapter.find("vs").text)
            wlc_real_verse_count = verses_in(wlc_text, ch_num)
            sefaria_real_verse_count = verses_in(sefaria_text, ch_num)
            verse_counts = [
                wlc_verse_count,
                wlc_real_verse_count,
                sefaria_real_verse_count,
            ]
            if not all_same(verse_counts):
                # One report entry per line: the message must not contain
                # an embedded line break.
                length_results.write(
                    "%s:%s Leningrad has %s verses listed and %s verses in text. Sefaria version has %s verses\n"
                    % (
                        canonical_name.encode("utf-8"),
                        ch_num,
                        wlc_verse_count,
                        wlc_real_verse_count,
                        sefaria_real_verse_count,
                    )
                )

        html_diff = difflib.HtmlDiff().make_file(
            flatten_text(make_consonantal_text(wlc_text)),
            flatten_text(make_consonantal_text(sefaria_text)),
            "Leningrad Codex",
            "Sefaria/Koren",
        )
        # The generated page declares ISO-8859-1; the file is written as
        # UTF-8, so the declared charset has to match.
        html_diff = html_diff.replace("charset=ISO-8859-1", "charset=utf-8")
        diff_file.write(html_diff.encode("utf-8"))