def do_book_comparison(book_index_xml):
    """Compare one book of the parsed Leningrad Codex (WLC) with Sefaria's text.

    Three checks are made:

    * the chapter count listed in the WLC index is compared with the number
      of chapters in the parsed WLC JSON, with the chapter count Sefaria
      lists for the book, and with the number of chapters actually returned
      by Sefaria;
    * for every ``c`` element of the index, the listed verse count is
      compared with the verse counts of the parsed WLC text and of the
      Sefaria text;
    * an HTML diff of the two texts is produced after both have been passed
      through ``make_consonantal_text`` and ``flatten_text``.

    Count mismatches are appended to ``results/length_comparison.txt`` and
    the diff is written to ``results/<book name>_wlc_koren.html``.

    Args:
        book_index_xml: the index element for one book. It must provide
            ``./names/name``, ``./cs`` and ``c`` children that carry an
            ``n`` attribute and a ``vs`` child.

    Both output files are managed by ``with`` so that they are closed even
    when the comparison fails part-way through (for instance an
    ``IndexError`` when the index names a chapter one of the texts lacks).
    """
    canonical_name = book_index_xml.find('./names/name').text
    # Parenthesised single-argument form; prints the same under Python 2.
    print(canonical_name)

    with open("results/%s_wlc_koren.html" % canonical_name, 'wb+') as diff_file, \
            open("results/length_comparison.txt", 'ab+') as length_results:
        # Chapter count as listed in the WLC index.
        wlc_chapter_count = int(book_index_xml.find('./cs').text)
        with open("preprocess_json/%s.json" % canonical_name, 'r') as filep:
            wlc_text = json.load(filep)['text']
        # Chapter count actually present in the parsed WLC text.
        wlc_real_chapter_count = len(wlc_text)

        sefaria_book = Helper.getKnownTexts(canonical_name)
        sefaria_chapter_count = sefaria_book['lengths'][0]
        sefaria_text = Helper.api_get_text(
            "%s 1-%s" % (sefaria_book['title'], sefaria_chapter_count),
            'he', "Tanach with Ta'amei Hamikra")['he']
        # Wrap a one-chapter result so it can be indexed by chapter like the
        # multi-chapter case (presumably the API returns a flat verse list
        # here -- confirm against Helper.api_get_text).
        if sefaria_chapter_count == 1:
            sefaria_text = [sefaria_text]
        sefaria_real_chapter_count = len(sefaria_text)

        chapter_counts = [
            wlc_chapter_count,
            wlc_real_chapter_count,
            sefaria_chapter_count,
            sefaria_real_chapter_count,
        ]
        if not all_same(chapter_counts):
            ch_res_str = "%s: Leningrad has %s chapters listed and %s chapters in text. Sefaria version has %s chapters listed and %s chapters in text\n" % (
                canonical_name.encode('utf-8'), wlc_chapter_count,
                wlc_real_chapter_count, sefaria_chapter_count,
                sefaria_real_chapter_count)
            length_results.write(ch_res_str)

        for chapter in book_index_xml.findall('c'):
            ch_num = int(chapter.get('n'))
            wlc_verse_count = int(chapter.find('vs').text)
            # Chapter numbers in the index are 1-based; the texts are lists.
            wlc_real_verse_count = len(wlc_text[ch_num - 1])
            sefaria_real_verse_count = len(sefaria_text[ch_num - 1])
            verse_counts = [
                wlc_verse_count,
                wlc_real_verse_count,
                sefaria_real_verse_count,
            ]
            if not all_same(verse_counts):
                v_res_str = "%s:%s Leningrad has %s verses listed and %s verses in text. Sefaria version has %s verses\n" % (
                    canonical_name.encode('utf-8'), ch_num, wlc_verse_count,
                    wlc_real_verse_count, sefaria_real_verse_count)
                length_results.write(v_res_str)

        html_diff = difflib.HtmlDiff().make_file(
            flatten_text(make_consonantal_text(wlc_text)),
            flatten_text(make_consonantal_text(sefaria_text)),
            'Leningrad Codex', 'Sefaria/Koren')
        # The generated page declares ISO-8859-1; the bytes written below
        # are UTF-8, so the declared charset is rewritten to match.
        html_diff = html_diff.replace('charset=ISO-8859-1', 'charset=utf-8')
        diff_file.write(html_diff.encode('utf-8'))
def run_post_to_api(sub_directory=None):
    """Read every book's preprocessed JSON file and pass it to Helper.postText.

    Args:
        sub_directory: optional folder name under ``preprocess_json`` to
            read the JSON files from; when falsy, ``preprocess_json``
            itself is used.

    The list of books comes from ``source/TanachIndex.xml``. For each
    ``book`` element the raw file contents are handed to
    ``Helper.postText`` together with the title reported by
    ``Helper.getKnownTexts`` (presumably uploading the text to the Sefaria
    API; the meaning of the trailing ``False`` flag is defined by Helper).
    """
    if sub_directory:
        json_dir = "preprocess_json/%s" % sub_directory
    else:
        json_dir = "preprocess_json"

    # This index lists the number of chapters and verses for all WLC books.
    index_root = ET.parse("source/TanachIndex.xml").getroot()
    tanach_node = index_root.find("tanach")

    for book_node in tanach_node.findall("book"):
        book_name = book_node.find("./names/name").text
        # Parenthesised single-argument form; prints the same under Python 2.
        print(book_name)

        json_path = "%s/%s.json" % (json_dir, book_name)
        with open(json_path, "r") as json_file:
            payload = json_file.read()

        known_text = Helper.getKnownTexts(book_name)
        Helper.postText(known_text["title"], payload, False)
def run_post_to_api(sub_directory=None):
    """Read every book's preprocessed JSON file and pass it to Helper.postText.

    Args:
        sub_directory: optional folder name under ``preprocess_json`` to
            read the JSON files from; when falsy, ``preprocess_json``
            itself is used.

    The list of books comes from ``source/TanachIndex.xml``. For each
    ``book`` element the raw file contents are handed to
    ``Helper.postText`` together with the title reported by
    ``Helper.getKnownTexts`` (presumably uploading the text to the Sefaria
    API; the meaning of the trailing ``False`` flag is defined by Helper).
    """
    if sub_directory:
        json_dir = "preprocess_json/%s" % sub_directory
    else:
        json_dir = "preprocess_json"

    # This index lists the number of chapters and verses for all WLC books.
    index_root = ET.parse('source/TanachIndex.xml').getroot()
    tanach_node = index_root.find('tanach')

    for book_node in tanach_node.findall('book'):
        book_name = book_node.find('./names/name').text
        # Parenthesised single-argument form; prints the same under Python 2.
        print(book_name)

        json_path = "%s/%s.json" % (json_dir, book_name)
        with open(json_path, 'r') as json_file:
            payload = json_file.read()

        known_text = Helper.getKnownTexts(book_name)
        Helper.postText(known_text['title'], payload, False)
def do_book_comparison(book_index_xml):
    """Compare one book of the parsed Leningrad Codex (WLC) with Sefaria's text.

    Three checks are made:

    * the chapter count listed in the WLC index is compared with the number
      of chapters in the parsed WLC JSON, with the chapter count Sefaria
      lists for the book, and with the number of chapters actually returned
      by Sefaria;
    * for every ``c`` element of the index, the listed verse count is
      compared with the verse counts of the parsed WLC text and of the
      Sefaria text;
    * an HTML diff of the two texts is produced after both have been passed
      through ``make_consonantal_text`` and ``flatten_text``.

    Count mismatches are appended to ``results/length_comparison.txt`` and
    the diff is written to ``results/<book name>_wlc_koren.html``.

    Args:
        book_index_xml: the index element for one book. It must provide
            ``./names/name``, ``./cs`` and ``c`` children that carry an
            ``n`` attribute and a ``vs`` child.

    Both output files are managed by ``with`` so that they are closed even
    when the comparison fails part-way through (for instance an
    ``IndexError`` when the index names a chapter one of the texts lacks).
    """
    canonical_name = book_index_xml.find("./names/name").text
    # Parenthesised single-argument form; prints the same under Python 2.
    print(canonical_name)

    with open("results/%s_wlc_koren.html" % canonical_name, "wb+") as diff_file, \
            open("results/length_comparison.txt", "ab+") as length_results:
        # Chapter count as listed in the WLC index.
        wlc_chapter_count = int(book_index_xml.find("./cs").text)
        with open("preprocess_json/%s.json" % canonical_name, "r") as filep:
            wlc_text = json.load(filep)["text"]
        # Chapter count actually present in the parsed WLC text.
        wlc_real_chapter_count = len(wlc_text)

        sefaria_book = Helper.getKnownTexts(canonical_name)
        sefaria_chapter_count = sefaria_book["lengths"][0]
        sefaria_text = Helper.api_get_text(
            "%s 1-%s" % (sefaria_book["title"], sefaria_chapter_count),
            "he",
            "Tanach with Ta'amei Hamikra",
        )["he"]
        # Wrap a one-chapter result so it can be indexed by chapter like the
        # multi-chapter case (presumably the API returns a flat verse list
        # here -- confirm against Helper.api_get_text).
        if sefaria_chapter_count == 1:
            sefaria_text = [sefaria_text]
        sefaria_real_chapter_count = len(sefaria_text)

        chapter_counts = [
            wlc_chapter_count,
            wlc_real_chapter_count,
            sefaria_chapter_count,
            sefaria_real_chapter_count,
        ]
        if not all_same(chapter_counts):
            ch_res_str = (
                "%s: Leningrad has %s chapters listed and %s chapters in text. Sefaria version has %s chapters listed and %s chapters in text\n"
                % (
                    canonical_name.encode("utf-8"),
                    wlc_chapter_count,
                    wlc_real_chapter_count,
                    sefaria_chapter_count,
                    sefaria_real_chapter_count,
                )
            )
            length_results.write(ch_res_str)

        for chapter in book_index_xml.findall("c"):
            ch_num = int(chapter.get("n"))
            wlc_verse_count = int(chapter.find("vs").text)
            # Chapter numbers in the index are 1-based; the texts are lists.
            wlc_real_verse_count = len(wlc_text[ch_num - 1])
            sefaria_real_verse_count = len(sefaria_text[ch_num - 1])
            verse_counts = [
                wlc_verse_count,
                wlc_real_verse_count,
                sefaria_real_verse_count,
            ]
            if not all_same(verse_counts):
                v_res_str = (
                    "%s:%s Leningrad has %s verses listed and %s verses in text. Sefaria version has %s verses\n"
                    % (
                        canonical_name.encode("utf-8"),
                        ch_num,
                        wlc_verse_count,
                        wlc_real_verse_count,
                        sefaria_real_verse_count,
                    )
                )
                length_results.write(v_res_str)

        html_diff = difflib.HtmlDiff().make_file(
            flatten_text(make_consonantal_text(wlc_text)),
            flatten_text(make_consonantal_text(sefaria_text)),
            "Leningrad Codex",
            "Sefaria/Koren",
        )
        # The generated page declares ISO-8859-1; the bytes written below
        # are UTF-8, so the declared charset is rewritten to match.
        html_diff = html_diff.replace("charset=ISO-8859-1", "charset=utf-8")
        diff_file.write(html_diff.encode("utf-8"))