Esempio n. 1
0
def do_book_comparison(book_index_xml):
    """Compare one book of the parsed Leningrad text with the Sefaria version.

    Three checks are performed:
      * the chapter counts agree (listed in the index, present in the parsed
        Leningrad JSON, listed by Sefaria, present in the Sefaria text);
      * for every chapter listed in the index, the verse counts agree;
      * an HTML side-by-side diff of the two consonantal texts is generated.

    Count mismatches are appended to ``results/length_comparison.txt`` and
    the diff is written to ``results/<canonical name>_wlc_koren.html``.

    Args:
        book_index_xml: index entry for one book. It must support ``find`` /
            ``findall`` and contain ``./names/name``, ``./cs`` and ``c``
            children that carry an ``n`` attribute and a ``vs`` child
            (presumably an ElementTree element -- confirm against the caller).

    Returns:
        None; the results are only written to the files named above.
    """
    canonical_name = book_index_xml.find('./names/name').text
    print(canonical_name)

    # Both outputs are opened in one ``with`` so they are closed even when the
    # JSON load, the API call or the diff raises; previously the handles were
    # only closed on the success path.
    with open("results/%s_wlc_koren.html" % canonical_name, 'wb+') as diff_file, \
            open("results/length_comparison.txt", 'ab+') as length_results:
        # listed length of the leningrad chapters
        wlc_chapter_count = int(book_index_xml.find('./cs').text)
        with open("preprocess_json/%s.json" % canonical_name, 'r') as filep:
            wlc_text = json.load(filep)['text']
        # physical length of the parsed leningrad chapters
        wlc_real_chapter_count = len(wlc_text)

        sefaria_book = Helper.getKnownTexts(canonical_name)
        sefaria_chapter_count = sefaria_book['lengths'][0]
        sefaria_text = Helper.api_get_text(
            "%s 1-%s" % (sefaria_book['title'], sefaria_chapter_count), 'he',
            "Tanach with Ta'amei Hamikra")['he']
        if sefaria_chapter_count == 1:
            # wrap a one-chapter book so it is indexed by chapter like the rest
            sefaria_text = [sefaria_text]
        sefaria_real_chapter_count = len(sefaria_text)

        if not all_same([
                wlc_chapter_count, wlc_real_chapter_count,
                sefaria_chapter_count, sefaria_real_chapter_count
        ]):
            ch_res_str = "%s: Leningrad has %s chapters listed and %s chapters in text. Sefaria version has %s chapters listed and %s chapters in text\n" % (
                canonical_name.encode('utf-8'), wlc_chapter_count,
                wlc_real_chapter_count, sefaria_chapter_count,
                sefaria_real_chapter_count)
            length_results.write(ch_res_str)

        for chapter in book_index_xml.findall('c'):
            ch_num = int(chapter.get('n'))
            ch_index = ch_num - 1
            wlc_verse_count = int(chapter.find('vs').text)
            # A chapter listed in the index may be missing from either text
            # (the very mismatch this report exists to surface). Count it as
            # 0 verses so it gets reported instead of raising IndexError.
            wlc_real_verse_count = (
                len(wlc_text[ch_index])
                if 0 <= ch_index < len(wlc_text) else 0)
            sefaria_real_verse_count = (
                len(sefaria_text[ch_index])
                if 0 <= ch_index < len(sefaria_text) else 0)
            if not all_same([
                    wlc_verse_count, wlc_real_verse_count,
                    sefaria_real_verse_count
            ]):
                v_res_str = "%s:%s Leningrad has %s verses listed and %s verses in text. Sefaria version has %s verses\n" % (
                    canonical_name.encode('utf-8'), ch_num, wlc_verse_count,
                    wlc_real_verse_count, sefaria_real_verse_count)
                length_results.write(v_res_str)

        html_diff = difflib.HtmlDiff().make_file(
            flatten_text(make_consonantal_text(wlc_text)),
            flatten_text(make_consonantal_text(sefaria_text)),
            'Leningrad Codex', 'Sefaria/Koren')
        # the generated page declares ISO-8859-1, but it is written as UTF-8
        html_diff = html_diff.replace('charset=ISO-8859-1', 'charset=utf-8')
        diff_file.write(html_diff.encode('utf-8'))
def get_biblical_text(ref):
    """Fetch ``ref`` through ``Helper.api_get_text`` and normalise its shape.

    The returned dict has the keys ``book``, ``text``, ``sections``,
    ``searchBeginSections`` (same value as ``sections``) and ``toSections``.
    ``text`` is taken from the response's ``he`` field and reshaped so that it
    is always a list of chapters, each of which is a list of verses.

    Args:
        ref: the reference passed unchanged to ``Helper.api_get_text``.

    Returns:
        The normalised ``portion`` dict described above.
    """
    api_res = Helper.api_get_text(ref)
    book = api_res['book']
    text = api_res['he']
    sections = api_res['sections']
    to_sections = api_res['toSections']

    # the api returns a list of strings, and not a 2d array if only one
    # chapter; adjust to unify structure
    if sections[0] == to_sections[0]:
        text = [text]

    # if a chapter only has one verse, it is not an array
    for i, chapter in enumerate(text):
        if not isinstance(chapter, list):
            text[i] = [chapter]

    portion = {
        'book': book,
        'text': text,
        'sections': sections,
        'searchBeginSections': sections,
        'toSections': to_sections,
    }

    print("Ref: %s sections: %s to sections: %s" % (ref, sections, to_sections))
    return portion
def get_biblical_text(ref):
    """Return the text for ``ref`` as a uniform chapter/verse structure.

    The response of ``Helper.api_get_text(ref)`` is copied into a dict with
    the keys ``book``, ``text`` (from the ``he`` field), ``sections``,
    ``searchBeginSections`` (a second reference to ``sections``) and
    ``toSections``. ``text`` is then adjusted to always be a list of
    chapters whose items are lists of verses.

    Args:
        ref: the reference handed to ``Helper.api_get_text`` as is.

    Returns:
        The assembled ``portion`` dict.
    """
    response = Helper.api_get_text(ref)
    portion = {
        'book': response['book'],
        'text': response['he'],
        'sections': response['sections'],
        'searchBeginSections': response['sections'],
        'toSections': response['toSections'],
    }

    first_chapter = portion['sections'][0]
    last_chapter = portion['toSections'][0]
    if first_chapter == last_chapter:
        # the api returns a list of strings, and not a 2d array if only one
        # chapter; adjust to unify structure
        portion['text'] = [portion['text']]

    chapters = portion['text']
    for index, chapter in enumerate(chapters):
        if isinstance(chapter, list):
            continue
        # if a chapter only has one verse, it is not an array
        chapters[index] = [chapter]

    summary = "Ref: %s sections: %s to sections: %s" % (
        ref, portion['sections'], portion['toSections'])
    print(summary)
    return portion
def do_book_comparison(book_index_xml):
    """Compare one book of the parsed Leningrad text with the Sefaria version.

    Three checks are performed:
      * the chapter counts agree (listed in the index, present in the parsed
        Leningrad JSON, listed by Sefaria, present in the Sefaria text);
      * for every chapter listed in the index, the verse counts agree;
      * an HTML side-by-side diff of the two consonantal texts is generated.

    Count mismatches are appended to ``results/length_comparison.txt`` and
    the diff is written to ``results/<canonical name>_wlc_koren.html``.

    Args:
        book_index_xml: index entry for one book. It must support ``find`` /
            ``findall`` and contain ``./names/name``, ``./cs`` and ``c``
            children that carry an ``n`` attribute and a ``vs`` child
            (presumably an ElementTree element -- confirm against the caller).

    Returns:
        None; the results are only written to the files named above.
    """
    canonical_name = book_index_xml.find("./names/name").text
    print(canonical_name)

    # Both outputs are opened in one ``with`` so they are closed even when the
    # JSON load, the API call or the diff raises; previously the handles were
    # only closed on the success path.
    with open("results/%s_wlc_koren.html" % canonical_name, "wb+") as diff_file, \
            open("results/length_comparison.txt", "ab+") as length_results:
        wlc_chapter_count = int(book_index_xml.find("./cs").text)  # listed length of the leningrad chapters
        with open("preprocess_json/%s.json" % canonical_name, "r") as filep:
            wlc_text = json.load(filep)["text"]
        wlc_real_chapter_count = len(wlc_text)  # physical length of the parsed leningrad chapters

        sefaria_book = Helper.getKnownTexts(canonical_name)
        sefaria_chapter_count = sefaria_book["lengths"][0]
        sefaria_text = Helper.api_get_text(
            "%s 1-%s" % (sefaria_book["title"], sefaria_chapter_count), "he", "Tanach with Ta'amei Hamikra"
        )["he"]
        if sefaria_chapter_count == 1:
            # wrap a one-chapter book so it is indexed by chapter like the rest
            sefaria_text = [sefaria_text]
        sefaria_real_chapter_count = len(sefaria_text)

        chapter_counts = [
            wlc_chapter_count,
            wlc_real_chapter_count,
            sefaria_chapter_count,
            sefaria_real_chapter_count,
        ]
        if not all_same(chapter_counts):
            ch_res_str = (
                "%s: Leningrad has %s chapters listed and %s chapters in text. Sefaria version has %s chapters listed and %s chapters in text\n"
                % (
                    canonical_name.encode("utf-8"),
                    wlc_chapter_count,
                    wlc_real_chapter_count,
                    sefaria_chapter_count,
                    sefaria_real_chapter_count,
                )
            )
            length_results.write(ch_res_str)

        for chapter in book_index_xml.findall("c"):
            ch_num = int(chapter.get("n"))
            ch_index = ch_num - 1
            wlc_verse_count = int(chapter.find("vs").text)
            # A chapter listed in the index may be missing from either text
            # (the very mismatch this report exists to surface). Count it as
            # 0 verses so it gets reported instead of raising IndexError.
            wlc_real_verse_count = len(wlc_text[ch_index]) if 0 <= ch_index < len(wlc_text) else 0
            sefaria_real_verse_count = len(sefaria_text[ch_index]) if 0 <= ch_index < len(sefaria_text) else 0
            if not all_same([wlc_verse_count, wlc_real_verse_count, sefaria_real_verse_count]):
                v_res_str = (
                    "%s:%s Leningrad has %s verses listed and %s verses in text. Sefaria version has %s verses\n"
                    % (
                        canonical_name.encode("utf-8"),
                        ch_num,
                        wlc_verse_count,
                        wlc_real_verse_count,
                        sefaria_real_verse_count,
                    )
                )
                length_results.write(v_res_str)

        html_diff = difflib.HtmlDiff().make_file(
            flatten_text(make_consonantal_text(wlc_text)),
            flatten_text(make_consonantal_text(sefaria_text)),
            "Leningrad Codex",
            "Sefaria/Koren",
        )
        # the generated page declares ISO-8859-1, but it is written as UTF-8
        html_diff = html_diff.replace("charset=ISO-8859-1", "charset=utf-8")
        diff_file.write(html_diff.encode("utf-8"))