コード例 #1
0
ファイル: JPS1985.py プロジェクト: JonMosenkis/Sefaria-Data
def count_foot(books, regexes):
    """
    Count the chapters in which the given regular expressions do not all match the same
    number of times.

    Each chapter is flattened into one string (line breaks removed, verses joined with a
    space) so that a footnote spanning several lines or verses can still be matched. Every
    regex is counted against that string. When any count differs from the count of the first
    regex, the chapter is printed as "<book>: <chapter number>; <index of the first
    disagreeing regex>" and tallied once.

    :param books: dictionary containing entire jps 1985 translation, indexed as
    books[<book_name>][chapter_index][verse_index]
    :param regexes: list of regular expressions to compare; an empty list yields 0
    :return: number of chapters in which the regexes disagree on their match count
    """

    # with nothing to compare no chapter can disagree (and counts[0] below would not exist)
    if not regexes:
        return 0

    all_books = library.get_indexes_in_category('Tanach')
    total = 0

    # define replacement dictionary
    replacements = {
        u'\n': u'',
        u'\r': u'',
        u'\n\r': u'',
    }

    # loop through books, and combine each chapter into one string
    for book in all_books:
        for num, chapter in enumerate(books[book]):

            # make chapter into single string (to account for footnotes spanning multiple lines and verses)
            text = ' '.join([functions.multiple_replace(verse, replacements) for verse in chapter])
            counts = [len(re.findall(reg, text)) for reg in regexes]

            # the first regex is the reference every other regex is compared against
            baseline = counts[0]
            for index, count in enumerate(counts):
                if count != baseline:
                    # parenthesised single argument prints identically on Python 2 and 3
                    print('{}: {}; {}'.format(book, num + 1, index))
                    total += 1
                    break

    return total
コード例 #2
0
ファイル: JPS1985.py プロジェクト: smontagu/Sefaria-Data
def count_foot(books, regexes):
    """
    Count the chapters in which the given regular expressions do not all match the same
    number of times.

    Each chapter is flattened into one string (line breaks removed, verses joined with a
    space) so that a footnote spanning several lines or verses can still be matched. Every
    regex is counted against that string. When any count differs from the count of the first
    regex, the chapter is printed as "<book>: <chapter number>; <index of the first
    disagreeing regex>" and tallied once.

    :param books: dictionary containing entire jps 1985 translation, indexed as
    books[<book_name>][chapter_index][verse_index]
    :param regexes: list of regular expressions to compare; an empty list yields 0
    :return: number of chapters in which the regexes disagree on their match count
    """

    # with nothing to compare no chapter can disagree (and counts[0] below would not exist)
    if not regexes:
        return 0

    all_books = library.get_indexes_in_category('Tanach')
    total = 0

    # define replacement dictionary
    replacements = {
        u'\n': u'',
        u'\r': u'',
        u'\n\r': u'',
    }

    # loop through books, and combine each chapter into one string
    for book in all_books:
        for num, chapter in enumerate(books[book]):

            # make chapter into single string (to account for footnotes spanning multiple lines and verses)
            text = ' '.join([functions.multiple_replace(verse, replacements) for verse in chapter])
            counts = [len(re.findall(reg, text)) for reg in regexes]

            # the first regex is the reference every other regex is compared against
            baseline = counts[0]
            for index, count in enumerate(counts):
                if count != baseline:
                    # parenthesised single argument prints identically on Python 2 and 3
                    print('{}: {}; {}'.format(book, num + 1, index))
                    total += 1
                    break

    return total
コード例 #3
0
ファイル: JPS1985.py プロジェクト: JonMosenkis/Sefaria-Data
def footnote_linker(jps, jps_footnotes):
    """
    Create a list of link objects with the anchorText field set to the corresponding words in the verse.
    Once the corresponding text has been found, edit the jps text so as to replace the footnote mark with
    an <i> tag. Trailing footnote markers at the end of an enclosed text fragment will be scrubbed.

    Three marker layouts are handled for every verse a footnote links to:
      * "[x]-some words-[x]" inside one verse: the enclosed words become the anchor.
      * "[x]-" in this verse and "-[x]" in the following verse: the text between them (the two
        verses joined by a space) becomes the anchor and both verses are edited.
      * a bare "[x]": the word preceding the marker becomes the anchor (empty if there is none).

    Side effects: verses in jps are rewritten in place, the leading tag word is stripped from
    every note['footnote'], and 'footnote_errors.txt' is (re)written with one entry per linked
    verse in which the marker could not be found. Verses that cannot be looked up at all are
    printed as "<book>,<chapter>,<verse>" and skipped.

    :param jps: jps Tanach data structure - jps[<book_name>][chapter_index][verse_index]
    :param jps_footnotes: jps footnotes data struct -
    footnotes[<book_name>][chap_index][footnote]. Note that this returns a dictionary with the
    keys [footnote], which gives the footnote text, and [links], which give the verses to which the
    footnote needs to link to.
    :return: A list of link objects. This function will edit jps to replace inline footnote markers
    with <i> tags, as well as remove the beginning footnote characters used to match the note to printed
    text.
    """

    # get books of Tanach
    books = library.get_indexes_in_category('Tanach')

    # declare link array
    links = []

    # open error file; try/finally makes sure it is closed even if linking fails part-way
    errors = codecs.open('footnote_errors.txt', 'w', 'utf-8')
    try:
        # iterate through jps_footnotes
        for book in books:
            for chap_num, chapter in enumerate(jps_footnotes[book]):
                for index, note in enumerate(chapter):

                    # get tag to identify footnote in main text. Account for case where tag is 'aa'.
                    if note['footnote'].startswith(u'aa'):
                        tag = u'aa'
                    else:
                        tag = u'{}'.format(note['footnote'][0])

                    # first word of footnote is the tag. That data has been saved - now strip from the footnote
                    note['footnote'] = u' '.join(note['footnote'].split()[1:])

                    # literal markers as they appear in the main text
                    marker = u'[{}]'.format(tag)
                    open_marker = u'[{}]-'.format(tag)
                    close_marker = u'-[{}]'.format(tag)

                    # compile regexes to account for enclosed text; the markers are escaped so
                    # that they are always matched literally
                    open_tag_reg = re.compile(re.escape(open_marker))
                    enclosed_reg = re.compile(re.escape(open_marker) + u'.*?' + re.escape(close_marker))

                    # <i> tag to replace footnote marker in text ('a' -> 1 ... 'z' -> 26, 'aa' -> 27)
                    if tag == u'aa':
                        data_order = 27
                    else:
                        data_order = ord(tag) - 96
                    itag = u'<i data-commentator="JPS 1985 Footnotes" data-order="{}"></i>'.format(data_order)

                    # iterate over the links
                    for link in note['links']:

                        # set flag for default footnote behaviour
                        default = True

                        # get verse in main text
                        try:
                            verses = jps[book][chap_num]
                            verse = verses[link - 1]
                        except IndexError:
                            print(u'{},{},{}'.format(book, chap_num + 1, link))
                            continue

                        # declare anchor for text
                        anchor = u''

                        enclosed = enclosed_reg.search(verse)

                        # check for enclosed comment
                        if enclosed:
                            # catch enclosed text (the match minus its opening and closing markers)
                            anchor = enclosed.group()[len(open_marker):-len(close_marker)]

                            # replace leading footnote tags with <i> tag, strip out trailing tag
                            replace = {open_marker: itag, close_marker: u''}
                            verses[link - 1] = functions.multiple_replace(verse, replace)

                            # sanity check - scrub out any remaining footnote tags
                            verses[link - 1] = verses[link - 1].replace(marker, u'')

                            default = False

                        # opening marker without a closing one: check if the anomaly can be resolved
                        # by looking at the next verse. When this is the last verse of the chapter
                        # there is no next verse, so fall through to the default handling.
                        elif open_tag_reg.search(verse) and link < len(verses):

                            next_verse = verses[link]
                            combined = u' '.join([verse, next_verse])
                            enclosed = enclosed_reg.search(combined)

                            if enclosed:
                                anchor = enclosed.group()[len(open_marker):-len(close_marker)]
                                verses[link - 1] = verse.replace(open_marker, itag)
                                verses[link] = next_verse.replace(close_marker, u'')
                                verses[link - 1] = verses[link - 1].replace(marker, u'')

                                default = False

                        if default:

                            # sanity check, make sure tag is in verse
                            tag_position = verse.find(marker)
                            if tag_position == -1:
                                errors.write(u'tag not found\n')
                                errors.write(u'{}, {}, {}, {}\n'.format(book, chap_num + 1, link, note['footnote']))
                                continue

                            # replace footnote tag in main text with the <i> tag
                            verses[link - 1] = verse.replace(marker, itag)

                            # get preceding word
                            words = verse[:tag_position].split()
                            if words:
                                anchor = words[-1]

                        # create link object
                        links.append({
                            'refs': [u'{}.{}.{}'.format(book, chap_num + 1, link),
                                     u'JPS 1985 Footnotes, {}.{}.{}'.format(book, chap_num + 1, index + 1)],
                            'type': 'commentary',
                            'auto': True,
                            'generated_by': 'JPS parse script',
                            'anchorText': anchor,
                        })
    finally:
        errors.close()

    return links
コード例 #4
0
ファイル: JPS1985.py プロジェクト: JonMosenkis/Sefaria-Data
def align_footnotes(books):
    """
    The footnotes need to be structured by book and chapter. Each footnote may refer to multiple verses.

    Footnotes are read sequentially from 'JPS1985_footnotes.txt', one per line. The first
    character(s) of a footnote are its tag ("a", "b", ... or "aa"); a verse refers to the
    footnote when it contains "[<tag>]". A footnote tagged "a" (but not "aa") is taken to open
    the next chapter's run of footnotes. Once a chapter has produced its first match, every
    following footnote is attached to that chapter until the next "a" footnote (or the end of
    the footnote file), even when its marker is not found - such a footnote gets an empty
    "links" list.

    The process exits with status 1, after printing the offending footnote and its
    "<book>, chapter <n>" position, when the footnote stream cannot be aligned:
      * the pending footnote is empty or consists of the single character "a";
      * no marker has been found in the chapter and the pending footnote does not open a new
        chapter (nothing could ever change, so the scan would repeat forever).

    :param books: Dictionary, containing the entire JPS 1985 translation
    return: Dictionary, with books as keys and chapters as values. Each chapter is a list of dictionaries,
    with the key "footnote" set to the footnote and the key "links" being a list of verses where the
    footnote appears. The per-book value is produced by functions.convertDictToArray from a
    dictionary keyed by chapter index.
    """

    def abort(bad_note, book, chap_num):
        # report where alignment broke down, then stop; the caller's finally closes the file
        print('error')
        print(bad_note['footnote'])
        print(u'{}, chapter {}'.format(book, chap_num + 1))
        sys.exit(1)

    jps_footnotes = {}

    # define replacement dictionary
    replacements = {
        u'@': u'\u1e63',
        u'h%': u'\u1e25',
        u'H%': u'\u1e24',
        u'\n': u'',
        u'\r': u'',
        u'\n\r': u'',
    }

    # get list of book in tanach
    all_books = library.get_indexes_in_category('Tanach')

    # open footnote document; try/finally closes it on every exit path, including sys.exit
    input_file = codecs.open('JPS1985_footnotes.txt', 'r', 'utf-8')
    try:
        # retrieve first footnote
        note = {'footnote': functions.multiple_replace(input_file.readline(), replacements)}

        # iterate through Tanach
        for book in all_books:

            # dictionary with keys set to chapter indexes
            footnote_chaps = {}

            for chap_num, chapter in enumerate(books[book]):

                # set flag to indicate if any footnote markers have been found in this chapter
                found_note = False

                # declare array to hold all notes in chapter
                chap_notes = []

                # repeatedly loop through chapter, searching for cases where footnote appears
                while True:

                    text = note['footnote']

                    # an empty footnote has no tag to look for (previously an uncaught IndexError)
                    if not text:
                        abort(note, book, chap_num)

                    # account for the case where a footnote is tagged "aa"
                    if text.startswith(u'aa'):
                        tag = u'aa'
                    else:
                        tag = text[0]

                    # 1-based numbers of the verses carrying this footnote's marker
                    marker = u'[{}]'.format(tag)
                    found = [verse_num + 1 for verse_num, verse in enumerate(chapter) if marker in verse]
                    note['links'] = found
                    if found:
                        found_note = True

                    # if footnote markers were found, get the next footnote
                    if found_note:
                        chap_notes.append(note)
                        note = {'footnote': functions.multiple_replace(input_file.readline(), replacements)}

                        # nothing left to read: this chapter is complete
                        if note['footnote'] == u'':
                            footnote_chaps[chap_num] = chap_notes
                            break

                    text = note['footnote']

                    # a footnote that is only the tag "a" cannot be classified
                    if text == u'a':
                        abort(note, book, chap_num)

                    # if footnote begins with "a" (and not "aa"), this is a new chapter
                    if text.startswith(u'a') and not text.startswith(u'aa'):
                        footnote_chaps[chap_num] = chap_notes
                        break

                    # nothing was found and the footnote does not open a new chapter: another
                    # pass would see exactly the same state, so stop instead of looping forever
                    if not found_note:
                        abort(note, book, chap_num)

            jps_footnotes[book] = functions.convertDictToArray(footnote_chaps)
    finally:
        input_file.close()

    return jps_footnotes
コード例 #5
0
ファイル: JPS1985.py プロジェクト: JonMosenkis/Sefaria-Data
def parse():
    """
    Parse 'JPSTanakhMaster.txt' into a dictionary of books.

    The file is read line by line. Lines that equal their own upper-case form (parsha names,
    according to the original author) are skipped. A line starting with one to three digits
    followed by whitespace opens a new chapter; when that number is 1, the line before it is
    taken as the name of a new book. The text gathered for a chapter is handed to
    process_verses together with the verse-number regex and the result is checked by
    check_verses (both defined elsewhere in this file). Footnote markers are left in the text.

    Side effects: writes the parsed books to 'output.txt' through write_to_file, writes one
    "Book: <name> Chapter: <number>" line per recorded problem to 'errors.txt', and prints the
    number of recorded problems.

    :return: dictionary mapping each book name to its list of chapters. A chapter is whatever
    process_verses returned for it; a [u'error'] placeholder is inserted where the chapter
    numbers do not increase by exactly one.
    """

    # declare variables
    books, chapters, verses = {}, [], []
    previous_line = u''
    book_name = u''
    errors = []

    # regular expressions
    chapter_reg = re.compile(u'\d{1,3}\s')
    verse_reg = re.compile(u'\d{1,3}[a-zA-Z\-"“\[‘(—]')

    # define replacement dictionary
    replacements = {
            u'H%': u'\u1e24',
            u'h%': u'\u1e25',
            u'\n': u'',
            u'\r': u'',
    }

    # loop through file; the with-block closes it even if parsing raises
    with codecs.open('JPSTanakhMaster.txt', 'r', 'utf-8') as input_file:
        for line in input_file:

            # if this line is a parsha name - do nothing
            if line == line.upper():
                continue

            # make necessary replacements
            line = functions.multiple_replace(line, replacements)

            # check if line is beginning of new chapter
            new_chap = chapter_reg.match(line)
            if not new_chap:
                # ordinary line: add previous line to verses, and save current line
                verses.append(previous_line)
                previous_line = line
                continue

            # get chapter num (int() tolerates the trailing whitespace the regex captures)
            chap_number = int(new_chap.group())
            if chap_number == 1:

                # chapter 1 means a new book: save previous book, if there was one
                if book_name != u'':
                    chapters.append(process_verses(u''.join(verses), verse_reg))
                    check_verses(chapters[-1], (book_name.replace(u'\r', u';'), len(chapters)), errors)
                    books[book_name] = chapters
                    chapters = []
                    verses = []

                # the line preceding chapter 1 is the name of the new book
                book_name = previous_line

            else:
                # close the chapter that just ended
                verses.append(previous_line)
                chapters.append(process_verses(u''.join(verses), verse_reg))
                check_verses(chapters[-1], (book_name.replace(u'\r', u';'), chap_number - 1), errors)
                verses = []

            # check that chapters are incrementing correctly
            if chap_number - len(chapters) != 1:
                errors.append((book_name.replace(u'\r', u';'), chap_number - 1))
                chapters.append([u'error'])

            # copy line into previous_line placeholder, excluding the chapter number itself
            previous_line = line[new_chap.end():]

    # add last book
    chapters.append(process_verses(u''.join(verses), verse_reg))
    check_verses(chapters[-1], (book_name.replace(u'\r', u';'), len(chapters)), errors)
    books[book_name] = chapters

    with codecs.open('output.txt', 'w', 'utf-8') as out:
        write_to_file(books, out)

    # write errors; a unicode template keeps non-ASCII book names from raising on Python 2
    with codecs.open('errors.txt', 'w', 'utf-8') as out:
        for error in errors:
            out.write(u'Book: {0} Chapter: {1}\n'.format(*error))
    print(len(errors))

    return books
コード例 #6
0
ファイル: JPS1985.py プロジェクト: smontagu/Sefaria-Data
def footnote_linker(jps, jps_footnotes):
    """
    Create a list of link objects with the anchorText field set to the corresponding words in the verse.
    Once the corresponding text has been found, edit the jps text so as to replace the footnote mark with
    an <i> tag. Trailing footnote markers at the end of an enclosed text fragment will be scrubbed.

    Three marker layouts are handled for every verse a footnote links to:
      * "[x]-some words-[x]" inside one verse: the enclosed words become the anchor.
      * "[x]-" in this verse and "-[x]" in the following verse: the text between them (the two
        verses joined by a space) becomes the anchor and both verses are edited.
      * a bare "[x]": the word preceding the marker becomes the anchor (empty if there is none).

    Side effects: verses in jps are rewritten in place, the leading tag word is stripped from
    every note['footnote'], and 'footnote_errors.txt' is (re)written with one entry per linked
    verse in which the marker could not be found. Verses that cannot be looked up at all are
    printed as "<book>,<chapter>,<verse>" and skipped.

    :param jps: jps Tanach data structure - jps[<book_name>][chapter_index][verse_index]
    :param jps_footnotes: jps footnotes data struct -
    footnotes[<book_name>][chap_index][footnote]. Note that this returns a dictionary with the
    keys [footnote], which gives the footnote text, and [links], which give the verses to which the
    footnote needs to link to.
    :return: A list of link objects. This function will edit jps to replace inline footnote markers
    with <i> tags, as well as remove the beginning footnote characters used to match the note to printed
    text.
    """

    # get books of Tanach
    books = library.get_indexes_in_category('Tanach')

    # declare link array
    links = []

    # open error file; try/finally makes sure it is closed even if linking fails part-way
    errors = codecs.open('footnote_errors.txt', 'w', 'utf-8')
    try:
        # iterate through jps_footnotes
        for book in books:
            for chap_num, chapter in enumerate(jps_footnotes[book]):
                for index, note in enumerate(chapter):

                    # get tag to identify footnote in main text. Account for case where tag is 'aa'.
                    if note['footnote'].startswith(u'aa'):
                        tag = u'aa'
                    else:
                        tag = u'{}'.format(note['footnote'][0])

                    # first word of footnote is the tag. That data has been saved - now strip from the footnote
                    note['footnote'] = u' '.join(note['footnote'].split()[1:])

                    # literal markers as they appear in the main text
                    marker = u'[{}]'.format(tag)
                    open_marker = u'[{}]-'.format(tag)
                    close_marker = u'-[{}]'.format(tag)

                    # compile regexes to account for enclosed text; the markers are escaped so
                    # that they are always matched literally
                    open_tag_reg = re.compile(re.escape(open_marker))
                    enclosed_reg = re.compile(
                        re.escape(open_marker) + u'.*?' + re.escape(close_marker))

                    # <i> tag to replace footnote marker in text ('a' -> 1 ... 'z' -> 26, 'aa' -> 27)
                    if tag == u'aa':
                        data_order = 27
                    else:
                        data_order = ord(tag) - 96
                    itag = u'<i data-commentator="JPS 1985 Footnotes" data-order="{}"></i>'.format(
                        data_order)

                    # iterate over the links
                    for link in note['links']:

                        # set flag for default footnote behaviour
                        default = True

                        # get verse in main text
                        try:
                            verses = jps[book][chap_num]
                            verse = verses[link - 1]
                        except IndexError:
                            print(u'{},{},{}'.format(book, chap_num + 1, link))
                            continue

                        # declare anchor for text
                        anchor = u''

                        enclosed = enclosed_reg.search(verse)

                        # check for enclosed comment
                        if enclosed:
                            # catch enclosed text (the match minus its opening and closing markers)
                            anchor = enclosed.group()[len(open_marker):-len(close_marker)]

                            # replace leading footnote tags with <i> tag, strip out trailing tag
                            replace = {
                                open_marker: itag,
                                close_marker: u'',
                            }
                            verses[link - 1] = functions.multiple_replace(verse, replace)

                            # sanity check - scrub out any remaining footnote tags
                            verses[link - 1] = verses[link - 1].replace(marker, u'')

                            default = False

                        # opening marker without a closing one: check if the anomaly can be resolved
                        # by looking at the next verse. When this is the last verse of the chapter
                        # there is no next verse, so fall through to the default handling.
                        elif open_tag_reg.search(verse) and link < len(verses):

                            next_verse = verses[link]
                            combined = u' '.join([verse, next_verse])
                            enclosed = enclosed_reg.search(combined)

                            if enclosed:
                                anchor = enclosed.group()[len(open_marker):-len(close_marker)]
                                verses[link - 1] = verse.replace(open_marker, itag)
                                verses[link] = next_verse.replace(close_marker, u'')
                                verses[link - 1] = verses[link - 1].replace(marker, u'')

                                default = False

                        if default:

                            # sanity check, make sure tag is in verse
                            tag_position = verse.find(marker)
                            if tag_position == -1:
                                errors.write(u'tag not found\n')
                                errors.write(u'{}, {}, {}, {}\n'.format(
                                    book, chap_num + 1, link, note['footnote']))
                                continue

                            # replace footnote tag in main text with the <i> tag
                            verses[link - 1] = verse.replace(marker, itag)

                            # get preceding word
                            words = verse[:tag_position].split()
                            if words:
                                anchor = words[-1]

                        # create link object
                        links.append({
                            'refs': [
                                u'{}.{}.{}'.format(book, chap_num + 1, link),
                                u'JPS 1985 Footnotes, {}.{}.{}'.format(
                                    book, chap_num + 1, index + 1)
                            ],
                            'type': 'commentary',
                            'auto': True,
                            'generated_by': 'JPS parse script',
                            'anchorText': anchor,
                        })
    finally:
        errors.close()

    return links
コード例 #7
0
ファイル: JPS1985.py プロジェクト: smontagu/Sefaria-Data
def align_footnotes(books):
    """
    The footnotes need to be structured by book and chapter. Each footnote may refer to multiple verses.

    Footnotes are read sequentially from 'JPS1985_footnotes.txt', one per line. The first
    character(s) of a footnote are its tag ("a", "b", ... or "aa"); a verse refers to the
    footnote when it contains "[<tag>]". A footnote tagged "a" (but not "aa") is taken to open
    the next chapter's run of footnotes. Once a chapter has produced its first match, every
    following footnote is attached to that chapter until the next "a" footnote (or the end of
    the footnote file), even when its marker is not found - such a footnote gets an empty
    "links" list.

    The process exits with status 1, after printing the offending footnote and its
    "<book>, chapter <n>" position, when the footnote stream cannot be aligned:
      * the pending footnote is empty or consists of the single character "a";
      * no marker has been found in the chapter and the pending footnote does not open a new
        chapter (nothing could ever change, so the scan would repeat forever).

    :param books: Dictionary, containing the entire JPS 1985 translation
    return: Dictionary, with books as keys and chapters as values. Each chapter is a list of dictionaries,
    with the key "footnote" set to the footnote and the key "links" being a list of verses where the
    footnote appears. The per-book value is produced by functions.convertDictToArray from a
    dictionary keyed by chapter index.
    """

    def abort(bad_note, book, chap_num):
        # report where alignment broke down, then stop; the caller's finally closes the file
        print('error')
        print(bad_note['footnote'])
        print(u'{}, chapter {}'.format(book, chap_num + 1))
        sys.exit(1)

    def read_note():
        # next footnote line with transliteration codes and line breaks replaced
        return {
            'footnote': functions.multiple_replace(input_file.readline(),
                                                   replacements)
        }

    jps_footnotes = {}

    # define replacement dictionary
    replacements = {
        u'@': u'\u1e63',
        u'h%': u'\u1e25',
        u'H%': u'\u1e24',
        u'\n': u'',
        u'\r': u'',
        u'\n\r': u'',
    }

    # get list of book in tanach
    all_books = library.get_indexes_in_category('Tanach')

    # open footnote document; try/finally closes it on every exit path, including sys.exit
    input_file = codecs.open('JPS1985_footnotes.txt', 'r', 'utf-8')
    try:
        # retrieve first footnote
        note = read_note()

        # iterate through Tanach
        for book in all_books:

            # dictionary with keys set to chapter indexes
            footnote_chaps = {}

            for chap_num, chapter in enumerate(books[book]):

                # set flag to indicate if any footnote markers have been found in this chapter
                found_note = False

                # declare array to hold all notes in chapter
                chap_notes = []

                # repeatedly loop through chapter, searching for cases where footnote appears
                while True:

                    text = note['footnote']

                    # an empty footnote has no tag to look for (previously an uncaught IndexError)
                    if not text:
                        abort(note, book, chap_num)

                    # account for the case where a footnote is tagged "aa"
                    if text.startswith(u'aa'):
                        tag = u'aa'
                    else:
                        tag = text[0]

                    # 1-based numbers of the verses carrying this footnote's marker
                    marker = u'[{}]'.format(tag)
                    found = [
                        verse_num + 1
                        for verse_num, verse in enumerate(chapter)
                        if marker in verse
                    ]
                    note['links'] = found
                    if found:
                        found_note = True

                    # if footnote markers were found, get the next footnote
                    if found_note:
                        chap_notes.append(note)
                        note = read_note()

                        # nothing left to read: this chapter is complete
                        if note['footnote'] == u'':
                            footnote_chaps[chap_num] = chap_notes
                            break

                    text = note['footnote']

                    # a footnote that is only the tag "a" cannot be classified
                    if text == u'a':
                        abort(note, book, chap_num)

                    # if footnote begins with "a" (and not "aa"), this is a new chapter
                    if text.startswith(u'a') and not text.startswith(u'aa'):
                        footnote_chaps[chap_num] = chap_notes
                        break

                    # nothing was found and the footnote does not open a new chapter: another
                    # pass would see exactly the same state, so stop instead of looping forever
                    if not found_note:
                        abort(note, book, chap_num)

            jps_footnotes[book] = functions.convertDictToArray(footnote_chaps)
    finally:
        input_file.close()

    return jps_footnotes
コード例 #8
0
ファイル: JPS1985.py プロジェクト: smontagu/Sefaria-Data
def parse():
    """
    Parse 'JPSTanakhMaster.txt' into a dictionary of books.

    The file is read line by line. Lines that equal their own upper-case form (parsha names,
    according to the original author) are skipped. A line starting with one to three digits
    followed by whitespace opens a new chapter; when that number is 1, the line before it is
    taken as the name of a new book. The text gathered for a chapter is handed to
    process_verses together with the verse-number regex and the result is checked by
    check_verses (both defined elsewhere in this file). Footnote markers are left in the text.

    Side effects: writes the parsed books to 'output.txt' through write_to_file, writes one
    "Book: <name> Chapter: <number>" line per recorded problem to 'errors.txt', and prints the
    number of recorded problems.

    :return: dictionary mapping each book name to its list of chapters. A chapter is whatever
    process_verses returned for it; a [u'error'] placeholder is inserted where the chapter
    numbers do not increase by exactly one.
    """

    # declare variables
    books, chapters, verses = {}, [], []
    previous_line = u''
    book_name = u''
    errors = []

    # regular expressions
    chapter_reg = re.compile(u'\d{1,3}\s')
    verse_reg = re.compile(u'\d{1,3}[a-zA-Z\-"“\[‘(—]')

    # define replacement dictionary
    replacements = {
        u'H%': u'\u1e24',
        u'h%': u'\u1e25',
        u'\n': u'',
        u'\r': u'',
    }

    # loop through file; the with-block closes it even if parsing raises
    with codecs.open('JPSTanakhMaster.txt', 'r', 'utf-8') as input_file:
        for line in input_file:

            # if this line is a parsha name - do nothing
            if line == line.upper():
                continue

            # make necessary replacements
            line = functions.multiple_replace(line, replacements)

            # check if line is beginning of new chapter
            new_chap = chapter_reg.match(line)
            if not new_chap:
                # ordinary line: add previous line to verses, and save current line
                verses.append(previous_line)
                previous_line = line
                continue

            # get chapter num (int() tolerates the trailing whitespace the regex captures)
            chap_number = int(new_chap.group())
            if chap_number == 1:

                # chapter 1 means a new book: save previous book, if there was one
                if book_name != u'':
                    chapters.append(process_verses(u''.join(verses), verse_reg))
                    check_verses(
                        chapters[-1],
                        (book_name.replace(u'\r', u';'), len(chapters)),
                        errors)
                    books[book_name] = chapters
                    chapters = []
                    verses = []

                # the line preceding chapter 1 is the name of the new book
                book_name = previous_line

            else:
                # close the chapter that just ended
                verses.append(previous_line)
                chapters.append(process_verses(u''.join(verses), verse_reg))
                check_verses(chapters[-1],
                             (book_name.replace(u'\r', u';'), chap_number - 1),
                             errors)
                verses = []

            # check that chapters are incrementing correctly
            if chap_number - len(chapters) != 1:
                errors.append((book_name.replace(u'\r', u';'), chap_number - 1))
                chapters.append([u'error'])

            # copy line into previous_line placeholder, excluding the chapter number itself
            previous_line = line[new_chap.end():]

    # add last book
    chapters.append(process_verses(u''.join(verses), verse_reg))
    check_verses(chapters[-1],
                 (book_name.replace(u'\r', u';'), len(chapters)), errors)
    books[book_name] = chapters

    with codecs.open('output.txt', 'w', 'utf-8') as out:
        write_to_file(books, out)

    # write errors; a unicode template keeps non-ASCII book names from raising on Python 2
    with codecs.open('errors.txt', 'w', 'utf-8') as out:
        for error in errors:
            out.write(u'Book: {0} Chapter: {1}\n'.format(*error))
    print(len(errors))

    return books