def parse_file(self, filename, en_title, he_title, include_commentaries=True):
    """Parse one XML source file into version/index/link structures.

    Loads ``XML/<filename>``, registers the base text as a version plus an
    index record, then (optionally) walks every commentary in the document,
    registering its index, its term dict and a parsed text version titled
    '<commentator> on <en_title>'.

    :param filename: file name under the ``XML/`` directory.
    :param en_title: English title of the base text.
    :param he_title: Hebrew title of the base text.
    :param include_commentaries: when False, only the base text is processed.
    """
    root = DCXMLsubs.parse('XML/{}'.format(filename), silence=True)

    # Base text: one version plus its index record.
    version = self.get_version_skeleton()
    version['text'] = root.getBaseTextArray()
    self.versionList.append({'ref': en_title, 'version': version})
    self.base_indices.append(root.get_base_index(en_title, he_title))

    if include_commentaries:
        commentaries = root.body.commentaries
        for commentary in commentaries.get_commentary():
            author = commentary.get_author()
            # A commentary with no identified author cannot be titled; skip it.
            if author == 'UNKNOWN':
                continue
            self.commentaryIndices.append(commentary.build_index(en_title, he_title))
            self.terms.append(commentary.get_term_dict())
            version = self.get_version_skeleton()
            # Linked commentaries carry explicit anchors into the base text and
            # take a different parse path than free-standing ones.
            if commentaries.is_linked_commentary(commentary):
                version['text'] = commentary.parse_linked()
            else:
                version['text'] = commentary.parse_unlinked()
            ref = '{} on {}'.format(DCXMLsubs.commentatorNames[author], en_title)
            self.versionList.append({'ref': ref, 'version': version})

    # Links are accumulated as a side effect of the parse passes above and
    # collected here once for the whole file.
    self.linkSet.extend(root.get_stored_links(en_title))
def fix_commentator_by_page(filename, commentator, overwrite=False):
    """Repair a single commentator's phrase data, working page by page.

    NOTE(review): this function appears truncated in this chunk -- it builds
    ``dh_list`` for each page, but the code that consumes it is not visible
    here. ``page_map`` and ``overwrite`` are likewise unused in the visible
    portion; confirm against the full file.

    :param filename: XML file stem; ``XML/<filename>.xml`` is read.
    :param commentator: author name used to locate the commentary element.
    :param overwrite: presumably selects in-place vs. copy output -- TODO
        confirm in the non-visible remainder.
    """
    root = DCXMLsubs.parse("XML/{}.xml".format(filename), silence=True)
    commentary = root.body.commentaries.commentary[get_commentary_index(root, commentator)]
    assert isinstance(commentary, DCXMLsubs.commentarySub)
    page_map = book_by_page(root)
    phrases_by_page = commentary.phrases_by_page()
    for page in phrases_by_page.keys():  # Not every page necessarily has a commentary phrase
        # Normalize each dibur hamatchil: glue a trailing period/colon onto
        # the preceding word (drop the space before it).
        dh_list = [re.sub(ur' (\.|:)', ur'\1', phrase.dh.get_valueOf_()) for phrase in phrases_by_page[page]]
def basic_test_suite():
    """Smoke test: parse the Avot d'Rabbi Natan file, dump its base text to
    XML, then parse and dump the first linked commentary whose author is
    known."""
    root = DCXMLsubs.parse("XML/tractate-avot_drabi_natan-xml.xml", silence=True)
    ja_to_xml(root.getBaseTextArray(), ['Section', 'Segment'], 'base_text.xml')
    # root.review_commentaries()
    # root.check_commentary_chapters()
    commentaries = root.body.commentaries
    for candidate in commentaries.get_commentary():
        usable = commentaries.is_linked_commentary(candidate) and candidate.get_author() != 'UNKNOWN'
        if not usable:
            continue
        ja_to_xml(candidate.parse_linked(), ['Chapter', 'Verse', 'Comment'], 'commentary.xml')
        break
def check_chapters(): xml_files = filter(lambda x: None if not re.search('\.xml$', x) else x, os.listdir('./XML')) for xml in xml_files: root = DCXMLsubs.parse("XML/{}".format(xml), silence=True) coms = root.body.commentaries if coms is not None: issues = root.check_commentary_chapters() if len(issues) > 0: print xml for issue in issues: print u'commentary : {}'.format(issue.get_author()) issue.print_bad_chapters()
def clear_subchaps(filename, commentator, chap_number=None, overwrite=False):
    """Strip subchapter assignments from a commentator's phrases.

    :param filename: path of the XML file to load.
    :param commentator: author whose commentary is targeted.
    :param chap_number: when given, only that chapter is cleared; otherwise
        every chapter is.
    :param overwrite: when False, output is written to ``*_test.xml``
        instead of replacing the input file.
    """
    root = DCXMLsubs.parse(filename, silence=True)
    commentary = root.body.commentaries.get_commentary()[get_commentary_index(root, commentator)]
    for chapter in commentary.get_chapter():
        # Skip chapters other than the requested one (if one was requested).
        if chap_number is not None and int(chapter.num) != chap_number:
            continue
        for phrase in chapter.get_phrase():
            phrase.subchap = None
    target = filename if overwrite else filename.replace('.xml', '_test.xml')
    clean_export(root, target)
def fix_commentator(filename, commentator, overwrite=False):
    """Re-match a commentator's dibur hamatchil phrases against the base
    text, chapter by chapter.

    NOTE(review): this function appears truncated in this chunk -- it derives
    ``locations``/``seg_indices``/``word_list``/``dh_list`` per chapter, but
    the matching code that consumes them is not visible here. ``overwrite``
    is unused in the visible portion; confirm against the full file.

    :param filename: XML file stem; ``XML/<filename>.xml`` is read.
    :param commentator: author name used to locate the commentary element.
    """
    root = DCXMLsubs.parse("XML/{}.xml".format(filename), silence=True)
    base_text = root.getBaseTextArray()
    commentary = root.body.commentaries.commentary[get_commentary_index(root, commentator)]
    assert isinstance(commentary, DCXMLsubs.commentarySub)
    locations = []
    # assert len(base_text) == len(commentary.get_chapter())
    # counter = 1
    for comment_chapter in commentary.get_chapter():
        # Chapter numbers are 1-based; the base text array is 0-based.
        chapter_index = int(comment_chapter.num)
        base_chapter = base_text[chapter_index - 1]
        print 'fixing chapter {}'.format(chapter_index)
        # Strip all HTML markup so word counting operates on bare text.
        book_text = [bleach.clean(segment, tags=[], strip=True) for segment in base_chapter]
        seg_indices = first_word_indices(book_text)
        word_list = u' '.join(book_text).split()
        # Normalize each dibur hamatchil: glue a trailing period/colon onto
        # the preceding word.
        dh_list = [re.sub(ur' (\.|:)', ur'\1', p.dh.get_valueOf_()) for p in comment_chapter.get_phrase()]
def unmatched_comments(filename, commentator):
    """Dump every phrase of a commentary that failed to match the base text
    (``subchap == '0'``) to a CSV report.

    The report is named '<commentator> on <book id> issues.csv' with columns
    id/page/dh/chapter/verse; 'chapter' and 'verse' are emitted blank for a
    reviewer to fill in by hand.

    :param filename: path of the XML file to load.
    :param commentator: author whose commentary is inspected.
    """
    def get_phrase_page(phrase_id):
        # Phrase ids look like 'ph-<chapter>-<page>-<seq>'; capture the page token.
        return re.search(u'ph-[0-9]{1,2}-([0-9A-Z]{1,3})-[0-9]{1,2}', phrase_id).group(1)

    issues = []
    root = DCXMLsubs.parse(filename, silence=True)
    commentary = root.body.commentaries.get_commentary()[get_commentary_index(root, commentator)]
    for chapter in commentary.get_chapter():
        for phrase in chapter.get_phrase():
            if phrase.subchap == '0':
                issues.append({
                    'page': get_phrase_page(phrase.id),
                    'dh': phrase.dh.get_valueOf_(),
                    'id': phrase.id
                })
    with open(u'{} on {} issues.csv'.format(commentator, root.id), 'w') as outfile:
        writer = csv.DictWriter(outfile, ['id', 'page', 'dh', 'chapter', 'verse'])
        writer.writeheader()
        # Bug fix: the Python 2 csv module cannot write unicode values, and
        # 'dh' carries Hebrew text -- encode every unicode field to UTF-8
        # before handing the rows to the writer.
        encoded_rows = [
            {k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.items()}
            for row in issues
        ]
        writer.writerows(encoded_rows)
def parse_file(self, filename, en_title, he_title):
    """Parse one XML source file into versions, indices, schema nodes and
    links.

    Registers the base text as a version plus an index record, then walks
    every commentary: builds a schema node once per author, attaches this
    book's ja node to it, parses the commentary text via the linked or
    unlinked path, and appends a version titled '<commentator>, <en_title>'.
    Finally collects the links stored during parsing.

    NOTE(review): unlike the other ``parse_file`` variant in this file, this
    one does not skip 'UNKNOWN' authors, so an author missing from
    ``DCXMLsubs.commentatorNames`` would raise KeyError -- confirm inputs.
    It also titles versions '<author>, <title>' rather than
    '<author> on <title>'; presumably intentional, verify.
    """
    root = DCXMLsubs.parse('XML/{}'.format(filename), silence=True)
    version = self.get_version_skeleton()
    version['text'] = root.getBaseTextArray()
    self.versionList.append({'ref': en_title, 'version': version})
    self.base_indices.append(root.get_base_index(en_title, he_title))
    commentaries = root.body.commentaries
    for commentary in commentaries.get_commentary():
        author = commentary.get_author()
        # Build each author's schema node only once; every book then appends
        # its own ja node to that shared schema.
        if self.commentarySchemas.get(author) is None:
            self.commentarySchemas[author] = commentary.build_node()
        self.commentarySchemas[author].append(root.commentary_ja_node(en_title, he_title))
        version = self.get_version_skeleton()
        # Linked commentaries carry explicit anchors into the base text.
        if commentaries.is_linked_commentary(commentary):
            version['text'] = commentary.parse_linked()
        else:
            version['text'] = commentary.parse_unlinked()
        ref = '{}, {}'.format(DCXMLsubs.commentatorNames[author], en_title)
        self.versionList.append({'ref': ref, 'version': version})
    self.linkSet.extend(root.get_stored_links(en_title))
def output_missing_links(filename): root = DCXMLsubs.parse('./XML/{}'.format(filename), silence=True) commentary = root.body.commentaries.get_commentary()[-1] for phrase in commentary.get_phrase(): if DCXMLsubs.commentStore.get(phrase.id) is None: print phrase.id
""" Get a list of indices representing the index of the first word of each string should the list be collapsed to a list of words :param string_list: list of strings :return: list of integers """ indices = [] for line in string_list: if len(indices) == 0: indices.append(len(line.split())) else: indices.append(len(line.split()) + indices[-1]) return indices root = DCXMLsubs.parse("XML/tractate-avot_drabi_natan-xml.xml", silence=True) base_text = root.getBaseTextArray()[0] base_text = [bleach.clean(segment, tags=[], strip=True) for segment in base_text] seg_indices = first_word_indices(base_text) word_list = u' '.join(base_text).split() c = root.body.commentaries.commentary[6].chapter[0] dh_list = [p.dh.get_valueOf_() for p in c.get_phrase()] def cleaner(input_string): assert isinstance(input_string, basestring) pattern = u'\u05d5?(\u05db|\u05d2)\u05d5\u05f3?' match = re.search(pattern, input_string) if match is None: return input_string if match.start() > 6 and (match.start() > len(input_string) / 2):