def parse_file(self, filename, en_title, he_title, include_commentaries=True):
    """Parse one XML source file into version/index/link structures.

    Loads ``XML/<filename>``, registers the base text as a version plus an
    index record, then (optionally) walks every commentary in the document,
    registering its index, its term dict and a parsed text version titled
    '<commentator> on <en_title>'.

    :param filename: file name under the ``XML/`` directory.
    :param en_title: English title of the base text.
    :param he_title: Hebrew title of the base text.
    :param include_commentaries: when False, only the base text is processed.
    """
    root = DCXMLsubs.parse('XML/{}'.format(filename), silence=True)

    # Base text: one version plus its index record.
    version = self.get_version_skeleton()
    version['text'] = root.getBaseTextArray()
    self.versionList.append({'ref': en_title, 'version': version})
    self.base_indices.append(root.get_base_index(en_title, he_title))

    if include_commentaries:
        commentaries = root.body.commentaries
        for commentary in commentaries.get_commentary():
            author = commentary.get_author()
            # A commentary with no identified author cannot be titled; skip it.
            if author == 'UNKNOWN':
                continue
            self.commentaryIndices.append(commentary.build_index(en_title, he_title))
            self.terms.append(commentary.get_term_dict())
            version = self.get_version_skeleton()
            # Linked commentaries carry explicit anchors into the base text and
            # take a different parse path than free-standing ones.
            if commentaries.is_linked_commentary(commentary):
                version['text'] = commentary.parse_linked()
            else:
                version['text'] = commentary.parse_unlinked()
            ref = '{} on {}'.format(DCXMLsubs.commentatorNames[author], en_title)
            self.versionList.append({'ref': ref, 'version': version})

    # Links are accumulated as a side effect of the parse passes above and
    # collected here once for the whole file.
    self.linkSet.extend(root.get_stored_links(en_title))
def fix_commentator_by_page(filename, commentator, overwrite=False):
    """Repair a single commentator's phrase data, working page by page.

    NOTE(review): this function appears truncated in this chunk -- it builds
    ``dh_list`` for each page, but the code that consumes it is not visible
    here. ``page_map`` and ``overwrite`` are likewise unused in the visible
    portion; confirm against the full file.

    :param filename: XML file stem; ``XML/<filename>.xml`` is read.
    :param commentator: author name used to locate the commentary element.
    :param overwrite: presumably selects in-place vs. copy output -- TODO
        confirm in the non-visible remainder.
    """
    root = DCXMLsubs.parse("XML/{}.xml".format(filename), silence=True)
    commentary = root.body.commentaries.commentary[get_commentary_index(root, commentator)]
    assert isinstance(commentary, DCXMLsubs.commentarySub)
    page_map = book_by_page(root)
    phrases_by_page = commentary.phrases_by_page()
    for page in phrases_by_page.keys():  # Not every page necessarily has a commentary phrase
        # Normalize each dibur hamatchil: glue a trailing period/colon onto
        # the preceding word (drop the space before it).
        dh_list = [re.sub(ur' (\.|:)', ur'\1', phrase.dh.get_valueOf_()) for phrase in phrases_by_page[page]]
def basic_test_suite():
    """Smoke test: parse the Avot d'Rabbi Natan file, dump its base text to
    XML, then parse and dump the first linked commentary whose author is
    known."""
    root = DCXMLsubs.parse("XML/tractate-avot_drabi_natan-xml.xml", silence=True)
    ja_to_xml(root.getBaseTextArray(), ['Section', 'Segment'], 'base_text.xml')
    # root.review_commentaries()
    # root.check_commentary_chapters()
    commentaries = root.body.commentaries
    for candidate in commentaries.get_commentary():
        usable = commentaries.is_linked_commentary(candidate) and candidate.get_author() != 'UNKNOWN'
        if not usable:
            continue
        ja_to_xml(candidate.parse_linked(), ['Chapter', 'Verse', 'Comment'], 'commentary.xml')
        break
def check_chapters(): xml_files = filter(lambda x: None if not re.search('\.xml$', x) else x, os.listdir('./XML')) for xml in xml_files: root = DCXMLsubs.parse("XML/{}".format(xml), silence=True) coms = root.body.commentaries if coms is not None: issues = root.check_commentary_chapters() if len(issues) > 0: print xml for issue in issues: print u'commentary : {}'.format(issue.get_author()) issue.print_bad_chapters()
def clear_subchaps(filename, commentator, chap_number=None, overwrite=False):
    """Strip subchapter assignments from a commentator's phrases.

    :param filename: path of the XML file to load.
    :param commentator: author whose commentary is targeted.
    :param chap_number: when given, only that chapter is cleared; otherwise
        every chapter is.
    :param overwrite: when False, output is written to ``*_test.xml``
        instead of replacing the input file.
    """
    root = DCXMLsubs.parse(filename, silence=True)
    commentary = root.body.commentaries.get_commentary()[get_commentary_index(root, commentator)]
    for chapter in commentary.get_chapter():
        # Skip chapters other than the requested one (if one was requested).
        if chap_number is not None and int(chapter.num) != chap_number:
            continue
        for phrase in chapter.get_phrase():
            phrase.subchap = None
    target = filename if overwrite else filename.replace('.xml', '_test.xml')
    clean_export(root, target)
def fix_commentator(filename, commentator, overwrite=False):
    """Re-match a commentator's dibur hamatchil phrases against the base
    text, chapter by chapter.

    NOTE(review): this function appears truncated in this chunk -- it derives
    ``locations``/``seg_indices``/``word_list``/``dh_list`` per chapter, but
    the matching code that consumes them is not visible here. ``overwrite``
    is unused in the visible portion; confirm against the full file.

    :param filename: XML file stem; ``XML/<filename>.xml`` is read.
    :param commentator: author name used to locate the commentary element.
    """
    root = DCXMLsubs.parse("XML/{}.xml".format(filename), silence=True)
    base_text = root.getBaseTextArray()
    commentary = root.body.commentaries.commentary[get_commentary_index(root, commentator)]
    assert isinstance(commentary, DCXMLsubs.commentarySub)
    locations = []
    # assert len(base_text) == len(commentary.get_chapter())
    # counter = 1
    for comment_chapter in commentary.get_chapter():
        # Chapter numbers are 1-based; the base text array is 0-based.
        chapter_index = int(comment_chapter.num)
        base_chapter = base_text[chapter_index - 1]
        print 'fixing chapter {}'.format(chapter_index)
        # Strip all HTML markup so word counting operates on bare text.
        book_text = [bleach.clean(segment, tags=[], strip=True) for segment in base_chapter]
        seg_indices = first_word_indices(book_text)
        word_list = u' '.join(book_text).split()
        # Normalize each dibur hamatchil: glue a trailing period/colon onto
        # the preceding word.
        dh_list = [re.sub(ur' (\.|:)', ur'\1', p.dh.get_valueOf_()) for p in comment_chapter.get_phrase()]
def unmatched_comments(filename, commentator):
    """Dump every phrase of a commentary that failed to match the base text
    (``subchap == '0'``) to a CSV report.

    The report is named '<commentator> on <book id> issues.csv' with columns
    id/page/dh/chapter/verse; 'chapter' and 'verse' are emitted blank for a
    reviewer to fill in by hand.

    :param filename: path of the XML file to load.
    :param commentator: author whose commentary is inspected.
    """
    def get_phrase_page(phrase_id):
        # Phrase ids look like 'ph-<chapter>-<page>-<seq>'; capture the page token.
        return re.search(u'ph-[0-9]{1,2}-([0-9A-Z]{1,3})-[0-9]{1,2}', phrase_id).group(1)

    issues = []
    root = DCXMLsubs.parse(filename, silence=True)
    commentary = root.body.commentaries.get_commentary()[get_commentary_index(root, commentator)]
    for chapter in commentary.get_chapter():
        for phrase in chapter.get_phrase():
            if phrase.subchap == '0':
                issues.append({
                    'page': get_phrase_page(phrase.id),
                    'dh': phrase.dh.get_valueOf_(),
                    'id': phrase.id
                })
    with open(u'{} on {} issues.csv'.format(commentator, root.id), 'w') as outfile:
        writer = csv.DictWriter(outfile, ['id', 'page', 'dh', 'chapter', 'verse'])
        writer.writeheader()
        # Bug fix: the Python 2 csv module cannot write unicode values, and
        # 'dh' carries Hebrew text -- encode every unicode field to UTF-8
        # before handing the rows to the writer.
        encoded_rows = [
            {k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.items()}
            for row in issues
        ]
        writer.writerows(encoded_rows)
def parse_file(self, filename, en_title, he_title):
    """Parse one XML source file into versions, indices, schema nodes and
    links.

    Registers the base text as a version plus an index record, then walks
    every commentary: builds a schema node once per author, attaches this
    book's ja node to it, parses the commentary text via the linked or
    unlinked path, and appends a version titled '<commentator>, <en_title>'.
    Finally collects the links stored during parsing.

    NOTE(review): unlike the other ``parse_file`` variant in this file, this
    one does not skip 'UNKNOWN' authors, so an author missing from
    ``DCXMLsubs.commentatorNames`` would raise KeyError -- confirm inputs.
    It also titles versions '<author>, <title>' rather than
    '<author> on <title>'; presumably intentional, verify.
    """
    root = DCXMLsubs.parse('XML/{}'.format(filename), silence=True)
    version = self.get_version_skeleton()
    version['text'] = root.getBaseTextArray()
    self.versionList.append({'ref': en_title, 'version': version})
    self.base_indices.append(root.get_base_index(en_title, he_title))
    commentaries = root.body.commentaries
    for commentary in commentaries.get_commentary():
        author = commentary.get_author()
        # Build each author's schema node only once; every book then appends
        # its own ja node to that shared schema.
        if self.commentarySchemas.get(author) is None:
            self.commentarySchemas[author] = commentary.build_node()
        self.commentarySchemas[author].append(root.commentary_ja_node(en_title, he_title))
        version = self.get_version_skeleton()
        # Linked commentaries carry explicit anchors into the base text.
        if commentaries.is_linked_commentary(commentary):
            version['text'] = commentary.parse_linked()
        else:
            version['text'] = commentary.parse_unlinked()
        ref = '{}, {}'.format(DCXMLsubs.commentatorNames[author], en_title)
        self.versionList.append({'ref': ref, 'version': version})
    self.linkSet.extend(root.get_stored_links(en_title))
def output_missing_links(filename): root = DCXMLsubs.parse('./XML/{}'.format(filename), silence=True) commentary = root.body.commentaries.get_commentary()[-1] for phrase in commentary.get_phrase(): if DCXMLsubs.commentStore.get(phrase.id) is None: print phrase.id
""" Get a list of indices representing the index of the first word of each string should the list be collapsed to a list of words :param string_list: list of strings :return: list of integers """ indices = [] for line in string_list: if len(indices) == 0: indices.append(len(line.split())) else: indices.append(len(line.split()) + indices[-1]) return indices root = DCXMLsubs.parse("XML/tractate-avot_drabi_natan-xml.xml", silence=True) base_text = root.getBaseTextArray()[0] base_text = [bleach.clean(segment, tags=[], strip=True) for segment in base_text] seg_indices = first_word_indices(base_text) word_list = u' '.join(base_text).split() c = root.body.commentaries.commentary[6].chapter[0] dh_list = [p.dh.get_valueOf_() for p in c.get_phrase()] def cleaner(input_string): assert isinstance(input_string, basestring) pattern = u'\u05d5?(\u05db|\u05d2)\u05d5\u05f3?' match = re.search(pattern, input_string) if match is None: return input_string if match.start() > 6 and (match.start() > len(input_string) / 2):