def run(self):
    """
    Handle Zotero in-line reference items

    Collects the matching tei:ref objects (EndNote-marked and Zotero CSL citation fields), drops the
    corresponding addin fields from the document, then hands over to handle_bibliography().

    @return: a list of items handled
    """
    marked_ref_xpath = '//tei:ref[@rend="ref"]'
    any_ref_xpath = '//tei:ref'
    endnote_marker = ' ADDIN EN.CITE'
    zotero_csl_marker = ' ADDIN ZOTERO_ITEM CSL_CITATION'

    manipulator = TeiManipulate(self.gv)

    # gather the objects first: the drop_* calls below are invoked only after both lists are collected
    stashed = manipulator.get_object_list(marked_ref_xpath, endnote_marker, u'zoterobiblio')
    stashed += manipulator.get_object_list(any_ref_xpath, zotero_csl_marker, u'zoterobiblio')

    manipulator.drop_addin(marked_ref_xpath, endnote_marker, 'EndNote', 'hi', 'reference_to_link', self,
                           u'zoterobiblio', True)

    # the full CSL citation marker is dropped before the bare ZOTERO_ITEM marker
    for json_marker in (zotero_csl_marker, ' ADDIN ZOTERO_ITEM'):
        manipulator.drop_addin_json(any_ref_xpath, json_marker, 'hi', 'reference_to_link', self)

    # handle bibliography
    self.handle_bibliography(manipulator)

    stashed_total = len(stashed)
    if stashed_total > 0:
        self.debug.print_debug(self, u'Stashed {0} references for bibliography parsing'.format(stashed_total))

    return stashed
def pre_clean(self):
    """
    Remove leading body lines that merely repeat the document's metadata

    Calls extract_metadata_fields() and then walks the head, p and cit elements of the TEI body in
    document order. An element is removed when its stripped text contains every component of a
    not-yet-matched entry of self.authors or, failing that, any entry of self.metadata. Removed elements
    are not counted; the walk stops once three candidate elements have been kept. The tree is saved
    afterwards.
    """
    self.extract_metadata_fields()

    manipulate = TeiManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    # get all elements in the body
    section = tree.xpath('//tei:body//*', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})

    items_to_match = ['{http://www.tei-c.org/ns/1.0}head',
                      '{http://www.tei-c.org/ns/1.0}p',
                      '{http://www.tei-c.org/ns/1.0}cit']

    # number of candidate elements seen so far that were NOT removed
    count = 0
    matched_authors = []

    for item in section:
        if count > 2:
            break

        if item.tag not in items_to_match:
            continue

        count += 1
        text = self.get_stripped_text(item)
        processed = False

        for author in self.authors:
            # each author may only account for one removed line
            if author in matched_authors:
                continue

            if all(component in text for component in author):
                # found a metadata line
                matched_authors.append(author)
                count -= 1
                item.getparent().remove(item)
                self.debug.print_debug(self, u'Removed line "{0}" '
                                             u'because it appears to be author metadata'.format(text))
                processed = True
                break

        if processed:
            continue

        for metadata in self.metadata:
            if metadata in text:
                # found a metadata line
                count -= 1
                item.getparent().remove(item)
                self.debug.print_debug(self, u'Removed line "{0}" '
                                             u'because it appears to be duplicated metadata'.format(text))
                # stop at the first hit: the element is now detached, so a second match would call
                # remove() on a None parent and would also decrement the counter twice
                break

    manipulate.save_tree(tree)
def run(self):
    """
    Handle Mendeley reference tags

    Collects the rend="ref" tei:ref objects carrying a CSL citation field, drops the CSL citation addin
    from every tei:ref and then hands over to handle_bibliography().

    @return: a list of processed tags
    """
    csl_marker = 'ADDIN CSL_CITATION'

    manipulator = TeiManipulate(self.gv)

    # collect before dropping: drop_addin_json is only invoked once the list has been gathered
    processed = manipulator.get_object_list('//tei:ref[@rend="ref"]', csl_marker, u'zoterobiblio')
    manipulator.drop_addin_json('//tei:ref', csl_marker, 'hi', 'reference_to_link', self)

    self.handle_bibliography(manipulator)

    processed_total = len(processed)
    if processed_total > 0:
        self.debug.print_debug(self,
                               u'Stashed {0} references for bibliography parsing'.format(processed_total))

    return processed
def run(self):
    """
    Handle all unknown types of addin

    Collects every element carrying an ' ADDIN' marker and passes the 'drop-unknown-addins' setting
    (compared against the string 'True') to drop_addin as its final flag.

    @return: a list of tags that were handled
    """
    everything = '//*'
    marker = ' ADDIN'

    manipulator = TeiManipulate(self.gv)
    unknown_addins = manipulator.get_object_list(everything, marker, u'addin')

    # the setting is compared as text, so only the exact string 'True' enables the flag
    should_drop = self.gv.setting('drop-unknown-addins') == 'True'

    manipulator.drop_addin(everything, marker, 'EndNote', 'hi', 'unknown_addin_text', self, u'addin',
                           should_drop)

    handled_total = len(unknown_addins)
    if handled_total > 0:
        self.debug.print_debug(self, u'Handled {0} unknown addin tags'.format(handled_total))

    return unknown_addins
def pre_cleanup(self):
    """
    Tidy the TEI tree before it is transformed

    Two passes are made over the loaded tree:

    1. for every div that has a head child, each ancestor up to the nearest div or body is renamed to
       'REMOVE' and those tags are then stripped, so that head elements are not encapsulated within
       elements that stop them from being correctly transformed by the XSL;
    2. every hi child of a p whose rend contains "Indent", "Default Style" or "Text Body" is moved into
       a new p inserted after its original parent (copying the parent's rend attribute if present).
    """
    tei_ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    manipulate = TeiManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    # the only containers a heading-bearing div may legitimately sit inside
    allowed = ['{http://www.tei-c.org/ns/1.0}div',
               '{http://www.tei-c.org/ns/1.0}body']

    count = 0

    for heading_div in tree.xpath('//tei:div[tei:head]', namespaces=tei_ns):
        ancestor = heading_div.getparent()

        # climb until the root is passed or an allowed container is reached
        while ancestor is not None and ancestor.tag:
            if ancestor.tag in allowed:
                break

            ancestor.tag = 'REMOVE'
            count += 1
            ancestor = ancestor.getparent()

    if count > 0:
        etree.strip_tags(tree, 'REMOVE')
        manipulate.save_tree(tree)
        self.debug.print_debug(self, u'Extracted {0} headings from inside invalid elements'.format(count))

    # split any p tags with sub-tags hi rend="Indent" into new elements
    hi_selector = ('tei:hi[contains(@rend, "Indent") or contains(@rend, "Default Style") or '
                   'contains(@rend, "Text Body")]')

    for parent in tree.xpath('//tei:p' '[' + hi_selector + ']', namespaces=tei_ns):
        # each new paragraph goes after the previously inserted one, preserving document order
        insert_after = parent
        inherited_rend = parent.get('rend')

        for hi_element in parent.xpath(hi_selector, namespaces=tei_ns):
            paragraph = etree.Element('p')

            if inherited_rend is not None:
                paragraph.set('rend', inherited_rend)

            insert_after.addnext(paragraph)
            paragraph.append(hi_element)
            insert_after = paragraph

        manipulate.save_tree(tree)
        self.debug.print_debug(self, u'Separated out p {0}'.format(manipulate.get_stripped_text(parent)))
def pre_cleanup(self):
    """
    Tidy the TEI tree before it is transformed

    Pass one: for each div with a head child, rename every ancestor below the nearest div/body to
    'REMOVE' and strip those tags, so that head elements are not encapsulated within elements that stop
    them from being correctly transformed by the XSL.

    Pass two: move each hi child of a p whose rend contains "Indent", "Default Style" or "Text Body"
    into a fresh p placed after the original paragraph, carrying over the paragraph's rend if it has one.
    """
    namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}

    manipulate = TeiManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    # containers that are permitted to wrap a heading-bearing div
    allowed = [
        '{http://www.tei-c.org/ns/1.0}div',
        '{http://www.tei-c.org/ns/1.0}body',
    ]

    count = 0
    divs_with_heads = tree.xpath('//tei:div[tei:head]', namespaces=namespaces)

    for div in divs_with_heads:
        wrapper = div.getparent()

        # walk upwards, flagging wrappers, until an allowed container (or the top) is hit
        while wrapper is not None and wrapper.tag:
            if wrapper.tag in allowed:
                break

            wrapper.tag = 'REMOVE'
            count += 1
            wrapper = wrapper.getparent()

    if count > 0:
        etree.strip_tags(tree, 'REMOVE')
        manipulate.save_tree(tree)
        self.debug.print_debug(
            self, u'Extracted {0} headings from inside invalid elements'.format(count))

    # split any p tags with sub-tags hi rend="Indent" into new elements
    styled_hi = ('tei:hi[contains(@rend, "Indent") or contains(@rend, "Default Style") or '
                 'contains(@rend, "Text Body")]')
    paragraphs = tree.xpath('//tei:p' '[' + styled_hi + ']', namespaces=namespaces)

    for parent in paragraphs:
        # new paragraphs are chained one after another so the hi elements keep their order
        anchor = parent
        rend = parent.get('rend')

        for hi in parent.xpath(styled_hi, namespaces=namespaces):
            split_p = etree.Element('p')

            if rend is not None:
                split_p.set('rend', rend)

            anchor.addnext(split_p)
            split_p.append(hi)
            anchor = split_p

        manipulate.save_tree(tree)
        self.debug.print_debug(
            self, u'Separated out p {0}'.format(manipulate.get_stripped_text(parent)))
def run_modules(self):
    """
    Run the processing pipeline selected by the command-line arguments

    With the 'bibscan' command only the bibliography database scan is run. Otherwise the input is
    converted to TEI (doc, odt and other inputs go through unoconv first), after which the metadata
    pre-clean, the classifiers, the TEI to NLM transform and the NLM post-processing steps run in order.
    '--puretei' and '--purenlm' end the run immediately after the TEI and NLM transforms respectively.
    """
    ag = int(self.gv.settings.args['--aggression'])
    self.debug.print_debug(self,
                           u'Running at aggression level {0} {1}'.format(ag, "[grrr!]" if ag == 10 else ""))

    if ag > 10:
        self.debug.print_debug(self, "WARNING: safety bail-out features are disabled at aggression level 11")

    if self.args['bibscan']:
        # scanning is a stand-alone command: nothing else in the pipeline applies
        BibliographyDatabase(self.gv).scan()
        return

    # check for stylesheets
    self.gv.check_file_exists(self.gv.docx_style_sheet_dir)

    # metadata file
    # stored on self.gv (not a bare 'gv' name) because self.gv is the object handed to every module below
    self.gv.metadata_file = self.set_metadata_file()

    self.gv.mk_dir(self.gv.output_folder_path)

    # doc, odt and any other unoconv-supported input are first converted to docx
    unoconv_source = None
    for argument, source_format in (('doc', 'doc'), ('odt', 'odt'), ('other', 'unoconv')):
        if self.args[argument]:
            unoconv_source = source_format
            break

    if unoconv_source is not None:
        # run conversion to docx
        # then run docx to tei
        UnoconvToDocx(self.gv).run(unoconv_source)
        DocxToTei(self.gv).run(True, self.args['--proprietary'])
    elif self.args['docx']:
        # run docx to tei conversion
        # includes hooks for proprietary transforms if enabled
        DocxToTei(self.gv).run(True, self.args['--proprietary'])
    elif self.args['docxextracted']:
        self.debug.print_debug(self, u'Skipping docx extraction')
        DocxToTei(self.gv).run(False, self.args['--proprietary'])
    elif self.args['tei']:
        self.debug.print_debug(self, u'Skipping docx extraction; processing TEI file')
        DocxToTei(self.gv).run(False, self.args['--proprietary'], tei=True)

    if self.args['--puretei']:
        self.debug.print_debug(self, u'Exiting as TEI transform complete')
        return

    metadata = Metadata(self.gv)
    metadata.pre_clean()

    # run size classifier
    # aggression 5
    SizeClassifier(self.gv).run()

    # run bibliographic addins handler
    # aggression 4
    found_bibliography = BibliographyAddins(self.gv).run()

    # run list classifier
    # aggression 4
    ListClassifier(self.gv).run()

    # created unconditionally: the interactive prompt further down needs it as well
    bibliography_classifier = BibliographyClassifier(self.gv)

    if not found_bibliography:
        # run bibliographic classifier
        # aggression 4
        bibliography_classifier.run()

    # tei
    # aggression 3
    TeiManipulate(self.gv).run()

    # run tei to nlm conversion
    TeiToNlm(self.gv).run(not found_bibliography)

    if self.gv.settings.args['--purenlm']:
        self.debug.print_debug(self, u'Exiting as NLM transform complete')
        return

    manipulate = NlmManipulate(self.gv)

    if not self.gv.used_list_method:
        manipulate.fuse_references()

    # run reference linker
    if not self.args['--nolink']:
        rl = ReferenceLinker(self.gv)
        rl.run(self.args['--interactive'])
        rl.cleanup()

    # run table classifier
    cc = CaptionClassifier(self.gv)

    # each caption pass only runs above its configured aggression threshold
    if int(self.args['--aggression']) > int(self.gv.settings.get_setting('tablecaptions', self,
                                                                          domain='aggression')):
        cc.run_tables()

    if int(self.args['--aggression']) > int(self.gv.settings.get_setting('graphiccaptions', self,
                                                                          domain='aggression')):
        cc.run_graphics()

    # run metadata merge
    metadata.run()

    if self.args['--interactive']:
        bibliography_classifier.run_prompt(True)

    # process any bibliography entries that are possible
    BibliographyDatabase(self.gv).run()

    # remove stranded titles and cleanup
    manipulate.final_clean()

    if self.args['--identifiers']:
        IdGenerator(self.gv).run()

    if self.args['--chain']:
        # construct and run an XSLT chainer
        XslChain(self.gv).run()

    if self.args['--clean']:
        ComplianceEnforcer(self.gv).run()
def pre_clean(self):
    """
    Remove leading body lines that merely repeat the document's metadata

    Calls extract_metadata_fields() and then walks the head, p and cit elements of the TEI body in
    document order. An element is removed when its stripped text contains every component of a
    not-yet-matched entry of self.authors or, failing that, any entry of self.metadata. Removed elements
    are not counted; the walk stops once three candidate elements have been kept. The tree is saved
    afterwards.
    """
    self.extract_metadata_fields()

    manipulate = TeiManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    # get all elements in the body
    section = tree.xpath('//tei:body//*', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})

    items_to_match = [
        '{http://www.tei-c.org/ns/1.0}head',
        '{http://www.tei-c.org/ns/1.0}p',
        '{http://www.tei-c.org/ns/1.0}cit',
    ]

    # number of candidate elements seen so far that were NOT removed
    count = 0
    matched_authors = []

    for item in section:
        if count > 2:
            break

        if item.tag not in items_to_match:
            continue

        count += 1
        text = self.get_stripped_text(item)
        processed = False

        for author in self.authors:
            # each author may only account for one removed line
            if author in matched_authors:
                continue

            if all(component in text for component in author):
                # found a metadata line
                matched_authors.append(author)
                count -= 1
                item.getparent().remove(item)
                self.debug.print_debug(
                    self, u'Removed line "{0}" '
                          u'because it appears to be author metadata'.format(text))
                processed = True
                break

        if processed:
            continue

        for metadata in self.metadata:
            if metadata in text:
                # found a metadata line
                count -= 1
                item.getparent().remove(item)
                self.debug.print_debug(
                    self, u'Removed line "{0}" '
                          u'because it appears to be duplicated metadata'.format(text))
                # stop at the first hit: the element is now detached, so a second match would call
                # remove() on a None parent and would also decrement the counter twice
                break

    manipulate.save_tree(tree)