def pre_cleanup(self):
    """Unwrap headed divs from disallowed ancestors and split styled runs out of paragraphs.

    Works on the DOM tree loaded through ``TeiManipulate(self.gv)`` in two passes:

    1. For every ``tei:div`` that has a ``tei:head`` child, walk up its ancestors.
       Each ancestor that is neither a TEI ``div`` nor a TEI ``body`` is retagged
       ``REMOVE``; the walk stops at the first ``div``/``body``. If anything was
       retagged, the ``REMOVE`` tags are stripped, the tree is saved and the number
       of retagged ancestors is reported via ``self.debug``.
    2. For every ``tei:p`` holding ``tei:hi`` children whose ``rend`` contains
       "Indent", "Default Style" or "Text Body", each such ``hi`` is moved into a
       new ``p`` placed after the original paragraph, in document order.
    """
    tei_ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    permitted_wrappers = ('{http://www.tei-c.org/ns/1.0}div',
                          '{http://www.tei-c.org/ns/1.0}body')

    manipulate = TeiManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    # Pass 1: headed divs must not sit inside elements that would stop the XSL
    # from transforming them correctly.
    retagged = 0
    for headed_div in tree.xpath('//tei:div[tei:head]', namespaces=tei_ns):
        ancestor = headed_div.getparent()
        while ancestor is not None:
            if not ancestor.tag or ancestor.tag in permitted_wrappers:
                break
            # An ancestor already retagged by an earlier div is counted again.
            ancestor.tag = 'REMOVE'
            retagged += 1
            ancestor = ancestor.getparent()

    if retagged > 0:
        # strip_tags drops the REMOVE wrappers; the call itself does not delete children.
        etree.strip_tags(tree, 'REMOVE')
        manipulate.save_tree(tree)
        self.debug.print_debug(
            self, u'Extracted {0} headings from inside invalid elements'.format(retagged))

    # Pass 2: split any p tags with sub-tags hi rend="Indent" (etc.) into new elements.
    styled_hi = ('tei:hi[contains(@rend, "Indent") or contains(@rend, "Default Style") or '
                 'contains(@rend, "Text Body")]')

    for paragraph in tree.xpath('//tei:p[' + styled_hi + ']', namespaces=tei_ns):
        carries_rend = 'rend' in paragraph.attrib
        insert_after = paragraph

        for styled_run in paragraph.xpath(styled_hi, namespaces=tei_ns):
            split_off = etree.Element('p')
            if carries_rend:
                split_off.attrib['rend'] = paragraph.attrib['rend']

            # Chain each new paragraph after the previous one to keep source order.
            insert_after.addnext(split_off)
            split_off.append(styled_run)
            insert_after = split_off

        # NOTE(review): the source formatting was collapsed; saving and logging once
        # per paragraph is inferred from the message's use of the loop variable.
        manipulate.save_tree(tree)
        self.debug.print_debug(
            self, u'Separated out p {0}'.format(manipulate.get_stripped_text(paragraph)))
def pre_clean(self):
    """Remove leading body lines that repeat already-extracted metadata.

    Calls ``self.extract_metadata_fields()`` first, then scans the descendants of
    ``tei:body`` in the tree loaded through ``TeiManipulate(self.gv)``. Only TEI
    ``head``, ``p`` and ``cit`` elements are considered. An element is removed when
    its stripped text contains every component of a not-yet-matched entry of
    ``self.authors``, or otherwise when it contains any value of ``self.metadata``.

    ``count`` tracks considered elements that were kept; scanning stops once more
    than two have been kept. The tree is saved once at the end.
    """
    self.extract_metadata_fields()

    manipulate = TeiManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    # get all elements in the body
    section = tree.xpath('//tei:body//*', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})

    items_to_match = ('{http://www.tei-c.org/ns/1.0}head',
                      '{http://www.tei-c.org/ns/1.0}p',
                      '{http://www.tei-c.org/ns/1.0}cit')

    count = 0
    matched_authors = []

    for item in section:
        if count > 2:
            break

        if item.tag not in items_to_match:
            continue

        count += 1
        text = self.get_stripped_text(item)

        # Why this line is metadata, or None when it should be kept.
        reason = None

        for author in self.authors:
            # Each author may only account for one removed line.
            if author not in matched_authors and all(part in text for part in author):
                matched_authors.append(author)
                reason = u'author metadata'
                break

        # any() stops at the first hit, so the element is removed only once even
        # when several metadata values occur in the same line. Removing it a second
        # time would fail because a detached element no longer has a parent.
        if reason is None and any(value in text for value in self.metadata):
            reason = u'duplicated metadata'

        if reason is None:
            continue

        # found a metadata line: it does not count towards the kept-lines limit
        count -= 1
        item.getparent().remove(item)
        self.debug.print_debug(
            self, u'Removed line "{0}" because it appears to be {1}'.format(text, reason))

    manipulate.save_tree(tree)
def pre_cleanup(self):
    """Unwrap headed divs from disallowed ancestors and split styled runs out of paragraphs.

    Works on the DOM tree loaded through ``TeiManipulate(self.gv)`` in two passes:

    1. For every ``tei:div`` that has a ``tei:head`` child, walk up its ancestors.
       Each ancestor that is neither a TEI ``div`` nor a TEI ``body`` is retagged
       ``REMOVE``; the walk stops at the first ``div``/``body``. If anything was
       retagged, the ``REMOVE`` tags are stripped, the tree is saved and the number
       of retagged ancestors is reported via ``self.debug``.
    2. For every ``tei:p`` holding ``tei:hi`` children whose ``rend`` contains
       "Indent", "Default Style" or "Text Body", each such ``hi`` is moved into a
       new ``p`` placed after the original paragraph, in document order.
    """
    tei_ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    permitted_wrappers = ('{http://www.tei-c.org/ns/1.0}div',
                          '{http://www.tei-c.org/ns/1.0}body')

    manipulate = TeiManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    # Pass 1: headed divs must not sit inside elements that would stop the XSL
    # from transforming them correctly.
    retagged = 0
    for headed_div in tree.xpath('//tei:div[tei:head]', namespaces=tei_ns):
        ancestor = headed_div.getparent()
        while ancestor is not None:
            if not ancestor.tag or ancestor.tag in permitted_wrappers:
                break
            # An ancestor already retagged by an earlier div is counted again.
            ancestor.tag = 'REMOVE'
            retagged += 1
            ancestor = ancestor.getparent()

    if retagged > 0:
        # strip_tags drops the REMOVE wrappers; the call itself does not delete children.
        etree.strip_tags(tree, 'REMOVE')
        manipulate.save_tree(tree)
        self.debug.print_debug(
            self, u'Extracted {0} headings from inside invalid elements'.format(retagged))

    # Pass 2: split any p tags with sub-tags hi rend="Indent" (etc.) into new elements.
    styled_hi = ('tei:hi[contains(@rend, "Indent") or contains(@rend, "Default Style") or '
                 'contains(@rend, "Text Body")]')

    for paragraph in tree.xpath('//tei:p[' + styled_hi + ']', namespaces=tei_ns):
        carries_rend = 'rend' in paragraph.attrib
        insert_after = paragraph

        for styled_run in paragraph.xpath(styled_hi, namespaces=tei_ns):
            split_off = etree.Element('p')
            if carries_rend:
                split_off.attrib['rend'] = paragraph.attrib['rend']

            # Chain each new paragraph after the previous one to keep source order.
            insert_after.addnext(split_off)
            split_off.append(styled_run)
            insert_after = split_off

        # NOTE(review): the source formatting was collapsed; saving and logging once
        # per paragraph is inferred from the message's use of the loop variable.
        manipulate.save_tree(tree)
        self.debug.print_debug(
            self, u'Separated out p {0}'.format(manipulate.get_stripped_text(paragraph)))
def pre_clean(self):
    """Remove leading body lines that repeat already-extracted metadata.

    Calls ``self.extract_metadata_fields()`` first, then scans the descendants of
    ``tei:body`` in the tree loaded through ``TeiManipulate(self.gv)``. Only TEI
    ``head``, ``p`` and ``cit`` elements are considered. An element is removed when
    its stripped text contains every component of a not-yet-matched entry of
    ``self.authors``, or otherwise when it contains any value of ``self.metadata``.

    ``count`` tracks considered elements that were kept; scanning stops once more
    than two have been kept. The tree is saved once at the end.
    """
    self.extract_metadata_fields()

    manipulate = TeiManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    # get all elements in the body
    section = tree.xpath('//tei:body//*', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})

    items_to_match = ('{http://www.tei-c.org/ns/1.0}head',
                      '{http://www.tei-c.org/ns/1.0}p',
                      '{http://www.tei-c.org/ns/1.0}cit')

    count = 0
    matched_authors = []

    for item in section:
        if count > 2:
            break

        if item.tag not in items_to_match:
            continue

        count += 1
        text = self.get_stripped_text(item)

        # Why this line is metadata, or None when it should be kept.
        reason = None

        for author in self.authors:
            # Each author may only account for one removed line.
            if author not in matched_authors and all(part in text for part in author):
                matched_authors.append(author)
                reason = u'author metadata'
                break

        # any() stops at the first hit, so the element is removed only once even
        # when several metadata values occur in the same line. Removing it a second
        # time would fail because a detached element no longer has a parent.
        if reason is None and any(value in text for value in self.metadata):
            reason = u'duplicated metadata'

        if reason is None:
            continue

        # found a metadata line: it does not count towards the kept-lines limit
        count -= 1
        item.getparent().remove(item)
        self.debug.print_debug(
            self, u'Removed line "{0}" because it appears to be {1}'.format(text, reason))

    manipulate.save_tree(tree)