def __init__(self, known_metadata, docid, format=ARTFLVector, parallel=ARTFLParallels, xpaths=None, metadata_xpaths=None, token_regex=Default_Token_Regex, non_nesting_tags=None, self_closing_tags=None, pseudo_empty_tags=None, output=None):
    """Set up the parser state for a single document.

    Args:
        known_metadata: stored unchanged on the instance.
        docid: stored on the instance and passed to
            ``OHCOVector.CompoundStack``.
        format: passed to ``OHCOVector.CompoundStack`` as its first argument.
        parallel: passed to ``OHCOVector.CompoundStack`` as its second
            argument.
        xpaths: stored as ``self.map``; ``TEI_XPaths`` is used when this is
            falsy.
        metadata_xpaths: stored as ``self.metadata_paths``;
            ``TEI_MetadataXPaths`` is used when this is falsy.
        token_regex: stored unchanged on the instance.
        non_nesting_tags: list stored on the instance; a new empty list is
            created when omitted or ``None``.
        self_closing_tags: list stored on the instance; a new empty list is
            created when omitted or ``None``.
        pseudo_empty_tags: list stored on the instance; a new empty list is
            created when omitted or ``None``.
        output: passed to ``OHCOVector.CompoundStack`` as its last argument.
    """
    self.known_metadata = known_metadata
    self.docid = docid
    # The ingestor is created with this object as its target.
    self.i = shlaxtree.ShlaxIngestor(target=self)
    self.tree = None  # unnecessary?
    self.root = None
    self.stack = []
    self.map = xpaths or TEI_XPaths
    self.metadata_paths = metadata_xpaths or TEI_MetadataXPaths
    # OHCOVector should take an output file handle.
    self.v = OHCOVector.CompoundStack(format, parallel, docid, output)
    self.extractors = []
    self.file_position = 0
    self.token_regex = token_regex
    # The defaults used to be shared ``[]`` literals in the signature, so
    # every instance built without these arguments held the very same list
    # objects.  Allocate a fresh list per instance instead; a list supplied
    # by the caller is still stored as-is (not copied).
    self.non_nesting_tags = [] if non_nesting_tags is None else non_nesting_tags
    self.self_closing_tags = [] if self_closing_tags is None else self_closing_tags
    self.pseudo_empty_tags = [] if pseudo_empty_tags is None else pseudo_empty_tags
    self.pushed_tags = {}
    self.depth_pushed = {}
def parse(text):
    """Parse a markup fragment and return the ingestor's ``close()`` result.

    The text is first pushed straight through the
    ShlaxIngestor -> LXMLTreeDriver -> FragmentParser chain.  If that raises
    ``ValueError``, the text is re-read with lxml's HTML parser, serialized
    back out as XML, cleaned of the wrapper tags, and pushed through a new
    chain.
    """
    def _ingest(markup):
        # Build a fresh FragmentParser / LXMLTreeDriver / ShlaxIngestor chain
        # and run ``markup`` through it.
        fragment_parser = FragmentParser()
        tree_driver = LXMLTreeDriver(target=fragment_parser)
        ingestor = st.ShlaxIngestor(target=tree_driver)
        ingestor.feed(markup)
        return ingestor.close()

    try:
        return _ingest(text)
    except ValueError:
        # LXML's HTML parser is more flexible, so use it first and then hand
        # its output to the fragment parser.
        html_parser = etree.HTMLParser()
        decoded = text.decode('utf8', 'ignore')
        tree = etree.fromstring(decoded, parser=html_parser)
        new_text = etree.tostring(tree, method="xml")
        # Same substitutions, in the same order, as a chained .replace().
        substitutions = (
            ("<html><body>", ''),
            ("</body></html>", ''),
            ("philohighlight", "philoHighlight"),
        )
        for old, new in substitutions:
            new_text = new_text.replace(old, new)
        return _ingest(new_text)
def strip_tags(text):
    """Run ``text`` through a ShlaxIngestor that targets a FragmentStripper.

    Returns whatever the ingestor's ``close()`` returns.
    """
    ingestor = st.ShlaxIngestor(target=FragmentStripper())
    ingestor.feed(text)
    stripped = ingestor.close()
    return stripped
def parse(text):
    """Feed ``text`` through ShlaxIngestor -> LXMLTreeDriver -> FragmentParser.

    Returns whatever the ingestor's ``close()`` returns.
    """
    tree_driver = LXMLTreeDriver(target=FragmentParser())
    ingestor = st.ShlaxIngestor(target=tree_driver)
    ingestor.feed(text)
    parsed = ingestor.close()
    return parsed
def parse(self, text):
    """Reset ``self.tags`` and feed ``text`` to an ingestor targeting ``self``.

    ``self.close()`` is called once the text has been fed; its result is
    discarded, so this method returns ``None``.
    """
    # Start from an empty tag table on every call.
    self.tags = {}
    st.ShlaxIngestor(target=self).feed(text)
    self.close()
if census.tags[tag]["start"] != census.tags[tag]["end"]: status += "*" if census.tags[tag]["malformed"]: status += "X" print "%s\t%s\t%d\t%d\t%d\t%d" % ( status, tag, census.tags[tag]["start"], census.tags[tag]["end"], census.tags[tag]["empty"], census.tags[tag]["malformed"]) if __name__ == "__main__": file_count = 0 total = None for fn in sys.argv[1:]: file_count += 1 census = TagCensus() parser = st.ShlaxIngestor(target=census) parser.feed(open(fn).read()) print fn print_census(census) if total: for tag in census.tags.keys(): if tag not in total.tags: total.tags[tag] = { "start": 0, "end": 0, "empty": 0, "malformed": 0 } total.tags[tag]["start"] += census.tags[tag]["start"] total.tags[tag]["end"] += census.tags[tag]["end"] total.tags[tag]["empty"] += census.tags[tag]["empty"]