class LXMLParser(object):
    """Feed-style XML parser built on lxml's ``XMLPullParser``.

    The default XML parser in python based on expat has issues with badly
    formed XML. We work around this somewhat by using lxml for parsing,
    which allows recovering from certain types of broken XML.

    Parse events are forwarded to ``target`` through its
    ``start(tag, attrib)``, ``data(text)`` and ``end(tag)`` methods.
    """

    def __init__(self, target):
        """Create the pull parser and remember the object to notify.

        Args:
            target: Object exposing ``start``, ``data`` and ``end``.
        """
        self._parser = XMLPullParser(events=('start', 'end'), recover=True)
        self._target = target

    def handle_events(self):
        """Forward every event currently queued in the pull parser."""
        for action, element in self._parser.read_events():
            if action == 'start':
                self._target.start(element.tag, element.attrib)
            elif action == 'end':
                # NOTE(review): only ``element.text`` is forwarded, and only
                # once the element has ended; ``element.tail`` is never sent.
                if element.text:
                    self._target.data(element.text)
                self._target.end(element.tag)
                # Drop the element's content once it has been reported.
                element.clear()

    def feed(self, data):
        """Push ``data`` into the parser and forward the resulting events.

        Any exception raised by the underlying parser propagates unchanged.
        """
        self._parser.feed(data)
        self.handle_events()

    def close(self):
        """Finish parsing and forward events that are still queued.

        Closing may complete parsing of buffered input, so the event queue
        is drained once more; otherwise trailing ``start``/``end``
        notifications would never reach the target.
        """
        self._parser.close()
        self.handle_events()
def parse_wrapper(fpath, main_tag, parser):
    """Stream ``main_tag`` elements from a file to ``parser``.

    The file content is wrapped in a synthetic ``<root>`` element and fed
    line by line to a recovering pull parser. A line starting with
    ``<?xml`` is not fed; it triggers handling of all queued elements.
    Lines starting with ``<!DOCTYPE``, ``<!ENTITY`` or ``]>`` are dropped.

    Args:
        fpath: Path of the file to read.
        main_tag: Tag name whose ``end`` events are collected.
        parser: Callable invoked as ``parser(element, fname)``; a falsy
            return value stops processing.

    Returns:
        ``False`` as soon as ``parser`` returns a falsy value, otherwise
        ``True`` once the whole file has been handled.
    """
    fname = os.path.basename(fpath)
    pull = XMLPullParser(tag=main_tag, events=['end'], recover=True)

    def parse_all():
        # Hand every queued element to the callback, then to ``clear``.
        for _event, node in pull.read_events():
            keep_going = parser(node, fname)
            if not keep_going:
                return False
            clear(node)
        return True

    skipped_prefixes = ('<!DOCTYPE', '<!ENTITY', ']>')

    with open(fpath, errors='ignore') as handle:
        pull.feed('<root>\n')
        for line in handle:
            if line.startswith('<?xml'):
                # New XML declaration: process what has been queued so far.
                if not parse_all():
                    return False
                continue
            if line.startswith(skipped_prefixes):
                continue
            pull.feed(line)
        # Input exhausted: close the synthetic root and process the rest.
        pull.feed('</root>\n')
        return parse_all()
def count(self, xml_string):
    """Collect statistics for every ``record``/``dc`` element in the input.

    Feeds ``xml_string`` to a pull parser, wraps each matching element in
    a ``Record``, bumps the ``self.s`` counter and merges the record's
    stats into ``self.stats_aggregate`` unless its status is ``"deleted"``.

    Args:
        xml_string: XML content handed to the parser's ``feed``.

    Returns:
        ``self.stats_aggregate`` after all queued events were handled.
    """
    pull_parser = XMLPullParser(huge_tree=True)
    pull_parser.feed(xml_string)

    for _event, node in pull_parser.read_events():
        local_name = QName(node.tag).localname
        if local_name not in ("record", "dc"):
            continue

        record = Record(node)

        # Progress message every 1000 records, skipped for the very first.
        if self.s != 0 and (self.s % 1000) == 0:
            logging.info("%d records processed" % self.s)
        self.s += 1

        status = record.get_record_status()
        if status != "deleted":
            dc_breaker.collect_stats(self.stats_aggregate, record.get_stats())

        # The record has been consumed; drop its content.
        node.clear()

    pull_parser.close()
    return self.stats_aggregate
def parse_wrapper(fpath, main_tag, parser):
    """Run ``parser`` over each ``main_tag`` element found in ``fpath``.

    Lines are fed to a recovering pull parser inside an artificial
    ``<root>`` element. Lines starting with ``<?xml`` are withheld from
    the parser and instead trigger handling of the elements queued so
    far. ``<!DOCTYPE``, ``<!ENTITY`` and ``]>`` lines are ignored.

    Args:
        fpath: Path of the input file.
        main_tag: Tag whose ``end`` events are reported.
        parser: Callable taking ``(element, fname)``; returning a falsy
            value aborts processing.

    Returns:
        ``False`` if ``parser`` asked to stop, ``True`` otherwise.
    """
    fname = os.path.split(fpath)[1]
    stream = XMLPullParser(tag=main_tag, events=['end'], recover=True)

    def drain():
        # Pass each queued element to the callback and then to ``clear``.
        for _, element in stream.read_events():
            if parser(element, fname):
                clear(element)
            else:
                return False
        return True

    def is_dtd_line(text):
        # Declaration fragments that must not reach the pull parser.
        return text.startswith(('<!DOCTYPE', '<!ENTITY', ']>'))

    with open(fpath, errors='ignore') as source:
        stream.feed('<root>\n')
        for text in source:
            if text.startswith('<?xml'):
                finished_ok = drain()
                if not finished_ok:
                    return False
            elif not is_dtd_line(text):
                stream.feed(text)
        # End of file: terminate the artificial root, handle what is left.
        stream.feed('</root>\n')
        return drain()
address = ownref.find('ADR') if address is not None: pat['city'] = get_text(address, 'CITY/PDAT') pat['state'] = get_text(address, 'STATE/PDAT') pat['country'] = get_text(address, 'CTRY/PDAT') # abstract abspars = elem.findall('SDOAB/BTEXT/PARA') if len(abspars) > 0: pat['abstract'] = '\n'.join([raw_text(e) for e in abspars]) # roll it in return add_patent(pat) # parse mangled xml pp = XMLPullParser(tag='PATDOC', events=['end'], recover=True) def handle_all(): for _, pat in pp.read_events(): if not handle_patent(pat): return False return True with open(args.path, errors='ignore') as f: pp.feed('<root>\n') for line in f: if line.startswith('<?xml'): if not handle_all(): break elif line.startswith('<!DOCTYPE') or line.startswith( '<!ENTITY') or line.startswith(']>'):
def __init__(self, target):
    """Set up the pull parser and store the event target.

    The parser is created with ``recover=True`` and reports ``start`` and
    ``end`` events.

    Args:
        target: Object kept on ``self._target`` for later use.
    """
    pull_events = ('start', 'end')
    self._parser = XMLPullParser(events=pull_events, recover=True)
    self._target = target
if address is not None: pat['city'] = get_text(address, 'city') pat['state'] = get_text(address, 'state') pat['country'] = get_text(address, 'country') # abstract abspar = elem.find('abstract') if abspar is not None: pat['abstract'] = raw_text(abspar, sep=' ') # roll it in return add_patent(pat) # parse mangled xml pp = XMLPullParser(tag=main_tag, events=['end'], recover=True) def handle_all(): for (_, pat) in pp.read_events(): if not handle_patent(pat): return False return True with open(args.path, errors='ignore') as f: pp.feed('<root>\n') for line in f: if line.startswith('<?xml'): if not handle_all(): break