Esempio n. 1
0
class LXMLParser(object):
    """XML parser using lxml.

    The default XML parser in Python, based on expat, has issues with badly
    formed XML. We work around this somewhat by using lxml for parsing, which
    allows recovering from certain types of broken XML.

    Parsed events are forwarded to a *target* object that must provide
    ``start(tag, attrib)``, ``data(text)`` and ``end(tag)`` callbacks.
    """

    def __init__(self, target):
        """Create the recovering pull parser and remember *target*.

        Args:
            target: Object receiving ``start``/``data``/``end`` callbacks.
        """
        self._parser = XMLPullParser(events=('start', 'end'), recover=True)
        self._target = target

    def handle_events(self):
        """Forward every pending parser event to the target.

        For each 'start' event the target's ``start`` is called with the
        element's tag and attributes. For each 'end' event the element's
        text (if any) is passed to ``data`` and then ``end`` is called.
        """
        for action, element in self._parser.read_events():
            if action == 'start':
                self._target.start(element.tag, element.attrib)
            elif action == 'end':
                # NOTE(review): the text is reported only once the element
                # ends, i.e. after the callbacks of any child elements, and
                # ``element.tail`` is never forwarded -- confirm that the
                # target does not depend on mixed-content ordering.
                if element.text:
                    self._target.data(element.text)
                self._target.end(element.tag)
                # Reset the element once it has been reported; presumably
                # this keeps processed content from accumulating.
                element.clear()

    def feed(self, data):
        """Feed a chunk of XML to the parser and dispatch resulting events.

        Exceptions raised by the underlying parser propagate unchanged.
        """
        self._parser.feed(data)
        self.handle_events()

    def close(self):
        """Signal the end of input to the underlying parser."""
        self._parser.close()
Esempio n. 2
0
def parse_wrapper(fpath, main_tag, parser):
    """Feed a file line by line to a recovering pull parser.

    The file content is wrapped in a synthetic ``<root>`` element. Lines
    starting with ``<!DOCTYPE``, ``<!ENTITY`` or ``]>`` are dropped, and a
    line starting with ``<?xml`` is not fed either; instead it triggers
    processing of all events collected so far.

    Args:
        fpath: Path of the file to read (opened with ``errors='ignore'``).
        main_tag: Tag whose 'end' events are reported by the pull parser.
        parser: Callable invoked as ``parser(element, file_name)``; a falsy
            return value aborts the run.

    Returns:
        ``False`` as soon as ``parser`` returns a falsy value, else ``True``.
    """
    fname = os.path.basename(fpath)
    puller = XMLPullParser(tag=main_tag, events=['end'], recover=True)
    skipped_prefixes = ('<!DOCTYPE', '<!ENTITY', ']>')

    def drain():
        # Hand every pending element to the callback; ``clear`` (defined
        # outside this function) is applied only after a successful callback.
        for _event, node in puller.read_events():
            if not parser(node, fname):
                return False
            clear(node)
        return True

    with open(fpath, errors='ignore') as handle:
        puller.feed('<root>\n')
        for line in handle:
            if line.startswith('<?xml'):
                if not drain():
                    return False
            elif not line.startswith(skipped_prefixes):
                puller.feed(line)
        # The loop never breaks, so this always runs once the file is read.
        puller.feed('</root>\n')
        return drain()
Esempio n. 3
0
    def count(self, xml_string):
        """Collect statistics for the record elements found in *xml_string*.

        Every parsed element whose local tag name is ``record`` or ``dc`` is
        wrapped in a ``Record``; unless its status is ``"deleted"`` its stats
        are merged into ``self.stats_aggregate``. ``self.s`` counts the
        records seen and a progress line is logged every 1000 records.

        Returns:
            ``self.stats_aggregate`` after processing.
        """
        pull_parser = XMLPullParser(huge_tree=True)
        pull_parser.feed(xml_string)
        for _event, node in pull_parser.read_events():
            if QName(node.tag).localname not in ("record", "dc"):
                continue

            record = Record(node)

            if self.s != 0 and (self.s % 1000) == 0:
                logging.info("%d records processed" % self.s)
            self.s += 1

            status = record.get_record_status()
            if status != "deleted":
                dc_breaker.collect_stats(self.stats_aggregate,
                                         record.get_stats())
            # Reset the element once its record has been handled.
            node.clear()
        pull_parser.close()
        return self.stats_aggregate
Esempio n. 4
0
def parse_wrapper(fpath, main_tag, parser):
    """Stream a file through a recovering pull parser, line by line.

    A synthetic ``<root>`` element is opened before the first line and
    closed after the last one. ``<?xml`` lines are not fed to the parser;
    each one causes the events gathered so far to be processed. Lines that
    start with ``<!DOCTYPE``, ``<!ENTITY`` or ``]>`` are skipped.

    Args:
        fpath: Path of the input file (read with ``errors='ignore'``).
        main_tag: Tag name the pull parser reports 'end' events for.
        parser: Callback called as ``parser(element, file_name)``; returning
            a falsy value stops processing.

    Returns:
        ``True`` if every callback invocation returned a truthy value,
        ``False`` otherwise.
    """
    fname = os.path.split(fpath)[1]
    pull_parser = XMLPullParser(tag=main_tag, events=['end'], recover=True)
    ignored = ('<!DOCTYPE', '<!ENTITY', ']>')

    def parse_all():
        # ``clear`` is a helper defined outside this function.
        for _event, element in pull_parser.read_events():
            if parser(element, fname):
                clear(element)
            else:
                return False
        return True

    with open(fpath, errors='ignore') as source:
        pull_parser.feed('<root>\n')
        for line in source:
            if line.startswith('<?xml'):
                if not parse_all():
                    return False
                continue
            if line.startswith(ignored):
                continue
            pull_parser.feed(line)
        # Reached whenever the loop finishes, since it never breaks.
        pull_parser.feed('</root>\n')
        return parse_all()
Esempio n. 5
0
            address = ownref.find('ADR')
            if address is not None:
                pat['city'] = get_text(address, 'CITY/PDAT')
                pat['state'] = get_text(address, 'STATE/PDAT')
                pat['country'] = get_text(address, 'CTRY/PDAT')

        # abstract
        abspars = elem.findall('SDOAB/BTEXT/PARA')
        if len(abspars) > 0:
            pat['abstract'] = '\n'.join([raw_text(e) for e in abspars])

        # roll it in
        return add_patent(pat)

    # parse mangled xml
    pp = XMLPullParser(tag='PATDOC', events=['end'], recover=True)

    def handle_all():
        """Run ``handle_patent`` on every pending parser event.

        Stops at the first falsy result and returns ``False`` (remaining
        events are left unread); returns ``True`` otherwise.
        """
        return all(handle_patent(pat) for _, pat in pp.read_events())

    with open(args.path, errors='ignore') as f:
        pp.feed('<root>\n')
        for line in f:
            if line.startswith('<?xml'):
                if not handle_all():
                    break
            elif line.startswith('<!DOCTYPE') or line.startswith(
                    '<!ENTITY') or line.startswith(']>'):
Esempio n. 6
0
 def __init__(self, target):
     """Remember *target* and create a recovering start/end pull parser."""
     self._target = target
     self._parser = XMLPullParser(recover=True, events=('start', 'end'))
Esempio n. 7
0
        if address is not None:
            pat['city'] = get_text(address, 'city')
            pat['state'] = get_text(address, 'state')
            pat['country'] = get_text(address, 'country')

        # abstract
        abspar = elem.find('abstract')
        if abspar is not None:
            pat['abstract'] = raw_text(abspar, sep=' ')

        # roll it in
        return add_patent(pat)


# parse mangled xml
pp = XMLPullParser(tag=main_tag, events=['end'], recover=True)


def handle_all():
    """Pass each pending parser element to ``handle_patent``.

    Returns ``False`` as soon as ``handle_patent`` returns a falsy value,
    leaving later events unread; returns ``True`` if all calls succeed.
    """
    patents = (patent for _event, patent in pp.read_events())
    return all(map(handle_patent, patents))


with open(args.path, errors='ignore') as f:
    pp.feed('<root>\n')
    for line in f:
        if line.startswith('<?xml'):
            if not handle_all():
                break