def read_patent(file):
    """Stream-parse a patent-grant bulk XML file from the local ``data`` dir.

    Bulk files concatenate many XML documents, each with its own ``<?xml``
    header, so a fresh pull parser is started whenever a new header line is
    seen and the previous document has ended.

    Args:
        file: object whose ``name.values[0]`` holds the file name to read
            (e.g. a pandas row); ``pgb*`` files use the ``PATDOC`` root tag,
            ``ipgb*`` files use ``us-patent-grant``.
    """
    def _new_parser(filename):
        # Root tag differs between the old (pgb) and new (ipgb) formats.
        # Factored out: the original repeated this construction three times.
        if filename.startswith("pgb"):
            return etree.XMLPullParser(tag='PATDOC',
                                       events=['end'],
                                       recover=True)
        if filename.startswith("ipgb"):
            return etree.XMLPullParser(tag='us-patent-grant',
                                       events=['end'],
                                       recover=True)
        return None

    storage_dir = "data"
    storage_path = os.path.join(os.getcwd(), storage_dir)
    filename = file.name.values[0]
    path = os.path.join(storage_path, filename)
    pat_xml = _new_parser(filename)
    with open(path, 'r') as lines:
        for line in lines:
            if line.startswith("<?xml"):
                # A new embedded document begins: flush the finished one
                # and restart the parser.
                if patent_ends(pat_xml, filename):
                    pat_xml.close()
                    pat_xml = _new_parser(filename)
            # Removing unwanted DTD/entity nodes the feed parser can't use.
            elif line.startswith("<!DOCTYPE") or line.startswith(
                    "]>") or line.startswith('<!ENTITY'):
                pass
            else:
                # Using feed parser for line-by-line parsing.
                pat_xml.feed(line)
# Example #2
def parse_file(file_to_parse):
    """
    Parse a file with multiple junos outputs in xml format
    Args: file_to_parse (file): an open file to parse
    Returns: A list of lxml trees with <rpc-reply> root elements
    """
    # Copy only the XML lines into tmp.xml, dropping prompts, blank lines
    # and other noise.  (The original reopened tmp.xml in append mode for
    # every line and leaked the final read handle.)
    with open("tmp.xml", 'w') as f:
        for line in file_to_parse:
            stripped = line.strip()
            if stripped.startswith('</rpc-reply>'):
                f.write(line)
                # Terminate each document so the next <rpc-reply> starts
                # on a fresh line.
                f.write('\n')
            elif stripped.startswith('<'):
                f.write(line)

    # File tmp.xml is now clean XML: a sequence of "rpc-reply" documents.
    list_of_xml_trees = []
    parser = etree.XMLPullParser(events=['end'], recover=True)
    with open("tmp.xml") as f:
        for line in f:
            parser.feed(line)
            for action, element in parser.read_events():
                if (action == 'end') and (element.tag == 'rpc-reply'):
                    # close() returns the finished tree; start a fresh
                    # parser (same ['end'] events as the initial one — the
                    # original inconsistently switched to start+end here).
                    list_of_xml_trees.append(parser.close())
                    parser = etree.XMLPullParser(events=['end'],
                                                 recover=True)
    return list_of_xml_trees
# Example #3
 def init_parser(self):
     """Reset the XML parsing state for a new connexion.

     The pull parser cannot be reused across connexions, so the parser,
     the cached root element and the depth counter are all re-created.
     """
     self.parser = ET.XMLPullParser(("start", "end"))
     self.xml_root = None
     self.xml_depth = 0
# Example #4
def parse_file(file_to_parse):
    """Parse a file with multiple junos outputs in xml format.

    Args:
        file_to_parse (file): an open file to parse
    Returns:
        A list of lxml trees with <rpc-reply> root elements
    """
    list_of_xml_trees = []
    parser = etree.XMLPullParser(events=['end'], recover=True)
    for line in file_to_parse:
        parser.feed(line)
        for action, element in parser.read_events():
            if (action == 'end') and (element.tag == 'rpc-reply'):
                # Collect the finished tree and start a fresh parser for
                # the next concatenated document.  Bug fix: the original
                # rebuilt the parser after *every* input line, throwing
                # away any partially-fed document.
                list_of_xml_trees.append(parser.close())
                parser = etree.XMLPullParser(events=['end'], recover=True)
    return list_of_xml_trees
    def readAll(self):
        """Read the complete XML response from the channel.

        First drains the echoed command string (command plus newline),
        then keeps receiving until the peer closes the connection or a
        complete XML document has been fed to the parser.

        Returns the received payload decoded as UTF-8.
        """
        self.first_element = None
        self.parser = etree.XMLPullParser(('start', 'end'))

        # The server echoes the command followed by a newline; consume
        # exactly that many bytes before the real payload starts.
        echo_len = len(self.cmd) + 1
        consumed = 0
        while consumed < echo_len:
            consumed += len(self.channel.recv(echo_len - consumed))

        chunks = []
        while True:
            data = self.channel.recv(BUF_SIZE)
            if not data:
                # Connection was closed by server
                break
            self.parser.feed(data)
            chunks.append(data)
            if self.valid_xml():
                break

        return b''.join(chunks).decode('utf-8')
# Example #6 — File: api.py, Project: DolphDev/sans
    async def __aiter__(self) -> _AsyncGenerator[NSElement, None]:
        """Stream this API value, yielding one NSElement per completed tag.

        Raises:
            RuntimeError: when the global user agent has not been set.
        """
        if not Api.agent:
            raise RuntimeError("The API's user agent is not yet set.")

        url = self.value
        # pylint: disable=E1101
        # Shard tag is the singular, upper-cased name (e.g. REGIONS -> REGION).
        tag = self.name.upper().rstrip("S")

        # Report only 'end' events for the shard tag; elements are
        # materialised as NSElement via the class lookup below.
        parser = etree.XMLPullParser(["end"],
                                     base_url=url,
                                     remove_blank_text=True,
                                     tag=tag)
        parser.set_element_class_lookup(
            etree.ElementDefaultClassLookup(element=NSElement))
        events = parser.read_events()
        # 16 + MAX_WBITS tells zlib to expect a gzip wrapper.
        dobj = zlib.decompressobj(16 + zlib.MAX_WBITS)

        async with Api.session.request("GET",
                                       url,
                                       headers={"User-Agent":
                                                Api.agent}) as response:
            async for data, _ in response.content.iter_chunks():
                parser.feed(dobj.decompress(data))
                for _, element in events:
                    yield element
                    # Free the element's own content, then delete any
                    # already-consumed older siblings to cap memory use.
                    element.clear()
                    while element.getparent(
                    ) is not None and element.getprevious() is not None:
                        del element.getparent()[0]
# Example #7
    async def __aiter__(
            self) -> _AsyncGenerator[objectify.ObjectifiedElement, None]:
        """Stream this category's dump, yielding one objectified element per
        completed shard tag.

        Raises:
            RuntimeError: when the global user agent has not been set.
        """
        if not Api.agent:
            raise RuntimeError("The API's user agent is not yet set.")

        url = self.url
        # Shard tag is the singular, upper-cased category name.
        tag = self._category.name.upper().rstrip("S")
        # 16 + MAX_WBITS: the payload arrives gzip-wrapped.
        dobj = zlib.decompressobj(16 + zlib.MAX_WBITS)

        async with Api.session.request("GET",
                                       url,
                                       headers={"User-Agent":
                                                Api.agent}) as response:
            self._last_response = response
            response.raise_for_status()

            # Any XMLSyntaxError from the stream is silently suppressed;
            # closing() disposes of the parser either way.
            with contextlib.suppress(etree.XMLSyntaxError), contextlib.closing(
                    etree.XMLPullParser(["end"],
                                        base_url=url,
                                        remove_blank_text=True,
                                        tag=tag)) as parser:
                parser.set_element_class_lookup(
                    objectify.ObjectifyElementClassLookup())
                events = parser.read_events()

                async for data, _ in response.content.iter_chunks():
                    parser.feed(dobj.decompress(data))
                    for _, element in events:
                        yield element
                        # keep_tail preserves trailing whitespace while
                        # releasing the element's children and text.
                        element.clear(keep_tail=True)
# Example #8
 def _start_xml(self):
     """Prepare a fresh pull parser for a new XML document.

     The parser acts on start and end element events; ``huge_tree`` is
     enabled so very large text data (report content) is accepted.
     """
     self._first_element = None
     self._parser = etree.XMLPullParser(events=("start", "end"),
                                        huge_tree=True)
# Example #9
    def get_jmeter_data(self):
        """Extract the next complete sample from the buffered JMeter XML.

        Returns the first standalone httpSample immediately, or a composite
        sample (with its nested httpSample children) once its closing tag is
        reached.  Returns None when the buffer has no complete sample or an
        error occurs (which is logged via collectd).
        """
        try:
            in_sample = False
            children = []

            parser = et.XMLPullParser(['start', 'end'])
            parser.feed(self.buf)

            for event, elem in parser.read_events():
                if event == 'start':
                    if elem.tag == 'sample':
                        # A composite sample opened; start collecting its
                        # nested httpSample children.
                        in_sample = True
                        children = []
                elif elem.tag == 'httpSample':
                    parsed = self.get_sample(elem)
                    if not in_sample:
                        # Standalone request sample: return it right away.
                        return parsed
                    children.append(parsed)
                elif elem.tag == 'sample':
                    composite = self.get_sample(elem, children)
                    in_sample = False
                    return composite

        except Exception as err:
            collectd.error("Plugin jmeter: Exception in get_jmeter_data due to %s" % err)
# Example #10
def file_to_annotation_docs(fileObj):
    """Convert an annotation corpus file into a list of document dicts.

    Each dict carries the document text ('currentText'), the doc id
    ('nid') and, when present, an 'annotation' with the teacher id and the
    list of mistake spans.  Spans are encoded as paragraph-index * 1000
    concatenated with the in-paragraph offset.
    """
    parser = etree.XMLPullParser(events=('start', 'end'))
    docs = []
    doc = None
    annotation = None
    mistake = None
    text_buf = ''
    # Work around multiple root elements in the original file.
    parser.feed(b'<EVERYTHING>\n')
    for raw_line in fileObj.readlines():
        parser.feed(raw_line)
        for action, element in parser.read_events():
            starting = action == 'start'
            tag = element.tag
            if starting and tag == 'DOC':
                doc = {'currentText': '', 'nid': element.attrib['nid']}
                text_buf = ''
            elif not starting and tag == 'DOC':
                doc['currentText'] = text_buf
                docs.append(doc)
                doc = None
            elif starting and tag == 'TEXT':
                pass
            elif not starting and tag in ['TITLE', 'P']:
                # Accumulate paragraph/title text for the current doc.
                text_buf += element.text
            elif starting and tag == 'ANNOTATION':
                annotation = {
                    'teacher_id': element.attrib['teacher_id'],
                    'mistakes': []
                }
                doc['annotation'] = annotation
            elif starting and tag == 'MISTAKE':
                mistake = {
                    'start':
                    str(int(element.attrib['start_par']) * 1000) +
                    element.attrib['start_off'],
                    'end':
                    str(int(element.attrib['end_par']) * 1000) +
                    element.attrib['end_off']
                }
            elif not starting and tag == 'MISTAKE':
                annotation['mistakes'].append(mistake)
            elif not starting and tag == 'TYPE':
                mistake['type'] = element.text
            elif not starting and tag == 'CORRECTION':
                mistake['corr'] = element.text
    parser.feed(b'</EVERYTHING>\n')

    return docs
# Example #11
def xmlParse(f):
    """Feed the whole of file *f* through a pull parser line by line.

    print_events() is invoked after every line so events can be reported
    as they become available.  Closes *f* when done.

    Returns:
        The fully parsed root element.  (Bug fix: the original computed
        the result with parser.close() and then silently dropped it.)
    """
    parser = etree.XMLPullParser()

    for line in f.readlines():
        parser.feed(line)
        print_events(parser)

    result = parser.close()
    f.close()
    return result
# Example #12
    async def __aiter__(
        self,
        *,
        clear: bool = True
    ) -> _AsyncGenerator[objectify.ObjectifiedElement, None]:
        """Perform the request and yield the parsed shard elements.

        Sends NationStates auth headers (password / autologin / cached pin)
        and caches any pin returned.  With ``clear`` (the default), only
        direct children of the document root are yielded and each one is
        cleared after being consumed.

        Raises:
            RuntimeError: when the user agent is unset or the query is a
                telegram request.
            BadRequest: when the query is empty.
        """
        if not Api.agent:
            raise RuntimeError("The API's user agent is not yet set.")
        if "a" in self and self["a"].lower() == "sendtg":
            raise RuntimeError(
                "This API wrapper does not support API telegrams.")
        if not self:
            # Preempt the request to conserve ratelimit
            raise BadRequest()
        url = str(self)

        headers = {"User-Agent": Api.agent}
        if self._password:
            headers["X-Password"] = self._password
        autologin = self.autologin
        if autologin:
            headers["X-Autologin"] = autologin
        if self.get("nation") in PINS:
            headers["X-Pin"] = PINS[self["nation"]]

        async with Api.session.request("GET", url,
                                       headers=headers) as response:
            self._last_response = response
            # A returned autologin token supersedes the raw password.
            if "X-Autologin" in response.headers:
                self._password = None
            # Cache the session pin for subsequent requests.
            if "X-Pin" in response.headers:
                PINS[self["nation"]] = response.headers["X-Pin"]
            response.raise_for_status()

            # Charset comes from the Content-Type header, e.g.
            # "text/xml; charset=utf-8".
            encoding = (response.headers["Content-Type"].split("charset=")
                        [1].split(",")[0])
            with contextlib.suppress(etree.XMLSyntaxError), contextlib.closing(
                    etree.XMLPullParser(["end"],
                                        base_url=url,
                                        remove_blank_text=True)) as parser:
                parser.set_element_class_lookup(
                    objectify.ObjectifyElementClassLookup())
                events = parser.read_events()

                async for data, _ in response.content.iter_chunks():
                    parser.feed(data.decode(encoding))
                    for _, element in events:
                        # With clear, skip the root itself (no parent) and
                        # anything nested deeper than root's children
                        # (has a grandparent).
                        if clear and (element.getparent() is None
                                      or element.getparent().getparent()
                                      is not None):
                            continue
                        yield element
                        if clear:
                            element.clear(keep_tail=True)
# Example #13
async def parsexml(session: aiohttp.ClientSession) -> "AsyncIterator":
    """Parses ~100MB gzipped XML in chunks (for memory profiling purposes).

    Downloads the PIR psd7003 dataset, decompresses it incrementally and
    yields each completed <ProteinEntry> element, clearing it afterwards
    to keep memory flat.  (Bug fix: the return annotation claimed ``str``
    although this is an async generator of elements.)
    """
    import zlib
    url = "http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/pir/psd7003.xml.gz"
    # MAX_WBITS | 16 tells zlib to expect a gzip wrapper.
    d = zlib.decompressobj(zlib.MAX_WBITS | 16)
    parser = etree.XMLPullParser()
    async for chunk in iterread(url, session, 1024):
        parser.feed(d.decompress(chunk))
        for _, elem in parser.read_events():
            if elem.tag == r"ProteinEntry":
                yield elem
                # Release the entry's content once the consumer is done.
                elem.clear()
async def parsexml(session: aiohttp.ClientSession) -> "AsyncIterator[str]":
    """
    Parses elementtree from streamed input (by chunks) in-memory.

    Yields the URL text of every completed SHOPITEM element, clearing the
    element afterwards.  (Bug fix: the return annotation claimed ``str``
    although this is an async generator; also the item URL no longer
    clobbers the feed ``url`` variable.)
    """
    url = "http://www.fit-pro.cz/export/fitpro-cf6ad8215df1f1cf993029a1684d5251.xml"
    ns = {"zbozi": "http://www.zbozi.cz/ns/offer/1.0"}
    parser = etree.XMLPullParser()
    async for chunk in iterread(url, session, 1024):
        parser.feed(chunk)
        for _, elem in parser.read_events():
            if elem.tag == r"{http://www.zbozi.cz/ns/offer/1.0}SHOPITEM":
                # NOTE(review): '//zbozi:URL' searches from the document
                # root, not from this item — confirm that is intended.
                item_url = elem.xpath('//zbozi:URL', namespaces=ns)[0]
                elem.clear()
                yield item_url.text
# Example #15
def _get_vtk_type(path):
    '''We use the incremental event emitting parser
    here since we can expect to encounter appended binary data in the xml
    which lxml cannot parse.
    :param path: vtk file to peek into
    :return: None if no VTKFile element found, else the type attribute of the VTKFile element
    '''
    parser = etree.XMLPullParser(events=('start', ))
    with open(path, 'rb') as xml:
        for lines in xml.readlines():
            parser.feed(lines)
            for action, element in parser.read_events():
                if element.tag == 'VTKFile':
                    return element.get('type')
    return None
# Example #16
class FindElement(object):
    '''
    Helper for locating content in an XML document.

    NOTE(review): ``pullParser`` is a *class* attribute, shared by every
    instance — confirm that is intended before relying on it.
    '''
    pullParser = etree.XMLPullParser()

    def __init__(self, xmlDoc):
        '''
        Constructor

        :param xmlDoc: the XML document to search in
        '''
        self.xml = xmlDoc

    def find(self, aString):
        '''Search for *aString* in the document.  Not implemented yet.

        Bug fix: the original signature was ``find(aString)`` without
        ``self``, so instance calls received the instance as *aString*.
        '''
        pass
# Example #17
def get_datapoints_from_xml():
    """Yield handled StopPoint data from the zipped NaPTAN XML export.

    Streams the single member of NaPTANxml.zip through a pull parser in
    1 KiB chunks, yields handle_stoppoint() for every active StopPoint
    element, and cleans each StopPoint up afterwards.
    """
    with zipfile.ZipFile("naptandata/NaPTANxml.zip") as container:
        # The archive is expected to hold exactly one member.
        [contentname] = container.namelist()
        with container.open(contentname) as f:
            parser = etree.XMLPullParser(events=("end", ), no_network=True)
            chunk = f.read(1024)
            while chunk:
                parser.feed(chunk)
                for action, elem in parser.read_events():
                    if elem.tag.endswith('StopPoint'):
                        if elem.get("Status") == "active":
                            yield handle_stoppoint(elem)
                        cleanup(elem)
                chunk = f.read(1024)
# Example #18
def element_iterator(stream, events=('start',)):
    """Yield a generator of pull-parser events read from *stream*.

    Bug fixes over the original:
      * the default ``events`` was a mutable list shared between calls;
      * EOF was tested with ``line is None`` although ``read()`` returns
        an empty string/bytes at EOF, so the inner loop never terminated.

    :param stream: file-like object, read in 256-byte slices
    :param events: parser event names to report (default: start only)
    """
    def _iter(stream, parser):
        while True:
            chunk = stream.read(256)
            if not chunk:
                # read() yields '' (or b'') at EOF, never None.
                return
            parser.feed(chunk)
            for event in parser.read_events():
                yield event

    parser = etree.XMLPullParser(events)
    try:
        yield _iter(stream, parser)
    finally:
        del parser
# Example #19
    def parse_references(self, file_path, source):
        """Send a PDF to Grobid and store every extracted reference.

        Posts *file_path* to the Grobid service, pull-parses the returned
        TEI for biblStruct elements and creates a SingleReference row per
        valid reference.

        :param file_path: path to the PDF to analyse
        :param source: value stored on each created SingleReference
        :return: False when Grobid rejects the upload; otherwise None
        :raises GrobidException: missing file or non-PDF input
        :raises ConnectionError: when Grobid does not answer
        """
        if not os.path.isfile(file_path):
            raise GrobidException("File {} does not exist".format(file_path))
        if not file_path.endswith(".pdf"):
            raise GrobidException("File {} is not pdf".format(file_path))
        try:
            # Probe request to check the service is reachable.
            # NOTE(review): this catches the *builtin* ConnectionError;
            # confirm requests' ConnectionError is aliased to it here.
            requests.post(self.grobid_url)
        except ConnectionError:
            raise ConnectionError("Grobid does not answer")

        # file and connection are OK

        with open(file_path, "rb") as f:
            ref_handler = requests.post(self.grobid_url, files={"input": f})
        if ref_handler.ok is False:
            self.logger.error("File {} error {} when sending to Grobid".format(
                file_path, ref_handler.status_code))
            return False
        # Only biblStruct elements (namespaced) are reported by the parser.
        parser2 = etree.XMLPullParser(tag="{}biblStruct".format(
            self.namespace),
                                      load_dtd=True)
        parser2.feed(ref_handler.text)
        # NOTE(review): `result` is shared across biblStruct elements, so
        # monogr/analytic values carry over into later references unless
        # overwritten — confirm this is intended.
        result = {}
        for action, elem in parser2.read_events():
            try:
                for i in elem:
                    if i.tag.replace(self.namespace, '') == "monogr":
                        result["monogr"] = self.parseMonogr(i)
                    if i.tag.replace(self.namespace, '') == "analytic":
                        result["analytic"] = self.parseMonogr(i)

                    # NOTE(review): get_reference runs once per *child*,
                    # and `reference` stays unbound when a biblStruct has
                    # no children (NameError below) — confirm.
                    reference = self.get_reference(result)
                    self.logger.debug("Original : %s", result)
                    self.logger.debug("Reference: %s", reference)
            except Exception as e:
                self.logger.critical("Reference Error %s", e)
                reference = None

            if reference is None:
                self.logger.debug("Invalid reference")
            else:
                # Authors are stored as a msgpack-packed ';'-joined string.
                author_list = msgpack.packb(";".join(reference["authors"]))
                SingleReference.objects.create(source=source,
                                               title=reference["title"],
                                               authors=author_list,
                                               date=reference["pubyear"])
# Example #20
    def parse_self_xml(self):
        """Parse self.page (a namespaced XML traffic feed) into records.

        Every child of each entity element becomes a key/value pair;
        records missing any required field are skipped.  Capture dates are
        normalised to "%Y-%m-%d %H:%M:%S" and both timestamps are also
        stored as seconds since 1970.

        Returns:
            list of record dicts; [] on parse error.
        """
        current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        parser = etree.XMLPullParser(events=('start', ))
        records = []

        required_keys = ('link_id', 'region', 'road_type',
                         'road_saturation_level', 'traffic_speed',
                         'capture_date')

        def parse_tag(raw_tag):
            # Strip the '{namespace}' prefix from a qualified tag name.
            return raw_tag.split('}')[1]

        try:
            parser.feed(self.page)
        except XMLSyntaxError as err:
            print('XMLSyntaxError: ' + str(err))
        else:
            for action, element in parser.read_events():
                if parse_tag(element.tag) != self.entity_tag:
                    continue
                segs = [{parse_tag(t.tag.lower()): t.text} for t in element]
                record = {}
                for seg in segs:
                    record.update(seg)

                # Bug fix: the original `if 'link_id' and 'region' and ...
                # in record` only tested membership of the LAST key — every
                # other operand was just a truthy string literal.
                if all(key in record for key in required_keys):
                    record['fetch_time'] = current_time

                    # Normalise the capture date format.
                    r_time = time.strptime(record['CAPTURE_DATE'.lower()],
                                           "%Y-%m-%dT%H:%M:%S")
                    record['CAPTURE_DATE'.lower()] = time.strftime(
                        "%Y-%m-%d %H:%M:%S", r_time)

                    # Store the seconds from 1970
                    capture_date_1970 = float(time.mktime(r_time))
                    current_time_1970 = float(
                        time.mktime(
                            time.strptime(current_time, "%Y-%m-%d %H:%M:%S")))
                    record['capture_date_1970'] = capture_date_1970
                    record['fetch_time_1970'] = current_time_1970

                    records.append(record)
                else:
                    print("invalid record")
        finally:
            # NOTE(review): returning from `finally` swallows any exception
            # raised above; kept to preserve the original contract.
            return records
# Example #21
    def is_empty(self, gml_path, zip_path):
        """
        Detect if the file is empty.

        Cadastre empty files (usually otherconstruction) comes with a null
        feature and results in a non valid layer in QGIS.
        """
        source = self.get_file_object(gml_path, zip_path)
        header = source.read(2000)
        source.close()

        # Count start-element events within the first 2000 bytes; a real
        # layer opens at least three elements there.
        parser = etree.XMLPullParser(["start"])
        parser.feed(header)
        start_count = sum(
            1 for event, elem in parser.read_events() if event == "start")
        try:
            parser.close()
        except etree.XMLSyntaxError:
            # Truncated XML is expected — only the prefix was read.
            pass
        return start_count < 3
# Example #22
def iter_elements(xmlfile, interesting_tags):
	"""Parses a large xmlfile, yielding any listed tag.

	Removes tags from the tree after yielding: do not mark nested
	tags as interesting.
	"""
	parser = etree.XMLPullParser(events=("end",), no_network=True)
	chunk = xmlfile.read(1024)
	while chunk:
		parser.feed(chunk)
		for action, elem in parser.read_events():
			# Match on the local name, ignoring any namespace prefix.
			local_name = elem.tag.split("}")[-1]
			if local_name in interesting_tags:
				yield local_name, elem
				cleanup(elem)
		chunk = xmlfile.read(1024)
	parser.close()
# Example #23
    def readAll(self):
        """Receive from the stdout channel until EOF or a complete XML doc.

        Returns the accumulated payload decoded as UTF-8.
        """
        self.first_element = None
        self.parser = etree.XMLPullParser(('start', 'end'))

        pieces = []
        while True:
            data = self.stdout.channel.recv(BUF_SIZE)
            if not data:
                # Connection was closed by server
                break

            self.parser.feed(data)
            pieces.append(data)

            if self.valid_xml():
                break
        return b''.join(pieces).decode('utf-8')
    def __init__(self, src_lang_code, dst_lang_code, input_stream):
        """Initializes `TmxParser`.

        Args:
          src_lang_code: String - source language code in BCP 47 spec.
          dst_lang_code: String - target language code in BCP 47 spec.
          input_stream: io stream - tmx stream that implemented file interface.
        """
        super(TmxParser, self).__init__()
        self._src_lang = _parse_locale(src_lang_code)
        self._dst_lang = _parse_locale(dst_lang_code)
        self._tmx_stream = input_stream
        # Incremental parser plus its (lazily consumed) event stream.
        self._parser = etree.XMLPullParser(events=('start', 'end'))
        self._events = self._parser.read_events()
        # Buffered translation pairs and the read cursor into them.
        self._buffered_parsed_pairs = []
        self._buffered_pairs_index = 0
        # Current stack of tag names. It should start with empty string.
        self._tag_name_stack = ['']
        # Parsing-progress flags and the stream's text encoding.
        self._header_inited = False
        self._body_inited = False
        self._encoding = 'utf-8'
# Example #25
    async def __aiter__(self):
        """Stream this API value, yielding a synthetic HEADERS element first
        and then one parsed element per completed tag.
        """
        url = self.value

        parser = etree.XMLPullParser(["end"],
                                     base_url=url,
                                     remove_blank_text=True)
        parser.set_element_class_lookup(
            etree.ElementDefaultClassLookup(element=_NSElement))
        events = parser.read_events()
        # 16 + MAX_WBITS tells zlib to expect a gzip wrapper.
        dobj = zlib.decompressobj(16 + zlib.MAX_WBITS)

        async with Api.session.request("GET",
                                       url,
                                       headers={"User-Agent":
                                                Api.agent}) as response:
            # Expose the response headers to the consumer as a fake element.
            yield parser.makeelement("HEADERS", attrib=response.headers)
            async for data, _ in response.content.iter_chunks():
                parser.feed(dobj.decompress(data))
                for _, element in events:
                    yield element
                    # Release the element's content once consumed.
                    element.clear()
# Example #26 — File: api.py, Project: DolphDev/sans
    async def __aiter__(
            self,
            *,
            no_clear: bool = False) -> _AsyncGenerator[NSElement, None]:
        """Perform the request and yield parsed NSElement shards.

        Unless ``no_clear`` is set, only direct children of the document
        root are yielded and each is cleared (with its older siblings
        deleted) after being consumed, keeping memory bounded.

        Raises:
            RuntimeError: unset user agent, or a telegram request.
            ValueError: when the query is empty.
        """
        if not self.agent:
            raise RuntimeError("The API's user agent is not yet set.")
        if not self:
            # Preempt the request to conserve ratelimit
            raise ValueError("Bad request")
        if "a" in self and self["a"].lower() == "sendtg":
            raise RuntimeError(
                "This API wrapper does not support API telegrams.")
        url = str(self)

        parser = etree.XMLPullParser(["end"],
                                     base_url=url,
                                     remove_blank_text=True)
        parser.set_element_class_lookup(
            etree.ElementDefaultClassLookup(element=NSElement))
        events = parser.read_events()

        async with self.session.request("GET",
                                        url,
                                        headers={"User-Agent":
                                                 self.agent}) as response:
            # Charset comes from the Content-Type header, e.g.
            # "text/xml; charset=utf-8".
            encoding = response.headers["Content-Type"].split(
                "charset=")[1].split(",")[0]
            async for data, _ in response.content.iter_chunks():
                parser.feed(data.decode(encoding))
                for _, element in events:
                    # By default, skip the root itself (no parent) and
                    # anything nested deeper than the root's children.
                    if not no_clear and (element.getparent() is None
                                         or element.getparent().getparent()
                                         is not None):
                        continue
                    yield element
                    if no_clear:
                        continue
                    # Free the element, then drop already-consumed older
                    # siblings to cap memory use.
                    element.clear()
                    while element.getprevious() is not None:
                        del element.getparent()[0]
# Example #27
def to_list(filename):
    """Read a resource-style XML file into a list of entry dicts.

    Each <data> element becomes a dict carrying its `name` attribute plus
    one key per child tag.  Double quotes in `value` and `comment` are
    normalised to single quotes, and backslashes in `value` are escaped.

    :param filename: path of the XML file to load
    :return: list of dicts, one per <data> element
    """
    html = etree.parse(filename, etree.XMLPullParser())
    all_data = html.xpath('//data')
    res = []
    for data in all_data:
        d = {'name': data.attrib['name']}
        # Iterate the element directly: getchildren() is deprecated.
        for child in data:
            d[child.tag] = child.text
        value = d.get('value', '')
        value = value.replace('"', '\'').replace("\\", r"\\")
        d['value'] = value
        comment = d.get('comment', '')
        comment = comment.replace('"', '\'')
        d['comment'] = comment
        res.append(d)
    return res
# Example #28
    async def ingest(self):
        """
        Task that reads all incoming messages and puts them into the queue for handling.
        """
        # Bug fix: `parser` used to be created only when the first chunk
        # started with an XML header, so any other first chunk crashed
        # with NameError.  Start with a parser and replace it per header.
        parser = etree.XMLPullParser(events=['start', 'end'])
        while True:
            # Read the next tag from the reader
            data = await self.reader.readuntil(b">")

            # If we receive a new XML header, start a new parser.
            if data.startswith(b"<?xml"):
                parser = etree.XMLPullParser(events=['start', 'end'])

            # Feed the data to the parser
            parser.feed(data)

            # Look for tags that we have handlers for
            for event, element in parser.read_events():
                if event == 'start' and element.tag == '{http://etherx.jabber.org/streams}stream':
                    logger.debug(f"RECEIVED {ensure_str(element)}")
                    await self.on_open_stream(element)
                elif event == 'end' and element.tag in self.handlers:
                    logger.debug(f"RECEIVED {ensure_str(element)}")
                    # Dispatch handlers concurrently; don't block ingest.
                    asyncio.create_task(self.handlers[element.tag](element))
def generate_with_progress(response, path: str):
    """Yield pull-parser events for a gzipped XML download, with progress.

    The DTD is resolved relative to *path* (via base_url) and validated
    while parsing.  Progress is measured by the raw stream position
    against the Content-Length header; Ctrl-C stops iteration cleanly and
    the bar is always finished.
    """
    stream = response.raw
    parser = etree.XMLPullParser(
        events=['start', 'end'],
        base_url=path,
        load_dtd=True,
        dtd_validation=True,
    )
    total = int(response.headers['Content-Length'])
    bar_suffix = '%(percent)d%% %(elapsed_td)s (ETA: %(eta_td)s)'
    progress = Bar(suffix=bar_suffix, max=total, hide_cursor=False)
    try:
        for piece in decompress_gzip_stream(stream):
            parser.feed(piece)
            yield from parser.read_events()
            pos = stream.tell()
            # Only ever move the bar forward, otherwise the ETA is
            # screwed up.
            if pos > progress.index:
                progress.goto(pos)
    except KeyboardInterrupt:
        pass
    finally:
        progress.finish()
# Example #30
    async def __aiter__(self, *, clear: bool = True):
        """Perform the request, yielding a synthetic HEADERS element first
        and then one parsed element per completed tag.

        With ``clear`` (the default), each element is cleared after being
        consumed.

        Raises:
            ValueError: when the query is empty.
        """
        if not self:
            raise ValueError("Bad request")
        url = str(self)

        parser = etree.XMLPullParser(["end"],
                                     base_url=url,
                                     remove_blank_text=True)
        parser.set_element_class_lookup(
            etree.ElementDefaultClassLookup(element=_NSElement))
        events = parser.read_events()

        async with type(self).session.request(
                "GET", url, headers={"User-Agent":
                                     type(self).agent}) as response:
            # Expose the response headers to the consumer as a fake element.
            yield parser.makeelement("HEADERS", attrib=response.headers)
            # Charset comes from the Content-Type header, e.g.
            # "text/xml; charset=utf-8".
            encoding = response.headers["Content-Type"].split(
                "charset=")[1].split(",")[0]
            async for data, _ in response.content.iter_chunks():
                parser.feed(data.decode(encoding))
                for _, element in events:
                    yield element
                    if clear:
                        element.clear()