def read_patent(file):
    count = 0
    storage_dir = "data"
    storage_path = os.path.join(os.getcwd(), storage_dir)
    path = os.path.join(storage_path, file.name.values[0])
    # declaring XMLPullParser with end event
    if file.name.values[0].startswith("pgb"):
        pat_xml = etree.XMLPullParser(tag='PATDOC', events=['end'], recover=True)
    elif file.name.values[0].startswith("ipgb"):
        pat_xml = etree.XMLPullParser(tag='us-patent-grant', events=['end'], recover=True)
    with open(path, 'r') as lines:
        for line in lines:
            if line.startswith("<?xml"):
                if patent_ends(pat_xml, file.name.values[0]):
                    pat_xml.close()
                if file.name.values[0].startswith("pgb"):
                    pat_xml = etree.XMLPullParser(tag='PATDOC', events=['end'], recover=True)
                elif file.name.values[0].startswith("ipgb"):
                    pat_xml = etree.XMLPullParser(tag='us-patent-grant', events=['end'], recover=True)
            # Removing unwanted nodes
            elif line.startswith("<!DOCTYPE") or line.startswith("]>") or line.startswith('<!ENTITY'):
                pass
            else:
                # Using the feed parser for line-by-line parsing
                pat_xml.feed(line)
def parse_file(file_to_parse):
    """Parse a file with multiple Junos outputs in XML format.

    Args:
        file_to_parse (file): an open file to parse

    Returns:
        A list of lxml trees with <rpc-reply> root elements
    """
    # Truncate the temporary file.
    f = open("tmp.xml", 'w')
    f.close()
    # Get rid of everything that is not XML (blank lines, prompts and other noise).
    for line in file_to_parse:
        line1 = line.strip()
        if line1.startswith('</rpc-reply>'):
            f = open("tmp.xml", 'a')
            f.write(line)
            f.write('\n')
            f.close()
        elif line1.startswith('<'):
            f = open("tmp.xml", 'a')
            f.write(line)
            f.close()
        else:
            continue
    # File tmp.xml is now clean XML containing a set of "rpc-reply" elements.
    f = open("tmp.xml")
    list_of_xml_trees = []
    parser = etree.XMLPullParser(events=['end'], recover=True)
    for line in f:
        parser.feed(line)
        for action, element in parser.read_events():
            if (action == 'end') and (element.tag == 'rpc-reply'):
                list_of_xml_trees.append(parser.close())
                parser = etree.XMLPullParser(events=['end'], recover=True)
    return list_of_xml_trees
def init_parser(self):
    """Init the XML parser. The parser must always be reset for each new connection."""
    self.xml_depth = 0
    self.xml_root = None
    self.parser = ET.XMLPullParser(("start", "end"))
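# Hedged sketch (not from the original source): one way a class that calls
# init_parser above might consume incoming data. The data_received method and
# the handle_stanza handler are assumptions for illustration; xml_depth and
# xml_root come from init_parser itself.
def data_received(self, data):
    self.parser.feed(data)
    for event, element in self.parser.read_events():
        if event == "start":
            if self.xml_root is None:
                self.xml_root = element  # remember the stream's root element
            self.xml_depth += 1
        elif event == "end":
            self.xml_depth -= 1
            if self.xml_depth == 1:
                # A complete top-level child has been parsed; hand it off.
                self.handle_stanza(element)  # hypothetical handler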
def parse_file(file_to_parse):
    """Parse a file with multiple Junos outputs in XML format.

    Args:
        file_to_parse (file): an open file to parse

    Returns:
        A list of lxml trees with <rpc-reply> root elements
    """
    list_of_xml_trees = []
    parser = etree.XMLPullParser(events=['end'], recover=True)
    for line in file_to_parse:
        parser.feed(line)
        for action, element in parser.read_events():
            if (action == 'end') and (element.tag == 'rpc-reply'):
                list_of_xml_trees.append(parser.close())
                parser = etree.XMLPullParser(events=['end'], recover=True)
    return list_of_xml_trees
def readAll(self):
    self.first_element = None
    self.parser = etree.XMLPullParser(('start', 'end'))
    read_bytes = 0
    garbage_bytes = len(self.cmd) + 1
    # Remove command string from result
    while read_bytes != garbage_bytes:
        read_bytes += len(self.channel.recv(garbage_bytes - read_bytes))
    response = b''
    while True:
        data = self.channel.recv(BUF_SIZE)
        # Connection was closed by server
        if not data:
            break
        self.parser.feed(data)
        response += data
        if self.valid_xml():
            break
    return response.decode('utf-8')
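# Hedged sketch (an assumption, not the original implementation): valid_xml()
# as used by readAll above could remember the tag of the first start event and
# report True once the matching end event has been seen, i.e. once a complete
# XML document has been received.
def valid_xml(self):
    for event, element in self.parser.read_events():
        if event == 'start' and self.first_element is None:
            self.first_element = element.tag
        elif event == 'end' and element.tag == self.first_element:
            return True
    return False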
async def __aiter__(self) -> _AsyncGenerator[NSElement, None]:
    if not Api.agent:
        raise RuntimeError("The API's user agent is not yet set.")
    url = self.value  # pylint: disable=E1101
    tag = self.name.upper().rstrip("S")
    parser = etree.XMLPullParser(["end"],
                                 base_url=url,
                                 remove_blank_text=True,
                                 tag=tag)
    parser.set_element_class_lookup(
        etree.ElementDefaultClassLookup(element=NSElement))
    events = parser.read_events()
    dobj = zlib.decompressobj(16 + zlib.MAX_WBITS)
    async with Api.session.request(
            "GET", url, headers={"User-Agent": Api.agent}) as response:
        async for data, _ in response.content.iter_chunks():
            parser.feed(dobj.decompress(data))
            for _, element in events:
                yield element
                element.clear()
                while (element.getparent() is not None
                       and element.getprevious() is not None):
                    del element.getparent()[0]
async def __aiter__(
        self) -> _AsyncGenerator[objectify.ObjectifiedElement, None]:
    if not Api.agent:
        raise RuntimeError("The API's user agent is not yet set.")
    url = self.url
    tag = self._category.name.upper().rstrip("S")
    dobj = zlib.decompressobj(16 + zlib.MAX_WBITS)
    async with Api.session.request(
            "GET", url, headers={"User-Agent": Api.agent}) as response:
        self._last_response = response
        response.raise_for_status()
        with contextlib.suppress(etree.XMLSyntaxError), contextlib.closing(
                etree.XMLPullParser(["end"],
                                    base_url=url,
                                    remove_blank_text=True,
                                    tag=tag)) as parser:
            parser.set_element_class_lookup(
                objectify.ObjectifyElementClassLookup())
            events = parser.read_events()
            async for data, _ in response.content.iter_chunks():
                parser.feed(dobj.decompress(data))
                for _, element in events:
                    yield element
                    element.clear(keep_tail=True)
def _start_xml(self):
    self._first_element = None
    # act on start and end element events and
    # allow huge text data (for report content)
    self._parser = etree.XMLPullParser(
        events=("start", "end"), huge_tree=True
    )
def get_jmeter_data(self):
    try:
        sample_started = False
        sample_children = []
        parser = et.XMLPullParser(['start', 'end'])
        parser.feed(self.buf)
        for event, elem in parser.read_events():
            if event == 'start' and elem.tag == 'sample':
                sample_started = True
                sample_children = []
            elif event == 'end' and elem.tag == 'httpSample':
                sample = self.get_sample(elem)
                if sample_started:
                    sample_children.append(sample)
                else:
                    return sample
            elif event == 'end' and elem.tag == 'sample':
                sample = self.get_sample(elem, sample_children)
                sample_started = False
                return sample
    except Exception as err:
        collectd.error("Plugin jmeter: Exception in get_jmeter_data due to %s" % err)
def file_to_annotation_docs(fileObj):
    parser = etree.XMLPullParser(events=('start', 'end'))
    docs = []
    currentDoc = None
    currentAnnotation = None
    currentMistake = None
    currentText = ''
    # work around multiple roots in original doc
    parser.feed(b'<EVERYTHING>\n')
    for line in fileObj.readlines():
        parser.feed(line)
        for action, element in parser.read_events():
            if (action == 'start' and element.tag == 'DOC'):
                currentDoc = {'currentText': '', 'nid': element.attrib['nid']}
                currentText = ''
                continue
            if (action == 'end' and element.tag == 'DOC'):
                currentDoc['currentText'] = currentText
                docs.append(currentDoc)
                currentDoc = None
                continue
            if (action == 'start' and element.tag == 'TEXT'):
                continue
            if (action == 'end' and element.tag in ['TITLE', 'P']):
                currentText += element.text
                continue
            if (action == 'start' and element.tag == 'ANNOTATION'):
                currentAnnotation = {
                    'teacher_id': element.attrib['teacher_id'],
                    'mistakes': []
                }
                currentDoc['annotation'] = currentAnnotation
                continue
            if (action == 'start' and element.tag == 'MISTAKE'):
                currentMistake = {
                    'start': str(int(element.attrib['start_par']) * 1000) + element.attrib['start_off'],
                    'end': str(int(element.attrib['end_par']) * 1000) + element.attrib['end_off']
                }
                continue
            if (action == 'end' and element.tag == 'MISTAKE'):
                currentAnnotation['mistakes'].append(currentMistake)
                continue
            if (action == 'end' and element.tag == 'TYPE'):
                currentMistake['type'] = element.text
                continue
            if (action == 'end' and element.tag == 'CORRECTION'):
                currentMistake['corr'] = element.text
                continue
    parser.feed(b'</EVERYTHING>\n')
    return docs
def xmlParse(f):
    parser = etree.XMLPullParser()
    events = parser.read_events()
    # for line in f.readlines():
    #     print(line)
    for line in f.readlines():
        parser.feed(line)
        print_events(parser)
    result = parser.close()
    f.close()
async def __aiter__(
    self, *, clear: bool = True
) -> _AsyncGenerator[objectify.ObjectifiedElement, None]:
    if not Api.agent:
        raise RuntimeError("The API's user agent is not yet set.")
    if "a" in self and self["a"].lower() == "sendtg":
        raise RuntimeError(
            "This API wrapper does not support API telegrams.")
    if not self:
        # Preempt the request to conserve ratelimit
        raise BadRequest()
    url = str(self)
    headers = {"User-Agent": Api.agent}
    if self._password:
        headers["X-Password"] = self._password
    autologin = self.autologin
    if autologin:
        headers["X-Autologin"] = autologin
    if self.get("nation") in PINS:
        headers["X-Pin"] = PINS[self["nation"]]
    async with Api.session.request("GET", url, headers=headers) as response:
        self._last_response = response
        if "X-Autologin" in response.headers:
            self._password = None
        if "X-Pin" in response.headers:
            PINS[self["nation"]] = response.headers["X-Pin"]
        response.raise_for_status()
        encoding = (response.headers["Content-Type"].split("charset=")
                    [1].split(",")[0])
        with contextlib.suppress(etree.XMLSyntaxError), contextlib.closing(
                etree.XMLPullParser(["end"],
                                    base_url=url,
                                    remove_blank_text=True)) as parser:
            parser.set_element_class_lookup(
                objectify.ObjectifyElementClassLookup())
            events = parser.read_events()
            async for data, _ in response.content.iter_chunks():
                parser.feed(data.decode(encoding))
                for _, element in events:
                    if clear and (element.getparent() is None
                                  or element.getparent().getparent() is not None):
                        continue
                    yield element
                    if clear:
                        element.clear(keep_tail=True)
async def parsexml(session: aiohttp.ClientSession) -> str:
    """
    Parses ~100MB gzipped XML in chunks (for memory profiling purposes)
    """
    import zlib
    url = "http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/pir/psd7003.xml.gz"
    d = zlib.decompressobj(zlib.MAX_WBITS | 16)
    parser = etree.XMLPullParser()
    async for chunk in iterread(url, session, 1024):
        chunk = d.decompress(chunk)
        parser.feed(chunk)
        for _, elem in parser.read_events():
            if elem.tag == r"ProteinEntry":
                yield elem
                elem.clear()
async def parsexml(session: aiohttp.ClientSession) -> str:
    """
    Parses an element tree from streamed input (by chunks) in memory
    """
    url = "http://www.fit-pro.cz/export/fitpro-cf6ad8215df1f1cf993029a1684d5251.xml"
    ns = {"zbozi": "http://www.zbozi.cz/ns/offer/1.0"}
    parser = etree.XMLPullParser()
    async for chunk in iterread(url, session, 1024):
        parser.feed(chunk)
        for _, elem in parser.read_events():
            if elem.tag == r"{http://www.zbozi.cz/ns/offer/1.0}SHOPITEM":
                url = elem.xpath('//zbozi:URL', namespaces=ns)[0]
                elem.clear()
                yield url.text
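# Hedged sketch of the iterread helper assumed by both parsexml coroutines
# above. The name and signature are taken from the call sites; the body is an
# illustration using aiohttp's chunked streaming API, not the original code.
async def iterread(url: str, session: aiohttp.ClientSession, chunk_size: int):
    async with session.get(url) as response:
        response.raise_for_status()
        # Yield raw response bytes in fixed-size chunks.
        async for chunk in response.content.iter_chunked(chunk_size):
            yield chunk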
def _get_vtk_type(path):
    '''We use the incremental event-emitting parser here, since we can expect
    to encounter appended binary data in the xml which lxml cannot parse.

    :param path: vtk file to peek into
    :return: None if no VTKFile element found, else the type attribute
        of the VTKFile element
    '''
    parser = etree.XMLPullParser(events=('start',))
    with open(path, 'rb') as xml:
        for lines in xml.readlines():
            parser.feed(lines)
            for action, element in parser.read_events():
                if element.tag == 'VTKFile':
                    return element.get('type')
    return None
class FindElement(object):
    '''
    classdocs
    '''

    pullParser = etree.XMLPullParser()

    def __init__(self, xmlDoc):
        '''
        Constructor
        '''
        self.xml = xmlDoc
        # self.tree = pullP

    def find(self, aString):
        pass
def get_datapoints_from_xml():
    with zipfile.ZipFile("naptandata/NaPTANxml.zip") as container:
        [contentname] = container.namelist()
        with container.open(contentname) as f:
            parser = etree.XMLPullParser(events=("end",), no_network=True)
            while True:
                data = f.read(1024)
                if not data:
                    break
                parser.feed(data)
                for action, elem in parser.read_events():
                    if elem.tag.endswith('StopPoint'):
                        if elem.get("Status") == "active":
                            yield handle_stoppoint(elem)
                        cleanup(elem)
def element_iterator(stream, events=['start']):
    def _iter(stream, parser):
        while True:
            line = stream.read(256)
            if not line:  # EOF: read() returns an empty string/bytes, not None
                return
            parser.feed(line)
            for event in parser.read_events():
                yield event

    parser = etree.XMLPullParser(events)
    try:
        yield _iter(stream, parser)
    finally:
        del parser
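# Hedged usage sketch for element_iterator above: the function yields a single
# generator of pull-parser events, so callers iterate twice. The file name is
# illustrative only.
with open("document.xml", "rb") as stream:
    for event_stream in element_iterator(stream, events=["start", "end"]):
        for event, element in event_stream:
            print(event, element.tag)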
def parse_references(self, file_path, source):
    if not os.path.isfile(file_path):
        raise GrobidException("File {} does not exist".format(file_path))
    if not file_path.endswith(".pdf"):
        raise GrobidException("File {} is not pdf".format(file_path))
    try:
        requests.post(self.grobid_url)
    except ConnectionError:
        raise ConnectionError("Grobid does not answer")
    # file and connection are OK
    with open(file_path, "rb") as f:
        ref_handler = requests.post(self.grobid_url, files={"input": f})
    if ref_handler.ok is False:
        self.logger.error("File {} error {} when sending to Grobid".format(
            file_path, ref_handler.status_code))
        return False
    parser2 = etree.XMLPullParser(tag="{}biblStruct".format(self.namespace),
                                  load_dtd=True)
    parser2.feed(ref_handler.text)
    result = {}
    for action, elem in parser2.read_events():
        try:
            for i in elem:
                if i.tag.replace(self.namespace, '') == "monogr":
                    result["monogr"] = self.parseMonogr(i)
                if i.tag.replace(self.namespace, '') == "analytic":
                    result["analytic"] = self.parseMonogr(i)
            reference = self.get_reference(result)
            self.logger.debug("Original : %s", result)
            self.logger.debug("Reference: %s", reference)
        except Exception as e:
            self.logger.critical("Reference Error %s", e)
            reference = None
        if reference is None:
            self.logger.debug("Invalid reference")
        else:
            author_list = msgpack.packb(";".join(reference["authors"]))
            SingleReference.objects.create(source=source,
                                           title=reference["title"],
                                           authors=author_list,
                                           date=reference["pubyear"])
def parse_self_xml(self):
    current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    parser = etree.XMLPullParser(events=('start',))
    records = []

    def parse_tag(raw_tag):
        return raw_tag.split('}')[1]

    try:
        parser.feed(self.page)
    except XMLSyntaxError as err:
        print('XMLSyntaxError: ' + str(err))
    else:
        for action, element in parser.read_events():
            if parse_tag(element.tag) != self.entity_tag:
                continue
            segs = [{parse_tag(t.tag.lower()): t.text} for t in element]
            record = {}
            for seg in segs:
                record.update(seg)
            # Check that every required field is present in the record.
            required = ('link_id', 'region', 'road_type',
                        'road_saturation_level', 'traffic_speed',
                        'capture_date')
            if all(key in record for key in required):
                record['fetch_time'] = current_time
                # The time is record['CAPTURE_DATE'], to be revised
                r_time = time.strptime(record['CAPTURE_DATE'.lower()],
                                       "%Y-%m-%dT%H:%M:%S")
                record['CAPTURE_DATE'.lower()] = time.strftime(
                    "%Y-%m-%d %H:%M:%S", r_time)
                # Store the seconds from 1970
                capture_date_1970 = float(time.mktime(r_time))
                current_time_1970 = float(
                    time.mktime(
                        time.strptime(current_time, "%Y-%m-%d %H:%M:%S")))
                record['capture_date_1970'] = capture_date_1970
                record['fetch_time_1970'] = current_time_1970
                records.append(record)
            else:
                print("invalid record")
    finally:
        return records
def is_empty(self, gml_path, zip_path):
    """
    Detect if the file is empty. Cadastre empty files (usually
    otherconstruction) come with a null feature and result in a
    non-valid layer in QGIS.
    """
    fo = self.get_file_object(gml_path, zip_path)
    text = fo.read(2000)
    fo.close()
    parser = etree.XMLPullParser(["start"])
    parser.feed(text)
    events = list(parser.read_events())
    try:
        parser.close()
    except etree.XMLSyntaxError:
        pass
    return len([event for event, elem in events if event == "start"]) < 3
def iter_elements(xmlfile, interesting_tags):
    """Parses a large xmlfile, yielding any listed tag.

    Removes tags from the tree after yielding: do not mark nested tags
    as interesting.
    """
    parser = etree.XMLPullParser(events=("end",), no_network=True)
    while True:
        data = xmlfile.read(1024)
        if not data:
            break
        parser.feed(data)
        for action, elem in parser.read_events():
            tagname = elem.tag.split("}")[-1]
            if tagname in interesting_tags:
                yield tagname, elem
                cleanup(elem)
    parser.close()
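# Hedged sketch (an assumption) of the cleanup helper called by
# get_datapoints_from_xml and iter_elements above: a common idiom with lxml
# pull parsing is to clear the yielded element and drop already-processed
# preceding siblings so the partially built tree does not grow without bound.
def cleanup(elem):
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]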
def readAll(self):
    self.first_element = None
    self.parser = etree.XMLPullParser(('start', 'end'))
    response = b''
    while True:
        data = self.stdout.channel.recv(BUF_SIZE)
        # Connection was closed by server
        if not data:
            break
        self.parser.feed(data)
        response += data
        if self.valid_xml():
            break
    return response.decode('utf-8')
def __init__(self, src_lang_code, dst_lang_code, input_stream):
    """Initializes `TmxParser`.

    Args:
        src_lang_code: String - source language code in BCP 47 spec.
        dst_lang_code: String - target language code in BCP 47 spec.
        input_stream: io stream - tmx stream that implements the file interface.
    """
    super(TmxParser, self).__init__()
    self._src_lang = _parse_locale(src_lang_code)
    self._dst_lang = _parse_locale(dst_lang_code)
    self._tmx_stream = input_stream
    self._parser = etree.XMLPullParser(events=('start', 'end'))
    self._events = self._parser.read_events()
    self._buffered_parsed_pairs = []
    self._buffered_pairs_index = 0
    # Current stack of tag names. It should start with an empty string.
    self._tag_name_stack = ['']
    self._header_inited = False
    self._body_inited = False
    self._encoding = 'utf-8'
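# Hedged sketch (not the original class): one way the parser created in
# __init__ above could be driven to pull translation pairs out of a TMX
# stream. The tu/tuv/seg tag names and xml:lang attribute follow the TMX
# format; the method name _read_next_pairs is an assumption for illustration.
def _read_next_pairs(self):
    data = self._tmx_stream.read(4096)
    if not data:
        return
    self._parser.feed(data)
    for event, element in self._events:
        if event == 'end' and element.tag == 'tu':
            pair = {}
            for tuv in element.iterfind('tuv'):
                lang = tuv.get('{http://www.w3.org/XML/1998/namespace}lang')
                pair[lang] = tuv.findtext('seg')
            self._buffered_parsed_pairs.append(pair)
            element.clear()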
async def __aiter__(self):
    url = self.value
    parser = etree.XMLPullParser(["end"],
                                 base_url=url,
                                 remove_blank_text=True)
    parser.set_element_class_lookup(
        etree.ElementDefaultClassLookup(element=_NSElement))
    events = parser.read_events()
    dobj = zlib.decompressobj(16 + zlib.MAX_WBITS)
    async with Api.session.request(
            "GET", url, headers={"User-Agent": Api.agent}) as response:
        yield parser.makeelement("HEADERS", attrib=response.headers)
        async for data, _ in response.content.iter_chunks():
            parser.feed(dobj.decompress(data))
            for _, element in events:
                yield element
                element.clear()
async def __aiter__(
        self, *, no_clear: bool = False) -> _AsyncGenerator[NSElement, None]:
    if not self.agent:
        raise RuntimeError("The API's user agent is not yet set.")
    if not self:
        # Preempt the request to conserve ratelimit
        raise ValueError("Bad request")
    if "a" in self and self["a"].lower() == "sendtg":
        raise RuntimeError(
            "This API wrapper does not support API telegrams.")
    url = str(self)
    parser = etree.XMLPullParser(["end"],
                                 base_url=url,
                                 remove_blank_text=True)
    parser.set_element_class_lookup(
        etree.ElementDefaultClassLookup(element=NSElement))
    events = parser.read_events()
    async with self.session.request(
            "GET", url, headers={"User-Agent": self.agent}) as response:
        encoding = response.headers["Content-Type"].split(
            "charset=")[1].split(",")[0]
        async for data, _ in response.content.iter_chunks():
            parser.feed(data.decode(encoding))
            for _, element in events:
                if not no_clear and (element.getparent() is None
                                     or element.getparent().getparent() is not None):
                    continue
                yield element
                if no_clear:
                    continue
                element.clear()
                while element.getprevious() is not None:
                    del element.getparent()[0]
def to_list(filename):
    html = etree.parse(filename, etree.XMLPullParser())
    all_data = html.xpath('//data')
    res = []
    for data in all_data:
        d = {}
        name = data.attrib['name']
        d.update(dict(name=name))
        children = data.getchildren()
        for child in children:
            tag = child.tag
            value = child.text
            d.update({tag: value})
        value = d.get('value', '')
        value = value.replace('"', '\'').replace("\\", r"\\")
        d['value'] = value
        comment = d.get('comment', '')
        # update
        comment = comment.replace('"', '\'')
        d['comment'] = comment
        res.append(d)
    return res
async def ingest(self):
    """
    Task that reads all incoming messages and puts them into the queue
    for handling.
    """
    while True:
        # Read the next tag from the reader
        data = await self.reader.readuntil(b">")

        # If we receive a new XML header, start a new parser.
        if data.startswith(b"<?xml"):
            parser = etree.XMLPullParser(events=['start', 'end'])

        # Feed the data to the parser
        parser.feed(data)

        # Look for tags that we have handlers for
        for event, element in parser.read_events():
            if event == 'start' and element.tag == '{http://etherx.jabber.org/streams}stream':
                logger.debug(f"RECEIVED {ensure_str(element)}")
                await self.on_open_stream(element)
            elif event == 'end' and element.tag in self.handlers:
                logger.debug(f"RECEIVED {ensure_str(element)}")
                asyncio.create_task(self.handlers[element.tag](element))
def generate_with_progress(response, path: str):
    # dtd is resolved via base_url
    stream = response.raw
    parser = etree.XMLPullParser(
        events=['start', 'end'],
        base_url=path,
        load_dtd=True,
        dtd_validation=True,
    )
    suffix = '%(percent)d%% %(elapsed_td)s (ETA: %(eta_td)s)'
    length = int(response.headers['Content-Length'])
    progress = Bar(suffix=suffix, max=length, hide_cursor=False)
    try:
        for line in decompress_gzip_stream(stream):
            parser.feed(line)
            yield from parser.read_events()
            current_pos = stream.tell()
            # have to check, otherwise ETA is screwed up
            if current_pos > progress.index:
                progress.goto(current_pos)
    except KeyboardInterrupt:
        pass
    finally:
        progress.finish()
async def __aiter__(self, *, clear: bool = True):
    if not self:
        raise ValueError("Bad request")
    url = str(self)
    parser = etree.XMLPullParser(["end"],
                                 base_url=url,
                                 remove_blank_text=True)
    parser.set_element_class_lookup(
        etree.ElementDefaultClassLookup(element=_NSElement))
    events = parser.read_events()
    async with type(self).session.request(
            "GET", url,
            headers={"User-Agent": type(self).agent}) as response:
        yield parser.makeelement("HEADERS", attrib=response.headers)
        encoding = response.headers["Content-Type"].split(
            "charset=")[1].split(",")[0]
        async for data, _ in response.content.iter_chunks():
            parser.feed(data.decode(encoding))
            for _, element in events:
                yield element
                if clear:
                    element.clear()