def audit(osmfile, audit_value=''):
    osm_file = open(osmfile, "r")
    if audit_value == 'postal':
        zip_fixed = set()
        for event, elem in ET.iterparse(osm_file, events=("start",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_postal(tag):
                        zip_fixed.add(audit_postalcode(tag.attrib['v']))
        return zip_fixed
    elif audit_value == 'city':
        city_fixed = set()
        for event, elem in ET.iterparse(osm_file, events=("start",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_city(tag):
                        city_fixed.add(audit_city(tag.attrib['v']))
        return city_fixed
    else:
        street_types = defaultdict(set)
        for event, elem in ET.iterparse(osm_file, events=("start",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
        return street_types
def test_getAllNestedElementInformation(self):
    expectedResultPeaks = {'fullName': 'Proteomics Standards Initiative Mass Spectrometry Ontology',
                           'id': 'MS',
                           'tagName': '{http://psi.hupo.org/ms/mzml}cv',
                           'URI': 'http://psidev.cvs.sourceforge.net/*checkout*/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo'}
    expectedResultMzml = {'fullName': 'Proteomics Standards Initiative Mass Spectrometry Ontology',
                          'id': 'MS',
                          'tagName': '{http://psi.hupo.org/ms/mzml}cv',
                          'URI': 'http://psidev.cvs.sourceforge.net/*checkout*/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo',
                          'version': '2.26.0'}
    expectedResultFeatureXML = {'name': 'FeatureFinder', 'tagName': 'software', 'version': '1.8.0'}

    actualResultPeaks = {}
    elementFile = open(testFolder + 'peaksMzmlTestfile.peaks.mzML')
    for event, element in cElementTree.iterparse(elementFile):
        actualResultPeaks = elementFunctions.getAllNestedElementInformation(element)
        # only doing one to test, break
        break

    actualResultMzml = {}
    elementFile = open(testFolder + 'mzml_test_file_1.mzML')
    for event, element in cElementTree.iterparse(elementFile):
        actualResultMzml = elementFunctions.getAllNestedElementInformation(element)
        # only doing one to test, break
        break

    actualResultFeatureXML = {}
    elementFile = open(testFolder + 'featurexmlTestFile_1.featureXML')
    for event, element in cElementTree.iterparse(elementFile):
        actualResultFeatureXML = elementFunctions.getAllNestedElementInformation(element)
        # only doing one to test, break
        break

    self.assertDictEqual(expectedResultPeaks, actualResultPeaks)
    self.assertDictEqual(expectedResultMzml, actualResultMzml)
    self.assertDictEqual(expectedResultFeatureXML, actualResultFeatureXML)
def parse_blast_xml(args, cur):
    if args.input:
        for f in args.input:
            con = et.iterparse(f, events=('end', 'start'))
            _parse_blast_xml(args, cur, con)
    else:
        con = et.iterparse(sys.stdin, events=('end', 'start'))
        _parse_blast_xml(args, cur, con)
def __init__(self):
    # json.load from an open file handle; the original called
    # json.loads('all_level.json'), which would try to decode the file name
    # itself as JSON and fail.
    with open('all_level.json') as f:
        self.ATC_dict = json.load(f)
    self.result = []
    # the original used "for event, drug = ET.iterparse(...)", which is a syntax error
    for event, drug in ET.iterparse('drugbank.xml'):
        if drug.tag != '{http://www.drugbank.ca}drug':
            continue
        for first_level in drug:
            if first_level.tag == '{http://www.drugbank.ca}calculated-properties':
                self.result += self.calculated_properties(first_level)
def _iterparse(xmlfile):
    """
    Avoid bug in python 3.{2,3}. See http://bugs.python.org/issue9257.

    :param xmlfile: XML file or file-like object
    """
    try:
        return ET.iterparse(xmlfile, events=("start-ns",))
    except TypeError:
        return ET.iterparse(xmlfile, events=(b"start-ns",))
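# Usage sketch for _iterparse() above: the "start-ns" events yield
# (prefix, uri) pairs, so the declared namespaces can be collected into a
# dict. The file name below is hypothetical.
def collect_namespaces(xmlfile):
    namespaces = {}
    for _event, (prefix, uri) in _iterparse(xmlfile):
        namespaces[prefix] = uri
    return namespaces

# collect_namespaces('capabilities.xml')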
def main():
    global args
    options = argparse.ArgumentParser(epilog="Example: %(prog)s dmarc-xml-file 1> outfile.log")
    options.add_argument("dmarcfile", help="dmarc file in XML format")
    args = options.parse_args()

    # get an iterable and turn it into an iterator
    meta_fields = get_meta(iter(etree.iterparse(args.dmarcfile, events=("start", "end"))))
    if not meta_fields:
        print >> sys.stderr, "Error: No valid 'policy_published' and 'report_metadata' xml tags found; File: " + args.dmarcfile
        sys.exit(1)
    print_record(iter(etree.iterparse(args.dmarcfile, events=("start", "end"))), meta_fields, args)
def do_search(search):
    """
    Given any arbitrary string, return list of possible matching locations.
    """
    import StringIO
    from x84.bbs import echo, getch
    disp_msg(u'SEARChiNG')
    resp = requests.get(u'http://apple.accuweather.com'
                        + u'/adcbin/apple/Apple_find_city.asp',
                        params=(('location', search),))
    locations = list()
    if resp is None:
        disp_notfound()
    elif resp.status_code != 200:
        # todo: logger.error
        echo(u'\r\n' + u'Status Code: %s\r\n\r\n' % (resp.status_code,))
        echo(repr(resp.content))
        echo(u'\r\n\r\n' + 'Press any key')
        getch()
    else:
        # print resp.content
        xml_stream = StringIO.StringIO(resp.content)
        locations = list([dict(elem.attrib.items())
                          for _event, elem in ET.iterparse(xml_stream)
                          if elem.tag == 'location'])
        if 0 == len(locations):
            disp_notfound()
        else:
            disp_found(len(locations))
    return locations
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename, events=("start",)):
        if 'uid' in element.attrib:
            users.add(get_user(element))
    return users
def readPrimary(self):
    # If we have either a local cache of the primary.xml.gz file or if
    # it is already local (nfs or local file system) we calculate its
    # checksum and compare it with the one from repomd. If they are
    # the same we don't need to cache it again and can directly use it.
    if self.repomd.has_key("primary"):
        if not self.repomd["primary"].has_key("location"):
            return 0
        primary = self.repomd["primary"]["location"]
        (csum, destfile) = self.nc.checksum(primary, "sha")
        if self.repomd["primary"].has_key("checksum") and \
           csum == self.repomd["primary"]["checksum"]:
            filename = destfile
        else:
            filename = self.nc.cache(primary, 1)
            if not filename:
                return 0
        try:
            fd = PyGZIP(filename)
            ip = iterparse(fd, events=("start", "end"))
            ip = iter(ip)
        except IOError:
            log.error("Couldn't parse primary.xml")
            return 0
        self._parse(ip)
    return 1
def parse_and_write(xml_file, outfile, fields, tag, n, interval=1):
    # get an iterable
    context = ET.iterparse(xml_file, events=("start", "end"))
    # turn it into an iterator
    context = iter(context)
    # get the root element
    event, root = context.next()
    i = 0
    with open(outfile, 'w') as f:
        for event, row in context:
            if event == "end" and row.tag == tag:
                if i % 100000 == 0:
                    pct = round((i * 1.0 / n) * 100, 1)
                    Printer("Processed {0} records. ~ {1}\% complete.".format(i, pct))
                if interval == 1 or i % interval == 0:
                    if all(map(lambda x: x in row.attrib, fields)):
                        field_data = []
                        for fd in fields:
                            if fd == 'Tags':
                                field_data.extend(parse_tags(row.attrib[fd].encode('ascii', 'ignore')))
                            else:
                                field_data.append(clean(row.attrib[fd].encode('ascii', 'ignore')))
                        text = " ".join(field_data) + "\n"
                        f.write(text)
                i += 1
                root.clear()
                if i >= n:
                    break
def new_parse(self):
    """Generator using cElementTree iterparse function"""
    if self.filename.endswith('.bz2'):
        import bz2
        source = bz2.BZ2File(self.filename)
    elif self.filename.endswith('.gz'):
        import gzip
        source = gzip.open(self.filename)
    elif self.filename.endswith('.7z'):
        import subprocess
        source = subprocess.Popen('7za e -bd -so %s 2>/dev/null' % self.filename,
                                  shell=True, stdout=subprocess.PIPE,
                                  bufsize=65535).stdout
    else:
        # assume it's an uncompressed XML file
        source = open(self.filename)

    context = iterparse(source, events=("start", "end", "start-ns"))
    self.root = None
    for event, elem in context:
        if event == "start-ns" and elem[0] == "":
            self.uri = elem[1]
            continue
        if event == "start" and self.root is None:
            self.root = elem
            continue
        for rev in self._parse(event, elem):
            yield rev
def parseOsm(source, handler):
    for event, elem in ElementTree.iterparse(source, events=('start', 'end')):
        if event == 'start':
            handler.startElement(elem.tag, elem.attrib)
        elif event == 'end':
            handler.endElement(elem.tag)
            elem.clear()
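# Minimal handler sketch for parseOsm() above: any object exposing
# startElement(name, attrs) and endElement(name) works, since those are the
# only two methods the function calls. The input file name is hypothetical.
class NodeCounter(object):
    def __init__(self):
        self.nodes = 0

    def startElement(self, name, attrs):
        if name == 'node':
            self.nodes += 1

    def endElement(self, name):
        pass

# counter = NodeCounter()
# parseOsm(open('map.osm'), counter)
# print(counter.nodes)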
def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        # print element.tag
        keys = key_type(element, keys)
    return keys
def process_map(filename):
    users = set()
    for _, el in ET.iterparse(filename):
        if 'user' in el.attrib:
            users.add(el.attrib['user'])
    return users
def list_country(filename):
    governorate_set = set()
    for _, element in ET.iterparse(filename):
        for tag in element.iter('tag'):
            if 'governorate' in tag.attrib['k'] and tag.attrib['v'] not in governorate_set:
                governorate_set.add(tag.attrib['v'])
    return governorate_set
def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        # audit street names in "nodes" containing "addr:street"
        if elem.tag == "node":
            for tag in elem.iter("tag"):
                if is_street_name(tag) and tag.attrib['v'] != "":
                    street_type, street_name = audit_street_type(tag.attrib['v'])
                    add_street_type(street_types, street_type, street_name)
        # audit street names in "ways" containing "highway"
        elif elem.tag == "way":
            highway = 0
            # check if way matches an included highway type
            for tag in elem.iter("tag"):
                if (tag.attrib['k'] == "highway") and (tag.attrib['v'] in highway_types):
                    highway = 1
            if highway == 1:
                for tag in elem.iter("tag"):
                    if is_name(tag) and tag.attrib['v'] != "":
                        street_type, street_name = audit_street_type(tag.attrib['v'])
                        if street_type != None:
                            add_street_type(street_types, street_type, street_name)
    return street_types
def _parse_results(self, stream): """Parse results and messages out of *stream*.""" result = None values = None try: for event, elem in et.iterparse(stream, events=('start', 'end')): if elem.tag == 'results' and event == 'start': # The wrapper element is a <results preview="0|1">. We # don't care about it except to tell is whether these # are preview results, or the final results from the # search. is_preview = elem.attrib['preview'] == '1' self.is_preview = is_preview if elem.tag == 'result': if event == 'start': result = OrderedDict() elif event == 'end': yield result result = None elem.clear() elif elem.tag == 'field' and result is not None: # We need the 'result is not None' check because # 'field' is also the element name in the <meta> # header that gives field order, which is not what we # want at all. if event == 'start': values = [] elif event == 'end': field_name = elem.attrib['k'].encode('utf8') if len(values) == 1: result[field_name] = values[0] else: result[field_name] = values # Calling .clear() is necessary to let the # element be garbage collected. Otherwise # arbitrarily large results sets will use # arbitrarily large memory intead of # streaming. elem.clear() elif elem.tag in ('text', 'v') and event == 'end': text = "".join(elem.itertext()) values.append(text.encode('utf8')) elem.clear() elif elem.tag == 'msg': if event == 'start': msg_type = elem.attrib['type'] elif event == 'end': text = elem.text if elem.text is not None else "" yield Message(msg_type, text.encode('utf8')) elem.clear() except SyntaxError as pe: # This is here to handle the same incorrect return from # splunk that is described in __init__. if 'no element found' in pe.msg: return else: raise
def process_osm(file_in):
    with open(file_in) as file:
        for _, element in ET.iterparse(file):
            el = shape_data(element)
            if el:
                # pprint.pprint(el)
                way_node_collection.insert(el)
def parseFile(self, f): """ Parses a single Doxygen XML file :param f: XML file path """ documentable_members = 0 documented_members = 0 # Wrap everything in a try, as sometimes Doxygen XML is malformed try: for event, elem in ET.iterparse(f): if event == 'end' and elem.tag == 'compounddef': if self.elemIsPublicClass(elem): # store documentation status members, documented, undocumented, bindable, has_brief_description, found_version_added = self.parseClassElem(elem) documentable_members += members documented_members += documented class_name = elem.find('compoundname').text acceptable_missing = self.acceptable_missing.get(class_name, []) if not self.hasGroup(class_name) and not class_name in self.acceptable_missing_group: self.classes_missing_group.append(class_name) if not class_name in self.acceptable_missing_brief and not has_brief_description: self.classes_missing_brief.append(class_name) if not class_name in self.acceptable_missing_added_note and not found_version_added: self.classes_missing_version_added.append(class_name) # GEN LIST # if len(undocumented) > 0: # print('"%s": [%s],' % (class_name, ", ".join(['"%s"' % e.replace('"', '\\"') for e in undocumented]))) unacceptable_undocumented = undocumented - set(acceptable_missing) # do a case insensitive check too unacceptable_undocumented_insensitive = set([u.lower() for u in undocumented]) - set([u.lower() for u in acceptable_missing]) if len(unacceptable_undocumented_insensitive) > 0: self.undocumented_members[class_name] = {} self.undocumented_members[class_name]['documented'] = documented self.undocumented_members[class_name]['members'] = members self.undocumented_members[class_name]['missing_members'] = unacceptable_undocumented # store bindable members if self.classElemIsBindable(elem): for m in bindable: self.bindable_members.append(m) elem.clear() except ET.ParseError as e: # sometimes Doxygen generates malformed xml (e.g., for < and > operators) line_num, col = e.position with open(f, 'r') as xml_file: for i, l in enumerate(xml_file): if i == line_num - 1: line = l break caret = '{:=>{}}'.format('^', col) print(('ParseError in {}\n{}\n{}\n{}'.format(f, e, line, caret))) self.documentable_members += documentable_members self.documented_members += documented_members
def parse(stream):
    for event, element in et.iterparse(stream):
        if element.tag != 'row':
            continue
        yield {
            x.get('name'): x.text.strip() if x.text else None
            for x in element
        }
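# Usage sketch for parse() above (hypothetical file name), assuming records of
# the form <row><field name="id">1</field><field name="title">...</field></row>.
with open('export.xml', 'rb') as stream:
    for record in parse(stream):
        print(record.get('id'))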
def count_tags(filename):
    # YOUR CODE HERE
    d = defaultdict(int)
    for events, elem in ET.iterparse(filename):
        d[elem.tag] += 1
    return d
def parse_mediawiki_xml(xml_fo, dump_func):
    """
    Process mediawiki xml dump of wiktionary.
    """
    # create tag names in advance (setting default xmlns doesn't help)
    page_tag = tag("page")
    text_tag = tag("text")
    title_tag = tag("title")
    # using event based eltree parser
    itree = ET.iterparse(xml_fo, events=("start", "end"))
    word_name = None
    etymology_list = None
    for event, elem in itree:
        # reset data for new word entry
        if event == "start":
            if elem.tag == page_tag:
                word_name = None
            continue
        # get data for current word entry (event == end)
        if elem.tag == title_tag:
            if not elem.text.startswith("Wiktionary:"):
                word_name = elem.text
        elif word_name is not None:
            if elem.tag == text_tag:
                etymology_list = get_etymology(elem.text)
            elif elem.tag == page_tag:
                # all data for current word entry has been processed
                dump_func(word_name, etymology_list)
def examine_tags(osmfile, tag_range, item_limit):
    assert len(tag_range) == 2
    # use pre-loaded tag_keys list of tuples, if exists
    if TAG_KEYS:
        tag_keys = TAG_KEYS
    # else call mapparser count_tags method to pull sorted list of top tags
    else:
        _, tag_keys = count_tags(osmfile)
    # list comprehension for producing a list of tag_keys in string format
    tag_keys = [tag_key[0] for tag_key in tag_keys][tag_range[0]:tag_range[1]]
    print "Examining tag keys: {}".format(tag_keys)
    # open osm file
    osm_file = open(osmfile, "r")
    # initialize data with default set data structure
    data = defaultdict(set)
    # iterate through elements
    for _, elem in ET.iterparse(osm_file, events=("start",)):
        # check if the element is a node or way
        if elem.tag == "node" or elem.tag == "way":
            # iterate through children matching `tag`
            for tag in elem.iter("tag"):
                # skip if does not contain key-value pair
                if 'k' not in tag.attrib or 'v' not in tag.attrib:
                    continue
                key = tag.get('k')
                val = tag.get('v')
                # add to set if in tag keys of interest and is below the item limit
                if key in tag_keys and len(data[key]) < item_limit:
                    data[key].add(val)
    return data
def parse(xmlfile, element_names, element_attrs={}, attr_conversions={}):
    """
    Parses the given element_names from xmlfile and yield compound objects for
    their xml subtrees (no extra objects are returned if element_names appear
    in the subtree). The compound objects provide all element attributes of
    the root of the subtree as attributes unless attr_names are supplied. In
    this case attr_names maps element names to a list of attributes which are
    supplied. If attr_conversions is not empty it must map attribute names to
    callables which will be called upon the attribute value before storing
    under the attribute name.
    The compound objects gives dictionary style access to list of compound
    objects o for any children with the given element name
    o['child_element_name'] = [osub0, osub1, ...]
    As a shorthand, attribute style access to the list of child elements is
    provided unless an attribute with the same name as the child elements
    exists (i.e. o.child_element_name = [osub0, osub1, ...])
    @Note: All elements with the same name must have the same type regardless
           of the subtree in which they occur
    @Note: Attribute names may be modified to avoid name clashes with python
           keywords.
    @Note: The element_names may be either a single string or a list of strings.
    @Example: parse('plain.edg.xml', ['edge'])
    """
    if isinstance(element_names, str):
        element_names = [element_names]
    elementTypes = {}
    for event, parsenode in ET.iterparse(xmlfile):
        if parsenode.tag in element_names:
            yield _get_compound_object(parsenode, elementTypes,
                                       parsenode.tag, element_attrs,
                                       attr_conversions)
            parsenode.clear()
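# Usage sketch following the docstring's own example; the 'id' attribute is
# assumed to be present on <edge> elements of the (hypothetical) plain.edg.xml.
for edge in parse('plain.edg.xml', ['edge']):
    print(edge.id)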
def elements(self):
    if not self.parser:
        reader = self.stream.reader

        class f(object):
            def read(self, n):
                if reader.buffer.remaining == 0:
                    # read more data into buffer
                    reader._read_more()
                return reader.buffer.read_bytes(min(n, reader.buffer.remaining))

        self.parser = iter(iterparse(f(), events=("start", "end")))
        event, self.root = self.parser.next()

    level = 0
    for event, element in self.parser:
        if event == 'start':
            level += 1
        elif event == 'end':
            level -= 1
            if level == 0:
                yield element
                # TODO clear root
        else:
            assert False, "unexpected event"
def iterparse(text, interested_path_handlers):
    '''
    interested_path_handlers =>
        {'start': ((interested_path, handler),
                   (interested_path, handler),
                   ...),
         'end': ((interested_path, handler),
                 (interested_path, handler),
                 ...)}
    interested_path => (tag1, tag2, tag3, ...)

    An incremental XML parser. ElementTree.findall has too high CPU/Memory
    footprint when data set is big
    '''
    strf = StringIO()
    strf.write(text)
    strf.seek(0)
    context = ElementTree.iterparse(strf, events=('start', 'end'))
    context = iter(context)
    all_start_handlers = interested_path_handlers.get('start', ())
    all_end_handlers = interested_path_handlers.get('end', ())
    current_path = []
    for ev, elem in context:
        tag, value = elem.tag, elem.text
        if ev == 'start':
            current_path.append(tag)
            if all_start_handlers:
                _do_handlers(ev, elem, current_path, all_start_handlers)
        elif ev == 'end':
            if all_end_handlers:
                _do_handlers(ev, elem, current_path, all_end_handlers)
            current_path.pop()
            elem.clear()
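# Registration sketch for iterparse() above. The nesting of
# interested_path_handlers follows the docstring; the handler is written with
# *args because the exact arguments forwarded by _do_handlers are not shown here.
def on_item_end(*args):
    print("matched an <item> element", args)

handlers = {
    'end': (
        (('rss', 'channel', 'item'), on_item_end),
    ),
}
# iterparse(rss_text, handlers)  # rss_text would be an XML string (hypothetical)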
def count_tags(filename, limit=-1, verbose=False):
    """
    Parses the OSM file and counts the tags by type.
    """
    # initialize dict objects and counter
    tag_count = {}
    tag_keys = {}
    counter = 0
    # iterate through elements
    for _, element in ET.iterparse(filename, events=("start",)):
        # add to tag count
        add_tag(element.tag, tag_count)
        # if tag and has key, add the tag key to tag_keys dict
        if element.tag == 'tag' and 'k' in element.attrib:
            add_tag(element.get('k'), tag_keys)
        # print if verbose output enabled
        if verbose:
            print "{0}: {1}".format(counter, element.tag)
        # break if exceed limit
        if limit > 0 and counter >= limit:
            break
        counter += 1
    # produces a sorted-by-decreasing list of tag key-count pairs
    tag_keys = sorted(tag_keys.items(), key=operator.itemgetter(1))[::-1]
    # return values
    return tag_count, tag_keys
def _xml_namespaces(self):
    for _, e in iterparse(self._path, events=('start-ns',)):
        lcode, uri = e
        if 1 > Namespace.objects.filter(resource__name=uri).count():
            r = Resource(name=uri)
            yield r
            yield Namespace(code=lcode, resource=r)
def iterArticles(f):
    pmid = None
    title = None
    abstract = None
    journal = None
    mesh_list = []
    for event, elem in ET.iterparse(f, events=("start", "end")):
        if event == 'start':
            if elem.tag == 'PubmedArticle':
                pmid, title, abstract, journal, mesh_list = None, None, None, None, []
        elif event == 'end':
            if elem.tag == 'PubmedArticle':
                yield pmid, title, abstract, journal, mesh_list
            elif elem.tag == 'PMID':
                pmid = elem.text
            elif elem.tag == 'ArticleTitle':
                title = elem.text
            elif elem.tag == 'AbstractText':
                abstract = elem.text
            elif elem.tag == 'Title':
                journal = elem.text
            elif elem.tag == 'KeywordList':
                keyword_list = elem.findall("Keyword")
                for aa in keyword_list:
                    mesh_list.append(aa.text)
            elif elem.tag == 'MeshHeadingList':
                mhlist = elem.findall("MeshHeading")
                for child in mhlist:
                    if child.findtext('DescriptorName'):
                        mesh_list.append(child.findtext('DescriptorName'))
                    if child.findtext('QualifierName'):
                        mesh_list.append(child.findtext('QualifierName'))
def _get_annotations(self, source, start_offset=0):
    """It returns the annotations found in the document.

    It follows the following format:
    [
        ('TAG', {ATTRIBUTES}, (start_offset, end_offset)),
        ('TAG', {ATTRIBUTES}, (start_offset, end_offset)),
        ...
        ('TAG', {ATTRIBUTES}, (start_offset, end_offset))
    ]
    """
    annotations = []
    for event, element in etree.iterparse(StringIO(source), events=('start', 'end')):
        if event == 'start':
            if element.tag in self.tags_to_spot:
                try:
                    end_offset = start_offset + len(element.text)
                except TypeError:
                    continue
                annotations.append((element.tag, element.attrib,
                                    (start_offset, end_offset)))
                start_offset += len(element.text)
        elif event == 'end':
            if element.text is not None and element.tail is not None:
                start_offset += len(element.tail)
    return annotations
def convert(blastxml_filename, output_handle): blast_program = None # get an iterable try: context = ElementTree.iterparse(blastxml_filename, events=("start", "end")) except Exception: sys.exit("Invalid data format.") # turn it into an iterator context = iter(context) # get the root element try: event, root = context.next() except Exception: sys.exit("Invalid data format.") for event, elem in context: if event == "end" and elem.tag == "BlastOutput_program": blast_program = elem.text # for every <Iteration> tag if event == "end" and elem.tag == "Iteration": # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> # <Iteration_query-def>Endoplasmic reticulum resident protein 44 # OS=H**o sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> # <Iteration_query-len>406</Iteration_query-len> # <Iteration_hits></Iteration_hits> # # Or, from BLAST 2.2.24+ run online # <Iteration_query-ID>Query_1</Iteration_query-ID> # <Iteration_query-def>Sample</Iteration_query-def> # <Iteration_query-len>516</Iteration_query-len> # <Iteration_hits>... qseqid = elem.findtext("Iteration_query-ID") if re_default_query_id.match(qseqid): # Place holder ID, take the first word of the query definition qseqid = elem.findtext("Iteration_query-def").split(None, 1)[0] qlen = int(elem.findtext("Iteration_query-len")) # for every <Hit> within <Iteration> for hit in elem.findall("Iteration_hits/Hit"): # Expecting either this, # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> # <Hit_def>RecName: Full=Rhodopsin</Hit_def> # <Hit_accession>P56514</Hit_accession> # or, # <Hit_id>Subject_1</Hit_id> # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> # <Hit_accession>Subject_1</Hit_accession> # # apparently depending on the parse_deflines switch # # Or, with a local database not using -parse_seqids can get this, # <Hit_id>gnl|BL_ORD_ID|2</Hit_id> # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis # thaliana chromosome 3, complete sequence</Hit_def> # <Hit_accession>2</Hit_accession> sseqid = hit.findtext("Hit_id").split(None, 1)[0] hit_def = sseqid + " " + hit.findtext("Hit_def") if re_default_subject_id.match(sseqid) and sseqid == hit.findtext("Hit_accession"): # Place holder ID, take the first word of the subject definition hit_def = hit.findtext("Hit_def") sseqid = hit_def.split(None, 1)[0] if sseqid.startswith("gnl|BL_ORD_ID|") and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"): # Alternative place holder ID, again take the first word of hit_def hit_def = hit.findtext("Hit_def") sseqid = hit_def.split(None, 1)[0] # for every <Hsp> within <Hit> for hsp in hit.findall("Hit_hsps/Hsp"): nident = hsp.findtext("Hsp_identity") length = hsp.findtext("Hsp_align-len") # As of NCBI BLAST+ 2.4.0 this is given to 3dp (not 2dp) pident = "%0.3f" % (100 * float(nident) / float(length)) q_seq = hsp.findtext("Hsp_qseq") h_seq = hsp.findtext("Hsp_hseq") m_seq = hsp.findtext("Hsp_midline") assert len(q_seq) == len(h_seq) == len(m_seq) == int(length) gapopen = str(len(q_seq.replace('-', ' ').split()) - 1 + len(h_seq.replace('-', ' ').split()) - 1) mismatch = m_seq.count(' ') + m_seq.count('+') - q_seq.count('-') - h_seq.count('-') # TODO - Remove this alternative mismatch calculation and test # once satisifed there are no problems expected_mismatch = len(q_seq) - sum(1 for q, h in zip(q_seq, h_seq) if q == h or q == "-" or h == "-") xx = sum(1 for q, h in zip(q_seq, h_seq) if q == "X" and h == "X") if not (expected_mismatch - 
q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx): sys.exit("%s vs %s mismatches, expected %i <= %i <= %i" % (qseqid, sseqid, expected_mismatch - q_seq.count("X"), int(mismatch), expected_mismatch)) # TODO - Remove this alternative identity calculation and test # once satisifed there are no problems expected_identity = sum(1 for q, h in zip(q_seq, h_seq) if q == h) if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")): sys.exit("%s vs %s identities, expected %i <= %i <= %i" % (qseqid, sseqid, expected_identity, int(nident), expected_identity + q_seq.count("X"))) evalue = hsp.findtext("Hsp_evalue") if evalue == "0": evalue = "0.0" else: evalue = "%0.0e" % float(evalue) bitscore = float(hsp.findtext("Hsp_bit-score")) if bitscore < 100: # Seems to show one decimal place for lower scores bitscore = "%0.1f" % bitscore else: # Note BLAST does not round to nearest int, it truncates bitscore = "%i" % bitscore values = [qseqid, sseqid, pident, length, # hsp.findtext("Hsp_align-len") str(mismatch), gapopen, hsp.findtext("Hsp_query-from"), # qstart, hsp.findtext("Hsp_query-to"), # qend, hsp.findtext("Hsp_hit-from"), # sstart, hsp.findtext("Hsp_hit-to"), # send, evalue, # hsp.findtext("Hsp_evalue") in scientific notation bitscore, # hsp.findtext("Hsp_bit-score") rounded ] if extended: try: sallseqid = ";".join(name.split(None, 1)[0] for name in hit_def.split(" >")) salltitles = "<>".join(name.split(None, 1)[1] for name in hit_def.split(" >")) except IndexError as e: sys.exit("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e)) # print hit_def, "-->", sallseqid positive = hsp.findtext("Hsp_positive") ppos = "%0.2f" % (100 * float(positive) / float(length)) qframe = hsp.findtext("Hsp_query-frame") sframe = hsp.findtext("Hsp_hit-frame") if blast_program == "blastp": # Probably a bug in BLASTP that they use 0 or 1 depending on format if qframe == "0": qframe = "1" if sframe == "0": sframe = "1" slen = int(hit.findtext("Hit_len")) values.extend([sallseqid, hsp.findtext("Hsp_score"), # score, nident, positive, hsp.findtext("Hsp_gaps"), # gaps, ppos, qframe, sframe, # NOTE - for blastp, XML shows original seq, tabular uses XXX masking q_seq, h_seq, str(qlen), str(slen), salltitles, ]) if cols: # Only a subset of the columns are needed values = [values[colnames.index(c)] for c in cols] # print "\t".join(values) output_handle.write("\t".join(values) + "\n") # prevents ElementTree from growing large datastructure root.clear() elem.clear()
        # elif ep[0].text == "Monoisotopic Weight":
        #     pass
        # else:
        #     pass
    return result


with open("json_dict.txt") as f:
    atc_dict = f.read()
d = json.loads(atc_dict)

count = 0
flag = 0
output_file = open('output', 'ab')
for event, drug in ET.iterparse('drugbank.xml'):
    if drug.tag != '{http://www.drugbank.ca}drug':
        continue
    if not drug.get('type'):
        continue
    result = [0] * 5
    result.append([0.11, 0.01, 0, 0.99, 0.5])
    if drug.get('type') == 'small molecule':
        result[0] = [1]
    else:
        result[0] = [0]
    items = 0
    for first_level in drug:
        flag = 0
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 19 20:51:11 2017

@author: Burky
"""

import xml.etree.cElementTree as ET
import pprint
from collections import defaultdict

filename = 'cleveland_ohio.osm'

zip_codes = defaultdict(int)

for event, elem in ET.iterparse(filename):
    if elem.tag == 'way':
        for tag in elem.findall('tag'):
            if tag.attrib['k'] == 'addr:postcode':
                zip_codes[tag.attrib['v']] += 1

pprint.pprint(zip_codes)
import xml.etree.cElementTree as cElementTree
import csv

folder = 'D:/Will/GIS Data/Raw OSM Data/EP/'
raw_osm_file = folder + 'arizona_8-12-2016.osm'
outfile = folder + 'parking_lots.csv'

writer = csv.writer(open(outfile, 'wb'))
writer.writerow(['OSMID'])

context = cElementTree.iterparse(raw_osm_file, events=("start", "end"))
context = iter(context)
event, root = next(context)

for event, elem in context:
    if event == 'end' and elem.tag == 'way':
        is_parking = False
        iterator = iter(elem)
        for child in iterator:
            if child.get('k') == 'service' and (
                    child.get('v') == 'parking_aisle' or child.get('v') == 'parking'):
                is_parking = True
            if child.get('k') == 'amenity' and child.get('v') == 'parking':
                is_parking = True
        if is_parking:
            writer.writerow([elem.get('id')])
        root.clear()
def audit():
    for event, elem in ET.iterparse(filename):
        if is_street_name(elem):
            audit_street_type(street_types, elem.attrib['v'])
    print_sorted_dict(street_types)
def parse(self, file_obj): nodes = [] ways = [] print('----------------') context = iter(iterparse(file_obj, events=('start', 'end'))) event, root = next(context) for (event, elem) in context: name = elem.tag attrs = elem.attrib if 'start' == event: """Parse the XML element at the start""" if name == 'node': record = self.fillDefault(attrs) loc = [float(attrs['lon']), float(attrs['lat'])] record['loc'] = loc record['geometry'] = {'type':'Point', 'coordinates': loc} elif name == 'tag': k = attrs['k'] v = attrs['v'] # MongoDB doesn't let us have dots in the key names. #k = k.replace('.', ',,') record['tg'].append((k, v)) record['ky'].append(k) elif name == 'way': # Insert remaining nodes if len(nodes) > 0: self.client.osm2.nodes.insert_many(nodes) nodes = [] record = self.fillDefault(attrs) record['nd'] = [] elif name == 'relation': # Insert remaining ways if len(ways) > 0: self.client.osm2.ways.insert_many(ways) ways = [] record = self.fillDefault(attrs) record['mm'] = [] elif name == 'nd': ref = int(attrs['ref']) record['nd'].append(ref) elif name == 'member': record['mm'].append(dict(type=attrs['type'], ref=int(attrs['ref']), role=attrs['role'])) if attrs['type'] == 'way': ways2relations = self.client.osm2.ways.find_one({ '_id' : ref}) if ways2relations: if 'relations' not in ways2relations: ways2relations['relations'] = [] ways2relations['relations'].append(record['_id']) self.client.osm2.ways.save(ways2relations) elif attrs['type'] == 'node': nodes2relations = self.client.osm2.nodes.find_one({ '_id' : ref}) if nodes2relations: if 'relations' not in nodes2relations: nodes2relations['relations'] = [] nodes2relations['relations'].append(record['_id']) self.client.osm2.nodes.save(nodes2relations) elif 'end' == event: """Finish parsing an element (only really used with nodes, ways and relations)""" if name == 'node': if len(record['tg']) == 0: del record['tg'] if len(record['ky']) == 0: del record['ky'] nodes.append(record) if len(nodes) > 2500: self.client.osm2.nodes.insert_many(nodes) nodes = [] self.writeStatsToScreen() record = {} self.stat_nodes = self.stat_nodes + 1 elif name == 'way': if len(record['tg']) == 0: del record['tg'] if len(record['ky']) == 0: del record['ky'] nds = dict((rec['_id'], rec) for rec in self.client.osm2.nodes.find({ '_id': { '$in': record['nd'] } }, { 'loc': 1, '_id': 1 })) record['loc'] = [] record['geometry'] = dict() locs = [] for node in record['nd']: if node in nds: record['loc'].append(nds[node]['loc']) locs.append(nds[node]['loc']) else: print('node not found: '+ str(node)) record['geometry'] = {'type':'LineString', 'coordinates': locs} ways.append(record) if len(ways) > 2000: self.client.osm2.ways.insert_many(ways) ways = [] record = {} self.statsCount = self.statsCount + 1 if self.statsCount > 1000: self.writeStatsToScreen() self.statsCount = 0 self.stat_ways = self.stat_ways + 1 elif name == 'relation': if len(record['tg']) == 0: del record['tg'] if len(record['ky']) == 0: del record['ky'] self.client.osm2.relations.save(record) record = {} self.statsCount = self.statsCount + 1 if self.statsCount > 10: self.writeStatsToScreen() self.statsCount = 0 self.stat_relations = self.stat_relations + 1 elem.clear() root.clear()
# invoked with: 7za e -so eowiki-20091128-pages-meta-history.xml.7z | python stubmetahistory-fetch-celementtree.py eo
lang = 'es'  # language to be analysed
if len(sys.argv) >= 2:
    lang = sys.argv[1]

rawtotalrevisions = 0.0
site = wikipedia.Site(lang, 'wikipedia')
data = site.getUrl("/wiki/Special:Statistics?action=raw")
rawtotalrevisions += float(data.split("edits=")[1].split(";")[0])

source = sys.stdin
outputfile = "/mnt/user-store/dump/%swiki-fetched.txt.bz" % (lang)
g = bz2.BZ2File(outputfile, "w")

context = iterparse(source, events=("start", "end"))
context = iter(context)

r_newlines = re.compile(ur"(?im)[\n\r\t\s]")
r_redirect = re.compile(ur"(?i)^\s*#\s*(REDIRECCIÓN|REDIRECT)\s*\[\[[^\]]+?\]\]")
r_disambig = re.compile(ur"(?i)\{\{\s*(d[ei]sambig|desambiguaci[oó]n|des|desamb)\s*[\|\}]")
r_links = re.compile(ur"\[\[\s*[^\]]+?\s*[\]\|]")
r_categories = re.compile(ur"(?i)\[\[\s*(category|categoría)\s*\:\s*[^\]\|]+\s*[\]\|]")
r_sections = re.compile(ur"(?im)^(\=+)[^\=]+?\1")
r_templates = ""
r_interwikis = re.compile(ur"(?i)\[\[\s*[a-z]{2,3}(-[a-z]{2,3})?\s*\:")
r_externallinks = re.compile(ur"://")
r_bold = ""
def audit(osmfile, options=None): ''' Audits the OSM file using the different audit functions defined herein. osm_file: str. Filepath to the OSM file being audited options: list of str. Dictates what types of audits are run. Allowed options values: 'counting' 'zips' 'county/state counting' 'county/state reporting' 'lat/long' 'amenities' 'property types' 'property type counts' ''' with open(osmfile, "rb") as fileIn: if options: #Setting up the necessary beginning parameters for each function if 'counting' in options: tag_counts = {} if 'zips' in options: zipLength = 5 zipLengthDict = {zipLength: 0, "Non-number": 0} known_zips = set() knownZipTags = set() zip_tags_ignored = [] if 'county/state counting' in options: county_tags = {} state_tags = {} state_tags_ignored = [ 'state_capital', 'source:hgv:state_network', 'hgv:state_network' ] if 'county/state reporting' in options: counties_found = set() states_found = set() countyKeys = [ 'gnis:County', 'gnis:County_num', 'gnis:county_id', 'gnis:county_name', 'is_in:county', 'tiger:county' ] stateKeys = [ 'addr:state', 'gnis:ST_alpha', 'gnis:state_id', 'nist:state_fips', 'ST_num' ] if 'lat/long' in options: badNodes = defaultdict( list ) #ensures that each new key will automatically have an empty list value if 'amenities' in options: known_amenities = defaultdict(set) if 'property types' in options: propTypes = defaultdict(set) if 'property type counts' in options: propRecords = defaultdict(int) allowed_propTypes = { 'landuse': [ 'residential', 'village_green', 'recreation_ground', 'allotments', 'commercial', 'depot', 'industrial', 'landfill', 'orchard', 'plant_nursery', 'port', 'quarry', 'retail' ], 'building': [ 'apartments', 'farm', 'house', 'detached', 'residential', 'dormitory', 'houseboat', 'bungalow', 'static_caravan', 'cabin', 'hotel', 'commercial', 'industrial', 'retail', 'warehouse', 'kiosk', 'hospital', 'stadium' ] } #---------------------------------------------------------------------- #Iterating through the XML file for _, elem in ET.iterparse(fileIn): if 'counting' in options: tag_counts = count_tags(elem, tag_counts) if 'zips' in options: zipLengthDict, known_zips, knownZipTags = zipCheck(elem, zipLengthDict, known_zips, \ knownZipTags, zip_tags_ignored, digits=zipLength) if 'county/state counting' in options: county_tags, state_tags = countyStateTypeCounter( elem, county_tags, state_tags, state_tags_ignored) if 'county/state reporting' in options: counties_found, states_found = countyStateReporter( elem, countyKeys, stateKeys, counties_found, states_found) if 'lat/long' in options: badNodes = lat_long_checker(elem, badNodes) if 'amenities' in options: known_amenities = amenityFinder(elem, known_amenities) if 'property types' in options: propTypes = propertyType(elem, propTypes) if 'property type counts' in options: propRecords = propertyCounter(elem, allowed_propTypes, propRecords) #---------------------------------------------------------------------- #printing everything once done iterating if 'counting' in options: print("Tags Found") pprint.pprint(tag_counts) if 'zips' in options: print("\nZip Lengths") pprint.pprint(zipLengthDict) print("\nUnique Zip Codes") pprint.pprint(known_zips) print("\nZip Code Tag Keys Found") pprint.pprint(knownZipTags) if 'county/state counting' in options: print("\nTypes of County Tags") pprint.pprint(county_tags) print("\nTypes of State Tags") pprint.pprint(state_tags) if 'county/state reporting' in options: print("\nStates Identified") pprint.pprint(states_found) print("\nCounties Identified") 
pprint.pprint(counties_found) if 'lat/long' in options: print("\nNodes with Incorrect Latitudes and/or Longitudes") pprint.pprint(badNodes) if 'amenities' in options: print("\nUnique Amenity and Shop Types Identified") pprint.pprint(known_amenities) if 'property types' in options: print("\nUnique Landuse Types") pprint.pprint(propTypes) if 'property type counts' in options: print("\nCounts of Relevant Landuse Types") pprint.pprint(propRecords)
def test_saving_network_with_bonkers_attributes_with_geometry(tmpdir): # attributes are assumed to be a nested dictionary of very specific format. Due to the fact that user can # do virtually anything to edge attributes, or due to calculation error, this may not be the case. If it's not # of correct format, we don't expect it to get saved to the matsim network.xml network = Network('epsg:27700') network.add_node('0', attribs={ 'id': '0', 'x': 1, 'y': 2, 'lat': 1, 'lon': 2 }) network.add_node('1', attribs={ 'id': '1', 'x': 2, 'y': 2, 'lat': 2, 'lon': 2 }) link_attribs = { 'id': '0', 'from': '0', 'to': '1', 'length': 1, 'freespeed': 1, 'capacity': 20, 'permlanes': 1, 'oneway': '1', 'modes': ['car'], 'geometry': LineString([(1, 2), (2, 3), (3, 4)]), 'attributes': float('nan') } network.add_link('0', '0', '1', attribs=link_attribs) network.write_to_matsim(tmpdir) assert_semantically_equal(dict(network.links()), {'0': link_attribs}) assert_semantically_equal( matsim_xml_writer.check_link_attributes(link_attribs), { 'id': '0', 'from': '0', 'to': '1', 'length': 1, 'freespeed': 1, 'capacity': 20, 'permlanes': 1, 'oneway': '1', 'modes': ['car'], 'geometry': LineString([(1, 2), (2, 3), (3, 4)]) }) found_geometry_attrib = False for event, elem in ET.iterparse(os.path.join(tmpdir, 'network.xml'), events=('start', 'end')): if event == 'start': if elem.tag == 'attribute': if elem.attrib['name'] == 'geometry': assert elem.text == '_ibE_seK_ibE_ibE_ibE_ibE' found_geometry_attrib = True assert found_geometry_attrib
def _parse_wikipedia(self, maximum_number_of_documents=None): assert self.__path.exists(), "Wikipedia data does not exist" # Determine size of file compressed_size = self.__path.stat().st_size # Initialise container for documents documents = [] with open(self.__path, mode="rb") as compressed_file: with bz2.BZ2File(compressed_file, mode="rb") as uncompressed_file: total_compressed_bytes_read_at_last_batch = 0 tag_prefix = "" namespaces = [] article_namespace_key = None in_page = False with tqdm(desc="", total=compressed_size, unit="B", unit_scale=True) as progress_bar: for event_number, (event, element) in enumerate( ElementTree.iterparse( uncompressed_file, events=["start", "end", "start-ns", "end-ns"])): if event == "start-ns": namespaces.append(element) namespace_id, namespace_uri = element if namespace_id == "": tag_prefix = f"{{{namespace_uri}}}" elif event == "end-ns": namespace = namespaces.pop() namespace_id, namespace_uri = namespace if namespace_id == "": tag_prefix = "" elif event == "start": if element.tag == f"{tag_prefix}page": in_page = True title = None text = None page_namespace_keys = [] page_redirect = False elif event == "end": tag = element.tag if tag.startswith(tag_prefix): tag = tag.replace(tag_prefix, "", 1) if tag == "namespace": if element.text is None: article_namespace_key = element.attrib[ "key"] elif in_page and tag == "title": if not title: title = element.text else: progress_bar.write( "Multiple titles found for article " f"\"{title}\". First one used.") elif in_page and tag == "text": if not text: text = element.text else: progress_bar.write( "Multiple text sections found for article " f"\"{title}\". First one used.") elif in_page and tag == "ns": page_namespace_keys.append(element.text) elif in_page and tag == "redirect": page_redirect = True elif in_page and tag == "page": in_page = False if article_namespace_key not in page_namespace_keys \ or page_redirect: continue url = self.__page_base_url \ + title.replace(" ", "_") abstract = self._parse_wikipedia_article( article_text=text, sections="first paragraph", include_header_image_captions=False, include_header_infoboxes=False) fulltext = self._parse_wikipedia_article( article_text=text, sections="all", include_header_image_captions=False, include_header_infoboxes=False) document = { "title": title, "url": url, "abstract": abstract, "text": fulltext } documents.append(document) element.clear() if maximum_number_of_documents and \ len(documents) >= maximum_number_of_documents: break if event_number % 1000 == 0: total_compressed_bytes_read = \ compressed_file.tell() compressed_bytes_read_for_batch = \ total_compressed_bytes_read \ - total_compressed_bytes_read_at_last_batch total_compressed_bytes_read_at_last_batch = \ total_compressed_bytes_read progress_bar.update( compressed_bytes_read_for_batch) return documents
def process_to_csv(file_in): """ Main process to clean data and save to CSV""" LOGGER.info("Processing elements in %s", file_in) nodes = [] nodes_tags = [] addresses = [] ways = [] ways_nodes = [] ways_tags = [] for _, elem in ET.iterparse(file_in): # Process the nodes if elem.tag == 'node': node = {} node_id = 0 if elem.keys(): for name, value in elem.items(): if name == 'id': node_id = value node[name] = value # Process any tags if len(elem): address = {'id': node_id} for tag in elem.iter('tag'): # Build a seperate table for real addresses if 'addr' in tag.attrib['k']: address = add_address(tag, address) else: newtag = {'id': node_id} newtag['key'] = tag.attrib['k'].lower() newtag['value'] = tag.attrib['v'] nodes_tags.append(newtag) if len(address) > 1: address = audit_address(address) addresses.append(address) nodes.append(node) # Process ways elif elem.tag == 'way': position = 0 way = {} way_id = 0 if elem.keys(): for name, value in elem.items(): if name == 'id': way_id = value way[name] = value # Process any Children Found if len(elem): # Process Tags for tag in elem.iter('tag'): way_tag = {'id': way_id} way_tag['key'] = tag.attrib['k'].lower() way_tag['value'] = tag.attrib['v'] ways_tags.append(way_tag) # Process Node Relations for ndr in elem.iter('nd'): position += 1 way_node = {'id': way_id} way_node['node_id'] = ndr.attrib['ref'] way_node['position'] = position ways_nodes.append(way_node) ways.append(way) write_csv(nodes, 'output/nodes.csv', NODES_FIELDS) write_csv(nodes_tags, 'output/nodes_tags.csv', TAGS_FIELDS) write_csv(addresses, 'output/node_addresses.csv', ADDRESS_FIELDS) write_csv(ways, 'output/ways.csv', WAYS_FIELDS) write_csv(ways_tags, 'output/ways_tags.csv', TAGS_FIELDS) write_csv(ways_nodes, 'output/ways_nodes.csv', WAYS_NODES_FIELDS) return
def processPMCFile(source): # Skip to the article element in the file for event, elem in etree.iterparse(source, events=('start', 'end', 'start-ns', 'end-ns')): if (event == 'end' and elem.tag == 'article'): pmidText, pmcidText, doiText, pubYear, pubMonth, pubDay, journal, journalISO = getMetaInfoForPMCArticle( elem) # We're going to process the main article along with any subarticles # And if any of the subarticles have distinguishing IDs (e.g. PMID), then # that'll be used, otherwise the parent article IDs will be used subarticles = [elem] + elem.findall('./sub-article') for articleElem in subarticles: if articleElem == elem: # This is the main parent article. Just use its IDs subPmidText, subPmcidText, subDoiText, subPubYear, subPubMonth, subPubDay, subJournal, subJournalISO = pmidText, pmcidText, doiText, pubYear, pubMonth, pubDay, journal, journalISO else: # Check if this subarticle has any distinguishing IDs and use them instead subPmidText, subPmcidText, subDoiText, subPubYear, subPubMonth, subPubDay, subJournal, subJournalISO = getMetaInfoForPMCArticle( articleElem) if subPmidText == '' and subPmcidText == '' and subDoiText == '': subPmidText, subPmcidText, subDoiText = pmidText, pmcidText, doiText if subPubYear == None: subPubYear = pubYear subPubMonth = pubMonth subPubDay = pubDay if subJournal == None: subJournal = journal subJournalISO = journalISO # Extract the title of paper title = articleElem.findall( './front/article-meta/title-group/article-title' ) + articleElem.findall( './front-stub/title-group/article-title') assert len(title) <= 1 titleText = extractTextFromElemList(title) titleText = [ removeWeirdBracketsFromOldTitles(t) for t in titleText ] # Get the subtitle (if it's there) subtitle = articleElem.findall( './front/article-meta/title-group/subtitle' ) + articleElem.findall('./front-stub/title-group/subtitle') subtitleText = extractTextFromElemList(subtitle) subtitleText = [ removeWeirdBracketsFromOldTitles(t) for t in subtitleText ] # Extract the abstract from the paper abstract = articleElem.findall( './front/article-meta/abstract') + articleElem.findall( './front-stub/abstract') abstractText = extractTextFromElemList(abstract) # Extract the full text from the paper as well as supplementaries and floating blocks of text articleText = extractTextFromElemList( articleElem.findall('./body')) backText = extractTextFromElemList( articleElem.findall('./back')) floatingText = extractTextFromElemList( articleElem.findall('./floats-group')) document = { 'pmid': subPmidText, 'pmcid': subPmcidText, 'doi': subDoiText, 'pubYear': subPubYear, 'pubMonth': subPubMonth, 'pubDay': subPubDay, 'journal': subJournal, 'journalISO': subJournalISO } textSources = {} textSources['title'] = titleText textSources['subtitle'] = subtitleText textSources['abstract'] = abstractText textSources['article'] = articleText textSources['back'] = backText textSources['floating'] = floatingText for k in textSources.keys(): tmp = textSources[k] tmp = [t for t in tmp if len(t) > 0] tmp = [html.unescape(t) for t in tmp] tmp = [removeBracketsWithoutWords(t) for t in tmp] textSources[k] = tmp document['textSources'] = textSources yield document # Less important here (compared to abstracts) as each article file is not too big elem.clear()
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if 'user' in element.attrib:
            users.add(get_user(element))
    return users
def parseFile(self, f): """ Parses a single Doxygen XML file :param f: XML file path """ documentable_members = 0 documented_members = 0 # Wrap everything in a try, as sometimes Doxygen XML is malformed try: for event, elem in ET.iterparse(f): if event == 'end' and elem.tag == 'compounddef': if self.elemIsPublicClass(elem): # store documentation status members, documented, undocumented, bindable, has_brief_description, found_version_added = self.parseClassElem( elem) documentable_members += members documented_members += documented class_name = elem.find('compoundname').text acceptable_missing = self.acceptable_missing.get( class_name, []) if not self.hasGroup( class_name ) and not class_name in self.acceptable_missing_group: self.classes_missing_group.append(class_name) if not class_name in self.acceptable_missing_brief and not has_brief_description: self.classes_missing_brief.append(class_name) if not class_name in self.acceptable_missing_added_note and not found_version_added: self.classes_missing_version_added.append( class_name) # GEN LIST # if len(undocumented) > 0: # print('"%s": [%s],' % (class_name, ", ".join(['"%s"' % e.replace('"', '\\"') for e in undocumented]))) unacceptable_undocumented = undocumented - set( acceptable_missing) # do a case insensitive check too unacceptable_undocumented_insensitive = set([ u.lower() for u in undocumented ]) - set([u.lower() for u in acceptable_missing]) if len(unacceptable_undocumented_insensitive) > 0: self.undocumented_members[class_name] = {} self.undocumented_members[class_name][ 'documented'] = documented self.undocumented_members[class_name][ 'members'] = members self.undocumented_members[class_name][ 'missing_members'] = unacceptable_undocumented # store bindable members if self.classElemIsBindable(elem): for m in bindable: self.bindable_members.append(m) elem.clear() except ET.ParseError as e: # sometimes Doxygen generates malformed xml (eg for < and > operators) line_num, col = e.position with open(f, 'r') as xml_file: for i, l in enumerate(xml_file): if i == line_num - 1: line = l break caret = '{:=>{}}'.format('^', col) print(('ParseError in {}\n{}\n{}\n{}'.format(f, e, line, caret))) self.documentable_members += documentable_members self.documented_members += documented_members
# first data import
import xml.etree.cElementTree as ET
from consts import *

file_name = r'C:\Users\Meital\Desktop\Posts.xml'
questions_cnt = 0
answers_cnt = 0

if __name__ == "__main__":
    DB = client.sof_new
    POSTS_DB = DB.posts
    # POSTS_DB.drop()
    POSTS_DB.ensure_index("Id", unique=True)
    for event, elem in ET.iterparse(file_name, events=("start", "end")):
        if event == 'start':
            post_type = elem.get('PostTypeId')
            if post_type == '1':
                title = elem.get('Title')
                tags = elem.get('Tags')
                post_id = elem.get('Id')
                if POSTS_DB.find({"Id": post_id}).count() == 0:
                    rep_id = POSTS_DB.insert(elem.attrib, w=0)
                    questions_cnt += 1
                else:
                    # report the post id (the original concatenated the builtin `id`)
                    print 'id ' + post_id + ' already exists'
        elem.clear()
    print 'inserted:' + str(questions_cnt)

    for event, elem in ET.iterparse(file_name, events=("start", "end")):
                end_tag = str[index:].index('>')
                yield str[(index + 1):(index + end_tag)]
                index += end_tag + 1
            except ValueError:
                raise Exception("Tag parsing error in \"%s\"" % str)
        else:
            raise Exception("Tag parsing error in \"%s\"" % str)


if len(sys.argv) != 2:
    raise Exception("Usage: %s so-files-directory" % sys.argv[0])
os.chdir(sys.argv[1])

filename = "Posts.xml"
posts = ElementTree.iterparse(filename)

tags = {}
tag_id = 1

print "COPY posts (id, type, creation, score, viewcount, title, body, userid, lastactivity, tags, answercount, commentcount) FROM stdin;"

for event, post in posts:
    if event == "end" and post.tag == "row":
        id = int(post.attrib["Id"])
        if post.attrib.has_key("PostTypeId"):
            type = int(post.attrib["PostTypeId"])
        else:
            type = "\N"
        creation = post.attrib["CreationDate"]
def processArticleFiles(filelist, outFile, processFunction): if not isinstance(filelist, list): filelist = [filelist] # Go through the list of filenames and open each one for filename in filelist: with open(filename, 'r') as openfile: # Skip to the article element in the file for event, elem in etree.iterparse(openfile, events=('start', 'end', 'start-ns', 'end-ns')): if (event == 'end' and elem.tag == 'article'): pmidText, pmcidText, doiText, pubYear = getMetaInfoForPMCArticle( elem) # We're going to process the main article along with any subarticles # And if any of the subarticles have distinguishing IDs (e.g. PMID), then # that'll be used, otherwise the parent article IDs will be used subarticles = [elem] + elem.findall('./sub-article') for articleElem in subarticles: if articleElem == elem: # This is the main parent article. Just use its IDs subPmidText, subPmcidText, subDoiText, subPubYear = pmidText, pmcidText, doiText, pubYear else: # Check if this subarticle has any distinguishing IDs and use them instead subPmidText, subPmcidText, subDoiText, subPubYear = getMetaInfoForPMCArticle( articleElem) if subPmidText == '' and subPmcidText == '' and subDoiText == '': subPmidText, subPmcidText, subDoiText = pmidText, pmcidText, doiText if subPubYear == '': subPubYear = pubYear # Information about the source of this text textSourceInfo = { 'pmid': subPmidText, 'pmcid': subPmcidText, 'doi': subDoiText, 'pubYear': subPubYear } # Extract the title of paper title = articleElem.findall( './front/article-meta/title-group/article-title' ) + articleElem.findall( './front-stub/title-group/article-title') assert len(title) <= 1 titleText = extractTextFromElemList(title) titleText = [ removeWeirdBracketsFromOldTitles(t) for t in titleText ] # Get the subtitle (if it's there) subtitle = articleElem.findall( './front/article-meta/title-group/subtitle' ) + articleElem.findall( './front-stub/title-group/subtitle') subtitleText = extractTextFromElemList(subtitle) subtitleText = [ removeWeirdBracketsFromOldTitles(t) for t in subtitleText ] # Extract the abstract from the paper abstract = articleElem.findall( './front/article-meta/abstract' ) + articleElem.findall('./front-stub/abstract') abstractText = extractTextFromElemList(abstract) # Extract the full text from the paper as well as supplementaries and floating blocks of text articleText = extractTextFromElemList( articleElem.findall('./body')) backText = extractTextFromElemList( articleElem.findall('./back')) floatingText = extractTextFromElemList( articleElem.findall('./floats-group')) # Combine all the text we want to process allText = titleText + subtitleText + abstractText + articleText + backText + floatingText allText = [t for t in allText if len(t) > 0] allText = [htmlUnescape(t) for t in allText] allText = [ removeBracketsWithoutWords(t) for t in allText ] # Get the co-occurrences using a single list processFunction(outFile, allText, textSourceInfo) # Less important here (compared to abstracts) as each article file is not too big elem.clear()
def audit(osm_file): audit_results = {} field_types = {'node': {}, 'node_tags': {}, 'way': {}, 'way_tags': {}, 'way_nodes': {}} field_validity = {'node': {}, 'node_tags': {}, 'way': {}, 'way_tags': {}, 'way_nodes': {}} name_en_re = re.compile(r'\b\S+\.?$', re.IGNORECASE) expected = ['Road', 'Street', 'Expressway', 'Bridge', 'Highway', 'River', 'Lake', "Hutong" 'Park', 'Zone', 'Area', 'Alley', 'Market', 'Campus', 'Gate', 'Hall', 'Engineering', 'China', 'Elegance', 'Avenue', 'Mansion', 'Square', 'Palace', 'Hotel', 'Rail', 'Quarter', "Building", "Line", "Apartment", "Airport", "Institute", "College"] # for a given tag, update the set of its field types based on current element def update_field_types(e, tag): for field in FIELDS[tag]: if field not in field_types[tag]: field_types[tag][field] = set() field_types[tag][field].add(detect_type(e.attrib[field])) # 验证时间标记 def validate_timestamp(e, tag): timestamp = parse_datetime(e.attrib['timestamp']) if timestamp is not None: if timestamp < field_validity[tag]['timestamp'][0]: field_validity[tag]['timestamp'][0] = timestamp if timestamp > field_validity[tag]['timestamp'][1]: field_validity[tag]['timestamp'][1] = timestamp # 验证经纬度和时间信息是否有效 def validate_node(e, tag): if field_validity[tag] == {}: field_validity[tag] = { 'lat': [90, 0], 'lon': [180, -180], 'timestamp': [datetime.now(), datetime(1970, 1, 1, 0, 0, 0)] } lat = parse_float(e.attrib['lat']) if lat is not None: if lat < field_validity[tag]['lat'][0]: field_validity[tag]['lat'][0] = lat if lat > field_validity[tag]['lat'][1]: field_validity[tag]['lat'][1] = lat lon = parse_float(e.attrib['lon']) if lon is not None: if lon < field_validity[tag]['lon'][0]: field_validity[tag]['lon'][0] = lon if lon > field_validity[tag]['lon'][1]: field_validity[tag]['lon'][1] = lon validate_timestamp(e, tag) # 验证 way 的 timestamp 是否有效 def validate_way(e, tag): if field_validity[tag] == {}: field_validity[tag] = { 'timestamp': [datetime.now(), datetime(1970, 1, 1, 0, 0, 0)] } validate_timestamp(e, tag) # 验证 postcode 格式是否正确 def validate_postcode(e, tag): if e.attrib['k'] == 'addr:postcode': postcode = e.attrib['v'] if not check_postcode(postcode): field_validity[tag]['postcode'].add(postcode) def validate_node_tags(e, tag): if field_validity[tag] == {}: field_validity[tag] = { 'postcode': set() } validate_postcode(e, tag) # 验证 name:en def validate_way_name_en(e, tag): if e.attrib['k'] == 'name:en': name_en = e.attrib['v'] m = name_en_re.search(name_en) if m: way_type = m.group() if way_type not in expected: field_validity[tag]['name_en'][way_type].add(name_en) # 验证 way 的 'name_en'、'postcode' def validate_way_tags(e, tag): if field_validity[tag] == {}: field_validity[tag] = { 'name_en': defaultdict(set), 'postcode': set() } validate_way_name_en(e, tag) validate_postcode(e, tag) for _, ele in ET.iterparse(osm_file): if ele.tag == 'node': update_field_types(ele, 'node') validate_node(ele, 'node') for e_tag in ele.iter('tag'): update_field_types(e_tag, 'node_tags') validate_node_tags(e_tag, 'node_tags') if ele.tag == 'way': update_field_types(ele, 'way') validate_way(ele, 'way') for e_tag in ele.iter('tag'): update_field_types(e_tag, 'way_tags') validate_way_tags(e_tag, 'way_tags') for e_nd in ele.iter('nd'): update_field_types(e_nd, 'way_nodes') audit_results['field_types'] = field_types audit_results['field_validity'] = field_validity return audit_results
def process_medline_file( source: Union[str, TextIO], tag_handlers: Dict[str, TagHandlerFunction] = {}) -> Iterable[MedlineArticle]: """ Args: source: path to the MEDLINE xml file """ for event, elem in etree.iterparse(source, events=("start", "end", "start-ns", "end-ns")): if event == "end" and elem.tag == "PubmedArticle": # MedlineCitation'): # Try to extract the pmid_id pmid_field = elem.find("./MedlineCitation/PMID") assert pmid_field is not None pmid = pmid_field.text journal_year, journal_month, journal_day = get_journal_date_for_medline_file( elem, pmid) entry_year, entry_month, entry_day = get_pubmed_entry_date( elem, pmid) jComparison = tuple( 9999 if d is None else d for d in [journal_year, journal_month, journal_day]) eComparison = tuple(9999 if d is None else d for d in [entry_year, entry_month, entry_day]) if ( jComparison < eComparison ): # The PubMed entry has been delayed for some reason so let's try the journal data pub_year, pub_month, pub_day = journal_year, journal_month, journal_day else: pub_year, pub_month, pub_day = entry_year, entry_month, entry_day # Extract the authors author_elems = elem.findall( "./MedlineCitation/Article/AuthorList/Author") authors = [] for author_elem in author_elems: forename = author_elem.find("./ForeName") lastname = author_elem.find("./LastName") collectivename = author_elem.find("./CollectiveName") name = None if (forename is not None and lastname is not None and forename.text is not None and lastname.text is not None): name = "%s %s" % (forename.text, lastname.text) elif lastname is not None and lastname.text is not None: name = lastname.text elif forename is not None and forename.text is not None: name = forename.text elif collectivename is not None and collectivename.text is not None: name = collectivename.text else: raise RuntimeError( "Unable to find authors in Pubmed citation (PMID=%s)" % pmid) authors.append(name) chemicals = [] chemical_elems = elem.findall( "./MedlineCitation/ChemicalList/Chemical/NameOfSubstance") for chemical_elem in chemical_elems: chem_id = chemical_elem.attrib["UI"] name = chemical_elem.text # chemicals.append((chem_id,name)) chemicals.append("%s|%s" % (chem_id, name)) chemicals_txt = "\t".join(chemicals) mesh_headings = [] mesh_elems = elem.findall( "./MedlineCitation/MeshHeadingList/MeshHeading") for mesh_elem in mesh_elems: descriptor_elem = mesh_elem.find("./DescriptorName") mesh_id = descriptor_elem.attrib["UI"] major_topic_yn = descriptor_elem.attrib["MajorTopicYN"] name = descriptor_elem.text assert "|" not in mesh_id and "~" not in mesh_id, "Found delimiter in %s" % mesh_id assert "|" not in major_topic_yn and "~" not in major_topic_yn, ( "Found delimiter in %s" % major_topic_yn) assert "|" not in name and "~" not in name, "Found delimiter in %s" % name mesh_heading = "Descriptor|%s|%s|%s" % (mesh_id, major_topic_yn, name) qualifier_elems = mesh_elem.findall("./QualifierName") for qualifier_elem in qualifier_elems: mesh_id = qualifier_elem.attrib["UI"] major_topic_yn = qualifier_elem.attrib["MajorTopicYN"] name = qualifier_elem.text assert "|" not in mesh_id and "~" not in mesh_id, ( "Found delimiter in %s" % mesh_id) assert "|" not in major_topic_yn and "~" not in major_topic_yn, ( "Found delimiter in %s" % major_topic_yn) assert "|" not in name and "~" not in name, "Found delimiter in %s" % name mesh_heading += "~Qualifier|%s|%s|%s" % ( mesh_id, major_topic_yn, name) mesh_headings.append(mesh_heading) mesh_headings_txt = "\t".join(mesh_headings) supplementary_concepts = [] concept_elems = 
elem.findall( "./MedlineCitation/SupplMeshList/SupplMeshName") for concept_elem in concept_elems: concept_id = concept_elem.attrib["UI"] concept_type = concept_elem.attrib["Type"] concept_name = concept_elem.text # supplementary_concepts.append((concept_id,concept_type,concept_name)) supplementary_concepts.append( "%s|%s|%s" % (concept_id, concept_type, concept_name)) supplementary_concepts_txt = "\t".join(supplementary_concepts) doi_elems = elem.findall( "./PubmedData/ArticleIdList/ArticleId[@IdType='doi']") dois = [ doi_elem.text for doi_elem in doi_elems if doi_elem.text and doi_regex.match(doi_elem.text) ] doi = None if dois: doi = dois[0] # We'll just use the first DOI provided pmc_elems = elem.findall( "./PubmedData/ArticleIdList/ArticleId[@IdType='pmc']") assert len(pmc_elems) <= 1, "Found more than one PMCID with PMID: %s" % pmid pmcid = None if len(pmc_elems) == 1: pmcid = pmc_elems[0].text pub_type_elems = elem.findall( "./MedlineCitation/Article/PublicationTypeList/PublicationType" ) pub_type = [ e.text for e in pub_type_elems if e.text not in pub_type_skips ] pub_type_txt = "|".join(pub_type) # Extract the title of the paper title = elem.findall("./MedlineCitation/Article/ArticleTitle") title_text = extract_text_chunks(title, tag_handlers=tag_handlers) title_text = [ remove_weird_brackets_from_old_titles(chunk.text) for chunk in title_text if chunk.text ] title_text = [t for t in title_text if len(t) > 0] title_text = [html.unescape(t) for t in title_text] title_text = [remove_brackets_without_words(t) for t in title_text] # Extract the abstract from the paper abstract = elem.findall( "./MedlineCitation/Article/Abstract/AbstractText") abstract_text = extract_text_chunks(abstract, tag_handlers=tag_handlers) abstract_text = [ chunk.text for chunk in abstract_text if len(chunk.text) > 0 ] abstract_text = [html.unescape(t) for t in abstract_text] abstract_text = [ remove_brackets_without_words(t) for t in abstract_text ] journal_title_fields = elem.findall( "./MedlineCitation/Article/Journal/Title") journal_title_iso_fields = elem.findall( "./MedlineCitation/Article/Journal/ISOAbbreviation") journal_title, journal_iso_title = "", "" assert len(journal_title_fields) <= 1, "Error with pmid=%s" % pmid assert len(journal_title_iso_fields) <= 1, "Error with pmid=%s" % pmid if journal_title_fields: journal_title = journal_title_fields[0].text if journal_title_iso_fields: journal_iso_title = journal_title_iso_fields[0].text document = {} document["pmid"] = pmid document["pmcid"] = pmcid document["doi"] = doi document["pubYear"] = pub_year document["pubMonth"] = pub_month document["pubDay"] = pub_day document["title"] = title_text document["abstract"] = abstract_text document["journal"] = journal_title document["journalISO"] = journal_iso_title document["authors"] = authors document["chemicals"] = chemicals_txt document["meshHeadings"] = mesh_headings_txt document["supplementaryMesh"] = supplementary_concepts_txt document["publicationTypes"] = pub_type_txt yield MedlineArticle(document) # Important: clear the current element from memory to keep memory usage low elem.clear()
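# A minimal consumption sketch for process_medline_file above, assuming that
# MedlineArticle exposes the assembled document fields as attributes
# (pmid, title, abstract, ...); the input file name is hypothetical.
for article in process_medline_file('pubmed_sample.xml'):
    # title is a list of cleaned text chunks, so join it for display
    print('%s\t%s' % (article.pmid, ' '.join(article.title)))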
def audit(): for event, elem in ET.iterparse(osm_file): if is_street_name(elem): audit_street_type(street_types, elem.attrib['v']) print_sorted_dict(street_types) if __name__ == '__main__': audit() #mapparser.py import xml.etree.cElementTree as ET # import the ElementTree C implementation as ET import pprint def count_tags(filename): elem_dict = {}.fromkeys(('bounds','member','nd','node','osm','relation','tag','way'),0) # create a dict pre-filled with the expected tag names, each count initialized to 0 for _, elem in ET.iterparse(filename, events=("start",)): # iterate over the data file if elem.tag in elem_dict: # check whether the tag name is one of the keys already in the dict elem_dict[elem.tag] += 1 # increment that key's count else: elem_dict[elem.tag] = 1 # why does the [] here use elem.tag directly rather than a quoted '' name? return elem_dict def test(): tags = count_tags('example.osm') pprint.pprint(tags) assert tags == {'bounds': 1, 'member': 3, 'nd': 4, 'node': 20, 'osm': 1, 'relation': 1, 'tag': 7, 'way': 1} if __name__ == "__main__": test() # Iterating through Ways Tags import xml.etree.cElementTree as ET from collections import defaultdict import re import pprint # Exercise: Tag Types import xml.etree.cElementTree as ET import pprint import re """ Your task is to explore the data a bit more. Before you process the data and add it into your database, you should check the "k" value for each "<tag>" and see if there are any potential problems. We have provided you with 3 regular expressions to check for certain patterns in the tags. As we saw in the quiz earlier, we would like to change the data model and expand the "addr:street" type of keys to a dictionary like this: {"address": {"street": "Some value"}} So, we have to see if we have such tags, and if we have any tags with problematic characters. Please complete the function 'key_type', such that we have a count of each of four tag categories in a dictionary: "lower", for tags that contain only lowercase letters and are valid, "lower_colon", for otherwise valid tags with a colon in their names, "problemchars", for tags with problematic characters, and "other", for other tags that do not fall into the other three categories. See the 'process_map' and 'test' functions for examples of the expected format. """ lower = re.compile(r'^([a-z]|_)*$') lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$') problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]') def key_type(element, keys): if element.tag == "tag": if lower.match(element.attrib['k']): keys["lower"] += 1 elif lower_colon.match(element.attrib['k']): keys["lower_colon"] += 1 elif problemchars.search(element.attrib['k']): keys["problemchars"] += 1 else: keys["other"] += 1 return keys def process_map(filename): keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0} for _, element in ET.iterparse(filename): keys = key_type(element, keys) return keys # Exercise: Exploring Users import xml.etree.cElementTree as ET import pprint import re """ Your task is to explore the data a bit more. The first task is a fun one - find out how many unique users have contributed to the map in this particular area! The function process_map should return a set of unique user IDs ("uid") """ def get_user(element): if 'uid' in element.attrib: return element.attrib['uid'] def process_map(filename): users = set() for _, element in ET.iterparse(filename): if get_user(element): users.add(get_user(element)) return users def test(): users = process_map('example.osm') pprint.pprint(users) assert len(users) == 6 if __name__ == "__main__": test() def test(): # You can use another testfile 'map.osm' to look at your solution # Note that the assertion below will be incorrect then.
# Note as well that the test function here is only used in the Test Run; # when you submit, your code will be checked against a different dataset. keys = process_map('example.osm') pprint.pprint(keys) assert keys == {'lower': 5, 'lower_colon': 0, 'other': 1, 'problemchars': 1} if __name__ == "__main__": test()
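# A quick illustration of the three regexes used by key_type above, applied to a
# few made-up example keys; this is only a demo of how the four categories are
# assigned and is not part of the original exercises.
for k in ['highway', 'addr:street', 'name_1', 'odd key?']:
    if lower.match(k):
        category = 'lower'
    elif lower_colon.match(k):
        category = 'lower_colon'
    elif problemchars.search(k):
        category = 'problemchars'
    else:
        category = 'other'
    print('{} -> {}'.format(k, category))
# prints: highway -> lower, addr:street -> lower_colon, name_1 -> other, odd key? -> problemchars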
# Helps to identify important tag keys in the sample data. # Here we will create a dictionary whose keys are the tag key names and whose values are the number of times each has appeared. import xml.etree.cElementTree as ET import pprint import re filename = 'sample20.osm' node_tag_keys = dict() for _, element in ET.iterparse(filename): if element.tag == 'node': for e in element: if e.attrib['k'] in node_tag_keys: node_tag_keys[e.attrib['k']] += 1 else: node_tag_keys[e.attrib['k']] = 1 for nodes in node_tag_keys: # if a key appears at least 50 times, print its name and count if node_tag_keys[nodes] >= 50: print('{}->{}'.format(nodes, node_tag_keys[nodes]))
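# An alternative sketch of the same tag-key census using collections.Counter,
# shown only to illustrate the design choice; it assumes the same 'sample20.osm'
# file and the same >= 50 reporting threshold as above.
import xml.etree.ElementTree as ET
from collections import Counter

node_tag_key_counts = Counter()
for _, element in ET.iterparse('sample20.osm'):
    if element.tag == 'node':
        # count the 'k' attribute of each child tag of the node
        node_tag_key_counts.update(child.attrib['k'] for child in element if 'k' in child.attrib)

for key, count in node_tag_key_counts.most_common():
    if count >= 50:
        print('{}->{}'.format(key, count))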
def process_map(filename): check = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0} for event, element in ET.iterparse(filename, events=("start", )): check = key_check(element, check) return check
kwargs['failfunc'] = failfunc kwargs['async'] = True try: mdpath[repo] = repo._retrieveMD(name, **kwargs) except RepoError, e: failfunc(e) if async: grabber.parallel_wait() # parse metadata, create DeltaPackage instances for repo, cpath in mdpath.items(): pinfo_repo = pinfo[repo] path = repo_gen_decompress(cpath, 'prestodelta.xml', cached=repo.cache) for ev, el in iterparse(path): if el.tag != 'newpackage': continue name = el.get('name') arch = el.get('arch') new = name, arch, el.get('epoch'), el.get('version'), el.get( 'release') index = pinfo_repo.get(new) if index is not None: po = pkgs[index] perc = repo.deltarpm_percentage if perc is None: perc = ayum.conf.deltarpm_percentage best = po.size * (perc / 100.0) have = oldrpms.get(repo, {}).get((name, arch), {}) for el in el.findall('delta'): size = int(el.find('size').text)
# Do we have what we need? if not file_ref or not file_hyp: print help_str exit(0) # Totals # System t_counts_sys = Counter({ D:Counter({TP:0,TN:0,FP:0,FN:0,FPN:0}), C:Counter({TP:0,TN:0,FP:0,FN:0,FPN:0}) }) # Baseline t_counts_base = Counter({ D:Counter({TP:0,TN:0,FP:0,FN:0,FPN:0}), C:Counter({TP:0,TN:0,FP:0,FN:0,FPN:0}) }) cg = CG.CandidateGenerator() f_hyp = open(file_hyp,"r") context = ET.iterparse(file_ref, events=("start", "end")) context = iter(context) event, root = context.next() print_info(file_ref, file_hyp, max_a, max_m, optimise, b, w) # Show results per sentence? if per_sent: print "\nSENTENCE RESULTS" print_header(per_sent) # Read gold standard and process each sentence for event, elem in context: if event == "end": if elem.tag == "sentence": sid = elem.get("id") # Sentence ID
def xml_parse(xm_file, ifilter, tfilter, nfilter, list): """ Function for parsing XML files created by DNSRecon and applying filters. """ iplist = [] for event, elem in cElementTree.iterparse(xm_file): # Check if it is a record if elem.tag == "record": # Check that it is an RR Type that has an IP Address if "address" in elem.attrib: # Check if the IP is in the filter list of IPs to ignore if (len(ifilter) == 0 or IPAddress(elem.attrib['address']) in ifilter) and (elem.attrib['address'] != "no_ip"): # Check the RR Type against the type filter if re.match(tfilter, elem.attrib['type'], re.I): # Process A, AAAA and PTR Records if re.search(r'PTR|^[A]$|AAAA', elem.attrib['type']) \ and re.search(nfilter, elem.attrib['name'], re.I): if list: if elem.attrib['address'] not in iplist: print elem.attrib['address'] else: print_good("{0} {1} {2}".format(elem.attrib['type'], elem.attrib['name'], elem.attrib['address'])) # Process NS Records elif re.search(r'NS', elem.attrib['type']) and \ re.search(nfilter, elem.attrib['target'], re.I): if list: if elem.attrib['address'] not in iplist: iplist.append(elem.attrib['address']) else: print_good("{0} {1} {2}".format(elem.attrib['type'], elem.attrib['target'], elem.attrib['address'])) # Process SOA Records elif re.search(r'SOA', elem.attrib['type']) and \ re.search(nfilter, elem.attrib['mname'], re.I): if list: if elem.attrib['address'] not in iplist: iplist.append(elem.attrib['address']) else: print_good("{0} {1} {2}".format(elem.attrib['type'], elem.attrib['mname'], elem.attrib['address'])) # Process MX Records elif re.search(r'MX', elem.attrib['type']) and \ re.search(nfilter, elem.attrib['exchange'], re.I): if list: if elem.attrib['address'] not in iplist: iplist.append(elem.attrib['address']) else: print_good("{0} {1} {2}".format(elem.attrib['type'], elem.attrib['exchange'], elem.attrib['address'])) # Process SRV Records elif re.search(r'SRV', elem.attrib['type']) and \ re.search(nfilter, elem.attrib['target'], re.I): if list: if elem.attrib['address'] not in iplist: iplist.append(elem.attrib['address']) else: print_good("{0} {1} {2} {3}".format(elem.attrib['type'], elem.attrib['name'], elem.attrib['address'], elem.attrib['target'], elem.attrib['port'])) else: if re.match(tfilter, elem.attrib['type'], re.I): # Process TXT and SPF Records if re.search(r'TXT|SPF', elem.attrib['type']): if not list: print_good("{0} {1}".format(elem.attrib['type'], elem.attrib['strings'])) # Process IPs in list if len(iplist) > 0: try: for ip in filter(None, iplist): print_line(ip) except IOError: sys.exit(0)
def parse(self): for event, elem in ElementTree.iterparse(self.dump): if elem.tag == "artist": artist = self.proc_artist(elem) self.download_artist(artist)
def process_map(filename): keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0} for _, element in ET.iterparse(filename): keys = key_type(element, keys) return keys
def process_users(filename): users = [] for _, element in ET.iterparse(filename): if "user" in element.attrib: users.append(element.attrib["user"]) return users
def iterative_files_changelog_parser(file_extension, filelists_xml_path, other_xml_path): """ Iteratively parse filelists.xml and other.xml, to avoid over-use of memory. createrepo_c parses everything in bulk, into memory. For large repositories such as RHEL 7 or OL 7, this can require more than 5gb of memory. That isn't acceptable, especially when many repositories are being synced at once. The main offenders are other.xml (changelogs) and filelists.xml (list of owned files). These also happen to be relatively easy to parse. This function, ported from Pulp 2, takes a path to filelists.xml and other.xml, creates a streaming parser for each, and then yields one package worth of data from each file. """ # it's basically always gzip, but we'll cover our bases w/ all the possibilities if file_extension == "gz": open_func = gzip.open elif file_extension == "xz": open_func = lzma.open elif file_extension == "bz2": open_func = bz2.open elif file_extension == "xml": open_func = open else: raise TypeError("Unknown metadata compression type") # TODO: zstd with open_func(filelists_xml_path) as filelists_xml, open_func( other_xml_path) as other_xml: filelists_parser = iterparse(filelists_xml, events=("start", "end")) filelists_xml_iterator = iter(filelists_parser) other_parser = iterparse(other_xml, events=("start", "end")) other_xml_iterator = iter(other_parser) # get a hold of the root element so we can clear it # this prevents the entire parsed document from building up in memory try: filelists_root_element = next(filelists_xml_iterator)[1] other_root_element = next(other_xml_iterator)[1] # I know. This is a terrible misuse of SyntaxError. Don't blame the messenger. except SyntaxError: log.error("failed to parse XML metadata file") raise while True: for event, filelists_element in filelists_xml_iterator: # if we're not at a fully parsed package element, keep going if event != "end": continue # make this work whether the file has namespace as part of the tag or not if not (filelists_element.tag == "package" or re.sub( NS_STRIP_RE, "", filelists_element.tag) == "package"): continue break for event, other_element in other_xml_iterator: # if we're not at a fully parsed package element, keep going if event != "end": continue # make this work whether the file has namespace as part of the tag or not if not (other_element.tag == "package" or re.sub( NS_STRIP_RE, "", other_element.tag) == "package"): continue break (filelists_pkgid, files) = process_filelists_package_element(filelists_element) (other_pkgid, changelogs) = process_other_package_element(other_element) filelists_root_element.clear() # clear all previously parsed ancestors of the root other_root_element.clear() assert ( filelists_pkgid == other_pkgid ), "Package id for filelists.xml ({}) and other.xml ({}) do not match".format( filelists_pkgid, other_pkgid) yield filelists_pkgid, files, changelogs
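# A hedged consumption sketch for iterative_files_changelog_parser above. As written,
# the generator keeps looking for the next <package> pair rather than stopping cleanly,
# so this sketch assumes the caller already knows how many packages to expect (e.g. from
# the primary metadata) and pulls exactly that many items; the paths and package_count
# below are placeholders.
import itertools

package_count = 3  # placeholder; in real use this comes from other repo metadata
parser = iterative_files_changelog_parser(
    "gz", "repodata/filelists.xml.gz", "repodata/other.xml.gz"
)
for pkgid, files, changelogs in itertools.islice(parser, package_count):
    print(pkgid, len(files), len(changelogs))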
def audit(): osm_file = open(OSMFILE, encoding='utf8') for event, elem in ET.iterparse(osm_file): if is_phone_number(elem): audit_phone_numbers(elem.attrib["v"]) osm_file.close()