def parse_positions(self):
    doc_context = etree.iterparse(self.filename, events=('end',),
                                  tag='{http://earth.google.com/kml/2.2}Document',
                                  encoding='utf-8')
    context = etree.iterparse(self.filename, events=('end',),
                              tag='{http://earth.google.com/kml/2.2}Placemark',
                              encoding='utf-8')
    collarname = None
    xp = etree.XPath("//*[local-name()='name']/text()")
    for action, elem in doc_context:
        collarname_result = xp(elem)
        if len(collarname_result) > 0:
            collarname = collarname_result[0]
            break
    else:
        raise Exception("Error getting collarname")
    if not collarname:
        raise Exception("No collarname found")
    try:
        collar = Collar.objects.get(serial=collarname)
    except ObjectDoesNotExist:
        collar = Collar.objects.create(serial=collarname)
        collar.save()
    except Exception, e:
        raise Exception("DB Error: %s" % e)
def users_xmldata():
    """Extracts data from XML file and groups it by user_name."""
    locale.setlocale(locale.LC_COLLATE, 'pl_PL.utf-8')
    data = {}
    with open(app.config['DATA_XML'], 'r') as xml_file:
        for event, element in etree.iterparse(xml_file, tag='server'):
            protocol = element.findtext('protocol')
            host = element.findtext('host')
            port = element.findtext('port')
        xml_file.seek(0)
        for event, element in etree.iterparse(xml_file, tag='user'):
            if element.tag == 'user':
                user_id = element.attrib.get('id')
                image = element.findtext('avatar')
                link_to_avatar = '{}://{}:{}{}'.format(protocol, host, port, image)
                user_name = element.findtext('name')
                user_name = user_name.encode('utf-8')
                data[user_id] = {
                    'user_name': user_name,
                    'link_to_avatar': link_to_avatar,
                }
    return sorted(data.iteritems(),
                  key=lambda x: x[1]['user_name'],
                  cmp=locale.strcoll)
def dump_tables(table_names, anatomy, xml_path, dump_path, dump_database_name,
                log_filename='dump.log'):
    logging.basicConfig(filename=os.path.join(dump_path, log_filename),
                        level=logging.INFO)
    db = sqlite3.connect(os.path.join(dump_path, dump_database_name))
    for table_name in table_names:
        print "Opening {0}.xml".format(table_name)
        with open(os.path.join(xml_path, table_name + '.xml')) as xml_file:
            tree = etree.iterparse(xml_file)
            sql_create = CREATE_QUERY.format(
                table=table_name,
                fields=", ".join(['{0} {1}'.format(name, type)
                                  for name, type in anatomy[table_name].items()]))
            print 'Creating table {0}'.format(table_name)
            try:
                logging.info(sql_create)
                db.execute(sql_create)
            except Exception, e:
                logging.warning(e)
            for _, row in etree.iterparse(xml_file, tag="row"):
                try:
                    logging.debug(row.attrib.keys())
                    db.execute(INSERT_QUERY.format(
                        table=table_name,
                        columns=', '.join(row.attrib.keys()),
                        values=('?, ' * len(row.attrib.keys()))[:-2]),
                        row.attrib.values())
                    print ".",
                except Exception, e:
                    logging.warning(e)
                    print "x",
                finally:
                    # body missing in the original snippet; clearing the
                    # parsed row is the usual cleanup to keep memory flat
                    row.clear()
def check(self, filename='FORCE_CONSTANTS'):
    ref = io.read('SPOSCAR')
    files = shell_exec("ls dirs").split('\n')
    fc2 = readfc2(filename)
    np.set_printoptions(precision=2, suppress=True)
    vasprunxml = "dir_SPOSCAR/vasprun.xml"
    if exists(vasprunxml):
        vasprun = etree.iterparse(vasprunxml, tag='varray')
        forces0 = parseVasprun(vasprun, 'forces')
        print(forces0.max())
    else:
        forces0 = 0.0
    for file in files:
        print(file)
        POSCAR = 'dirs/%s/POSCAR' % file
        vasprunxml = "dirs/%s/vasprun.xml" % file
        atoms = io.read(POSCAR)
        u = atoms.positions - ref.positions
        f = -np.einsum('ijkl,jl', fc2, u)
        vasprun = etree.iterparse(vasprunxml, tag='varray')
        forces = parseVasprun(vasprun, 'forces') - forces0
        print(np.abs(f).max(), "\n")
        print(np.abs(forces - f).max())
        print(np.allclose(f, forces, atol=1e-2))
def UserUpdate(self, *args, **kw):
    if not kw.has_key('data'):
        raise ValueError, "No data passed to UserGet, got %s / %s" % (args, kw)
    # Must check xml validity here
    # parse the data to retrieve the user id
    context = etree.iterparse(StringIO(str(kw['data'])), events=('end',), tag="OxID")
    for event, element in context:
        user_id = element.text
    # retrieve the person inside the test module
    person_list = self.context.getPortalObject().oxatis_test_module.searchFolder(
        reference=user_id, portal_type="Oxatis Test Person")
    if len(person_list) != 1:
        raise KeyError(user_id)
    else:
        person = person_list[0].getObject()
    context = etree.iterparse(StringIO(str(kw['data'])), events=('end',))
    person_dict = {}
    for event, element in context:
        if element.text is None:
            person_dict[element.tag.lower()] = ""
        else:
            person_dict[element.tag.lower()] = element.text
    person_dict.pop('oxid')
    LOG("editing person %s with %s" % (person.getPath(), person_dict,), 300, "\n")
    person.edit(**person_dict)
    transaction.commit()
    # Return default xml
    root = self.generateResultHeader()
    xml = etree.tostring(root, pretty_print=True)
    return "", xml
def load_data(ksj_file, ksj_name, ksj_parser,
              host='192.168.1.10', port=27017, db='test', collection='geo'):
    mongo = pymongo.MongoClient(host=host, port=port)
    geo_collection = mongo[db][collection]
    point_context = etree.iterparse(
        ksj_file,
        events={"end"},
        tag="{http://www.opengis.net/gml/3.2}Point",
        recover=True
    )
    point_dict = {}
    for _, point in point_context:
        point_id = point.get("{http://www.opengis.net/gml/3.2}id")
        point_loc = point.find("gml:pos", namespaces=schema.namespaces).text
        point_dict[point_id] = [float(p) for p in point_loc.split()]
    ksj_context = etree.iterparse(
        ksj_file,
        events={"end"},
        tag="{http://nlftp.mlit.go.jp/ksj/schemas/ksj-app}%s" % ksj_name,
        recover=True
    )
    for _, ksj in ksj_context:
        geo_collection.insert_one(ksj_parser.parse(ksj, point_dict))
def write_FORCE_SETS_vasp(forces_filenames, displacements, filename='FORCE_SETS',
                          is_zero_point=False, verbose=True):
    try:
        from lxml import etree
    except ImportError:
        print "You need to install python-lxml."
        sys.exit(1)
    if verbose:
        print "counter (file index):",
    num_atom = displacements['natom']
    count = 0
    are_files_correct = True
    if is_zero_point:
        force_files = forces_filenames[1:]
        if vasp.is_version528(forces_filenames[0]):
            zero_forces = vasp.get_forces_vasprun_xml(etree.iterparse(
                vasp.VasprunWrapper(forces_filenames[0]), tag='varray'))
        else:
            zero_forces = vasp.get_forces_vasprun_xml(
                etree.iterparse(forces_filenames[0], tag='varray'))
        if verbose:
            print "%d" % (count + 1),
        count += 1
        if not check_forces(zero_forces, num_atom, forces_filenames[0]):
            are_files_correct = False
    else:
        force_files = forces_filenames
        zero_forces = None
    for i, disp in enumerate(displacements['first_atoms']):
        if vasp.is_version528(force_files[i]):
            disp['forces'] = vasp.get_forces_vasprun_xml(etree.iterparse(
                vasp.VasprunWrapper(force_files[i]), tag='varray'))
        else:
            disp['forces'] = vasp.get_forces_vasprun_xml(
                etree.iterparse(force_files[i], tag='varray'))
        if verbose:
            print "%d" % (count + 1),
        count += 1
        if not check_forces(disp['forces'], num_atom, force_files[i]):
            are_files_correct = False
    if verbose:
        print
    write_FORCE_SETS(displacements, filename=filename, zero_forces=zero_forces)
    return are_files_correct
def __init__(self, users, badges, posts):
    self.users = users
    self.badges = badges
    self.posts = posts
    self.summary = dict.fromkeys(
        ["epic", "famous", "questions", "answers", "accepted", "users"], 0)
    self.userContext = etree.iterparse(self.users)
    self.badgeContext = etree.iterparse(self.badges)
    self.postContext = etree.iterparse(self.posts)
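A minimal sketch (not part of the original class) of how one of the stored contexts above might be consumed lazily; the per-record "row" tag of Stack Exchange dumps and a `count_badges` method name are assumptions.

def count_badges(self):
    count = 0
    for _, elem in self.badgeContext:
        if elem.tag == "row":  # Stack Exchange dumps keep one record per <row>
            count += 1
        elem.clear()  # free each element so the whole dump never sits in RAM
    return count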
def visualize(iabook):
    # scandata = objectify.parse(iabook.get_scandata_path()).getroot()
    scandata = iabook.get_scandata()
    if opts.source == 'abbyy':
        context = etree.iterparse(iabook.get_abbyy(), tag=abbyyns + 'page')
    elif opts.source == 'pdfxml':
        context = etree.iterparse(iabook.get_pdfxml_xml(), tag='PAGE')
    elif opts.source == 'djvu':
        context = etree.iterparse(iabook.get_djvu_xml(), tag='OBJECT')
    info = scan_pages(context, scandata, iabook)
def write_FORCES(lattice, forces_filenames, displacements, filename='FORCE_SETS',
                 amplitude=0.01, mode='vasp', is_zero_point=False,
                 is_fropho_disp=False):
    if mode == "vasp":
        try:
            from lxml import etree
        except ImportError:
            print "You need to install python-lxml."
            sys.exit(1)
    if is_zero_point:
        force_files = forces_filenames[1:]
        if mode == "wien2k":
            zero_forces = wien2k.get_forces_wien2k(forces_filenames[0], lattice)
        else:  # "vasp" case
            zero_forces = vasp.get_forces_vasprun_xml(
                etree.iterparse(vasp.VasprunWrapper(forces_filenames[0]),
                                tag='varray'))
    else:
        force_files = forces_filenames
        zero_forces = None
    displacements = sort_displacements(displacements)
    forces = []
    # Show progress
    print >> sys.stderr, "counter (file index):",
    for i in range(len(displacements)):
        if mode == "wien2k":
            forces.append(wien2k.get_forces_wien2k(force_files[i], lattice))
        else:  # vasp
            forces.append(vasp.get_forces_vasprun_xml(
                etree.iterparse(vasp.VasprunWrapper(force_files[i]),
                                tag='varray')))
    write_FORCES_from_forces(lattice, forces, displacements, amplitude,
                             filename, zero_forces, is_fropho_disp,
                             verbose=True)
    # Show progress
    print >> sys.stderr, "\n"
def exampleIterativeParsing():
    some_file_like = BytesIO(b"<root><a>data</a></root>")
    for event, element in etree.iterparse(some_file_like):
        print("%s, %4s, %s" % (event, element.tag, element.text))
    some_file_like.close()
    some_file_like = BytesIO(b"<root><a>data</a></root>")
    for event, element in etree.iterparse(some_file_like,
                                          events=("start", "end")):
        print("%s, %4s, %s" % (event, element.tag, element.text))
    some_file_like.close()
    some_file_like = BytesIO(b"<root><a>data</a></root>")
    tree = etree.parse(some_file_like)
    root = tree.getroot()
    return root
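A small companion sketch, assuming the same kind of BytesIO input as above: for large inputs the usual pattern is to clear each element after use, so the tree that iterparse builds in the background does not grow without bound.

from io import BytesIO
from lxml import etree

def example_iterparse_with_cleanup():
    source = BytesIO(b"<root><a>data</a><a>more</a></root>")
    for event, element in etree.iterparse(source, tag="a"):
        print(element.text)
        element.clear()                      # drop the element's own payload
        while element.getprevious() is not None:
            del element.getparent()[0]       # drop already-processed siblings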
def main(self):
    queued_docs = []
    process_f = {
        'membership': self.process_membership_doc,
        'person': self.process_person_doc
    }.get(self.args.element, self.process_doc)
    target_db = self.couchdb_client(self.args.db)
    datasource = None
    datetime = None
    # Start with scanning specifically for the properties element once.
    # The properties element appears at the top of the file for PeopleSoft data,
    # but at the end of the file for Destiny One data.
    context = etree.iterparse(self.args.file, events=('end',), tag='properties')
    for event, elem in context:
        assert elem.tag == 'properties'
        properties = self.etree_to_dict(elem)
        datasource = properties['datasource']
        datetime = properties['datetime']
        try:
            datetime_ = dt.strptime(datetime, '%Y-%m-%d %H:%M:%S')
            datetime_ = timezone('Canada/Mountain').localize(datetime_)
            datetime = datetime_.isoformat()
        except:
            raise
    # Then, scan for and parse the user specified elements,
    # batching up to batch elements before processing
    with open(self.args.file, 'r') as f:
        progress = progressbar.ProgressBar(maxval=os.path.getsize(self.args.file))
        progress.start()
        context = etree.iterparse(f, events=('end',), tag=self.args.element)
        for event, elem in context:
            assert elem.tag == self.args.element
            progress.update(f.tell())
            doc = self.etree_to_dict(elem)
            if not 'datasource' in doc:
                doc['datasource'] = datasource
            if not 'datetime' in doc:
                doc['datetime'] = datetime
            queued_docs.append(doc)
            if len(queued_docs) >= self.args.batch:
                self.process_documents(queued_docs, target_db, process_f)
                queued_docs = []
        progress.finish()
    if len(queued_docs) > 0:
        self.process_documents(queued_docs, target_db, process_f)
def read_force_constant_vasprun_xml(filename):
    import sys
    try:
        from lxml import etree
    except ImportError:
        print "You need to install python-lxml."
        sys.exit(1)
    if vasp.is_version528(filename):
        vasprun = etree.iterparse(vasp.VasprunWrapper(filename))
    else:
        vasprun = etree.iterparse(filename)
    return vasp.get_force_constants_vasprun_xml(vasprun)
def repomdmetadata_from_xml_factory(xmlpath):
    rm_obj = RepomdMetadata(xmlpath)
    for _, elements in etree.iterparse(xmlpath):
        elements = MyElement(elements)
        for elem in elements:
            elem = MyElement(elem)
            # Get revision
            if elem.tag.endswith("revision"):
                rm_obj.revision = elem.text
            # Parse tags
            if elem.tag.endswith("tags"):
                for subelem in elem:
                    if subelem.tag.endswith("content"):
                        rm_obj.tags.setdefault("content", set()).add(subelem.text)
                    if subelem.tag.endswith("repo"):
                        rm_obj.tags.setdefault("repo", set()).add(subelem.text)
                    if subelem.tag.endswith("distro"):
                        rm_obj.tags.setdefault("distro", set()).add(
                            (subelem.get("cpeid"), subelem.text))
    # Iter over data elements (<data type="primary">, ...)
    for _, elements in etree.iterparse(xmlpath, tag="%sdata" % MD_NS):
        elements = MyElement(elements)
        re = RepomdItem()
        re.name = elements.get("type")
        for elem in elements:
            elem = MyElement(elem)
            if elem.tag.endswith("location"):
                re.location_href = elem.get("href")
            elif elem.tag.endswith("open-size"):
                re.open_size = elem.text
            elif elem.tag.endswith("open-checksum"):
                re.open_checksum_type = elem.get("type")
                re.open_checksum = elem.text
            elif elem.tag.endswith("checksum"):
                re.checksum_type = elem.get("type")
                re.checksum = elem.text
            elif elem.tag.endswith("timestamp"):
                re.timestamp = elem.text
            elif elem.tag.endswith("size"):
                re.size = elem.text
            elif elem.tag.endswith("database_version"):
                re.database_version = elem.text
        elements.clear()
        rm_obj.append(re.name, re)
    return rm_obj
def get_codelist_data(elem=None, name=None):
    if not name:
        name = self.return_first_exist(elem.xpath('name/text()'))
        description = self.return_first_exist(elem.xpath('description/text()'))
        count = self.return_first_exist(elem.xpath('count/text()'))
        fields = self.return_first_exist(elem.xpath('fields/text()'))
        date_updated = datetime.datetime.now()
        if Codelist.objects.filter(name=name).exists():
            current_codelist = Codelist.objects.get(name=name)
            current_codelist.date_updated = date_updated
            current_codelist.description = description
            current_codelist.count = count
            current_codelist.fields = fields
            current_codelist.save()
        else:
            new_codelist = Codelist(name=name, description=description,
                                    count=count, fields=fields,
                                    date_updated=date_updated)
            new_codelist.save()
    cur_downloaded_xml = ("http://www.iatistandard.org/105/codelists/"
                          "downloads/clv1/codelist/" + name + ".xml")
    cur_file_opener = urllib2.build_opener()
    cur_xml_file = cur_file_opener.open(cur_downloaded_xml)
    context2 = etree.iterparse(cur_xml_file, tag=name)
    fast_iter(context2, add_code_list_item)
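fast_iter is called above but not defined in this snippet; a common shape for it (after Liza Daly's well-known high-performance iterparse recipe) is sketched below. The exact signature used here is an assumption matching the call above.

def fast_iter(context, func):
    # apply func to every element the iterparse context yields, then free it
    for event, elem in context:
        func(elem)
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    del context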
def download(file_name):
    log.info("Downloading Regexp")
    # res = requests.get(URL, stream=True)
    # res.raw.decode_content = True
    fh = open(file_name, "rb")
    for evt, el in etree.iterparse(fh):
        if evt != "end" or el.tag != NS + "group":
            continue
        xml = etree.tostring(el)
        group = parse_group(el)
        prov = data_table.find_one(group=group.get("id"))
        if prov is not None:
            prov["last_seen"] = datetime.utcnow()
        else:
            prov = {
                "group": group.get("id"),
                "name": group.get("name"),
                "first_seen": datetime.utcnow(),
                "last_seen": datetime.utcnow(),
                "xml": xml
                # 'json': json.dumps(group, default=json_default)
            }
        log.info("Importing %s" % group.get("name"))
        store_group(group)
        data_table.upsert(prov, ["group"])
        el.clear()
def write_csv(self, root, header):
    """
    Walk the <modul> elements, numbering rows from modulnr, extract
    the information and write it into the CSV file with the given header.
    """
    modulnr = 1
    csvfile = self.filename + '.csv'
    with open(csvfile, 'wb') as f:
        f.write(codecs.BOM_UTF8)
        f_csv = csv.writer(f)
        f_csv.writerow(header)
        for _, modul in ET.iterparse(self.filename, tag='modul'):
            # FIXME: Define Dialect and Encoding='UTF-8'
            # Python3: with open(csvfile, 'w', newline='') as f:
            ueberschriften = modul.find('ueberschriften')
            positionen = [siblings[0] for siblings in ueberschriften.itersiblings()
                          if siblings.tag == 'positionen']
            for uberschrift in ueberschriften.findall('uberschrift'):
                for pos in positionen:
                    output = [modulnr]
                    modulnr += 1
                    for item in header:
                        value = pos.findtext(item)
                        if value is not None:
                            output.append(value.encode('utf-8'))
                        else:
                            output.append('')
                    print(output)
                    f_csv.writerow(output)
def test_real(jmdict_path, examples_path):
    i = 0
    errs = 0
    ef = open('errors.txt', 'wb')
    out = open('jmdict-importable.xml', 'wb')
    jmdict_total_size = os.path.getsize(jmdict_path)
    examples_total_size = os.path.getsize(examples_path)
    widgets = ['Converting: ', pb.Percentage(), ' ', pb.Bar(), ' ',
               pb.Timer(), ' ']
    pbar = pb.ProgressBar(widgets=widgets, maxval=jmdict_total_size).start()
    example_dict = load_examples(examples_path)
    with open(jmdict_path, 'rb') as f:
        with etree.xmlfile(out, encoding='utf-8') as xf:
            xf.write_declaration()
            context = etree.iterparse(f, tag=('entry'), resolve_entities=False)
            with xf.element(NAMESPACE_PREFIX + 'dictionary', nsmap=NSMAP,
                            attrib={XSI_PREFIX + 'schemaLocation': SCHEMA_LOCATION,
                                    'schema_version': __schema_version__}):
                xf.write("\n")
                xml_meta = create_meta(jmdict_path)
                xf.write(xml_meta, pretty_print=True)
                for action, elem in context:
                    xml_entry = convert_entry(elem, example_dict)
                    xf.write(xml_entry, pretty_print=True)
                    pbar.update(f.tell())
                    elem.clear()
    pbar.finish()
def iterparser(self):
    iterparser = etree.iterparse(self.osm_file, events=("start", "end",))
    item = None
    for action, element in iterparser:
        if action == "start":
            if item is None:
                if element.tag == "node":
                    item = Node(**element.attrib)
                elif element.tag == "way":
                    item = Way(**element.attrib)
                elif element.tag == "relation":
                    item = Relation(**element.attrib)
            else:
                if element.tag == "nd":
                    item.nodes.append(element.get("ref"))
                elif element.tag == "tag":
                    item.tags[element.get("k")] = element.get("v")
                elif element.tag == "member":
                    item.members.append((
                        element.get("type"),
                        element.get("ref"),
                        element.get("role")
                    ))
                else:
                    print("Tag %s under item %s" % (element.tag, item))
        else:
            if element.tag in ("node", "way", "relation"):
                yield item
                item = None
            element.clear()
            while element.getprevious() is not None:
                del element.getparent()[0]
    del iterparser
def read_pan_data(fn: str, gender_names=GENDER_NAMES, age_names=AGE_NAMES):
    # Read blog data from xml in PAN13 format.
    gender_dic = {v: i for i, v in enumerate(gender_names)}
    age_dic = {v: i for i, v in enumerate(age_names)}
    texts = []
    genders = []
    ages = []
    logging.info('Read PAN13 format data from {}'.format(fn))
    elements = let.iterparse(fn, events=["end"])
    n_authors = 0
    for event, el in elements:
        if el.tag == 'conversation':
            t = el.text or ''
            texts.append(_preprocess(t))
            el.clear()
        elif el.tag == 'author':
            n_authors += 1
            gender = gender_dic[el.attrib['gender']]
            age = age_dic[el.attrib['age_group']]
            genders.extend([gender] * (len(texts) - len(genders)))
            ages.extend([age] * (len(texts) - len(ages)))
        elif el.tag == 'file':
            del el.getparent()[0]
        else:
            continue
    logging.info('{} authors'.format(n_authors))
    return texts, genders, ages
def sentence_generator(filename, separate=True, gzipped=True):
    """Returns metadata and the sentence: [(words),(tags),(lemmas)]

    Arguments
    ---------
    filename: filename
    separate: if False, changes sentence format to [(w1,t1,l1),(w2,t2,l2),...]
    gzipped : assumes the file is gzipped. Change to False for unpacked files
    """
    source = gzip.GzipFile(filename) if gzipped else filename
    parser = etree.iterparse(source, html=True)
    for x, y in parser:
        try:
            # Trips is a list of the word, part-of-speech and the lemma.
            # by zipping that list, you get a format that I prefer (see details
            # above). The good thing about it is that you can search for
            # sub-sequences in the POS list. E.g. using the contains() function
            # that I included for convenience.
            trips = [w.split('\t') for w in y.text.strip().split('\n')]
            # y.attrib contains the sentence metadata.
            yield y.attrib, zip(*trips) if separate else trips
        except AttributeError:
            print 'No text for this element!'
            pass
        y.clear()  # Save memory
        # Save more memory by deleting references to previous sentences
        for ancestor in y.xpath('ancestor-or-self::*'):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]
def findTerms(threadName, XMLFILE, list_terms, queue):
    print "Running thread %d" % threadName
    list_count_terms = queue.get()
    context = etree.iterparse(XMLFILE, events=("end",), tag="page")
    # for each article, which is each <page> tag, verify whether each term
    # or pair of terms appears in it
    count = 0
    for event, elem in context:
        for each_term in list_terms:
            # If it's a pair of terms
            if each_term.find("_") > -1:
                words = each_term.split(":")[0]
                if (findPair(words.split("_")[0], words.split("_")[1])(elem.text) or
                        findPair(words.split("_")[1], words.split("_")[0])(elem.text)):
                    index = list_terms.index(each_term)
                    list_count_terms[index] += 1
            # If it's only one term
            elif each_term.find("_") == -1 and \
                    findWholeWord(each_term.split(":")[0])(elem.text):
                index = list_terms.index(each_term)
                list_count_terms[index] += 1
        count += 1
        if count % 1000 == 0:
            print "Thread %d processed %d articles." % (threadName, count)
        # after verifying all the terms, clear the element from memory, because
        # otherwise we end up with the whole file in memory, and THAT IS BAD
        elem.clear()
    queue.put(list_count_terms)
    print "Thread %d finished" % threadName
def convert(xml_path, csv_path):
    with open(xml_path) as fobj:
        with open(csv_path, 'w') as outfile:
            pos_writer = csv.writer(outfile)
            context = etree.iterparse(fobj)
            for _, elem in context:
                if elem.tag == 'FrameSet':
                    # print elem.attrib
                    player = player_id(elem)
                    team = team_id(elem)
                    match = convert_id(elem.get('MatchId') or elem.get('Match'))
                    period = 1 + (elem.get('GameSection') == 'secondHalf')
                    for n, frame in enumerate(elem):
                        # to centi seconds (1/100 s)
                        time = n * 4
                        x = frame.get('X')
                        y = frame.get('Y')
                        # last 0 for velocity
                        pos_writer.writerow(
                            [match, period, time, player, team, x, y, 0])
                    elem.clear()
def view(identifier):
    item = get_item(identifier)
    host, path = locate(identifier)
    url = 'http://%s/~edward/read_abbyy.php?item_id=%s&doc=%s&path=%s' % (
        host, identifier, identifier, path)
    f = urlopen(url)
    page_count = 0
    body = []
    for eve, page in etree.iterparse(f):
        if page.tag != page_tag:
            continue
        for block in page:
            if block.attrib['blockType'] != 'Text':
                continue
            region, text = block
            for par in text:
                cur_par = ''
                if len(par) == 0 or len(par[0]) == 0 or len(par[0][0]) == 0:
                    continue
                for line in par:
                    chars = []
                    for fmt in line:
                        chars += [c.text for c in fmt]
                    cur_par += ''.join(chars)
                body.append(cur_par)
        if page_count == 20:
            break
        page_count += 1
    return render_template('view.html', item=item, int=int, body=body)
def get_glyph_pngs(id):
    # Make a png from the xml file.
    # Server side path to xml file:
    # This won't be in home... it'll be at /classifier/[UUID]$
    encoded_glyphs = []
    for event, element in etree.iterparse(UPLOADS + "projects/1/classifiers/" +
                                          str(id) + "/" + str(id) + ".xml"):
        if element.tag == "data":
            # Maybe a better way in lxml to get to the data element
            ncols = int(element.getparent().get("ncols"))
            nrows = int(element.getparent().get("nrows"))
            # Make an iterable that yields each row in boxed row flat pixel format.*
            # *http://pypng.googlecode.com/svn/trunk/code/png.py
            # Plan: make a list of length nrows * ncols * 3 then make sublists
            # of length ncols * 3.
            # The *3 is for RGB: (0,0,0) is black and (255,255,255) is white
            pixels = []
            white_or_black = True
            for n in re.findall("\d+", element.text):
                pixels.extend([255 * white_or_black] * int(n))
                white_or_black = not white_or_black
            png_writer = png.Writer(width=ncols, height=nrows, greyscale=True)
            pixels_2D = []
            for i in xrange(nrows):
                # Index one row of pixels
                pixels_2D.append(pixels[i * ncols: (i + 1) * ncols])
            # StringIO.StringIO lets you write to strings as files: it gives
            # you a file descriptor. (pypng expects a file descriptor)
            buf = StringIO.StringIO()
            # image = png.from_array(pixels_2D, mode='L')
            # image.save(buf)  # Hopefully this doesn't write to a file
            png_writer.write(buf, pixels_2D)
            my_png = buf.getvalue()
            encoded_png = base64.b64encode(my_png)  # not sure why
            encoded_glyphs.append(encoded_png)
    return encoded_glyphs
def iterpages(self):
    context = etree.iterparse(
        self.file_, events=('start', 'end'),
        tag='{}page'.format(self.NAMESPACE), encoding=self.ENCODING)
    context = iter(context)
    _, root = next(context)
    for event, page in context:
        if event == 'start':
            continue
        pagedata = {}
        for key in ('id', 'title'):
            pagedata[key] = self.findtext(page, key)
        revisions = []
        for revision in page.iterfind('{}revision'.format(self.NAMESPACE)):
            revisiondata = {}
            for key in ('id', 'comment', 'text'):
                revisiondata[key] = self.findtext(revision, key)
            timestamp = self.findtext(revision, 'timestamp')
            revisiondata['timestamp'] = self.parse_iso8601(timestamp)
            revisions.append(revisiondata)
        revisions.sort(key=lambda rev: rev['timestamp'])
        # XXX: why is root.clear() not sufficient?
        page.clear()
        while page.getprevious() is not None:
            del page.getparent()[0]
        root.clear()
        yield pagedata, revisions
def LossCurveParser(input_file):
    refs = []
    longitude = []
    latitude = []
    losses = []
    poes = []
    meta_info = {}
    for _, element in etree.iterparse(input_file):
        if element.tag == '%slossCurves' % xmlNRML:
            meta_info = parse_metadata(element)
        elif element.tag == '%slossCurve' % xmlNRML:
            lon, lat, ref, poe, loss = parse_single_loss_curve(element)
            longitude.append(lon)
            latitude.append(lat)
            refs.append(ref)
            poes.append(poe)
            losses.append(loss)
        else:
            continue
    longitude = np.array(longitude)
    latitude = np.array(latitude)
    return refs, longitude, latitude, poes, losses
def process_data(inputdump, outputdir, maxfilesize, compress, outformat):
    # we expect large dumps, so we use the iterparse method
    context = etree.iterparse(inputdump)
    context = iter(context)
    # discover prefix from the xml dump file
    # /mediawiki/siteinfo/base
    prefix = None
    for event, elem in context:
        if event == "end" and elem.tag.endswith("base"):
            prefix = elem.text[:elem.text.rfind("/")]
            break
    print "base url: %s" % prefix
    # initialize wiki page queue
    queue = Queue.Queue(maxsize=1024)
    # start worker threads
    for _ in range(multiprocessing.cpu_count()):
        cleaner = WikiCleanerThread(queue, outputdir, maxfilesize, prefix,
                                    compress, outformat)
        cleaner.setDaemon(False)
        cleaner.start()
    # put element pages in the queue to be processed by the cleaner threads
    for event, elem in context:
        if event == "end" and elem.tag.endswith("page"):
            queue.put(elem)
    print "finishing..."
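The snippet above never signals the worker threads to stop; a common pattern, sketched here and not part of the original, is to enqueue one sentinel per worker. This assumes WikiCleanerThread.run() is written to exit when it dequeues None.

def shutdown_workers(queue, n_workers):
    # one poison pill per worker; each worker exits on receiving None
    for _ in range(n_workers):
        queue.put(None)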
def sentence_generator(filename, gzipped=True, structure=False):
    """Returns metadata, optionally the sentence structure, and the sentence
    itself.

    Each sentence is represented as a list of Token objects. Tokens are named
    tuples, with the following values:

        ['token', 'POS', 'lemma', 'depid', 'dephead', 'deprel']

    Arguments
    ---------
    filename:  filename
    gzipped:   assumes the file is gzipped. Change to False for unpacked files
    structure: assumes we don't need information about sentence structure.
               Change to True to get this info.
    """
    source = gzip.GzipFile(filename) if gzipped else filename
    parser = etree.iterparse(source, html=True, events=('start', 'end',), tag='s')
    # get_full_sentence_data() returns the structure and a list of tokens
    # get_sentence_data() returns a list of tokens
    data_func = get_full_sentence_data if structure else get_sentence_data
    for event, element in parser:
        if event == 'start':
            # element.attrib is a dictionary with metadata for the sentence.
            yield (element.attrib, data_func(element))
            opening_element = element
        elif event == 'end':
            clear_references(opening_element)
            clear_references(element)
    # Aggressively keep memory load down
    del parser
def augment_with_region(in_file='../personlist.xml',
                        out_file='personlist_with_region_iterparse.xml',
                        etree=etree):
    """
    Try to minimise memory overhead by serialising as the data comes in.
    """
    region_element = etree.Element('region')
    region_index = build_region_index(etree=etree)
    context = etree.iterparse(in_file, tag='person')
    with open(out_file, 'w') as out:
        out.write('<personlist>\n')
        for _, person in context:
            # find city and country of each person
            city = person.findtext('address/city')
            country = person.findtext('address/country')
            if not city or not country:
                continue
            # insert region tag
            region_element.text = region_index.get((city, country))
            city_el = person.find('address/city')
            city_el.addnext(deepcopy(region_element))
            # serialise into target file
            out.write(etree.tostring(person))
            # clear processed content
            person.clear()
        out.write('\n</personlist>')
def context_iter(dblp_path):
    """Create a dblp data iterator of (event, element) pairs for processing"""
    return etree.iterparse(source=dblp_path,
                           dtd_validation=True,
                           load_dtd=True)  # dtd required
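A minimal consumption sketch for the iterator above; the 'article' tag exists in dblp.xml, but the flat child-to-dict mapping and the assumption that dblp.dtd sits next to the dump are illustrative only.

def iter_articles(dblp_path):
    for event, elem in context_iter(dblp_path):
        if elem.tag == 'article':
            yield {child.tag: child.text for child in elem}
            elem.clear()  # keep memory flat across the very large dump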
# Parse buildings from output of osmosis and output corresponding keys
# Apache License, Version 2.0 http://www.apache.org/licenses/LICENSE-2.0
from lxml.etree import XMLParser, parse, iterparse, tostring
p = XMLParser(huge_tree=True)
import sys
# write to disk
fo = open("osm-buildings.tsv", "w")
# Documentation on iterparse
# http://effbot.org/zone/element-iterparse.htm
context = iterparse(sys.stdin, events=("start", "end"), huge_tree=True)
context = iter(context)
event, root = context.next()
print "wayid\ttimestamp\tuid\tuser\tchangeset\tkey\tvalue"
for event, elem in context:
    if event == "end" and elem.tag == "way":
        wayid = unicode(elem.get("id"))
        timestamp = unicode(elem.get("timestamp"))
        uid = unicode(elem.get("uid"))
        user = unicode(elem.get("user"))
        changeset = unicode(elem.get("changeset"))
        td = {}
        for i in elem.findall("tag"):
            if isinstance(i.get("v"), str):
                # the original snippet is truncated here; collecting the tag
                # key/value pairs is the evident next step
                td[i.get("k")] = i.get("v")
def deserialize(self, source: Union[IO, str], typesystem: TypeSystem):
    # namespaces
    NS_XMI = "{http://www.omg.org/XMI}"
    NS_CAS = "{http:///uima/cas.ecore}"

    TAG_XMI = NS_XMI + "XMI"
    TAG_CAS_NULL = NS_CAS + "NULL"
    TAG_CAS_SOFA = NS_CAS + "Sofa"
    TAG_CAS_VIEW = NS_CAS + "View"

    OUTSIDE_FS = 1
    INSIDE_FS = 2
    INSIDE_ARRAY = 3

    sofas = []
    views = {}
    feature_structures = {}
    children = defaultdict(list)

    context = etree.iterparse(source, events=("start", "end"))

    state = OUTSIDE_FS
    for event, elem in context:
        if elem.tag == TAG_XMI or elem.tag == TAG_CAS_NULL:
            pass  # Ignore the 'xmi:XMI' and 'cas:NULL' elements
        elif elem.tag == TAG_CAS_SOFA:
            if event == "end":
                sofa = self._parse_sofa(elem)
                sofas.append(sofa)
        elif elem.tag == TAG_CAS_VIEW:
            if event == "end":
                proto_view = self._parse_view(elem)
                views[proto_view.sofa] = proto_view
        else:
            """
            In XMI, array element features can be encoded as

                <cas:StringArray>
                    <elements>LNC</elements>
                    <elements>MTH</elements>
                    <elements>SNOMEDCT_US</elements>
                </cas:StringArray>

            In order to parse this with an incremental XML parser, we need to
            employ a simple state machine. It is depicted in the following.

                          "start"            "start"
               +-----------+------->+-----------+------->+---------+
               | Outside   |        | Inside    |        | Inside  |
          +--->+ feature   |        | feature   |        | array   |
               | structure |        | structure |        | element |
               +-----------+<-------+-----------+<-------+---------+
                           "end"                "end"
            """
            if event == "start":
                if state == OUTSIDE_FS:
                    # We saw the opening tag of a new feature structure
                    state = INSIDE_FS
                elif state == INSIDE_FS:
                    # We saw the opening tag of an array element
                    state = INSIDE_ARRAY
                else:
                    raise RuntimeError(
                        "Invalid state transition: [{0}] 'start'".format(state))
            elif event == "end":
                if state == INSIDE_FS:
                    # We saw the closing tag of a new feature
                    state = OUTSIDE_FS
                    fs = self._parse_feature_structure(typesystem, elem, children)
                    feature_structures[fs.xmiID] = fs
                    children.clear()
                elif state == INSIDE_ARRAY:
                    # We saw the closing tag of an array element
                    children[elem.tag].append(elem.text)
                    state = INSIDE_FS
                else:
                    raise RuntimeError(
                        "Invalid state transition: [{0}] 'end'".format(state))
            else:
                raise RuntimeError("Invalid XML event: [{0}]".format(event))
        # Free already processed elements from memory
        if event == "end":
            self._clear_elem(elem)

    if len(sofas) != len(views):
        raise RuntimeError("Number of views and sofas is not equal!")

    # Post-process feature values
    for xmi_id, fs in feature_structures.items():
        t = typesystem.get_type(fs.type)
        for feature in t.all_features:
            feature_name = feature.name
            if feature_name == "sofa":
                continue
            if typesystem.is_primitive(feature.rangeTypeName) or \
                    typesystem.is_primitive_collection(feature.rangeTypeName):
                # TODO: Parse feature values to their real type here,
                # e.g. parse ints or floats
                continue
            # Resolve references here
            value = getattr(fs, feature_name)
            if value is None:
                continue
            # Resolve references
            if typesystem.is_collection(feature.rangeTypeName):
                # A collection of references is a list of integers separated
                # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                targets = []
                for ref in value.split():
                    target_id = int(ref)
                    target = feature_structures[target_id]
                    targets.append(target)
                setattr(fs, feature_name, targets)
            else:
                target_id = int(value)
                target = feature_structures[target_id]
                setattr(fs, feature_name, target)

    cas = Cas(typesystem)
    for sofa in sofas:
        proto_view = views[sofa.xmiID]
        if sofa.sofaID == "_InitialView":
            view = cas.get_view("_InitialView")
        else:
            view = cas.create_view(sofa.sofaID)
        view.sofa_string = sofa.sofaString
        view.sofa_mime = sofa.mimeType
        for member_id in proto_view.members:
            annotation = feature_structures[member_id]
            view.add_annotation(annotation)
    return cas
def xml2bin(infilename, outfilename, type='pickle', insist_on_living=True):
    genes = list()  # list of dicts
    xf = open(infilename, 'rb')
    #xf=open('short1.xml','r') ;
    if not isfile(outfilename):
        outfile = open(outfilename, 'wb')
        outfile.close()
    outfile = ''
    if type == 'pickle':
        outfile = open(outfilename, 'ab')
    elif type == 'tsv':
        outfile = open(outfilename, 'at')
    pp = pprint.PrettyPrinter(indent=4)
    #tree=etree.parse(xf) ;
    #root=tree.getroot() ;
    count = 0
    for event, element in etree.iterparse(xf, events=('end',), tag='Entrezgene'):
        #for element in root.iter("Entrezgene") :
        #element=prelement ;
        #element=etree.fromstring(etree.tostring(prelement))
        if event != 'end':
            continue
        try:
            genetype = eseek(element, 'Entrezgene_type').get('value')  # == 'protein-coding' and
            if (not insist_on_living or element[0][0][1].get('value') == 'live'):
                newGene = dict()
                extDBrefs = list()
                newGene.update({'External': extDBrefs})
                newGene.update({'genetype': genetype})
            else:
                element.clear()
                for ancestor in element.xpath('ancestor-or-self::*'):
                    while ancestor.getprevious() is not None:
                        del ancestor.getparent()[0]
                continue
        except KeyError:
            sys.stderr.write('Unclear type.\n')
        newGene.update({'EID': element[0][0][0].text})
        # 3 is the Entrezgene_gene child
        newGene.update({'Symbol': element[3][0][0].text})
        try:
            newGene.update({'Location': element[3][0][2].text})
        except IndexError:
            newGene.update({'Location': None})
        newGene.update({'Taxon': gettext(element[2][0].find(".//Object-id_id"))})
        #<Entrezgene_source> 2
        # <BioSource> 0
        #  <BioSource_genome value="genomic">1</BioSource_genome>
        #  <BioSource_origin value="natural">1</BioSource_origin>
        #  <BioSource_org> 2
        #   <Org-ref> 0
        #    <Org-ref_taxname>Homo sapiens</Org-ref_taxname>
        #    <Org-ref_common>human</Org-ref_common>
        #    <Org-ref_db> 2
        #     <Dbtag> 0
        #      <Dbtag_db>taxon</Dbtag_db>
        #      <Dbtag_tag> 1
        #       <Object-id> 0
        #        <Object-id_id>9606</Object-id_id> 0
        #       </Object-id>
        try:
            # gene-ref_db
            for c in list(eseek(element[3][0], 'Gene-ref_db')):
                # 3-0-3 : Gene-ref_db (i tried, anyway)
                #newGene['External'].update({ c[0].text : c[1][0][0].text }) ;
                newGene['External'].append(c[0].text + ":" + c[1][0][0].text)
        except (IndexError, KeyError):
            newGene.update({'External': None})
        try:
            syns = list()
            for c in list(element[3][0][4]):
                syns.append(c.text)
            newGene.update({'Synonym': syns})
        except IndexError:
            newGene.update({'Synonym': None})
        # save this one for later-- we may need to do a sub-iteration
        try:
            for dbitem in element.iterfind('.//Dbtag_db'):
                if dbitem.text == 'UniProtKB/Swiss-Prot':
                    if 'SwissProt' not in newGene:
                        # get the first one
                        newGene.update(
                            {'SwissProt': dbitem.getparent()[1][0][0].text})
                    break
            else:
                newGene.update({'SwissProt': None})
        except IndexError:
            newGene.update({'SwissProt': None})
        try:
            for dbitem in element.iterfind('.//Dbtag_db'):
                if dbitem.text == 'UniProtKB/TrEMBL':
                    newGene.update({'TrEMBL': dbitem.getparent()[1][0][0].text})
                    break
            else:
                newGene.update({'TrEMBL': None})
        except IndexError:
            newGene.update({'TrEMBL': None})
        # changed to list
        try:
            newGene.update(
                {'Pubmed': [e.text for e in element.findall('.//PubMedId')]})
        except KeyError:
            newGene.update({'Pubmed': None})
        try:
            #newGene.update({ 'Peptide' : \
            #    getProtAccs(eseek(element,'Entrezgene_locus').findall(".//Gene-commentary_type")) +
            #    getProtAccs(eseek(element,'Entrezgene_comments').findall(".//Gene-commentary_type")) }) ;
            newGene.update(
                {'Peptide': getProtAccs(element.findall(".//Gene-commentary_type"))})
        except KeyError:
            newGene.update({'Peptide': None})
        # 8-0-5 : Entrezgene_comments, Gene-commentary, Gene-commentary_products
        try:
            newGene.update(
                {'mRNA': getRNAAccs(element.findall(".//Gene-commentary_type"))})
        except KeyError:
            newGene.update({'mRNA': None})
        try:
            newGene.update({'Summary': eseek(element, 'Entrezgene_summary').text})
        except KeyError:
            newGene.update({'Summary': None})
        ### NEW AND EXTREMELY BOSS : CDD DOMAINS !!!!1111ONE!!
        try:
            CDDentries = set()
            for dbitem in element.iterfind('.//Dbtag_db'):
                if dbitem.text == 'CDD':
                    CDDentries.add(dbitem.getparent()[1][0][0].text)
            if not CDDentries:
                newGene.update({'CDD': None})
            else:
                newGene.update({'CDD': list(CDDentries)})
        except IndexError:
            # not super sure why this would ever happen here
            newGene.update({'CDD': None})
            raise ValueError
        element.clear()
        for ancestor in element.xpath('ancestor-or-self::*'):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]
        if type == 'pickle':
            pickle.dump(newGene, outfile, protocol=2)
        elif type == 'tsv':
            line = ''
            counter = 0
            for k, v in sorted(newGene.items()):
                if isinstance(v, list):
                    v = ";".join(v)
                elif v == None:
                    v = ''
                if counter == 0:
                    line = v
                else:
                    line = line + "\t" + v
                counter = counter + 1
            line = re.sub(r'\n', '', line)
            outfile.write(line + "\n")
    outfile.flush()
    outfile.close()
def collect_entities_from_dump(self,
                               limit_per_query,  # for consistent API
                               n_queries=None,  # no effect, for consistent API only
                               include_wikipedia=True,
                               delay_wikipedia_retrieval=True,
                               pre_filter=None,
                               **kwargs):
    """
    Iteratively parse a xml-dump (with embedded json entities) for entities
    of interest, using bz2file.

    Note: the pure JSON does not contain all relevant meta-info
    (e. g. timestamps and revision IDs)

    :param pre_filter:
    :param n_queries:
    :param limit_per_query: maximum items to be read in (for debugging/testing)
    :type limit_per_query: int
    :return: list of entities to be updated
    :rtype: list
    """
    if self.dump_path is None:
        raise ValueError('Dump path required!')
    if pre_filter is None:
        pre_filter = [(lambda entity, entity_type: True, {})]

    def best_guess_open(file_name):
        """Use bz2file to iterate over a compressed file, regular open otherwise."""
        if file_name.endswith('.bz2'):
            return BZ2File(file_name)
        elif file_name.endswith('.gz'):
            return gzip.open(file_name)
        else:
            return open(file_name)

    dump_path = self.dump_path
    try:
        if not self.all_relevant_categories and not self.process_all:
            self.all_relevant_categories = self.get_relevant_category_ids(
                self.entity_types)
    except Exception as e:
        raise e
    with best_guess_open(dump_path) as xml_file:
        parser = et.iterparse(xml_file, events=('end',))
        try:
            for events, elem in parser:
                if elem.tag == '{http://www.mediawiki.org/xml/export-0.10/}timestamp':
                    timestamp = elem.text
                elif elem.tag == '{http://www.mediawiki.org/xml/export-0.10/}text':
                    if not elem.text:
                        del elem
                        del events
                        continue
                    try:
                        elem_content = ujson.loads(elem.text)
                        assert isinstance(elem_content, dict)
                    except (ValueError, AssertionError):
                        del elem
                        del events
                        continue
                    try:
                        elem_content['timestamp'] = timestamp
                        del timestamp
                    except NameError:
                        logger.warning("Item %s cannot be assigned a timestamp!",
                                       elem_content['id'])
                    try:
                        category = self.determine_relevant_category(elem_content)
                        assert category
                    except (
                            ValueError,  # if the JSON is empty
                            AssertionError  # if the entity doesn't fit search categories
                    ):
                        del elem
                        del events
                        continue
                    pre_filter_result = all([
                        filter_function(entity=elem_content,
                                        entity_type=category,
                                        **filter_params)
                        for filter_function, filter_params in pre_filter
                    ])
                    if pre_filter_result:
                        try:
                            for entity in collect_attributes_from_wp_and_wd(
                                    elem_content,
                                    include_wikipedia=include_wikipedia,
                                    delay_wikipedia_retrieval=delay_wikipedia_retrieval,
                                    entity_type=category,
                                    **kwargs):
                                entity['category'] = category
                                if include_wikipedia and not delay_wikipedia_retrieval:
                                    for language_result in merge_with_wikipedia_by_language(
                                            entity=entity,
                                            languages=kwargs['languages']):
                                        yield language_result
                                else:
                                    yield entity
                        except (DoesNotMatchFilterError, ValueError) as e:
                            # this probably means no Wikipedia page in any of
                            # our languages, or failure to match the filter
                            # criteria; we have no use for such entities.
                            del elem
                            del events
                            continue
                        except Exception as e:
                            raise e
                del elem
                del events
        except (EOFError, IOError) as e:
            logger.warning('Error parsing file {dump_path}: %s', e,
                           exc_info=True)
def load_data(data_type, min_usr_freq=5):
    question_dict, question_order = {}, []
    usr_idx_dict = {}  # usr
    post_file = PROJECT_PATH + 'Posts.xml'
    active_usrs, temp_usrs = get_active_usrs(post_file, min_usr_freq=min_usr_freq)
    temp_question_ids, temp_usr_ids = [], []  # questions used for temporal test
    parser = etree.iterparse(post_file, events=('end',), tag='row')
    for i, (_, elem) in enumerate(parser):
        attr = dict(elem.attrib)
        # Output to separate files
        if attr['PostTypeId'] == '1':
            # question post
            id, title, content, date, ans_id = parse_question(attr)
            # if question doesn't contain the accepted answer, skip the question
            if not ans_id:
                continue
            question_dict[id] = {
                "id": id,
                "title": title,
                "content": content,
                "date": date,
                "answers": [],
                "accept_ans": ans_id
            }
            question_order.append(id)
        elif attr['PostTypeId'] == '2':
            # answer post
            parent_id, id, content, usr_name, score, date = parse_answer(attr)
            if (parent_id not in question_dict) or (not usr_name):
                continue
            if (usr_name not in active_usrs) and (usr_name not in temp_usrs):
                continue
            # assign user id
            if usr_name not in usr_idx_dict:
                usr_idx = len(usr_idx_dict.keys())
                usr_idx_dict[usr_name] = usr_idx
            else:
                usr_idx = usr_idx_dict[usr_name]
            answer_tuple = (id, content, usr_idx, score, date)
            question_dict[parent_id]["answers"].append(answer_tuple)
            if usr_name in temp_usrs:
                temp_question_ids.append(parent_id)
    # filter questions without any answers
    question_dict, num_removed = filter_unanswer_question(question_dict)
    # add data by its different data orders
    if data_type == "rand":
        shuffle(question_order)
    questions, temp_questions = [], []
    for id in question_order:
        if id not in question_dict:
            continue
        if id in temp_question_ids:
            temp_questions.append(question_dict[id])
        else:
            questions.append(question_dict[id])
    temp_usr_ids = [usr_idx_dict[i] for i in temp_usrs if i in usr_idx_dict]
    print("Total questions: ", str(len(questions)), " Users: ",
          str(len(usr_idx_dict.keys())), sep="")
    print("Temp Users IDs:", " ".join([str(i) for i in temp_usr_ids]))
    return questions, usr_idx_dict, temp_questions, temp_usr_ids
cluster = MongoClient(
    'mongodb://localhost:27017/?readPreference=primary&appname=MongoDB%20Compass&ssl=false'
)
path_to_file = 'dblp-2021-04-01.xml'
dtd_path = 'dblp.dtd'
dtd = etree.DTD(dtd_path)
count = 0
db = cluster["dblp"]
coll = db['data']
context = etree.iterparse(path_to_file, dtd_validation=True, tag="phdthesis",
                          events=('start', 'end'))
for event, element in context:
    if event != 'end':
        # with events=('start', 'end') every phdthesis is yielded twice;
        # only the 'end' event carries the fully parsed children, and
        # processing 'start' too would insert each record twice
        continue
    record = {}  # renamed from `dict` to avoid shadowing the builtin
    ee = []
    for child in element:
        if child.tag == 'ee':
            ee.append(child.text)
        else:
            record[child.tag] = child.text
    record['ee'] = ee
    coll.insert_one(record)
    element.clear()
    del record
    del ee
    count = count + 1
def restore_tags(SOURCENOTAGSTOKSP, SOURCETAGSTOKSP, SELECTEDALIGNMENT,
                 TARGETNOTAGSTOKSP):
    relations = {}
    for t in SELECTEDALIGNMENT.split(" "):
        camps = t.split("-")
        if not int(camps[0]) in relations:
            relations[int(camps[0])] = []
        relations[int(camps[0])].append(int(camps[1]))
    SOURCETAGSTOKSPMOD = "<s> " + SOURCETAGSTOKSP + " </s>"
    f = io.BytesIO(SOURCETAGSTOKSPMOD.encode('utf-8'))
    events = ("start", "end")
    context = ET.iterparse(f, events=events, recover=True)
    cont_g = -1
    tags = []
    tagpairs = []
    LISTSOURCETAGSTOKSP = splitTags(SOURCETAGSTOKSP)
    #LISTSOURCETAGSTOK=removeSpeChar(LISTSOURCETAGSTOKSP,spechar)
    LISTSOURCETAGSTOK = LISTSOURCETAGSTOKSP
    LISTSOURCENOTAGSTOKSP = splitTags(SOURCENOTAGSTOKSP)
    LISTTARGETNOTAGSTOKSP = splitTags(TARGETNOTAGSTOKSP)
    TEMPLIST = LISTSOURCETAGSTOKSP
    charbefore = {}
    charafter = {}
    print("SOURCETAGSTOKSP", SOURCETAGSTOKSP)
    for event, elem in context:
        if not elem.tag == "s":
            tag = elem.tag
            attr = elem.items()
            if event == "start":
                if len(attr) == 0:
                    xmltag = "<" + tag + ">"
                    if SOURCETAGSTOKSP.find(xmltag) > -1:
                        tags.append(xmltag)
                else:
                    lat = []
                    for at in attr:
                        cadena = at[0] + "='" + str(at[1]) + "'"
                        lat.append(cadena)
                    cat = " ".join(lat)
                    xmltag1 = "<" + tag + " " + cat + ">"
                    if SOURCETAGSTOKSP.find(xmltag1) > -1:
                        tags.append(xmltag1)
                        xmltag = xmltag1
                    lat = []
                    for at in attr:
                        cadena = at[0] + '="' + str(at[1]) + '"'
                        lat.append(cadena)
                    cat = " ".join(lat)
                    xmltag2 = "<" + tag + " " + cat + ">"
                    if SOURCETAGSTOKSP.find(xmltag2) > -1:
                        tags.append(xmltag2)
                        xmltag = xmltag2
                closingtag = "</" + tag + ">"
                if SOURCETAGSTOKSP.find(closingtag) > -1:
                    tripleta = (xmltag, closingtag, elem.text)
                    tagpairs.append(tripleta)
            elif event == "end":
                xmltag = "</" + tag + ">"
                if SOURCETAGSTOKSP.find(xmltag) > -1:
                    tags.append(xmltag)
                xmltag = "<" + tag + "/>"
                if SOURCETAGSTOKSP.find(xmltag) > -1:
                    tags.append(xmltag)
    preTags = []
    postTags = []
    for xmltag in tags:
        if SOURCETAGSTOKSP.find(xmltag) > -1:
            chbf = SOURCETAGSTOKSP[SOURCETAGSTOKSP.index(xmltag) - 1]
            charbefore[xmltag] = chbf
            chaf = SOURCETAGSTOKSP[SOURCETAGSTOKSP.index(xmltag) + len(xmltag)]
            charafter[xmltag] = chaf
    tagsCHAR = {}
    for tag in tags:
        tagC = tag
        if tag in charbefore:
            cb = charbefore[tag].strip()
            tagC = cb + tag
        if tag in charafter:
            ca = charafter[tag].strip()
            tagC = tagC + ca
        tagsCHAR[tag] = tagC
    for i in range(0, len(LISTTARGETNOTAGSTOKSP) + 1):
        preTags.insert(i, None)
        postTags.insert(i, None)
    for tripleta in tagpairs:
        # source positions
        sourcepositions = []
        for ttrip in tripleta[2].strip().split(" "):
            try:
                postemp = LISTSOURCENOTAGSTOKSP.index(ttrip.strip())
                sourcepositions.append(postemp)
            except:
                pass
        try:
            tags.remove(tripleta[0])
            TEMPLIST.remove(tripleta[0])
        except:
            pass
        try:
            tags.remove(tripleta[1])
            TEMPLIST.remove(tripleta[1])
        except:
            pass
        # target positions
        targetpositions = []
        for position in sourcepositions:
            if position in relations:
                targetpositions.extend(relations[position])
        preTags[min(targetpositions)] = tagsCHAR[tripleta[0]]
        postTags[max(targetpositions)] = tagsCHAR[tripleta[1]]
    # isolated tags
    for tag in tags:
        try:
            preTags[relations[TEMPLIST.index(tag)][0]] = tag
            TEMPLIST.remove(tag)
        except:
            pass
    LISTTARGETTAGSTOKSP = []
    for i in range(0, len(LISTTARGETNOTAGSTOKSP)):
        try:
            if preTags[i]:
                LISTTARGETTAGSTOKSP.append(preTags[i])
            LISTTARGETTAGSTOKSP.append(LISTTARGETNOTAGSTOKSP[i])
            if postTags[i]:
                LISTTARGETTAGSTOKSP.append(postTags[i])
        except:
            pass
    translationTagsSP = " ".join(LISTTARGETTAGSTOKSP)
    return translationTagsSP
def _validate_event_files(self):
    """
    Validates all event files in the currently active project.

    The following tasks are performed:
        * Validate against QuakeML 1.2 scheme.
        * Check for duplicate ids amongst all QuakeML files.
        * Make sure they contain at least one origin, magnitude and focal
          mechanism object.
        * Some simple sanity checks so that the event depth is reasonable and
          the moment tensor values as well. This is rather fragile and mainly
          intended to detect values specified in wrong units.
        * Events that are too close in time. Events that are less than one
          hour apart can in general not be used for adjoint tomography. This
          will naturally also detect duplicate events.
    """
    import collections
    import itertools
    import math
    from obspy import read_events
    from obspy.io.quakeml.core import _validate as validate_quakeml
    from lxml import etree

    print "Validating %i event files ..." % self.comm.events.count()

    # Start with the schema validation.
    print "\tValidating against QuakeML 1.2 schema ",
    all_valid = True
    for event in self.comm.events.get_all_events().values():
        filename = event["filename"]
        self._flush_point()
        if validate_quakeml(filename) is not True:
            all_valid = False
            msg = (
                "ERROR: "
                "The QuakeML file '{basename}' did not validate against "
                "the QuakeML 1.2 schema. Unfortunately the error messages "
                "delivered by lxml are not useful at all. To get useful "
                "error messages make sure jing is installed "
                "('brew install jing' (OSX) or "
                "'sudo apt-get install jing' (Debian/Ubuntu)) and "
                "execute the following command:\n\n"
                "\tjing http://quake.ethz.ch/schema/rng/QuakeML-1.2.rng "
                "{filename}\n\n"
                "Alternatively you could also use the "
                "'lasif add_spud_event' command to redownload the event "
                "if it is in the GCMT "
                "catalog.\n\n").format(basename=os.path.basename(filename),
                                       filename=os.path.relpath(filename))
            self._add_report(msg)
    if all_valid is True:
        self._print_ok_message()
    else:
        self._print_fail_message()

    # Now check for duplicate public IDs.
    print "\tChecking for duplicate public IDs ",
    ids = collections.defaultdict(list)
    for event in self.comm.events.get_all_events().values():
        filename = event["filename"]
        self._flush_point()
        # Now walk all files and collect all public ids. Each should be
        # unique!
        with open(filename, "rt") as fh:
            for event, elem in etree.iterparse(fh, events=("start",)):
                if "publicID" not in elem.keys() or \
                        elem.tag.endswith("eventParameters"):
                    continue
                ids[elem.get("publicID")].append(filename)
    ids = {key: list(set(value)) for (key, value) in ids.iteritems()
           if len(value) > 1}
    if not ids:
        self._print_ok_message()
    else:
        self._print_fail_message()
        self._add_report(
            "Found the following duplicate publicIDs:\n" +
            "\n".join([
                "\t%s in files: %s" % (id_string, ", ".join(
                    [os.path.basename(i) for i in faulty_files]))
                for id_string, faulty_files in ids.iteritems()
            ]),
            error_count=len(ids))

    def print_warning(filename, message):
        self._add_report("WARNING: File '{event_name}' "
                         "contains {msg}.\n".format(
                             event_name=os.path.basename(filename),
                             msg=message))

    # Performing simple sanity checks.
    print "\tPerforming some basic sanity checks ",
    all_good = True
    for event in self.comm.events.get_all_events().values():
        filename = event["filename"]
        self._flush_point()
        cat = read_events(filename)
        filename = os.path.basename(filename)
        # Check that all files contain exactly one event!
        if len(cat) != 1:
            all_good = False
            print_warning(filename,
                          "%i events instead of only one." % len(cat))
        event = cat[0]

        # Sanity checks related to the origin.
        if not event.origins:
            all_good = False
            print_warning(filename, "no origin")
            continue
        origin = event.preferred_origin() or event.origins[0]
        if (origin.depth % 100.0):
            all_good = False
            print_warning(
                filename,
                "a depth of %.1f meters. This kind of accuracy "
                "seems unrealistic. The depth in the QuakeML "
                "file has to be specified in meters. Checking "
                "all other QuakeML files for the correct units "
                "might be a good idea" % origin.depth)
        if (origin.depth > (800.0 * 1000.0)):
            all_good = False
            print_warning(
                filename,
                "a depth of more than 800 km. This is likely wrong.")

        # Sanity checks related to the magnitude.
        if not event.magnitudes:
            all_good = False
            print_warning(filename, "no magnitude")
            continue

        # Sanity checks related to the focal mechanism.
        if not event.focal_mechanisms:
            all_good = False
            print_warning(filename, "no focal mechanism")
            continue

        focmec = event.preferred_focal_mechanism() or \
            event.focal_mechanisms[0]
        if not hasattr(focmec, "moment_tensor") or \
                not focmec.moment_tensor:
            all_good = False
            print_warning(filename, "no moment tensor")
            continue

        mt = focmec.moment_tensor
        if not hasattr(mt, "tensor") or \
                not mt.tensor:
            all_good = False
            print_warning(filename, "no actual moment tensor")
            continue
        tensor = mt.tensor

        # Convert the moment tensor to a magnitude and see if it is
        # reasonable.
        mag_in_file = event.preferred_magnitude() or event.magnitudes[0]
        mag_in_file = mag_in_file.mag
        M_0 = 1.0 / math.sqrt(2.0) * math.sqrt(
            tensor.m_rr ** 2 + tensor.m_tt ** 2 + tensor.m_pp ** 2)
        magnitude = 2.0 / 3.0 * math.log10(M_0) - 6.0
        # Use some buffer to account for different magnitudes.
        if not (mag_in_file - 1.0) < magnitude < (mag_in_file + 1.0):
            all_good = False
            print_warning(
                filename,
                "a moment tensor that would result in a moment "
                "magnitude of %.2f. The magnitude specified in "
                "the file is %.2f. Please check that all "
                "components of the tensor are in Newton * meter" % (
                    magnitude, mag_in_file))

    if all_good is True:
        self._print_ok_message()
    else:
        self._print_fail_message()

    # Collect event times
    event_infos = self.comm.events.get_all_events().values()

    # Now check the time distribution of events.
    print "\tChecking for duplicates and events too close in time %s" % \
        (self.comm.events.count() * "."),
    all_good = True
    # Sort the events by time.
    event_infos = sorted(event_infos, key=lambda x: x["origin_time"])
    # Loop over adjacent indices.
    a, b = itertools.tee(event_infos)
    next(b, None)
    for event_1, event_2 in itertools.izip(a, b):
        time_diff = abs(event_2["origin_time"] - event_1["origin_time"])
        # If time difference is under one hour, it could be either a
        # duplicate event or interfering events.
        if time_diff <= 3600.0:
            all_good = False
            self._add_report(
                "WARNING: "
                "The time difference between events '{file_1}' and "
                "'{file_2}' is only {diff:.1f} minutes. This could "
                "be either due to a duplicate event or events that have "
                "interfering waveforms.\n".format(
                    file_1=event_1["filename"],
                    file_2=event_2["filename"],
                    diff=time_diff / 60.0))
    if all_good is True:
        self._print_ok_message()
    else:
        self._print_fail_message()

    # Check that all events fall within the chosen boundaries.
    print "\tAssure all events are in chosen domain %s" % \
        (self.comm.events.count() * "."),
    all_good = True
    domain = self.comm.project.domain
    for event in event_infos:
        if domain.point_in_domain(latitude=event["latitude"],
                                  longitude=event["longitude"]):
            continue
        all_good = False
        self._add_report(
            "\nWARNING: "
            "Event '{filename}' is out of bounds of the chosen "
            "domain.\n".format(filename=event["filename"]))
    if all_good is True:
        self._print_ok_message()
    else:
        self._print_fail_message()
def element_generator(input_file, template, root_tag, is_whole_element):
    root_tag_with_namespace = "{*}" + root_tag
    for event, elem in etree.iterparse(input_file, tag=root_tag_with_namespace):
        yield (etree.tostring(elem), template.get(root_tag), is_whole_element)
        elem.clear()
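A hypothetical invocation of the generator above; the file name, template dict and tag are made up for illustration:

for xml_bytes, tmpl, whole in element_generator('records.xml',
                                                {'record': 'record.tmpl'},
                                                'record', True):
    print(len(xml_bytes), tmpl, whole)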
stats = {}
stats['addresses'] = 0
stats['ways'] = {}
stats['nodes'] = {}

# Prepare changesets and stats to hold changes by tag name
for tag in tags:
    stats['nodes'][tag] = 0
    stats['ways'][tag] = 0
    changesets[tag] = {}

sys.stderr.write('finding points\n')

# ------------------------------------------
# Find nodes that fall within specified area
# ------------------------------------------
context = iter(etree.iterparse(osc_file, events=('start', 'end')))
event, root = context.next()
for event, n in context:
    if event == 'start':
        if n.tag == 'node':
            lon = float(n.get('lon', 0))
            lat = float(n.get('lat', 0))
            if point_in_box(lon, lat, aoi_box) and point_in_poly(lon, lat, aoi_poly):
                cid = n.get('changeset')
                nid = n.get('id', -1)
                nids.add(nid)
                ntags = n.findall(".//tag[@k]")
                addr_tags = getaddresstags(ntags)
                version = int(n.get('version'))
                # Capture address changes
def preprocess(self,
               xml_directory='RawXML',
               name_space='http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord',
               process_name=True,
               num_file_lines=10**6,
               show_progress=True):
    """
    Bulk preprocess of the Web of Science raw data.

    Parameters
    ----------
    xml_directory: str, default 'RawXML'
        Subdirectory of the database path containing the raw XML files.

    name_space: str
        The XML namespace used by the WOS FullRecord schema.

    process_name: bool, default True
        If True, then when processing the raw file, the package
        `NameParser <https://nameparser.readthedocs.io/en/latest/>`_
        will be used to split author FullNames.

    num_file_lines: int, default 10**6
        The processed data will be saved into smaller DataFrames, each
        with `num_file_lines` rows.

    show_progress: bool, default True
        Show progress with processing of the data.
    """
    pub_column_names = ['PublicationId', 'Year', 'JournalId', 'Doi', 'ISSN',
                        'Title', 'Date', 'Volume', 'Issue', 'Pages',
                        'DocType', 'TeamSize']
    author_column_names = ['AuthorId', 'FullName', 'FirstName', 'LastName']

    if show_progress:
        print("Starting to preprocess the WOS database.")

    for hier_dir_type in ['publication', 'author',
                          'publicationauthoraffiliation', 'pub2field',
                          'pub2ref', 'affiliation']:
        if not os.path.exists(os.path.join(self.path2database, hier_dir_type)):
            os.mkdir(os.path.join(self.path2database, hier_dir_type))

    pub2year = {}
    pub2doctype = {}

    found_aids = set([])
    found_affiliations = {}

    ns = {"ns": name_space}
    xmlfiles = sorted([fname for fname in
                       os.listdir(os.path.join(self.path2database, xml_directory))
                       if '.xml' in fname])

    ifile = 0
    for xml_file_name in tqdm(xmlfiles, desc='WOS xml files', leave=True,
                              disable=not show_progress):

        publication_df = []
        author_df = []
        paa_df = []
        pub2field_df = []
        pub2ref_df = []
        affiliation_df = []
        field_df = []

        name, extension = os.path.splitext(xml_file_name)

        # Read the file into memory; both gzipped and plain XML end up as a
        # BytesIO so iterparse always sees bytes (the original left bytesxml
        # undefined for the '.xml' branch).
        if extension == '.gz':
            with gzip.open(os.path.join(self.path2database, xml_directory,
                                        xml_file_name), 'r') as infile:
                xml_file = infile.read()
            bytesxml = BytesIO(xml_file)
        elif extension == '.xml':
            with open(os.path.join(self.path2database, xml_directory,
                                   xml_file_name), 'rb') as infile:
                xml_file = infile.read()
            bytesxml = BytesIO(xml_file)

        # extract the desired fields from the XML tree
        xmltree = etree.iterparse(bytesxml, events=('end', ),
                                  tag="{{{0}}}REC".format(name_space))

        if show_progress:
            print("{} Xml tree parsed, iterating through elements.".format(xml_file_name))

        for event, elem in xmltree:
            # scrape the publication information
            PublicationId = load_html_str(elem.xpath('./ns:UID', namespaces=ns)[0].text)
            pub_record = self._blank_wos_publication(PublicationId)

            pub_record['Title'] = load_html_str(load_xml_text(elem.xpath(
                './ns:static_data/ns:summary/ns:titles/ns:title[@type="item"]',
                namespaces=ns)))
            pub_record['JournalId'] = load_html_str(load_xml_text(elem.xpath(
                './ns:static_data/ns:summary/ns:titles/ns:title[@type="source"]',
                namespaces=ns)))

            pub_info = elem.xpath('./ns:static_data/ns:summary/ns:pub_info',
                                  namespaces=ns)[0]
            pub_record['Year'] = load_int(pub_info.get('pubyear', ''))
            pub_record['Date'] = load_html_str(pub_info.get('sortdate', ''))
            pub_record['Volume'] = load_int(pub_info.get('vol', ''))
            pub_record['Issue'] = load_int(pub_info.get('issue', ''))

            pub2year[PublicationId] = pub_record['Year']

            pub_record['Pages'] = load_html_str(load_xml_text(elem.xpath(
                './ns:static_data/ns:summary/ns:pub_info/ns:page',
                namespaces=ns), default=''))

            for ident in ['ISSN', 'Doi']:
                identobject = elem.xpath(
                    './ns:dynamic_data/ns:cluster_related/ns:identifiers/ns:identifier[@type="{}"]'.format(ident.lower()),
                    namespaces=ns)
                if len(identobject) > 0:
                    pub_record[ident] = load_html_str(identobject[0].get('value', ''))

            pub_record['DocType'] = load_html_str(load_xml_text(elem.xpath(
                './ns:static_data/ns:summary/ns:doctypes/ns:doctype',
                namespaces=ns)))
            pub2doctype[PublicationId] = pub_record['DocType']

            # now scrape the authors
            pub_authors = {}
            author_objects = elem.xpath(
                './ns:static_data/ns:summary/ns:names/ns:name[@role="author"]',
                namespaces=ns)
            pub_record['TeamSize'] = len(author_objects)

            for author_obj in author_objects:
                author_record = self._blank_wos_author(None)
                author_record['AuthorId'] = author_obj.get('dais_id', None)

                author_record['FullName'] = load_html_str(load_xml_text(
                    author_obj.xpath('./ns:full_name', namespaces=ns)))
                author_record['FirstName'] = load_html_str(load_xml_text(
                    author_obj.xpath('./ns:first_name', namespaces=ns)))
                author_record['LastName'] = load_html_str(load_xml_text(
                    author_obj.xpath('./ns:last_name', namespaces=ns)))

                author_record['Affiliations'] = author_obj.get('addr_no', '')
                author_record['Affiliations'] = [
                    int(single_addr_no) for single_addr_no
                    in author_record['Affiliations'].split(' ')
                    if len(single_addr_no) > 0]

                author_record['AuthorOrder'] = int(author_obj.get('seq_no', None))
                pub_authors[author_record['AuthorOrder']] = author_record

            address_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:addresses/ns:address_name/ns:address_spec',
                namespaces=ns)
            for addr_obj in address_objects:
                addr_record = self._blank_wos_affiliation()

                # prefer the organization marked pref="Y", fall back to any
                organization_objects = addr_obj.xpath(
                    './ns:organizations/ns:organization[@pref="Y"]', namespaces=ns)
                if len(organization_objects) == 0:
                    organization_objects = addr_obj.xpath(
                        './ns:organizations/ns:organization', namespaces=ns)

                if len(organization_objects) == 0:
                    orgtext = ''
                else:
                    orgtext = organization_objects[0].text

                address_no = int(addr_obj.get('addr_no'))
                affiliation_df.append([PublicationId, address_no, orgtext])

            # subject headings and keywords, all collected as "fields"
            field_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:headings/ns:heading',
                namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'heading']
                             for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subheadings/ns:subheading',
                namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'subheading']
                             for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subjects/ns:subject[@ascatype="traditional"]',
                namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'ASCA traditional subject']
                             for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subjects/ns:subject[@ascatype="extended"]',
                namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'ASCA extended subject']
                             for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:keywords/ns:keyword',
                namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'keyword']
                             for field_obj in field_objects if field_obj is not None])

            field_objects = elem.xpath(
                './ns:static_data/ns:item/ns:keywords_plus/ns:keyword',
                namespaces=ns)
            field_df.extend([[PublicationId, field_obj.text, 'keyword plus']
                             for field_obj in field_objects if field_obj is not None])

            # references: <uid> carries the cited id, and a following <year>
            # (when present) carries its publication year
            reference_objects = elem.xpath(
                './ns:static_data/ns:fullrecord_metadata/ns:references/ns:reference',
                namespaces=ns)
            for ref_obj in reference_objects:
                for ref_elem in ref_obj:
                    if ref_elem.tag == "{{{0}}}uid".format(name_space):
                        refid = load_html_str(ref_elem.text.replace('WOS:', ''))
                        pub2ref_df.append([PublicationId, refid])
                    elif ref_elem.tag == "{{{0}}}year".format(name_space):
                        pub2year[refid] = load_int(ref_elem.text)

            publication_df.append([pub_record[k] for k in pub_column_names])

            for aorder, author_record in pub_authors.items():
                if author_record['AuthorId'] is not None \
                        and author_record['AuthorId'] not in found_aids:
                    found_aids.add(author_record['AuthorId'])
                    author_df.append([author_record[k] for k in author_column_names])

                paa_df.append([PublicationId, author_record['AuthorId'],
                               aorder, author_record['FullName']])

            elem.clear()  # free the parsed record to keep memory bounded

        self._save_dataframes(ifile, publication_df, pub_column_names,
                              author_df, author_column_names,
                              paa_df, pub2ref_df, affiliation_df, field_df)
        ifile += 1

    with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'), 'w') as outfile:
        outfile.write(json.dumps(pub2year).encode('utf8'))

    with gzip.open(os.path.join(self.path2database, 'pub2doctype.json.gz'), 'w') as outfile:
        outfile.write(json.dumps(pub2doctype).encode('utf8'))
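The parser above leans on small helpers (`load_html_str`, `load_int`, `load_xml_text`) that are defined elsewhere. A minimal sketch of what they plausibly do, reconstructed only from how they are called here, so treat the exact semantics as an assumption:

import html

def load_xml_text(elem_list, default=''):
    # Return the text of the first element in an XPath result list,
    # or the default when the path matched nothing.
    if len(elem_list) > 0 and elem_list[0].text is not None:
        return elem_list[0].text
    return default

def load_html_str(s):
    # Unescape HTML entities; pass non-strings through unchanged.
    return html.unescape(s) if isinstance(s, str) else s

def load_int(s):
    # Best-effort int conversion; None when the field is empty or absent.
    try:
        return int(s)
    except (TypeError, ValueError):
        return None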
def read_marc_file(f):
    for event, elem in etree.iterparse(f, tag=record_tag):
        yield MarcXml(elem)
        elem.clear()  # free each record once the consumer has moved on
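`record_tag` and `MarcXml` come from the surrounding module. A hedged usage sketch; the namespace below is the standard MARCXML one, but the original definition of `record_tag` is not shown, and 'records.xml' is a hypothetical path:

record_tag = '{http://www.loc.gov/MARC21/slim}record'  # assumed definition

with open('records.xml', 'rb') as f:
    for record in read_marc_file(f):
        # process one MarcXml wrapper at a time, in roughly constant memory
        pass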
def parse(self, languages, replace=False, bn_to_wn_mapping=None):
    """
    Parses sentences of the given languages.
    If replace=False: for each sentence, yields its attributes, text,
    text attributes, annotations, and annotation attributes.
    If replace=True: for each sentence, yields the processed sentence
    according to the replacement rule.
    :param languages: languages to be considered (in ISO code), as List
    :param replace: True: replaces anchors with lemma_annotation; False: nothing (default)
    :param bn_to_wn_mapping: dictionary mapping BabelNet IDs to WordNet IDs
    :return: the appropriate result, in a generator fashion
    """
    assert (replace is False) or (replace and bn_to_wn_mapping is not None), \
        "BabelNet to WordNet mapping must be provided if replace=True"

    for event, sentence in etree.iterparse(self.xml, tag="sentence"):
        proc_sentences = []
        sentence_attrs = sentence.attrib
        text_attrs, text, annotations, annotations_attrs = [], [], [], []

        if event == "end":
            for element in sentence:
                if element.tag == "text" and element.get("lang") in languages:
                    text_attrs.append(element.attrib)
                    text.append(element.text)
                if element.tag == "annotations":
                    for annotation in element:
                        if annotation.get("lang") in languages:
                            annotations_attrs.append(annotation.attrib)
                            annotations.append(annotation.text)

            if not replace:
                yield sentence_attrs, text, text_attrs, annotations, annotations_attrs
            else:
                # used to keep track of possible replacements in order to
                # select the longest mention
                Replacement = namedtuple("Replacement", "anchor synset lemma")

                # iterate over all texts, one for each language selected
                for single_text, single_attrs in zip(text, text_attrs):
                    # skip processing of null texts (e.g. sentence id = 2031)
                    if single_text is None:
                        continue

                    proc_sentence = []
                    for word in single_text.split(" "):
                        curr_replacement = Replacement(anchor=[],
                                                       synset="<NO_SYNSET>",
                                                       lemma="")
                        for annotation, ann_attrs in zip(annotations, annotations_attrs):
                            # only consider annotations in the same language as the text
                            if ann_attrs["lang"] != single_attrs["lang"]:
                                continue
                            curr_anchor = ann_attrs["anchor"].split(" ")
                            if word in curr_anchor:
                                # the longest mention is preferred in case of
                                # multiple annotations for the same word
                                if len(curr_anchor) > len(curr_replacement.anchor):
                                    curr_replacement = Replacement(
                                        anchor=curr_anchor,
                                        synset=annotation,
                                        lemma=ann_attrs["lemma"])

                        # no annotation for this word
                        if curr_replacement.synset == "<NO_SYNSET>":
                            proc_sentence.append(word)
                        # annotation found and word is the last in its mention
                        elif word == curr_replacement.anchor[-1]:
                            # build the lemma_synset format for the whole mention
                            replacement_word = "%s_%s" % (
                                curr_replacement.lemma.replace(" ", "_"),
                                curr_replacement.synset)
                            proc_sentence.append(replacement_word)

                    # form a string concatenated by space
                    proc_sentences.append(" ".join(proc_sentence))

                yield proc_sentences

        sentence.clear()
def _iterparse(source):
    for _, node in etree.iterparse(source, tag='document'):
        yield node
        node.clear()
def _parse_results(self, file_report):
    # Special thanks to:
    # http://codereview.stackexchange.com/questions/2449/parsing-huge-xml-file-with-lxml-etree-iterparse-in-python
    context = etree.iterparse(file_report, huge_tree=True,
                              remove_blank_text=True, dtd_validation=False,
                              events=("start", "end"))

    for hostCounter, element in enumerate(self._extract_host_elements(context)):
        item_info = {
            'scan_start': '',
            'scan_stop': '',
            'os': '',
            'hostname': '',
            'netbios_name': '',
            'mac_address': '',
            'ip': '',
        }

        # The generator also surfaces the enclosing "Report" element;
        # skip it and only process host entries.
        if element.tag == 'Report':
            continue

        # make sure the element is formatted properly
        ip = element.get('name')
        if ip is None:
            continue

        host_properties = element.find('HostProperties')
        if host_properties is not None:
            self._results[ip] = []
            host_tags = host_properties.findall('tag')
            if host_tags is not None:
                for host_tag in host_tags:
                    if host_tag.get("name") == 'HOST_START':
                        item_info['scan_start'] = host_tag.text
                    if host_tag.get("name") == 'HOST_END':
                        item_info['scan_stop'] = host_tag.text
                    if host_tag.get("name") == 'operating-system':
                        item_info['os'] = host_tag.text
                    if host_tag.get("name") == 'host-fqdn':
                        item_info['hostname'] = host_tag.text
                    if host_tag.get("name") == 'netbios-name':
                        item_info['netbios_name'] = host_tag.text
                    if host_tag.get("name") == 'mac-address':
                        item_info['mac_address'] = host_tag.text
                    if host_tag.get("name") == 'host-ip':
                        item_info['ip'] = host_tag.text
            self._results[ip].append(item_info)
        else:
            # etree returned a host element without properties
            print "I found IP:", ip, "but there was an empty element."

        report_items = element.findall('ReportItem')
        data_items = ['description', 'solution', 'plugin_type',
                      'cvss_base_score', 'cvss_vector', 'exploit_available',
                      'exploitability_ease', 'exploit_framework_metasploit',
                      'cve']
        if report_items is not None:
            for report_item in report_items:
                vuln = {
                    'plugin_name': '',
                    'plugin_id': '',
                    'plugin_type': '',
                    'port': '',
                    'protocol': '',
                    'description': '',
                    'solution': '',
                    'service_name': '',
                    'cvss_base_score': '0.0',
                    'cvss_vector': '',
                    'exploit_available': '',
                    'metasploit': '',
                    'cve': '',
                }

                # Skip this vulnerability if its plugin is blacklisted
                if report_item.get('pluginID') in self._blacklist:
                    self._blacklist_hit += 1
                    continue

                vuln['plugin_name'] = report_item.get('pluginName')
                vuln['plugin_id'] = report_item.get('pluginID')
                vuln['port'] = report_item.get('port')
                vuln['protocol'] = report_item.get('protocol')
                vuln['description'] = report_item.get('description')
                vuln['service_name'] = report_item.get('svc_name')

                # Default the exploit/patch flags once, before scanning the
                # child elements. The original reset them inside the loop on
                # every found element, which could clobber a 'true' set by an
                # earlier element.
                vuln['exploit_framework_metasploit'] = 'false'
                vuln['exploit_available'] = 'false'
                vuln['patch_avail'] = 'false'

                for data_item in data_items:
                    data = report_item.find(data_item)
                    if data is not None:
                        if data.tag == 'description':
                            vuln['description'] = data.text
                        if data.tag == 'solution':
                            vuln['solution'] = data.text
                        if data.tag == 'plugin_type':
                            vuln['plugin_type'] = data.text
                        if data.tag == 'cvss_base_score':
                            if data.text is not None:
                                vuln['cvss_base_score'] = data.text
                        if data.tag == 'cvss_vector':
                            vuln['cvss_vector'] = data.text
                        if data.tag == 'exploit_available':
                            vuln['exploit_available'] = 'true'
                        if data.tag == 'exploitability_ease':
                            vuln['exploit_available'] = 'true'
                        if data.tag == 'exploit_framework_metasploit':
                            vuln['metasploit'] = 'true'
                        if data.tag == 'cve':
                            vuln['cve'] = data.text

                self._results[ip].append(vuln)
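`_extract_host_elements` is not shown. A minimal sketch consistent with how it is used above (yielding completed elements from the (start, end) event stream, including the enclosing Report element that the caller skips); this is a hypothetical reconstruction, not the original:

def _extract_host_elements(self, context):
    # Yield candidate host elements on their 'end' event. Nessus v2
    # reports nest ReportHost elements inside a Report element, and the
    # caller above explicitly tolerates seeing 'Report' here too.
    for event, element in context:
        if event == 'end' and element.tag in ('ReportHost', 'Report'):
            yield element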
def parse_unzipped(fp1):
    context = etree.iterparse(
        fp1, tag='{http://www.mediawiki.org/xml/export-0.10/}page',
        encoding='utf-8')
    fast_iter(context)
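Several snippets in this collection hand their context to a `fast_iter` helper that is not defined here. The widely circulated pattern (after Liza Daly's high-performance lxml parsing write-up) looks roughly like the sketch below; the `func` callback argument is an assumption about the local variant:

def fast_iter(context, func=None):
    # Process each element, then free it and every already-parsed sibling
    # so memory stays flat even on multi-gigabyte inputs.
    for event, elem in context:
        if func is not None:
            func(elem)
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    del context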
def parse(self):
    results_base = os.path.join(self.data_path_base, 'results')
    if not os.path.isdir(results_base):
        os.makedirs(results_base)

    page_info_results_file = os.path.join(results_base, 'page_info.csv')
    revision_info_results_file = os.path.join(results_base, 'revisions.csv')
    no_text_error_results_file = os.path.join(results_base, 'no_text_error.csv')
    author_info_results_file = os.path.join(results_base, 'author_info.csv')

    results_path = os.path.join(results_base, os.path.splitext(self.file_name)[0])
    if not os.path.isdir(results_path):
        os.makedirs(results_path)

    cat_results_file = os.path.join(results_path, 'cats.csv')
    link_results_file = os.path.join(results_path, 'links.csv')

    for file in glob.glob(self.data_path + '/*'):
        size = os.path.getsize(file)
        if size < 10485760000:  # only parse files under roughly 10 GB
            for event, elem in etree.iterparse(
                    file,
                    tag='{http://www.mediawiki.org/xml/export-0.10/}page',
                    huge_tree=True):
                for data in elem.iterchildren(
                        reversed=False,
                        tag='{http://www.mediawiki.org/xml/export-0.10/}ns'):
                    ns = data.text
                    if ns == '0' or ns == '14':
                        page_info, revision_info, no_text_error, author_info = \
                            self.get_data(elem, cat_results_file, link_results_file)
                        page_info.to_csv(page_info_results_file, sep='\t',
                                         mode='a', header=False, index=False)
                        revision_info.to_csv(revision_info_results_file, sep='\t',
                                             mode='a', header=False, index=False)
                        no_text_error.to_csv(no_text_error_results_file, sep='\t',
                                             mode='a', header=False, index=False)
                        author_info.to_csv(author_info_results_file, sep='\t',
                                           mode='a', header=False, index=False)
                    else:
                        pass
                elem.clear()
                while elem.getprevious() is not None:
                    del elem.getparent()[0]
            os.remove(file)
        else:
            too_large = os.path.join(self.data_path_base, 'too_large_to_parse')
            if not os.path.isdir(too_large):
                os.makedirs(too_large)
            try:
                subprocess.call(['7z', 'a',
                                 os.path.join(os.getcwd(), file + '.7z'),
                                 os.path.join(os.getcwd(), file)])
                shutil.copy2(file + '.7z', too_large)
                os.remove(file)
                os.remove(file + '.7z')
            except:
                pass

    return True
def parseDBLP(facultydict):
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}

    with gzip.open("dblp.xml.gz") as f:
        oldnode = None

        for (event, node) in ElementTree.iterparse(f, events=["start", "end"]):
            if oldnode is not None:
                oldnode.clear()
            oldnode = node

            foundArticle = True  # include all venues
            # foundArticle = False
            inRange = False
            authorsOnPaper = 0
            authorName = ""
            confname = ""
            year = -1
            pageCount = -1
            foundOneInDict = False
            volume = 0

            if node.tag == "inproceedings" or node.tag == "article":
                # First, check if this is one of the conferences we are looking for.
                for child in node:
                    if child.tag == "booktitle" or child.tag == "journal":
                        confname = child.text
                        if True:  # INCLUDE ALL VENUES
                            # was: if (confname in confdict):
                            foundArticle = True
                    if child.tag == "volume":
                        volume = child.text
                    if child.tag == "year":
                        if child.text is not None:
                            year = int(child.text)
                    if child.tag == "pages":
                        pageCount = csrankings.pagecount(child.text)
                    if child.tag == "author":
                        authorName = child.text
                        if authorName is not None:
                            authorName = authorName.strip()
                            authorsOnPaper += 1
                            if authorName in facultydict:
                                foundOneInDict = True

                if not foundArticle:
                    # Not one of our conferences.
                    continue

                if confname is None:
                    continue

                if confname not in csrankings.confdict:
                    areaname = "na"
                else:
                    areaname = csrankings.confdict[confname]

                # Check that dates are in the specified range.
                if (year >= startyear) and (year <= endyear):
                    inRange = True

                if year == -1:
                    # No year.
                    continue

                tooFewPages = False
                if (pageCount != -1) and (pageCount < csrankings.pageCountThreshold):
                    tooFewPages = True

                exceptionConference = confname == "SC"
                exceptionConference |= confname == "SIGSOFT FSE" and year == 2012
                exceptionConference |= (confname == "ACM Trans. Graph."
                                        and int(volume) >= 26 and int(volume) <= 36)
                if exceptionConference:
                    tooFewPages = False

                if (not inRange) or (not foundOneInDict) or tooFewPages:
                    continue

                # If we got here, we have a winner.
                for child in node:
                    if child.tag == "author":
                        authorName = child.text
                        authorName = authorName.strip()
                        if authorName in facultydict:
                            print("here we go " + authorName + " " + confname
                                  + " " + str(authorsOnPaper) + " " + str(year))
                            logstring = authorName.encode("utf-8")
                            logstring += " ; ".encode("utf-8")
                            logstring += confname.encode("utf-8")
                            logstring += " ".encode("utf-8")
                            logstring += str(year).encode("utf-8")
                            tmplist = authlogs.get(authorName, [])
                            tmplist.append(logstring)
                            authlogs[authorName] = tmplist
                            interestingauthors[authorName] = \
                                interestingauthors.get(authorName, 0) + 1
                            authorscores[(authorName, areaname, year)] = \
                                authorscores.get((authorName, areaname, year), 0) + 1.0
                            authorscoresAdjusted[(authorName, areaname, year)] = \
                                authorscoresAdjusted.get((authorName, areaname, year), 0) \
                                + 1.0 / authorsOnPaper

    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)
fd.close()
del zfile

source = None
for f in os.listdir("."):
    if f.startswith("f_amp2_"):
        source = f

gc.collect()
now = datetime.datetime.now().isoformat()
print "Loading from ", source

items = []
data = {"_when": now, "_source": source}
for event, elem in etree.iterparse(source):
    if elem.tag == "AMP":
        if 'APID' in data:
            items.append(data.copy())
            data = {"_when": now, "_source": source}
            if len(items) == 100:
                scraperwiki.sqlite.save(['APID'], items)
                items = []
        else:
            print 'No APID in %s and event was %s' % (data, event)
    if elem.text:
        data[elem.tag] = elem.text
        if elem.tag == "DESC":
            m = supplier_re.match(elem.text)
            if m:
                data['SUPPLIER'] = m.groups()[0]

# Flush the final partial batch; the original only saved full batches
# of 100, so trailing records were silently dropped.
if items:
    scraperwiki.sqlite.save(['APID'], items)
def createFullCSV(marcxml_file, output):
    # Bug fix: the mode belongs to open(), not csv.writer().
    writer = csv.writer(open(output, 'w'))
    header = getHeader(marcxml_file)
    writer.writerow(header)
    for event, record in etree.iterparse(open(marcxml_file, 'rb')):
        if record.tag == marcxmlNS + "record":
            nextrow = []
            for field in header:
                if field == 'leader':
                    field_value = record.xpath('marcxml:leader', namespaces=ns)
                    if len(field_value) == 1:
                        value = field_value[0].text.encode('utf8')
                    else:
                        value = None
                    nextrow.append(value)
                elif len(field) == 3:
                    field_value = record.xpath(
                        'marcxml:controlfield[@tag=' + field + ']',
                        namespaces=ns)
                    if len(field_value) > 1:
                        value = ''
                        for n in range(len(field_value)):
                            value += field_value[n].text.encode('utf8') + ";"
                        # strip() returns a new string; the original discarded it
                        value = value.strip(';')
                    elif len(field_value) == 1:
                        value = field_value[0].text.encode('utf8')
                    else:
                        value = None
                    nextrow.append(value)
                # elif field.strip('_') in fieldsHeadings:
                #     ... (heading-specific handling with subfield skipping and
                #     middle-initial cleanup, disabled in the original)
                else:
                    tag = field[:3]
                    inds_subf = field.split('_', 1)[1]
                    ind1 = inds_subf.split("$", 1)[0][:1].replace('#', ' ')
                    ind2 = inds_subf.split("$", 1)[0][-1:].replace('#', ' ')
                    subfield = inds_subf.split("$", 1)[1]
                    if tag in fieldsNonFile1:
                        xpath = ('marcxml:datafield[@tag="' + tag + '"][@ind1="' +
                                 ind1 + '"]/' + 'marcxml:subfield[@code="' +
                                 subfield + '"]')
                    elif tag in fieldsNonFile2:
                        xpath = ('marcxml:datafield[@tag="' + tag + '"][@ind2="' +
                                 ind2 + '"]/' + 'marcxml:subfield[@code="' +
                                 subfield + '"]')
                    else:
                        xpath = ('marcxml:datafield[@tag="' + tag + '"][@ind1="' +
                                 ind1 + '"][@ind2="' + ind2 + '"]/' +
                                 'marcxml:subfield[@code="' + subfield + '"]')
                    field_value = record.xpath(xpath, namespaces=ns)
                    if len(field_value) > 1:
                        value = ''
                        for n in range(len(field_value)):
                            value += field_value[n].text.encode('utf8') + ';'
                        value = value.strip(';')
                        print(value)
                    elif len(field_value) == 1:
                        value = field_value[0].text.encode('utf8')
                    else:
                        value = None
                    nextrow.append(value)
            writer.writerow(nextrow)
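The column headers parsed above encode MARC field specs as `TAG_IND1IND2$SUBFIELD`, with `#` standing in for a blank indicator. A quick illustration of how one such header decomposes, using the hypothetical spec `'245_1#$a'` (the decomposition steps mirror the code above exactly):

field = '245_1#$a'                           # hypothetical header from getHeader()
tag = field[:3]                              # '245'
inds_subf = field.split('_', 1)[1]           # '1#$a'
ind1 = inds_subf.split('$', 1)[0][:1].replace('#', ' ')   # '1'
ind2 = inds_subf.split('$', 1)[0][-1:].replace('#', ' ')  # ' ' (blank)
subfield = inds_subf.split('$', 1)[1]        # 'a'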
def _set_up_context(self, xmlfile):
    return etree.iterparse(xmlfile, tag='token', events=('end', ))
def main(filein, hadvcode, fileout):
    """ Use lxml """
    recs = []
    #tree = etree.parse(filein)  # better to use iterparse
    # Elements can be of two kinds, 'f' or 'pv'.
    # We are interested only in 'f' elements.
    # An '<f>' element is an 'inflected form entry'.
    # Iterate through all the <f> elements.
    stemdict = {}
    stemlist = []
    n = 0
    ncaus = 0
    ndes = 0
    for _, element in etree.iterparse(filein, tag='f'):
        n = n + 1
        form = element.get("form")
        # "form" is the sole required attribute of an <f> element.
        # Its value is the inflected form.
        # Most of the time, the element has exactly 2 children. However,
        # sometimes there are multiple children. The first instance in file
        # SL_roots.xml is
        # <f form="akaRqayata">
        #   <v><cj><prim/></cj><sys><prs gn="10"><md><im/></md><atma/></prs></sys><np><sg/><trd/></np></v>
        #   <v><cj><prim/></cj><sys><prs gn="10"><md><im/></md><para/></prs></sys><np><pl/><snd/></np></v>
        #   <s stem="kaRq"/></f>
        # The rest of the <f> element (its xml 'children') is used to
        # describe the form.
        children = list(element)
        nchildren = len(children)
        # The xml '<s>' element is one of the children: "lexicon stem or
        # root generating the form". We assume '<s>' is the 'last' child
        # of '<f>'.
        s = children[-1]
        # The actual stem is the value of the "stem" attribute of <s>.
        # This stem may have a 'homonym' number, represented as '#1', '#2'
        # suffixing the stem value.
        stem = s.get("stem")
        assert (form is not None) and (stem is not None)
        # Remove homonym marker from stem, if present
        stem = re.sub(r'#.*$', '', stem)
        """
        if stem not in stemdict:
            stemdict[stem] = Huet_verb_prim_prs(stem)
            stemlist.append(stem)
        rec = stemdict[stem]
        """
        # The rest of the children describe details regarding how the
        # inflected form arises from the stem.
        inflected_forms = children[0:-1]
        #tags = [inflected_form.tag for inflected_form in inflected_forms]
        for inflected_form in inflected_forms:
            tag = inflected_form.tag
            if tag != 'vu':  # indeclinable verbal form
                continue
            [cjelt, ivelt] = list(inflected_form)  # nominal, kridanta
            assert cjelt.tag == 'cj', \
                "Unexpected cj tag %s for stem=%s" % (cjelt.tag, stem)
            [primelt] = list(cjelt)  # can be primary or causal conjugation type
            assert primelt.tag in ['prim', 'ca', 'des'], \
                "Unexpected prim tag %s for stem=%s" % (primelt.tag, stem)
            if primelt.tag == 'ca':
                ncaus = ncaus + 1
                print "%s:%s:%s of %s" % (stem, form, hadvcode, primelt.tag)
                continue  # don't handle these for now
            elif primelt.tag == 'des':
                ndes = ndes + 1
                print "%s:%s:%s of %s" % (stem, form, hadvcode, primelt.tag)
                continue
            [elt] = list(ivelt)
            if elt.tag != hadvcode:
                continue
            if stem not in stemdict:
                # stem is a root
                stemdict[stem] = Huet_adverb(stem)
                stemlist.append(stem)
            rec = stemdict[stem]
            rec.update_form(form)
        element.clear()  # for efficiency, free memory of this element

    print len(stemlist), "stems found in", filein
    fout = codecs.open(fileout, "w", "utf-8")  # utf-8 not required
    # sort stemlist in Sanskrit alphabetical order
    stemlist.sort(cmp=slp_cmp)
    for stem in stemlist:
        rec = stemdict[stem]
        forms = rec.forms
        forms.sort(cmp=slp_cmp)
        formstr = ','.join(forms)
        out = "%s:%s" % (stem, formstr)
        fout.write(out + "\n")
    fout.close()
    print ncaus, "causal forms skipped"
    print ndes, "desiderative forms skipped"
def readcity(file):
    context = etree.iterparse(file, events=('end', ))
    fast_iter(context)
def on_epoch_end(self):
    self.parser_gen = iter(etree.iterparse(self.my_file_path, tag="s"))
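`on_epoch_end` is the hook Keras calls on a `keras.utils.Sequence` between epochs, so this snippet evidently resets a streaming sentence parser each epoch. A minimal sketch of the surrounding class it plausibly belongs to; everything except `on_epoch_end` is hypothetical scaffolding:

from tensorflow import keras
from lxml import etree

class SentenceSequence(keras.utils.Sequence):
    # Hypothetical wrapper; only on_epoch_end comes from the original.
    def __init__(self, my_file_path, n_batches):
        self.my_file_path = my_file_path
        self.n_batches = n_batches
        self.on_epoch_end()

    def __len__(self):
        return self.n_batches

    def __getitem__(self, idx):
        # One <s> element per step; real code would vectorize here.
        _, sentence = next(self.parser_gen)
        text = ''.join(sentence.itertext())
        sentence.clear()
        return text

    def on_epoch_end(self):
        self.parser_gen = iter(etree.iterparse(self.my_file_path, tag="s"))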
#!/usr/bin/env python
import urllib, sys, re
from lxml import etree

word = ' '.join(sys.argv[5:]).lower()
f = urllib.urlopen('http://kill-or-cure.heroku.com/a-z/%s' % word[0])

foundterm = False
msg = causetitle = preventtitle = None
ns = ''
for ev, el in etree.iterparse(f, events=['start-ns', 'end']):
    if ev == 'start-ns':
        prefix, ns = el
        continue
    if el.tag == '{%s}h2' % ns:
        if foundterm:
            break
        classes = el.get('class').split(' ')
        if 'termHeading' in classes:
            term = el.findtext('{%s}em' % ns)
            if term.lower() == word:
                foundterm = True
                result, = set(['both', 'cause', 'prevent']) & set(classes)
                msg = etree.tostring(el, method='text')
                msg = re.sub(r'\s+', ' ', msg).strip('# ')
def _poll_collection(self, poll_service, begin, end):
    req = Taxii11.poll_request(collection_name=self.collection,
                               exclusive_begin_timestamp=begin,
                               inclusive_end_timestamp=end)
    reqhdrs = Taxii11.headers(protocol=poll_service.split(':', 1)[0])

    result = self._send_request(url=poll_service, headers=reqhdrs,
                                data=req, stream=True)
    result.raw.decode_content = True

    while True:
        result_part_number = None
        result_id = None
        more = None
        tag_stack = collections.deque()  # type: ignore

        try:
            for action, element in etree.iterparse(result.raw,
                                                   events=('start', 'end'),
                                                   recover=True):
                # Maintain a depth stack so elements can be classified by
                # where they sit in the response, not just by tag name.
                if action == 'start':
                    tag_stack.append(element.tag)
                else:
                    last_tag = tag_stack.pop()
                    if last_tag != element.tag:
                        raise RuntimeError(
                            '{} - error parsing poll response, mismatched tags'.format(
                                INTEGRATION_NAME))

                if action == 'end' and element.tag.endswith('Status_Message') \
                        and len(tag_stack) == 0:
                    self._raise_for_taxii_error(
                        BeautifulSoup(etree.tostring(element, encoding='unicode'),
                                      'xml'))
                    return
                elif action == 'end' and element.tag.endswith('Poll_Response') \
                        and len(tag_stack) == 0:
                    result_id = element.get('result_id', None)
                    more = element.get('more', None)
                    result_part_number = element.get('result_part_number', None)
                    if result_part_number is not None:
                        result_part_number = int(result_part_number)
                elif action == 'end' and element.tag.endswith('Content_Block') \
                        and len(tag_stack) == 1:
                    for c in element:
                        if c.tag.endswith('Content'):
                            if len(c) == 0:
                                continue
                            content = etree.tostring(c[0], encoding='unicode')
                            timestamp, indicators = StixDecode.decode(content)
                            for indicator in indicators:
                                yield indicator
                            if timestamp:
                                if self.last_stix_package_ts is None \
                                        or timestamp > self.last_stix_package_ts:
                                    self.last_stix_package_ts = timestamp
                        elif c.tag.endswith('Timestamp_Label'):
                            timestamp = Taxii11.parse_timestamp_label(c.text)
                            if timestamp:
                                if self.last_taxii_content_ts is None \
                                        or timestamp > self.last_taxii_content_ts:
                                    self.last_taxii_content_ts = timestamp
                    element.clear()
        finally:
            result.close()

        if not more or more == '0' or more.lower() == 'false':
            break

        if result_id is None or result_part_number is None:
            break

        # Request the next part of a multi-part result set.
        req = Taxii11.poll_fulfillment_request(
            collection_name=self.collection,
            result_id=result_id,
            result_part_number=result_part_number + 1)
        result = self._send_request(url=poll_service, headers=reqhdrs,
                                    data=req, stream=True)
print(len(community))

def show_big_communities(communites, authors, authorship):
    ten_biggest_communities = get_biggest_communities(communites, 200)
    for i, community in enumerate(ten_biggest_communities):
        print('DETAILED INFO - COMMUNITY ', i)
        show_community_size(community)
        show_the_most_popular_journals(community, authors, authorship, 5)

authors = []
i = 0
max_authors = 0
for event, elem in etree.iterparse(source=xml, dtd_validation=False, load_dtd=True):
    if i % 100000 == 0:
        print(i / 1000000, len(graph_edges), max_authors)
        max_authors = 0
    if event == 'end':
        i = i + 1
        if elem.tag == 'title':
            title = elem.text
        if elem.tag in ('article', 'inproceedings', 'incollection',
                        'proceedings', 'www', 'phdthesis',
                        'mastersthesis', 'book'):
            if len(authors) > 1:
                if elem.tag in ('article', 'inproceedings',
                                'incollection', 'proceedings'):
                    if len(authors) > max_authors:
                        max_authors = len(authors)
import csv
import sys
from lxml.etree import iterparse

writer = csv.writer(sys.stdout, quoting=csv.QUOTE_NONNUMERIC)
group_name = ''
parsing = iterparse('podcasts.opml', events=['start'])
for (event, node) in parsing:
    if node.tag != 'outline':
        continue
    if not node.attrib.get('xmlUrl'):
        # a grouping node: remember its name for the feed rows beneath it
        group_name = node.attrib['text']
    else:
        writer.writerow((group_name,
                         node.attrib['text'],
                         node.attrib['xmlUrl'],
                         node.attrib.get('htmlUrl', '')))
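For context, a toy `podcasts.opml` this script would accept (illustrative, not taken from the original), and the row it would emit:

# podcasts.opml (illustrative):
# <opml version="1.0">
#   <body>
#     <outline text="Science">
#       <outline text="Some Show" xmlUrl="http://example.com/feed.xml"
#                htmlUrl="http://example.com/"/>
#     </outline>
#   </body>
# </opml>
#
# The outer outline has no xmlUrl, so it sets group_name; the inner one
# produces: "Science","Some Show","http://example.com/feed.xml","http://example.com/"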
def make_tmpfile(pagenum, dir='tempdata'):
    '''returns a file object for a small chunk file; must close it yourself'''
    # The docstring was misplaced after a print in the original; it must be
    # the first statement of the function to act as a docstring.
    import os
    print("creates new file %d" % pagenum)
    if not os.path.exists(dir):
        os.mkdir(dir)
    fp = os.path.join(dir, 'chunk_%d.xml' % pagenum)
    return open(fp, mode='w')

# USAGE
context1 = etree.iterparse(
    "./file1", tag='{http://www.mediawiki.org/xml/export-0.10/}page',
    encoding='utf-8')
context2 = etree.iterparse(
    "file2", tag='{http://www.mediawiki.org/xml/export-0.10/}page',
    encoding='utf-8')
fast_iter(context1)
fast_iter(context2)

et = etree.parse('tempdata/chunk_20.xml')
root = et.getroot()
nsmap = {'ns': 'http://www.mediawiki.org/xml/export-0.10/'}
root.findall('ns:page', nsmap)  # find all pages
root.xpath('*/*/*/ns:username', namespaces=nsmap)  # extract all username tags
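The usage above implies a different `fast_iter` from the generic one sketched earlier: here it evidently serializes pages into numbered chunk files via `make_tmpfile`, which are then re-parsed with `etree.parse`. A hedged reconstruction; the pages-per-chunk count and the wrapping root element are assumptions:

MW_NS = 'http://www.mediawiki.org/xml/export-0.10/'

def fast_iter(context, pages_per_chunk=1000):
    # Hypothetical variant: stream <page> elements into chunk_N.xml files
    # so each chunk can later be loaded whole, as shown above.
    pagenum, count = 0, 0
    out = make_tmpfile(pagenum)
    out.write('<mediawiki xmlns="%s">\n' % MW_NS)
    for event, elem in context:
        out.write(etree.tostring(elem, encoding='unicode'))
        count += 1
        if count % pages_per_chunk == 0:
            out.write('</mediawiki>')
            out.close()
            pagenum += 1
            out = make_tmpfile(pagenum)
            out.write('<mediawiki xmlns="%s">\n' % MW_NS)
        # free memory as we go (the usual iterparse idiom)
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    out.write('</mediawiki>')
    out.close()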