def ETIteratorFromObj(obj, events=None, parser=None):
    """Yield (event, element) pairs from *obj*.

    obj can be
    1) a string that ends with .xml -> the file is parsed and the resulting
       (event, element) pairs are yielded
    2) a string that ends with .xml.gz -> the file is unzipped and parsed
    3) an open input stream -> the input is parsed
    4) an ElementTree or an Element -> every element in the tree is yielded
       with a synthetic "memory" event, since the tree is already built

    NOTE(review): `parser` is accepted but never used -- confirm callers.
    """
    # Python 2: strings may be str or unicode.
    if isinstance(obj, str) or isinstance(obj, unicode):
        if obj.endswith(".gz"):
            fStream = GzipFile(obj, "rt")
            #fStream = codecs.getreader("utf-8")(GzipFile(obj,"rt"))
        else:
            fStream = open(obj, "rt")
            #fStream=codecs.open(obj, "rt", "utf-8")
        for rv in ElementTree.iterparse(fStream, events):
            yield rv
    elif isinstance(obj, ElementTree.ElementTree) or ElementTree.iselement(obj):
        if ElementTree.iselement(obj):
            root = obj
        else:
            root = obj.getroot()
        #if events == None:
        #    events = ["END"]
        # The tree is already in memory: emit a pseudo-event per element.
        for element in root.getiterator():
            yield ("memory", element)
    else:
        #not a string, not a tree, not an element, should be a stream
        #let's parse it
        for rv in ElementTree.iterparse(obj, events):
            yield rv
def ETIteratorFromObj(obj, events=None, parser=None):
    """Yield (event, element) pairs from *obj*.

    obj can be
    1) a string path ending in .xml -> the file is parsed
    2) a string path ending in .xml.gz -> the file is unzipped and parsed
    3) an open input stream -> the stream is parsed
    4) an ElementTree or an Element -> each element is yielded with a
       synthetic "memory" event (the tree is already fully built)
    """
    if isinstance(obj, str) or isinstance(obj, unicode):
        # Pick the opener based on the file suffix; both take (path, mode).
        opener = GzipFile if obj.endswith(".gz") else open
        stream = opener(obj, "rt")
        for pair in ElementTree.iterparse(stream, events):
            yield pair
    elif ElementTree.iselement(obj) or isinstance(obj, ElementTree.ElementTree):
        root = obj if ElementTree.iselement(obj) else obj.getroot()
        # Already parsed: walk the tree and tag each node as in-memory.
        for node in root.getiterator():
            yield ("memory", node)
    else:
        # Not a string, not a tree/element: assume a readable stream.
        for pair in ElementTree.iterparse(obj, events):
            yield pair
def parse(self):
    """Lazily yield one tuple of RDF terms per <result> in the SPARQL/XML
    results stream ``self.stream``.

    First consumes the <head> section to learn variable names (in order),
    then emits one tuple per <result>, with None for unbound variables.
    """
    var_names = []
    bindings = []
    events = iter(ET.iterparse(self.stream, events=('start', 'end')))
    # lets gather up the variable names in head
    for (event, node) in events:
        if event == 'start' and node.tag == _VARIABLE:
            var_names.append(node.get('name'))
        elif event == 'end' and node.tag == _HEAD:
            break
    # now let's yield each result as we parse them
    for (event, node) in events:
        if event == 'start':
            if node.tag == _BINDING:
                # slot of this binding's variable in the result tuple
                idx = var_names.index(node.get('name'))
            elif node.tag == _RESULT:
                # fresh row, all variables unbound
                bindings = [None, ] * len(var_names)
        elif event == 'end':
            if node.tag == _URI:
                bindings[idx] = URIRef(node.text)
            elif node.tag == _BNODE:
                bindings[idx] = BNode(node.text)
            elif node.tag == _LITERAL:
                bindings[idx] = Literal(node.text or '',
                                        datatype=node.get('datatype'),
                                        lang=node.get(_LANG))
            elif node.tag == _RESULT:
                # free the finished subtree before handing out the row
                node.clear()
                yield tuple(bindings)
def iterparse(file, elementName, callback, limit=-1):
    """Parse an XML file iteratively, invoking *callback* on each element
    whose tag equals *elementName*.

    Keyword arguments:
    file        -- file or file-like object to parse
    elementName -- tag name of the elements handed to the callback
    callback    -- callable invoked with each matching element
    limit       -- stop after this many matches; -1 means read to EOF
                   (handy when debugging parsers over very large files)
    """
    root = None
    for event, node in ElementTree.iterparse(file, events=("start", "end")):
        if limit == 0:
            return
        if root is None and event == "start":
            # the first start event carries the document root
            root = node
        if event == "end" and node.tag == elementName:
            callback(node)
            # drop processed subtrees so memory use stays bounded
            root.clear()
            if limit != -1:
                limit -= 1
def iterparse(file, elementName, callback, limit = -1):
    """Iteratively parse an XML file, passing every element named
    *elementName* to *callback*.

    file        -- (file) file or file-like object to parse
    elementName -- (string) tag of the elements to hand to the callback
    callback    -- (function) called once per matching element
    limit       -- (int) stop after "limit" matches; -1 reads to EOF.
                   Mainly useful when debugging large-file parsers.
    """
    count_down = limit
    root = None
    for event, current in ElementTree.iterparse(file, events=("start", "end")):
        if count_down == 0:
            return
        if event == "start":
            if root is None:
                # remember the document root so we can prune it later
                root = current
        elif event == "end" and current.tag == elementName:
            callback(current)
            root.clear()  # keep memory flat on huge inputs
            if count_down != -1:
                count_down -= 1
def iterparse2(file, events=("start", "end")):
    """Generator wrapper around ElementTree.iterparse that yields every
    (event, element) pair and clears the document root after each "end"
    event so memory stays bounded on large files.

    file   -- file or file-like object to parse
    events -- iterparse event names to subscribe to

    The first yielded pair is the root element's first event.
    """
    # get an iterable and turn it into an iterator
    context = iter(ElementTree.iterparse(file, events=events))
    # BUG FIX: `context.next()` is Python-2-only; the builtin next()
    # is behaviorally identical and works on Python 2.6+ and 3.
    event, root = next(context)
    yield (event, root)
    for event, elem in context:
        yield (event, elem)
        if event == "end":
            # prune finished subtrees hanging off the root
            root.clear()
def parse(self):
    """Yield one tuple of RDF terms per <result> in the SPARQL/XML
    stream ``self.stream``; unbound variables are None."""
    var_names = []
    bindings = []
    events = iter(ET.iterparse(self.stream, events=('start', 'end')))
    # Phase 1: read the <head> to learn the variable names and order.
    for event, node in events:
        if event == 'end' and node.tag == _HEAD:
            break
        if event == 'start' and node.tag == _VARIABLE:
            var_names.append(node.get('name'))
    n_vars = len(var_names)
    # Phase 2: build and emit one row per <result>.
    for event, node in events:
        tag = node.tag
        if event == 'start':
            if tag == _RESULT:
                bindings = [None] * n_vars
            elif tag == _BINDING:
                idx = var_names.index(node.get('name'))
        else:  # 'end'
            if tag == _URI:
                bindings[idx] = URIRef(node.text)
            elif tag == _BNODE:
                bindings[idx] = BNode(node.text)
            elif tag == _LITERAL:
                bindings[idx] = Literal(node.text or '',
                                        datatype=node.get('datatype'),
                                        lang=node.get(_LANG))
            elif tag == _RESULT:
                node.clear()
                yield tuple(bindings)
def handle(self, *args, **options):
    """Management command: download Icecat's gzipped SuppliersList feed
    and get-or-create a Supplier row per <Supplier> element."""
    # download file
    r = requests.get('https://data.icecat.biz/export/level4/refs/SuppliersList.xml.gz',
                     auth=(settings.API_USERNAME, settings.API_PASSWORD))
    tmp_filename = settings.TMP_PATH + '/suppliers.xml.gz'
    with open(tmp_filename, "wb") as f:
        f.write(r.content)
    # the suppliers list is gzipped
    f = gzip.open(tmp_filename, 'rb')
    # use cElement, it's faaaaaaaaast
    from cElementTree import iterparse
    context = iterparse(f, events=("start", "end"))
    # turn it into an iterator
    context = iter(context)
    # get the root element (Python 2 iterator protocol)
    event, root = context.next()
    # loop through suppliers
    for event, elem in context:
        if event == "end" and elem.tag == "Supplier":
            values = dict(elem.items())
            # get or update the supplier
            supplier, created = Supplier.objects.get_or_create(name=values['Name'], pk=values['ID'])
            if created:
                print supplier.name + ' added'
            # drop processed subtrees to keep memory flat
            root.clear()
    # close gzip file
    f.close()
def read(f, ignore_tags=None):
    """ Generator for reading a wikiprep XML file from a file object.

    Yields one dict per <page>, accumulating title/text/links/categories
    and external links from the elements seen since the previous page.
    Tags listed in *ignore_tags* are skipped entirely.
    """
    print >> sys.stderr, "Reading %s..." % f
    # print >> sys.stderr, stats()
    doc = {}
    cnt = 0
    if not ignore_tags:
        ignore_tags = set()
    for event, elem in cElementTree.iterparse(f):
        if elem.tag in ignore_tags:
            continue
        if elem.tag == "title":
            doc["title"] = ("".join(elem.itertext()))
        elif elem.tag == "text":
            doc["text"] = ("".join(elem.itertext()))
        elif elem.tag == "link":
            # Skip internal links
            if elem.get("url") is None:
                continue
            if "external links" not in doc:
                doc["external links"] = []
            doc["external links"].append([elem.get("url"), ("".join(elem.itertext()))])
        elif elem.tag == "links":
            doc["links"] = [int(i) for i in string.split("".join(elem.itertext()))]
        elif elem.tag == "categories":
            doc["categories"] = [int(i) for i in string.split("".join(elem.itertext()))]
        elif elem.tag == "page":
            # <page> closes a document: attach its attributes and emit it
            doc["_id"] = int(elem.get("id"))
            doc["length"] = int(elem.get("newlength"))
            if elem.get("stub"):
                doc["stub"] = bool(elem.get("stub") == "1")
            if elem.get("disambig"):
                doc["disambig"] = bool(elem.get("disambig") == "1")
            if elem.get("image"):
                doc["image"] = bool(elem.get("image") == "1")
            cnt += 1
            yield doc
            doc = {}
            # Free the memory of the building tree
            elem.clear()
            if cnt % 20000 == 0:
                print >> sys.stderr, "Read %d articles from %s" % (cnt, f)
                # print >> sys.stderr, stats()
    print >> sys.stderr, "...done reading %s" % f
def __init__(self, filename):
    """Parse a (possibly gzip-compressed) deltainfo XML file and index
    its <newpackage> entries by NEVRA in self.deltainfo.

    filename -- path to the XML file; a ".gz" suffix selects gzip decoding.
    """
    self.deltainfo = {}
    if filename.endswith(".gz"):
        fo = gzip.open(filename)
    else:
        fo = open(filename, "rt")
    try:
        for event, elem in iterparse(fo):
            if elem.tag == "newpackage":
                p = NewPackage(elem)
                self.deltainfo[p.nevra()] = p
    finally:
        # BUG FIX: the file object was never closed (leaked descriptor);
        # close it even if parsing raises.
        fo.close()
def __init__(self, filename):
    """Load <newpackage> records from a deltainfo XML file (plain or
    gzipped) into self.deltainfo, keyed by NEVRA."""
    self.deltainfo = {}
    if filename.endswith(".gz"):
        fo = gzip.open(filename)
    else:
        fo = open(filename, 'rt')
    try:
        for event, elem in iterparse(fo):
            if elem.tag == "newpackage":
                p = NewPackage(elem)
                self.deltainfo[p.nevra()] = p
    finally:
        # BUG FIX: previously the handle was leaked; always close it.
        fo.close()
def __init__(self, filename):
    """Parse an Apple XML property list from *filename*.

    Each recognized element's text is replaced in place by its
    unmarshalled Python value (looked up by tag in self.unmarshallers),
    so parent unmarshallers can consume child values; the final value
    ends up on the <plist> root's single child and is kept as self.plist.
    """
    parser = ET.iterparse(filename)
    for action, elem in parser:
        unmarshal = self.unmarshallers.get(elem.tag)
        if unmarshal:
            data = unmarshal(elem)
            elem.clear()
            # stash the converted value where the parent can pick it up
            elem.text = data
        elif elem.tag != "plist":
            raise IOError("unknown plist type: %r" % elem.tag)
    # NOTE(review): `parser.root` is an iterparse implementation detail,
    # not documented API -- confirm it exists on the ElementTree in use.
    self.plist = parser.root[0].text
def __init__(self, file):
    """Populate the factory by streaming through the XML input:
    <molecule> elements with an id become groups, <universe> builds the
    universe, and <templates> subtrees are discarded after use."""
    MoleculeFactory.__init__(self)
    for event, node in iterparse(file):
        identifier = node.attrib.get('id', None)
        if node.tag == 'universe':
            self.makeUniverse(node)
        elif node.tag == 'templates':
            # finished with this subtree; release its memory
            node.clear()
        elif node.tag == 'molecule' and identifier is not None:
            self.makeGroup(node)
            node.clear()
def read_mast_output(mastfile, output, genes):
    """Parse a MAST XML output file and return a tuple of
    (combined p/e-values, annotation dict) for the given genes.

    NOTE(review): `output` is accepted but not used in this body --
    confirm against callers before removing.
    """
    MAST = MastResult(genes)
    for event, elem in cET.iterparse(mastfile):
        tag = elem.tag
        if tag == "model":
            MAST.model(elem)
        elif tag == "sequences":
            MAST.sequences(elem)
    # aggregate once the whole document has been consumed
    annotations = MAST.getAnnotD()
    MAST.combine_p_values()
    comb_pe_values = MAST.get_pe_values()
    return (comb_pe_values, annotations)
def handle(self, *args, **options):
    """Management command: download Icecat's gzipped CategoriesList feed
    and create/update a Category row per <Category>, including its
    localized name/description and parent link.

    Fixes over the previous version:
    * `if parent:` relied on Element truthiness, which is False for an
      element with no children, so ParentCategory was effectively never
      applied; compare against None instead.
    * Python-2-only `context.next()` replaced by the builtin next().
    * the gzip handle is now closed when done (as the sibling
      suppliers-import command already does).
    """
    # download file
    r = requests.get('https://data.icecat.biz/export/level4/refs/CategoriesList.xml.gz',
                     auth=(settings.API_USERNAME, settings.API_PASSWORD))
    tmp_filename = settings.TMP_PATH + '/categories.xml.gz'
    with open(tmp_filename, "wb") as f:
        f.write(r.content)
    # the categories list is gzipped
    f = gzip.open(tmp_filename, 'rb')
    # use cElement, it's faaaaaaaaast
    from cElementTree import iterparse
    context = iter(iterparse(f, events=("start", "end")))
    # get the root element
    event, root = next(context)
    # loop through categories
    for event, elem in context:
        if event == "end" and elem.tag == "Category":
            values = dict(elem.items())
            category, created = Category.objects.get_or_create(pk=values['ID'])
            # pick the description matching the configured language
            for descr in elem.findall('Description'):
                descr_values = dict(descr.items())
                if descr_values['langid'] == str(settings.API_LANGUAGE):
                    category.description = descr_values['Value']
            # pick the name matching the configured language
            for name in elem.findall('Name'):
                name_values = dict(name.items())
                if name_values['langid'] == str(settings.API_LANGUAGE):
                    category.name = name_values['Value']
            # see if we have a parent category
            parent = elem.find('ParentCategory')
            if parent is not None:
                category.parent_id = dict(parent.items())['ID']
            category.save()
            root.clear()
    f.close()
    # All done!
def termExtraction(appid, context, query=None):
    """Call Yahoo's Term Extraction service and return the extracted
    terms as a list of strings.

    appid   -- Yahoo application id
    context -- text to extract terms from (unicode; utf-8 encoded on wire)
    query   -- optional query to focus the extraction

    Returns [] if the response is not well-formed XML (deliberate
    best-effort behavior, preserved from the original).
    """
    d = dict(appid=appid, context=context.encode("utf-8"))
    if query:
        d["query"] = query.encode("utf-8")
    result = []
    f = urllib.urlopen(URI + '?' + urllib.urlencode(d))
    try:
        try:
            for event, elem in ElementTree.iterparse(f):
                if elem.tag == "{urn:yahoo:cate}Result":
                    result.append(elem.text)
        except SyntaxError:
            # malformed response: yield no terms rather than crash
            return []
    finally:
        # BUG FIX: the HTTP response object was leaked; always close it.
        f.close()
    return result
def loads(data):
    """Decode an XML-RPC request payload; returns (params, method).

    Recognized value elements are converted in place via the module's
    `unmarshallers` table so enclosing elements see converted text.
    """
    method = None
    params = None
    for _action, node in iterparse(StringIO(data)):
        converter = unmarshallers.get(node.tag)
        if converter:
            value = converter(node)
            node.clear()
            node.text = value
        elif node.tag == "methodName":
            method = node.text
        elif node.tag == "params":
            params = tuple(child.text for child in node)
    return params, method
def parse_troves(filename):
    """Parse a freshmeat troves XML file and pickle an
    {id: {'id': ..., 'name': ...}} dict to TROVES_DICT.

    Fixes over the previous version:
    * both the input XML file and the output pickle file were leaked;
      they are now closed via `with`.
    * locals no longer shadow the builtins `id` and `file`.
    (As before, every <id> is assumed to precede its matching <name>.)
    """
    troves = {}
    with open(filename, "r") as stream:
        for event, elem in iterparse(stream):
            if elem.tag == "id":
                trove_id = int(elem.text)
                elem.clear()
            elif elem.tag == "name":
                trove_name = "%s" % elem.text
                elem.clear()
                troves[trove_id] = {'id': trove_id, 'name': trove_name}
            elem.clear()
    with open(TROVES_DICT, 'w') as out:
        cPickle.dump(troves, out)
def load(file):
    """
    Loads an Apple Property List (XML) and parses it

    Source: http://effbot.org/zone/element-iterparse.htm
    """
    parser = ET.iterparse(file)
    for _action, node in parser:
        convert = unmarshallers.get(node.tag)
        if convert:
            # replace the element text with its unmarshalled value so the
            # parent's unmarshaller can consume it
            value = convert(node)
            node.clear()
            node.text = value
        elif node.tag != "plist":
            raise IOError("Unknown plist type: %r" % node.tag)
    return parser.root[0].text
def termExtraction(appid, context, query=None):
    """Query Yahoo's Term Extraction service; return a list of term
    strings, or [] on a malformed XML response (best-effort, preserved).
    """
    d = dict(
        appid=appid,
        context=context.encode("utf-8")
    )
    if query:
        d["query"] = query.encode("utf-8")
    result = []
    f = urllib.urlopen(URI + '?' + urllib.urlencode(d))
    try:
        try:
            for event, elem in ElementTree.iterparse(f):
                if elem.tag == "{urn:yahoo:cate}Result":
                    result.append(elem.text)
        except SyntaxError:
            return []
    finally:
        # BUG FIX: the HTTP response object was leaked; always close it.
        f.close()
    return result
def parse_troves(filename):
    """Parse given fm troves file and pickle {id: {'id', 'name'}} to
    TROVES_DICT.

    BUG FIXES: input and output file handles were leaked; both are now
    closed via `with`. Local names no longer shadow builtins `id`/`file`.
    (Assumes, as before, that each <id> precedes its <name>.)
    """
    troves = {}
    with open(filename, "r") as stream:
        for event, elem in iterparse(stream):
            if elem.tag == "id":
                trove_id = int(elem.text)
                elem.clear()
            elif elem.tag == "name":
                trove_name = "%s" % elem.text
                elem.clear()
                troves[trove_id] = {'id': trove_id, 'name': trove_name}
            elem.clear()
    with open(TROVES_DICT, 'w') as out:
        cPickle.dump(troves, out)
def parse_rdf(filename):
    """Parse a freshmeat RDF dump and pickle selected project fields to
    FM_DICT, keeping only projects released within the last four years.

    BUG FIX: both file handles were leaked; they are now closed via
    `with`. (The per-project fields are still bound only when their tags
    appear, so a record whose <descriminators> comes first would raise
    NameError -- unchanged from the original.)
    """
    projects = {}
    # Number of years to check back in time.
    years = get_last_four_years()
    with open(filename, "r") as stream:
        for event, elem in iterparse(stream):
            tag = elem.tag
            if tag == "project_id":
                project_id = int(elem.text)
                elem.clear()
            elif tag == "projectname_short":
                projectname_short = "%s" % elem.text
                elem.clear()
            elif tag == "desc_short":
                desc_short = "%s" % elem.text
                elem.clear()
            elif tag == "latest_release_version":
                latest_release_version = "%s" % elem.text
                elem.clear()
            elif tag == "url_homepage":
                url_homepage = "%s" % elem.text
                elem.clear()
            elif tag == "url_changelog":
                url_changelog = "%s" % elem.text
                elem.clear()
            elif tag == "latest_release_date":
                latest_release_date = "%s" % elem.text[0:10]
                elem.clear()
            elif tag == "descriminators":
                t = ""
                for trove in elem[:]:
                    t = "%s %s" % (t, trove.text)
                # If it hasn't been updated in four years, screw it.
                if latest_release_date[0:4] in years:
                    projects[projectname_short] = {
                        'id': project_id,
                        'descShort': desc_short,
                        'fmName': projectname_short,
                        'latestReleaseVersion': latest_release_version,
                        'urlHomepage': url_homepage,
                        'urlChangelog': url_changelog,
                        'latestReleaseDate': latest_release_date,
                        'troveId': t,
                    }
                elem.clear()
    with open(FM_DICT, 'w') as out:
        cPickle.dump(projects, out)
def _parseInternal(self, fileLike):
    """Parse <request> elements from *fileLike* into self.requests,
    one ResponseData per request; nested <value> elements become
    entries in the record's params dict."""
    for _event, request_el in et.iterparse(fileLike):
        if request_el.tag != 'request':
            continue
        attrs = request_el.attrib
        req_id = attrs.get('id')
        if req_id is None:
            raise ParseException('missing id')
        status = attrs.get('status')
        if status is None:
            raise ParseException('missing status')
        reason = attrs.get('reason')
        if reason is None:
            reason = ''
        record = ResponseData(req_id, status, reason)
        self.requests.append(record)
        for child in request_el.getiterator():
            if child.tag == 'value':
                record.params[child.attrib['name']] = child.text
def processResults(self, data):
    """Extract eBay items (view URL + current price) from a
    GetItems-style XML response string; returns a list of ebayItem."""
    ITEM_TAG = '{urn:ebay:apis:eBLBaseComponents}Item'
    URL_TAG = '{urn:ebay:apis:eBLBaseComponents}ViewItemURL'
    PRICE_TAG = '{urn:ebay:apis:eBLBaseComponents}CurrentPrice'
    items = []
    for _event, node in et.iterparse(StringIO(data)):
        if node.tag != ITEM_TAG:
            continue
        item = self.ebayItem()
        items.append(item)
        # pull the fields we care about out of the item's subtree
        for field in node.getiterator():
            if field.tag == URL_TAG:
                item.url = field.text
            elif field.tag == PRICE_TAG:
                item.price = field.text
    return items
def parse_rdf(filename):
    """Parse given fm rdf dump; pickle the kept project records to
    FM_DICT. Projects older than four years are dropped.

    BUG FIX: the input and output file handles were never closed; both
    now use `with`. Field binding order (NameError if <descriminators>
    precedes the other tags) is preserved from the original.
    """
    projects = {}
    # Number of years to check back in time.
    years = get_last_four_years()
    with open(filename, "r") as stream:
        for event, elem in iterparse(stream):
            tag = elem.tag
            if tag == "project_id":
                project_id = int(elem.text)
                elem.clear()
            elif tag == "projectname_short":
                projectname_short = "%s" % elem.text
                elem.clear()
            elif tag == "desc_short":
                desc_short = "%s" % elem.text
                elem.clear()
            elif tag == "latest_release_version":
                latest_release_version = "%s" % elem.text
                elem.clear()
            elif tag == "url_homepage":
                url_homepage = "%s" % elem.text
                elem.clear()
            elif tag == "url_changelog":
                url_changelog = "%s" % elem.text
                elem.clear()
            elif tag == "latest_release_date":
                latest_release_date = "%s" % elem.text[0:10]
                elem.clear()
            elif tag == "descriminators":
                t = ""
                for trove in elem[:]:
                    t = "%s %s" % (t, trove.text)
                # If it hasn't been updated in four years, screw it.
                if latest_release_date[0:4] in years:
                    projects[projectname_short] = {'id': project_id,
                                                   'descShort': desc_short,
                                                   'fmName': projectname_short,
                                                   'latestReleaseVersion': latest_release_version,
                                                   'urlHomepage': url_homepage,
                                                   'urlChangelog': url_changelog,
                                                   'latestReleaseDate': latest_release_date,
                                                   'troveId': t}
                elem.clear()
    with open(FM_DICT, 'w') as out:
        cPickle.dump(projects, out)
def parse_sparql_xml(self, data):
    """
    New cElementTree SPARQL Results format parser,
    Copyright (c) 2011 Daniel A. Smith (written 2011-02-23)

    Returns a list of rows; each row maps a variable name to a dict
    with 'value', 'type' and, when present, 'xml:lang'.
    """
    current_type = None
    current_name = ""
    current_chars = ""
    results = []
    current = {}
    logging.debug(data)
    for event, elem in cElementTree.iterparse(StringIO(data), events=("start", "end")):
        if event == "start":
            if elem.tag == '{http://www.w3.org/2005/sparql-results#}uri':
                current_type = 'uri'
            elif elem.tag == '{http://www.w3.org/2005/sparql-results#}literal':
                current_type = 'literal'
            elif elem.tag == '{http://www.w3.org/2005/sparql-results#}binding':
                current_name = elem.attrib['name']
            elif elem.tag == '{http://www.w3.org/2005/sparql-results#}bnode':
                current_type = 'bnode'
        elif event == "end":
            if elem.tag == '{http://www.w3.org/2005/sparql-results#}uri':
                current_chars = elem.text
            elif elem.tag == '{http://www.w3.org/2005/sparql-results#}literal':
                current_chars = elem.text
            elif elem.tag == '{http://www.w3.org/2005/sparql-results#}bnode':
                current_chars = elem.text
            elif elem.tag == '{http://www.w3.org/2005/sparql-results#}binding':
                current[current_name] = {'value': current_chars, 'type': current_type}
                # BUG FIX: dict.has_key() was removed in Python 3; the
                # `in` operator is equivalent on Python 2 and works on 3.
                if 'xml:lang' in elem.attrib:
                    current[current_name]['xml:lang'] = elem.attrib['xml:lang']
                current_chars = ""
            elif elem.tag == '{http://www.w3.org/2005/sparql-results#}result':
                results.append(current)
                current = {}
                elem.clear()
    return results
def handle(self, *args, **options):
    """Management command: bulk-import an Icecat product index file
    (path given as the first positional argument), creating a Product
    per <file> element plus its Supplier and Category rows."""
    # TODO: make this configurable?
    filename = args[0]
    with open(filename, "r") as f:
        # use cElement, it's faaaaaaaaast
        from cElementTree import iterparse
        context = iterparse(f, events=("start", "end"))
        # turn it into an iterator
        context = iter(context)
        # # get the root element (Python 2 iterator protocol)
        event, root = context.next()
        # loop through suppliers
        for event, elem in context:
            if event == "end" and elem.tag == "file":
                values = dict(elem.items())
                supplier, created = Supplier.objects.get_or_create(
                    pk=values['Supplier_id'])
                category, created = Category.objects.get_or_create(
                    pk=values['Catid'])
                product = Product()
                product.pk = values['Product_ID']
                product.supplier = supplier
                product.category = category
                product.model_name = values['Model_Name']
                product.part = values['Prod_ID']
                # timestamps arrive as compact strings, e.g. 20110131120000
                product.created_at = datetime.strptime(
                    values['Date_Added'], '%Y%m%d%H%M%S')
                product.updated_at = datetime.strptime(
                    values['Updated'], '%Y%m%d%H%M%S')
                product.thumbnail = values['HighPic']
                if values['On_Market'] == '1':
                    product.on_market = True
                product.save()
                print product.model_name, product.part
                # keep memory flat while streaming the big index
                root.clear()
def parse_sparql_xml(self, data):
    """
    New cElementTree SPARQL Results format parser,
    Copyright (c) 2011 Daniel A. Smith (written 2011-02-23)

    Returns a list of rows mapping variable name -> {'value', 'type'
    [, 'xml:lang']}.
    """
    current_type = None
    current_name = ""
    current_chars = ""
    results = []
    current = {}
    logging.debug(data)
    for event, elem in cElementTree.iterparse(StringIO(data), events=("start","end")):
        if event == "start":
            if elem.tag == '{http://www.w3.org/2005/sparql-results#}uri':
                current_type = 'uri'
            elif elem.tag == '{http://www.w3.org/2005/sparql-results#}literal':
                current_type = 'literal'
            elif elem.tag == '{http://www.w3.org/2005/sparql-results#}binding':
                current_name = elem.attrib['name']
            elif elem.tag == '{http://www.w3.org/2005/sparql-results#}bnode':
                current_type = 'bnode'
        elif event == "end":
            if elem.tag == '{http://www.w3.org/2005/sparql-results#}uri':
                current_chars = elem.text
            elif elem.tag == '{http://www.w3.org/2005/sparql-results#}literal':
                current_chars = elem.text
            elif elem.tag == '{http://www.w3.org/2005/sparql-results#}bnode':
                current_chars = elem.text
            elif elem.tag == '{http://www.w3.org/2005/sparql-results#}binding':
                current[current_name] = {'value': current_chars, 'type': current_type}
                # BUG FIX: dict.has_key() is Python-2-only (removed in 3);
                # the `in` operator is behaviorally identical.
                if 'xml:lang' in elem.attrib:
                    current[current_name]['xml:lang'] = elem.attrib['xml:lang']
                current_chars = ""
            elif elem.tag == '{http://www.w3.org/2005/sparql-results#}result':
                results.append(current)
                current = {}
                elem.clear()
    return results
def handle(self, *args, **options):
    """Management command: import Icecat's product index file (first
    positional argument) into Product rows, creating the referenced
    Supplier and Category rows on the fly."""
    # TODO: make this configurable?
    filename = args[0]
    with open(filename, "r") as f:
        # use cElement, it's faaaaaaaaast
        from cElementTree import iterparse
        context = iterparse(f, events=("start", "end"))
        # turn it into an iterator
        context = iter(context)
        # # get the root element (Python 2 iterator protocol)
        event, root = context.next()
        # loop through suppliers
        for event, elem in context:
            if event == "end" and elem.tag == "file":
                values = dict(elem.items())
                supplier, created = Supplier.objects.get_or_create(pk=values['Supplier_id'])
                category, created = Category.objects.get_or_create(pk=values['Catid'])
                product = Product()
                product.pk = values['Product_ID']
                product.supplier = supplier
                product.category = category
                product.model_name = values['Model_Name']
                product.part = values['Prod_ID']
                # timestamps arrive as compact strings, e.g. 20110131120000
                product.created_at = datetime.strptime(values['Date_Added'], '%Y%m%d%H%M%S')
                product.updated_at = datetime.strptime(values['Updated'], '%Y%m%d%H%M%S')
                product.thumbnail = values['HighPic']
                if values['On_Market'] == '1':
                    product.on_market = True
                product.save()
                print product.model_name, product.part
                # keep memory flat while streaming the big index
                root.clear()
def parseRequest(self, data):
    """Scan an eBay GetCategories XML response, collecting
    CategoryName -> CategoryID pairs; on any failure the existing
    catalog is kept.

    NOTE(review): `tempcat` is a local and is never stored on self or
    returned -- confirm whether the collected mapping is used anywhere.
    """
    currentCat = None
    tempcat = {}
    # save off the file. temporary hack!!
    domtree = parseString(data)
    f = open('/tmp/output.xml','w')
    f.write(domtree.toprettyxml().encode('utf-8'))
    f.close()
    try:
        for event,elem in et.iterparse(StringIO(data)):
            if elem.tag == '{urn:ebay:apis:eBLBaseComponents}Category':
                for itemel in elem.getiterator():
                    if itemel.tag == '{urn:ebay:apis:eBLBaseComponents}CategoryID':
                        currentCat = itemel.text
                    if itemel.tag == '{urn:ebay:apis:eBLBaseComponents}CategoryName':
                        if itemel.text == 'Cycling':
                            print 'cycling found,value is',currentCat,itemel.text.lower() in tempcat
                        # first occurrence of a name wins
                        if itemel.text.lower() not in tempcat:
                            tempcat[itemel.text.lower()] = currentCat
    # Python 2 except syntax; broad catch is deliberate best-effort
    except Exception,e:
        print 'error loading catalog; keeping existing entries',e
def inlinkData(appid,  # see http://developer.yahoo.net/faq/index.html#appid
               query,  # The domain or path to get inlink data for.
               results=50,  # The number of results to return.
               start=1,  # The starting result position to return (1-based).
                         # The finishing position (start + results - 1) cannot
                         # exceed 1000.
               entire_site=None  # Specifies whether to provide results for the
                                 # entire site, or just the page referenced by
                                 # the query. If the query is not a domain URL
                                 # (i.e. it contains path information, such as
                                 # http://smallbusiness.yahoo.com/webhosting/),
                                 # this parameter has no effect.
               ):
    """Generator over Yahoo Site Explorer inlink results: yields
    dicts with 'title', 'url' and 'clickurl' keys."""
    d = dict(appid=appid,
             query=unicodify(query).encode('utf-8'),
             results=int(results),
             start=int(start)
             )
    if entire_site:
        # xxx perhaps this parameter should be automatically
        # set based on the 'query' having a path
        d['entire_site'] = 1
    u = URI+'?'+urllib.urlencode(d)
    f = urllib.urlopen(u)
    results = []
    Title = Url = ClickUrl = None
    for event, elem in ElementTree.iterparse(f):
        if elem.tag == '{urn:yahoo:srch}ClickUrl':
            ClickUrl = elem.text
        elif elem.tag == '{urn:yahoo:srch}Url':
            Url = elem.text
        elif elem.tag == '{urn:yahoo:srch}Title':
            Title = elem.text
        # any other element ending once all three fields are set marks the
        # end of a Result: emit the record and reset for the next one
        elif not (Title is None or Url is None or ClickUrl is None):
            yield dict(title=Title, url=Url, clickurl=ClickUrl)
            Title = Url = ClickUrl = None
def handle(self, *args, **options):
    """Management command: fetch Icecat's gzipped SuppliersList feed
    and get-or-create one Supplier row per <Supplier> element."""
    # download file
    r = requests.get(
        'https://data.icecat.biz/export/level4/refs/SuppliersList.xml.gz',
        auth=(settings.API_USERNAME, settings.API_PASSWORD))
    tmp_filename = settings.TMP_PATH + '/suppliers.xml.gz'
    with open(tmp_filename, "wb") as f:
        f.write(r.content)
    # the suppliers list is gzipped
    f = gzip.open(tmp_filename, 'rb')
    # use cElement, it's faaaaaaaaast
    from cElementTree import iterparse
    context = iterparse(f, events=("start", "end"))
    # turn it into an iterator
    context = iter(context)
    # get the root element (Python 2 iterator protocol)
    event, root = context.next()
    # loop through suppliers
    for event, elem in context:
        if event == "end" and elem.tag == "Supplier":
            values = dict(elem.items())
            # get or update the supplier
            supplier, created = Supplier.objects.get_or_create(
                name=values['Name'], pk=values['ID'])
            if created:
                print supplier.name + ' added'
            # drop processed subtrees to keep memory flat
            root.clear()
    # close gzip file
    f.close()
def parseFile(self, file):
    """Stream-parse a PDBML/mmCIF-XML file, building atoms, residues,
    chains and molecules on self as atom_site records arrive.

    NOTE(review): reconstructed from flattened source -- the indentation
    of some nested branches is inferred; verify against the original
    layout before relying on edge-case behavior.
    """
    current_comp_id = None
    current_seq_id = None
    current_asym_id = None
    current_chain = None
    current_residue = None
    for event, element in ET.iterparse(file):
        tag = element.tag
        # derive the namespace prefix from the first element seen
        if self.prefix is None:
            self.prefix = tag[:tag.find('}')+1]
        if tag == self.prefix+"atom_site":
            atom_spec = self.parseAtom(element)
            # keep only the selected alternate location and model
            if (atom_spec['alt_id'] is None or atom_spec['alt_id'] == self.alternate) \
               and atom_spec['model'] == self.model:
                atom = Atom(atom_spec['name'], atom_spec['position'],
                            element=atom_spec['element'],
                            occupancy=atom_spec['occupancy'],
                            temperature_factor=atom_spec['beta'])
                self.atoms[atom_spec['atom_id']] = atom
                if atom_spec['asym_id'] != current_asym_id:
                    # start new chain or molecule
                    entity = self.entities[atom_spec['entity_id']]
                    residue_name = atom_spec['comp_id']
                    if entity['type'] == 'polymer':
                        if residue_name in amino_acids:
                            current_chain = MMTK.PDB.PDBPeptideChain(chain_id = atom_spec['asym_id'],
                                                                    segment_id = '')
                            self.peptide_chains.append(current_chain)
                        elif residue_name in nucleic_acids:
                            current_chain = MMTK.PDB.PDBNucleotideChain(chain_id = atom_spec['asym_id'],
                                                                       segment_id = '')
                            self.nucleotide_chains.append(current_chain)
                        else:
                            # NOTE(review): missing space between the two
                            # concatenated message fragments below
                            raise ValueError('Unknown polymer type' +
                                             'containing residue ' + residue_name)
                        current_comp_id = None
                        current_residue = None
                    else:
                        # non-polymer: a stand-alone molecule, no chain
                        current_chain = None
                        current_residue = MMTK.PDB.PDBMolecule(residue_name)
                        current_comp_id = residue_name
                        current_seq_id = atom_spec['seq_id']
                        mol_list = self.molecules.get(residue_name, [])
                        mol_list.append(current_residue)
                        self.molecules[residue_name] = mol_list
                        self.residues.append(current_residue)
                    current_asym_id = atom_spec['asym_id']
                if atom_spec['comp_id'] != current_comp_id or \
                   atom_spec['seq_id'] != current_seq_id:
                    # start a new residue
                    current_comp_id = atom_spec['comp_id']
                    current_seq_id = atom_spec['seq_id']
                    if current_comp_id in amino_acids:
                        current_residue = AminoAcidResidue(current_comp_id, [], current_seq_id)
                    elif current_comp_id in nucleic_acids:
                        current_residue = NucleotideResidue(current_comp_id, [], current_seq_id)
                    else:
                        raise ValueError('Unknown residue ' + residue_name)
                    current_chain.addResidue(current_residue)
                    self.residues.append(current_residue)
                if current_residue is not None:
                    # Ultimately, this should never be None, but for
                    # now we skip whatever we can't handle yet
                    current_residue.addAtom(atom)
            element.clear()
        elif tag == self.prefix+'chem_compCategory':
            self.chem_comp = element
        elif tag == self.prefix+'pdbx_entity_nameCategory':
            self.entity_names = element
        elif tag == self.prefix+'entityCategory':
            # build an {entity_id: {field: text}} index
            self.entities = {}
            for e in element:
                entity_id = e.attrib['id']
                entity_def = {}
                for field in e:
                    entity_def[field.tag[len(self.prefix):]] = field.text
                self.entities[entity_id] = entity_def
        elif tag == self.prefix+'entity_poly_seqCategory':
            self.chain_entities = None
        elif tag == self.prefix+'pdbx_poly_seq_schemeCategory':
            self.chains = element
        elif tag == self.prefix+'pdbx_nonpoly_schemeCategory':
            self.nonchains = element
        elif tag == self.prefix+'atom_siteCategory':
            # This event happens when all atoms have been treated
            element.clear()
def __init__(self, source):
    """Infer a DTD-like description from an XML instance document.

    Streams *source* with iterparse, tracking for every element name its
    attributes, character content, child-element sequencing, and whether
    children are optional/repeatable.

    NOTE(review): reconstructed from flattened source -- nesting of some
    statements is inferred; verify against the original layout.
    """
    self.elements = []
    # map element names to ElementTypes
    elemDict = {}
    # elements currently open
    elemStack = []
    # map (father-name, child-name) to _childInfo
    childInfoDict = {}
    namespaces = deque([('xml','http://www.w3.org/XML/1998/namespace')])
    pendingNamespaces = []
    for event,elem in iterparse(source, events=('start', 'end', 'start-ns', 'end-ns')):
        if event == 'start-ns':
            namespaces.appendleft(elem)
            pendingNamespaces.append(elem)
        elif event == 'end-ns':
            namespaces.popleft()
        elif event == 'start':
            # add the namespace declarations as attributes
            for prefix,url in pendingNamespaces:
                attr = prefix and 'xmlns:%s' % prefix or 'xmlns'
                elem.attrib[attr] = escape_attrib(url)
            del pendingNamespaces[:]
            # convert name from clark format to prefix:local
            name = _clark_to_orig(elem.tag,namespaces)
            elemType = elemDict.get(name)
            if elemType is None:
                elemType = ElementType(name)
                elemDict[name] = elemType
                self.elements.append(elemType)
            elemType._occurrences += 1
            # update atttibute declarations
            for attr,value in elem.items():
                # convert attribute names from clark format to prefix:local
                elemType.updateAttribute(_clark_to_orig(attr,namespaces),value)
            # keep track of the nesting and sequence of child elements
            if elemStack:
                parentEntry = elemStack[-1]
                # for sequencing, we're interested in consecutive groups
                # of the same child element type
                isFirstInGroup = parentEntry.latestChild != name
                if isFirstInGroup:
                    parentEntry.latestChild = name
                    parentEntry.sequenceNumber += 1
                # check if we've seen this child of this parent before
                parent = parentEntry.elemType
                childInfo = childInfoDict.get((parent.name,name))
                if childInfo is None:
                    # this is the first time we've seen this child
                    # belonging to this parent. if the child is not on
                    # the first instance of the parent, then we allow it
                    # as an optional element
                    childInfo = _ChildInfo(name, parent._occurrences>1)
                    childInfoDict[parent.name,name] = childInfo
                    parent._children.append(childInfo)
                elif (
                      # we've seen this child before: check if it makes
                      # parent non-consecutive
                      parent._occurrences == 1 and isFirstInGroup
                      # check whether the position of this group of children in
                      # this parent element is the same as its position in
                      # previous instances of the parent.
                      or len(parent._children) <= parentEntry.sequenceNumber
                      or parent._children[parentEntry.sequenceNumber].name != name):
                    parent._sequenced = False
                # if there's more than one child element, mark it as repeatable
                if not isFirstInGroup:
                    childInfo.repeatable = True
                #fi elemStack
            elemStack.append(_StackEntry(elemType))
        elif event == 'end':
            entry = elemStack.pop()
            elemType = entry.elemType
            # any non-whitespace text or tail means mixed/char content
            for txt in elem.text, elem.tail:
                if txt is not None and not txt.isspace():
                    elemType._hasCharacterContent = True
                    break
            # check that all expected children are accounted for.
            # If the number of child element groups in this parent element
            # is less than the number in previous elements, then the
            # absent children are marked as optional
            if elemType._sequenced:
                for c in elemType._children[entry.sequenceNumber+1:]:
                    c.optional = True
            elem.clear()
def parse_intact_xml_file(self, curs, input_fname, expt_table, interaction_table, acc_tax_id2gene_id_list):
    """Parse an IntAct PSI-MI XML file and load it into the database.

    Streams *input_fname* with ElementTree.iterparse, and for each
    <entry>:
      - submits every experimentDescription to *expt_table*,
      - maps each interactor's IntAct id to (uniprot_id, gene_id,
        tax_id), resolving gene ids first through
        *acc_tax_id2gene_id_list* and falling back to gene-symbol
        lookup via get_gene_symbol2gene_id,
      - submits each interaction to *interaction_table* (skipping ones
        without any resolved uniprot/gene ids).

    curs -- open DB cursor, passed through to the submit_* helpers.

    12-28-05
        xmlns="net:sf:psidev:mi" causes a namespace header to be added
        for each element
    12-29-05
        add acc_tax_id2gene_id_list
    """
    sys.stderr.write("Parsing %s...\n"%input_fname)
    # iterparse yields fully-built elements on their 'end' event, so
    # every <entry> subtree is complete when we see it
    namespace = '{net:sf:psidev:mi}'
    for event, elem in ElementTree.iterparse(input_fname):
        if elem.tag == '%sentry'%namespace:
            interactor_intact_id2uniprot_id_tax_id = {}  #later used in parsing interactionList
            for sub_elem in elem:
                if sub_elem.tag == '%sexperimentList'%namespace:
                    for expt_desc_elem in sub_elem:
                        expt_attrib = expt_attribute()
                        expt_attrib.expt_id = expt_desc_elem.get("id")
                        expt_attrib.short_label = expt_desc_elem.findtext('%snames/%sshortLabel'%(namespace, namespace))
                        expt_attrib.full_name = expt_desc_elem.findtext("%snames/%sfullName"%(namespace, namespace))
                        pubmed_ref_elem = expt_desc_elem.find("%sbibref/%sxref/%sprimaryRef"%(namespace, namespace, namespace))
                        # -1 marks a non-pubmed (or missing) primary reference
                        if pubmed_ref_elem.get("db")=="pubmed":
                            expt_attrib.pubmed_id = int(pubmed_ref_elem.get("id"))
                        else:
                            expt_attrib.pubmed_id = -1
                        self.submit_expt_table(curs, expt_attrib, expt_table)
                        expt_desc_elem.clear()  #release memory
                if sub_elem.tag == '%sinteractorList'%namespace:
                    for interactor_elem in sub_elem:
                        interactor_intact_id = interactor_elem.get("id")
                        uniprot_id_elem = interactor_elem.find('%sxref/%sprimaryRef'%(namespace, namespace))
                        uniprot_id = uniprot_id_elem.get("id")
                        tax_id_elem = interactor_elem.find("%sorganism"%namespace)
                        if not tax_id_elem:
                            # no organism info: record the uniprot id only
                            interactor_intact_id2uniprot_id_tax_id[interactor_intact_id] = (uniprot_id.upper(), None, None)
                        else:
                            tax_id = int(tax_id_elem.get("ncbiTaxId"))
                            #12-29-05
                            key = (uniprot_id.upper(), tax_id)
                            gene_id_list = acc_tax_id2gene_id_list.get(key)
                            if gene_id_list and len(gene_id_list)==1:
                                # unambiguous accession -> gene id mapping
                                interactor_intact_id2uniprot_id_tax_id[interactor_intact_id] = (uniprot_id.upper(), int(gene_id_list[0]), tax_id)
                            else:
                                #sys.stderr.write("\t Warning: %s gets entrez gene_id_list: %s\n"%(uniprot_id, gene_id_list))
                                interactor_intact_id2uniprot_id_tax_id[interactor_intact_id] = (uniprot_id.upper(), None, tax_id)  #12-29-05 use None
                                #10-25-06 mrinal: fall back to resolving the gene id
                                # through the interactor's gene-name aliases
                                if not self.gene_symbol2gene_id.has_key(str(tax_id)):
                                    print "Getting gene symbol mappings for tax_id",tax_id
                                    self.gene_symbol2gene_id[str(tax_id)] = get_gene_symbol2gene_id(curs, tax_id)
                                gs2gid = self.gene_symbol2gene_id[str(tax_id)]
                                names_elem = interactor_elem.find('%snames'%namespace)
                                gene_names = Set()
                                gene_ids = Set()
                                for alias_elem in names_elem:
                                    if alias_elem.tag == '%salias'%namespace:
                                        alias_type = alias_elem.get("type")
                                        if alias_type =="gene name":
                                            gene_names.add(alias_elem.text)
                                        elif alias_type =="gene name synonym":
                                            gene_names.add(alias_elem.text)
                                for gene_name in gene_names:
                                    if gs2gid.has_key(gene_name):
                                        gene_ids.add(gs2gid[gene_name])
                                # only accept an unambiguous symbol-based match
                                if len(gene_ids)==1:
                                    interactor_intact_id2uniprot_id_tax_id[interactor_intact_id] = (uniprot_id.upper(), int(gene_ids.pop()), tax_id)
                                else:
                                    sys.stderr.write("\t Warning: Couldn't find gene ids for interactor id %s\n"%(interactor_intact_id))
                        interactor_elem.clear()  #release memory
                if sub_elem.tag == "%sinteractionList"%namespace:
                    # NOTE(review): wrote/skipped are only bound inside this
                    # branch but are printed after the loop -- a file without
                    # an interactionList would raise NameError at the end.
                    skipped=0
                    wrote=0
                    for interaction_elem in sub_elem:
                        interaction_attrib = interaction_attribute()
                        interaction_attrib.expt_id_array = [expt_ref_elem.text \
                            for expt_ref_elem in interaction_elem.find("%sexperimentList"%namespace)]
                        interaction_attrib.interaction_type_id = \
                            interaction_elem.find('%sinteractionType/%sxref/%sprimaryRef'%(namespace, namespace, namespace)).get("id")
                        interaction_attrib.intact_id = interaction_elem.find("%sxref/%sprimaryRef"%(namespace, namespace)).get("id")
                        for prot_part_elem in interaction_elem.find("%sparticipantList"%namespace):
                            prot_intact_id = prot_part_elem.find("%sinteractorRef"%namespace).text
                            #12-29-05 look up the mapping built from the interactorList
                            uniprot_id, gene_id, tax_id = interactor_intact_id2uniprot_id_tax_id[prot_intact_id]
                            interaction_attrib.uniprot_id_array.append(uniprot_id)
                            if interaction_attrib.tax_id and tax_id!=interaction_attrib.tax_id:
                                sys.stderr.write("\t Warning: interaction %s has >1 tax_id: %s, %s(ignored).\n"%\
                                    (interaction_attrib.intact_id, interaction_attrib.tax_id, tax_id))
                                interaction_attrib.is_cross_species = 1  #interaction not just within one species
                            else:
                                interaction_attrib.tax_id = tax_id
                            #10-25-06 (mrinal)
                            if gene_id:
                                interaction_attrib.gene_id_array.append(gene_id)
                            else:
                                sys.stderr.write("\t Warning: prot_intact_id %s doesn't have proper NCBI gene id.\n"%prot_intact_id)
                                interaction_attrib.is_cross_species = 1  # tag it as bad
                        if interaction_attrib.uniprot_id_array and interaction_attrib.gene_id_array:  #12-29-05 not empty
                            self.submit_interaction_table(curs, interaction_attrib, interaction_table)
                            wrote+=1
                        else:
                            skipped+=1
                        interaction_elem.clear()  #release memory
                sub_elem.clear()  #release the sub_elem
    sys.stderr.write("Done.\n")
    print "wrote:",wrote,"skipped:",skipped
print "all.xml is fresh" except IOError: print "Creating all.xml" open('update', 'w').write(update) x = urllib2.urlopen(all).read() open('all.xml', 'w').write(x) try: os.mkdir('./backup') except OSError: pass os.chdir('./backup') gmail = gmail_connect(gmail_user, gmail_pw) for e, post in ce.iterparse('../all.xml'): if post.tag != "post": continue data = dict(post.items()) #turn any utf-8 data into binary strings name = data['href'][7:].encode('utf-8') desc = data['description'].encode('utf-8') ext = data.get('extended', '').encode('utf-8') tags = data.get('tag', '').encode('utf-8') #make the url into a valid filename #\W = not([a-zA-Z0-9_]), except unicode-aware name = re.sub('\W', '_', name)[-200:] try: if not os.path.isfile(name): print "getting %s as %s" % (data['href'], name) f = open(name, 'w')
SFX = ['KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'] t0 = time.time() count = 0 size = 0 files = sys.argv[1:] if os.path.isdir(files[0]): print "No dirs please, use wildcards" sys.exit() for file in files: if os.path.isfile(os.path.abspath(file)): count += 1 size += os.path.getsize(file) print "processing %s ..." % file date = None try: for event, elem in cElementTree.iterparse(file): if elem.tag.split("}")[1] == "time": if date is None: date = isodate.parse_datetime(elem.text) else: newdate = isodate.parse_datetime(elem.text) if newdate > date: date = newdate elem.clear() except SyntaxError: print "\tnot valid xml" pass if date is not None: t = time.mktime(date.timetuple()) os.utime(file, (time.time(), t)) else:
def programmes(input_file):
    """Lazily yield a Programme for every closed <programme> element.

    input_file -- path or file object accepted by ET.iterparse.
    """
    for ev, node in ET.iterparse(input_file):
        if ev != "end":
            continue
        if node.tag != 'programme':
            continue
        yield Programme(node)
def __init__(self, source):
    """Infer a document schema by streaming *source* through iterparse.

    Records, per element name: occurrence count, attributes, whether
    it has character content, and child-element structure (sequence
    position, optional and repeatable flags).  Discovered ElementType
    objects are collected in ``self.elements`` in first-seen order.
    """
    self.elements = []
    # map element names to ElementTypes
    elemDict = {}
    # elements currently open
    elemStack = []
    # map (father-name, child-name) to _childInfo
    childInfoDict = {}
    # innermost-first (prefix, url) pairs in scope; xml: is implicit
    namespaces = deque([('xml', 'http://www.w3.org/XML/1998/namespace')])
    # declarations waiting to be attached to the next started element
    pendingNamespaces = []
    for event, elem in iterparse(source, events=('start', 'end', 'start-ns', 'end-ns')):
        if event == 'start-ns':
            # for *-ns events, elem is a (prefix, url) pair
            namespaces.appendleft(elem)
            pendingNamespaces.append(elem)
        elif event == 'end-ns':
            namespaces.popleft()
        elif event == 'start':
            # add the namespace declarations as attributes
            for prefix, url in pendingNamespaces:
                attr = prefix and 'xmlns:%s' % prefix or 'xmlns'
                elem.attrib[attr] = escape_attrib(url)
            del pendingNamespaces[:]
            # convert name from clark format to prefix:local
            name = _clark_to_orig(elem.tag, namespaces)
            elemType = elemDict.get(name)
            if elemType is None:
                # first occurrence of this element name anywhere
                elemType = ElementType(name)
                elemDict[name] = elemType
                self.elements.append(elemType)
            elemType._occurrences += 1
            # update attribute declarations
            for attr, value in elem.items():
                # convert attribute names from clark format to prefix:local
                elemType.updateAttribute(_clark_to_orig(attr, namespaces), value)
            # keep track of the nesting and sequence of child elements
            if elemStack:
                parentEntry = elemStack[-1]
                # for sequencing, we're interested in consecutive groups
                # of the same child element type
                isFirstInGroup = parentEntry.latestChild != name
                if isFirstInGroup:
                    parentEntry.latestChild = name
                    parentEntry.sequenceNumber += 1
                # check if we've seen this child of this parent before
                parent = parentEntry.elemType
                childInfo = childInfoDict.get((parent.name, name))
                if childInfo is None:
                    # this is the first time we've seen this child
                    # belonging to this parent.  If the child is not on
                    # the first instance of the parent, then we allow it
                    # as an optional element
                    childInfo = _ChildInfo(name, parent._occurrences > 1)
                    childInfoDict[parent.name, name] = childInfo
                    parent._children.append(childInfo)
                elif (
                    # we've seen this child before: check if it makes
                    # parent non-consecutive
                    parent._occurrences == 1 and isFirstInGroup
                    # check whether the position of this group of children in
                    # this parent element is the same as its position in
                    # previous instances of the parent.
                    or len(parent._children) <= parentEntry.sequenceNumber
                    or parent._children[parentEntry.sequenceNumber].name != name):
                    parent._sequenced = False
                # if there's more than one child element, mark it as repeatable
                if not isFirstInGroup:
                    childInfo.repeatable = True
            # fi elemStack
            elemStack.append(_StackEntry(elemType))
        elif event == 'end':
            entry = elemStack.pop()
            elemType = entry.elemType
            # any non-whitespace text or tail means character content
            for txt in elem.text, elem.tail:
                if txt is not None and not txt.isspace():
                    elemType._hasCharacterContent = True
                    break
            # check that all expected children are accounted for.
            # If the number of child element groups in this parent element
            # is less than the number in previous elements, then the
            # absent children are marked as optional
            if elemType._sequenced:
                for c in elemType._children[entry.sequenceNumber + 1:]:
                    c.optional = True
            elem.clear()  # free the subtree; schema info is already recorded
def channels(input_file):
    """Lazily yield a Channel for every closed <channel> element.

    input_file -- path or file object accepted by ET.iterparse.
    """
    for ev, node in ET.iterparse(input_file):
        if ev != "end":
            continue
        if node.tag != 'channel':
            continue
        yield Channel(node)
def __init__(self, source): self.elements = [] # map element names to ElementTypes elemDict = {} # elements currently open elemStack = [] namespaces = deque([('xml','http://www.w3.org/XML/1998/namespace')]) pendingNamespaces = [] for event,elem in iterparse(source, events=('start', 'end', 'start-ns', 'end-ns')): if event == 'start-ns': namespaces.appendleft(elem) pendingNamespaces.append(elem) elif event == 'end-ns': namespaces.popleft() elif event == 'start': # add the namespace declarations as attributes for prefix,url in pendingNamespaces: attr = prefix and 'xmlns:%s' % prefix or 'xmlns' elem.attrib[attr] = escape_attrib(url) del pendingNamespaces[:] # convert name from clark format to prefix:local name = _clark_to_orig(elem.tag,namespaces) elemType = elemDict.get(name) if elemType is None: elemType = ElementType(name) elemDict[name] = elemType self.elements.append(elemType) elemType._occurrences += 1 # update atttibute declarations for attr,value in elem.items(): # convert attribute names from clark format to prefix:local elemType.updateAttribute(_clark_to_orig(attr,namespaces),value) # keep track of the nesting and sequence of child elements if elemStack: parentEntry = elemStack[-1] parent = parentEntry.elemType # for sequencing, we're interested in consecutive groups # of the same child element type isFirstInGroup = parentEntry.latestChild != name if isFirstInGroup: parentEntry.latestChild = name parentEntry.groupIndex += 1 parent.setChildInfo(name,parentEntry.groupIndex) else: parent.getChildInfo(name,parentEntry.groupIndex).repeatable = True #fi elemStack elemStack.append(_StackEntry(elemType)) elif event == 'end': entry = elemStack.pop() elemType = entry.elemType for txt in elem.text, elem.tail: if txt is not None and not txt.isspace(): elemType._hasCharacterContent = True break # check that all expected children are accounted for. 
# If the number of child element groups in this parent element # is less than the number in previous elements, then the # absent children are marked as optional for c in elemType.iterChildInfo(entry.groupIndex+1,None): c.optional = True elem.clear()
def __init__(self, source): self.elements = [] # map element names to ElementTypes elemDict = {} # elements currently open elemStack = [] namespaces = deque([("xml", "http://www.w3.org/XML/1998/namespace")]) pendingNamespaces = [] for event, elem in iterparse(source, events=("start", "end", "start-ns", "end-ns")): if event == "start-ns": namespaces.appendleft(elem) pendingNamespaces.append(elem) elif event == "end-ns": namespaces.popleft() elif event == "start": # add the namespace declarations as attributes for prefix, url in pendingNamespaces: attr = prefix and "xmlns:%s" % prefix or "xmlns" elem.attrib[attr] = escape_attrib(url) del pendingNamespaces[:] # convert name from clark format to prefix:local name = _clark_to_orig(elem.tag, namespaces) elemType = elemDict.get(name) if elemType is None: elemType = ElementType(name) elemDict[name] = elemType self.elements.append(elemType) elemType._occurrences += 1 # update atttibute declarations for attr, value in elem.items(): # convert attribute names from clark format to prefix:local elemType.updateAttribute(_clark_to_orig(attr, namespaces), value) # keep track of the nesting and sequence of child elements if elemStack: parentEntry = elemStack[-1] parent = parentEntry.elemType # for sequencing, we're interested in consecutive groups # of the same child element type isFirstInGroup = parentEntry.latestChild != name if isFirstInGroup: parentEntry.latestChild = name parentEntry.groupIndex += 1 parent.setChildInfo(name, parentEntry.groupIndex) else: parent.getChildInfo(name, parentEntry.groupIndex).repeatable = True # fi elemStack elemStack.append(_StackEntry(elemType)) elif event == "end": entry = elemStack.pop() elemType = entry.elemType for txt in elem.text, elem.tail: if txt is not None and not txt.isspace(): elemType._hasCharacterContent = True break # check that all expected children are accounted for. 
# If the number of child element groups in this parent element # is less than the number in previous elements, then the # absent children are marked as optional for c in elemType.iterChildInfo(entry.groupIndex + 1, None): c.optional = True elem.clear()
def parseFile(path, source): global handles, buffers if os.path.exists(path + source + ".dat"): print ".dat already exists" return initOutput() postsFile = False if not source.find("posts.xml") == -1: if os.path.exists(path + "questions.xml.dat"): print "questions.xml.dat already exists" return postsFile = True questionsFile = open(path + "questions.xml.dat", "wb+") answersFile = open(path + "answers.xml.dat", "wb+") addHandle(questionsFile) addHandle(answersFile) else: outputFile = open(path + source + ".dat", "wb+") addHandle(outputFile) lineCounter = 1 context = iterparse(path + source, events=("start", "end")) # turn it into an iterator context = iter(context) # get the root element event, root = context.next() for event, elem in context: if lineCounter % 100 == 0: flushOutput() if elem.tag == "row": if event == "end": root.clear() else: if postsFile: if elem.get("PostTypeId") == "2": # Answer post index = 1 keys = FIELD_KEYS["answers.xml"] else: index = 0 keys = FIELD_KEYS["questions.xml"] else: index = 0 keys = FIELD_KEYS[source] vals = [] for key in keys: val = elem.get(key) if val == None: val = "NULL" vals.append(val) joined = FIELD_DELIMITER.join(vals) appendBuffer(index, joined) appendBuffer(index, LINE_DELIMITER) lineCounter += 1 flushOutput() closeOutput()
#!/usr/bin/env python
"""Split ted.xml into one text file per <item>, grouped by category."""
import cElementTree as ce
import re, os

infile = open("ted.xml")
outdir = "."

# \W = not [a-zA-Z0-9_], except unicode-aware
mkfilename = re.compile('\W').sub

for e, item in ce.iterparse(infile):
    #print e, item
    if item.tag == 'item':
        title = item.findtext('title')
        dir = item.findtext('category')
        if not os.path.isdir(dir):
            os.makedirs(dir)
        # sanitize the title into a filename, capped at 200 chars
        filename = os.path.join(outdir, dir, mkfilename('_', title)) + '.txt'
        filename = filename[:200]
        if os.path.isfile(filename):
            # fix: raising a plain string is a TypeError on Python >= 2.6;
            # raise a real exception instead
            raise IOError("file %s already exists" % filename)
        outfile = open(filename, 'w')
        print >> outfile, title
        for n in item:
            if n.tag.find('content') != -1:
                print >> outfile, n.text
            elif n.tag == 'description':
                print >> outfile, n.text
        # fix: close each output file instead of leaking the handle
        outfile.close()
infile.close()