def process_pom(self, config, pom_path):
    """Read the <minecraft_version> property from the pom.xml at `pom_path`
    and store it under config["minecraft_version"].

    Stores the empty string when the property is absent.  Relies on the
    module-level POM_NS namespace prefix and the ElementTree import.
    """
    doc = ElementTree(file=pom_path)
    mc_version = ""
    try:
        mc_version = doc.findall('/{POM}properties/{POM}minecraft_version'.format(POM=POM_NS))[0].text
    except IndexError:
        # property missing from this pom (findall returned an empty list);
        # narrowed from a bare `except:` so real errors are no longer hidden
        mc_version = ""
    config["minecraft_version"] = mc_version
def xml_to_dict(fPath):
    '''
    Converts study data from (ref man generated) XML to a dictionary
    matching study IDs (keys) to title/abstract tuples (values).

    Returns a dict mapping each refman id to
    {"title", "abstract", "journal", "keywords", "pmid", "authors"}.
    '''
    ref_ids_to_abs = {}
    num_no_abs = 0  # count of records with no abstract
    tree = ElementTree(file=fPath)
    for record in tree.findall('.//record'):
        # rec-number is a plain integer id; int() replaces the original
        # eval(), which would execute arbitrary expressions from the file
        refmanid = int(record.findtext('.//rec-number'))
        # attempt to grab the pubmed id from the free-text notes field,
        # which looks like "... - UI - <pmid> - ..."
        pubmed_id = ""
        try:
            pubmed = record.findtext('.//notes/style')
            pubmed = pubmed.split("-")
            for i in range(len(pubmed)):
                if "UI" in pubmed[i]:
                    pubmed_str = pubmed[i + 1].strip()
                    # int() over the digit characters (was eval())
                    pubmed_id = int("".join(
                        [x for x in pubmed_str if x in string.digits]))
        except Exception as ex:
            print("problem getting pmid ...")
            print(ex)
        ab_text = record.findtext('.//abstract/style')
        if ab_text is None:
            num_no_abs += 1
        title_text = record.findtext('.//titles/title/style')
        # Also grab keywords
        keywords = [keyword.text.strip().lower()
                    for keyword in record.findall(".//keywords/keyword/style")]
        # and authors
        authors = [author.text
                   for author in record.findall(".//contributors/authors/author/style")]
        # journal
        journal = record.findtext(".//periodical/abbr-1/style")
        ref_ids_to_abs[refmanid] = {"title": title_text, "abstract": ab_text,
                                    "journal": journal, "keywords": keywords,
                                    "pmid": pubmed_id, "authors": authors}
    # fix: the docstring promises a dictionary, but the original chunk
    # never returned it
    return ref_ids_to_abs
def PrintStats():
    """Looks at the XML output (stats.xml) and dumps render time."""
    try:
        from elementtree.ElementTree import ElementTree
    except ImportError:
        # narrowed from a bare `except:` — only a missing module is expected
        print("Unable to load ElementTree, skipping statistics.")
    else:
        doc = ElementTree(file='stats.xml')
        for timer in doc.findall('//timer'):
            if "totaltime" == timer.get("name"):
                # the first child of the timer element carries the value
                print("Render time was %s seconds" % timer[0].text)
                break
def PrintStats():
    """Looks at the XML output (stats.xml) and dumps render time."""
    try:
        from elementtree.ElementTree import ElementTree
    except ImportError:
        # only an import failure should be silenced here (was a bare except)
        print("Unable to load ElementTree, skipping statistics.")
    else:
        doc = ElementTree(file='stats.xml')
        for timer in doc.findall('//timer'):
            if "totaltime" == timer.get("name"):
                print("Render time was %s seconds" % timer[0].text)
                break
def get_dependencies(path):
    """Parse the pom.xml at `path` and return a dict mapping
    "<groupId>.<artifactId>" to the declared version string for every
    dependency under the project's <dependencies> element.

    Relies on the module-level POM_NS namespace prefix ("{...}").
    """
    dependencies = {}
    doc = ElementTree(file=path)
    deps = doc.findall('/%sdependencies' % POM_NS)
    for dep in deps[0]:
        groupId = dep.findall("%sgroupId" % POM_NS)[0].text
        artifactId = dep.findall("%sartifactId" % POM_NS)[0].text
        version = dep.findall("%sversion" % POM_NS)[0].text
        # fix: use a fresh local instead of clobbering the `path` parameter
        key = ".".join([groupId, artifactId])
        dependencies[key] = version
    return dependencies
def xml_to_dict(fPath):
    """
    Converts study data from (ref man generated) XML to a dictionary
    matching study IDs (keys) to title/abstract tuples (values).

    Returns a dict mapping each refman id to
    {"title", "abstract", "journal", "keywords", "pmid", "authors"}.
    """
    ref_ids_to_abs = {}
    num_no_abs = 0  # count of records with no abstract
    tree = ElementTree(file=fPath)
    for record in tree.findall(".//record"):
        # rec-number is a plain integer id; int() replaces the original
        # eval(), which would execute arbitrary expressions from the file
        refmanid = int(record.findtext(".//rec-number"))
        # attempt to grab the pubmed id from the free-text notes field,
        # which looks like "... - UI - <pmid> - ..."
        pubmed_id = ""
        try:
            pubmed = record.findtext(".//notes/style")
            pubmed = pubmed.split("-")
            for i in range(len(pubmed)):
                if "UI" in pubmed[i]:
                    pubmed_str = pubmed[i + 1].strip()
                    # int() over the digit characters (was eval())
                    pubmed_id = int("".join([x for x in pubmed_str if x in string.digits]))
        except Exception as ex:
            print("problem getting pmid ...")
            print(ex)
        ab_text = record.findtext(".//abstract/style")
        if ab_text is None:
            num_no_abs += 1
        title_text = record.findtext(".//titles/title/style")
        # Also grab keywords
        keywords = [keyword.text.strip().lower()
                    for keyword in record.findall(".//keywords/keyword/style")]
        # and authors
        authors = [author.text
                   for author in record.findall(".//contributors/authors/author/style")]
        # journal
        journal = record.findtext(".//periodical/abbr-1/style")
        ref_ids_to_abs[refmanid] = {
            "title": title_text,
            "abstract": ab_text,
            "journal": journal,
            "keywords": keywords,
            "pmid": pubmed_id,
            "authors": authors,
        }
    # fix: the docstring promises a dictionary, but the original chunk
    # never returned it
    return ref_ids_to_abs
def update_dependency(config, plugins, path, force=False):
    """Stage the dependency named by dotted `path`, run each plugin's
    pom processor against its pom.xml, and record the version found
    there in config["dependencies"][path]["version"].

    Relies on module-level names: updated_dependencies (memo of paths
    already processed this run), slugify, init_repo (presumably clones or
    refreshes the git checkout — confirm), POM_NS, ElementTree, os.
    Records nothing when `path` is not in config["dependencies"].
    """
    # skip work already done this run, unless the caller forces a refresh
    if path in updated_dependencies and not force:
        return
    if path in config["dependencies"]:
        git_url = config["dependencies"][path]["git_url"]
        # checkout directory name is a slug of the dotted dependency path
        dependency_path = os.path.join(config["staging_path"], slugify(unicode(path)))
        dependency_path = os.path.expanduser(dependency_path)
        if not os.path.isdir(dependency_path):
            os.makedirs(dependency_path)
        repo = init_repo(dependency_path, git_url)
        pom_path = os.path.join(dependency_path, 'pom.xml')
        doc = ElementTree(file=pom_path)
        # first <version> under the project root is the artifact's version
        version = doc.findall('/%sversion' % POM_NS)[0].text
        for p in plugins:
            p.process_pom(config["dependencies"][path], pom_path)
        config["dependencies"][path]["version"] = version
        # remember that this dependency has been handled
        updated_dependencies.append(path)
# Migrate a component XML file at comp_path/comp_fname from the unversioned
# format to "1.0" (and, below, from "1.0" towards "1.1").  NOTE: this chunk
# is truncated — the 1.0 -> 1.1 branch is cut off mid-statement.
fpath = os.path.join(comp_path, comp_fname)
et.parse(fpath)
version = et.getroot().get("version")
if not version:
    # unversioned file: rebuild it as a version-1.0 document
    print "\tTransforming %s..." % comp_fname
    category = ""
    # NOTE(review): `if et.find("category")` tests Element truthiness, which
    # is False for an element with no children even when it exists and has
    # text — a present-but-childless <category> is silently skipped. Confirm.
    if et.find("category"):
        category = et.find("category").text.strip()
    root = Element("component", {"version": "1.0",
                                 "name": et.find("name").text.strip(),
                                 "description": et.find("description").text.strip(),
                                 "category": category})
    tpcl_req = SubElement(root, "tpcl_requirements")
    tpcl_req.text = et.find("tpcl_requirements").text.strip()
    root.append(Comment("propertylist"))
    # copy each property's name and tpcl_cost into the new tree
    for prop in et.findall("property"):
        propelem = SubElement(root, "property", {"name": prop.find("name").text.strip()})
        tpcl_cost = SubElement(propelem, "tpcl_cost")
        tpcl_cost.text = prop.find("tpcl_cost").text.strip()
    # overwrite the original file with the rebuilt tree
    et = ElementTree(root)
    et.write(fpath, indent=True)
elif version == "1.0":
    # 1.0 file: attributes move from child elements to root attributes in 1.1
    print "\tTransforming %s..." % comp_fname
    old_root = et.getroot()
    category = old_root.get("category")
    root = Element("component", {"version": "1.1",
                                 "name": old_root.get("name"),
                                 "description": old_root.get("description")})
    # (truncated here in this view)
    tpcl_req = SubElement(root, "tpcl_requirements")
# Demo/exploration script for the (old) elementtree API against an Atom
# feed.  `content` (the raw feed bytes) is defined outside this chunk.
# Parse the same content twice: as a mutable ElementTree document and as a
# bare root Element via XML().
etree = ElementTree(file=StringIO.StringIO(content))
feed = XML(content)
print etree
print feed
#print len(feed)
#print feed[0]
#print feed.keys()
ATOM = "http://www.w3.org/2005/Atom"
# first <entry> in document order (getiterator walks the whole tree)
entry = etree.getiterator('{%s}entry' % ATOM)[0]
# append a new <link rel="source"> child to that entry
new_lin = SubElement(entry, '{%s}link' % ATOM)
new_lin.set('rel', 'source')
new_lin.set('href', 'http://somthing.org')
# feed-level <title> (first match), serialized back to text
title = etree.findall('{%s}title' % ATOM)[0]
print tostring(title)
# findall on a non-existent tag yields an empty list, not an error
missing = etree.findall('{%s}missing' % ATOM)
print missing
# '//' prefix is the old elementtree "anywhere" axis
for e in etree.findall('//{%s}link' % ATOM):
    print e.get('rel', 'alternate')
# round-trip: write the (modified) document into an in-memory buffer
s = StringIO.StringIO()
etree.write(s)
s.seek(0)
print s.getvalue()
def xmlToDict(fPath, stopPath=None, splitTxt=False, get_pubmed=False):
    '''
    Converts study data from (ref man generated) XML to a dictionary
    matching study IDs (keys) to title/abstract tuples (values).

    Returns refIDToAbs: refman id -> [title, abstract, keywords, pubmed_id].
    When splitTxt is True, title/abstract are lowercased token lists run
    through cleanUpTxt (with the stop list at stopPath) instead of strings.
    '''
    refIDToAbs = {}
    numNoPubmeds = 0
    numNoAbs = 0  # Keep track of how many studies have no abstracts.
    tree = ElementTree(file=fPath)
    for record in tree.findall('.//record'):
        pubmed_id = None
        # rec-number is a plain integer id; int() replaces the original
        # eval(), which would execute arbitrary expressions from the file
        refmanid = int(record.findall('.//rec-number')[0].text)
        try:
            # notes look like "... - UI - <pmid> - ..."
            pubmed = record.findall('.//notes/style')[0].text
            pubmed = pubmed.split("-")
            for i in range(len(pubmed)):
                if "UI" in pubmed[i]:
                    pubmed_str = pubmed[i + 1].strip()
                    # int() over the digit characters (was eval())
                    pubmed_id = int("".join([x for x in pubmed_str if x in string.digits]))
                    break
        except Exception as ex:
            print(ex)
        if pubmed_id is None:
            numNoPubmeds += 1
            print("%s has no pubmed id" % refmanid)
        abstract = record.findall('.//abstract/style')
        abText = ""
        try:
            if abstract and splitTxt:
                abText = (abstract[0].text).split(" ")
                abText = [string.lower(s) for s in abText]
                abText = cleanUpTxt(abText, stopListPath=stopPath)
            elif abstract:
                abText = abstract[0].text
            else:
                numNoAbs += 1
        except:
            pdb.set_trace()  # deliberate debugging hook kept from original
        title = ""
        if splitTxt:
            title = cleanUpTxt(string.lower(record.findall('.//titles/title/style')[0].text).split(" "),
                               stopListPath=stopPath)
        else:
            try:
                title = record.findall('.//titles/title/style')[0].text
            except:
                pdb.set_trace()  # deliberate debugging hook kept from original
        # Also grab keywords
        keywords = [keyword.text.strip().lower()
                    for keyword in record.findall(".//keywords/keyword/style")]
        # record unconditionally — the original guard
        # `pubmed_id is not None or True` was always true
        refIDToAbs[refmanid] = [title, abText, keywords, pubmed_id]
    # fix: the docstring promises a dictionary, but the original chunk
    # never returned it
    return refIDToAbs
# check list _ckeck_list = list() _p_id = 0 # for each day for _idx_d, _d in enumerate(_days): _day = SubElement( _root, "day", date=str(_d.strftime("%Y-%m-%d")), index="%s" % str((1 + _idx_d)) ) # for each room for _idx_r, _r in enumerate(_rooms): _room = SubElement(_day, "room", name=_r) # fora each talk for _idx_t, _t in enumerate(_src_xml.findall("Talk")): # get _date value _date = None if _t.find("start_time") is None \ else datetime.strptime( _t.findtext("start_time").split(" ")[0], "%Y-%m-%d") # get location value _loc = None if _t.find("location") is None \ else string.capwords(_t.findtext("location")) # get the eventid _e_id = _t.findtext("eid") # get Track value _e_track = None if not PERSON_TRACK_DICT.has_key(int(_e_id)) \ else PERSON_TRACK_DICT[int(_e_id)]["track"] # track, date and room check if _e_track is None or int(_e_id) in _ckeck_list:
# Please only use these spiders and their derivatives in accordance
# with the terms of service and acceptable use policies of the data
# providers.
import codecs

from elementtree.ElementTree import ElementTree

# Extract unique country name/link pairs from the saved countries page and
# write them as a tab-separated file.
tree = ElementTree(file='./countries.html')  # fix: no longer shadows builtin `input`
seen = set()  # fix: set membership is O(1); the original used a list
xpath = '//a'
# fix: `with` guarantees the output file is flushed and closed
with codecs.open('./country_links.csv', 'w', 'utf-8') as output:
    for element in tree.findall(xpath):
        country = element.get('title')
        href = element.get('href')
        # skip anchors without a title and countries already emitted
        if not country or country in seen:
            continue
        seen.add(country)
        print("%s %s" % (country, href))
        output.write('%s\t%s\n' % (country, href))
def update_plugin(project):
    """Refresh the staged checkout of `project`, update each of its
    dependencies, let matching plugins rewrite its source files and pom
    version numbers, then git add/commit/push whatever changed.

    Relies on module globals: config, plugins, slugify, init_repo,
    get_dependencies, matches_dependency, update_dependency, POM_NS,
    ElementTree, re, os, fnmatch, Popen.
    """
    project_path = os.path.join(config["staging_path"], slugify(project))
    project_path = os.path.expanduser(project_path)
    if not os.path.isdir(project_path):
        os.makedirs(project_path)
    repo = init_repo(project_path, config["plugins"][project]["git_url"])
    pom_path = os.path.join(project_path, "pom.xml")
    updated_files = []
    # dotted "group.artifact" -> version, read from the project's pom
    dependencies = get_dependencies(pom_path)
    chdeps = []  # dependencies whose version changed this run
    pl = []      # plugins whose dep_mask matches some dependency
    for p in plugins:
        mask = '*'
        if hasattr(plugins[p], 'dep_mask'):
            mask = plugins[p].dep_mask
        else:
            plugins[p].dep_mask = "*"  # default: plugin applies to everything
        if matches_dependency(dependencies.keys(), mask):
            pl.append(plugins[p])
    # refresh every dependency and note which ones moved to a new version
    for dep in dependencies.keys():
        update_dependency(config, pl, dep)
        if dep in config["dependencies"].keys():
            if config["dependencies"][dep]["version"] != dependencies[dep]:
                chdeps.append(dep)
    for p in pl:
        for root, subfolders, files in os.walk(project_path):
            for f in files:
                file_path = os.path.join(root, f)
                # NOTE(review): `dep` here is the last value left over from
                # the dependency loop above, not a per-file value — this
                # looks unintended; confirm against the caller's behavior.
                if fnmatch(dep, p.dep_mask):
                    if f.endswith('.java'):
                        # snapshot the file, let the plugin rewrite it, and
                        # record it if the content actually changed.
                        # NOTE(review): `f` is rebound from filename to file
                        # object here — works, but fragile.
                        f = open(file_path, 'r')
                        c = f.read()
                        f.close()
                        p.process_file(dep, config, file_path)
                        f = open(file_path, 'r')
                        n = f.read()
                        f.close()
                        if n.strip() != c.strip():
                            updated_files.append(file_path)
                    if file_path.endswith("pom.xml"):
                        # split "group.artifact" into its two parts
                        s = dep.rsplit(".", 1)
                        groupId = s[0]
                        artifactId = s[1]
                        doc = ElementTree(file=file_path)
                        deps = doc.findall('/{POM}dependencies/{POM}dependency'.format(POM=POM_NS))
                        for d in deps:
                            if (d.find("{POM}groupId".format(POM=POM_NS)).text == groupId) and d.find("{POM}artifactId".format(POM=POM_NS)).text == artifactId:
                                # NOTE(review): `dependency` is never used
                                # after this loop
                                dependency = d
                                break
                        # textual rewrite of the <version> inside the matching
                        # <dependency> block, via a two-step placeholder so the
                        # replacement value is not treated as a regex template
                        re_dep = re.compile(r'(<dependency>\s+<groupId>{groupId}</groupId>\s+<artifactId>{artifactId}</artifactId>\s+<version>)([A-Za-z0-9.\-]+)(</version>)'.format(groupId=groupId, artifactId=artifactId), re.MULTILINE)
                        # NOTE(review): reads/writes pom_path (the project pom)
                        # but appends file_path to updated_files — confirm.
                        f = open(pom_path, 'r')
                        contents = f.read()
                        c = contents
                        f.close()
                        contents = re_dep.sub(r'\1{{replaceme}}\3', contents)
                        contents = contents.replace('{{replaceme}}', config["dependencies"][dep]["version"])
                        f = open(pom_path, 'w')
                        f.write(contents)
                        f.close()
                        if c.strip() != contents.strip():
                            updated_files.append(file_path)
    # commit message lists each dependency bumped and its old pom version
    message = []
    for change in chdeps:
        message.append("%s for version %s" % (config["dependencies"][change]["name"], dependencies[change]))
    cwd = os.getcwd()
    os.chdir(project_path)
    for f in updated_files:
        p = Popen(["git", "add", f])
        out, err = p.communicate()
    p = Popen(["git", "commit", "-m", "Update for: " + ", ".join(message)])
    out, err = p.communicate()
    p = Popen(["git", "push", "origin", "master"])
    out, err = p.communicate()
    os.chdir(cwd)
import sys
from elementtree.ElementTree import ElementTree

# Usage: script.py <xml-file> <path-expression>
# Print the text content of every element matching the given path.
document = ElementTree(file=sys.argv[1])
for match in document.findall(sys.argv[2]):
    print(match.text)
# Aggregate <royal_assent><act> elements from each votes file into a single
# allvotes.xml document, one <assent> node (tagged with the date parts) per
# file that contains acts.  Uses names from outside this chunk: votesfiles,
# Element, ElementTree, re, sys, xml.parsers.expat.
topelement = Element('top')
i = 1
for vf in votesfiles:
    print(vf)
    try:
        votetree = ElementTree(file=vf)
        voteroot = votetree.getroot()
        date = voteroot.get('date')
        # fix: raw string — '\d' is an invalid escape in a plain literal
        m = re.match(r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})', date)
        if not m:
            print("internal error in date format")
            sys.exit()
        mgd = m.groupdict()
        mgd.update({'date': date})
        acts = votetree.findall('//royal_assent/act')
        if len(acts) > 0:
            # attach each act under a new <assent> element carrying the
            # date attributes, then append it to the aggregate document
            assent = Element('assent', mgd)
            for j in range(len(acts)):
                assent.insert(j, acts[j])
            topelement.insert(i, assent)
            i = i + 1
    # fix: `except E, name` is Python-2-only syntax; `as` works in 2.6+
    except xml.parsers.expat.ExpatError as errorinst:
        print(errorinst)
        print("XML parsing error in %s %s" % (vf, sys.exc_info()[0]))
top = ElementTree(topelement)
top.write('allvotes.xml')
def xml_to_dict(fpath):
    '''
    Converts study data from (ref man generated) XML to a dictionary
    matching study IDs (keys) to title/abstract tuples (values).

    Handles both Reference Manager 11.0 and 12.0 exports, which nest the
    rec-number and journal fields differently (fixes issue #7).
    '''
    ref_ids_to_abs = {}
    parsing_errors = []
    num_no_abs = 0  # count of records with no abstract
    tree = ElementTree(file=fpath)
    num_failed = 0
    for record in tree.findall('.//record'):
        pubmed_id, refmanid = None, None
        refman_version = record.findtext('.//source-app')
        ### here we check the RefMan version, and change
        # the xml path accordingly. this fixes issue #7
        path_str = None
        # fix: initialize alongside path_str — previously unbound when the
        # version string matched neither branch
        journal_path_str = None
        if refman_version == 'Reference Manager 12.0':
            path_str = './/rec-number/style'
            journal_path_str = './/periodical/full-title/style'
        elif refman_version == 'Reference Manager 11.0':
            path_str = './/rec-number'
            journal_path_str = './/periodical/abbr-1/style'
        try:
            refmanid = int(record.findtext(path_str))
        # narrowed from a bare except: missing node / unknown version gives
        # TypeError (None), malformed text gives ValueError
        except (TypeError, ValueError):
            error = "Unable to parse record '%s' in '%s'" % (
                record, os.path.basename(fpath))
            #print "failed to parse refman document"
            parsing_errors.append(error)
        if refmanid is not None:
            # attempt to grab the pubmed id out of the free-text notes,
            # which look like "... - UI - <pmid> - ..."
            pubmed_id = ""
            try:
                pubmed = record.findtext('.//notes/style')
                pubmed = pubmed.split("-")
                for i in range(len(pubmed)):
                    if "UI" in pubmed[i]:
                        pubmed_str = pubmed[i + 1].strip()
                        pubmed_id = int("".join(
                            [x for x in pubmed_str if x in string.digits]))
            except Exception:
                error = "Problem getting pmid from '%s' in '%s'" % (
                    record, os.path.basename(fpath))
                parsing_errors.append(error)
                #print "problem getting pmid ..."
            ab_text = record.findtext('.//abstract/style')
            if ab_text is None:
                num_no_abs += 1
            title_text = record.findtext('.//titles/title/style')
            # Also grab keywords
            keywords = [keyword.text.strip().lower()
                        for keyword in record.findall(".//keywords/keyword/style")]
            # and authors
            authors = [author.text
                       for author in record.findall(".//contributors/authors/author/style")]
            # journal
            journal = record.findtext(journal_path_str)
            ref_ids_to_abs[refmanid] = {"title": title_text, "abstract": ab_text,
                                        "journal": journal, "keywords": keywords,
                                        "pmid": pubmed_id, "authors": authors}
# -*- coding: utf-8 -*-
# This is just an illustration...
from elementtree.ElementTree import ElementTree

mydoc = ElementTree(file='tst.xml')
for e in mydoc.findall('/foo/bar'):
    # fix: Element.get() returns the attribute value as a string (or None);
    # strings have no .text attribute, so the original raised AttributeError
    print(e.get('title'))
# Migrate a parsed component XML document (`et`, file at `fpath`,
# names from outside this chunk) to the next format version.  NOTE: this
# chunk is truncated — the 1.0 -> 1.1 branch is cut off mid-dict.
version = et.getroot().get("version")
if not version:
    # unversioned file: rebuild it as a version-1.0 document
    print "\tTransforming %s..." % comp_fname
    category = ""
    # NOTE(review): `if et.find("category")` tests Element truthiness, which
    # is False for a childless element even when present with text — confirm.
    if et.find("category"):
        category = et.find("category").text.strip()
    root = Element(
        "component", {
            "version": "1.0",
            "name": et.find("name").text.strip(),
            "description": et.find("description").text.strip(),
            "category": category
        })
    tpcl_req = SubElement(root, "tpcl_requirements")
    tpcl_req.text = et.find("tpcl_requirements").text.strip()
    root.append(Comment("propertylist"))
    # copy each property's name and tpcl_cost into the new tree
    for prop in et.findall("property"):
        propelem = SubElement(root, "property",
                              {"name": prop.find("name").text.strip()})
        tpcl_cost = SubElement(propelem, "tpcl_cost")
        tpcl_cost.text = prop.find("tpcl_cost").text.strip()
    # overwrite the original file with the rebuilt tree
    et = ElementTree(root)
    et.write(fpath, indent=True)
elif version == "1.0":
    # 1.0 file: name/description/category become root attributes in 1.1
    print "\tTransforming %s..." % comp_fname
    old_root = et.getroot()
    category = old_root.get("category")
    # (truncated here in this view)
    root = Element(
        "component", {
            "version": "1.1",
            "name": old_root.get("name"),
            "description": old_root.get("description")
def main():
    """Refresh the clean-mx virus-URL feed (at most every `delay` seconds),
    sanitize it with sed/xmllint, then download every listed sample into
    malwarePath, naming files by their reported MD5.

    Relies on module globals: basePath, malwarePath, delay, DAY, HOUR,
    MINUTE, magicFileExtension, plus pprint/os/calendar/time/urllib/
    subprocess/re/base64/hashlib/glob/ElementTree imports.
    """
    pp = pprint.PrettyPrinter(indent=4)
    # mtime of the cached feed; 0 (epoch) when the file doesn't exist yet
    try:
        lastMod = int(os.path.getmtime(basePath + "xmlviruses.xml"))
    except:
        lastMod = 0
    curTime = int(calendar.timegm(time.gmtime()))
    #print("Last modified: " + str(lastMod))
    #print("Current time: " + str(curTime))
    #print("Age: " + str(int((curTime - lastMod)/60)) + " minutes")
    if (lastMod + delay) < curTime:
        # break the age down into days/hours/minutes/seconds for display
        # NOTE(review): `/` here relies on Python-2 integer division — confirm
        # before running under Python 3 (would need `//`).
        age = int((curTime - lastMod))
        age_d = age / DAY
        age = age - (age_d * DAY)
        age_h = age / HOUR
        age = age - (age_h * HOUR)
        age_m = age / MINUTE
        age = age - (age_m * MINUTE)
        print("It has been " + str(age_d) + " days, " + str(age_h) + " hours, " + str(age_m) + " minutes and " + str(age) + " seconds since last update")
        #print("Been at least 30 minutes since last checked")
        urllib.urlretrieve("http://support.clean-mx.de/clean-mx/xmlviruses.php?response=alive", basePath + "xmlviruses.xml")
        #with open(basePath + "xmlviruses.xml", "r+") as f:
        #    newF = []
        #    for line in f.readline():
        #        line = re.sub('\]\]\>\<\/url\>\]\]\>\<\/url\>', '\]\]\>\<\/url\>', line)
        #        line = re.sub('\<\/url\>\/\]\]\>\<\/url\>', '\<\/url\>', line)
        #        newF.append(line)
        #    newLines = ''.join(newF)
        #    f.seek(0)
        #    f.write(newLines)
    else:
        print("Not updating virus list as it is less then 30 minutes old")
    # sed -e s:']]></url>]]></url>':']]></url>': -e s:'</url>/]]></url>':'</url>': -i xmlviruses.xml
    # s:'</url>]].*':'</url>':
    # strip the feed's doubled CDATA/url artifacts in place with sed,
    # then sanity-check the result with xmllint before parsing it
    cmd = [ 'sed', '-i', '-e', 's:\'</url>.*\':\'</url>\':g', basePath + "xmlviruses.xml" ]
    pp.pprint(cmd)
    print("Running command: " + ' '.join(cmd))
    subprocess.check_call(cmd)
    cmd = [ 'xmllint', '-noout', basePath + "xmlviruses.xml" ]
    pp.pprint(cmd)
    print("Running command: " + ' '.join(cmd))
    subprocess.check_call(cmd)
    tree = ElementTree(file=basePath + "xmlviruses.xml")
    entryList = tree.findall("entries/entry")
    for entry in entryList:
        #print url.text
        # positional access into each <entry>: index 9 is assumed to be the
        # url, index 4 the md5 — TODO confirm against the feed schema
        urlString = entry[9].text
        md5String = entry[4].text
        #print "urlString: " + urlString
        #print "md5String: " + md5String
        # NOTE(review): a bare `re.IGNORECASE` expression is a no-op; it was
        # probably meant as a flag to the (now commented-out) .exe match below
        re.IGNORECASE
        #result = re.match("^.*\.[Ee][Xx][Ee]$", urlString)
        # NOTE(review): ".*" matches everything, so the else branch below
        # ("not a exe file") is unreachable
        result = re.match(".*", urlString)
        if result:
            try:
                filename = malwarePath + md5String
                generated_filename = False
            except:
                # fall back to a random name if md5String is unusable
                print("Filename generation error")
                filename = malwarePath + base64.urlsafe_b64encode(os.urandom(30))
                generated_filename = True
            # skip anything we already have (any extension)
            files = glob(filename + "*")
            #if len(files) == 0 and os.path.isfile(filename) == False:
            if len(files) == 0:
                print("Downloading " + urlString + " as " + filename)
                try:
                    urllib.urlretrieve(urlString, filename)
                    if generated_filename == True:
                        # we now have bytes: rename to the real md5
                        md5String = hashlib.md5(open(filename, 'rb').read()).hexdigest()
                        newFilename = malwarePath + md5String
                        print("Renaming " + filename + " to " + newFilename)
                        os.rename(filename, newFilename)
                        filename = newFilename
                    # Add the correct file extension
                    extension = magicFileExtension.getExt(filename)
                    if( len(extension) > 0):
                        print("Renaming " + filename + " to " + filename + extension)
                        os.rename(filename, filename + extension)
                except Exception as e:
                    print("Error while downloading " + urlString + " %s" % e)
            #else:
                #print ("Not downloading " + urlString + " - already exists")
        else:
            print ("Not downloading " + urlString + " - not a exe file")
    print ("Finished downloading all available samples")
def xml_to_dict(fpath):
    '''
    Converts study data from (ref man generated) XML to a dictionary
    matching study IDs (keys) to title/abstract tuples (values).

    Handles both Reference Manager 11.0 and 12.0 exports, which nest the
    rec-number and journal fields differently (fixes issue #7).
    '''
    ref_ids_to_abs = {}
    parsing_errors = []
    num_no_abs = 0  # count of records with no abstract
    tree = ElementTree(file=fpath)
    num_failed = 0
    for record in tree.findall('.//record'):
        pubmed_id, refmanid = None, None
        refman_version = record.findtext('.//source-app')
        ### here we check the RefMan version, and change
        # the xml path accordingly. this fixes issue #7
        path_str = None
        # fix: initialize alongside path_str — previously unbound when the
        # version string matched neither branch
        journal_path_str = None
        if refman_version == 'Reference Manager 12.0':
            path_str = './/rec-number/style'
            journal_path_str = './/periodical/full-title/style'
        elif refman_version == 'Reference Manager 11.0':
            path_str = './/rec-number'
            journal_path_str = './/periodical/abbr-1/style'
        try:
            refmanid = int(record.findtext(path_str))
        # narrowed from a bare except: missing node / unknown version gives
        # TypeError (None), malformed text gives ValueError
        except (TypeError, ValueError):
            error = "Unable to parse record '%s' in '%s'" % (record, os.path.basename(fpath))
            #print "failed to parse refman document"
            parsing_errors.append(error)
        if refmanid is not None:
            # attempt to grab the pubmed id out of the free-text notes,
            # which look like "... - UI - <pmid> - ..."
            pubmed_id = ""
            try:
                pubmed = record.findtext('.//notes/style')
                pubmed = pubmed.split("-")
                for i in range(len(pubmed)):
                    if "UI" in pubmed[i]:
                        pubmed_str = pubmed[i + 1].strip()
                        pubmed_id = int("".join([x for x in pubmed_str if x in string.digits]))
            except Exception:
                error = "Problem getting pmid from '%s' in '%s'" % (record, os.path.basename(fpath))
                parsing_errors.append(error)
                #print "problem getting pmid ..."
            ab_text = record.findtext('.//abstract/style')
            if ab_text is None:
                num_no_abs += 1
            title_text = record.findtext('.//titles/title/style')
            # Also grab keywords
            keywords = [keyword.text.strip().lower()
                        for keyword in record.findall(".//keywords/keyword/style")]
            # and authors
            authors = [author.text
                       for author in record.findall(".//contributors/authors/author/style")]
            # journal
            journal = record.findtext(journal_path_str)
            ref_ids_to_abs[refmanid] = {"title": title_text, "abstract": ab_text,
                                        "journal": journal, "keywords": keywords,
                                        "pmid": pubmed_id, "authors": authors}
# Aggregate <royal_assent><act> elements from each votes file into a single
# allvotes.xml document, one <assent> node (tagged with the date parts) per
# file that contains acts.  Uses names from outside this chunk: votesfiles,
# Element, ElementTree, re, sys, xml.parsers.expat.
topelement = Element('top')
i = 1
for vf in votesfiles:
    print(vf)
    try:
        votetree = ElementTree(file=vf)
        voteroot = votetree.getroot()
        date = voteroot.get('date')
        # fix: raw string — '\d' is an invalid escape in a plain literal
        m = re.match(r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})', date)
        if not m:
            print("internal error in date format")
            sys.exit()
        mgd = m.groupdict()
        mgd.update({'date': date})
        acts = votetree.findall('//royal_assent/act')
        if len(acts) > 0:
            # attach each act under a new <assent> element carrying the
            # date attributes, then append it to the aggregate document
            assent = Element('assent', mgd)
            for j in range(len(acts)):
                assent.insert(j, acts[j])
            topelement.insert(i, assent)
            i = i + 1
    # fix: `except E, name` is Python-2-only syntax; `as` works in 2.6+
    except xml.parsers.expat.ExpatError as errorinst:
        print(errorinst)
        print("XML parsing error in %s %s" % (vf, sys.exc_info()[0]))
top = ElementTree(topelement)
top.write('allvotes.xml')