Beispiel #1
0
    def process_pom(self, config, pom_path):
        """Record the POM's ``minecraft_version`` property in ``config``.

        Parses the POM at ``pom_path`` and stores the text of the first
        ``properties/minecraft_version`` element (looked up with the
        ``POM_NS`` prefix) under ``config["minecraft_version"]``. An empty
        string is stored when the lookup fails.
        """
        doc = ElementTree(file=pom_path)
        try:
            version_path = '/{POM}properties/{POM}minecraft_version'.format(POM=POM_NS)
            mc_version = doc.findall(version_path)[0].text
        except Exception:
            # Best-effort lookup: any ordinary failure (e.g. no matching
            # element) falls back to "", while KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            mc_version = ""

        config["minecraft_version"] = mc_version
Beispiel #2
0
def xml_to_dict(fPath):
    '''
    Walk a Reference Manager XML export and collect one entry per <record>.

    For every record the title, abstract, journal abbreviation, keywords,
    PubMed id and authors are gathered into a dictionary that is stored in
    ``ref_ids_to_abs`` under the record's integer ``rec-number``.

    fPath -- value handed to ElementTree(file=...).

    NOTE(review): the visible code never returns ``ref_ids_to_abs`` (nor
    uses ``num_no_abs``); this excerpt may be truncated -- confirm against
    the full module.
    '''

    ref_ids_to_abs = {}
    num_no_abs = 0
    tree = ElementTree(file=fPath)

    for record in tree.findall('.//record'):
        # The id is plain text read from the file: parse it with int()
        # rather than eval(), which would execute arbitrary expressions and
        # mis-read zero-padded numbers.
        refmanid = int(record.findtext('.//rec-number'))

        # attempt to grab the pubmed id; "" means none was found
        pubmed_id = ""
        try:
            pubmed = record.findtext('.//notes/style').split("-")
            for i, chunk in enumerate(pubmed):
                if "UI" in chunk:
                    # the id is built from the digits of the segment that
                    # follows the one containing "UI"
                    pubmed_str = pubmed[i + 1].strip()
                    pubmed_id = int("".join(
                        [x for x in pubmed_str if x in string.digits]))
        except Exception as ex:
            print("problem getting pmid ...")
            print(ex)

        ab_text = record.findtext('.//abstract/style')
        if ab_text is None:
            num_no_abs += 1

        title_text = record.findtext('.//titles/title/style')

        # Also grab keywords
        keywords = [
            keyword.text.strip().lower()
            for keyword in record.findall(".//keywords/keyword/style")
        ]

        # and authors
        authors = [
            author.text for author in record.findall(
                ".//contributors/authors/author/style")
        ]

        # journal
        journal = record.findtext(".//periodical/abbr-1/style")

        ref_ids_to_abs[refmanid] = {
            "title": title_text,
            "abstract": ab_text,
            "journal": journal,
            "keywords": keywords,
            "pmid": pubmed_id,
            "authors": authors,
        }
Beispiel #3
0
def PrintStats():
    """Print the total render time recorded in ``stats.xml``.

    Scans the ``timer`` elements of ``stats.xml`` for the first one whose
    ``name`` attribute is ``totaltime`` and prints the text of its first
    child. If ElementTree cannot be imported, a notice is printed and the
    statistics are skipped.
    """
    try:
        from elementtree.ElementTree import ElementTree
    except Exception:
        # Statistics are optional, so any import failure is tolerated, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print("Unable to load ElementTree, skipping statistics.")
        return

    doc = ElementTree(file='stats.xml')
    for timer in doc.findall('//timer'):
        if timer.get("name") == "totaltime":
            print("Render time was %s seconds" % timer[0].text)
            break
Beispiel #4
0
def PrintStats():
    """Look at the XML output in ``stats.xml`` and dump the render time.

    The first ``timer`` element named ``totaltime`` supplies the value: the
    text of its first child is printed. When ElementTree is unavailable a
    notice is printed instead.
    """
    try:
        from elementtree.ElementTree import ElementTree
    except Exception:
        # Tolerate any ordinary import failure; unlike a bare ``except``
        # this lets KeyboardInterrupt / SystemExit propagate.
        print("Unable to load ElementTree, skipping statistics.")
    else:
        doc = ElementTree(file='stats.xml')
        for timer in doc.findall('//timer'):
            if "totaltime" == timer.get("name"):
                print("Render time was %s seconds" % timer[0].text)
                break
Beispiel #5
0
def get_dependencies(path):
    """Map each dependency declared in a POM file to its version.

    Args:
        path: Value handed to ``ElementTree(file=...)`` to read the POM.

    Returns:
        dict: ``"<groupId>.<artifactId>"`` -> version text for every child
        of the first ``dependencies`` element found with the ``POM_NS``
        prefix. Empty when the POM has no such element.
    """
    dependencies = {}
    doc = ElementTree(file=path)
    sections = doc.findall('/%sdependencies' % POM_NS)
    if not sections:
        # A POM without a dependencies section has nothing to report;
        # indexing the empty result would raise IndexError.
        return dependencies

    for dep in sections[0]:
        group_id = dep.findall("%sgroupId" % POM_NS)[0].text
        artifact_id = dep.findall("%sartifactId" % POM_NS)[0].text
        version = dep.findall("%sversion" % POM_NS)[0].text

        # Use a separate name for the key so the ``path`` argument is not
        # overwritten.
        key = ".".join([group_id, artifact_id])
        dependencies[key] = version

    return dependencies
def xml_to_dict(fPath):
    """
    Collect per-record study data from a Reference Manager XML export.

    Each <record> contributes a dictionary with the keys "title",
    "abstract", "journal", "keywords", "pmid" and "authors", stored in
    ``ref_ids_to_abs`` under the record's integer ``rec-number``.

    fPath -- value handed to ElementTree(file=...).

    NOTE(review): the visible code never returns ``ref_ids_to_abs`` (nor
    uses ``num_no_abs``); this excerpt may be truncated -- confirm against
    the full module.
    """

    ref_ids_to_abs = {}
    num_no_abs = 0
    tree = ElementTree(file=fPath)

    for record in tree.findall(".//record"):
        # int() instead of eval(): the text comes straight from the file,
        # so it must not be executed, and zero-padded ids must parse as
        # decimal numbers.
        refmanid = int(record.findtext(".//rec-number"))

        # attempt to grab the pubmed id; "" means none was found
        pubmed_id = ""
        try:
            segments = record.findtext(".//notes/style").split("-")
            for position, segment in enumerate(segments):
                if "UI" in segment:
                    # digits of the segment after the one containing "UI"
                    pubmed_str = segments[position + 1].strip()
                    pubmed_id = int("".join([x for x in pubmed_str if x in string.digits]))
        except Exception as ex:
            print("problem getting pmid ...")
            print(ex)

        ab_text = record.findtext(".//abstract/style")
        if ab_text is None:
            num_no_abs += 1

        title_text = record.findtext(".//titles/title/style")

        # Also grab keywords
        keywords = [keyword.text.strip().lower() for keyword in record.findall(".//keywords/keyword/style")]

        # and authors
        authors = [author.text for author in record.findall(".//contributors/authors/author/style")]

        # journal
        journal = record.findtext(".//periodical/abbr-1/style")

        ref_ids_to_abs[refmanid] = {
            "title": title_text,
            "abstract": ab_text,
            "journal": journal,
            "keywords": keywords,
            "pmid": pubmed_id,
            "authors": authors,
        }
Beispiel #7
0
def update_dependency(config, plugins, path, force=False):
    """Refresh the recorded version of one configured dependency.

    Does nothing when ``path`` was already handled (unless ``force`` is
    set) or when ``path`` is not listed in ``config["dependencies"]``.
    Otherwise the dependency's staging directory is created if needed and
    passed to ``init_repo`` together with the configured git URL, the
    version is read from that directory's ``pom.xml``, every plugin gets to
    inspect the POM via ``process_pom``, and the version is stored back
    into the config entry. The path is then appended to
    ``updated_dependencies``.
    """
    if not force and path in updated_dependencies:
        return
    if path not in config["dependencies"]:
        return

    entry = config["dependencies"][path]
    git_url = entry["git_url"]

    # Staging directory: <staging_path>/<slug of the dependency path>
    checkout_dir = os.path.expanduser(
        os.path.join(config["staging_path"], slugify(unicode(path))))
    if not os.path.isdir(checkout_dir):
        os.makedirs(checkout_dir)
    init_repo(checkout_dir, git_url)

    pom_path = os.path.join(checkout_dir, 'pom.xml')
    pom = ElementTree(file=pom_path)
    version = pom.findall('/%sversion' % POM_NS)[0].text

    for plugin in plugins:
        plugin.process_pom(entry, pom_path)

    entry["version"] = version
    updated_dependencies.append(path)
 # Bring the component file up to the next schema version
 # (no version attribute -> "1.0", "1.0" -> "1.1").
 fpath = os.path.join(comp_path, comp_fname)
 et.parse(fpath)
 version = et.getroot().get("version")
 if not version:
     # Unversioned layout: name, description and category are child
     # elements and become attributes of the new root.
     print("\tTransforming %s..." % comp_fname)
     category = ""
     # Compare against None: an Element without children is falsy, so a
     # plain truth test would skip a text-only <category>. The text check
     # keeps an empty <category/> mapped to "".
     category_elem = et.find("category")
     if category_elem is not None and category_elem.text:
         category = category_elem.text.strip()
     root = Element("component",
                     {"version": "1.0",
                      "name": et.find("name").text.strip(),
                      "description": et.find("description").text.strip(),
                      "category": category})
     tpcl_req = SubElement(root, "tpcl_requirements")
     tpcl_req.text = et.find("tpcl_requirements").text.strip()
     root.append(Comment("propertylist"))
     for prop in et.findall("property"):
         propelem = SubElement(root, "property",
                                 {"name": prop.find("name").text.strip()})
         tpcl_cost = SubElement(propelem, "tpcl_cost")
         tpcl_cost.text = prop.find("tpcl_cost").text.strip()
     et = ElementTree(root)
     et.write(fpath, indent=True)
 elif version == "1.0":
     # 1.0 layout: name, description and category are already attributes
     # of the root element.
     print("\tTransforming %s..." % comp_fname)
     old_root = et.getroot()
     category = old_root.get("category")
     root = Element("component",
                     {"version": "1.1",
                      "name": old_root.get("name"),
                      "description": old_root.get("description")})
     tpcl_req = SubElement(root, "tpcl_requirements")
Beispiel #9
0
# Parse the same feed text twice: as an ElementTree read from an in-memory
# file object, and as a bare root Element via XML().
etree = ElementTree(file=StringIO.StringIO(content))
feed = XML(content)

print etree
print feed

#print len(feed)
#print feed[0]
#print feed.keys()

# Namespace URI used to build the qualified tag names below.
ATOM = "http://www.w3.org/2005/Atom"

# Add a link child (rel="source") to the first entry element in the tree.
entry = etree.getiterator('{%s}entry'%ATOM)[0]
new_lin = SubElement(entry, '{%s}link'%ATOM)
new_lin.set('rel', 'source')
new_lin.set('href', 'http://somthing.org')

# Serialize the first title element matched by the path.
title = etree.findall('{%s}title'%ATOM)[0]
print tostring(title)

# Show what findall() yields for a tag named "missing".
missing = etree.findall('{%s}missing'%ATOM)
print missing

# Print the rel attribute of every link element, defaulting to 'alternate'.
for e in etree.findall('//{%s}link'%ATOM):
    print e.get('rel', 'alternate')

# Write the modified tree to an in-memory buffer and print its contents.
s = StringIO.StringIO()
etree.write(s)
s.seek(0)
print s.getvalue()
Beispiel #10
0
def xmlToDict(fPath, stopPath=None, splitTxt=False, get_pubmed=False):
    '''
    Collect per-record study data from a Reference Manager XML export.

    For every <record> a list ``[title, abstract, keywords, pubmed_id]`` is
    stored in ``refIDToAbs`` under the record's integer ``rec-number``.

    fPath      -- value handed to ElementTree(file=...).
    stopPath   -- forwarded to cleanUpTxt as ``stopListPath`` (only used
                  when splitTxt is true).
    splitTxt   -- when true, title and abstract are split on spaces,
                  lower-cased and passed through cleanUpTxt; otherwise the
                  raw text is kept.
    get_pubmed -- accepted but not used by the visible code.

    NOTE(review): the visible code never returns ``refIDToAbs`` or the
    counters; this excerpt may be truncated -- confirm against the full
    module.
    '''

    refIDToAbs = {}
    numNoPubmeds = 0
    numNoAbs = 0  # Keep track of how many studies have no abstracts.
    tree = ElementTree(file=fPath)

    for record in tree.findall('.//record'):
        pubmed_id = None
        # int() rather than eval(): the id is untrusted text from the file
        # and must not be executed.
        refmanid = int(record.findall('.//rec-number')[0].text)

        try:
            pubmed = record.findall('.//notes/style')[0].text.split("-")
            for i, chunk in enumerate(pubmed):
                if "UI" in chunk:
                    # digits of the segment after the first one containing "UI"
                    pubmed_str = pubmed[i + 1].strip()
                    pubmed_id = int("".join([x for x in pubmed_str if x in string.digits]))
                    break
        except Exception as ex:
            print(ex)

        if pubmed_id is None:
            numNoPubmeds += 1
            print("%s has no pubmed id" % refmanid)

        abstract = record.findall('.//abstract/style')
        abText = ""
        try:
            if abstract and splitTxt:
                abText = [s.lower() for s in abstract[0].text.split(" ")]
                abText = cleanUpTxt(abText, stopListPath=stopPath)
            elif abstract:
                abText = abstract[0].text
            else:
                numNoAbs += 1
        except Exception as ex:
            # Report the failure and carry on instead of dropping into the
            # debugger, which would hang any non-interactive run.
            print("problem processing abstract for %s: %s" % (refmanid, ex))

        title = ""
        title_nodes = record.findall('.//titles/title/style')
        if splitTxt:
            title = cleanUpTxt(title_nodes[0].text.lower().split(" "), stopListPath=stopPath)
        elif title_nodes:
            # A record without a title element keeps the "" default.
            title = title_nodes[0].text

        # Also grab keywords
        keywords = [keyword.text.strip().lower() for keyword in record.findall(".//keywords/keyword/style")]
        refIDToAbs[refmanid] = [title, abText, keywords, pubmed_id]
Beispiel #11
0
# Event ids that the track/date/room check below treats as already seen
# (presumably filled further down -- the excerpt ends before that point).
_ckeck_list = list()
_p_id = 0
# For each day: create a <day> element carrying the ISO date and a 1-based
# index.
for _idx_d, _d in enumerate(_days):
    _day = SubElement(
            _root, "day",
            date=str(_d.strftime("%Y-%m-%d")),
            index="%s" % str((1 + _idx_d))
            )
    # For each room: create a <room> element under the current day.
    for _idx_r, _r in enumerate(_rooms):
        _room = SubElement(_day, "room", name=_r)
        # For each talk in the source XML.
        for _idx_t, _t in enumerate(_src_xml.findall("Talk")):
            # Date of the talk: the part of start_time before the first
            # space, parsed as YYYY-MM-DD (None when start_time is absent).
            _date = None if _t.find("start_time") is None \
                    else datetime.strptime(
                            _t.findtext("start_time").split(" ")[0],
                            "%Y-%m-%d")
            # Location with each word capitalized (None when absent).
            _loc = None if _t.find("location") is None \
                    else string.capwords(_t.findtext("location"))
            # Event id, as text.
            _e_id = _t.findtext("eid")
            # Track for this event id, or None when PERSON_TRACK_DICT has
            # no entry for it.
            _e_track = None if not PERSON_TRACK_DICT.has_key(int(_e_id)) \
                    else PERSON_TRACK_DICT[int(_e_id)]["track"]
            # Track, date and room check.
            if _e_track is None or int(_e_id) in _ckeck_list:
Beispiel #12
0
# Please only use these spiders and their derivatives in accordance
# with the terms of service and acceptable use policies of the data
# providers.


import codecs

from elementtree.ElementTree import ElementTree

# Extract the distinct country links from the saved HTML page: every anchor
# with a title is printed and written to the output file as
# "<title>\t<href>", once per title.
source = ElementTree(file='./countries.html')

countries = set()  # titles already written; a set keeps the lookup O(1)
xpath = '//a'

# The context manager closes (and flushes) the output file even if a row
# fails part-way through.
with codecs.open('./country_links.csv', 'w', 'utf-8') as output:
    for element in source.findall(xpath):
        country = element.get('title')
        href = element.get('href')

        # Skip anchors without a title and titles that were already seen.
        if not country or country in countries:
            continue
        countries.add(country)

        print("%s %s" % (country, href))
        output.write('%s\t%s\n' % (country, href))

Beispiel #13
0
def update_plugin(project):
    """Update one plugin project for changed dependency versions and push it.

    Steps, as visible in the code:

    1. Ensure the project's staging directory exists and pass it to
       ``init_repo`` with the project's configured git URL.
    2. Read the project's dependencies from its ``pom.xml`` and select the
       entries of the module-level ``plugins`` mapping whose ``dep_mask``
       (defaulting to ``"*"``) matches them.
    3. For every dependency whose configured version differs from the one in
       the POM, let the selected plugins process the project's ``.java``
       files and rewrite the dependency's ``<version>`` in the POM.
    4. ``git add`` the files recorded as changed, then commit and push to
       ``origin master`` from inside the project directory.

    Relies on the module-level ``config`` and ``plugins`` objects.
    """
    project_path = os.path.join(config["staging_path"], slugify(project))
    project_path = os.path.expanduser(project_path)
    if not os.path.isdir(project_path):
        os.makedirs(project_path)
    # NOTE(review): ``repo`` is not used again in this function.
    repo = init_repo(project_path, config["plugins"][project]["git_url"])
    pom_path = os.path.join(project_path, "pom.xml")
    updated_files = []

    dependencies = get_dependencies(pom_path)
    # Dependencies whose configured version differs from the POM's.
    chdeps = []

    # Plugins whose dependency mask matches this project's dependencies.
    pl = []

    for p in plugins:
        mask = '*'
        if hasattr(plugins[p], 'dep_mask'):
            mask=plugins[p].dep_mask
        else:
            # Give the plugin an explicit mask so p.dep_mask below is always
            # available.
            plugins[p].dep_mask = "*"

        if matches_dependency(dependencies.keys(), mask):
            pl.append(plugins[p])

    for dep in dependencies.keys():
        update_dependency(config, pl, dep)
        if dep in config["dependencies"].keys():
            if config["dependencies"][dep]["version"] != dependencies[dep]:
                chdeps.append(dep)
                # The whole project tree is walked once per selected plugin.
                for p in pl:
                    for root, subfolders, files in os.walk(project_path):
                        for f in files:
                            file_path = os.path.join(root, f)
                            if fnmatch(dep, p.dep_mask):
                                if f.endswith('.java'):
                                    # Snapshot the file, let the plugin
                                    # process it, then re-read it to see
                                    # whether the content changed.
                                    # (``f`` is rebound from the file name
                                    # to a file object here.)
                                    f = open(file_path, 'r')
                                    c = f.read()
                                    f.close()
                                    p.process_file(dep, config, file_path)
                                    f = open(file_path, 'r')
                                    n = f.read()
                                    f.close()

                                    if n.strip() != c.strip():
                                        updated_files.append(file_path)

                            if file_path.endswith("pom.xml"):
                                # The dependency key is split at its last
                                # dot into groupId and artifactId.
                                s = dep.rsplit(".", 1)
                                groupId = s[0]
                                artifactId = s[1]
                                doc = ElementTree(file=file_path)
                                deps = doc.findall('/{POM}dependencies/{POM}dependency'.format(POM=POM_NS))
                                # NOTE(review): ``dependency`` is assigned
                                # here but never read afterwards.
                                for d in deps:
                                    if (d.find("{POM}groupId".format(POM=POM_NS)).text == groupId) and d.find("{POM}artifactId".format(POM=POM_NS)).text == artifactId:
                                        dependency = d
                                        break

                                # Textual replacement of the <version> that
                                # follows this groupId/artifactId pair.
                                re_dep = re.compile(r'(<dependency>\s+<groupId>{groupId}</groupId>\s+<artifactId>{artifactId}</artifactId>\s+<version>)([A-Za-z0-9.\-]+)(</version>)'.format(groupId=groupId, artifactId=artifactId),re.MULTILINE)

                                # NOTE(review): the text is read from and
                                # written to ``pom_path`` (the project's
                                # top-level POM), while ``file_path`` is
                                # what gets recorded as updated -- confirm
                                # this is intended for nested pom.xml files.
                                f = open(pom_path, 'r')
                                contents = f.read()
                                c = contents
                                f.close()

                                # Two-step substitution via a placeholder,
                                # so the new version string is never
                                # interpreted as part of a regex
                                # replacement template.
                                contents = re_dep.sub(r'\1{{replaceme}}\3', contents)
                                contents = contents.replace('{{replaceme}}', config["dependencies"][dep]["version"])

                                f = open(pom_path, 'w')
                                f.write(contents)
                                f.close()

                                if c.strip() != contents.strip():
                                    updated_files.append(file_path)

    # Commit message lists each changed dependency with the version taken
    # from the POM as read at the start of this function.
    message = []
    for change in chdeps:
        message.append("%s for version %s" % (config["dependencies"][change]["name"], dependencies[change]))

    # git is run from inside the project directory; the previous working
    # directory is restored at the end.
    cwd = os.getcwd()
    os.chdir(project_path)

    for f in updated_files:
        p = Popen(["git", "add", f])
        out, err = p.communicate()

    # NOTE(review): commit and push run even when updated_files is empty.
    p = Popen(["git", "commit", "-m", "Update for: " + ", ".join(message)])
    out, err = p.communicate()
    p = Popen(["git", "push", "origin", "master"])
    out, err = p.communicate()
    os.chdir(cwd)
Beispiel #14
0
import sys

from elementtree.ElementTree import ElementTree
# Print the text of every element of the XML file named by the first
# command-line argument that matches the path given as the second argument.
tree = ElementTree(file=sys.argv[1])
query = sys.argv[2]
for match in tree.findall(query):
    print(match.text)
Beispiel #15
0
# Gather the royal-assent acts of every votes file under a single <top>
# element and write the result to allvotes.xml.
topelement = Element('top')

for vf in votesfiles:
    print(vf)
    try:
        votetree = ElementTree(file=vf)
        voteroot = votetree.getroot()
        date = voteroot.get('date')
        # Raw string: "\d" in a plain string literal is an invalid escape
        # sequence.
        m = re.match(r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})', date)
        if not m:
            print("internal error in date format")
            sys.exit()
        # Attributes of the <assent> element: year, month, day and the
        # full date string.
        mgd = m.groupdict()
        mgd.update({'date': date})
        acts = votetree.findall('//royal_assent/act')
        if acts:
            assent = Element('assent', mgd)
            # Appending keeps the acts in document order, exactly as the
            # sequential index-based inserts did.
            for act in acts:
                assent.append(act)
            topelement.append(assent)
    except xml.parsers.expat.ExpatError as errorinst:
        # A file that fails to parse is reported and skipped.
        print(errorinst)
        print("XML parsing error in %s %s" % (vf, sys.exc_info()[0]))


top = ElementTree(topelement)

top.write('allvotes.xml')
Beispiel #16
0
def xml_to_dict(fpath):
    '''
    Collect per-record study data from a Reference Manager XML export.

    Each <record> whose id can be parsed contributes a dictionary with the
    keys "title", "abstract", "journal", "keywords", "pmid" and "authors",
    stored in ``ref_ids_to_abs`` under the record's integer id. Records
    whose id cannot be parsed are skipped, and a message is appended to
    ``parsing_errors``; a failed PubMed-id lookup is also recorded there
    but does not skip the record.

    fpath -- value handed to ElementTree(file=...); its basename appears in
             the error messages.

    NOTE(review): the visible code never returns ``ref_ids_to_abs`` or
    ``parsing_errors`` (``num_no_abs`` and ``num_failed`` are unused too);
    this excerpt may be truncated -- confirm against the full module.
    '''
    ref_ids_to_abs = {}
    parsing_errors = []
    num_no_abs = 0
    tree = ElementTree(file=fpath)

    num_failed = 0

    for record in tree.findall('.//record'):
        refman_version = record.findtext('.//source-app')
        path_str = None
        ### here we check the RefMan version, and change
        # the xml path accordingly. this fixes issue #7
        if refman_version == 'Reference Manager 12.0':
            path_str = './/rec-number/style'
            journal_path_str = './/periodical/full-title/style'
        elif refman_version == 'Reference Manager 11.0':
            path_str = './/rec-number'
            journal_path_str = './/periodical/abbr-1/style'

        try:
            refmanid = int(record.findtext(path_str))
        except Exception:
            # Covers an unrecognised version (path_str is None) as well as
            # a missing or non-numeric id. Unlike a bare ``except`` this
            # lets KeyboardInterrupt / SystemExit through.
            error = "Unable to parse record '%s' in '%s'" % (
                record, os.path.basename(fpath))
            parsing_errors.append(error)
            continue

        # attempt to grab the pubmed id; "" means none was found
        pubmed_id = ""
        try:
            pubmed = record.findtext('.//notes/style').split("-")
            for i, chunk in enumerate(pubmed):
                if "UI" in chunk:
                    # digits of the segment after the one containing "UI"
                    pubmed_str = pubmed[i + 1].strip()
                    pubmed_id = int("".join(
                        [x for x in pubmed_str if x in string.digits]))
        except Exception:
            error = "Problem getting pmid from '%s' in '%s'" % (
                record, os.path.basename(fpath))
            parsing_errors.append(error)

        ab_text = record.findtext('.//abstract/style')
        if ab_text is None:
            num_no_abs += 1

        title_text = record.findtext('.//titles/title/style')

        # Also grab keywords
        keywords = [
            keyword.text.strip().lower()
            for keyword in record.findall(".//keywords/keyword/style")
        ]

        # and authors
        authors = [
            author.text for author in record.findall(
                ".//contributors/authors/author/style")
        ]

        # journal
        journal = record.findtext(journal_path_str)

        ref_ids_to_abs[refmanid] = {
            "title": title_text,
            "abstract": ab_text,
            "journal": journal,
            "keywords": keywords,
            "pmid": pubmed_id,
            "authors": authors,
        }
Beispiel #17
0
# -*- coding: utf-8 -*-
# This is just an illustration...

from elementtree.ElementTree import ElementTree
# Print the ``title`` attribute of every element matched by /foo/bar in
# tst.xml.
mydoc = ElementTree(file='tst.xml')
for e in mydoc.findall('/foo/bar'):
    # Element.get() already returns the attribute value (a string, or None
    # when the attribute is absent), so there is no ``.text`` to read.
    print(e.get('title'))
Beispiel #18
0
 # Decide how to upgrade the component file from its root "version"
 # attribute (none -> rewrite as "1.0", "1.0" -> rewrite as "1.1").
 version = et.getroot().get("version")
 if not version:
     # Unversioned layout: name, description and category are child
     # elements and become attributes of the new root.
     print "\tTransforming %s..." % comp_fname
     category = ""
     # NOTE(review): truth-testing an Element checks whether it has
     # children, so a text-only <category> is skipped here -- confirm
     # whether ``is not None`` was intended.
     if et.find("category"): category = et.find("category").text.strip()
     root = Element(
         "component", {
             "version": "1.0",
             "name": et.find("name").text.strip(),
             "description": et.find("description").text.strip(),
             "category": category
         })
     tpcl_req = SubElement(root, "tpcl_requirements")
     tpcl_req.text = et.find("tpcl_requirements").text.strip()
     root.append(Comment("propertylist"))
     # Each <property> keeps its name (as an attribute) and its tpcl_cost.
     for prop in et.findall("property"):
         propelem = SubElement(root, "property",
                               {"name": prop.find("name").text.strip()})
         tpcl_cost = SubElement(propelem, "tpcl_cost")
         tpcl_cost.text = prop.find("tpcl_cost").text.strip()
     # Replace the parsed tree with the rebuilt one and write it back.
     et = ElementTree(root)
     et.write(fpath, indent=True)
 elif version == "1.0":
     # 1.0 layout: name, description and category are already attributes
     # of the root element.
     print "\tTransforming %s..." % comp_fname
     old_root = et.getroot()
     category = old_root.get("category")
     root = Element(
         "component", {
             "version": "1.1",
             "name": old_root.get("name"),
             "description": old_root.get("description")
Beispiel #19
0
def main():
	"""Refresh the xmlviruses.xml list if it is stale and download new samples.

	The list at ``basePath + "xmlviruses.xml"`` is re-downloaded when its
	modification time is more than ``delay`` seconds old. The file is then
	rewritten in place with ``sed`` and checked with ``xmllint``, and parsed.
	For each ``entries/entry`` element a file is downloaded into
	``malwarePath`` unless a file with the same name prefix already exists
	there.

	Relies on the module-level names ``basePath``, ``malwarePath``,
	``delay``, ``DAY``, ``HOUR`` and ``MINUTE``.
	"""
	pp = pprint.PrettyPrinter(indent=4)

	# A missing or unreadable list counts as "never updated".
	try:
		lastMod = int(os.path.getmtime(basePath + "xmlviruses.xml"))
	except:
		lastMod = 0

	curTime = int(calendar.timegm(time.gmtime()))

	#print("Last modified: " + str(lastMod))
	#print("Current time:  " + str(curTime))
	#print("Age:           " + str(int((curTime - lastMod)/60)) + " minutes")

	if (lastMod + delay) < curTime:
		# Break the age down into days, hours, minutes and leftover seconds.
		# NOTE(review): the breakdown relies on ``/`` being integer division
		# (Python 2 semantics) -- confirm the target interpreter.
		age = int((curTime - lastMod))

		age_d = age / DAY
		age   = age - (age_d * DAY)

		age_h = age / HOUR
		age   = age - (age_h * HOUR)

		age_m = age / MINUTE
		age   = age - (age_m * MINUTE)

		print("It has been " + str(age_d) + " days, " + str(age_h) + " hours, " + str(age_m) + " minutes and " + str(age) + " seconds since last update")

		#print("Been at least 30 minutes since last checked")
	
		urllib.urlretrieve("http://support.clean-mx.de/clean-mx/xmlviruses.php?response=alive", basePath + "xmlviruses.xml")


		#with open(basePath + "xmlviruses.xml", "r+") as f:
		#	newF = []
		#	for line in f.readline():
		#		line = re.sub('\]\]\>\<\/url\>\]\]\>\<\/url\>', '\]\]\>\<\/url\>', line)
		#		line = re.sub('\<\/url\>\/\]\]\>\<\/url\>', '\<\/url\>', line)
		#		newF.append(line)
		#	newLines = ''.join(newF)
		#	f.seek(0)
		#	f.write(newLines)	
		
	else:
		print("Not updating virus list as it is less then 30 minutes old")

	# sed -e s:']]></url>]]></url>':']]></url>': -e s:'</url>.*':'</url>': -i xmlviruses.xml 
	# s:'</url>]].*':'</url>':
	# Rewrite the list in place with sed before parsing it.
	# NOTE(review): the command is passed as a list (no shell), so the
	# single quotes inside the sed expression reach sed literally -- confirm
	# the substitution matches what was intended.
	cmd = [ 'sed', '-i', 
		'-e', 's:\'</url>.*\':\'</url>\':g',
		basePath + "xmlviruses.xml" ]
	pp.pprint(cmd)
	print("Running command: " + ' '.join(cmd))
	subprocess.check_call(cmd)

	# Check the list with xmllint; check_call raises if the command exits
	# with a non-zero status.
	cmd = [ 'xmllint', '-noout', basePath + "xmlviruses.xml" ]
	pp.pprint(cmd)
	print("Running command: " + ' '.join(cmd))
	subprocess.check_call(cmd)

	tree = ElementTree(file=basePath + "xmlviruses.xml")
	entryList = tree.findall("entries/entry")

	for entry in entryList:
		#print url.text
		# Fields are read by child position: index 9 is used as the URL and
		# index 4 as the MD5 (presumably fixed by the feed's layout -- verify
		# against a sample file).
		urlString = entry[9].text
		md5String = entry[4].text

		#print "urlString: " + urlString
		#print "md5String: " + md5String

		# NOTE(review): this bare expression has no effect, and ".*" matches
		# any string, so the "not a exe file" branch below is effectively
		# unreachable.
		re.IGNORECASE
		#result = re.match("^.*\.[Ee][Xx][Ee]$", urlString)
		result = re.match(".*", urlString)

		if result:
			# Name the sample after its MD5; if that fails, fall back to a
			# random name and fix it up after the download.
			try:
				filename = malwarePath + md5String
				generated_filename = False
			except:
				print("Filename generation error")
				filename = malwarePath + base64.urlsafe_b64encode(os.urandom(30))
				generated_filename = True

			# Any existing file starting with this name (with or without an
			# extension) means the sample is skipped.
			files = glob(filename + "*")
			#if len(files) == 0 and os.path.isfile(filename) == False:
			if len(files) == 0:
				print("Downloading " + urlString + " as " + filename)
				try:
					urllib.urlretrieve(urlString, filename)
					if generated_filename == True:
						# Rename the randomly named file after the MD5 of
						# its downloaded content.
						md5String = hashlib.md5(open(filename, 'rb').read()).hexdigest()
						newFilename = malwarePath + md5String
						print("Renaming " + filename + " to " + newFilename)
						os.rename(filename, newFilename)
						filename = newFilename

					# Add the correct file extension
					extension=magicFileExtension.getExt(filename)

					if( len(extension) > 0):
						print("Renaming " + filename + " to " + filename + extension)
						os.rename(filename, filename + extension)
					
				except Exception as e:
					# A failed download or rename is reported and the loop
					# moves on to the next entry.
					print("Error while downloading " + urlString + " %s" % e)

			#else:
				#print ("Not downloading " + urlString + " - already exists")
		else:
			print ("Not downloading " + urlString + " - not a exe file")

	print ("Finished downloading all available samples")
Beispiel #20
0
def xml_to_dict(fpath):
    '''
    Walk a Reference Manager XML export and collect one entry per <record>.

    The record id and journal are read from version-specific paths chosen by
    the record's ``source-app`` text ('Reference Manager 12.0' or
    'Reference Manager 11.0'). Records whose id cannot be parsed are skipped
    and reported in ``parsing_errors``; all others are stored in
    ``ref_ids_to_abs`` under their integer id as a dictionary with the keys
    "title", "abstract", "journal", "keywords", "pmid" and "authors".

    fpath -- value handed to ElementTree(file=...); its basename appears in
             the error messages.

    NOTE(review): the visible code never returns ``ref_ids_to_abs`` or
    ``parsing_errors`` (``num_no_abs`` and ``num_failed`` are unused too);
    this excerpt may be truncated -- confirm against the full module.
    '''
    ref_ids_to_abs = {}
    parsing_errors = []
    num_no_abs = 0
    tree = ElementTree(file=fpath)

    num_failed = 0

    for record in tree.findall('.//record'):
        refman_version = record.findtext('.//source-app')
        path_str = None
        ### here we check the RefMan version, and change
        # the xml path accordingly. this fixes issue #7
        if refman_version == 'Reference Manager 12.0':
            path_str = './/rec-number/style'
            journal_path_str = './/periodical/full-title/style'
        elif refman_version == 'Reference Manager 11.0':
            path_str = './/rec-number'
            journal_path_str = './/periodical/abbr-1/style'

        try:
            refmanid = int(record.findtext(path_str))
        except Exception:
            # Unknown version (path_str is None), missing id or non-numeric
            # id: record the problem and move on. KeyboardInterrupt and
            # SystemExit are deliberately not caught.
            error = "Unable to parse record '%s' in '%s'" % (record, os.path.basename(fpath))
            parsing_errors.append(error)
            continue

        # attempt to grab the pubmed id; "" means none was found
        pubmed_id = ""
        try:
            segments = record.findtext('.//notes/style').split("-")
            for position, segment in enumerate(segments):
                if "UI" in segment:
                    # digits of the segment after the one containing "UI"
                    pubmed_str = segments[position + 1].strip()
                    pubmed_id = int("".join([x for x in pubmed_str if x in string.digits]))
        except Exception:
            error = "Problem getting pmid from '%s' in '%s'" % (record, os.path.basename(fpath))
            parsing_errors.append(error)

        ab_text = record.findtext('.//abstract/style')
        if ab_text is None:
            num_no_abs += 1

        title_text = record.findtext('.//titles/title/style')

        # Also grab keywords
        keywords = [keyword.text.strip().lower() for keyword in record.findall(".//keywords/keyword/style")]

        # and authors
        authors = [author.text for author in record.findall(".//contributors/authors/author/style")]

        # journal
        journal = record.findtext(journal_path_str)

        ref_ids_to_abs[refmanid] = {
            "title": title_text,
            "abstract": ab_text,
            "journal": journal,
            "keywords": keywords,
            "pmid": pubmed_id,
            "authors": authors,
        }
Beispiel #21
0
# Build a single <top> document holding, for every votes file that contains
# royal-assent acts, one <assent> element with those acts, then write it to
# allvotes.xml.
topelement = Element('top')

for vf in votesfiles:
    print(vf)
    try:
        votetree = ElementTree(file=vf)
        voteroot = votetree.getroot()
        date = voteroot.get('date')
        # Raw string so the regex escapes are not treated as (invalid)
        # string escapes.
        m = re.match(r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})', date)
        if not m:
            print("internal error in date format")
            sys.exit()
        # year / month / day groups plus the full date become the
        # attributes of the <assent> element.
        mgd = m.groupdict()
        mgd.update({'date': date})
        acts = votetree.findall('//royal_assent/act')
        if acts:
            assent = Element('assent', mgd)
            # Sequential inserts at the end are plain appends; document
            # order of the acts is preserved.
            for act in acts:
                assent.append(act)
            topelement.append(assent)
    except xml.parsers.expat.ExpatError as errorinst:
        # Unparseable files are reported and skipped.
        print(errorinst)
        print("XML parsing error in %s %s" % (vf, sys.exc_info()[0]))

top = ElementTree(topelement)

top.write('allvotes.xml')