def get_header(metaheaders, a, b):
    """Return the first of the two named meta headers that is present.

    Looks up header *a* first, then *b*; returns None when neither
    lookup yields a truthy value.
    """
    for key in (a, b):
        value = metaheaders.get_item(key)
        if value:
            return value
    return None
# Fetch the IEEE Xplore abstract page for this article number and scrape
# the abstract text and DOI out of it.
metaheaders = metaheaders.MetaHeaders("http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=%d" % ar_number)
root = metaheaders.root

abstract = ''
# The abstract text sits in sibling elements of the <a name="Abstract"> anchor.
abstractDiv = root.xpath("//a[@name='Abstract']/../*/text()")
if abstractDiv:
    abstract = abstractDiv[0]
    # Strip the leading "Abstract" label left over from the page layout.
    abstract = re.sub("^Abstract\s*", "", abstract).strip()

#print etree.tostring(root, pretty_print=True)

doi = metaheaders.get_item("citation_doi")
if not doi:
    # Fallback when no citation_doi meta header is present: scan every
    # link for a dx.doi.org URL and take its trailing "10...." part.
    aLinks = root.cssselect("a")
    for a in aLinks:
        if not a.attrib.has_key("href"):
            continue
        href = a.attrib["href"]
        if href.startswith("http://dx.doi.org/"):
            match = re.search(r'(10\..*)', href)
            if match:
                doi = match.group(1)
                break
<meta name="citation_issn" content="0025-5718"> <meta name="citation_issn" content="1088-6842"> <meta name="citation_author" content="LeVeque, Randall J."> <meta name="citation_author" content="Oliger, Joseph"> <meta name="citation_title" content="Numerical methods based on additive splittings for hyperbolic partial differential equations"> <meta name="citation_online_date" content=""> <meta name="citation_publication_date" content="1983"> <meta name="citation_volume" content="40"> <meta name="citation_issue" content="162"> <meta name="citation_firstpage" content="469"> <meta name="citation_lastpage" content="497"> <meta name="citation_doi" content="10.1090/S0025-5718-1983-0689466-8"> <meta name="citation_abstract_html_url" content="http://www.ams.org/mcom/1983-40-162/S0025-5718-1983-0689466-8/"> """

# A DOI is mandatory: without one we cannot link the record.
doi = metaheaders.get_item("citation_doi")
if not doi:
    bail('Unable to find a DOI')
    sys.exit(0)

# Emit the record in CiteULike's tab-separated plugin format.
print "begin_tsv"
print "linkout\tDOI\t\t%s\t\t" % (doi)
print "type\tJOUR"
print "doi\t" + doi

# Copy every mapped meta header (key_map: output field -> header name,
# defined earlier in the script) into the output, skipping absent ones.
for f in key_map.keys():
    k = key_map[f]
    v = metaheaders.get_item(k)
    if not v:
        continue
    v = v.strip()
# Install an opener that sends a polite User-Agent with every request.
opener = urllib2.build_opener(*handlers)
opener.addheaders = [("User-Agent", "CiteULike/1.0 +http://www.citeulike.org/") ]
urllib2.install_opener(opener)

# Download the RIS export for this article; abort on any fetch failure.
try:
    ris_file = urllib2.urlopen(ris_file_url).read()
except:
    bail("Could not fetch RIS file (" + ris_file_url + ")")

# Scrape the article page's meta headers for type/DOI information.
metaheaders = metaheaders.MetaHeaders(url)

print "begin_tsv"

# Conference papers advertise a citation_conference* header.
if metaheaders.get_item("citation_conference") or metaheaders.get_item(
        "citation_conference_title"):
    print "type\tINCONF"
else:
    print "type\tJOUR"

doi = metaheaders.get_item("citation_doi")
if doi:
    # Some sites prefix the value with "doi:"; strip it before emitting.
    doi = doi.replace("doi:", "")
    print "doi\t%s" % doi
    print "linkout\tDOI\t\t%s\t\t" % (doi)
else:
    bail("Couldn't find an DOI")

print "end_tsv"

# Hand the raw RIS data through for the importer to parse.
print "begin_ris"
print "%s" % (ris_file)
# # DOI is in the page # metaheaders = metaheaders.MetaHeaders(page=page) dois = metaheaders.get_multi_item("DC.identifier") doi = None if dois: for doi_str in dois: doi_match = re.search(r'doi:(10\.[^/]+/[^\s]+)', doi_str, re.IGNORECASE) if doi_match: doi = doi_match.group(1) if not doi: bail("Couldn't find a DOI") if not metaheaders.get_item("DC.title"): bail("Unable to find the article title") print "begin_tsv" print "publisher\tDryad Digital Repository" print "type\tGEN" metaheaders.print_item("title","DC.title") authors = metaheaders.get_multi_item("DC.creator") if authors: for a in authors: print "author\t%s" % a metaheaders.print_date("DCTERMS.issued") abstract = metaheaders.get_item("DC.description"); if abstract:
if not matched: bail("Cannot parse IUCR journal. Unrecognized URL: " + url + " - does the plugin need updating?") # # Fetch the page # try: page = urllib2.urlopen(url).read().strip() except: bail("Couldn't fetch page (" + url + ")") print "begin_tsv" metaheaders = metaheaders.MetaHeaders(page=page) if not doi: doiMatch = metaheaders.get_item("citation_doi"); match = re.search(r'10.1107/([0-9a-zA-Z]+)', doiMatch, re.IGNORECASE) if match: doi = "10.1107/" + match.group(1) key = match.group(1) if doi: print "linkout\tIUCR\t\t%s\t\t" % key print "linkout\tDOI\t\t%s\t\t" % doi print "url\thttp://dx.doi.org/" + doi print "doi\t" + doi else: bail("Couldn't find a DOI") if not metaheaders.get_item("DC.title"): bail("Cannot find a title in that article")
# Install an opener that sends a polite User-Agent with every request.
opener=urllib2.build_opener(*handlers)
opener.addheaders = [("User-Agent", "CiteULike/1.0 +http://www.citeulike.org/")]
urllib2.install_opener(opener)

# Download the RIS export for this article; abort on any fetch failure.
try:
    ris_file = urllib2.urlopen(ris_file_url).read()
except:
    bail("Could not fetch RIS file (" + ris_file_url + ")")

# Scrape the article page's meta headers for type/DOI information.
metaheaders = metaheaders.MetaHeaders(url)

print "begin_tsv"

# Conference papers advertise a citation_conference* header.
if metaheaders.get_item("citation_conference") or metaheaders.get_item("citation_conference_title"):
    print "type\tINCONF"
else:
    print "type\tJOUR"

doi = metaheaders.get_item("citation_doi")
if doi:
    # Strip any "doi:" prefix before emitting.
    doi = doi.replace("doi:","")
    print "doi\t%s" % doi
    print "linkout\tDOI\t\t%s\t\t" % (doi)
else:
    bail("Couldn't find an DOI")

print "end_tsv"

# Hand the raw RIS data through for the importer to parse.
print "begin_ris"
print "%s" % (ris_file)
print "end_ris"
dc.creator = ['D. G. Aggelis', 'N. K. Paschos', 'N. M. Barkoula', 'A. S. Paipetis', 'T. E. Matikas', 'A. D. Georgoulis'] """

# Output field -> meta header name used to populate the TSV record.
# NOTE(review): "citation.issn" (with a dot) looks suspicious -- other
# sites use "citation_issn"; confirm against this site's actual markup.
key_map = {
    "publisher" : "citation_publisher",
    "abstract" : "description",
    "issue": "citation_issue",
    "issn": "citation.issn",
    "title": "citation_title",
    "volume": "citation_volume",
    "start_page": "citation_firstpage",
    "end_page": "citation_lastpage"
}

# A DOI is mandatory: without one we cannot link the record.
doi = metaheaders.get_item("citation_doi")
if not doi:
    bail('Unable to find a DOI')
    sys.exit(0)

# Strip any "doi:" prefix before emitting.
doi = doi.replace("doi:","")

print "begin_tsv"
print "linkout\tDOI\t\t%s\t\t" % (doi)
print "type\tJOUR"
print "doi\t" + doi

# Copy each mapped meta header into the output, skipping absent ones.
for f in key_map.keys():
    k = key_map[f]
    v = metaheaders.get_item(k)
    if not v:
# ZENODO record id extracted from the URL by an earlier regex match.
zid = record_match.group(1)
httpUrl = "http://" + url_host + "/record/" + zid;

#
# Fetch the page
#
try:
    page = urllib2.urlopen(httpUrl).read().strip()
except:
    bail("Couldn't fetch page (" + httpUrl + ")")

#
# DOI is in the page
#
metaheaders = metaheaders.MetaHeaders(page=page)
doi = metaheaders.get_item("citation_doi")

# Emit the record in CiteULike's tab-separated plugin format.
print "begin_tsv"
print "publisher\tZENODO"
print "type\tGEN"

if metaheaders.get_item("citation_title"):
    metaheaders.print_item("title","citation_title")

authors = metaheaders.get_multi_item("citation_author")
if authors:
    for a in authors:
        # Author names may contain non-ASCII characters.
        print "author\t%s" % a.encode('utf-8')

metaheaders.print_date("citation_publication_date")

zenodoURL = metaheaders.get_item("citation_abstract_html_url")

if zid:
# Fetch the IEEE Xplore abstract page for this article number and scrape
# the abstract text and DOI out of it.
metaheaders = metaheaders.MetaHeaders(
    "http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=%d" % ar_number)
root = metaheaders.root

abstract = ''
# The abstract text sits in sibling elements of the <a name="Abstract"> anchor.
abstractDiv = root.xpath("//a[@name='Abstract']/../*/text()")
if abstractDiv:
    abstract = abstractDiv[0]
    # Strip the leading "Abstract" label left over from the page layout.
    abstract = re.sub("^Abstract\s*", "", abstract).strip()

#print etree.tostring(root, pretty_print=True)

doi = metaheaders.get_item("citation_doi")
if not doi:
    # Fallback when no citation_doi meta header is present: scan every
    # link for a dx.doi.org URL and take its trailing "10...." part.
    aLinks = root.cssselect("a")
    for a in aLinks:
        if not a.attrib.has_key("href"):
            continue
        href = a.attrib["href"]
        if href.startswith("http://dx.doi.org/"):
            match = re.search(r'(10\..*)', href)
            if match:
                doi = match.group(1)
                break

print "begin_tsv"
# Description and numeric id extracted from the URL by an earlier match.
fg_descr = m.group(1)
fg_id = m.group(2)

#
# Fetch the page
#
try:
    page = urllib2.urlopen(url).read().strip()
except:
    bail("Couldn't fetch page (" + url + ")")

#
# DOI is in the page
#
metaheaders = metaheaders.MetaHeaders(page=page)
doi_str = metaheaders.get_item("citation_doi")
# Guard: get_item() returns None when the citation_doi header is
# missing, and re.search() raises a TypeError on None, which would mask
# the real problem behind a crash.  Bail with the same message instead.
if not doi_str:
    bail("Couldn't find an DOI")
doi_match = re.search(r'doi:(10\.[^/]+/[^\s]+)', doi_str, re.IGNORECASE)
if doi_match:
    doi = doi_match.group(1)
else:
    bail("Couldn't find an DOI")

root = metaheaders.root
# The abstract lives in the article description block of the page.
abstractDiv = root.xpath("//div[@id='article_desc']/div/p/text()")
if abstractDiv:
    abstract = abstractDiv[0]
else:
    abstract = None
from cultools import urlparams, bail
import metaheaders

# Don't hang forever on an unresponsive server.
socket.setdefaulttimeout(15)

# Read URL from stdin
url = sys.stdin.readline().strip()

# All output must be UTF-8 for the importer.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

metaheaders = metaheaders.MetaHeaders(url, unescape_entities=True)

# Emit the record in CiteULike's tab-separated plugin format.
print "begin_tsv"

# Conference papers advertise a citation_conference header.
if metaheaders.get_item("citation_conference"):
    print "type\tINCONF"
else:
    print "type\tJOUR"

authors = metaheaders.get_multi_item("citation_author")
if authors:
    for a in authors:
        print "author\t%s" % a

metaheaders.print_item("title", "citation_title")
metaheaders.print_date("citation_publication_date")
metaheaders.print_item("volume", "citation_volume")
metaheaders.print_item("start_page", "citation_firstpage")
metaheaders.print_item("end_page", "citation_lastpage")
metaheaders.print_item("issue", "citation_issue")
# Read URL from stdin url = sys.stdin.readline().strip() u = urlparse(url) # rewrite the URL - need ?isAuthorized=no to avoid redirect loop url = "%s://%s%s?isAuthorized=no" % (u.scheme, u.netloc, u.path) sys.stdout = codecs.getwriter('utf-8')(sys.stdout) metaheaders = metaheaders.MetaHeaders(url) print "begin_tsv" if metaheaders.get_item("citation_conference"): print "type\tINCONF" else: print "type\tJOUR" authors = metaheaders.get_multi_item("citation_author") if authors: for a in authors: print "author\t%s" % a metaheaders.print_item("title","citation_title") metaheaders.print_date("citation_date") metaheaders.print_item("volume","citation_volume") metaheaders.print_item("start_page","citation_firstpage") metaheaders.print_item("end_page","citation_lastpage")
# Fetch the page # try: page = urllib2.urlopen(url).read().strip() except: bail("Couldn't fetch page (" + url + ")") print "begin_tsv" print "publisher\tFrontiers" # # DOI is in the page # metapropsheaders = metaheaders.MetaHeaders(name="property", page=page) metaheaders = metaheaders.MetaHeaders(page=page) doi = metaheaders.get_item("citation_doi") if doi: print "linkout\tDOI\t\t%s\t\t" % doi print "linkout\tFRONT\t\t%s\t\t" % doi else: bail("Couldn't find an DOI") docType = metapropsheaders.get_item("og:type"); if not docType: bail("Cannot determine the publication type") if docType != "article": bail("Only supports journal papers ('article', 'JOUR') at this moment, but found " + docType) if not metaheaders.get_item("citation_title"): bail("Cannot find a title in that article")
    "journal": "citation_journal_title",
    "issue": "citation_issue",
    "title": "DC.Title",
    "volume": "citation_volume",
    "start_page": "citation_firstpage",
    "end_page": "citation_lastpage"
}

# Sample of the meta headers this plugin consumes (kept as reference).
""" <meta content="2012-01-01" name="DC.Date"/> <meta content="eLife Sciences" name="citation_journal_title"/> <meta content="" name="citation_issn"/> <meta content="2050-084X" name="citation_issn"/> """

# The DOI is exposed as DC.Identifier here and is mandatory.
doi = metaheaders.get_item("DC.Identifier")
if not doi:
    bail('Unable to find a DOI')
    sys.exit(0)

# Emit the record in CiteULike's tab-separated plugin format.
print "begin_tsv"
print "linkout\tDOI\t\t%s\t\t" % (doi)
print "type\tJOUR"
print "doi\t" + doi

# Copy each mapped meta header into the output, skipping absent ones.
for f in key_map.keys():
    k = key_map[f]
    v = metaheaders.get_item(k)
    if not v:
        continue
    v = v.strip()
# # Fetch the page # try: page = urllib2.urlopen(url).read().strip() except: bail("Couldn't fetch page (" + url + ")") print "begin_tsv" print "publisher\tNature Publishing Group" # # DOI is in the page # metaheaders = metaheaders.MetaHeaders(page=page) doi_str = metaheaders.get_item("citation_doi") doi_match = re.search(r'doi:(10\.[^/]+/[^\s]+)', doi_str, re.IGNORECASE) doi = None if doi_match: doi = doi_match.group(1) else: bail("Couldn't find an DOI") if doi: print "linkout\tDOI\t\t%s\t\t" % doi else: bail("Couldn't find an DOI") print "linkout\tSCIDAT\t\t%s\t\t" % artId
if not match: bail("Cannot parse this BioMed Central paper. Unrecognized URL: " + url + " - does the plugin need updating?") # # Fetch the page # try: page = urllib2.urlopen(url).read().strip() except: bail("Couldn't fetch page (" + url + ")") print "begin_tsv" metaheaders = metaheaders.MetaHeaders(page=page) pmid = metaheaders.get_item("citation_pmid"); if pmid: print "linkout\tPMID\t%s\t\t\t" % pmid doi = metaheaders.get_item("citation_doi"); if doi: print "linkout\tDOI\t\t%s\t\t" % doi print "url\thttp://dx.doi.org/" + doi print "doi\t" + doi else: bail("Couldn't find a DOI") if not metaheaders.get_item("citation_title"): bail("Cannot find a title in that article") title = metaheaders.get_item("citation_title")
# Fetch the page # try: page = urllib2.urlopen(url).read().strip() except: bail("Couldn't fetch page (" + url + ")") print "begin_tsv" print "publisher\tPeerJ Inc." # # DOI is in the page # metapropsheaders = metaheaders.MetaHeaders(name="property", page=page) metaheaders = metaheaders.MetaHeaders(page=page) doi = metaheaders.get_item("citation_doi") if doi: print "linkout\tDOI\t\t%s\t\t" % doi print "linkout\tPEERJP\t\t%s\t\t" % artId else: bail("Couldn't find an DOI") docType = metapropsheaders.get_item("og:type"); if not docType: bail("Cannot determine the publication type") if docType != "article": bail("Only supports journal papers ('article', 'JOUR') at this moment, but found " + docType) if not metaheaders.get_item("citation_title"): bail("Cannot find a title in that article")
citation_doi = ['doi:10.1121/1.3571537'] dc.creator = ['D. G. Aggelis', 'N. K. Paschos', 'N. M. Barkoula', 'A. S. Paipetis', 'T. E. Matikas', 'A. D. Georgoulis'] """

# Output field -> meta header name used to populate the TSV record.
# NOTE(review): "citation.issn" (with a dot) looks suspicious -- other
# sites use "citation_issn"; confirm against this site's actual markup.
key_map = {
    "publisher": "citation_publisher",
    "abstract": "description",
    "issue": "citation_issue",
    "issn": "citation.issn",
    "title": "citation_title",
    "volume": "citation_volume",
    "start_page": "citation_firstpage",
    "end_page": "citation_lastpage"
}

# A DOI is mandatory: without one we cannot link the record.
doi = metaheaders.get_item("citation_doi")
if not doi:
    bail('Unable to find a DOI')
    sys.exit(0)

# Strip any "doi:" prefix (see the sample header above) before emitting.
doi = doi.replace("doi:", "")

print "begin_tsv"
print "linkout\tDOI\t\t%s\t\t" % (doi)
print "type\tJOUR"
print "doi\t" + doi

# Copy each mapped meta header into the output, skipping absent ones.
for f in key_map.keys():
    k = key_map[f]
    v = metaheaders.get_item(k)
    if not v: