def resolve_metadata(doc, method):
    """Resolve Google Books metadata for *doc* via the BiBTeX export.

    Canonicalizes the document's Google Books URL (keeping only the book
    id), downloads the book page and its BiBTeX record, and parses the
    BiBTeX into the document.  Returns True on success, False when the
    URL does not match the expected Google Books pattern.
    """
    url = doc.get_field("url")
    res = re.match(pattern, url)
    if res is None:
        # Not a Google Books URL we recognise -- previously this fell
        # through to res.group() and raised AttributeError.
        return False
    # Keep only the "id" query parameter in the canonical URL.
    url = r'https://books.google.%s/books?id=%s' % (res.group(1),
                                                    res.group(2).split("&")[0])
    doc.set_field("url", url)
    # NOTE(review): the page contents are downloaded but never used;
    # presumably this verifies the page is reachable -- confirm.
    data = referencer.download("Reading Google Books web page",
                               "Parsing the content of the Google Books page...",
                               url)
    bib = referencer.download("Fetching BiBTeX data",
                              "Downloading BiBTeX metadata for the book...",
                              url + "&output=bibtex")
    doc.parse_bibtex(bib)
    return True
def get_citation_from_doi(query, email='*****@*****.**', tool='Referencer', database='pubmed'): params = { 'db': database, 'tool': tool, 'email': email, 'term': query + "[doi]", 'usehistory': 'y', 'retmax': 1 } # try to resolve the PubMed ID of the DOI url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?' + urllib.urlencode( params) data = referencer.download(_("Resolving DOI"), _("Finding PubMed ID from DOI %s") % query, url) # parse XML output from PubMed... xmldoc = minidom.parseString(data) ids = xmldoc.getElementsByTagName('Id') # nothing found, exit if len(ids) == 0: raise "pubmed.get_citation_from_doi: DOI not found" # get ID id = ids[0].childNodes[0].data print "pubmed.get_citation_from_doi: DOI ", query, " has PubMed ID ", id return get_citation_from_pmid(id)
def get_citation_from_doi(query, email='*****@*****.**', tool='Referencer', database='pubmed'): params = { 'db':database, 'tool':tool, 'email':email, 'term':query + "[doi]", 'usehistory':'y', 'retmax':1 } # try to resolve the PubMed ID of the DOI url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?' + urllib.urlencode(params) data = referencer.download (_("Resolving DOI"), _("Finding PubMed ID from DOI %s") % query , url); # parse XML output from PubMed... xmldoc = minidom.parseString(data) ids = xmldoc.getElementsByTagName('Id') # nothing found, exit if len(ids) == 0: raise "pubmed.get_citation_from_doi: DOI not found" # get ID id = ids[0].childNodes[0].data print "pubmed.get_citation_from_doi: DOI ", query, " has PubMed ID ", id return get_citation_from_pmid (id)
def resolve_metadata (doc, method): if method != "doi": return False doi = doc.get_field("doi") params = { 'data_type':"XML", 'doi':doi } url = "http://adsabs.harvard.edu/cgi-bin/nph-bib_query?" + urllib.urlencode (params) data = referencer.download (_("Resolving DOI"), _("Fetching metadata from NASA ADS for DOI %s") % doi, url); if data.find ("retrieved=\"1\"") == -1: print "Couldn't get info from ADS" return False fields = [] try: xmldoc = minidom.parseString (data) fields.append (["journal", get_field(xmldoc, "journal")]) fields.append (["title", get_field(xmldoc, "title")]) fields.append (["volume", get_field(xmldoc, "volume")]) authors = xmldoc.getElementsByTagName('author') authorString = "" first = True for author in authors: name = author.childNodes[0].data.encode("utf-8") if (first == False): authorString += " and " print "got author", name authorString += name first = False fields.append (["author", authorString]) print "appended authors" pages = get_field (xmldoc, "page") print "getting lastPage" lastPage = get_field (xmldoc, "lastpage") if (len(lastPage) > 0): pages += "-" pages += lastPage print "got pages " , pages fields.append (["page", pages]) print "appended pages" except: print "exception" return False for field in fields: if len(field[1]) > 0: doc.set_field(field[0], field[1]) # TODO: parse pubdata element for "Jul 1989" (month and year fields) return True
def resolve_metadata(doc, method):
    """Resolve Google Books metadata for *doc* via the BiBTeX export.

    Canonicalizes the Google Books URL (keeping only the book id),
    fetches the book page and its BiBTeX record, and parses the BiBTeX
    into the document.  Returns True on success, False when the URL
    does not match the expected Google Books pattern.
    """
    url = doc.get_field("url")
    res = re.match(pattern, url)
    if res is None:
        # No match means this is not a Google Books URL; previously the
        # code dereferenced None and raised AttributeError here.
        return False
    url = r'https://books.google.%s/books?id=%s' % (res.group(1),
                                                    res.group(2).split("&")[0])
    doc.set_field("url", url)
    # NOTE(review): downloaded page contents are unused -- presumably a
    # reachability check; confirm before removing.
    data = referencer.download(
        "Reading Google Books web page",
        "Parsing the content of the Google Books page...", url)
    bib = referencer.download("Fetching BiBTeX data",
                              "Downloading BiBTeX metadata for the book...",
                              url + "&output=bibtex")
    doc.parse_bibtex(bib)
    return True
def get_citation_from_pmid(pmid, email='*****@*****.**', tool='Referencer', database='pubmed'):
    """Return the raw efetch XML citation record for PubMed ID *pmid*."""
    request = {
        'db': database,
        'tool': tool,
        'email': email,
        'id': pmid,
        'retmode': 'xml',
    }
    # get citation info:
    fetch_url = ('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
                 + urllib.urlencode(request))
    return referencer.download(
        _("Resolving PubMed ID"),
        _("Fetching metadata from NCBI for PubMed ID %s") % pmid,
        fetch_url)
def get_number_of_records (document): title = document.get_field("title") year = document.get_field ("year") author= document.get_field ("author") url0='http://estipub.isiknowledge.com/esti/cgi?databaseID=WOS&rspType=xml&method=search&firstRec=1&numRecs=1' url0+= '&query='+get_query(document) data0 = referencer.download( _("Obtaining data from ISI-WebOfScience"), _("Fetching number of ocurrences for %s/%s/%s") % (author,title,year), url0) print data0 xmldoc0 = minidom.parseString(data0) recordsFound=get_field(xmldoc0,"recordsFound") return int(recordsFound)
def get_data(self, document, firstrec=None, numrecs=None):
    """Fetch raw WoS XML for *document*, starting at record *firstrec*.

    Both *firstrec* and *numrecs* default to 1 when omitted.
    """
    title = document.get_field("title")
    year = document.get_field("year")
    author = document.get_field("author")
    first = 1 if firstrec is None else firstrec
    count = 1 if numrecs is None else numrecs
    request_url = ('http://estipub.isiknowledge.com/esti/cgi?databaseID=WOS'
                   '&SID=Q1mNFhCECOk6c8aELLh&rspType=xml&method=searchRetrieve'
                   '&firstRec=' + str(first) +
                   '&numRecs=' + str(count) +
                   '&query=' + get_query(document))
    return referencer.download(
        _("Obtaining data from ISI-WebOfScience"),
        _("Fetching data for %s/%s/%s") % (author, title, year),
        request_url)
def resolve_metadata(doc, method):
    """Resolve Google Books metadata for *doc* and mark it as a book.

    Rewrites the document URL into canonical Google Books form, fetches
    the BiBTeX export, parses it into the document and sets the document
    type to "book".  Returns True on success, False when the URL does
    not match the expected Google Books pattern.
    """
    url = doc.get_field("url")
    res = re.match(pattern, url)
    if res is None:
        # Previously fell through to res.group() -> AttributeError.
        return False
    url = r'http://books.google.%s/books?id=%s' % (res.group(1), res.group(2))
    doc.set_field("url", url)
    bibtex_url = r'http://books.google.%s/books?id=%s&output=bibtex' % (
        res.group(1), res.group(2))
    #print "url: ", repr(bibtex_url)
    bib = referencer.download("Fetching BiBTeX data",
                              "Downloading BiBTeX metadata for the book...",
                              bibtex_url)
    #print "bib:", repr(bib)
    doc.parse_bibtex(bib)
    doc.set_type("book")
    return True
def do_search(document): title = document.get_field("title") year = document.get_field("year") author = document.get_field("author") url0 = 'http://estipub.isiknowledge.com/esti/cgi?action=search&viewType=xml&mode=GeneralSearch&product=WOS&ServiceName=GeneralSearch&filter=&Start=&End=%d&DestApp=WOS' % ( get_MAXRECORDS()) url0 += "&" + get_query(document) print "isi query url:", url0 if False: #debugging #data0 = open("plugins/isi-plugin-testdata.txt").read() data0 = open("plugins/isi-plugin-testdata2.txt").read() else: data0 = referencer.download( _("Obtaining data from ISI-WebOfScience"), _("Querying for %s/%s/%s") % (author, title, year), url0) print data0 xmldoc0 = minidom.parseString(data0) return xmldoc0
def do_search (document): title = document.get_field("title") year = document.get_field ("year") author= document.get_field ("author") url0='http://estipub.isiknowledge.com/esti/cgi?action=search&viewType=xml&mode=GeneralSearch&product=WOS&ServiceName=GeneralSearch&filter=&Start=&End=%d&DestApp=WOS' % (get_MAXRECORDS()) url0+= "&" + get_query(document) print "isi query url:", url0 if False: #debugging #data0 = open("plugins/isi-plugin-testdata.txt").read() data0 = open("plugins/isi-plugin-testdata2.txt").read() else: data0 = referencer.download( _("Obtaining data from ISI-WebOfScience"), _("Querying for %s/%s/%s") % (author,title,year), url0) print data0 xmldoc0 = minidom.parseString(data0) return xmldoc0
def resolve_metadata(doc, method=None): # try with title, otherwise try with author + year title = doc.get_field("title") if title: searchTerms = [title] else: searchTerms = [get_first_author(doc.get_field("author"))] searchTerms += [doc.get_field("year")] searchTerm = " ".join(searchTerms) for c in "(),.{}!\"':=#%$/&[]+": searchTerm = searchTerm.replace(c, "") searchTerm = searchTerm.replace("-", " ") while searchTerm.find(" ") > 0: #remove double spaces searchTerm = searchTerm.replace(" ", " ") #print "DBLP:searchTerm:", repr(searchTerm) url = "http://www.dblp.org/search/api/?%s&h=1000&c=0&f=0&format=xml" % ( urllib.urlencode({'q': searchTerm})) print "DBLP:url:", repr(url) data = referencer.download( _("Searching DBLP"), _("Fetching metadata from DBLP for search query '%s'") % searchTerm, url) if not data: return False hits = parse_hits_get_urls(data) print "DBLP:hits:", hits if len(hits) != 1: #XXX, display UI? print "DBLP: Not exactly one hit, giving up" return False bibtex_xml = get_bibtex_xml_from_url(hits[0]) #print bibtex_xml bibtex = bibtex_xml_to_bibtex(bibtex_xml) #print bibtex doc.parse_bibtex(bibtex) return True
def get_citation_from_pmid(pmid, email='*****@*****.**', tool='Referencer', database='pubmed'):
    """Download and return NCBI efetch citation XML for *pmid*."""
    query = urllib.urlencode({
        'db': database,
        'tool': tool,
        'email': email,
        'id': pmid,
        'retmode': 'xml',
    })
    # get citation info:
    endpoint = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' + query
    data = referencer.download(
        _("Resolving PubMed ID"),
        _("Fetching metadata from NCBI for PubMed ID %s") % pmid,
        endpoint)
    return data
def resolve_metadata (doc, method=None): # try with title, otherwise try with author + year title = doc.get_field("title") if title: searchTerms = [title] else: searchTerms = [get_first_author(doc.get_field("author"))] searchTerms += [doc.get_field("year")] searchTerm = " ".join(searchTerms) for c in "(),.{}!\"':=#%$/&[]+": searchTerm = searchTerm.replace(c, "") searchTerm = searchTerm.replace("-", " ") while searchTerm.find(" ") > 0: #remove double spaces searchTerm = searchTerm.replace(" ", " ") #print "DBLP:searchTerm:", repr(searchTerm) url = "http://www.dblp.org/search/api/?%s&h=1000&c=0&f=0&format=xml" % (urllib.urlencode({'q': searchTerm})) print "DBLP:url:", repr(url) data = referencer.download (_("Searching DBLP"), _("Fetching metadata from DBLP for search query '%s'") % searchTerm, url); if not data: return False hits = parse_hits_get_urls(data) print "DBLP:hits:", hits if len(hits) != 1: #XXX, display UI? print "DBLP: Not exactly one hit, giving up" return False bibtex_xml = get_bibtex_xml_from_url(hits[0]) #print bibtex_xml bibtex = bibtex_xml_to_bibtex(bibtex_xml) #print bibtex doc.parse_bibtex(bibtex) return True
def referencer_search_TEST(search_text): email = '*****@*****.**' tool = 'Referencer' database = 'pubmed' retmax = 100 params = { 'db': database, 'tool': tool, 'email': email, 'term': search_text, 'usehistory': 'y', 'retmax': retmax } # try to resolve the PubMed ID of the DOI url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?' + urllib.urlencode( params) data = referencer.download(_("Searching pubmed"), _("Searching pubmed for '%s'") % search_text, url) # parse XML output from PubMed... print data xmldoc = minidom.parseString(data) ids = xmldoc.getElementsByTagName('Id') # nothing found, exit # FIXME: not really an error if len(ids) == 0: raise "pubmed.referencer_search: no results" webenv = xmldoc.getElementsByTagName('WebEnv') if len(webenv) == 0: raise "pubmed.referencer_search: no webenv" webenv = webenv[0].childNodes[0].data query_key = xmldoc.getElementsByTagName('QueryKey') if len(query_key) == 0: raise "pubmed.referencer_search: no query_key" query_key = query_key[0].childNodes[0].data params = { 'db': database, 'tool': tool, 'email': email, 'webenv': webenv, 'query_key': query_key, 'retmax': retmax } url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?' + urllib.urlencode( params) data = referencer.download( _("Retrieving pubmed summaries"), _("Retrieving summaries for '%s'") % search_text, url) xmldoc = minidom.parseString(data) results = [] for docsum in xmldoc.getElementsByTagName('DocSum'): title = "" author = "" pmid = "" id = docsum.getElementsByTagName("Id") if len(id) != 0: pmid = id[0].childNodes[0].data else: raise "pubmed.referencer_search: docsum without id" for childnode in docsum.getElementsByTagName("Item"): if childnode.getAttribute("Name") == "Title": title = childnode.childNodes[0].data if childnode.getAttribute("Name") == "Author": author = childnode.childNodes[0].data results.append({"token": pmid, "title": title, "author": author}) print results return results
def referencer_search_TEST (search_text): email='*****@*****.**' tool='Referencer' database='pubmed' retmax = 100 params = { 'db':database, 'tool':tool, 'email':email, 'term':search_text, 'usehistory':'y', 'retmax':retmax } # try to resolve the PubMed ID of the DOI url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?' + urllib.urlencode(params) data = referencer.download (_("Searching pubmed"), _("Searching pubmed for '%s'") % search_text , url); # parse XML output from PubMed... print data xmldoc = minidom.parseString(data) ids = xmldoc.getElementsByTagName('Id') # nothing found, exit # FIXME: not really an error if len(ids) == 0: raise "pubmed.referencer_search: no results" webenv = xmldoc.getElementsByTagName('WebEnv') if len(webenv) == 0: raise "pubmed.referencer_search: no webenv" webenv = webenv[0].childNodes[0].data query_key = xmldoc.getElementsByTagName('QueryKey') if len(query_key) == 0: raise "pubmed.referencer_search: no query_key" query_key = query_key[0].childNodes[0].data params = { 'db':database, 'tool':tool, 'email':email, 'webenv':webenv, 'query_key':query_key, 'retmax':retmax } url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?' + urllib.urlencode(params) data = referencer.download (_("Retrieving pubmed summaries"), _("Retrieving summaries for '%s'") % search_text , url); xmldoc = minidom.parseString(data) results = [] for docsum in xmldoc.getElementsByTagName('DocSum'): title = "" author = "" pmid = "" id = docsum.getElementsByTagName("Id") if len(id) !=0: pmid = id[0].childNodes[0].data else: raise "pubmed.referencer_search: docsum without id" for childnode in docsum.getElementsByTagName("Item"): if childnode.getAttribute("Name") == "Title": title = childnode.childNodes[0].data if childnode.getAttribute("Name") == "Author": author = childnode.childNodes[0].data results.append ({"token":pmid,"title":title,"author":author}) print results return results
def resolve_metadata (doc, method): if method != "doi": return False doi = doc.get_field("doi") params = { 'data_type':"XML", 'doi':doi } url = "http://adsabs.harvard.edu/cgi-bin/nph-bib_query?" + urllib.urlencode (params) data = referencer.download (_("Resolving DOI"), _("Fetching metadata from NASA ADS for DOI %s") % doi, url); if data.find ("retrieved=\"1\"") == -1: print "Couldn't get info from ADS" return False fields = [] try: xmldoc = minidom.parseString (data) fields.append (["title", get_field(xmldoc, "title")]) fields.append (["volume", get_field(xmldoc, "volume")]) fields.append (["issue", get_field(xmldoc, "issue")]) fields.append (["year", get_field(xmldoc, "pubdate").partition(' ')[2]]) fields.append (["Month", str.lower(get_field(xmldoc, "pubdate").partition(' ')[0])]) fields.append (["Adsurl", xmldoc.getElementsByTagName('url')[-1].childNodes[0].data.encode("utf-8")]) fields.append (["Adsbibcode", get_field(xmldoc, "bibcode")]) # ADS include full bibliographic information in the journal XML tag, # see http://doc.adsabs.harvard.edu/abs_doc/help_pages/taggedformat.html#jnl journal = get_field(xmldoc, "journal") journalString = re.sub(', [vV]ol(ume|\.).*', '', journal) fields.append (["journal", journalString]) authors = xmldoc.getElementsByTagName('author') authorString = "" first = True for author in authors: name = author.childNodes[0].data.encode("utf-8") if (first == False): authorString += " and " print "got author", name authorString += name first = False fields.append (["author", authorString]) print "appended authors" pages = get_field (xmldoc, "page") print "getting lastPage" lastPage = get_field (xmldoc, "lastpage") if (len(lastPage) > 0): pages += "-" pages += lastPage print "got pages " , pages fields.append (["pages", pages]) print "appended pages" except: print "exception" return False for field in fields: if len(field[1]) > 0: doc.set_field(field[0], field[1]) return True
def resolve_metadata(doc, method): if method != "doi": return False doi = doc.get_field("doi") params = {'data_type': "XML", 'doi': doi} url = "http://adsabs.harvard.edu/cgi-bin/nph-bib_query?" + urllib.urlencode( params) data = referencer.download( _("Resolving DOI"), _("Fetching metadata from NASA ADS for DOI %s") % doi, url) if data.find("retrieved=\"1\"") == -1: print "Couldn't get info from ADS" return False fields = [] try: xmldoc = minidom.parseString(data) fields.append(["journal", get_field(xmldoc, "journal")]) fields.append(["title", get_field(xmldoc, "title")]) fields.append(["volume", get_field(xmldoc, "volume")]) fields.append(["issue", get_field(xmldoc, "issue")]) fields.append(["year", get_field(xmldoc, "pubdate").partition(' ')[2]]) fields.append([ "Month", str.lower(get_field(xmldoc, "pubdate").partition(' ')[0]) ]) fields.append([ "Adsurl", xmldoc.getElementsByTagName('url')[-1].childNodes[0].data.encode( "utf-8") ]) fields.append(["Adsbibcode", get_field(xmldoc, "bibcode")]) authors = xmldoc.getElementsByTagName('author') authorString = "" first = True for author in authors: name = author.childNodes[0].data.encode("utf-8") if (first == False): authorString += " and " print "got author", name authorString += name first = False fields.append(["author", authorString]) print "appended authors" pages = get_field(xmldoc, "page") print "getting lastPage" lastPage = get_field(xmldoc, "lastpage") if (len(lastPage) > 0): pages += "-" pages += lastPage print "got pages ", pages fields.append(["pages", pages]) print "appended pages" except: print "exception" return False for field in fields: if len(field[1]) > 0: doc.set_field(field[0], field[1]) return True
def get_bibtex_xml_from_url(url):
    """Download and return the DBLP XML record behind *url*."""
    xml_url = url + ".xml"
    return referencer.download(
        _("Searching DBLP"),
        _("Fetching metadata from DBLP for url '%s'") % xml_url,
        xml_url)
def get_bibtex_xml_from_url(url):
    """Fetch the DBLP record for *url* by appending the ".xml" suffix."""
    record_url = "%s.xml" % url
    data = referencer.download(
        _("Searching DBLP"),
        _("Fetching metadata from DBLP for url '%s'") % record_url,
        record_url)
    return data