def doPost(db,                  # eutils db name (pubmed, PMC, ...)
           ids,                 # list of IDs to post and get fetch for
           URLReader=surl.ThrottledURLReader(),
           debug=False,
           ):
    """ do a eutils.post and return webenv/query_key as eutils URL params """
    # build params for post: comma-separated, stringified, stripped IDs
    idParams = ','.join(str(anId).strip() for anId in ids)
    params = "api_key=%s&db=%s&id=%s" %(EUTILS_API_KEY, db, idParams)
    url = EPOST_BASE
    if debug:
        sys.stderr.write( "Post URL:\n%s\n" % url )
        sys.stderr.write( "Post Params: \n'%s'\n" % params)

    outputX = URLReader.readURL(url, params=params, GET=False)
    if debug:
        sys.stderr.write( "Output from Epost:\n%s\n" % outputX)

    # parse the epost XML response and pull out the history-server handle
    xmlDoc = minidom.parseString(outputX)
    webenv, query_key = getWebenv(xmlDoc)
    return codeWebenvURLParams(webenv, query_key)
def __init__( self,
              basePath='.',     # base path to write article files
                                #  files written to basePath/journalName
              urlReader=surl.ThrottledURLReader(seconds=0.2),
              pubmedFile='mgiPubmedIDs.tsv', # holds pubmed IDs in MGI
              verbose=False,
              writeFiles=True,  # =False to not write any files/dirs
              getXml=False,     # =True to write xml output for each
                                #  matching article (pmid.xml)
              getPdf=True,      # =True to write PDF files for each
                                #  matching article that has PDF
                                #  (pmid.pdf)
              getText=False,    # =True to write extracted text file
                                #  for each matching article that
                                #  has text in the xml output: pmid.txt
              ):
    # NOTE(review): the default urlReader is built once at class-definition
    #  time, so every instance constructed with the default shares one
    #  throttled reader -- presumably intentional (one global rate limit);
    #  confirm before changing.
    self.basePath = basePath
    self.urlReader = urlReader
    self._getPubmedIds(pubmedFile)  # loads the IDs from pubmedFile;
                                    #  note this runs BEFORE verbose and
                                    #  the get* flags are set on self
    self.verbose = verbose
    self.writeFiles = writeFiles
    self.getXml = getXml
    self.getPdf = getPdf
    self.getText = getText
    self.journalSummary = {}        # per-journal summary info
    self.curOutputDir = ''          # output dir for current journal/search
    self.reporters = []
    self.curReporter = None     # current reporter (for journal/search)
def getSearchResults( db,           # eutils db name ('pubmed', 'pmc', ...)
                    queryString,    # esearch query string
                    op='summary',   # 'summary' or 'fetch' output
                    retmode='xml',  # eutils desired output format
                    rettype=None,   # eutils rettype option
                    version='2.0',  # eutils output version (affects json?)
                    retmax=10000,   # max number of results to return
                                    #   10000 is XML output max for eutils
                    URLReader=surl.ThrottledURLReader(),
                    debug=False,
                    ):
    """
    Run an esearch, then pull the results back as esummary or efetch.
    Return (count of results, results string, webenv/query_key URL params).
    """
    # step 1: search, leaving the result set on the eutils history server
    count, webenvURLParams = doSearch(db, queryString,
                                        URLReader=URLReader, debug=debug)
    # step 2: retrieve the stored results as a summary or full fetch
    results = getResults(db, webenvURLParams,
                            op=op,
                            retmode=retmode,
                            rettype=rettype,
                            version=version,
                            retmax=retmax,
                            URLReader=URLReader,
                            debug=debug)
    return count, results, webenvURLParams
def doPost( db,     # eutils db name ('pubmed', 'pmc', ...)
            ids,    # list of IDs (str or bytes) to post and get fetch for
            URLReader=surl.ThrottledURLReader(),
            debug=False,
            ):
    """
    do a eutils.post and return webenv/query_key as eutils URL params.

    IDs may be str, bytes, or anything str() can render; the request body
    is assembled as bytes using DEFAULT_ENCODING.
    """
    def toBytes(x):
        # Coerce one value to bytes.
        # (idiomatic isinstance() instead of comparing type() objects)
        if isinstance(x, bytes):
            return x
        if isinstance(x, str):
            return x.encode(encoding=DEFAULT_ENCODING)
        return str(x).encode(encoding=DEFAULT_ENCODING)

    # build params for post: api_key, db, then the comma-separated IDs
    idParams = b','.join([toBytes(x).strip() for x in ids])
    otherParams = b'api_key=%b&db=%b&id=' % (toBytes(EUTILS_API_KEY),
                                             toBytes(db))
    params = otherParams + idParams

    url = EPOST_BASE
    if debug:   # truncate debug output; the posted ID list can be huge
        sys.stderr.write("Post URL:\n%s\n" % url[:200])
        sys.stderr.write("Post Params: \n'%s'\n" % params[:200])

    outputX = URLReader.readURL(url, params=params, GET=False)
    if debug:
        sys.stderr.write("Output from Epost:\n%s\n" % outputX[:100])

    # parse the epost XML response for the history-server handle
    xmlDoc = minidom.parseString(outputX)
    webenv, query_key = getWebenv(xmlDoc)
    webenvURLParams = codeWebenvURLParams(webenv, query_key)
    return webenvURLParams
def doSearch( db,               # eutils db name ('pubmed', 'pmc', ...)
              queryString,      # esearch query string
              URLReader=surl.ThrottledURLReader(),
              debug=False,
              ):
    """
    Run a eutils.esearch, leaving the result set on the eutils history
    server.
    Return count and webenv/query_key (as URL params) on history server.
    """
    # build the esearch URL: xml output, results saved in eutils history
    url = ESEARCH_BASE + "%s&db=%s&term=%s&retmode=%s" % \
                                        (USEHISTORY, db, queryString,'xml')
    if debug:
        sys.stderr.write("Esearch URL:\n%s\n" % url)

    outputX = URLReader.readURL(url)
    if debug:
        sys.stderr.write("Output from Esearch:\n%s\n" % outputX)

    xmlDoc = minidom.parseString(outputX)       # what about errors?

    # <Count> holds the total number of search hits
    countNode = xmlDoc.getElementsByTagName("Count")[0]
    count = int(countNode.childNodes[0].data)

    # get the history-server handle for the saved result set
    webenv, query_key = getWebenv(xmlDoc)
    return count, codeWebenvURLParams(webenv, query_key)
def getResults( db,             # eutils db name ('pubmed', 'pmc', ...)
                webenvURLParams,
                op='summary',   # 'summary' or 'fetch' output
                retmode='xml',  # eutils desired output format
                rettype=None,   # eutils rettype option
                version='2.0',  # eutils output version (affects json?)
                retmax=None,    # max number of results to return
                                #   None or 0 means no max.
                                #   10000 is XML output max for eutils
                                #   500 is json output max for eutils
                URLReader=surl.ThrottledURLReader(),
                debug=False,
                ):
    """
    Do a eutils.esummary or efetch from results on history server and
    return results (string).

    Retmode/rettype: see notes above.
    Raises Exception for an invalid op, or for op='fetch' with
    retmode='json' (unsupported by NCBI).
    Note: for json output, eutils have a 500 record output limit, and you
    get an eutils error if you don't have &retmax
    """
    # pick the base URL for the requested operation
    # (could check for more option combination errors)
    if op == 'summary':
        url = ESUMMARY_BASE
    elif op == 'fetch':
        if retmode == 'json':
            raise Exception('NCBI efetch does not support json return mode\n')
        url = EFETCH_BASE
    else:
        raise Exception('Invalid SearchResults operation: %s\n' % str(op))

    url += webenvURLParams + \
            "&db=%s&retmode=%s&version=%s" % (db, retmode, str(version))
    if rettype is not None:         # idiomatic identity test for None
        url += "&rettype=%s" % rettype

    if retmax is None or retmax == 0:
        retmax = 10000              # eutils XML max
    if retmode == 'json':
        # clamp to the eutils json output limit
        url += "&retmax=%d" % min(retmax, 500)
    else:
        url += "&retmax=%d" % retmax

    if debug:
        sys.stderr.write("Summary/Fetch URL:\n%s\n" % url)

    output = URLReader.readURL(url)
    return output
def getPostResults(db,              # eutils db name (pubmed, PMC, ...)
                   ids,             # list of IDs to post and get results for
                   op='summary',    # 'summary' or 'fetch' output
                   retmode='xml',   # eutils desired output format
                   rettype=None,    # eutils rettype option
                   version='2.0',   # eutils output version (affects json?)
                   URLReader=surl.ThrottledURLReader(),
                   debug=False,
                   ):
    """
    Post ids via eutils.epost, then retrieve esummary/efetch output for
    them. Return (results string, webenv/query_key URL params).
    """
    # post the IDs onto the eutils history server
    webenvURLParams = doPost(db, ids, URLReader=URLReader, debug=debug)

    # pull back the posted set as a summary or full fetch
    results = getResults(db, webenvURLParams,
                            op=op,
                            retmode=retmode,
                            rettype=rettype,
                            version=version,
                            URLReader=URLReader,
                            debug=debug)
    return results, webenvURLParams
def main():
    # throttle requests so we don't overwhelm eutils
    urlReader = surl.ThrottledURLReader(seconds=0.4)
    retmode = args.format

    # post the requested PMIDs and fetch their summaries
    resultsBytes, _ = eulib.getPostResults('pubmed', args.pmids,
                        URLReader=urlReader,
                        op='summary',
                        rettype=None,
                        retmode=retmode,
                        )
    if retmode == 'json':
        # pretty-print the json for readability
        pretty = json.dumps(json.loads(resultsBytes),
                            sort_keys=True, indent=4,
                            separators=(',', ': '))
        print(pretty + '\n')
    else:
        print(resultsBytes.decode())
debug=False, ): """ do a eutils.post and return eutils.efetch for the results """ webenvURLParams = doPost(db, ids, URLReader=URLReader, debug=debug) # get result summary or fetch output = getResults(db, webenvURLParams, op=op, retmode=retmode, rettype=rettype, version=version, URLReader=URLReader, debug=debug) return output, webenvURLParams # ------------------------- if __name__ == "__main__": # test code URLReader = surl.ThrottledURLReader() query = 'Aging+Cell[TA]+AND+(2017/01/01:2017/02/01[PPDAT]+AND+foxo[TITLE})' print '-' * 30 + " Search pubmed" count, webenv = doSearch('pubmed',query, URLReader=URLReader, debug=False) print "webenv: '%s'" % webenv print "count: %d" % count print print '-' * 30 + " SearchResults pubmed summary json" count, output, webenv = getSearchResults('pubmed',query, op='summary', retmode='json', URLReader=URLReader, debug=False) print "output: \n%s" % output[:500] print print '-' * 30 + " SearchResults, pubmed, summary xml"