Python ThrottledURLReader Examples, simpleURLLib.ThrottledURLReader Python Examples

Example #1

0

Show file

File: NCBIutilsLib.py Project: mgijax/autolittriage

def doPost(
    db,  # eutils db name (pubmed, PMC, ...)
    ids,  # list of IDs to post and get fetch for
    URLReader=surl.ThrottledURLReader(),
    debug=False,
):
    """ Post a list of IDs via URLReader to the EPOST_BASE url.

        Each ID is converted with str() and stripped of surrounding
        whitespace before being joined with commas into the 'id' param.
        Return the value codeWebenvURLParams() builds from the
        webenv/query_key that getWebenv() finds in the parsed reply.
        With debug=True, the url, the params and the raw reply are written
        to stderr.
    """
    # assemble the post body
    cleanedIds = [str(anId).strip() for anId in ids]
    params = "api_key=%s&db=%s&id=%s" % (EUTILS_API_KEY, db,
                                         ','.join(cleanedIds))

    if debug:
        sys.stderr.write("Post URL:\n%s\n" % EPOST_BASE)
        sys.stderr.write("Post Params: \n'%s'\n" % params)

    # GET=False asks the reader to send params as a post
    reply = URLReader.readURL(EPOST_BASE, params=params, GET=False)
    if debug:
        sys.stderr.write("Output from Epost:\n%s\n" % reply)

    # pull the webenv params out of the xml reply
    webenv, query_key = getWebenv(minidom.parseString(reply))
    return codeWebenvURLParams(webenv, query_key)

Example #2

0

Show file

File: backPopulate.py Project: mgijax/autolittriage

    def __init__(
        self,
        basePath='.',
        urlReader=surl.ThrottledURLReader(seconds=0.2),
        pubmedFile='mgiPubmedIDs.tsv',
        verbose=False,
        writeFiles=True,
        getXml=False,
        getPdf=True,
        getText=False,
    ):
        """ Record the run options and start with empty bookkeeping state.

            basePath    base path to write article files;
                            files written to basePath/journalName
            urlReader   reader object, kept as self.urlReader
            pubmedFile  holds pubmed IDs in MGI; handed to _getPubmedIds()
            verbose     kept as self.verbose
            writeFiles  =False to not write any files/dirs
            getXml      =True to write xml output for each matching
                            article (pmid.xml)
            getPdf      =True to write PDF files for each matching article
                            that has PDF (pmid.pdf)
            getText     =True to write extracted text file for each
                            matching article that has text in the xml
                            output: pmid.txt
        """
        # output location and url reader
        self.basePath = basePath
        self.urlReader = urlReader

        # kept at this point in the sequence: after basePath/urlReader are
        # set, before the option flags below
        self._getPubmedIds(pubmedFile)

        # option flags
        self.verbose = verbose
        self.writeFiles = writeFiles
        self.getXml, self.getPdf, self.getText = getXml, getPdf, getText

        # bookkeeping, all empty to begin with
        self.reporters = []
        self.curReporter = None  # current reporter (for journal/search)
        self.curOutputDir = ''
        self.journalSummary = {}

Example #3

0

Show file

def getSearchResults(
    db,  # eutils db name ('pubmed', 'pmc', ...)
    queryString,  # esearch query string
    op='summary',  # 'summary' or 'fetch' output
    retmode='xml',  # eutils desired output format
    rettype=None,  # eutils rettype option
    version='2.0',  # eutils output version (affects json?)
    retmax=10000,  # max number of results to return
    # 10000 is XML output max for eutils
    URLReader=surl.ThrottledURLReader(),
    debug=False,
):
    """ Run doSearch() for the query, then getResults() on what it found.

        Return a 3-tuple:
            the count reported by doSearch(),
            the output of getResults(),
            the webenv/query_key URL params reported by doSearch().
        The same URLReader and debug flag are handed to both calls.
    """
    # step 1: the search; its result set is referenced by historyParams
    searchCount, historyParams = doSearch(
        db, queryString, URLReader=URLReader, debug=debug)

    # step 2: summary or fetch of that result set
    resultOptions = dict(
        op=op,
        retmode=retmode,
        rettype=rettype,
        version=version,
        retmax=retmax,
        URLReader=URLReader,
        debug=debug,
    )
    resultText = getResults(db, historyParams, **resultOptions)

    return searchCount, resultText, historyParams

Example #4

0

Show file

def doPost(
    db,  # eutils db name ('pubmed', 'pmc', ...)
    ids,  # list of IDs (str or bytes) to post and get fetch for
    URLReader=surl.ThrottledURLReader(),
    debug=False,
):
    """ Do a eutils.post and return webenv/query_key as eutils URL params.

        db        eutils db name; converted to bytes for the post body.
        ids       iterable of IDs. bytes/bytearray values are used as is,
                    anything else is converted with str() and encoded with
                    DEFAULT_ENCODING. Surrounding whitespace is stripped.
        URLReader object whose readURL(url, params=..., GET=False) sends
                    the post and returns the reply.
        debug     if True, write the url, the first 200 units of the
                    params and the first 100 units of the reply to stderr.

        Return the value of codeWebenvURLParams() for the webenv and
        query_key that getWebenv() finds in the parsed reply.
    """
    def toBytes(x):
        """ Return x as bytes. """
        # isinstance (not type equality) so bytes subclasses and bytearray
        # are passed through instead of being turned into their repr text
        if isinstance(x, (bytes, bytearray)):
            return bytes(x)
        return str(x).encode(encoding=DEFAULT_ENCODING)

    def toText(x):
        """ Return x as str for debug output. """
        # formatting bytes with %s would print the b'...' repr; decode
        # instead, replacing anything undecodable (slices may cut a
        # multi-byte character in half)
        if isinstance(x, (bytes, bytearray)):
            return bytes(x).decode(DEFAULT_ENCODING, errors='replace')
        return str(x)

    # build params for post
    idParams = b','.join([toBytes(x).strip() for x in ids])
    otherParams = b'api_key=%b&db=%b&id=' % (toBytes(EUTILS_API_KEY),
                                             toBytes(db))
    params = otherParams + idParams

    url = EPOST_BASE
    if debug:
        # NOTE(review): the params written here include the api_key value
        sys.stderr.write("Post URL:\n%s\n" % url[:200])
        sys.stderr.write("Post Params: \n'%s'\n" % toText(params[:200]))

    outputX = URLReader.readURL(url, params=params, GET=False)
    if debug:
        sys.stderr.write("Output from Epost:\n%s\n" % toText(outputX[:100]))

    xmlDoc = minidom.parseString(outputX)

    # get webenv params
    webenv, query_key = getWebenv(xmlDoc)
    webenvURLParams = codeWebenvURLParams(webenv, query_key)

    return webenvURLParams

Example #5

0

Show file

def doSearch(
        db,  # eutils db name ('pubmed', 'pmc', ...)
        queryString,  # esearch query string
        URLReader=surl.ThrottledURLReader(),
        debug=False,
):
    """ Run a eutils.esearch, asking for xml output, with the USEHISTORY
        params so the result set stays on the eutils history server.

        queryString is placed in the url as given.
        Return (count, webenvURLParams): the integer in the reply's first
        Count element and the webenv/query_key coded as URL params.
        With debug=True, the url and the raw reply are written to stderr.
    """
    # build the esearch url; retmode is always xml here
    searchArgs = "%s&db=%s&term=%s&retmode=%s" % (USEHISTORY, db,
                                                  queryString, 'xml')
    url = ESEARCH_BASE + searchArgs
    if debug:
        sys.stderr.write("Esearch URL:\n%s\n" % url)

    reply = URLReader.readURL(url)
    if debug:
        sys.stderr.write("Output from Esearch:\n%s\n" % reply)

    # NOTE: a reply without a Count element is not handled here
    xmlDoc = minidom.parseString(reply)
    countElement = xmlDoc.getElementsByTagName("Count")[0]
    count = int(countElement.childNodes[0].data)

    # webenv/query_key identify the result set on the history server
    webenv, query_key = getWebenv(xmlDoc)
    return count, codeWebenvURLParams(webenv, query_key)

Example #6

0

Show file

def getResults(
    db,  # eutils db name ('pubmed', 'pmc', ...)
    webenvURLParams,
    op='summary',  # 'summary' or 'fetch' output
    retmode='xml',  # eutils desired output format
    rettype=None,  # eutils rettype option
    version='2.0',  # eutils output version (affects json?)
    retmax=None,  # max number of results to return
    # None or 0 means "no caller limit": 10000 is used.
    # 10000 is XML output max for eutils
    #   500 is json output max for eutils
    URLReader=surl.ThrottledURLReader(),
    debug=False,
):
    """ Do a eutils esummary or efetch from results on history server
            and return results (whatever URLReader.readURL() returns).

        op        'summary' uses ESUMMARY_BASE, 'fetch' uses EFETCH_BASE.
        rettype   added to the url only when it is not None.
        retmax    None or 0 is replaced by 10000; for retmode 'json' the
                    value sent is capped at 500.
        debug     if True, write the final url to stderr.

        Raises Exception for op='fetch' with retmode='json', and for any
        op other than 'summary' or 'fetch'.

        Note: for json output, eutils have a 500 record output limit,
        and you get an eutils error if you don't have &retmax
    """
    # result type (could check for more option combination errors)
    if op == 'summary':
        url = ESUMMARY_BASE
    elif op == 'fetch':
        if retmode == 'json':
            raise Exception('NCBI efetch does not support json return mode\n')
        url = EFETCH_BASE
    else:
        raise Exception('Invalid SearchResults operation: %s\n' % str(op))

    url += webenvURLParams + \
                "&db=%s&retmode=%s&version=%s" % (db, retmode, str(version))
    if rettype is not None:
        url += "&rettype=%s" % rettype

    # retmax is always sent; fall back to the eutils XML max
    if retmax is None or retmax == 0:
        retmax = 10000
    if retmode == 'json':
        retmax = min(retmax, 500)  # eutils json max
    url += "&retmax=%d" % retmax

    if debug:
        sys.stderr.write("Summary/Fetch URL:\n%s\n" % url)

    return URLReader.readURL(url)

Example #7

0

Show file

File: NCBIutilsLib.py Project: mgijax/autolittriage

def getPostResults(
    db,  # eutils db name (pubmed, PMC, ...)
    ids,  # list of IDs to post and get results for
    op='summary',  # 'summary' or 'fetch' output
    retmode='xml',  # eutils desired output format
    rettype=None,  # eutils rettype option
    version='2.0',  # eutils output version (affects json?)
    URLReader=surl.ThrottledURLReader(),
    debug=False,
):
    """ Run doPost() for the ids, then getResults() on what was posted.

        Return a 2-tuple: the output of getResults() and the
        webenv/query_key URL params returned by doPost().
        The same URLReader and debug flag are handed to both calls.
    """
    # step 1: post the ids
    historyParams = doPost(db, ids, URLReader=URLReader, debug=debug)

    # step 2: summary or fetch for the posted ids
    resultOptions = dict(
        op=op,
        retmode=retmode,
        rettype=rettype,
        version=version,
        URLReader=URLReader,
        debug=debug,
    )
    return getResults(db, historyParams, **resultOptions), historyParams

Example #8

0

Show file

def main():
    """ Get pubmed summaries for args.pmids and print them.

        args.format is used as the eutils retmode. For 'json' the reply is
        parsed and re-printed with sorted keys and 4-space indentation;
        for anything else the reply is decoded and printed as is.
    """
    retmode = args.format

    # 0.4 seconds between requests: don't overwhelm eutils
    urlReader = surl.ThrottledURLReader(seconds=0.4)

    postResults = eulib.getPostResults(
        'pubmed',
        args.pmids,
        op='summary',
        retmode=retmode,
        rettype=None,
        URLReader=urlReader,
    )
    resultsBytes = postResults[0]  # only the output part is needed here

    if retmode != 'json':
        print(resultsBytes.decode())
        return

    resultsJson = json.loads(resultsBytes)
    prettyJson = json.dumps(resultsJson,
                            sort_keys=True,
                            indent=4,
                            separators=(',', ': '))
    print(prettyJson + '\n')

Example #9

0

Show file

File: NCBIutilsLib.py Project: mgijax/autolittriage

		    debug=False,
    ):
    """ do a eutils.post and return eutils.efetch for the results
    """
    webenvURLParams = doPost(db, ids, URLReader=URLReader, debug=debug)

    # get result summary or fetch
    output = getResults(db, webenvURLParams, op=op, retmode=retmode,
		    rettype=rettype, version=version, URLReader=URLReader,
		    debug=debug)
    return output, webenvURLParams

# -------------------------

if __name__ == "__main__":      # test code
    URLReader = surl.ThrottledURLReader()
    query = 'Aging+Cell[TA]+AND+(2017/01/01:2017/02/01[PPDAT]+AND+foxo[TITLE})'

    print '-' * 30 + " Search pubmed"
    count, webenv = doSearch('pubmed',query, URLReader=URLReader, debug=False)
    print "webenv: '%s'" % webenv
    print "count: %d" % count
    print

    print '-' * 30 + " SearchResults pubmed summary json"
    count, output, webenv = getSearchResults('pubmed',query, op='summary',
		retmode='json', URLReader=URLReader, debug=False)
    print "output: \n%s" % output[:500]
    print

    print '-' * 30 + " SearchResults, pubmed, summary xml"