Example #1
0
def _open(cgi, params={}, post=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it.  Does some
    simple error checking, and will raise an IOError if it encounters one.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # Remove None values from the parameters
    for key, value in list(params.items()):
        if value is None:
            del params[key]
    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if not "tool" in params:
        params["tool"] = tool
    # Tell Entrez who we are
    if not "email" in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn(
                """
Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is [email protected], you can specify it as follows:
   from Bio import Entrez
   Entrez.email = '*****@*****.**'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)
    # Open a handle to Entrez.
    options = _urlencode(params, doseq=True)
    #print cgi + "?" + options
    try:
        if post:
            #HTTP POST
            handle = _urlopen(cgi, data=_as_bytes(options))
        else:
            #HTTP GET
            cgi += "?" + options
            handle = _urlopen(cgi)
    except _HTTPError as exception:
        raise exception

    return _binary_to_string_handle(handle)
Example #2
0
def _open(cgi, params={}, post=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it.  Does some
    simple error checking, and will raise an IOError if it encounters one.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # Remove None values from the parameters
    for key, value in params.items():
        if value is None:
            del params[key]
    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if not "tool" in params:
        params["tool"] = tool
    # Tell Entrez who we are
    if not "email" in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI strongly recommends you to specify
your email address with each request. From June 1, 2010, this will be
mandatory. As an example, if your email address is [email protected], you
can specify it as follows:
   from Bio import Entrez
   Entrez.email = '*****@*****.**'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)
    # Open a handle to Entrez.
    options = _urlencode(params, doseq=True)
    #print cgi + "?" + options
    try:
        if post:
            #HTTP POST
            handle = _urlopen(cgi, data=_as_bytes(options))
        else:
            #HTTP GET
            cgi += "?" + options
            handle = _urlopen(cgi)
    except _HTTPError as exception:
        raise exception

    return _binary_to_string_handle(handle)
Example #3
0
def _open(url, post=None):
    """Build the URL and open a handle to it (PRIVATE).

    Open a handle to TogoWS, will raise an IOError if it encounters an error.

    In the absence of clear guidelines, this function enforces a limit of
    "up to three queries per second" to avoid abusing the TogoWS servers.
    """
    delay = 0.333333333  # one third of a second
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # print(url)
    if post:
        handle = _urlopen(url, _as_bytes(post))
    else:
        handle = _urlopen(url)

    # We now trust TogoWS to have set an HTTP error code, that
    # suffices for my current unit tests. Previously we would
    # examine the start of the data returned back.
    return _binary_to_string_handle(handle)
Example #4
0
def _open(cgi, params=None, post=None, ecitmatch=False):
    """Build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it.  Does some
    simple error checking, and will raise an IOError if it encounters one.

    The argument post should be a boolean to explicitly control if an HTTP
    POST should be used rather an HTTP GET based on the query length.
    By default (post=None), POST is used if the URL encoded parameters would
    be over 1000 characters long.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # NCBI requirement: At most three queries per second if no API key is provided.
    # Equivalently, at least a third of second between queries
    params = _construct_params(params)
    options = _encode_options(ecitmatch, params)
    # Using just 0.333333334 seconds sometimes hit the NCBI rate limit,
    # the slightly longer pause of 0.37 seconds has been more reliable.
    delay = 0.1 if api_key else 0.37
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # By default, post is None. Set to a boolean to over-ride length choice:
    if post is None and len(options) > 1000:
        post = True
    cgi = _construct_cgi(cgi, post, options)

    for i in range(max_tries):
        try:
            if post:
                handle = _urlopen(cgi, data=_as_bytes(options))
            else:
                handle = _urlopen(cgi)
        except _URLError as exception:
            # Reraise if the final try fails
            if i >= max_tries - 1:
                raise

            # Reraise if the exception is triggered by a HTTP 4XX error
            # indicating some kind of bad request
            if isinstance(exception, _HTTPError) \
                    and exception.status // 100 == 4:
                raise

            # Treat everything else as a transient error and try again after a
            # brief delay.
            time.sleep(sleep_between_tries)
        else:
            break

    return _binary_to_string_handle(handle)
Example #5
0
def scan(seq="", mirror='http://www.expasy.org', output='xml', **keywords):
    """Execute a ScanProsite search.

    mirror:      The ScanProsite mirror to be used
                 (default: http://www.expasy.org).
    seq:         The query sequence, or UniProtKB (Swiss-Prot,
                 TrEMBL) accession
    output:      Format of the search results
                 (default: xml)

    Further search parameters can be passed as keywords; see the
    documentation for programmatic access to ScanProsite at
    http://www.expasy.org/tools/scanprosite/ScanPrositeREST.html
    for a description of such parameters.

    This function returns a handle to the search results returned by
    ScanProsite. Search results in the XML format can be parsed into a
    Python object, by using the Bio.ExPASy.ScanProsite.read function.
    """
    parameters = {'seq': seq,
                  'output': output}
    for key, value in keywords.items():
        if value is not None:
            parameters[key] = value
    command = _urlencode(parameters)
    url = "%s/cgi-bin/prosite/PSScan.cgi?%s" % (mirror, command)
    handle = _urlopen(url)
    return handle
Example #6
0
def get_sprot_raw(id):
    """Get a handle to a raw SwissProt entry at ExPASy.

    For an ID of XXX, fetches http://www.uniprot.org/uniprot/XXX.txt
    (as per the http://www.expasy.ch/expasy_urls.html documentation).
    """
    return _urlopen("http://www.uniprot.org/uniprot/%s.txt" % id)
Example #7
0
    def get_recent_changes(self):
        """Returns three lists of the newest weekly files (added,mod,obsolete).

        Reads the directories with changed entries from the PDB server and
        returns a tuple of three URL's to the files of new, modified and
        obsolete entries from the most recent list. The directory with the
        largest numerical name is used.
        Returns None if something goes wrong.

        Contents of the data/status dir (20031013 would be used);
        drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
        drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
        -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README
        """
        url = self.pdb_server + '/pub/pdb/data/status/'
        with contextlib.closing(_urlopen(url)) as handle:
            recent = filter(str.isdigit,
                            (x.split()[-1] for x in handle.readlines())
                            )[-1]

        path = self.pdb_server + '/pub/pdb/data/status/%s/' % (recent)

        # Retrieve the lists
        added = self.get_status_list(path + 'added.pdb')
        modified = self.get_status_list(path + 'modified.pdb')
        obsolete = self.get_status_list(path + 'obsolete.pdb')
        return [added, modified, obsolete]
Example #8
0
    def get(self, service, **kwargs):
        """Does HTTP request to BOLD webservice.

        Args:
            service: The BOLD API alias to interact with.
            kwargs: Paramenters send by users.

        Returns:
            A Response class containing parsed data as attribute `items`.

        """
        params = ''

        if service == 'call_id':
            sequence = utils._prepare_sequence(kwargs['seq'])
            params = _urlencode({'db': kwargs['db'], 'sequence': sequence})

        if service == 'call_taxon_search':
            if kwargs['fuzzy'] is True:
                fuzzy = 'true'
            else:
                fuzzy = 'false'
            params = _urlencode({
                'taxName': kwargs['taxonomic_identification'],
                'fuzzy': fuzzy,
            })

        if service == 'call_taxon_data':
            if kwargs['include_tree'] is False:
                params = _urlencode({
                    'taxId': kwargs['tax_id'],
                    'dataTypes': kwargs['data_type'],
                })
            else:
                params = _urlencode({
                    'taxId': kwargs['tax_id'],
                    'dataTypes': kwargs['data_type'],
                    'includeTree': 'true',
                })

        if service == 'call_specimen_data' or service == 'call_sequence_data' or \
                service == 'call_full_data' or service == 'call_trace_files':
            payload = dict()
            for k, v in kwargs.items():
                if v is not None and k != 'url':
                    payload[k] = v
            params = _urlencode(payload)

        url = kwargs['url'] + "?" + params
        req = _Request(url, headers={'User-Agent': 'BiopythonClient'})
        handle = _urlopen(req)
        response = Response()

        if service == 'call_trace_files':
            binary_result = handle.read()
            response._parse_data(service, binary_result)
        else:
            result = _as_string(handle.read())
            response._parse_data(service, result)
        return response
Example #9
0
def get_prosite_raw(id, cgi=None):
    """Get a text handle to a raw PROSITE or PRODOC record at ExPASy.

    The cgi argument is deprecated due to changes in the ExPASy
    website.

    >>> from Bio import ExPASy
    >>> from Bio.ExPASy import Prosite
    >>> handle = ExPASy.get_prosite_raw('PS00001')
    >>> record = Prosite.read(handle)
    >>> handle.close()
    >>> print(record.accession)
    PS00001

    For a non-existing key, ExPASy returns an error:

    >>> # Python 2/3 docstring workaround: Revise for 'Python 3 only'
    >>> try:
    ...    handle = ExPASy.get_prosite_raw("does_not_exist")
    ... except Exception as e:
    ...    print('HTTPError: %s' %e)
    HTTPError: ... Error 404: Not Found

    """
    url = "https://prosite.expasy.org/%s.txt" % id
    return _binary_to_string_handle(_urlopen(url))
Example #10
0
def search(text,
           output_format="tab",
           sort="score",
           oragnism="",
           columns=(),
           isoform=False,
           compress=False,
           offset=0,
           limit=0):
    """Perform a query over the UniProt API.

    More at: https://www.uniprot.org/help/api_queries
    """
    cgi = "https://www.uniprot.org/uniprot/?"
    variables = {
        "query": text,
        "format": output_format,
        "sort": sort,
        "offset": str(offset)
    }
    if oragnism:
        variables["organism"] = oragnism
    if columns:
        variables["columns"] = ",".join(columns)
    if isoform:
        variables["isoform"] = "Yes"
    if compress:
        variables["compress"] = "Yes"
    if limit:
        variables["limit"] = str(limit)

    fullcgi = "".join((cgi, _urlencode(variables)))
    return _binary_to_string_handle(_urlopen(fullcgi))
Example #11
0
def get_sprot_raw(id):
    """Get a handle to a raw SwissProt entry at ExPASy.

    For an ID of XXX, fetches http://www.uniprot.org/uniprot/XXX.txt
    (as per the http://www.expasy.ch/expasy_urls.html documentation).
    """
    return _urlopen("http://www.uniprot.org/uniprot/%s.txt" % id)
Example #12
0
def get_prosite_raw(id, cgi=None):
    """Get a text handle to a raw PROSITE or PRODOC record at ExPASy.

    The cgi argument is deprecated due to changes in the ExPASy
    website.

    For a non-existing key, ExPASy returns nothing.

    >>> from Bio import ExPASy
    >>> from Bio.ExPASy import Prosite
    >>> with ExPASy.get_prosite_raw('PS00001') as handle:
    ...    record = Prosite.read(handle)
    ...
    >>> print(record.accession)
    PS00001


    For a non-existing key, ExPASy returns an error:

    >>> handle = get_prosite_raw("does_not_exist")
    Traceback (most recent call last):
    ...
    urllib.error.HTTPError: HTTP Error 404: Not Found

    """
    url = "https://prosite.expasy.org/%s.txt" % id
    return _binary_to_string_handle(_urlopen(url))
Example #13
0
    def get_all_obsolete(self):
        """Returns a list of all obsolete entries ever in the PDB.

        Returns a list of all obsolete pdb codes that have ever been
        in the PDB.

        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used). The file looks
        like this::

             LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
            OBSLTE    31-JUL-94 116L     216L
            ...
            OBSLTE    29-JAN-96 1HFT     2HFT
            OBSLTE    21-SEP-06 1HFV     2J5X
            OBSLTE    21-NOV-03 1HG6
            OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
            OBSLTE    08-NOV-96 1HID     2HID
            OBSLTE    01-APR-97 1HIU     2HIU
            OBSLTE    14-JAN-04 1HKE     1UUZ
            ...

        """
        url = self.pdb_server + '/pub/pdb/data/status/obsolete.dat'
        with contextlib.closing(_urlopen(url)) as handle:
            # Extract pdb codes. Could use a list comprehension, but I want
            # to include an assert to check for mis-reading the data.
            obsolete = []
            for line in handle:
                if not line.startswith(b"OBSLTE "):
                    continue
                pdb = line.split()[2]
                assert len(pdb) == 4
                obsolete.append(_as_string(pdb))
        return obsolete
Example #14
0
    def get_all_obsolete(self):
        """Return a list of all obsolete entries ever in the PDB.

        Returns a list of all obsolete pdb codes that have ever been
        in the PDB.

        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used). The file looks
        like this::

             LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
            OBSLTE    31-JUL-94 116L     216L
            ...
            OBSLTE    29-JAN-96 1HFT     2HFT
            OBSLTE    21-SEP-06 1HFV     2J5X
            OBSLTE    21-NOV-03 1HG6
            OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
            OBSLTE    08-NOV-96 1HID     2HID
            OBSLTE    01-APR-97 1HIU     2HIU
            OBSLTE    14-JAN-04 1HKE     1UUZ
            ...

        """
        url = self.pdb_server + "/pub/pdb/data/status/obsolete.dat"
        with contextlib.closing(_urlopen(url)) as handle:
            # Extract pdb codes. Could use a list comprehension, but I want
            # to include an assert to check for mis-reading the data.
            obsolete = []
            for line in handle:
                if not line.startswith(b"OBSLTE "):
                    continue
                pdb = line.split()[2]
                assert len(pdb) == 4
                obsolete.append(_as_string(pdb))
        return obsolete
Example #15
0
    def get_recent_changes(self):
        """Returns three lists of the newest weekly files (added,mod,obsolete).

        Reads the directories with changed entries from the PDB server and
        returns a tuple of three URL's to the files of new, modified and
        obsolete entries from the most recent list. The directory with the
        largest numerical name is used.
        Returns None if something goes wrong.

        Contents of the data/status dir (20031013 would be used);
        drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
        drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
        -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README
        """
        url = self.pdb_server + '/pub/pdb/data/status/'
        with contextlib.closing(_urlopen(url)) as handle:
            recent = filter(str.isdigit,
                            (x.split()[-1] for x in handle.readlines()))[-1]

        path = self.pdb_server + '/pub/pdb/data/status/%s/' % (recent)

        # Retrieve the lists
        added = self.get_status_list(path + 'added.pdb')
        modified = self.get_status_list(path + 'modified.pdb')
        obsolete = self.get_status_list(path + 'obsolete.pdb')
        return [added, modified, obsolete]
Example #16
0
def scan(seq="", mirror='https://www.expasy.org', output='xml', **keywords):
    """Execute a ScanProsite search.

    Arguments:
     - mirror:   The ScanProsite mirror to be used
                 (default: https://www.expasy.org).
     - seq:      The query sequence, or UniProtKB (Swiss-Prot,
                 TrEMBL) accession
     - output:   Format of the search results
                 (default: xml)

    Further search parameters can be passed as keywords; see the
    documentation for programmatic access to ScanProsite at
    https://www.expasy.org/tools/scanprosite/ScanPrositeREST.html
    for a description of such parameters.

    This function returns a handle to the search results returned by
    ScanProsite. Search results in the XML format can be parsed into a
    Python object, by using the Bio.ExPASy.ScanProsite.read function.
    """
    parameters = {'seq': seq, 'output': output}
    for key, value in keywords.items():
        if value is not None:
            parameters[key] = value
    command = _urlencode(parameters)
    url = "%s/cgi-bin/prosite/PSScan.cgi?%s" % (mirror, command)
    handle = _urlopen(url)
    return handle
Example #17
0
def get_prosite_raw(id, cgi=None):
    """Get a text handle to a raw PROSITE or PRODOC record at ExPASy.

    The cgi argument is deprecated due to changes in the ExPASy
    website.

    For a non-existing key, ExPASy returns nothing.

    >>> from Bio import ExPASy
    >>> from Bio.ExPASy import Prosite
    >>> with ExPASy.get_prosite_raw('PS00001') as handle:
    ...    record = Prosite.read(handle)
    ...
    >>> print(record.accession)
    PS00001


    For a non-existing key, ExPASy returns an error:

    >>> handle = get_prosite_raw("does_not_exist")
    Traceback (most recent call last):
    ...
    urllib.error.HTTPError: HTTP Error 404: Not Found

    """
    url = "http://prosite.expasy.org/%s.txt" % id
    return _binary_to_string_handle(_urlopen(url))
Example #18
0
def get_sprot_raw(id):
    """Get a text handle to a raw SwissProt entry at ExPASy.

    For an ID of XXX, fetches http://www.uniprot.org/uniprot/XXX.txt
    (as per the https://www.expasy.org/expasy_urls.html documentation).

    >>> from Bio import ExPASy
    >>> from Bio import SwissProt
    >>> handle = ExPASy.get_sprot_raw("O23729")
    >>> record = SwissProt.read(handle)
    >>> handle.close()
    >>> print(record.entry_name)
    CHS3_BROFI

    For a non-existing identifier, UniProt returns an error:

    >>> # Python2/3 docstring workaround: Revise for 'Python 3 only'
    >>> try:
    ...    ExPASy.get_sprot_raw("DOES_NOT_EXIST")
    ... except Exception as e:
    ...    print('HTTPError: %s' %e)
    HTTPError: ... Error 404: 

    """  # noqa: W291
    url = "http://www.uniprot.org/uniprot/%s.txt" % id
    return _binary_to_string_handle(_urlopen(url))
Example #19
0
 def startElementHandler(self, name, attrs):
     # preprocessing the xml schema
     if self.is_schema:
         if len(attrs) == 1:
             schema = list(attrs.values())[0]
             handle = self.open_xsd_file(os.path.basename(schema))
             # if there is no local xsd file grab the url and parse the file
             if not handle:
                 handle = _urlopen(schema)
                 text = handle.read()
                 self.save_xsd_file(os.path.basename(schema), text)
                 handle.close()
                 self.parse_xsd(ET.fromstring(text))
             else:
                 self.parse_xsd(ET.fromstring(handle.read()))
                 handle.close()
     self.content = ""
     if name in self.lists:
         object = ListElement()
     elif name in self.dictionaries:
         object = DictionaryElement()
     elif name in self.structures:
         object = StructureElement(self.structures[name])
     elif name in self.items:  # Only appears in ESummary
         name = str(attrs["Name"])  # convert from Unicode
         del attrs["Name"]
         itemtype = str(attrs["Type"])  # convert from Unicode
         del attrs["Type"]
         if itemtype == "Structure":
             object = DictionaryElement()
         elif name in ("ArticleIds", "History"):
             object = StructureElement(["pubmed", "medline"])
         elif itemtype == "List":
             object = ListElement()
         else:
             object = StringElement()
         object.itemname = name
         object.itemtype = itemtype
     elif name in self.strings + self.errors + self.integers:
         self.attributes = attrs
         return
     else:
         # Element not found in DTD
         if self.validating:
             raise ValidationError(name)
         else:
             # this will not be stored in the record
             object = ""
     if object != "":
         object.tag = name
         if attrs:
             object.attributes = dict(attrs)
         if len(self.stack) != 0:
             current = self.stack[-1]
             try:
                 current.append(object)
             except AttributeError:
                 current[name] = object
     self.stack.append(object)
Example #20
0
 def startElementHandler(self, name, attrs):
     # preprocessing the xml schema
     if self.is_schema:
         if len(attrs) == 1:
             schema = list(attrs.values())[0]
             handle = self.open_xsd_file(os.path.basename(schema))
             # if there is no local xsd file grab the url and parse the file
             if not handle:
                 handle = _urlopen(schema)
                 text = handle.read()
                 self.save_xsd_file(os.path.basename(schema), text)
                 handle.close()
                 self.parse_xsd(ET.fromstring(text))
             else:
                 self.parse_xsd(ET.fromstring(handle.read()))
                 handle.close()
     self.content = ""
     if name in self.lists:
         object = ListElement()
     elif name in self.dictionaries:
         object = DictionaryElement()
     elif name in self.structures:
         object = StructureElement(self.structures[name])
     elif name in self.items:  # Only appears in ESummary
         name = str(attrs["Name"])  # convert from Unicode
         del attrs["Name"]
         itemtype = str(attrs["Type"])  # convert from Unicode
         del attrs["Type"]
         if itemtype == "Structure":
             object = DictionaryElement()
         elif name in ("ArticleIds", "History"):
             object = StructureElement(["pubmed", "medline"])
         elif itemtype == "List":
             object = ListElement()
         else:
             object = StringElement()
         object.itemname = name
         object.itemtype = itemtype
     elif name in self.strings + self.errors + self.integers:
         self.attributes = attrs
         return
     else:
         # Element not found in DTD
         if self.validating:
             raise ValidationError(name)
         else:
             # this will not be stored in the record
             object = ""
     if object != "":
         object.tag = name
         if attrs:
             object.attributes = dict(attrs)
         if len(self.stack) != 0:
             current = self.stack[-1]
             try:
                 current.append(object)
             except AttributeError:
                 current[name] = object
     self.stack.append(object)
Example #21
0
def _q(op, arg1, arg2=None, arg3=None):
    URL = "http://rest.kegg.jp/%s"
    if arg2 and arg3:
        args = "%s/%s/%s/%s" % (op, arg1, arg2, arg3)
    elif arg2:
        args = "%s/%s/%s" % (op, arg1, arg2)
    else:
        args = "%s/%s" % (op, arg1)
    return _urlopen(URL % (args))
Example #22
0
def _q(op, arg1, arg2=None, arg3=None):
    URL = "http://rest.kegg.jp/%s"
    if arg2 and arg3:
        args = "%s/%s/%s/%s" % (op, arg1, arg2, arg3)
    elif arg2:
        args = "%s/%s/%s" % (op, arg1, arg2)
    else:
        args = "%s/%s" % (op, arg1)
    return _urlopen(URL % (args))
Example #23
0
def _open(cgi, params=None, post=None, ecitmatch=False):
    """Build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it.  Does some
    simple error checking, and will raise an IOError if it encounters one.

    The arugment post should be a boolean to explicitly control if an HTTP
    POST should be used rather an HTTP GET based on the query length.
    By default (post=None), POST is used if the URL encoded paramters would
    be over 1000 characters long.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # NCBI requirement: At most three queries per second if no API key is provided.
    # Equivalently, at least a third of second between queries
    params = _construct_params(params)
    options = _encode_options(ecitmatch, params)
    # Using just 0.333333334 seconds sometimes hit the NCBI rate limit,
    # the slightly longer pause of 0.37 seconds has been more reliable.
    delay = 0.1 if api_key else 0.37
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # By default, post is None. Set to a boolean to over-ride length choice:
    if post is None and len(options) > 1000:
        post = True
    cgi = _construct_cgi(cgi, post, options)

    try:
        if post:
            handle = _urlopen(cgi, data=_as_bytes(options))
        else:
            handle = _urlopen(cgi)
    except _HTTPError as exception:
        raise exception

    return _binary_to_string_handle(handle)
Example #24
0
def get_prosite_raw(id, cgi='http://www.expasy.ch/cgi-bin/get-prosite-raw.pl'):
    """get_prosite_raw(id,
                       cgi='http://www.expasy.ch/cgi-bin/get-prosite-raw.pl')
    -> handle

    Get a handle to a raw PROSITE or PRODOC entry at ExPASy.

    For a non-existing key, ExPASy returns nothing.
    """
    return _urlopen("%s?%s" % (cgi, id))
Example #25
0
def _open(cgi, params=None, post=None, ecitmatch=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it.  Does some
    simple error checking, and will raise an IOError if it encounters one.

    The arugment post should be a boolean to explicitly control if an HTTP
    POST should be used rather an HTTP GET based on the query length.
    By default (post=None), POST is used if the URL encoded paramters would
    be over 1000 characters long.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    params = _construct_params(params)
    options = _encode_options(ecitmatch, params)

    # By default, post is None. Set to a boolean to over-ride length choice:
    if post is None and len(options) > 1000:
        post = True
    cgi = _construct_cgi(cgi, post, options)

    try:
        if post:
            handle = _urlopen(cgi, data=_as_bytes(options))
        else:
            handle = _urlopen(cgi)
    except _HTTPError as exception:
        raise exception

    return _binary_to_string_handle(handle)
Example #26
0
def get_prosite_raw(id, cgi='http://www.expasy.ch/cgi-bin/get-prosite-raw.pl'):
    """get_prosite_raw(id,
                       cgi='http://www.expasy.ch/cgi-bin/get-prosite-raw.pl')
    -> handle

    Get a handle to a raw PROSITE or PRODOC entry at ExPASy.

    For a non-existing key, ExPASy returns nothing.
    """
    return _urlopen("%s?%s" % (cgi, id))
Example #27
0
    def get_all_entries(self):
        """Retrieves a big file containing all the PDB entries and some annotation.

        Returns a list of PDB codes in the index file.
        """
        url = self.pdb_server + '/pub/pdb/derived_data/index/entries.idx'
        print("Retrieving index file. Takes about 27 MB.")
        with contextlib.closing(_urlopen(url)) as handle:
            all_entries = [_as_string(line[:4]) for line in handle.readlines()[2:]
                           if len(line) > 4]
        return all_entries
Example #28
0
    def externalEntityRefHandler(self, context, base, systemId, publicId):
        """Handle external entity reference in order to cache DTD locally.

        The purpose of this function is to load the DTD locally, instead
        of downloading it from the URL specified in the XML. Using the local
        DTD results in much faster parsing. If the DTD is not found locally,
        we try to download it. If new DTDs become available from NCBI,
        putting them in Bio/Entrez/DTDs will allow the parser to see them.
        """
        urlinfo = _urlparse(systemId)
        # Following attribute requires Python 2.5+
        # if urlinfo.scheme=='http':
        if urlinfo[0] in ["http", "https", "ftp"]:
            # Then this is an absolute path to the DTD.
            url = systemId
        elif urlinfo[0] == "":
            # Then this is a relative path to the DTD.
            # Look at the parent URL to find the full path.
            try:
                source = self.dtd_urls[-1]
            except IndexError:
                # Assume the default URL for DTDs if the top parent
                # does not contain an absolute path
                source = "http://www.ncbi.nlm.nih.gov/dtd/"
            else:
                source = os.path.dirname(source)
            # urls always have a forward slash, don't use os.path.join
            url = source.rstrip("/") + "/" + systemId
        else:
            raise ValueError("Unexpected URL scheme %r" % (urlinfo[0]))
        self.dtd_urls.append(url)
        # First, try to load the local version of the DTD file
        location, filename = os.path.split(systemId)
        handle = self.open_dtd_file(filename)
        if not handle:
            # DTD is not available as a local file. Try accessing it through
            # the internet instead.
            try:
                handle = _urlopen(url)
            except IOError:
                _raise_from(
                    RuntimeError("Failed to access %s at %s" % (filename, url)), None
                )
            text = handle.read()
            handle.close()
            self.save_dtd_file(filename, text)
            handle = BytesIO(text)

        parser = self.parser.ExternalEntityParserCreate(context)
        parser.ElementDeclHandler = self.elementDecl
        parser.ParseFile(handle)
        handle.close()
        self.dtd_urls.pop()
        return 1
Example #29
0
def get_prosite_entry(id,
                      cgi='http://www.expasy.ch/cgi-bin/get-prosite-entry'):
    """get_prosite_entry(id,
    cgi='http://www.expasy.ch/cgi-bin/get-prosite-entry') -> handle

    Get a handle to a PROSITE entry at ExPASy in HTML format.

    For a non-existing key XXX, ExPASy returns an HTML-formatted page
    containing this line:
    'There is currently no PROSITE entry for XXX. Please try again.'
    """
    return _urlopen("%s?%s" % (cgi, id))
Example #30
0
def get_prodoc_entry(id, cgi='http://www.expasy.ch/cgi-bin/get-prodoc-entry'):
    """get_prodoc_entry(id,
    cgi='http://www.expasy.ch/cgi-bin/get-prodoc-entry') -> handle

    Get a handle to a PRODOC entry at ExPASy in HTML format.

    For a non-existing key XXX, ExPASy returns an HTML-formatted page
    containing this line:
    'There is no PROSITE documentation entry XXX. Please try again.'
    """
    # Open a handle to ExPASy.
    return _urlopen("%s?%s" % (cgi, id))
Example #31
0
def get_prosite_entry(id,
                      cgi='http://www.expasy.ch/cgi-bin/get-prosite-entry'):
    """get_prosite_entry(id,
    cgi='http://www.expasy.ch/cgi-bin/get-prosite-entry') -> handle

    Get a handle to a PROSITE entry at ExPASy in HTML format.

    For a non-existing key XXX, ExPASy returns an HTML-formatted page
    containing this line:
    'There is currently no PROSITE entry for XXX. Please try again.'
    """
    return _urlopen("%s?%s" % (cgi, id))
Example #32
0
def get_prodoc_entry(id, cgi='http://www.expasy.ch/cgi-bin/get-prodoc-entry'):
    """get_prodoc_entry(id,
    cgi='http://www.expasy.ch/cgi-bin/get-prodoc-entry') -> handle

    Get a handle to a PRODOC entry at ExPASy in HTML format.

    For a non-existing key XXX, ExPASy returns an HTML-formatted page
    containing this line:
    'There is no PROSITE documentation entry XXX. Please try again.'
    """
    # Open a handle to ExPASy.
    return _urlopen("%s?%s" % (cgi, id))
Example #33
0
    def externalEntityRefHandler(self, context, base, systemId, publicId):
        """Handle external entiry reference in order to cache DTD locally.

        The purpose of this function is to load the DTD locally, instead
        of downloading it from the URL specified in the XML. Using the local
        DTD results in much faster parsing. If the DTD is not found locally,
        we try to download it. If new DTDs become available from NCBI,
        putting them in Bio/Entrez/DTDs will allow the parser to see them.
        """
        urlinfo = _urlparse(systemId)
        # Following attribute requires Python 2.5+
        # if urlinfo.scheme=='http':
        if urlinfo[0] in ['http', 'https', 'ftp']:
            # Then this is an absolute path to the DTD.
            url = systemId
        elif urlinfo[0] == '':
            # Then this is a relative path to the DTD.
            # Look at the parent URL to find the full path.
            try:
                source = self.dtd_urls[-1]
            except IndexError:
                # Assume the default URL for DTDs if the top parent
                # does not contain an absolute path
                source = "http://www.ncbi.nlm.nih.gov/dtd/"
            else:
                source = os.path.dirname(source)
            # urls always have a forward slash, don't use os.path.join
            url = source.rstrip("/") + "/" + systemId
        else:
            raise ValueError("Unexpected URL scheme %r" % (urlinfo[0]))
        self.dtd_urls.append(url)
        # First, try to load the local version of the DTD file
        location, filename = os.path.split(systemId)
        handle = self.open_dtd_file(filename)
        if not handle:
            # DTD is not available as a local file. Try accessing it through
            # the internet instead.
            try:
                handle = _urlopen(url)
            except IOError:
                raise RuntimeError("Failed to access %s at %s" % (filename, url))
            text = handle.read()
            handle.close()
            self.save_dtd_file(filename, text)
            handle = BytesIO(text)

        parser = self.parser.ExternalEntityParserCreate(context)
        parser.ElementDeclHandler = self.elementDecl
        parser.ParseFile(handle)
        handle.close()
        self.dtd_urls.pop()
        return 1
Example #34
0
    def get_status_list(url):
        """Retrieve a list of pdb codes in the weekly pdb status file from given URL.

        Used by get_recent_changes. Typical contents of the list files parsed
        by this method is now very simply - one PDB name per line.
        """
        with contextlib.closing(_urlopen(url)) as handle:
            answer = []
            for line in handle:
                pdb = line.strip()
                assert len(pdb) == 4
                answer.append(_as_string(pdb))
        return answer
Example #35
0
    def get_status_list(url):
        """Retrieve a list of pdb codes in the weekly pdb status file from given URL.

        Used by get_recent_changes. Typical contents of the list files parsed
        by this method is now very simply - one PDB name per line.
        """
        with contextlib.closing(_urlopen(url)) as handle:
            answer = []
            for line in handle:
                pdb = line.strip()
                assert len(pdb) == 4
                answer.append(_as_string(pdb))
        return answer
Example #36
0
def _q(op, arg1, arg2=None, arg3=None):
    URL = "http://rest.kegg.jp/%s"
    if arg2 and arg3:
        args = "%s/%s/%s/%s" % (op, arg1, arg2, arg3)
    elif arg2:
        args = "%s/%s/%s" % (op, arg1, arg2)
    else:
        args = "%s/%s" % (op, arg1)
    resp = _urlopen(URL % (args))

    if "image" == arg2:
        return resp

    return _binary_to_string_handle(resp)
def sprot_search_ful(text, make_wild=None, swissprot=1, trembl=None,
                     cgi='http://www.expasy.ch/cgi-bin/sprot-search-ful'):
    """Search SwissProt by full text (BROKEN)."""
    variables = {'SEARCH': text}
    if make_wild:
        variables['makeWild'] = 'on'
    if swissprot:
        variables['S'] = 'on'
    if trembl:
        variables['T'] = 'on'
    options = _urlencode(variables)
    fullcgi = "%s?%s" % (cgi, options)
    handle = _binary_to_string_handle(_urlopen(fullcgi))
    return handle
Example #38
0
def _q(op, arg1, arg2=None, arg3=None):
    URL = "http://rest.kegg.jp/%s"
    if arg2 and arg3:
        args = "%s/%s/%s/%s" % (op, arg1, arg2, arg3)
    elif arg2:
        args = "%s/%s/%s" % (op, arg1, arg2)
    else:
        args = "%s/%s" % (op, arg1)
    resp = _urlopen(URL % (args))

    if "image" == arg2:
        return resp

    return _binary_to_string_handle(resp)
Example #39
0
def get_temp_imagefilename(url):
    """Returns filename of temporary file containing downloaded image.

    Create a new temporary file to hold the image file at the passed URL
    and return the filename.
    """
    img = _urlopen(url).read()
    im = Image.open(BtyesIO(img))
    #im.transpose(Image.FLIP_TOP_BOTTOM)
    f = tempfile.NamedTemporaryFile(delete=False, suffix='.png')
    fname = f.name
    f.close()
    im.save(fname, 'PNG')
    return fname
Example #40
0
def get_temp_imagefilename(url):
    """Returns filename of temporary file containing downloaded image.

    Create a new temporary file to hold the image file at the passed URL
    and return the filename.
    """
    img = _urlopen(url).read()
    im = Image.open(BytesIO(img))
    # im.transpose(Image.FLIP_TOP_BOTTOM)
    f = tempfile.NamedTemporaryFile(delete=False, suffix='.png')
    fname = f.name
    f.close()
    im.save(fname, 'PNG')
    return fname
def sprot_search_de(text, swissprot=1, trembl=None,
                    cgi='http://www.expasy.ch/cgi-bin/sprot-search-de'):
    """Search SwissProt (BROKEN).

    Search by name, description, gene name, species, or organelle.
    """
    variables = {'SEARCH': text}
    if swissprot:
        variables['S'] = 'on'
    if trembl:
        variables['T'] = 'on'
    options = _urlencode(variables)
    fullcgi = "%s?%s" % (cgi, options)
    return _binary_to_string_handle(_urlopen(fullcgi))
Example #42
0
def get_prosite_entry(
        id, cgi='http://prosite.expasy.org/cgi-bin/prosite/get-prosite-entry'):
    """Get a text handle to a PROSITE entry at ExPASy in HTML format.

    >>> from Bio import ExPASy
    >>> with ExPASy.get_prosite_entry('PS00001') as in_handle:
    ...     html = in_handle.read()
    ...
    >>> with open("myprositerecord.html", "w") as out_handle:
    ...     out_handle.write(html)
    ...

    For a non-existing key XXX, ExPASy returns an HTML-formatted page
    containing this text: 'There is currently no PROSITE entry for'
    """
    return _binary_to_string_handle(_urlopen("%s?%s" % (cgi, id)))
Example #43
0
def get_prosite_entry(id,
                      cgi='https://prosite.expasy.org/cgi-bin/prosite/get-prosite-entry'):
    """Get a text handle to a PROSITE entry at ExPASy in HTML format.

    >>> from Bio import ExPASy
    >>> with ExPASy.get_prosite_entry('PS00001') as in_handle:
    ...     html = in_handle.read()
    ...
    >>> with open("myprositerecord.html", "w") as out_handle:
    ...     out_handle.write(html)
    ...

    For a non-existing key XXX, ExPASy returns an HTML-formatted page
    containing this text: 'There is currently no PROSITE entry for'
    """
    return _binary_to_string_handle(_urlopen("%s?%s" % (cgi, id)))
Example #44
0
def get_prodoc_entry(id,
                     cgi='https://prosite.expasy.org/cgi-bin/prosite/get-prodoc-entry'):
    """Get a text handle to a PRODOC entry at ExPASy in HTML format.

    >>> from Bio import ExPASy
    >>> in_handle = ExPASy.get_prodoc_entry('PDOC00001')
    >>> html = in_handle.read()
    >>> in_handle.close()
    ...
    >>> with open("myprodocrecord.html", "w") as out_handle:
    ...     # Python2/3 docstring workaround: Revise for 'Python 3 only'
    ...     _ = out_handle.write(html)
    ...

    For a non-existing key XXX, ExPASy returns an HTML-formatted page
    containing this text: 'There is currently no PROSITE entry for'
    """
    return _binary_to_string_handle(_urlopen("%s?%s" % (cgi, id)))
Example #45
0
def sprot_search_de(text, swissprot=1, trembl=None,
                    cgi='http://www.expasy.ch/cgi-bin/sprot-search-de'):
    """sprot_search_de(text, swissprot=1, trembl=None,
    cgi='http://www.expasy.ch/cgi-bin/sprot-search-de') -> handle

    Search SwissProt by name, description, gene name, species, or
    organelle.

    """
    variables = {'SEARCH': text}
    if swissprot:
        variables['S'] = 'on'
    if trembl:
        variables['T'] = 'on'
    options = _urlencode(variables)
    fullcgi = "%s?%s" % (cgi, options)
    handle = _urlopen(fullcgi)
    return handle
Example #46
0
 def startElementHandler(self, name, attrs):
     # First, check if the current consumer can use the tag
     if self.consumer is not None:
         consumed = self.consumer.startElementHandler(name, attrs)
         if consumed:
             return
     # preprocessing the xml schema
     if self.is_schema:
         if len(attrs) == 1:
             schema = list(attrs.values())[0]
             handle = self.open_xsd_file(os.path.basename(schema))
             # if there is no local xsd file grab the url and parse the file
             if not handle:
                 handle = _urlopen(schema)
                 text = handle.read()
                 self.save_xsd_file(os.path.basename(schema), text)
                 handle.close()
                 self.parse_xsd(ET.fromstring(text))
             else:
                 self.parse_xsd(ET.fromstring(handle.read()))
                 handle.close()
     cls = self.classes.get(name)
     if cls is None:
         # Element not found in DTD
         if self.validating:
             raise ValidationError(name)
         else:
             # this will not be stored in the record
             consumer = Consumer(name, attrs)
     else:
         consumer = cls(name, attrs)
     consumer.parent = self.consumer
     if self.consumer is None:
         # This is relevant only for Entrez.parse, not for Entrez.read.
         # If self.consumer is None, then this is the first start tag we
         # encounter, and it should refer to a list. Store this list in
         # the record attribute, so that Entrez.parse can iterate over it.
         # The record attribute will be set again at the last end tag;
         # However, it doesn't hurt to set it twice.
         value = consumer.value
         if value is not None:
             self.record = value
     self.consumer = consumer
Example #47
0
 def schemaHandler(self, name, attrs):
     """Process the XML schema (before processing the element)."""
     key = "%s noNamespaceSchemaLocation" % self.schema_namespace
     schema = attrs[key]
     handle = self.open_xsd_file(os.path.basename(schema))
     # if there is no local xsd file grab the url and parse the file
     if not handle:
         handle = _urlopen(schema)
         text = handle.read()
         self.save_xsd_file(os.path.basename(schema), text)
         handle.close()
         self.parse_xsd(ET.fromstring(text))
     else:
         self.parse_xsd(ET.fromstring(handle.read()))
         handle.close()
     # continue handling the element
     self.startElementHandler(name, attrs)
     # reset the element handler
     self.parser.StartElementHandler = self.startElementHandler
Example #48
0
 def startElementHandler(self, name, attrs):
     # First, check if the current consumer can use the tag
     if self.consumer is not None:
         consumed = self.consumer.startElementHandler(name, attrs)
         if consumed:
             return
     # preprocessing the xml schema
     if self.is_schema:
         if len(attrs) == 1:
             schema = list(attrs.values())[0]
             handle = self.open_xsd_file(os.path.basename(schema))
             # if there is no local xsd file grab the url and parse the file
             if not handle:
                 handle = _urlopen(schema)
                 text = handle.read()
                 self.save_xsd_file(os.path.basename(schema), text)
                 handle.close()
                 self.parse_xsd(ET.fromstring(text))
             else:
                 self.parse_xsd(ET.fromstring(handle.read()))
                 handle.close()
     cls = self.classes.get(name)
     if cls is None:
         # Element not found in DTD
         if self.validating:
             raise ValidationError(name)
         else:
             # this will not be stored in the record
             consumer = Consumer(name, attrs)
     else:
         consumer = cls(name, attrs)
     consumer.parent = self.consumer
     if self.consumer is None:
         # This is relevant only for Entrez.parse, not for Entrez.read.
         # If self.consumer is None, then this is the first start tag we
         # encounter, and it should refer to a list. Store this list in
         # the record attribute, so that Entrez.parse can iterate over it.
         # The record attribute will be set again at the last end tag;
         # However, it doesn't hurt to set it twice.
         value = consumer.value
         if value is not None:
             self.record = value
     self.consumer = consumer
Example #49
0
def sprot_search_ful(text, make_wild=None, swissprot=1, trembl=None,
                     cgi='http://www.expasy.ch/cgi-bin/sprot-search-ful'):
    """sprot_search_ful(text, make_wild=None, swissprot=1, trembl=None,
    cgi='http://www.expasy.ch/cgi-bin/sprot-search-ful') -> handle

    Search SwissProt by full text.

    """
    variables = {'SEARCH': text}
    if make_wild:
        variables['makeWild'] = 'on'
    if swissprot:
        variables['S'] = 'on'
    if trembl:
        variables['T'] = 'on'
    options = _urlencode(variables)
    fullcgi = "%s?%s" % (cgi, options)
    handle = _urlopen(fullcgi)
    return handle
Example #50
0
 def schemaHandler(self, name, attrs):
     """Process the XML schema (before processing the element)."""
     key = "%s noNamespaceSchemaLocation" % self.schema_namespace
     schema = attrs[key]
     handle = self.open_xsd_file(os.path.basename(schema))
     # if there is no local xsd file grab the url and parse the file
     if not handle:
         handle = _urlopen(schema)
         text = handle.read()
         self.save_xsd_file(os.path.basename(schema), text)
         handle.close()
         self.parse_xsd(ET.fromstring(text))
     else:
         self.parse_xsd(ET.fromstring(handle.read()))
         handle.close()
     # continue handling the element
     self.startElementHandler(name, attrs)
     # reset the element handler
     self.parser.StartElementHandler = self.startElementHandler
Example #51
0
def get_sprot_raw(id):
    """Get a text handle to a raw SwissProt entry at ExPASy.

    For an ID of XXX, fetches http://www.uniprot.org/uniprot/XXX.txt
    (as per the https://www.expasy.org/expasy_urls.html documentation).

    >>> from Bio import ExPASy
    >>> from Bio import SwissProt
    >>> with ExPASy.get_sprot_raw("O23729") as handle:
    ...     record = SwissProt.read(handle)
    ...
    >>> print(record.entry_name)
    CHS3_BROFI

    For a non-existing identifier, UniProt returns an error:

    >>> ExPASy.get_sprot_raw("DOES_NOT_EXIST")
    Traceback (most recent call last):
    ...
    urllib.error.HTTPError: HTTP Error 404: Not Found

    """
    url = "http://www.uniprot.org/uniprot/%s.txt" % id
    return _binary_to_string_handle(_urlopen(url))
Example #52
0
def get_sprot_raw(id):
    """Get a text handle to a raw SwissProt entry at ExPASy.

    For an ID of XXX, fetches http://www.uniprot.org/uniprot/XXX.txt
    (as per the http://www.expasy.ch/expasy_urls.html documentation).

    >>> from Bio import ExPASy
    >>> from Bio import SwissProt
    >>> with ExPASy.get_sprot_raw("O23729") as handle:
    ...     record = SwissProt.read(handle)
    ...
    >>> print(record.entry_name)
    CHS3_BROFI

    For a non-existing identifier, UniProt returns an error:

    >>> ExPASy.get_sprot_raw("DOES_NOT_EXIST")
    Traceback (most recent call last):
    ...
    urllib.error.HTTPError: HTTP Error 404: Not Found

    """
    url = "http://www.uniprot.org/uniprot/%s.txt" % id
    return _binary_to_string_handle(_urlopen(url))
Example #53
0
def qblast(program, database, sequence, url_base=NCBI_BLAST_URL,
           auto_format=None, composition_based_statistics=None,
           db_genetic_code=None, endpoints=None, entrez_query='(none)',
           expect=10.0, filter=None, gapcosts=None, genetic_code=None,
           hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
           matrix_name=None, nucl_penalty=None, nucl_reward=None,
           other_advanced=None, perc_ident=None, phi_pattern=None,
           query_file=None, query_believe_defline=None, query_from=None,
           query_to=None, searchsp_eff=None, service=None, threshold=None,
           ungapped_alignment=None, word_size=None,
           alignments=500, alignment_view=None, descriptions=500,
           entrez_links_new_window=None, expect_low=None, expect_high=None,
           format_entrez_query=None, format_object=None, format_type='XML',
           ncbi_gi=None, results_file=None, show_overview=None, megablast=None,
           ):
    """Do a BLAST search using the QBLAST server at NCBI or a cloud service
    provider.

    Supports all parameters of the qblast API for Put and Get.

    Please note that BLAST on the cloud supports the NCBI-BLAST Common
    URL API (http://ncbi.github.io/blast-cloud/dev/api.html). To
    use this feature, please set url_base to
    'http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi' and
    format_object='Alignment'. For more details, please see
    https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast

    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show.  Def 500.
     - alignments     Number of alignments to show.  Def 500.
     - expect         An expect value cutoff.  Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering.  Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use MEga BLAST algorithm (blastn only)
     - service        plain, psi, phi, rpsblast, megablast (lower case)

    This function does no checking of the validity of the parameters
    and passes the values to the server as is.  More help is available at:
    http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html

    """
    import time

    assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        # ('RESULTS_FILE',...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request(url_base,
                       message,
                       {"User-Agent": "BiopythonClient"})
    handle = _urlopen(request)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    rid, rtoe = _parse_qblast_ref_page(handle)
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.  Use a backoff delay from 2 - 120 second wait
    delay = 2.0
    previous = time.time()
    while True:
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current
        if delay + .5 * delay <= 120:
            delay += .5 * delay
        else:
            delay = 120

        request = _Request(url_base,
                           message,
                           {"User-Agent": "BiopythonClient"})
        handle = _urlopen(request)
        results = _as_string(handle.read())

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
Example #54
0
def _open(cgi, params=None, post=None, ecitmatch=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it.  Does some
    simple error checking, and will raise an IOError if it encounters one.

    The arugment post should be a boolean to explicitly control if an HTTP
    POST should be used rather an HTTP GET based on the query length.
    By default (post=None), POST is used if the query URL would be over
    1000 characters long.

    The arugment post should be a boolean to explicitly control if an HTTP
    POST should be used rather an HTTP GET based on the query length.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    if params is None:
        params = {}
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # Remove None values from the parameters
    for key, value in list(params.items()):
        if value is None:
            del params[key]
    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in params:
        params["tool"] = tool
    # Tell Entrez who we are
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is [email protected], you can specify it as follows:
   from Bio import Entrez
   Entrez.email = '*****@*****.**'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)

    # Open a handle to Entrez.
    options = _urlencode(params, doseq=True)
    # _urlencode encodes pipes, which NCBI expects in ECitMatch
    if ecitmatch:
        options = options.replace('%7C', '|')
    # print cgi + "?" + options

    # By default, post is None. Set to a boolean to over-ride length choice:
    if post is None and len(options) > 1000:
        post = True
    try:
        if post:
            # HTTP POST
            handle = _urlopen(cgi, data=_as_bytes(options))
        else:
            # HTTP GET
            cgi += "?" + options
            handle = _urlopen(cgi)
    except _HTTPError as exception:
        raise exception

    return _binary_to_string_handle(handle)
Example #55
0
 def fill_hot_cache(self):
     url = self.url + _urlencode(self.query)
     fh = _urlopen(url)
     self.hot_cache = fh.read()
     fh.close()
Example #56
0
def qblast(
    program,
    database,
    sequence,
    url_base=NCBI_BLAST_URL,
    auto_format=None,
    composition_based_statistics=None,
    db_genetic_code=None,
    endpoints=None,
    entrez_query='(none)',
    expect=10.0,
    filter=None,
    gapcosts=None,
    genetic_code=None,
    hitlist_size=50,
    i_thresh=None,
    layout=None,
    lcase_mask=None,
    matrix_name=None,
    nucl_penalty=None,
    nucl_reward=None,
    other_advanced=None,
    perc_ident=None,
    phi_pattern=None,
    query_file=None,
    query_believe_defline=None,
    query_from=None,
    query_to=None,
    searchsp_eff=None,
    service=None,
    threshold=None,
    ungapped_alignment=None,
    word_size=None,
    alignments=500,
    alignment_view=None,
    descriptions=500,
    entrez_links_new_window=None,
    expect_low=None,
    expect_high=None,
    format_entrez_query=None,
    format_object=None,
    format_type='XML',
    ncbi_gi=None,
    results_file=None,
    show_overview=None,
    megablast=None,
    template_type=None,
    template_length=None,
):
    """BLAST search using NCBI's QBLAST server or a cloud service provider.

    Supports all parameters of the qblast API for Put and Get.

    Please note that BLAST on the cloud supports the NCBI-BLAST Common
    URL API (http://ncbi.github.io/blast-cloud/dev/api.html). To
    use this feature, please set url_base to
    'http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi' and
    format_object='Alignment'. For more details, please see
    https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast

    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show.  Def 500.
     - alignments     Number of alignments to show.  Def 500.
     - expect         An expect value cutoff.  Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering.  Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use MEga BLAST algorithm (blastn only)
     - service        plain, psi, phi, rpsblast, megablast (lower case)

    This function does no checking of the validity of the parameters
    and passes the values to the server as is.  More help is available at:
    https://ncbi.github.io/blast-cloud/dev/api.html

    """
    import time

    programs = ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']
    if program not in programs:
        raise ValueError("Program specified is %s. Expected one of %s" %
                         (program, ", ".join(programs)))

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        # ('RESULTS_FILE',...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('TEMPLATE_TYPE', template_type),
        ('TEMPLATE_LENGTH', template_length),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request(url_base, message, {"User-Agent": "BiopythonClient"})
    handle = _urlopen(request)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    rid, rtoe = _parse_qblast_ref_page(handle)
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.
    # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
    # 1. Do not contact the server more often than once every 10 seconds.
    # 2. Do not poll for any single RID more often than once a minute.
    # 3. Use the URL parameter email and tool, so that the NCBI
    #    can contact you if there is a problem.
    # 4. Run scripts weekends or between 9 pm and 5 am Eastern time
    #    on weekdays if more than 50 searches will be submitted.
    # --
    # Could start with a 10s delay, but expect most short queries
    # will take longer thus at least 70s with delay. Therefore,
    # start with 20s delay, thereafter once a minute.
    delay = 20  # seconds
    while True:
        current = time.time()
        wait = qblast._previous + delay - current
        if wait > 0:
            time.sleep(wait)
            qblast._previous = current + wait
        else:
            qblast._previous = current
        # delay by at least 60 seconds only if running the request against the public NCBI API
        if delay < 60 and url_base == NCBI_BLAST_URL:
            # Wasn't a quick return, must wait at least a minute
            delay = 60

        request = _Request(url_base, message,
                           {"User-Agent": "BiopythonClient"})
        handle = _urlopen(request)
        results = _as_string(handle.read())

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break
    return StringIO(results)
Example #57
0
def _open(cgi, params=None, ecitmatch=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it.  Does some
    simple error checking, and will raise an IOError if it encounters one.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers, and makes the request through POST
    rather than GET if the number of characters in the resulting query is
    greater than 1000.
    """
    if params is None:
        params = {}
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # Remove None values from the parameters
    for key, value in list(params.items()):
        if value is None:
            del params[key]
    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in params:
        params["tool"] = tool
    # Tell Entrez who we are
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is [email protected], you can specify it as follows:
   from Bio import Entrez
   Entrez.email = '*****@*****.**'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)
    
    # By default, we do not force a POST request
    force_post = False
    
    # Make sure the UIDs are in the format UID,UID,...
    ids = params.get("id", None)
    if ids is not None:
        # Detect whether 200+ UIDs have been provided, and convert the list
        # [UID, UID, ...] into the string "UID,UID,..."
        if isinstance(ids, list):
            params["id"] = ",".join(ids)
        elif isinstance(ids, str):
            ids = ids.split(",")
        
        # If 200+ UIDs are given, force the POST request
        force_post = len(ids) > 200
    
    # Open a handle to Entrez.
    options = _urlencode(params, doseq=True)
    # _urlencode encodes pipes, which NCBI expects in ECitMatch
    if ecitmatch:
        options = options.replace('%7C', '|')
    # print cgi + "?" + options
    
    post = force_post or len(options) > 1000
    try:
        if post:
            # HTTP POST
            handle = _urlopen(cgi, data=_as_bytes(options))
        else:
            # HTTP GET
            cgi += "?" + options
            handle = _urlopen(cgi)
    except _HTTPError as exception:
        raise exception

    return _binary_to_string_handle(handle)
Example #58
0
    def externalEntityRefHandler(self, context, base, systemId, publicId):
        """The purpose of this function is to load the DTD locally, instead
        of downloading it from the URL specified in the XML. Using the local
        DTD results in much faster parsing. If the DTD is not found locally,
        we try to download it. If new DTDs become available from NCBI,
        putting them in Bio/Entrez/DTDs will allow the parser to see them."""
        urlinfo = _urlparse(systemId)
        # Following attribute requires Python 2.5+
        # if urlinfo.scheme=='http':
        if urlinfo[0] == "http":
            # Then this is an absolute path to the DTD.
            url = systemId
        elif urlinfo[0] == "":
            # Then this is a relative path to the DTD.
            # Look at the parent URL to find the full path.
            try:
                url = self.dtd_urls[-1]
            except IndexError:
                # Assume the default URL for DTDs if the top parent
                # does not contain an absolute path
                source = "http://www.ncbi.nlm.nih.gov/dtd/"
            else:
                source = os.path.dirname(url)
            # urls always have a forward slash, don't use os.path.join
            url = source.rstrip("/") + "/" + systemId
        self.dtd_urls.append(url)
        # First, try to load the local version of the DTD file
        location, filename = os.path.split(systemId)
        handle = self.open_dtd_file(filename)
        if not handle:
            # DTD is not available as a local file. Try accessing it through
            # the internet instead.
            message = """\
Unable to load DTD file %s.

Bio.Entrez uses NCBI's DTD files to parse XML files returned by NCBI Entrez.
Though most of NCBI's DTD files are included in the Biopython distribution,
sometimes you may find that a particular DTD file is missing. While we can
access the DTD file through the internet, the parser is much faster if the
required DTD files are available locally.

For this purpose, please download %s from

%s

and save it either in directory

%s

or in directory

%s

in order for Bio.Entrez to find it.

Alternatively, you can save %s in the directory
Bio/Entrez/DTDs in the Biopython distribution, and reinstall Biopython.

Please also inform the Biopython developers about this missing DTD, by
reporting a bug on https://github.com/biopython/biopython/issues or sign
up to our mailing list and emailing us, so that we can include it with the
next release of Biopython.

Proceeding to access the DTD file through the internet...
""" % (
                filename,
                filename,
                url,
                self.global_dtd_dir,
                self.local_dtd_dir,
                filename,
            )
            warnings.warn(message)
            try:
                handle = _urlopen(url)
            except IOError:
                raise RuntimeException("Failed to access %s at %s" % (filename, url))

        parser = self.parser.ExternalEntityParserCreate(context)
        parser.ElementDeclHandler = self.elementDecl
        parser.ParseFile(handle)
        handle.close()
        self.dtd_urls.pop()
        return 1