コード例 #1
0
ファイル: test_KEGG_online.py プロジェクト: raj347/BiGPY
 def test_get_hsa_10458_list_ece_Z5100_as_aaseq(self):
     """Fetch two KEGG entries with the "aaseq" option in a single request.

     Checks that the returned text contains exactly two ``>`` markers and
     that the two identifiers were joined with ``+`` in the request URL.
     """
     h = kegg_get(["hsa:10458", "ece:Z5100"], "aaseq")
     try:
         data = _as_string(h.read())
         self.assertEqual(data.count(">"), 2)
         self.assertEqual(h.url,
                          "http://rest.kegg.jp/get/hsa:10458+ece:Z5100/aaseq")
     finally:
         # Release the connection even when one of the assertions fails.
         h.close()
コード例 #2
0
ファイル: api.py プロジェクト: carlosp420/bold
    def get(self, service, **kwargs):
        """Do an HTTP request to a BOLD webservice.

        Args:
            service: The BOLD API alias to interact with.
            kwargs: Parameters sent by the user. ``url`` is always required;
                the other keys that are read depend on ``service``.

        Returns:
            A Response class containing parsed data as attribute `items`.

        Raises:
            KeyError: if a key required by the chosen ``service`` (or
                ``url``) is missing from ``kwargs``.

        """
        params = ''

        if service == 'call_id':
            sequence = utils._prepare_sequence(kwargs['seq'])
            params = _urlencode({'db': kwargs['db'], 'sequence': sequence})
        elif service == 'call_taxon_search':
            # Only the literal ``True`` enables fuzzy matching.
            fuzzy = 'true' if kwargs['fuzzy'] is True else 'false'
            params = _urlencode({
                'taxName': kwargs['taxonomic_identification'],
                'fuzzy': fuzzy,
            })
        elif service == 'call_taxon_data':
            # Anything other than the literal ``False`` requests the tree.
            include_tree = kwargs['include_tree'] is not False
            query = {
                'taxId': kwargs['tax_id'],
                'dataTypes': kwargs['data_type'],
            }
            if include_tree:
                query['includeTree'] = 'true'
            params = _urlencode(query)
        elif service in ('call_specimen_data', 'call_sequence_data',
                         'call_full_data', 'call_trace_files'):
            # Forward every user-supplied argument except the endpoint URL
            # itself and arguments left as None.
            payload = {k: v for k, v in kwargs.items()
                       if v is not None and k != 'url'}
            params = _urlencode(payload)

        url = kwargs['url'] + "?" + params
        req = _Request(url, headers={'User-Agent': 'BiopythonClient'})
        handle = _urlopen(req)
        try:
            raw = handle.read()
        finally:
            # The original never closed the connection; always release it.
            handle.close()

        response = Response()
        if service == 'call_trace_files':
            # Trace files are handed to the parser as raw bytes.
            response._parse_data(service, raw)
        else:
            response._parse_data(service, _as_string(raw))
        return response
コード例 #3
0
    def get_all_obsolete(self):
        """Return a list of all obsolete entries ever in the PDB.

        Downloads ``/pub/pdb/data/status/obsolete.dat`` from the configured
        PDB server and collects the first PDB code of every ``OBSLTE``
        line (the third whitespace-separated field). The file looks like
        this::

             LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
            OBSLTE    31-JUL-94 116L     216L
            ...
            OBSLTE    29-JAN-96 1HFT     2HFT
            OBSLTE    21-SEP-06 1HFV     2J5X
            OBSLTE    21-NOV-03 1HG6
            OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
            OBSLTE    08-NOV-96 1HID     2HID
            OBSLTE    01-APR-97 1HIU     2HIU
            OBSLTE    14-JAN-04 1HKE     1UUZ
            ...

        """
        obsolete_codes = []
        listing_url = self.pdb_server + "/pub/pdb/data/status/obsolete.dat"
        with contextlib.closing(_urlopen(listing_url)) as stream:
            for raw_line in stream:
                # Only the OBSLTE records carry codes; the title line and
                # anything else is ignored.
                if raw_line.startswith(b"OBSLTE "):
                    code = raw_line.split()[2]
                    # Sanity check against mis-reading the data.
                    assert len(code) == 4
                    obsolete_codes.append(_as_string(code))
        return obsolete_codes
コード例 #4
0
ファイル: test_bgzf.py プロジェクト: fredricj/biopython
    def check_by_line(self, old_file, new_file, old_gzip=False):
        """Check that reading new_file line by line reproduces old_file.

        ``old_file`` is read in one go (through ``gzip.open`` when
        ``old_gzip`` is true, otherwise through ``open``) in both text and
        binary mode. ``new_file`` is then read via ``bgzf.BgzfReader`` by
        iterating over its lines, once per cache size (1 and 10), and the
        joined lines must equal the reference content.
        """
        for mode in ["r", "rb"]:
            binary = "b" in mode
            opener = gzip.open if old_gzip else open
            source = opener(old_file, mode)
            raw = source.read()
            expected = _as_bytes(raw) if binary else _as_string(raw)
            source.close()

            for cache_size in [1, 10]:
                reader = bgzf.BgzfReader(new_file, mode, max_cache=cache_size)
                joiner = b"" if binary else ""
                actual = joiner.join(reader)
                reader.close()

                self.assertEqual(len(expected), len(actual))
                # Compare a short prefix first so a type or content
                # mismatch yields a compact failure message.
                summary = "%r vs %r, mode %r" % (expected[:10], actual[:10], mode)
                self.assertEqual(expected[:10], actual[:10], summary)
                self.assertEqual(expected, actual)
0
    def check_by_line(self, old_file, new_file, old_gzip=False):
        """Verify that iterating new_file by line gives old_file's content.

        The reference file is read whole (via ``gzip.open`` if ``old_gzip``
        is set, else ``open``) in text and in binary mode; the BGZF file is
        read through ``bgzf.BgzfReader`` with cache sizes 1 and 10 and the
        concatenated lines are compared with the reference.
        """
        for mode in ["r", "rb"]:
            want_bytes = "b" in mode
            if old_gzip:
                reference = gzip.open(old_file, mode)
            else:
                reference = open(old_file, mode)
            expected = reference.read()
            # Seems gzip can return bytes even if mode="r" (perhaps a bug
            # in Python 3.2?), so the type is normalised explicitly.
            expected = _as_bytes(expected) if want_bytes else _as_string(expected)
            reference.close()

            for cache in [1, 10]:
                reader = bgzf.BgzfReader(new_file, mode, max_cache=cache)
                glue = _empty_bytes_string if want_bytes else ""
                observed = glue.join(line for line in reader)
                reader.close()

                self.assertEqual(len(expected), len(observed))
                preview = "%r vs %r, mode %r" % (expected[:10], observed[:10], mode)
                self.assertEqual(expected[:10], observed[:10], preview)
                self.assertEqual(expected, observed)
コード例 #6
0
    def check_by_char(self, old_file, new_file, old_gzip=False):
        """Verify that reading new_file one unit at a time gives old_file.

        The reference file is read whole (via ``gzip.open`` if ``old_gzip``
        is set, else ``open``) in text and in binary mode; the BGZF file is
        read through ``bgzf.BgzfReader`` with ``read(1)`` calls until an
        empty result is returned, using cache sizes 1 and 10.
        """
        for mode in ["r", "rb"]:
            as_bytes = "b" in mode
            if old_gzip:
                reference = gzip.open(old_file, mode)
            else:
                reference = open(old_file, mode)
            expected = reference.read()
            # Seems gzip can return bytes even if mode="r" (perhaps a bug
            # in Python 3.2?), so the type is normalised explicitly.
            if as_bytes:
                expected = _as_bytes(expected)
            else:
                expected = _as_string(expected)
            reference.close()

            for cache in [1, 10]:
                reader = bgzf.BgzfReader(new_file, mode, max_cache=cache)
                pieces = []
                piece = reader.read(1)
                while piece:
                    pieces.append(piece)
                    piece = reader.read(1)
                if as_bytes:
                    observed = _empty_bytes_string.join(pieces)
                else:
                    observed = "".join(pieces)
                reader.close()

                self.assertEqual(len(expected), len(observed))
                # If bytes vs unicode mismatch, give a short error message:
                preview = "%r vs %r, mode %r" % (expected[:10], observed[:10], mode)
                self.assertEqual(expected[:10], observed[:10], preview)
                self.assertEqual(expected, observed)
コード例 #7
0
ファイル: PDBList.py プロジェクト: abradle/biopython
    def get_all_obsolete(self):
        """Return every PDB code listed as obsolete on the PDB server.

        Fetches ``/pub/pdb/data/status/obsolete.dat`` below
        ``self.pdb_server`` and returns, for each ``OBSLTE`` line, the
        first PDB code column (the third whitespace-separated field).
        The file looks like this::

             LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
            OBSLTE    31-JUL-94 116L     216L
            ...
            OBSLTE    29-JAN-96 1HFT     2HFT
            OBSLTE    21-SEP-06 1HFV     2J5X
            OBSLTE    21-NOV-03 1HG6
            OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
            OBSLTE    08-NOV-96 1HID     2HID
            OBSLTE    01-APR-97 1HIU     2HIU
            OBSLTE    14-JAN-04 1HKE     1UUZ
            ...

        """
        marker = b"OBSLTE "
        source = self.pdb_server + '/pub/pdb/data/status/obsolete.dat'
        with contextlib.closing(_urlopen(source)) as response:
            found = []
            for record in response:
                if not record.startswith(marker):
                    # Title line or other text: nothing to extract.
                    continue
                fields = record.split()
                entry = fields[2]
                # Guard against mis-reading the data.
                assert len(entry) == 4
                found.append(_as_string(entry))
        return found
コード例 #8
0
ファイル: KGML_pathway.py プロジェクト: Ambuj-UF/ConCat-1.0
 def get_KGML(self):
     """Return the pathway as a string in prettified KGML format.

     An XML declaration, the KGML DOCTYPE and a creation-time comment are
     prepended to the serialised element; the result is re-parsed with
     minidom and pretty-printed with a two-space indent.
     """
     prolog_lines = [
         '<?xml version="1.0"?>',
         '<!DOCTYPE pathway SYSTEM "http://www.genome.jp/kegg/xml/KGML_v0.7.1_.dtd">',
         '<!-- Created by KGML_Pathway.py %s -->' % time.asctime(),
     ]
     prolog = '\n'.join(prolog_lines)
     body = _as_string(ET.tostring(self.element, 'utf-8'))
     return minidom.parseString(prolog + body).toprettyxml(indent="  ")
コード例 #9
0
ファイル: KGML_pathway.py プロジェクト: Rapternmn/biopython
 def get_KGML(self):
     """Return the pathway as a string in prettified KGML format.

     The serialised element is prefixed with an XML declaration, the KGML
     DOCTYPE and a timestamp comment, then parsed by minidom so that it
     can be emitted with a two-space indent.
     """
     declaration = '<?xml version="1.0"?>'
     doctype = '<!DOCTYPE pathway SYSTEM "http://www.genome.jp/kegg/xml/KGML_v0.7.1_.dtd">'
     created = '<!-- Created by KGML_Pathway.py %s -->' % time.asctime()
     serialised = _as_string(ET.tostring(self.element, 'utf-8'))
     rough_xml = '\n'.join((declaration, doctype, created)) + serialised
     document = minidom.parseString(rough_xml)
     return document.toprettyxml(indent="  ")
コード例 #10
0
    def get_all_entries(self):
        """Retrieve the index file listing all PDB entries and some annotation.

        Downloads ``/pub/pdb/derived_data/index/entries.idx`` below
        ``self.pdb_server`` and prints a notice about the download size.

        Returns a list of PDB codes in the index file: the first four
        characters of every line after the second whose length exceeds
        four.
        """
        url = self.pdb_server + '/pub/pdb/derived_data/index/entries.idx'
        print("Retrieving index file. Takes about 27 MB.")
        all_entries = []
        with contextlib.closing(_urlopen(url)) as handle:
            # Stream the response instead of calling readlines(), so the
            # whole download is never held in memory as a list of lines.
            for line_number, line in enumerate(handle):
                # The first two lines are skipped (presumably a header).
                if line_number >= 2 and len(line) > 4:
                    all_entries.append(_as_string(line[:4]))
        return all_entries
コード例 #11
0
 def test_get_sprot_raw(self):
     """Bio.ExPASy.get_sprot_raw("O23729")"""
     identifier = "O23729"
     handle = UndoHandle(ExPASy.get_sprot_raw(identifier))
     try:
         # This is to catch an error page from our proxy:
         if _as_string(handle.peekline()).startswith("<!DOCTYPE HTML"):
             raise IOError("Received an HTML page instead of a SwissProt "
                           "record for %s" % identifier)
         record = SeqIO.read(handle, "swiss")
     finally:
         # Close the handle on every path, including the error-page and
         # parse-failure cases that previously leaked it.
         handle.close()
     self.assertEqual(record.id, identifier)
     self.assertEqual(len(record), 394)
     self.assertEqual(seguid(record.seq), "5Y08l+HJRDIlhLKzFEfkcKd1dkM")
コード例 #12
0
ファイル: test_SeqIO_online.py プロジェクト: lennax/biopython
 def test_get_sprot_raw(self):
     """Bio.ExPASy.get_sprot_raw("O23729")"""
     identifier = "O23729"
     stream = UndoHandle(ExPASy.get_sprot_raw(identifier))
     # Peek at the first line without consuming it: an HTML doctype means
     # we got an error page from our proxy rather than a record.
     first_line = _as_string(stream.peekline())
     if first_line.startswith("<!DOCTYPE HTML"):
         raise IOError
     entry = SeqIO.read(stream, "swiss")
     stream.close()
     self.assertEqual(entry.id, identifier)
     self.assertEqual(len(entry), 394)
     self.assertEqual(seguid(entry.seq), "5Y08l+HJRDIlhLKzFEfkcKd1dkM")
コード例 #13
0
ファイル: bgzf.py プロジェクト: zyha/biopython
def _load_bgzf_block(handle, text_mode=False):
    """Read and decompress the next BGZF block from a handle (PRIVATE).

    Returns a tuple ``(block_size, data)``; ``data`` is passed through
    ``_as_string`` when ``text_mode`` is true and returned as
    decompressed otherwise. Raises StopIteration at end of file,
    ValueError when the block does not start with the BGZF magic bytes,
    and RuntimeError when the decompressed size or CRC does not match the
    values stored after the compressed data.
    """
    magic = handle.read(4)
    if not magic:
        # End of file - should we signal this differently now?
        # See https://www.python.org/dev/peps/pep-0479/
        raise StopIteration
    if magic != _bgzf_magic:
        template = (r"A BGZF (e.g. a BAM file) block should start with "
                    r"%r, not %r; handle.tell() now says %r")
        raise ValueError(template % (_bgzf_magic, magic, handle.tell()))

    # Only the extra-field length (last of the four values) is needed.
    header_fields = struct.unpack("<LBBH", handle.read(8))
    extra_len = header_fields[3]

    # Walk the extra subfields looking for the "BC" one.
    block_size = None
    consumed = 0
    while consumed < extra_len:
        tag = handle.read(2)
        payload_len = struct.unpack("<H", handle.read(2))[0]  # uint16_t
        payload = handle.read(payload_len)
        consumed += payload_len + 4  # 2 bytes of id + 2 bytes of length
        if tag == _bytes_BC:
            assert payload_len == 2, "Wrong BC payload length"
            assert block_size is None, "Two BC subfields?"
            block_size = struct.unpack("<H", payload)[0] + 1  # uint16_t
    assert consumed == extra_len, (consumed, extra_len)
    assert block_size is not None, "Missing BC, this isn't a BGZF file!"

    # Now comes the compressed data, CRC, and length of uncompressed data.
    deflate_size = block_size - 1 - extra_len - 19
    inflater = zlib.decompressobj(-15)  # Negative window size means no headers
    compressed = handle.read(deflate_size)
    data = inflater.decompress(compressed) + inflater.flush()
    expected_crc = handle.read(4)
    expected_size = struct.unpack("<I", handle.read(4))[0]
    actual_size = len(data)
    if expected_size != actual_size:
        raise RuntimeError("Decompressed to %i, "
                           "not %i" % (actual_size, expected_size))

    # Should cope with a mix of Python platforms: a negative checksum is
    # packed as a signed 32-bit integer, otherwise as unsigned.
    checksum = zlib.crc32(data)
    crc = struct.pack("<i" if checksum < 0 else "<I", checksum)
    if expected_crc != crc:
        raise RuntimeError("CRC is %s, not %s" % (crc, expected_crc))

    if text_mode:
        data = _as_string(data)
    return block_size, data
コード例 #14
0
ファイル: bgzf.py プロジェクト: HuttonICS/biopython
def _load_bgzf_block(handle, text_mode=False):
    """Load the next BGZF block of compressed data (PRIVATE).

    Returns a tuple (block size and data), or at end of file
    will raise StopIteration. With ``text_mode`` true the data is passed
    through ``_as_string`` before being returned.

    Raises ValueError if the block does not start with the BGZF magic
    bytes, and RuntimeError if the decompressed size or the CRC differs
    from the values stored after the compressed data.
    """
    magic = handle.read(4)
    if not magic:
        # End of file - should we signal this differently now?
        # See https://www.python.org/dev/peps/pep-0479/
        raise StopIteration
    if magic != _bgzf_magic:
        raise ValueError(
            r"A BGZF (e.g. a BAM file) block should start with "
            r"%r, not %r; handle.tell() now says %r"
            % (_bgzf_magic, magic, handle.tell())
        )
    _mod_time, _extra_flags, _os_code, extra_len = struct.unpack(
        "<LBBH", handle.read(8)
    )

    block_size = None
    bytes_seen = 0
    while bytes_seen < extra_len:
        field_id = handle.read(2)
        (field_len,) = struct.unpack("<H", handle.read(2))  # uint16_t
        field_data = handle.read(field_len)
        # Each subfield costs its payload plus a 4-byte id/length prefix.
        bytes_seen = bytes_seen + field_len + 4
        if field_id != _bytes_BC:
            continue
        assert field_len == 2, "Wrong BC payload length"
        assert block_size is None, "Two BC subfields?"
        block_size = struct.unpack("<H", field_data)[0] + 1  # uint16_t
    assert bytes_seen == extra_len, (bytes_seen, extra_len)
    assert block_size is not None, "Missing BC, this isn't a BGZF file!"

    # Now comes the compressed data, CRC, and length of uncompressed data.
    deflate_size = block_size - 1 - extra_len - 19
    decompressor = zlib.decompressobj(-15)  # Negative window size means no headers
    data = decompressor.decompress(handle.read(deflate_size))
    data += decompressor.flush()
    expected_crc = handle.read(4)
    (expected_size,) = struct.unpack("<I", handle.read(4))
    if len(data) != expected_size:
        raise RuntimeError("Decompressed to %i, "
                           "not %i" % (len(data), expected_size))

    # Should cope with a mix of Python platforms: masking to 32 bits
    # gives the same four little-endian bytes as packing a negative
    # checksum as a signed integer.
    crc = struct.pack("<I", zlib.crc32(data) & 0xFFFFFFFF)
    if crc != expected_crc:
        raise RuntimeError("CRC is %s, not %s" % (crc, expected_crc))
    return (block_size, _as_string(data)) if text_mode else (block_size, data)
コード例 #15
0
ファイル: PDBList.py プロジェクト: juliahi/biopython
    def get_status_list(url):
        """Retrieve a list of pdb codes in the weekly pdb status file from given URL.

        Used by get_recent_changes. The files parsed here are expected to
        hold one four-character PDB name per line; each line is stripped
        and checked with an assert before being converted with
        ``_as_string``.
        """
        codes = []
        with contextlib.closing(_urlopen(url)) as stream:
            for raw_line in stream:
                code = raw_line.strip()
                assert len(code) == 4
                codes.append(_as_string(code))
        return codes
コード例 #16
0
    def get_status_list(url):
        """Retrieve a list of pdb codes in the weekly pdb status file from given URL.

        Used by get_recent_changes. Typical contents of the list files
        parsed by this method are now very simple - one PDB name per line.
        Every stripped line must be exactly four characters long.
        """
        with contextlib.closing(_urlopen(url)) as response:
            entries = []
            # The generator is lazy, so each line is still stripped,
            # checked and stored before the next one is read.
            for entry in (line.strip() for line in response):
                assert len(entry) == 4
                entries.append(_as_string(entry))
        return entries
コード例 #17
0
 def test_get_sprot_raw(self):
     """Bio.ExPASy.get_sprot_raw("O23729")"""
     identifier = "O23729"
     try:
         handle = UndoHandle(ExPASy.get_sprot_raw(identifier))
         # An HTML doctype on the first line means we got an error page
         # from our proxy rather than a record.
         got_html = _as_string(handle.peekline()).startswith("<!DOCTYPE HTML")
         if got_html:
             raise IOError
         record = SeqIO.read(handle, "swiss")
         handle.close()
     except IOError:
         # Report network trouble as a missing external dependency.
         message = "internet (or maybe just ExPASy) not available"
         raise MissingExternalDependencyError(message)
     self.assertEqual(record.id, identifier)
     self.assertEqual(len(record), 394)
     self.assertEqual(seguid(record.seq), "5Y08l+HJRDIlhLKzFEfkcKd1dkM")
コード例 #18
0
 def test_get_sprot_raw(self):
     """Bio.ExPASy.get_sprot_raw("O23729")"""
     accession = "O23729"
     try:
         stream = UndoHandle(ExPASy.get_sprot_raw(accession))
         # This is to catch an error page from our proxy:
         first_line = _as_string(stream.peekline())
         if first_line.startswith("<!DOCTYPE HTML"):
             raise IOError
         entry = SeqIO.read(stream, "swiss")
         stream.close()
     except IOError:
         # Any I/O failure is reported as the network being unavailable.
         raise MissingExternalDependencyError(
             "internet (or maybe just ExPASy) not available")
     self.assertEqual(entry.id, accession)
     self.assertEqual(len(entry), 394)
     self.assertEqual(seguid(entry.seq), "5Y08l+HJRDIlhLKzFEfkcKd1dkM")
コード例 #19
0
def qblast(
    program,
    database,
    sequence,
    auto_format=None,
    composition_based_statistics=None,
    db_genetic_code=None,
    endpoints=None,
    entrez_query='(none)',
    expect=10.0,
    filter=None,
    gapcosts=None,
    genetic_code=None,
    hitlist_size=50,
    i_thresh=None,
    layout=None,
    lcase_mask=None,
    matrix_name=None,
    nucl_penalty=None,
    nucl_reward=None,
    other_advanced=None,
    perc_ident=None,
    phi_pattern=None,
    query_file=None,
    query_believe_defline=None,
    query_from=None,
    query_to=None,
    searchsp_eff=None,
    service=None,
    threshold=None,
    ungapped_alignment=None,
    word_size=None,
    alignments=500,
    alignment_view=None,
    descriptions=500,
    entrez_links_new_window=None,
    expect_low=None,
    expect_high=None,
    format_entrez_query=None,
    format_object=None,
    format_type='XML',
    ncbi_gi=None,
    results_file=None,
    show_overview=None,
    megablast=None,
):
    """Do a BLAST search using the QBLAST server at NCBI.

    Supports all parameters of the qblast API for Put and Get.
    Some useful parameters:
    program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
    database       Which database to search against (e.g. "nr").
    sequence       The sequence to search.
    ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
    descriptions   Number of descriptions to show.  Def 500.
    alignments     Number of alignments to show.  Def 500.
    expect         An expect value cutoff.  Def 10.0.
    matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
    filter         "none" turns off filtering.  Default no filtering
    format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
    entrez_query   Entrez query to limit Blast search
    hitlist_size   Number of hits to return. Default 50
    megablast      TRUE/FALSE whether to use MEga BLAST algorithm (blastn only)
    service        plain, psi, phi, rpsblast, megablast (lower case)

    Apart from an assertion on the program name, this function does no
    checking of the validity of the parameters and passes the values to
    the server as is; parameters left as None are not sent.  More help
    is available at:
    http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html

    Returns a StringIO handle wrapping the text of the final results page.
    The function blocks, polling the server with an increasing delay
    (2 seconds up to 120 seconds), until the results are ready.
    """
    import time

    # NOTE: this check is an assert and is therefore skipped under "python -O".
    assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        #('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        #('RESULTS_FILE',...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
    ]
    # Parameters left as None are omitted from the request.
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi", message,
                       {"User-Agent": "BiopythonClient"})
    handle = _urlopen(request)
    try:
        # Only the request ID is used below; the second value is unused.
        rid, rtoe = _parse_qblast_ref_page(handle)
    finally:
        # Release the connection of the "Put" request once it is parsed.
        handle.close()

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.  Use a backoff delay from 2 - 120 second wait
    delay = 2.0
    previous = time.time()
    while True:
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current
        # Grow the delay by 50% per poll, capped at 120 seconds.
        delay = min(delay + .5 * delay, 120)

        request = _Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi", message,
                           {"User-Agent": "BiopythonClient"})
        handle = _urlopen(request)
        try:
            results = _as_string(handle.read())
        finally:
            # Each poll opens a new connection; close it so that long
            # waits do not accumulate open sockets.
            handle.close()

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
コード例 #20
0
ファイル: __init__.py プロジェクト: gavieira/mitomaker
def _read(handle):
    record = None
    unread = ""
    for line in handle:
        #This is for Python 3 to cope with a binary handle (byte strings),
        #or a text handle (unicode strings):
        line = _as_string(line)
        key, value = line[:2], line[5:].rstrip()
        if unread:
            value = unread + " " + value
            unread = ""
        if key == '**':
            #See Bug 2353, some files from the EBI have extra lines
            #starting "**" (two asterisks/stars).  They appear
            #to be unofficial automated annotations. e.g.
            #**
            #**   #################    INTERNAL SECTION    ##################
            #**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003.
            pass
        elif key == 'ID':
            record = Record()
            _read_id(record, line)
            _sequence_lines = []
        elif key == 'AC':
            accessions = [word for word in value.rstrip(";").split("; ")]
            record.accessions.extend(accessions)
        elif key == 'DT':
            _read_dt(record, line)
        elif key == 'DE':
            record.description.append(value.strip())
        elif key == 'GN':
            if record.gene_name:
                record.gene_name += " "
            record.gene_name += value
        elif key == 'OS':
            record.organism.append(value)
        elif key == 'OG':
            record.organelle += line[5:]
        elif key == 'OC':
            cols = [col for col in value.rstrip(";.").split("; ")]
            record.organism_classification.extend(cols)
        elif key == 'OX':
            _read_ox(record, line)
        elif key == 'OH':
            _read_oh(record, line)
        elif key == 'RN':
            reference = Reference()
            _read_rn(reference, value)
            record.references.append(reference)
        elif key == 'RP':
            assert record.references, "RP: missing RN"
            record.references[-1].positions.append(value)
        elif key == 'RC':
            assert record.references, "RC: missing RN"
            reference = record.references[-1]
            unread = _read_rc(reference, value)
        elif key == 'RX':
            assert record.references, "RX: missing RN"
            reference = record.references[-1]
            _read_rx(reference, value)
        elif key == 'RL':
            assert record.references, "RL: missing RN"
            reference = record.references[-1]
            reference.location.append(value)
        # In UniProt release 1.12 of 6/21/04, there is a new RG
        # (Reference Group) line, which references a group instead of
        # an author.  Each block must have at least 1 RA or RG line.
        elif key == 'RA':
            assert record.references, "RA: missing RN"
            reference = record.references[-1]
            reference.authors.append(value)
        elif key == 'RG':
            assert record.references, "RG: missing RN"
            reference = record.references[-1]
            reference.authors.append(value)
        elif key == "RT":
            assert record.references, "RT: missing RN"
            reference = record.references[-1]
            reference.title.append(value)
        elif key == 'CC':
            _read_cc(record, line)
        elif key == 'DR':
            _read_dr(record, value)
        elif key == 'PE':
            #TODO - Record this information?
            pass
        elif key == 'KW':
            cols = value.rstrip(";.").split('; ')
            record.keywords.extend(cols)
        elif key == 'FT':
            _read_ft(record, line)
        elif key == 'SQ':
            cols = value.split()
            assert len(cols) == 7, "I don't understand SQ line %s" % line
            # Do more checking here?
            record.seqinfo = int(cols[1]), int(cols[3]), cols[5]
        elif key == '  ':
            _sequence_lines.append(value.replace(" ", "").rstrip())
        elif key == '//':
            # Join multiline data into one string
            record.description = " ".join(record.description)
            record.organism = " ".join(record.organism)
            record.organelle = record.organelle.rstrip()
            for reference in record.references:
                reference.authors = " ".join(reference.authors).rstrip(";")
                reference.title = " ".join(reference.title).rstrip(";")
                if reference.title.startswith(
                        '"') and reference.title.endswith('"'):
                    reference.title = reference.title[1:-1]  # remove quotes
                reference.location = " ".join(reference.location)
            record.sequence = "".join(_sequence_lines)
            return record
        else:
            raise ValueError("Unknown keyword '%s' found" % key)
    if record:
        raise ValueError("Unexpected end of stream.")
コード例 #21
0
def qblast(
    program,
    database,
    sequence,
    url_base=NCBI_BLAST_URL,
    auto_format=None,
    composition_based_statistics=None,
    db_genetic_code=None,
    endpoints=None,
    entrez_query='(none)',
    expect=10.0,
    filter=None,
    gapcosts=None,
    genetic_code=None,
    hitlist_size=50,
    i_thresh=None,
    layout=None,
    lcase_mask=None,
    matrix_name=None,
    nucl_penalty=None,
    nucl_reward=None,
    other_advanced=None,
    perc_ident=None,
    phi_pattern=None,
    query_file=None,
    query_believe_defline=None,
    query_from=None,
    query_to=None,
    searchsp_eff=None,
    service=None,
    threshold=None,
    ungapped_alignment=None,
    word_size=None,
    alignments=500,
    alignment_view=None,
    descriptions=500,
    entrez_links_new_window=None,
    expect_low=None,
    expect_high=None,
    format_entrez_query=None,
    format_object=None,
    format_type='XML',
    ncbi_gi=None,
    results_file=None,
    show_overview=None,
    megablast=None,
    template_type=None,
    template_length=None,
):
    """BLAST search using NCBI's QBLAST server or a cloud service provider.

    Supports all parameters of the qblast API for Put and Get.

    Please note that BLAST on the cloud supports the NCBI-BLAST Common
    URL API (http://ncbi.github.io/blast-cloud/dev/api.html). To
    use this feature, please set url_base to
    'http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi' and
    format_object='Alignment'. For more details, please see
    https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast

    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show.  Def 500.
     - alignments     Number of alignments to show.  Def 500.
     - expect         An expect value cutoff.  Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering.  Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use MEga BLAST algorithm (blastn only)
     - service        plain, psi, phi, rpsblast, megablast (lower case)

    Apart from checking ``program`` against the five supported names, this
    function does no checking of the validity of the parameters
    and passes the values to the server as is.  More help is available at:
    https://ncbi.github.io/blast-cloud/dev/api.html

    Returns a StringIO handle wrapping the text of the final results page.

    Raises ValueError if ``program`` is not one of the supported names;
    errors raised while parsing the server's reply to the initial request
    propagate from ``_parse_qblast_ref_page``.
    """
    import time

    programs = ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']
    if program not in programs:
        raise ValueError("Program specified is %s. Expected one of %s" %
                         (program, ", ".join(programs)))

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        # ('RESULTS_FILE',...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('TEMPLATE_TYPE', template_type),
        ('TEMPLATE_LENGTH', template_length),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
    ]
    # Only parameters the caller actually set are sent to the server.
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request(url_base, message, {"User-Agent": "BiopythonClient"})
    handle = _urlopen(request)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    try:
        # rtoe is parsed (and validated) but not otherwise used below.
        rid, rtoe = _parse_qblast_ref_page(handle)
    finally:
        # Always release the HTTP connection, even if parsing fails.
        handle.close()
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.
    # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
    # 1. Do not contact the server more often than once every 10 seconds.
    # 2. Do not poll for any single RID more often than once a minute.
    # 3. Use the URL parameter email and tool, so that the NCBI
    #    can contact you if there is a problem.
    # 4. Run scripts weekends or between 9 pm and 5 am Eastern time
    #    on weekdays if more than 50 searches will be submitted.
    # --
    # Could start with a 10s delay, but expect most short queries
    # will take longer thus at least 70s with delay. Therefore,
    # start with 20s delay, thereafter once a minute.
    delay = 20  # seconds
    while True:
        # qblast._previous is a function attribute holding the time of the
        # last poll; it is read here before being assigned, so it has to be
        # initialised outside this function.
        current = time.time()
        wait = qblast._previous + delay - current
        if wait > 0:
            time.sleep(wait)
            qblast._previous = current + wait
        else:
            qblast._previous = current
        # delay by at least 60 seconds only if running the request against the public NCBI API
        if delay < 60 and url_base == NCBI_BLAST_URL:
            # Wasn't a quick return, must wait at least a minute
            delay = 60

        request = _Request(url_base, message,
                           {"User-Agent": "BiopythonClient"})
        handle = _urlopen(request)
        try:
            results = _as_string(handle.read())
        finally:
            # Each poll opens a new connection; close it before the next one.
            handle.close()

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break
    return StringIO(results)
コード例 #22
0
def _parse_qblast_ref_page(handle):
    """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE).

    The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably
    'Request Time of Execution' and RID would be 'Request Identifier'.

    The whole of ``handle`` is read; the handle is not closed here.

    Returns a tuple ``(rid, rtoe)`` where ``rid`` is a string and ``rtoe``
    is an integer.

    Raises ValueError if either value is missing or if RTOE is not an
    integer. When both are missing, the message includes any error text
    that could be scraped from the page.
    """
    s = _as_string(handle.read())

    def _value_after(marker):
        """Return the stripped text from marker to end of line, else None."""
        start = s.find(marker)
        if start == -1:
            return None
        start += len(marker)
        end = s.find("\n", start)
        if end == -1:
            # The marker sits on the final line without a trailing newline;
            # slicing with -1 would silently drop the last character.
            end = len(s)
        return s[start:end].strip()

    rid = _value_after("RID =")
    rtoe = _value_after("RTOE =")

    if not rid and not rtoe:
        # Can we reliably extract the error message from the HTML page?
        # e.g.  "Message ID#24 Error: Failed to read the Blast query:
        #       Nucleotide FASTA provided for protein sequence"
        # or    "Message ID#32 Error: Query contains no data: Query
        #       contains no sequence data"
        #
        # The message used to occur inside a <div class="error msInf"> entry;
        # in spring 2010 the markup was <p class="error"> instead.
        for open_tag, close_tag in (('<div class="error msInf">', "</div>"),
                                    ('<p class="error">', "</p>")):
            i = s.find(open_tag)
            if i != -1:
                msg = s[i + len(open_tag):].strip()
                msg = msg.split(close_tag, 1)[0].split("\n", 1)[0].strip()
                if msg:
                    raise ValueError("Error message from NCBI: %s" % msg)
        # Generic search based on the way the error messages start:
        i = s.find('Message ID#')
        if i != -1:
            # Break the message at the first HTML tag
            msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip()
            raise ValueError("Error message from NCBI: %s" % msg)
        # We didn't recognise the error layout :(
        raise ValueError("No RID and no RTOE found in the 'please wait' page, "
                         "there was probably an error in your request but we "
                         "could not extract a helpful error message.")
    elif not rid:
        # Can this happen?
        raise ValueError("No RID found in the 'please wait' page."
                         " (although RTOE = %s)" % repr(rtoe))
    elif not rtoe:
        # Can this happen?
        raise ValueError("No RTOE found in the 'please wait' page."
                         " (although RID = %s)" % repr(rid))

    try:
        return rid, int(rtoe)
    except ValueError:
        raise ValueError("A non-integer RTOE found in "
                         "the 'please wait' page, %s" % repr(rtoe))
コード例 #23
0
def _read(handle):
    """Read lines from handle until one record is complete and return it.

    Each line is dispatched on its first two characters (the line code);
    the text from column 5 onwards, right-stripped, is the line's value.
    The record is returned when the terminating '//' line is reached.

    If the handle is exhausted before any 'ID' line was seen, the function
    falls off the end and returns None. Raises ValueError for an unknown
    line code, or when the stream ends while a record is still open.
    """
    record = None
    carry = ""  # text handed back by _read_rc, prepended to the next value
    for line in handle:
        # This is for Python 3 to cope with a binary handle (byte strings),
        # or a text handle (unicode strings):
        line = _as_string(line)
        key = line[:2]
        value = line[5:].rstrip()
        if carry:
            value = carry + " " + value
            carry = ""
        if key == '**':
            # See Bug 2353, some files from the EBI have extra lines
            # starting "**" (two asterisks/stars).  They appear
            # to be unofficial automated annotations. e.g.
            # **
            # **   #################    INTERNAL SECTION    ##################
            # **HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003.
            continue
        elif key == 'ID':
            # A new record starts here, along with a fresh sequence buffer.
            record = Record()
            _read_id(record, line)
            seq_chunks = []
        elif key == 'AC':
            record.accessions.extend(value.rstrip(";").split("; "))
        elif key == 'DT':
            _read_dt(record, line)
        elif key == 'DE':
            record.description.append(value.strip())
        elif key == 'GN':
            # Successive GN lines are joined with a single space.
            separator = " " if record.gene_name else ""
            record.gene_name = record.gene_name + separator + value
        elif key == 'OS':
            record.organism.append(value)
        elif key == 'OG':
            record.organelle += line[5:]
        elif key == 'OC':
            record.organism_classification.extend(
                value.rstrip(";.").split("; "))
        elif key == 'OX':
            _read_ox(record, line)
        elif key == 'OH':
            _read_oh(record, line)
        elif key == 'RN':
            new_reference = Reference()
            _read_rn(new_reference, value)
            record.references.append(new_reference)
        elif key == 'RP':
            assert record.references, "RP: missing RN"
            record.references[-1].positions.append(value)
        elif key == 'RC':
            assert record.references, "RC: missing RN"
            carry = _read_rc(record.references[-1], value)
        elif key == 'RX':
            assert record.references, "RX: missing RN"
            _read_rx(record.references[-1], value)
        elif key == 'RL':
            assert record.references, "RL: missing RN"
            record.references[-1].location.append(value)
        elif key in ('RA', 'RG'):
            # In UniProt release 1.12 of 6/21/04, there is a new RG
            # (Reference Group) line, which references a group instead of
            # an author.  Each block must have at least 1 RA or RG line.
            assert record.references, "%s: missing RN" % key
            record.references[-1].authors.append(value)
        elif key == "RT":
            assert record.references, "RT: missing RN"
            record.references[-1].title.append(value)
        elif key == 'CC':
            _read_cc(record, line)
        elif key == 'DR':
            _read_dr(record, value)
        elif key == 'PE':
            _read_pe(record, value)
        elif key == 'KW':
            _read_kw(record, value)
        elif key == 'FT':
            _read_ft(record, line)
        elif key == 'SQ':
            fields = value.split()
            assert len(fields) == 7, "I don't understand SQ line %s" % line
            # Do more checking here?
            record.seqinfo = int(fields[1]), int(fields[3]), fields[5]
        elif key == '  ':
            seq_chunks.append(value.replace(" ", "").rstrip())
        elif key == '//':
            # End of record: collapse the accumulated lists into strings.
            record.description = " ".join(record.description)
            record.organism = " ".join(record.organism)
            record.organelle = record.organelle.rstrip()
            for ref in record.references:
                ref.authors = " ".join(ref.authors).rstrip(";")
                title = " ".join(ref.title).rstrip(";")
                if title.startswith('"') and title.endswith('"'):
                    title = title[1:-1]  # remove quotes
                ref.title = title
                ref.location = " ".join(ref.location)
            record.sequence = "".join(seq_chunks)
            return record
        else:
            raise ValueError("Unknown keyword '%s' found" % key)
    if record:
        raise ValueError("Unexpected end of stream.")
コード例 #24
0
ファイル: NCBIWWW.py プロジェクト: Benjamin-Lee/biopython
def qblast(program, database, sequence, url_base=NCBI_BLAST_URL,
           auto_format=None, composition_based_statistics=None,
           db_genetic_code=None, endpoints=None, entrez_query='(none)',
           expect=10.0, filter=None, gapcosts=None, genetic_code=None,
           hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
           matrix_name=None, nucl_penalty=None, nucl_reward=None,
           other_advanced=None, perc_ident=None, phi_pattern=None,
           query_file=None, query_believe_defline=None, query_from=None,
           query_to=None, searchsp_eff=None, service=None, threshold=None,
           ungapped_alignment=None, word_size=None,
           alignments=500, alignment_view=None, descriptions=500,
           entrez_links_new_window=None, expect_low=None, expect_high=None,
           format_entrez_query=None, format_object=None, format_type='XML',
           ncbi_gi=None, results_file=None, show_overview=None, megablast=None,
           ):
    """BLAST search using NCBI's QBLAST server or a cloud service provider.

    Supports all parameters of the qblast API for Put and Get.

    Please note that BLAST on the cloud supports the NCBI-BLAST Common
    URL API (http://ncbi.github.io/blast-cloud/dev/api.html). To
    use this feature, please set url_base to
    'http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi' and
    format_object='Alignment'. For more details, please see
    https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast

    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show.  Def 500.
     - alignments     Number of alignments to show.  Def 500.
     - expect         An expect value cutoff.  Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering.  Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use MEga BLAST algorithm (blastn only)
     - service        plain, psi, phi, rpsblast, megablast (lower case)

    Apart from checking ``program`` against the five supported names, this
    function does no checking of the validity of the parameters
    and passes the values to the server as is.  More help is available at:
    http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html

    Returns a StringIO handle wrapping the text of the final results page.

    Raises ValueError if ``program`` is not one of the supported names;
    errors raised while parsing the server's reply to the initial request
    propagate from ``_parse_qblast_ref_page``.
    """
    import time

    # Validate explicitly rather than with assert, which is stripped when
    # Python runs with -O.
    programs = ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']
    if program not in programs:
        raise ValueError("Program specified is %s. Expected one of %s" %
                         (program, ", ".join(programs)))

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        # ('RESULTS_FILE',...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
        ]
    # Only parameters the caller actually set are sent to the server.
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request(url_base,
                       message,
                       {"User-Agent": "BiopythonClient"})
    handle = _urlopen(request)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    try:
        # rtoe is parsed (and validated) but not otherwise used below.
        rid, rtoe = _parse_qblast_ref_page(handle)
    finally:
        # Always release the HTTP connection, even if parsing fails.
        handle.close()
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.  Use a backoff delay from 2 - 120 second wait
    delay = 2.0
    previous = time.time()
    while True:
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current
        # Grow the delay by half each round, capped at two minutes.
        delay = min(delay + .5 * delay, 120)

        request = _Request(url_base,
                           message,
                           {"User-Agent": "BiopythonClient"})
        handle = _urlopen(request)
        try:
            results = _as_string(handle.read())
        finally:
            # Each poll opens a new connection; close it before the next one.
            handle.close()

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
コード例 #25
0
def qblast(
    program,
    database,
    sequence,
    url_base=NCBI_BLAST_URL,
    auto_format=None,
    composition_based_statistics=None,
    db_genetic_code=None,
    endpoints=None,
    entrez_query="(none)",
    expect=10.0,
    filter=None,
    gapcosts=None,
    genetic_code=None,
    hitlist_size=50,
    i_thresh=None,
    layout=None,
    lcase_mask=None,
    matrix_name=None,
    nucl_penalty=None,
    nucl_reward=None,
    other_advanced=None,
    perc_ident=None,
    phi_pattern=None,
    query_file=None,
    query_believe_defline=None,
    query_from=None,
    query_to=None,
    searchsp_eff=None,
    service=None,
    threshold=None,
    ungapped_alignment=None,
    word_size=None,
    short_query=None,
    alignments=500,
    alignment_view=None,
    descriptions=500,
    entrez_links_new_window=None,
    expect_low=None,
    expect_high=None,
    format_entrez_query=None,
    format_object=None,
    format_type="XML",
    ncbi_gi=None,
    results_file=None,
    show_overview=None,
    megablast=None,
    template_type=None,
    template_length=None,
):
    """BLAST search using NCBI's QBLAST server or a cloud service provider.

    Supports all parameters of the old qblast API for Put and Get.

    Please note that NCBI uses the new Common URL API for BLAST searches
    on the internet (http://ncbi.github.io/blast-cloud/dev/api.html). Thus,
    some of the parameters used by this function are not (or are no longer)
    officially supported by NCBI. Although they are still functioning, this
    may change in the future.

    The Common URL API (http://ncbi.github.io/blast-cloud/dev/api.html) allows
    doing BLAST searches on cloud servers. To use this feature, please set
    ``url_base='http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi'``
    and ``format_object='Alignment'``. For more details, please see
    https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast

    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show.  Def 500.
     - alignments     Number of alignments to show.  Def 500.
     - expect         An expect value cutoff.  Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering.  Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use MEga BLAST algorithm (blastn only)
     - short_query    TRUE/FALSE whether to adjust the search parameters for a
                      short query sequence. Note that this will override
                      manually set parameters like word size and e value. Turns
                      off when sequence length is > 30 residues. Default: None.
     - service        plain, psi, phi, rpsblast, megablast (lower case)

    Apart from checking ``program`` against the five supported names, this
    function does no checking of the validity of the parameters
    and passes the values to the server as is.  More help is available at:
    https://ncbi.github.io/blast-cloud/dev/api.html

    Returns a StringIO handle wrapping the text of the final results page.

    Raises ValueError if ``program`` is not one of the supported names;
    errors raised while parsing the server's reply to the initial request
    propagate from ``_parse_qblast_ref_page``.
    """
    import time

    programs = ["blastn", "blastp", "blastx", "tblastn", "tblastx"]
    if program not in programs:
        raise ValueError("Program specified is %s. Expected one of %s" %
                         (program, ", ".join(programs)))

    # SHORT_QUERY_ADJUST throws an error when using blastn (wrong parameter
    # assignment from NCBIs side).
    # Thus we set the (known) parameters directly:
    if short_query and program == "blastn":
        # Never forward SHORT_QUERY_ADJUST itself for blastn.
        short_query = None
        # We only use the 'short-query' parameters for short sequences:
        if len(sequence) < 31:
            expect = 1000
            word_size = 7
            nucl_reward = 1
            filter = None
            lcase_mask = None
            warnings.warn(
                '"SHORT_QUERY_ADJUST" is incorrectly implemented '
                "(by NCBI) for blastn. We bypass the problem by "
                "manually adjusting the search parameters. Thus, "
                "results may slightly differ from web page "
                "searches.", BiopythonWarning)

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ("AUTO_FORMAT", auto_format),
        ("COMPOSITION_BASED_STATISTICS", composition_based_statistics),
        ("DATABASE", database),
        ("DB_GENETIC_CODE", db_genetic_code),
        ("ENDPOINTS", endpoints),
        ("ENTREZ_QUERY", entrez_query),
        ("EXPECT", expect),
        ("FILTER", filter),
        ("GAPCOSTS", gapcosts),
        ("GENETIC_CODE", genetic_code),
        ("HITLIST_SIZE", hitlist_size),
        ("I_THRESH", i_thresh),
        ("LAYOUT", layout),
        ("LCASE_MASK", lcase_mask),
        ("MEGABLAST", megablast),
        ("MATRIX_NAME", matrix_name),
        ("NUCL_PENALTY", nucl_penalty),
        ("NUCL_REWARD", nucl_reward),
        ("OTHER_ADVANCED", other_advanced),
        ("PERC_IDENT", perc_ident),
        ("PHI_PATTERN", phi_pattern),
        ("PROGRAM", program),
        # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ("QUERY", sequence),
        ("QUERY_FILE", query_file),
        ("QUERY_BELIEVE_DEFLINE", query_believe_defline),
        ("QUERY_FROM", query_from),
        ("QUERY_TO", query_to),
        # ('RESULTS_FILE',...), - Can we use this parameter?
        ("SEARCHSP_EFF", searchsp_eff),
        ("SERVICE", service),
        ("SHORT_QUERY_ADJUST", short_query),
        ("TEMPLATE_TYPE", template_type),
        ("TEMPLATE_LENGTH", template_length),
        ("THRESHOLD", threshold),
        ("UNGAPPED_ALIGNMENT", ungapped_alignment),
        ("WORD_SIZE", word_size),
        ("CMD", "Put"),
    ]
    # Only parameters the caller actually set are sent to the server.
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request(url_base, message, {"User-Agent": "BiopythonClient"})
    handle = _urlopen(request)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    try:
        # rtoe is parsed (and validated) but not otherwise used below.
        rid, rtoe = _parse_qblast_ref_page(handle)
    finally:
        # Always release the HTTP connection, even if parsing fails.
        handle.close()
    parameters = [
        ("ALIGNMENTS", alignments),
        ("ALIGNMENT_VIEW", alignment_view),
        ("DESCRIPTIONS", descriptions),
        ("ENTREZ_LINKS_NEW_WINDOW", entrez_links_new_window),
        ("EXPECT_LOW", expect_low),
        ("EXPECT_HIGH", expect_high),
        ("FORMAT_ENTREZ_QUERY", format_entrez_query),
        ("FORMAT_OBJECT", format_object),
        ("FORMAT_TYPE", format_type),
        ("NCBI_GI", ncbi_gi),
        ("RID", rid),
        ("RESULTS_FILE", results_file),
        ("SERVICE", service),
        ("SHOW_OVERVIEW", show_overview),
        ("CMD", "Get"),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.
    # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
    # 1. Do not contact the server more often than once every 10 seconds.
    # 2. Do not poll for any single RID more often than once a minute.
    # 3. Use the URL parameter email and tool, so that the NCBI
    #    can contact you if there is a problem.
    # 4. Run scripts weekends or between 9 pm and 5 am Eastern time
    #    on weekdays if more than 50 searches will be submitted.
    # --
    # Could start with a 10s delay, but expect most short queries
    # will take longer thus at least 70s with delay. Therefore,
    # start with 20s delay, thereafter once a minute.
    delay = 20  # seconds
    while True:
        # qblast._previous is a function attribute holding the time of the
        # last poll; it is read here before being assigned, so it has to be
        # initialised outside this function.
        current = time.time()
        wait = qblast._previous + delay - current
        if wait > 0:
            time.sleep(wait)
            qblast._previous = current + wait
        else:
            qblast._previous = current
        # delay by at least 60 seconds only if running the request against the public NCBI API
        if delay < 60 and url_base == NCBI_BLAST_URL:
            # Wasn't a quick return, must wait at least a minute
            delay = 60

        request = _Request(url_base, message,
                           {"User-Agent": "BiopythonClient"})
        handle = _urlopen(request)
        try:
            results = _as_string(handle.read())
        finally:
            # Each poll opens a new connection; close it before the next one.
            handle.close()

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break
    return StringIO(results)
コード例 #26
0
 def test_get_hsa_10458_list_ece_Z5100_as_aaseq(self):
     """Fetch two KEGG entries as amino acid sequences in one request."""
     h = kegg_get(["hsa:10458", "ece:Z5100"], "aaseq")
     try:
         data = _as_string(h.read())
         # One ">" header per requested entry.
         self.assertEqual(data.count(">"), 2)
         self.assertEqual(h.url, "http://rest.kegg.jp/get/hsa:10458+ece:Z5100/aaseq")
     finally:
         # Close the network handle even when an assertion above fails.
         h.close()
コード例 #27
0
ファイル: NCBIWWW.py プロジェクト: rwbarrette/biopython
def qblast(
    program,
    database,
    sequence,
    auto_format=None,
    composition_based_statistics=None,
    db_genetic_code=None,
    endpoints=None,
    entrez_query="(none)",
    expect=10.0,
    filter=None,
    gapcosts=None,
    genetic_code=None,
    hitlist_size=50,
    i_thresh=None,
    layout=None,
    lcase_mask=None,
    matrix_name=None,
    nucl_penalty=None,
    nucl_reward=None,
    other_advanced=None,
    perc_ident=None,
    phi_pattern=None,
    query_file=None,
    query_believe_defline=None,
    query_from=None,
    query_to=None,
    searchsp_eff=None,
    service=None,
    threshold=None,
    ungapped_alignment=None,
    word_size=None,
    alignments=500,
    alignment_view=None,
    descriptions=500,
    entrez_links_new_window=None,
    expect_low=None,
    expect_high=None,
    format_entrez_query=None,
    format_object=None,
    format_type="XML",
    ncbi_gi=None,
    results_file=None,
    show_overview=None,
    megablast=None,
):
    """Do a BLAST search using the QBLAST server at NCBI.

    Supports all parameters of the qblast API for Put and Get.
    Some useful parameters:
    program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
    database       Which database to search against (e.g. "nr").
    sequence       The sequence to search.
    ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
    descriptions   Number of descriptions to show.  Def 500.
    alignments     Number of alignments to show.  Def 500.
    expect         An expect value cutoff.  Def 10.0.
    matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
    filter         "none" turns off filtering.  Default no filtering
    format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
    entrez_query   Entrez query to limit Blast search
    hitlist_size   Number of hits to return. Default 50
    megablast      TRUE/FALSE whether to use MEGABLAST algorithm (blastn only)
    service        plain, psi, phi, rpsblast, megablast (lower case)

    Apart from the program name, this function does no checking of the
    validity of the parameters and passes the values to the server as is.
    Parameters left as None are not sent at all.  More help is available at:
    http://www.ncbi.nlm.nih.gov/BLAST/blast_overview.html

    The search is submitted with a "Put" request, the request identifier
    (RID) is read from the reply, and the server is then polled with "Get"
    requests until the reply no longer reports a pending status.

    Returns a StringIO object wrapping the text of the final reply.

    Raises AssertionError if program is not one of the five names listed
    above.  Exceptions raised by _parse_qblast_ref_page or by the URL
    opener are not caught here and propagate to the caller.
    """
    import time

    # Resolve the URL helpers for either major Python version so that the
    # function does not depend on the Python 2 only urllib2 module.
    try:  # Python 3
        from urllib.parse import urlencode
        from urllib.request import Request, urlopen
    except ImportError:  # Python 2
        from urllib import urlencode
        from urllib2 import Request, urlopen

    # Kept as an assert so callers still see AssertionError for a bad name.
    assert program in ["blastn", "blastp", "blastx", "tblastn", "tblastx"], (
        "Unsupported BLAST program: %r" % (program,)
    )

    blast_url = "http://blast.ncbi.nlm.nih.gov/Blast.cgi"
    headers = {"User-Agent": "BiopythonClient"}

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ("AUTO_FORMAT", auto_format),
        ("COMPOSITION_BASED_STATISTICS", composition_based_statistics),
        ("DATABASE", database),
        ("DB_GENETIC_CODE", db_genetic_code),
        ("ENDPOINTS", endpoints),
        ("ENTREZ_QUERY", entrez_query),
        ("EXPECT", expect),
        ("FILTER", filter),
        ("GAPCOSTS", gapcosts),
        ("GENETIC_CODE", genetic_code),
        ("HITLIST_SIZE", hitlist_size),
        ("I_THRESH", i_thresh),
        ("LAYOUT", layout),
        ("LCASE_MASK", lcase_mask),
        ("MEGABLAST", megablast),
        ("MATRIX_NAME", matrix_name),
        ("NUCL_PENALTY", nucl_penalty),
        ("NUCL_REWARD", nucl_reward),
        ("OTHER_ADVANCED", other_advanced),
        ("PERC_IDENT", perc_ident),
        ("PHI_PATTERN", phi_pattern),
        ("PROGRAM", program),
        # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ("QUERY", sequence),
        ("QUERY_FILE", query_file),
        ("QUERY_BELIEVE_DEFLINE", query_believe_defline),
        ("QUERY_FROM", query_from),
        ("QUERY_TO", query_to),
        # ('RESULTS_FILE',...), - Can we use this parameter?
        ("SEARCHSP_EFF", searchsp_eff),
        ("SERVICE", service),
        ("THRESHOLD", threshold),
        ("UNGAPPED_ALIGNMENT", ungapped_alignment),
        ("WORD_SIZE", word_size),
        ("CMD", "Put"),
    ]
    # Only send the options the caller actually set.
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    handle = urlopen(Request(blast_url, message, headers))
    try:
        # The estimated run time (RTOE) is parsed and validated by the
        # helper but is not used to schedule the polling below.
        rid, _rtoe = _parse_qblast_ref_page(handle)
    finally:
        handle.close()

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    parameters = [
        ("ALIGNMENTS", alignments),
        ("ALIGNMENT_VIEW", alignment_view),
        ("DESCRIPTIONS", descriptions),
        ("ENTREZ_LINKS_NEW_WINDOW", entrez_links_new_window),
        ("EXPECT_LOW", expect_low),
        ("EXPECT_HIGH", expect_high),
        ("FORMAT_ENTREZ_QUERY", format_entrez_query),
        ("FORMAT_OBJECT", format_object),
        ("FORMAT_TYPE", format_type),
        ("NCBI_GI", ncbi_gi),
        ("RID", rid),
        ("RESULTS_FILE", results_file),
        ("SERVICE", service),
        ("SHOW_OVERVIEW", show_overview),
        ("CMD", "Get"),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(urlencode(query))

    # Poll NCBI until the results are ready.  Use a 3 second wait
    # NOTE(review): NCBI's usage guidelines ask clients not to poll a
    # single RID more often than once a minute; the 3 second interval is
    # kept here to preserve existing behaviour - confirm before changing.
    delay = 3.0
    previous = time.time()
    while True:
        # Sleep only for whatever is left of the interval since the
        # previous request, then record when this request goes out.
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current

        handle = urlopen(Request(blast_url, message, headers))
        try:
            results = _as_string(handle.read())
        finally:
            handle.close()

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
コード例 #28
0
ファイル: NCBIWWW.py プロジェクト: Benjamin-Lee/biopython
def _parse_qblast_ref_page(handle):
    """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE).

    The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably
    'Request Time of Execution' and RID would be 'Request Identifier'.

    The handle is read in full.  The values are taken from the first
    "RID =" and "RTOE =" entries in the page, each running to the end of
    its line (or to the end of the page if that line is the last one).

    Returns the RID as a string and the RTOE as an integer.

    Raises ValueError if either value is missing or empty, or if the RTOE
    is not an integer.  When both are missing the page is searched for an
    NCBI error message, which is included in the exception text if found.
    """
    s = _as_string(handle.read())
    rid = _qblast_page_value(s, "RID =")
    rtoe = _qblast_page_value(s, "RTOE =")

    if not rid and not rtoe:
        # Can we reliably extract the error message from the HTML page?
        # e.g.  "Message ID#24 Error: Failed to read the Blast query:
        #       Nucleotide FASTA provided for protein sequence"
        # or    "Message ID#32 Error: Query contains no data: Query
        #       contains no sequence data"
        #
        # The message used to occur inside a <div class="error msInf">
        # entry; in spring 2010 the markup was <p class="error"> instead.
        # Try the known layouts in that order.
        known_markup = (
            ('<div class="error msInf">', "</div>"),
            ('<p class="error">', "</p>"),
        )
        for open_tag, close_tag in known_markup:
            i = s.find(open_tag)
            if i == -1:
                continue
            msg = s[i + len(open_tag):].strip()
            msg = msg.split(close_tag, 1)[0].split("\n", 1)[0].strip()
            if msg:
                raise ValueError("Error message from NCBI: %s" % msg)
        # Generic search based on the way the error messages start:
        i = s.find('Message ID#')
        if i != -1:
            # Break the message at the first HTML tag
            msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip()
            raise ValueError("Error message from NCBI: %s" % msg)
        # We didn't recognise the error layout :(
        raise ValueError("No RID and no RTOE found in the 'please wait' page, "
                         "there was probably an error in your request but we "
                         "could not extract a helpful error message.")
    elif not rid:
        # Can this happen?
        raise ValueError("No RID found in the 'please wait' page."
                         " (although RTOE = %s)" % repr(rtoe))
    elif not rtoe:
        # Can this happen?
        raise ValueError("No RTOE found in the 'please wait' page."
                         " (although RID = %s)" % repr(rid))

    try:
        return rid, int(rtoe)
    except ValueError:
        raise ValueError("A non-integer RTOE found in "
                         "the 'please wait' page, %s" % repr(rtoe))


def _qblast_page_value(page, label):
    """Return the stripped text following label on its line (PRIVATE).

    Returns None if label does not occur in page.
    """
    start = page.find(label)
    if start == -1:
        return None
    start += len(label)
    end = page.find("\n", start)
    if end == -1:
        # The value sits on the last line and that line has no trailing
        # newline; slicing with -1 here would cut off its final character.
        end = len(page)
    return page[start:end].strip()