asis             549 TCTTCTTACTCTTAGGAGGATGGGCGCTAGAAAGAGTTTTAAGAGGGTGT    598
                                                                       
asis             311 --------------------------------------------------    311

asis             599 GAAAGGGGGTTAATAGC    615
                                      
asis             311 -----------------    311


#---------------------------------------
#---------------------------------------"""

    from Bio._py3k import StringIO

    alignments = list(EmbossIterator(StringIO(pair_example)))
    assert len(alignments) == 1
    assert len(alignments[0]) == 2
    assert [r.id for r in alignments[0]] \
           == ["IXI_234", "IXI_235"]

    alignments = list(EmbossIterator(StringIO(simple_example)))
    assert len(alignments) == 1
    assert len(alignments[0]) == 4
    assert [r.id for r in alignments[0]] \
           == ["IXI_234", "IXI_235", "IXI_236", "IXI_237"]

    alignments = list(EmbossIterator(StringIO(pair_example + simple_example)))
    assert len(alignments) == 2
    assert len(alignments[0]) == 2
    assert len(alignments[1]) == 4
Beispiel #2
0
def qblast(program, database, sequence, url_base=NCBI_BLAST_URL,
           auto_format=None, composition_based_statistics=None,
           db_genetic_code=None, endpoints=None, entrez_query='(none)',
           expect=10.0, filter=None, gapcosts=None, genetic_code=None,
           hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
           matrix_name=None, nucl_penalty=None, nucl_reward=None,
           other_advanced=None, perc_ident=None, phi_pattern=None,
           query_file=None, query_believe_defline=None, query_from=None,
           query_to=None, searchsp_eff=None, service=None, threshold=None,
           ungapped_alignment=None, word_size=None,
           alignments=500, alignment_view=None, descriptions=500,
           entrez_links_new_window=None, expect_low=None, expect_high=None,
           format_entrez_query=None, format_object=None, format_type='XML',
           ncbi_gi=None, results_file=None, show_overview=None, megablast=None,
           template_type=None, template_length=None,
           ):
    """BLAST search using NCBI's QBLAST server or a cloud service provider.

    Supports all parameters of the qblast API for Put and Get.

    Please note that BLAST on the cloud supports the NCBI-BLAST Common
    URL API (http://ncbi.github.io/blast-cloud/dev/api.html). To
    use this feature, please set url_base to
    'http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi' and
    format_object='Alignment'. For more details, please see
    https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast

    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show.  Def 500.
     - alignments     Number of alignments to show.  Def 500.
     - expect         An expect value cutoff.  Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering.  Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use MEga BLAST algorithm (blastn only)
     - service        plain, psi, phi, rpsblast, megablast (lower case)

    Only the program name is checked locally; every other parameter that
    is not None is passed to the server as is.  More help is available at:
    https://ncbi.github.io/blast-cloud/dev/api.html

    Returns a StringIO handle holding the text of the final results page.

    Raises ValueError if program is not one of the five supported names.
    """
    import time

    # Validate with a real exception: an assert would be stripped under -O
    # and the bad program name would silently be sent to the server.
    programs = ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']
    if program not in programs:
        raise ValueError("Program specified is %r. Expected one of %s"
                         % (program, ", ".join(programs)))

    headers = {"User-Agent": "BiopythonClient"}

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        # ('RESULTS_FILE',...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('TEMPLATE_TYPE', template_type),
        ('TEMPLATE_LENGTH', template_length),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
        ]
    # Parameters left as None are omitted from the request entirely.
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    handle = _urlopen(_Request(url_base, message, headers))
    try:
        # Only the request ID is needed below; the second value is unused.
        rid, _ = _parse_qblast_ref_page(handle)
    finally:
        # Release the connection whether or not the reference page parsed.
        handle.close()

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.  Use a backoff delay from 2 - 120 second wait
    delay = 2.0
    previous = time.time()
    while True:
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current
        # Grow the delay by half each round, capped at two minutes.
        delay = min(delay + .5 * delay, 120)

        handle = _urlopen(_Request(url_base, message, headers))
        try:
            results = _as_string(handle.read())
        finally:
            # Each poll opens a fresh connection; do not leak them.
            handle.close()

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
Beispiel #3
0
>>><<<


579 residues in 3 query   sequences
45119 residues in 180 library sequences
 Scomplib [34.26]
 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008
 Total Scan time:  0.020 Total Display time:  0.010

Function used was FASTA [version 34.26 January 12, 2007]

"""

    from Bio._py3k import StringIO

    alignments = list(FastaM10Iterator(StringIO(simple_example)))
    assert len(alignments) == 4, len(alignments)
    assert len(alignments[0]) == 2
    for a in alignments:
        print("Alignment %i sequences of length %i"
              % (len(a), a.get_alignment_length()))
        for r in a:
            print("%s %s %i" % (r.seq, r.id, r.annotations["original_length"]))
        # print(a.annotations)
    print("Done")

    import os
    path = "../../Tests/Fasta/"
    files = sorted(f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10")
    for filename in files:
        if os.path.splitext(filename)[-1] == ".m10":
Beispiel #4
0
 def __str__(self):
     """Return the text that save() writes for this object."""
     from Bio._py3k import StringIO
     text_buffer = StringIO()
     save(self, text_buffer)
     # getvalue() yields the whole buffer, same as rewinding and reading
     return text_buffer.getvalue()
Beispiel #5
0
V_Harveyi_PATH                 LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI
B_subtilis_YXEM                LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI
B_subtilis_GlnH_homo_YCKK      LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV
YA80_HAEIN                     LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI
FLIY_ECOLI                     LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL
E_coli_GlnH                    LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV
Deinococcus_radiodurans        LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII
HISJ_E_COLI                    LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
HISJ_E_COLI                    LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
                               *.: . *        .  *     *:          :

"""

    from Bio._py3k import StringIO

    alignments = list(ClustalIterator(StringIO(aln_example1)))
    assert 1 == len(alignments)
    assert alignments[0]._version == "1.81"
    alignment = alignments[0]
    assert 2 == len(alignment)
    assert alignment[0].id == "gi|4959044|gb|AAD34209.1|AF069"
    assert alignment[1].id == "gi|671626|emb|CAA85685.1|"
    assert str(alignment[0].seq) == \
          "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \
          "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \
          "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \
          "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \
          "VPTTRAQRRA"

    alignments = list(ClustalIterator(StringIO(aln_example2)))
    assert 1 == len(alignments)
Beispiel #6
0
 def parse_str(self, string):
     """Wrap the text in a StringIO handle and hand it to parse()."""
     text_handle = StringIO(string)
     return self.parse(text_handle)
Beispiel #7
0
 def test_newick_read_scinot(self):
     """Branch lengths written in scientific notation are parsed."""
     newick_text = "(foo:1e-1,bar:0.1)"
     parsed_tree = Phylo.read(StringIO(newick_text), 'newick')
     # First child of the root clade is the 'foo' leaf with length 1e-1
     first_child = parsed_tree.clade[0]
     self.assertEqual(first_child.name, 'foo')
     self.assertAlmostEqual(first_child.branch_length, 0.1)
 def test_three(self):
     """Check aln_example3 gives one alignment reporting version 2.0.9."""
     handle = StringIO(aln_example3)
     parsed = list(ClustalIterator(handle))
     self.assertEqual(1, len(parsed))
     only_alignment = parsed[0]
     self.assertEqual(only_alignment._version, "2.0.9")
 def test_kalign_header(self):
     """Check the Kalign header example parses to an alignment of length 2."""
     handle = StringIO(aln_example4)
     # Only the first alignment from the iterator is inspected
     first_alignment = next(ClustalIterator(handle))
     self.assertEqual(2, len(first_alignment))
    def get_raw_check(self, filename, format, alphabet, comp):
        """Check SeqIO.index() and its get_raw() against the file contents.

        Arguments:
         - filename - path of the sequence file to index.
         - format - SeqIO format name used for parsing and indexing.
         - alphabet - alphabet passed on to SeqIO.parse/index/read.
         - comp - if true, the raw bytes are read with gzip.open and the
           records are parsed from a handle made by gzip_open.

        The index is built with a lower-casing key_function, so this also
        checks the key_function. File handles and the index are closed
        even when a read, parse or assertion fails.
        """
        # Also checking the key_function here
        if comp:
            h = gzip.open(filename, "rb")
            try:
                raw_file = h.read()
            finally:
                h.close()
            h = gzip_open(filename, format)
            try:
                id_list = [
                    rec.id.lower() for rec in SeqIO.parse(h, format, alphabet)
                ]
            finally:
                h.close()
        else:
            with open(filename, "rb") as h:
                raw_file = h.read()
            id_list = [
                rec.id.lower()
                for rec in SeqIO.parse(filename, format, alphabet)
            ]

        if format == "sff":
            # Silence parser warnings while indexing plain SFF files
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', BiopythonParserWarning)
                rec_dict = SeqIO.index(filename,
                                       format,
                                       alphabet,
                                       key_function=lambda x: x.lower())
        else:
            rec_dict = SeqIO.index(filename,
                                   format,
                                   alphabet,
                                   key_function=lambda x: x.lower())

        try:
            self.assertEqual(set(id_list), set(rec_dict))
            self.assertEqual(len(id_list), len(rec_dict))
            self._check_raw_records(rec_dict, id_list, raw_file,
                                    format, alphabet)
        finally:
            # Release the index even if one of the checks above failed
            rec_dict.close()

    def _check_raw_records(self, rec_dict, id_list, raw_file, format,
                           alphabet):
        """Re-parse each record's raw bytes and compare with the indexed one."""
        for key in id_list:
            self.assertTrue(key in rec_dict)
            self.assertEqual(key, rec_dict[key].id.lower())
            self.assertEqual(key, rec_dict.get(key).id.lower())
            raw = rec_dict.get_raw(key)
            self.assertTrue(isinstance(raw, bytes),
                            "Didn't get bytes from %s get_raw" % format)
            self.assertTrue(raw.strip())
            self.assertTrue(raw in raw_file)
            rec1 = rec_dict[key]
            # Following isn't very elegant, but it lets me test the
            # __getitem__ SFF code is working.
            if format in SeqIO._BinaryFormats:
                handle = BytesIO(raw)
            else:
                handle = StringIO(_bytes_to_string(raw))
            if format in ("sff", "sff-trim"):
                # Both SFF variants share the reader; only trim differs
                proxy = rec_dict._proxy
                rec2 = SeqIO.SffIO._sff_read_seq_record(
                    handle,
                    proxy._flows_per_read,
                    proxy._flow_chars,
                    proxy._key_sequence,
                    proxy._alphabet,
                    trim=(format == "sff-trim"))
            elif format == "uniprot-xml":
                self.assertTrue(raw.startswith(_as_bytes("<entry ")))
                self.assertTrue(raw.endswith(_as_bytes("</entry>")))
                # Currently the __getitem__ method uses this
                # trick too, but we hope to fix that later
                raw = """<?xml version='1.0' encoding='UTF-8'?>
                <uniprot xmlns="http://uniprot.org/uniprot"
                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                xsi:schemaLocation="http://uniprot.org/uniprot
                http://www.uniprot.org/support/docs/uniprot.xsd">
                %s
                </uniprot>
                """ % _bytes_to_string(raw)
                handle = StringIO(raw)
                rec2 = SeqIO.read(handle, format, alphabet)
            else:
                rec2 = SeqIO.read(handle, format, alphabet)
            self.assertEqual(True, compare_record(rec1, rec2))
 def test_empy(self):
     """Checking empty file."""
     empty_handle = StringIO("")
     parsed = list(ClustalIterator(empty_handle))
     # An empty input must give no alignments at all
     self.assertEqual(0, len(parsed))
Beispiel #12
0
                            self.xml_generator.endElement("property")

                elif isinstance(value, (int, float, basestring)):

                    attr = {"name": key, "value": str(value)}
                    self.xml_generator.startElement(
                        "property", AttributesImpl(attr))
                    self.xml_generator.endElement("property")

if __name__ == "__main__":
    print("Running quick self test")

    import sys
    from Bio import SeqIO
    from Bio._py3k import StringIO

    # Load the example records from disk
    with open("Tests/SeqXML/protein_example.xml", "r") as xml_file:
        loaded = list(SeqIO.parse(xml_file, "seqxml"))

    # Write them once to memory and once to the terminal
    memory_handle = StringIO()
    for target in (memory_handle, sys.stdout):
        SeqIO.write(loaded, target, "seqxml")
    print("")

    # Parse the in-memory copy back and show what was recovered
    memory_handle.seek(0)
    reloaded = list(SeqIO.parse(memory_handle, "seqxml"))
    SeqIO.write(reloaded, sys.stdout, "seqxml")
    print("")
Beispiel #13
0
 def test_invalid_format(self):
     """Check convert file format checking."""
     # One bad output format, then one bad input format
     format_pairs = (("genbank", "invalid_for_testing"),
                     ("invalid_for_testing", "fasta"))
     for in_format, out_format in format_pairs:
         self.assertRaises(ValueError, TogoWS.convert,
                           StringIO("PLACEHOLDER"), in_format, out_format)
            raise ValueError("Need a DNA, RNA or Protein alphabet")


if __name__ == "__main__":
    from Bio._py3k import StringIO
    print("Quick self test")
    print("")
    print("Repeated names without a TAXA block")
    handle = StringIO("""#NEXUS
    [TITLE: NoName]

    begin data;
    dimensions ntax=4 nchar=50;
    format interleave datatype=protein   gap=- symbols="FSTNKEYVQMCLAWPHDRIG";

    matrix
    CYS1_DICDI          -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 
    ALEU_HORVU          MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 
    CATH_HUMAN          ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK----
    CYS1_DICDI          -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X
    ;
    end; 
    """)  # noqa for pep8 W291 trailing whitespace
    for a in NexusIterator(handle):
        print(a)
        for r in a:
            print("%r %s %s" % (r.seq, r.name, r.id))
    print("Done")

    print("")
    print("Repeated names with a TAXA block")
Beispiel #15
0
 def setUp(self):
     """Swap sys.stdout for an in-memory buffer, keeping the original."""
     original_stream, capture = sys.stdout, StringIO()
     self.old_stdout = original_stream
     sys.stdout = capture
Beispiel #16
0
 def from_string(cls, treetext):
     """Build an instance of cls from a StringIO handle wrapping treetext."""
     return cls(StringIO(treetext))
Beispiel #17
0
 def test_parse_qblast_ref_page(self):
     """Check the saved HTML page makes _parse_qblast_ref_page raise."""
     with open("Blast/html_msgid_29_blastx_001.html", "r") as html_file:
         page_text = html_file.read()
     # Parse from memory so the file is already closed when the check runs
     page_handle = StringIO(page_text)
     self.assertRaises(ValueError, NCBIWWW._parse_qblast_ref_page,
                       page_handle)
Beispiel #18
0
def _do_segmentation(cnarr,
                     method,
                     threshold,
                     variants=None,
                     skip_low=False,
                     skip_outliers=10,
                     min_weight=0,
                     save_dataframe=False,
                     rscript_path="Rscript"):
    """Infer copy number segments from the given coverage table.

    Bins are filtered on a copy of `cnarr`, then segmented with `method`.

    Arguments:
     - cnarr - coverage table to segment; returned unchanged if empty.
     - method - 'haar', 'none', 'cbs', 'flasso', or a name starting
       with 'hmm'.
     - threshold - passed to the selected segmentation method.
     - variants - if given and the method is not an HMM, allele
       frequencies are re-segmented within each segment and a 'baf'
       column is added.
     - skip_low - if true, drop low-coverage bins first.
     - skip_outliers - if non-zero, passed to drop_outliers as its
       final argument.
     - min_weight - if non-zero, drop bins whose weight is below it;
       otherwise drop bins whose weight is exactly zero. Ignored when
       the table has no 'weight' column.
     - save_dataframe - if true, also return the raw R output.
     - rscript_path - executable used to run the R scripts for the
       'cbs' and 'flasso' methods.

    Returns the segment array, or the tuple (segment array, seg_out)
    when save_dataframe is true. seg_out is the R script output for
    'cbs'/'flasso' and an empty string for the other methods.

    Raises ValueError if method is not recognised.
    """
    if not len(cnarr):
        return cnarr

    filtered_cn = cnarr.copy()
    # Filter out bins with no or near-zero sequencing coverage
    if skip_low:
        filtered_cn = filtered_cn.drop_low_coverage(verbose=False)
    # Filter by distance from rolling quantiles
    if skip_outliers:
        filtered_cn = drop_outliers(filtered_cn, 50, skip_outliers)
    # Filter by bin weights. The column is optional (the flasso branch
    # below already allows for its absence), so only filter when present.
    if 'weight' in filtered_cn:
        if min_weight:
            weight_too_low = (filtered_cn["weight"] < min_weight).fillna(True)
        else:
            weight_too_low = (filtered_cn["weight"] == 0).fillna(True)
        n_weight_too_low = weight_too_low.sum() if len(weight_too_low) else 0
        if n_weight_too_low:
            filtered_cn = filtered_cn[~weight_too_low]
            if min_weight:
                logging.debug("Dropped %d bins with weight below %s",
                              n_weight_too_low, min_weight)
            else:
                logging.debug("Dropped %d bins with zero weight",
                              n_weight_too_low)

    if len(filtered_cn) != len(cnarr):
        msg = ("Dropped %d / %d bins" %
               (len(cnarr) - len(filtered_cn), len(cnarr)))
        # Name the chromosome only if first and last bins agree on it
        if cnarr["chromosome"].iat[0] == cnarr["chromosome"].iat[-1]:
            msg += " on chromosome " + str(cnarr["chromosome"].iat[0])
        logging.info(msg)
    if not len(filtered_cn):
        return filtered_cn

    seg_out = ""
    if method == 'haar':
        segarr = haar.segment_haar(filtered_cn, threshold)

    elif method == 'none':
        segarr = none.segment_none(filtered_cn)

    elif method.startswith('hmm'):
        segarr = hmm.segment_hmm(filtered_cn, method, threshold)

    elif method in ('cbs', 'flasso'):
        # Run R scripts to calculate copy number segments
        rscript = {
            'cbs': cbs.CBS_RSCRIPT,
            'flasso': flasso.FLASSO_RSCRIPT,
        }[method]

        filtered_cn['start'] += 1  # Convert to 1-indexed coordinates for R
        with tempfile.NamedTemporaryFile(suffix='.cnr', mode="w+t") as tmp:
            # TODO tabio.write(filtered_cn, tmp, 'seg')
            filtered_cn.data.to_csv(tmp,
                                    index=False,
                                    sep='\t',
                                    float_format='%.6g',
                                    mode="w+t")
            # Make sure the table is on disk before R opens it by name
            tmp.flush()
            script_strings = {
                'probes_fname': tmp.name,
                'sample_id': cnarr.sample_id,
                'threshold': threshold,
            }
            with core.temp_write_text(rscript % script_strings,
                                      mode='w+t') as script_fname:
                seg_out = core.call_quiet(rscript_path, '--vanilla',
                                          script_fname)
        # Convert R dataframe contents (SEG) to a proper CopyNumArray
        # NB: Automatically shifts 'start' back from 1- to 0-indexed
        segarr = tabio.read(StringIO(seg_out.decode()), "seg", into=CNA)
        if method == 'flasso':
            # Merge adjacent bins with same log2 value into segments
            if 'weight' in filtered_cn:
                segarr['weight'] = filtered_cn['weight']
            else:
                segarr['weight'] = 1.0
            segarr = squash_by_groups(segarr, segarr['log2'], by_arm=True)

    else:
        raise ValueError("Unknown method %r" % method)

    segarr.meta = cnarr.meta.copy()
    if variants and not method.startswith('hmm'):
        # Re-segment the variant allele freqs within each segment
        newsegs = [
            haar.variants_in_segment(subvarr, segment, 0.01 * threshold)
            for segment, subvarr in variants.by_ranges(segarr)
        ]
        segarr = segarr.as_dataframe(pd.concat(newsegs))
        segarr['baf'] = variants.baf_by_ranges(segarr)

    segarr = transfer_fields(segarr, cnarr)
    if save_dataframe:
        return segarr, seg_out
    return segarr
Beispiel #19
0
 def test_int_labels(self):
     """Read newick formatted tree with numeric labels."""
     newick_text = '(((0:0.1,1:0.1)0.99:0.1,2:0.1)0.98:0.0);'
     parsed_tree = Phylo.read(StringIO(newick_text), 'newick')
     # Collect the terminal names; they are compared as strings
     leaf_names = set()
     for terminal in parsed_tree.get_terminals():
         leaf_names.add(terminal.name)
     self.assertEqual(leaf_names, set(['0', '1', '2']))
Beispiel #20
0
                 SeqRecord(Seq('AAAA'), id='other_sequence'),]
    alignment = MultipleSeqAlignment(sequences)
    try:
        # This should raise a ValueError
        AlignIO.write(alignment, handle, 'phylip')
        assert False, "Duplicate IDs after truncation are not allowed."
    except ValueError as e:
        # Expected - check the error
        assert "Repeated name 'longsequen'" in str(e)

check_phylip_reject_duplicate()


#Check parsers can cope with an empty file
for t_format in AlignIO._FormatToIterator:
    handle = StringIO()
    alignments = list(AlignIO.parse(handle, t_format))
    assert len(alignments) == 0

#Check writers can cope with no alignments
for t_format in list(AlignIO._FormatToWriter)+list(SeqIO._FormatToWriter):
    handle = StringIO()
    assert 0 == AlignIO.write([], handle, t_format), \
           "Writing no alignments to %s format should work!" \
           % t_format

#Check writers reject non-alignments
list_of_records = list(AlignIO.read(open("Clustalw/opuntia.aln"),"clustal"))
for t_format in list(AlignIO._FormatToWriter)+list(SeqIO._FormatToWriter):
    handle = StringIO()
    try:
Beispiel #21
0
        title = self.clean(record.id)
        seq = self._get_seq_string(record)  # Catches sequence being None
        assert "\t" not in title
        assert "\n" not in title
        assert "\r" not in title
        assert "\t" not in seq
        assert "\n" not in seq
        assert "\r" not in seq
        self.handle.write("%s\t%s\n" % (title, seq))


if __name__ == "__main__":
    print("Running quick self test")
    from Bio._py3k import StringIO

    # Two valid lines followed by a trailing blank line, which should
    # be ignored by the parser
    valid_text = "Alpha\tAAAAAAA\nBeta\tCCCCCCC\n\n"
    parsed = list(TabIterator(StringIO(valid_text)))
    assert len(parsed) == 2

    # A third tab-separated column makes the first line invalid
    invalid_text = "Alpha\tAAAAAAA\tExtra\nBeta\tCCCCCCC\n"
    try:
        parsed = list(TabIterator(StringIO(invalid_text)))
    except ValueError:
        # Good, the bad line was rejected
        pass
    else:
        assert False, "Should have reject this invalid example!"

    print("Done")
Beispiel #22
0
def _parse_written_alignments(handle, format, alignments, **parse_args):
    """Rewind handle and parse it back, reporting the raw text on failure."""
    handle.flush()
    handle.seek(0)
    try:
        return list(AlignIO.parse(handle=handle, format=format, **parse_args))
    except ValueError as e:
        # This is BAD.  We can't read our own output.
        # I want to see the output when called from the test harness,
        # run_tests.py (which can be funny about new lines on Windows)
        handle.seek(0)
        # Report the alignments that were written: nothing was parsed, so
        # there is no read-back result to show.
        raise ValueError("%s\n\n%s\n\n%s"
                         % (str(e), repr(handle.read()), repr(alignments)))


def check_simple_write_read(alignments, indent=" "):
    """Write the alignments in each test format and read them back.

    Arguments:
     - alignments - non-empty list of alignments to round-trip.
     - indent - prefix for the progress lines that are printed.

    For every format in test_write_read_align_with_seq_count the
    alignments are written to a StringIO handle. They are then parsed
    back with a seq_count hint (when all alignments have the same number
    of records) and/or without it (for formats listed in
    test_write_read_alignment_formats), and compared with the originals.
    A ValueError while writing is printed and that format is skipped.

    Raises ValueError if the written text cannot be parsed back; the
    message includes the parser error, the raw text and the alignments
    that were written.
    """
    # The seq_count hint is only usable when every alignment has the same
    # number of records; this does not depend on the format being tested.
    records_per_alignment = len(alignments[0])
    if any(len(a) != records_per_alignment for a in alignments):
        records_per_alignment = None

    for format in test_write_read_align_with_seq_count:
        # Can we expect this format to work?
        if not records_per_alignment \
                and format not in test_write_read_alignment_formats:
            continue

        print(indent+"Checking can write/read as '%s' format" % format)

        # Going to write to a handle...
        handle = StringIO()

        try:
            c = AlignIO.write(alignments, handle=handle, format=format)
            assert c == len(alignments)
        except ValueError as e:
            # This is often expected to happen, for example when we try and
            # write sequences of different lengths to an alignment file.
            print(indent+"Failed: %s" % str(e))
            # Carry on to the next format:
            continue

        # First, try with the seq_count
        if records_per_alignment:
            alignments2 = _parse_written_alignments(
                handle, format, alignments, seq_count=records_per_alignment)
            simple_alignment_comparison(alignments, alignments2, format)

        if format in test_write_read_alignment_formats:
            # Don't need the seq_count
            alignments2 = _parse_written_alignments(
                handle, format, alignments)
            simple_alignment_comparison(alignments, alignments2, format)

        if len(alignments) > 1:
            # Try writing just one Alignment (not a list)
            handle = StringIO()
            SeqIO.write(alignments[0], handle, format)
            assert handle.getvalue() == alignments[0].format(format)
Beispiel #23
0
 def get(self, offset):
     """Returns SeqRecord."""
     # Should be overridden for binary file formats etc:
     raw_bytes = self.get_raw(offset)
     text_handle = StringIO(_bytes_to_string(raw_bytes))
     return self._parse(text_handle)
Beispiel #24
0
def qblast(
    program,
    database,
    sequence,
    url_base=NCBI_BLAST_URL,
    auto_format=None,
    composition_based_statistics=None,
    db_genetic_code=None,
    endpoints=None,
    entrez_query='(none)',
    expect=10.0,
    filter=None,
    gapcosts=None,
    genetic_code=None,
    hitlist_size=50,
    i_thresh=None,
    layout=None,
    lcase_mask=None,
    matrix_name=None,
    nucl_penalty=None,
    nucl_reward=None,
    other_advanced=None,
    perc_ident=None,
    phi_pattern=None,
    query_file=None,
    query_believe_defline=None,
    query_from=None,
    query_to=None,
    searchsp_eff=None,
    service=None,
    threshold=None,
    ungapped_alignment=None,
    word_size=None,
    short_query=None,
    alignments=500,
    alignment_view=None,
    descriptions=500,
    entrez_links_new_window=None,
    expect_low=None,
    expect_high=None,
    format_entrez_query=None,
    format_object=None,
    format_type='XML',
    ncbi_gi=None,
    results_file=None,
    show_overview=None,
    megablast=None,
    template_type=None,
    template_length=None,
):
    """BLAST search using NCBI's QBLAST server or a cloud service provider.

    Supports all parameters of the old qblast API for Put and Get.

    Please note that NCBI uses the new Common URL API for BLAST searches
    on the internet (http://ncbi.github.io/blast-cloud/dev/api.html). Thus,
    some of the parameters used by this function are not (or are no longer)
    officially supported by NCBI. Although they are still functioning, this
    may change in the future.

    The Common URL API (http://ncbi.github.io/blast-cloud/dev/api.html) allows
    doing BLAST searches on cloud servers. To use this feature, please set
    ``url_base='http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi'``
    and ``format_object='Alignment'``. For more details, please see
    https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast

    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show.  Def 500.
     - alignments     Number of alignments to show.  Def 500.
     - expect         An expect value cutoff.  Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering.  Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use Mega BLAST algorithm (blastn only)
     - short_query    TRUE/FALSE whether to adjust the search parameters for a
                      short query sequence. Note that this will override
                      manually set parameters like word size and e value. Turns
                      off when sequence length is > 30 residues. Default: None.
     - service        plain, psi, phi, rpsblast, megablast (lower case)

    Apart from ``program`` (which must be one of the five names listed above,
    otherwise ValueError is raised), this function does no checking of the
    validity of the parameters and passes the values to the server as is.
    Parameters left as None are omitted from the request.  More help is
    available at: https://ncbi.github.io/blast-cloud/dev/api.html

    Returns a StringIO handle holding the text of the final results page.
    """
    import time

    programs = ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']
    if program not in programs:
        raise ValueError("Program specified is %s. Expected one of %s" %
                         (program, ", ".join(programs)))

    # SHORT_QUERY_ADJUST throws an error when using blastn (wrong parameter
    # assignment from NCBIs side).
    # Thus we set the (known) parameters directly:
    if short_query and program == 'blastn':
        # Never forward SHORT_QUERY_ADJUST itself for blastn.
        short_query = None
        # We only use the 'short-query' parameters for short sequences:
        if len(sequence) < 31:
            expect = 1000
            word_size = 7
            nucl_reward = 1
            filter = None
            lcase_mask = None
            warnings.warn(
                '"SHORT_QUERY_ADJUST" is incorrectly implemented '
                '(by NCBI) for blastn. We bypass the problem by '
                'manually adjusting the search parameters. Thus, '
                'results may slightly differ from web page '
                'searches.', BiopythonWarning)

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        # ('RESULTS_FILE',...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('SHORT_QUERY_ADJUST', short_query),
        ('TEMPLATE_TYPE', template_type),
        ('TEMPLATE_LENGTH', template_length),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
    ]
    # Only parameters the caller actually set are sent to the server.
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request(url_base, message, {"User-Agent": "BiopythonClient"})
    handle = _urlopen(request)
    # try/finally rather than "with": the response object is not guaranteed
    # to be a context manager on every supported Python version.
    try:
        # Only the request ID (rid) is needed below; rtoe is not used here.
        rid, rtoe = _parse_qblast_ref_page(handle)
    finally:
        handle.close()

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.
    # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
    # 1. Do not contact the server more often than once every 10 seconds.
    # 2. Do not poll for any single RID more often than once a minute.
    # 3. Use the URL parameter email and tool, so that the NCBI
    #    can contact you if there is a problem.
    # 4. Run scripts weekends or between 9 pm and 5 am Eastern time
    #    on weekdays if more than 50 searches will be submitted.
    # --
    # Could start with a 10s delay, but expect most short queries
    # will take longer thus at least 70s with delay. Therefore,
    # start with 20s delay, thereafter once a minute.
    delay = 20  # seconds
    while True:
        current = time.time()
        # The time of the previous poll is kept as an attribute on the
        # function itself so it is shared across calls.  Fall back to 0
        # (i.e. "never polled") if nothing has initialised it yet, instead
        # of failing with AttributeError on the first call.
        wait = getattr(qblast, "_previous", 0) + delay - current
        if wait > 0:
            time.sleep(wait)
            qblast._previous = current + wait
        else:
            qblast._previous = current
        # delay by at least 60 seconds only if running the request against the public NCBI API
        if delay < 60 and url_base == NCBI_BLAST_URL:
            # Wasn't a quick return, must wait at least a minute
            delay = 60

        request = _Request(url_base, message,
                           {"User-Agent": "BiopythonClient"})
        handle = _urlopen(request)
        # Close every polling response once read so repeated polls do not
        # accumulate open connections.
        try:
            results = _as_string(handle.read())
        finally:
            handle.close()

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        # Extract the text between "Status=" and the end of that line.
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break
    return StringIO(results)
Beispiel #25
0
 def from_string(cls, treetext):
     """Build the Newick Tree class from ``treetext`` via an in-memory handle."""
     return cls(StringIO(treetext))
Beispiel #26
0
            count += 1
            print_record(record)
        assert count == 1
        print(str(record.__class__))

    if os.path.isfile(faa_filename):
        print("--------")
        print("FastaIterator (multiple sequences)")
        iterator = FastaIterator(open(faa_filename, "r"),
                                 alphabet=generic_protein,
                                 title2ids=genbank_name_function)
        count = 0
        for record in iterator:
            count += 1
            print_record(record)
            break
        assert count > 0
        print(str(record.__class__))

    from Bio._py3k import StringIO
    print("--------")
    print("FastaIterator (empty input file)")
    #Just to make sure no errors happen
    iterator = FastaIterator(StringIO(""))
    count = 0
    for record in iterator:
        count += 1
    assert count == 0

    print("Done")
Beispiel #27
0
 def parse_str(self, string):
     """Parse ``string`` by handing it to ``self.parse`` as a StringIO handle."""
     handle = StringIO(string)
     return self.parse(handle)
# is_blank_line checks.  The trailing comment on each line records the value
# the original author expected to be printed.
print(is_blank_line('', allow_spaces=1))  # 1
print(is_blank_line('', allow_spaces=0))  # 1
print(is_blank_line(string.whitespace, allow_spaces=1))  # 1
print(is_blank_line('hello'))  # 0
print(is_blank_line('hello', allow_spaces=1))  # 0
print(is_blank_line('hello', allow_spaces=0))  # 0
print(is_blank_line(string.whitespace, allow_spaces=0))  # 0

# safe_readline

print("Running tests on safe_readline")

# Two lines of input, with no newline after the last one.
data = """This
file"""

# Wrap the in-memory text in an UndoHandle for the reads below.
h = File.UndoHandle(StringIO(data))

safe_readline = ParserSupport.safe_readline
print(safe_readline(h))  # "This"
print(safe_readline(h))  # "file"
# Both lines have now been consumed; the script expects a third read to
# raise ValueError and reports an error if it does not.
try:
    safe_readline(h)
except ValueError:
    print("correctly failed")
else:
    print("ERROR, should have failed")

# safe_peekline

print("Running tests on safe_peekline")
safe_peekline = ParserSupport.safe_peekline
Beispiel #29
0
 def from_string(cls, treetext):
     """Wrap ``treetext`` in a StringIO handle and pass it to ``cls``."""
     return cls(StringIO(treetext))
Beispiel #30
0
def CifAtomIterator(handle):
    """Yield a SeqRecord object for each chain in an mmCIF file.

    The sequences are derived from the 3D structure (ATOM records) rather
    than from sequence listings in the file header.

    Unrecognised three letter amino acid codes (e.g. "CSD") from HETATM entries
    are converted to "X" in the sequence.

    Besides the header information (identical for every record), each
    record carries these chain specific annotations:

    record.annotations["residues"] = List of residue ID strings
    record.annotations["chain"] = Chain ID (typically A, B ,...)
    record.annotations["model"] = Model ID (typically zero)

    Where the residue numbering shows that amino acids are missing from the
    structure, the gap is padded with 'X' characters of matching length in
    the sequence, and None is stored for each padded position in the list
    record.annotations["residues"].

    Most of the hard work is delegated to the Bio.PDB module. The annotation
    information could be improved, but that extra parsing belongs in
    parse_pdb_header rather than in this module.

    Bio.SeqIO calls this internally for the atom based interpretation of
    the mmCIF file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.cif", "cif-atom"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    Equivalently,

    >>> with open("PDB/1A8O.cif") as handle:
    ...     for record in CifAtomIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    An empty input raises ValueError. If the ``_entry.id`` field is absent a
    BiopythonParserWarning is issued and "????" is used as the ID.
    """
    # TODO - Add record.annotations to the doctest, esp the residues (not working?)

    # Imported lazily so that SeqIO does not pull in NumPy until needed.
    from Bio.PDB.MMCIFParser import MMCIFParser
    from Bio.PDB.MMCIF2Dict import MMCIF2Dict

    # PdbAtomIterator can peek at the first line through an UndoHandle to get
    # the PDB ID. For mmCIF the ID lives in the _entry.id field and, AFAIK,
    # the format does not fix the order of fields, so the whole file goes
    # through MMCIF2Dict. The handle's contents are copied into an in-memory
    # StringIO first so that MMCIF2Dict and MMCIFParser can each read them.
    contents = StringIO()
    shutil.copyfileobj(handle, contents)

    # Nothing was copied: the input was empty.
    if not contents.getvalue():
        raise ValueError("Empty file.")

    # First pass: dictionary view of the file, used only to find the ID.
    contents.seek(0)
    fields = MMCIF2Dict(contents)
    if "_entry.id" not in fields:
        warnings.warn("Could not find the '_entry.id' field; can't determine "
                      "PDB ID.", BiopythonParserWarning)
        pdb_id = "????"
    else:
        entry_id = fields["_entry.id"]
        # The field may come back as a list; use its first element then.
        pdb_id = entry_id[0] if isinstance(entry_id, list) else entry_id

    # Second pass: rewind and build the full structure.
    contents.seek(0)
    structure = MMCIFParser().get_structure(pdb_id, contents)
    for chain_record in AtomIterator(pdb_id, structure):
        yield chain_record