asis 549 TCTTCTTACTCTTAGGAGGATGGGCGCTAGAAAGAGTTTTAAGAGGGTGT 598 asis 311 -------------------------------------------------- 311 asis 599 GAAAGGGGGTTAATAGC 615 asis 311 ----------------- 311 #--------------------------------------- #---------------------------------------""" from Bio._py3k import StringIO alignments = list(EmbossIterator(StringIO(pair_example))) assert len(alignments) == 1 assert len(alignments[0]) == 2 assert [r.id for r in alignments[0]] \ == ["IXI_234", "IXI_235"] alignments = list(EmbossIterator(StringIO(simple_example))) assert len(alignments) == 1 assert len(alignments[0]) == 4 assert [r.id for r in alignments[0]] \ == ["IXI_234", "IXI_235", "IXI_236", "IXI_237"] alignments = list(EmbossIterator(StringIO(pair_example + simple_example))) assert len(alignments) == 2 assert len(alignments[0]) == 2 assert len(alignments[1]) == 4
def qblast(program, database, sequence,
           url_base=NCBI_BLAST_URL,
           auto_format=None, composition_based_statistics=None,
           db_genetic_code=None, endpoints=None, entrez_query='(none)',
           expect=10.0, filter=None, gapcosts=None, genetic_code=None,
           hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
           matrix_name=None, nucl_penalty=None, nucl_reward=None,
           other_advanced=None, perc_ident=None, phi_pattern=None,
           query_file=None, query_believe_defline=None, query_from=None,
           query_to=None, searchsp_eff=None, service=None, threshold=None,
           ungapped_alignment=None, word_size=None,
           alignments=500, alignment_view=None, descriptions=500,
           entrez_links_new_window=None, expect_low=None, expect_high=None,
           format_entrez_query=None, format_object=None, format_type='XML',
           ncbi_gi=None, results_file=None, show_overview=None, megablast=None,
           template_type=None, template_length=None,
           ):
    """BLAST search using NCBI's QBLAST server or a cloud service provider.

    Supports all parameters of the qblast API for Put and Get.

    Please note that BLAST on the cloud supports the NCBI-BLAST Common
    URL API (http://ncbi.github.io/blast-cloud/dev/api.html). To
    use this feature, please set url_base to
    'http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi' and
    format_object='Alignment'. For more details, please see
    https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast

    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show. Def 500.
     - alignments     Number of alignments to show. Def 500.
     - expect         An expect value cutoff. Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering. Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML". Def. "XML".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use Mega BLAST algorithm (blastn only)
     - service        plain, psi, phi, rpsblast, megablast (lower case)

    Returns a handle (StringIO) wrapping the raw result text in the
    requested format_type.

    This function does no checking of the validity of the parameters
    and passes the values to the server as is. More help is available at:
    https://ncbi.github.io/blast-cloud/dev/api.html
    """
    import time

    assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        # ('RESULTS_FILE',...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('TEMPLATE_TYPE', template_type),
        ('TEMPLATE_LENGTH', template_length),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
    ]
    # Drop unset parameters so only explicit values reach the server.
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request(url_base,
                       message,
                       {"User-Agent": "BiopythonClient"})
    handle = _urlopen(request)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    # rid = Request ID assigned by NCBI; rtoe = estimated time to completion.
    rid, rtoe = _parse_qblast_ref_page(handle)
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.  Use a backoff delay from 2 - 120 second wait
    delay = 2.0
    previous = time.time()
    while True:
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current
        # Exponential backoff (x1.5 each poll), capped at 120 seconds.
        if delay + .5 * delay <= 120:
            delay += .5 * delay
        else:
            delay = 120

        request = _Request(url_base,
                           message,
                           {"User-Agent": "BiopythonClient"})
        handle = _urlopen(request)
        results = _as_string(handle.read())

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break
    # Results are in; hand them back as a text handle.
    return StringIO(results)
>>><<< 579 residues in 3 query sequences 45119 residues in 180 library sequences Scomplib [34.26] start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008 Total Scan time: 0.020 Total Display time: 0.010 Function used was FASTA [version 34.26 January 12, 2007] """ from Bio._py3k import StringIO alignments = list(FastaM10Iterator(StringIO(simple_example))) assert len(alignments) == 4, len(alignments) assert len(alignments[0]) == 2 for a in alignments: print("Alignment %i sequences of length %i" % (len(a), a.get_alignment_length())) for r in a: print("%s %s %i" % (r.seq, r.id, r.annotations["original_length"])) # print(a.annotations) print("Done") import os path = "../../Tests/Fasta/" files = sorted(f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10") for filename in files: if os.path.splitext(filename)[-1] == ".m10":
def __str__(self):
    """Serialise this object via ``save`` and return the resulting text."""
    from Bio._py3k import StringIO
    buffer = StringIO()
    save(self, buffer)
    return buffer.getvalue()
V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV *.: . * . * *: : """ from Bio._py3k import StringIO alignments = list(ClustalIterator(StringIO(aln_example1))) assert 1 == len(alignments) assert alignments[0]._version == "1.81" alignment = alignments[0] assert 2 == len(alignment) assert alignment[0].id == "gi|4959044|gb|AAD34209.1|AF069" assert alignment[1].id == "gi|671626|emb|CAA85685.1|" assert str(alignment[0].seq) == \ "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \ "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \ "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \ "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \ "VPTTRAQRRA" alignments = list(ClustalIterator(StringIO(aln_example2))) assert 1 == len(alignments)
def parse_str(self, string):
    """Wrap the string in an in-memory handle and delegate to ``parse``."""
    handle = StringIO(string)
    return self.parse(handle)
def test_newick_read_scinot(self):
    """Parse Newick branch lengths in scientific notation."""
    handle = StringIO("(foo:1e-1,bar:0.1)")
    tree = Phylo.read(handle, 'newick')
    first_clade = tree.clade[0]
    self.assertEqual(first_clade.name, 'foo')
    self.assertAlmostEqual(first_clade.branch_length, 0.1)
def test_three(self):
    """Parse the third Clustal example and check its version string."""
    parsed = list(ClustalIterator(StringIO(aln_example3)))
    self.assertEqual(1, len(parsed))
    self.assertEqual(parsed[0]._version, "2.0.9")
def test_kalign_header(self):
    """Make sure we can parse the Kalign header."""
    first_alignment = next(ClustalIterator(StringIO(aln_example4)))
    self.assertEqual(2, len(first_alignment))
def get_raw_check(self, filename, format, alphabet, comp):
    """Index *filename* and verify get_raw output round-trips via the parser.

    Parameters:
     - filename: path to the sequence file to index.
     - format: SeqIO format name (e.g. "fasta", "sff", "uniprot-xml").
     - alphabet: alphabet passed through to SeqIO.
     - comp: if true, the file is gzip-compressed.
    """
    # Also checking the key_function here
    if comp:
        h = gzip.open(filename, "rb")
        raw_file = h.read()
        h.close()
        # gzip_open is a helper defined elsewhere in this test module
        # (not gzip.open) — presumably it picks text/binary mode by
        # format; verify against its definition.
        h = gzip_open(filename, format)
        id_list = [rec.id.lower() for rec in
                   SeqIO.parse(h, format, alphabet)]
        h.close()
    else:
        h = open(filename, "rb")
        raw_file = h.read()
        h.close()
        id_list = [rec.id.lower() for rec in
                   SeqIO.parse(filename, format, alphabet)]

    if format in ["sff"]:
        # SFF parsing can emit warnings we deliberately ignore here.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', BiopythonParserWarning)
            rec_dict = SeqIO.index(filename, format, alphabet,
                                   key_function=lambda x: x.lower())
    else:
        rec_dict = SeqIO.index(filename, format, alphabet,
                               key_function=lambda x: x.lower())

    # The key_function lower-cased every key, so the sets must agree.
    self.assertEqual(set(id_list), set(rec_dict))
    self.assertEqual(len(id_list), len(rec_dict))
    for key in id_list:
        self.assertTrue(key in rec_dict)
        self.assertEqual(key, rec_dict[key].id.lower())
        self.assertEqual(key, rec_dict.get(key).id.lower())
        raw = rec_dict.get_raw(key)
        self.assertTrue(isinstance(raw, bytes),
                        "Didn't get bytes from %s get_raw" % format)
        self.assertTrue(raw.strip())
        # The raw record must be a verbatim slice of the original file.
        self.assertTrue(raw in raw_file)
        rec1 = rec_dict[key]
        # Following isn't very elegant, but it lets me test the
        # __getitem__ SFF code is working.
        if format in SeqIO._BinaryFormats:
            handle = BytesIO(raw)
        else:
            handle = StringIO(_bytes_to_string(raw))
        if format == "sff":
            rec2 = SeqIO.SffIO._sff_read_seq_record(
                handle,
                rec_dict._proxy._flows_per_read,
                rec_dict._proxy._flow_chars,
                rec_dict._proxy._key_sequence,
                rec_dict._proxy._alphabet,
                trim=False)
        elif format == "sff-trim":
            rec2 = SeqIO.SffIO._sff_read_seq_record(
                handle,
                rec_dict._proxy._flows_per_read,
                rec_dict._proxy._flow_chars,
                rec_dict._proxy._key_sequence,
                rec_dict._proxy._alphabet,
                trim=True)
        elif format == "uniprot-xml":
            self.assertTrue(raw.startswith(_as_bytes("<entry ")))
            self.assertTrue(raw.endswith(_as_bytes("</entry>")))
            # Currently the __getitem__ method uses this
            # trick too, but we hope to fix that later
            # (a bare <entry> is not a valid document; wrap it in the
            # enclosing <uniprot> element so SeqIO.read can parse it).
            raw = """<?xml version='1.0' encoding='UTF-8'?>
<uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd">
%s
</uniprot>
""" % _bytes_to_string(raw)
            handle = StringIO(raw)
            rec2 = SeqIO.read(handle, format, alphabet)
        else:
            rec2 = SeqIO.read(handle, format, alphabet)
        self.assertEqual(True, compare_record(rec1, rec2))
    rec_dict.close()
    del rec_dict
def test_empty(self):
    """Check parsing an empty file yields no alignments.

    Renamed from the original typo ``test_empy``; unittest discovers any
    ``test_*`` method, so the rename is backward-compatible.
    """
    self.assertEqual(0, len(list(ClustalIterator(StringIO("")))))
self.xml_generator.endElement("property") elif isinstance(value, (int, float, basestring)): attr = {"name": key, "value": str(value)} self.xml_generator.startElement( "property", AttributesImpl(attr)) self.xml_generator.endElement("property") if __name__ == "__main__": print("Running quick self test") from Bio import SeqIO import sys with open("Tests/SeqXML/protein_example.xml", "r") as fileHandle: records = list(SeqIO.parse(fileHandle, "seqxml")) from Bio._py3k import StringIO stringHandle = StringIO() SeqIO.write(records, stringHandle, "seqxml") SeqIO.write(records, sys.stdout, "seqxml") print("") stringHandle.seek(0) records = list(SeqIO.parse(stringHandle, "seqxml")) SeqIO.write(records, sys.stdout, "seqxml") print("")
def test_invalid_format(self):
    """Check convert file format checking."""
    with self.assertRaises(ValueError):
        TogoWS.convert(StringIO("PLACEHOLDER"),
                       "genbank", "invalid_for_testing")
    with self.assertRaises(ValueError):
        TogoWS.convert(StringIO("PLACEHOLDER"),
                       "invalid_for_testing", "fasta")
raise ValueError("Need a DNA, RNA or Protein alphabet") if __name__ == "__main__": from Bio._py3k import StringIO print("Quick self test") print("") print("Repeated names without a TAXA block") handle = StringIO("""#NEXUS [TITLE: NoName] begin data; dimensions ntax=4 nchar=50; format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; matrix CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X ; end; """) # noqa for pep8 W291 trailing whitespace for a in NexusIterator(handle): print(a) for r in a: print("%r %s %s" % (r.seq, r.name, r.id)) print("Done") print("") print("Repeated names with a TAXA block")
def setUp(self):
    """Capture stdout: remember the real stream, then swap in a buffer."""
    self.old_stdout, sys.stdout = sys.stdout, StringIO()
def from_string(cls, treetext):
    """Instantiate the class from a tree given as a string.

    Wraps *treetext* in a StringIO handle and passes that handle to the
    class constructor.  (The previous docstring had this backwards.)
    """
    handle = StringIO(treetext)
    return cls(handle)
def test_parse_qblast_ref_page(self):
    """A malformed QBLAST reference page should raise ValueError."""
    with open("Blast/html_msgid_29_blastx_001.html", "r") as source:
        page_text = source.read()
    handle = StringIO(page_text)
    self.assertRaises(ValueError, NCBIWWW._parse_qblast_ref_page,
                      handle)
def _do_segmentation(cnarr, method, threshold, variants=None,
                     skip_low=False, skip_outliers=10, min_weight=0,
                     save_dataframe=False, rscript_path="Rscript"):
    """Infer copy number segments from the given coverage table.

    Parameters:
     - cnarr: bin-level copy number array (CNA-like; has .copy(), .meta,
       column access by name).
     - method: segmentation algorithm name — 'haar', 'none', 'hmm*',
       'cbs' or 'flasso' (the last two shell out to R).
     - threshold: significance threshold passed to the chosen algorithm.
     - variants: optional VariantArray; if given (and method is not HMM),
       segments are refined by allele frequencies.
     - skip_low / skip_outliers / min_weight: bin pre-filtering options.
     - save_dataframe: if true, also return the raw R output text.
     - rscript_path: Rscript executable for the 'cbs'/'flasso' methods.

    Returns the segmented array, or (segarr, seg_out) if save_dataframe.
    """
    if not len(cnarr):
        return cnarr

    filtered_cn = cnarr.copy()
    # Filter out bins with no or near-zero sequencing coverage
    if skip_low:
        filtered_cn = filtered_cn.drop_low_coverage(verbose=False)
    # Filter by distance from rolling quantiles
    if skip_outliers:
        filtered_cn = drop_outliers(filtered_cn, 50, skip_outliers)
    # Filter by bin weights
    if min_weight:
        weight_too_low = (filtered_cn["weight"] < min_weight).fillna(True)
    else:
        weight_too_low = (filtered_cn["weight"] == 0).fillna(True)
    n_weight_too_low = weight_too_low.sum() if len(weight_too_low) else 0
    if n_weight_too_low:
        filtered_cn = filtered_cn[~weight_too_low]
        if min_weight:
            logging.debug("Dropped %d bins with weight below %s",
                          n_weight_too_low, min_weight)
        else:
            logging.debug("Dropped %d bins with zero weight",
                          n_weight_too_low)
    if len(filtered_cn) != len(cnarr):
        msg = ("Dropped %d / %d bins"
               % (len(cnarr) - len(filtered_cn), len(cnarr)))
        # Only name the chromosome when the whole array is on one.
        if cnarr["chromosome"].iat[0] == cnarr["chromosome"].iat[-1]:
            msg += " on chromosome " + str(cnarr["chromosome"].iat[0])
        logging.info(msg)
    if not len(filtered_cn):
        return filtered_cn

    seg_out = ""
    if method == 'haar':
        segarr = haar.segment_haar(filtered_cn, threshold)
    elif method == 'none':
        segarr = none.segment_none(filtered_cn)
    elif method.startswith('hmm'):
        segarr = hmm.segment_hmm(filtered_cn, method, threshold)
    elif method in ('cbs', 'flasso'):
        # Run R scripts to calculate copy number segments
        rscript = {'cbs': cbs.CBS_RSCRIPT,
                   'flasso': flasso.FLASSO_RSCRIPT,
                   }[method]

        filtered_cn['start'] += 1  # Convert to 1-indexed coordinates for R
        with tempfile.NamedTemporaryFile(suffix='.cnr', mode="w+t") as tmp:
            # TODO tabio.write(filtered_cn, tmp, 'seg')
            filtered_cn.data.to_csv(tmp, index=False, sep='\t',
                                    float_format='%.6g', mode="w+t")
            tmp.flush()
            script_strings = {
                'probes_fname': tmp.name,
                'sample_id': cnarr.sample_id,
                'threshold': threshold,
            }
            # Fill the R script template and run it while the temp .cnr
            # file is still on disk.
            with core.temp_write_text(rscript % script_strings,
                                      mode='w+t') as script_fname:
                seg_out = core.call_quiet(rscript_path, '--vanilla',
                                          script_fname)
        # Convert R dataframe contents (SEG) to a proper CopyNumArray
        # NB: Automatically shifts 'start' back from 1- to 0-indexed
        segarr = tabio.read(StringIO(seg_out.decode()), "seg", into=CNA)
        if method == 'flasso':
            # Merge adjacent bins with same log2 value into segments
            if 'weight' in filtered_cn:
                segarr['weight'] = filtered_cn['weight']
            else:
                segarr['weight'] = 1.0
            segarr = squash_by_groups(segarr, segarr['log2'], by_arm=True)
    else:
        raise ValueError("Unknown method %r" % method)

    segarr.meta = cnarr.meta.copy()
    if variants and not method.startswith('hmm'):
        # Re-segment the variant allele freqs within each segment
        newsegs = [haar.variants_in_segment(subvarr, segment,
                                            0.01 * threshold)
                   for segment, subvarr in variants.by_ranges(segarr)]
        segarr = segarr.as_dataframe(pd.concat(newsegs))
        segarr['baf'] = variants.baf_by_ranges(segarr)
    segarr = transfer_fields(segarr, cnarr)
    if save_dataframe:
        return segarr, seg_out
    else:
        return segarr
def test_int_labels(self):
    """Read newick formatted tree with numeric labels."""
    handle = StringIO('(((0:0.1,1:0.1)0.99:0.1,2:0.1)0.98:0.0);')
    tree = Phylo.read(handle, 'newick')
    leaf_names = set(leaf.name for leaf in tree.get_terminals())
    self.assertEqual(leaf_names, set(['0', '1', '2']))
SeqRecord(Seq('AAAA'), id='other_sequence'),] alignment = MultipleSeqAlignment(sequences) try: # This should raise a ValueError AlignIO.write(alignment, handle, 'phylip') assert False, "Duplicate IDs after truncation are not allowed." except ValueError as e: # Expected - check the error assert "Repeated name 'longsequen'" in str(e) check_phylip_reject_duplicate() #Check parsers can cope with an empty file for t_format in AlignIO._FormatToIterator: handle = StringIO() alignments = list(AlignIO.parse(handle, t_format)) assert len(alignments) == 0 #Check writers can cope with no alignments for t_format in list(AlignIO._FormatToWriter)+list(SeqIO._FormatToWriter): handle = StringIO() assert 0 == AlignIO.write([], handle, t_format), \ "Writing no alignments to %s format should work!" \ % t_format #Check writers reject non-alignments list_of_records = list(AlignIO.read(open("Clustalw/opuntia.aln"),"clustal")) for t_format in list(AlignIO._FormatToWriter)+list(SeqIO._FormatToWriter): handle = StringIO() try:
title = self.clean(record.id) seq = self._get_seq_string(record) # Catches sequence being None assert "\t" not in title assert "\n" not in title assert "\r" not in title assert "\t" not in seq assert "\n" not in seq assert "\r" not in seq self.handle.write("%s\t%s\n" % (title, seq)) if __name__ == "__main__": print("Running quick self test") from Bio._py3k import StringIO #This example has a trailing blank line which should be ignored handle = StringIO("Alpha\tAAAAAAA\nBeta\tCCCCCCC\n\n") records = list(TabIterator(handle)) assert len(records) == 2 handle = StringIO("Alpha\tAAAAAAA\tExtra\nBeta\tCCCCCCC\n") try: records = list(TabIterator(handle)) assert False, "Should have reject this invalid example!" except ValueError: #Good! pass print("Done")
def check_simple_write_read(alignments, indent=" "):
    """Write *alignments* in every supported format and read them back.

    For each candidate format: write the alignments to an in-memory
    handle, re-parse them (with and without an explicit seq_count), and
    compare against the originals.  Expected write failures (e.g. ragged
    alignments) are reported and skipped; read-back failures raise.
    """
    #print(indent+"Checking we can write and then read back these alignments")
    for format in test_write_read_align_with_seq_count:
        # seq_count is only usable when every alignment has the same
        # number of records; otherwise it is left as None.
        records_per_alignment = len(alignments[0])
        for a in alignments:
            if records_per_alignment != len(a):
                records_per_alignment = None
        #Can we expect this format to work?
        if not records_per_alignment \
        and format not in test_write_read_alignment_formats:
            continue

        print(indent+"Checking can write/read as '%s' format" % format)

        #Going to write to a handle...
        handle = StringIO()
        try:
            c = AlignIO.write(alignments, handle=handle, format=format)
            assert c == len(alignments)
        except ValueError as e:
            #This is often expected to happen, for example when we try and
            #write sequences of different lengths to an alignment file.
            print(indent+"Failed: %s" % str(e))
            #Carry on to the next format:
            continue

        #First, try with the seq_count
        if records_per_alignment:
            handle.flush()
            handle.seek(0)
            try:
                alignments2 = list(AlignIO.parse(handle=handle, format=format,
                                                 seq_count=records_per_alignment))
            except ValueError as e:
                #This is BAD. We can't read our own output.
                #I want to see the output when called from the test harness,
                #run_tests.py (which can be funny about new lines on Windows)
                handle.seek(0)
                raise ValueError("%s\n\n%s\n\n%s"
                                 % (str(e), repr(handle.read()),
                                    repr(alignments2)))
            simple_alignment_comparison(alignments, alignments2, format)

        if format in test_write_read_alignment_formats:
            #Don't need the seq_count
            handle.flush()
            handle.seek(0)
            try:
                alignments2 = list(AlignIO.parse(handle=handle, format=format))
            except ValueError as e:
                #This is BAD. We can't read our own output.
                #I want to see the output when called from the test harness,
                #run_tests.py (which can be funny about new lines on Windows)
                handle.seek(0)
                raise ValueError("%s\n\n%s\n\n%s"
                                 % (str(e), repr(handle.read()),
                                    repr(alignments2)))
            simple_alignment_comparison(alignments, alignments2, format)

        if len(alignments)>1:
            #Try writing just one Alignment (not a list)
            handle = StringIO()
            SeqIO.write(alignments[0], handle, format)
            assert handle.getvalue() == alignments[0].format(format)
def get(self, offset):
    """Return the SeqRecord stored at the given file offset.

    Text-based default; binary file formats should override this.
    """
    raw = self.get_raw(offset)
    handle = StringIO(_bytes_to_string(raw))
    return self._parse(handle)
def qblast(
        program,
        database,
        sequence,
        url_base=NCBI_BLAST_URL,
        auto_format=None,
        composition_based_statistics=None,
        db_genetic_code=None,
        endpoints=None,
        entrez_query='(none)',
        expect=10.0,
        filter=None,
        gapcosts=None,
        genetic_code=None,
        hitlist_size=50,
        i_thresh=None,
        layout=None,
        lcase_mask=None,
        matrix_name=None,
        nucl_penalty=None,
        nucl_reward=None,
        other_advanced=None,
        perc_ident=None,
        phi_pattern=None,
        query_file=None,
        query_believe_defline=None,
        query_from=None,
        query_to=None,
        searchsp_eff=None,
        service=None,
        threshold=None,
        ungapped_alignment=None,
        word_size=None,
        short_query=None,
        alignments=500,
        alignment_view=None,
        descriptions=500,
        entrez_links_new_window=None,
        expect_low=None,
        expect_high=None,
        format_entrez_query=None,
        format_object=None,
        format_type='XML',
        ncbi_gi=None,
        results_file=None,
        show_overview=None,
        megablast=None,
        template_type=None,
        template_length=None,
        ):
    """BLAST search using NCBI's QBLAST server or a cloud service provider.

    Supports all parameters of the old qblast API for Put and Get.

    Please note that NCBI uses the new Common URL API for BLAST searches
    on the internet (http://ncbi.github.io/blast-cloud/dev/api.html). Thus,
    some of the parameters used by this function are not (or are no longer)
    officially supported by NCBI. Although they are still functioning, this
    may change in the future.

    The Common URL API (http://ncbi.github.io/blast-cloud/dev/api.html) allows
    doing BLAST searches on cloud servers. To use this feature, please set
    ``url_base='http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi'``
    and ``format_object='Alignment'``. For more details, please see
    https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast

    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show. Def 500.
     - alignments     Number of alignments to show. Def 500.
     - expect         An expect value cutoff. Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering. Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML". Def. "XML".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use MEga BLAST algorithm (blastn only)
     - short_query    TRUE/FALSE whether to adjust the search parameters for a
                      short query sequence. Note that this will override
                      manually set parameters like word size and e value. Turns
                      off when sequence length is > 30 residues. Default: None.
     - service        plain, psi, phi, rpsblast, megablast (lower case)

    This function does no checking of the validity of the parameters
    and passes the values to the server as is. More help is available at:
    https://ncbi.github.io/blast-cloud/dev/api.html
    """
    import time

    programs = ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']
    if program not in programs:
        raise ValueError("Program specified is %s. Expected one of %s" %
                         (program, ", ".join(programs)))

    # SHORT_QUERY_ADJUST throws an error when using blastn (wrong parameter
    # assignment from NCBIs side).
    # Thus we set the (known) parameters directly:
    if short_query and program == 'blastn':
        short_query = None
        # We only use the 'short-query' parameters for short sequences:
        if len(sequence) < 31:
            expect = 1000
            word_size = 7
            nucl_reward = 1
            filter = None
            lcase_mask = None
            warnings.warn('"SHORT_QUERY_ADJUST" is incorrectly implemented '
                          '(by NCBI) for blastn. We bypass the problem by '
                          'manually adjusting the search parameters. Thus, '
                          'results may slightly differ from web page '
                          'searches.', BiopythonWarning)

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        # ('RESULTS_FILE',...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('SHORT_QUERY_ADJUST', short_query),
        ('TEMPLATE_TYPE', template_type),
        ('TEMPLATE_LENGTH', template_length),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
    ]
    # Drop unset parameters so only explicit values reach the server.
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request(url_base,
                       message,
                       {"User-Agent": "BiopythonClient"})
    handle = _urlopen(request)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    # rid = Request ID assigned by NCBI; rtoe = estimated time to completion.
    rid, rtoe = _parse_qblast_ref_page(handle)
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.
    # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
    # 1. Do not contact the server more often than once every 10 seconds.
    # 2. Do not poll for any single RID more often than once a minute.
    # 3. Use the URL parameter email and tool, so that the NCBI
    #    can contact you if there is a problem.
    # 4. Run scripts weekends or between 9 pm and 5 am Eastern time
    #    on weekdays if more than 50 searches will be submitted.
    # --
    # Could start with a 10s delay, but expect most short queries
    # will take longer thus at least 70s with delay. Therefore,
    # start with 20s delay, thereafter once a minute.
    delay = 20  # seconds
    # NOTE(review): the throttle state is kept on the function object
    # itself (qblast._previous, shared across calls); it is presumably
    # initialised elsewhere in this module — confirm before reuse.
    while True:
        current = time.time()
        wait = qblast._previous + delay - current
        if wait > 0:
            time.sleep(wait)
            qblast._previous = current + wait
        else:
            qblast._previous = current
        # delay by at least 60 seconds only if running the request against the public NCBI API
        if delay < 60 and url_base == NCBI_BLAST_URL:
            # Wasn't a quick return, must wait at least a minute
            delay = 60

        request = _Request(url_base,
                           message,
                           {"User-Agent": "BiopythonClient"})
        handle = _urlopen(request)
        results = _as_string(handle.read())

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break
    # Results are in; hand them back as a text handle.
    return StringIO(results)
def from_string(cls, treetext):
    """Instantiate the Newick Tree class from the given string."""
    return cls(StringIO(treetext))
count += 1 print_record(record) assert count == 1 print(str(record.__class__)) if os.path.isfile(faa_filename): print("--------") print("FastaIterator (multiple sequences)") iterator = FastaIterator(open(faa_filename, "r"), alphabet=generic_protein, title2ids=genbank_name_function) count = 0 for record in iterator: count += 1 print_record(record) break assert count > 0 print(str(record.__class__)) from Bio._py3k import StringIO print("--------") print("FastaIterator (empty input file)") #Just to make sure no errors happen iterator = FastaIterator(StringIO("")) count = 0 for record in iterator: count += 1 assert count == 0 print("Done")
def parse_str(self, string):
    """Parse the given string by wrapping it in an in-memory handle."""
    handle = StringIO(string)
    return self.parse(handle)
print(is_blank_line('', allow_spaces=1)) # 1 print(is_blank_line('', allow_spaces=0)) # 1 print(is_blank_line(string.whitespace, allow_spaces=1)) # 1 print(is_blank_line('hello')) # 0 print(is_blank_line('hello', allow_spaces=1)) # 0 print(is_blank_line('hello', allow_spaces=0)) # 0 print(is_blank_line(string.whitespace, allow_spaces=0)) # 0 # safe_readline print("Running tests on safe_readline") data = """This file""" h = File.UndoHandle(StringIO(data)) safe_readline = ParserSupport.safe_readline print(safe_readline(h)) # "This" print(safe_readline(h)) # "file" try: safe_readline(h) except ValueError: print("correctly failed") else: print("ERROR, should have failed") # safe_peekline print("Running tests on safe_peekline") safe_peekline = ParserSupport.safe_peekline
def from_string(cls, treetext):
    """Instantiate the class from a tree given as a string."""
    return cls(StringIO(treetext))
def CifAtomIterator(handle):
    """Return SeqRecord objects for each chain in a PDB file.

    The sequences are derived from the 3D structure (ATOM records), not the
    SEQRES lines in the PDB file header.

    Unrecognised three letter amino acid codes (e.g. "CSD") from HETATM
    entries are converted to "X" in the sequence.

    In addition to information from the PDB header (which is the same for all
    records), the following chain specific information is placed in the
    annotation:

    record.annotations["residues"] = List of residue ID strings
    record.annotations["chain"] = Chain ID (typically A, B ,...)
    record.annotations["model"] = Model ID (typically zero)

    Where amino acids are missing from the structure, as indicated by residue
    numbering, the sequence is filled in with 'X' characters to match the size
    of the missing region, and  None is included as the corresponding entry in
    the list record.annotations["residues"].

    This function uses the Bio.PDB module to do most of the hard work. The
    annotation information could be improved but this extra parsing should be
    done in parse_pdb_header, not this module.

    This gets called internally via Bio.SeqIO for the atom based interpretation
    of the PDB file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.cif", "cif-atom"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    Equivalently,

    >>> with open("PDB/1A8O.cif") as handle:
    ...     for record in CifAtomIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    """
    # TODO - Add record.annotations to the doctest, esp the residues (not working?)

    # Only import parser when needed, to avoid/delay NumPy dependency in SeqIO
    from Bio.PDB.MMCIFParser import MMCIFParser
    from Bio.PDB.MMCIF2Dict import MMCIF2Dict

    # The PdbAtomIterator uses UndoHandle to peek at the first line and get the
    # PDB ID. The equivalent for mmCIF is the _entry.id field. AFAIK, the mmCIF
    # format does not constrain the order of fields, so we need to parse the
    # entire file using MMCIF2Dict. We copy the contents of the handle into a
    # StringIO buffer first, so that both MMCIF2Dict and MMCIFParser can
    # consume the handle.
    buffer = StringIO()
    shutil.copyfileobj(handle, buffer)

    # check if file is empty
    if len(buffer.getvalue()) == 0:
        raise ValueError("Empty file.")

    buffer.seek(0)
    mmcif_dict = MMCIF2Dict(buffer)
    if "_entry.id" in mmcif_dict:
        pdb_id = mmcif_dict["_entry.id"]
        # Some MMCIF2Dict versions return values as lists; unwrap.
        if isinstance(pdb_id, list):
            pdb_id = pdb_id[0]
    else:
        warnings.warn("Could not find the '_entry.id' field; can't determine "
                      "PDB ID.", BiopythonParserWarning)
        pdb_id = "????"

    # Rewind so MMCIFParser can read the same buffer from the start.
    buffer.seek(0)
    struct = MMCIFParser().get_structure(pdb_id, buffer)
    for record in AtomIterator(pdb_id, struct):
        yield record