def __format__(self, format_spec):
    """Render the alignment as text in the requested file format.

    This is the hook behind the built-in format() function (Python
    2.6/3.0 onwards).  A non-empty format_spec is passed to AlignIO.write
    as the output format name, so it should be a lower case string that
    Bio.AlignIO supports for output.  An empty format_spec gives str(self).

    See also the alignment's format() method.
    """
    if not format_spec:
        # Python convention: no format requested means plain __str__
        return str(self)
    # Imported lazily, only when a file format is actually requested
    from SAP.Bio._py3k import StringIO
    from SAP.Bio import AlignIO
    buffer = StringIO()
    AlignIO.write([self], buffer, format_spec)
    return buffer.getvalue()
def UniprotIterator(handle, alphabet=Alphabet.ProteinAlphabet(),
                    return_raw_comments=False):
    """Generator function to parse UniProt XML as SeqRecord objects.

    Parses one XML entry at a time from any UniProt XML file and yields
    the parsed record for each entry.  This generator can be used in
    Bio.SeqIO.

    Arguments:
     - handle - an object with a ``read`` method, or a string holding the
       XML itself (a string is wrapped in a StringIO handle).
     - alphabet - alphabet handed to the entry parser.  A nucleotide
       alphabet, or a Gapped alphabet wrapping one, is rejected.
     - return_raw_comments - if True, comment fields are returned as
       complete XML to allow further processing.

    Raises ValueError for a nucleotide alphabet, Exception when handle is
    neither file-like nor a string, and MissingExternalDependencyError
    when no ElementTree implementation is available.  Because this is a
    generator, these checks only run once iteration starts.
    """
    if isinstance(alphabet, Alphabet.NucleotideAlphabet):
        raise ValueError("Wrong alphabet %r" % alphabet)
    if (isinstance(alphabet, Alphabet.Gapped)
            and isinstance(alphabet.alphabet, Alphabet.NucleotideAlphabet)):
        raise ValueError("Wrong alphabet %r" % alphabet)

    if not hasattr(handle, "read"):
        if isinstance(handle, str):
            # Accept the XML text itself and wrap it in a handle
            handle = StringIO(handle)
        else:
            raise Exception('An XML-containing handler or an XML string must be passed')

    if ElementTree is None:
        from SAP.Bio import MissingExternalDependencyError
        raise MissingExternalDependencyError(
            "No ElementTree module was found. "
            "Use Python 2.5+, lxml or elementtree if you "
            "want to use Bio.SeqIO.UniprotIO.")

    # Only "end" events are needed: an entry is parsed once its closing
    # tag has been seen and the element is therefore complete.
    for event, elem in ElementTree.iterparse(handle, events=("end",)):
        if elem.tag == NS + "entry":
            yield Parser(elem, alphabet=alphabet,
                         return_raw_comments=return_raw_comments).parse()
            # Drop the entry's children once the caller has consumed it
            elem.clear()
def __format__(self, format_spec):
    """Serialize the tree as a string in the specified file format.

    Hook for the ``format`` built-in function added in Python 2.6/3.0.
    An empty ``format_spec`` yields ``str(self)``.

    :param format_spec: a lower-case string supported by `Bio.Phylo.write`
        as an output file format.
    """
    if not format_spec:
        # Python convention: no format requested means plain __str__
        return str(self)
    # Imported lazily, only when a file format is actually requested
    from SAP.Bio._py3k import StringIO
    from SAP.Bio.Phylo import _io
    buffer = StringIO()
    _io.write([self], buffer, format_spec)
    return buffer.getvalue()
def __format__(self, format_spec):
    """Render the record as a string in the requested file format.

    Hook for the built-in format() function (Python 2.6/3.0 onwards).
    The format_spec should be a lower case string supported by Bio.SeqIO
    as an output file format; an empty format_spec gives str(self).
    See also the SeqRecord's format() method.

    Under Python 3 please note that for binary formats a bytes string
    is returned, otherwise a (unicode) string is returned.
    """
    if format_spec:
        from SAP.Bio import SeqIO
        if format_spec in SeqIO._BinaryFormats:
            # Binary output formats need a bytes buffer (bytes on Python 3)
            from io import BytesIO
            buffer = BytesIO()
        else:
            from SAP.Bio._py3k import StringIO
            buffer = StringIO()
        SeqIO.write(self, buffer, format_spec)
        return buffer.getvalue()
    # Python convention: no format requested means plain __str__
    return str(self)
def __str__(self):
    """Return the text that save() writes for this object."""
    from SAP.Bio._py3k import StringIO
    buffer = StringIO()
    save(self, buffer)
    # getvalue() returns the whole buffer, so no rewind is needed
    return buffer.getvalue()
asis 549 TCTTCTTACTCTTAGGAGGATGGGCGCTAGAAAGAGTTTTAAGAGGGTGT 598 asis 311 -------------------------------------------------- 311 asis 599 GAAAGGGGGTTAATAGC 615 asis 311 ----------------- 311 #--------------------------------------- #---------------------------------------""" from SAP.Bio._py3k import StringIO alignments = list(EmbossIterator(StringIO(pair_example))) assert len(alignments) == 1 assert len(alignments[0]) == 2 assert [r.id for r in alignments[0]] \ == ["IXI_234", "IXI_235"] alignments = list(EmbossIterator(StringIO(simple_example))) assert len(alignments) == 1 assert len(alignments[0]) == 4 assert [r.id for r in alignments[0]] \ == ["IXI_234", "IXI_235", "IXI_236", "IXI_237"] alignments = list(EmbossIterator(StringIO(pair_example + simple_example))) assert len(alignments) == 2 assert len(alignments[0]) == 2 assert len(alignments[1]) == 4
def from_string(cls, treetext):
    """Construct via cls from text, wrapping the text in a StringIO handle."""
    return cls(StringIO(treetext))
count += 1 print_record(record) assert count == 1 print(str(record.__class__)) if os.path.isfile(faa_filename): print("--------") print("FastaIterator (multiple sequences)") with open(faa_filename, "r") as h: iterator = FastaIterator(h, alphabet=generic_protein, title2ids=genbank_name_function) count = 0 for record in iterator: count += 1 print_record(record) break assert count > 0 print(str(record.__class__)) from SAP.Bio._py3k import StringIO print("--------") print("FastaIterator (empty input file)") #Just to make sure no errors happen iterator = FastaIterator(StringIO("")) count = 0 for record in iterator: count += 1 assert count == 0 print("Done")
def get(self, offset):
    """Return the SeqRecord parsed from the raw data at offset."""
    # Text-based default; should be overridden for binary file formats etc.
    raw = self.get_raw(offset)
    text = _bytes_to_string(raw)
    return self._parse(StringIO(text))
def parse_str(self, string):
    """Parse text held in a string by handing self.parse a StringIO handle."""
    handle = StringIO(string)
    return self.parse(handle)
def get(self, offset):
    """Parse and return the entry whose raw data starts at offset.

    The raw data from get_raw is converted with _bytes_to_string and
    passed to self._parse through a StringIO handle.
    """
    entry_text = _bytes_to_string(self.get_raw(offset))
    entry_handle = StringIO(entry_text)
    return self._parse(entry_handle)
VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE LQAEVNKALA EMRADGTVEK ISVKWFGADI TK---- LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK---- LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ---- LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK---- LRDKVNGALK TLRENGTYNE IYKKWFGTEP K----- LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP--- LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG--- """ from SAP.Bio._py3k import StringIO handle = StringIO(phylip_text) count = 0 for alignment in PhylipIterator(handle): for record in alignment: count = count+1 print(record.id) #print str(record.seq) assert count == 8 expected = """mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre alnkafaemradgtyeklak kyfdfdvygg""".replace(" ", "").replace("\n", "").upper() assert str(record.seq).replace("-", "") == expected
>>><<< 579 residues in 3 query sequences 45119 residues in 180 library sequences Scomplib [34.26] start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008 Total Scan time: 0.020 Total Display time: 0.010 Function used was FASTA [version 34.26 January 12, 2007] """ from SAP.Bio._py3k import StringIO alignments = list(FastaM10Iterator(StringIO(simple_example))) assert len(alignments) == 4, len(alignments) assert len(alignments[0]) == 2 for a in alignments: print("Alignment %i sequences of length %i" \ % (len(a), a.get_alignment_length())) for r in a: print("%s %s %i" % (r.seq, r.id, r.annotations["original_length"])) #print(a.annotations) print("Done") import os path = "../../Tests/Fasta/" files = sorted(f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10") for filename in files:
title = self.clean(record.id) seq = self._get_seq_string(record) # Catches sequence being None assert "\t" not in title assert "\n" not in title assert "\r" not in title assert "\t" not in seq assert "\n" not in seq assert "\r" not in seq self.handle.write("%s\t%s\n" % (title, seq)) if __name__ == "__main__": print("Running quick self test") from SAP.Bio._py3k import StringIO #This example has a trailing blank line which should be ignored handle = StringIO("Alpha\tAAAAAAA\nBeta\tCCCCCCC\n\n") records = list(TabIterator(handle)) assert len(records) == 2 handle = StringIO("Alpha\tAAAAAAA\tExtra\nBeta\tCCCCCCC\n") try: records = list(TabIterator(handle)) assert False, "Should have reject this invalid example!" except ValueError: #Good! pass print("Done")
def qblast(program, database, sequence,
           auto_format=None, composition_based_statistics=None,
           db_genetic_code=None, endpoints=None, entrez_query='(none)',
           expect=10.0, filter=None, gapcosts=None, genetic_code=None,
           hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
           matrix_name=None, nucl_penalty=None, nucl_reward=None,
           other_advanced=None, perc_ident=None, phi_pattern=None,
           query_file=None, query_believe_defline=None, query_from=None,
           query_to=None, searchsp_eff=None, service=None, threshold=None,
           ungapped_alignment=None, word_size=None,
           alignments=500, alignment_view=None, descriptions=500,
           entrez_links_new_window=None, expect_low=None, expect_high=None,
           format_entrez_query=None, format_object=None, format_type='XML',
           ncbi_gi=None, results_file=None, show_overview=None, megablast=None,
           ):
    """Do a BLAST search using the QBLAST server at NCBI.

    Supports all parameters of the qblast API for Put and Get.
    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show.  Def 500.
     - alignments     Number of alignments to show.  Def 500.
     - expect         An expect value cutoff.  Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering.  Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use Mega BLAST algorithm (blastn only)
     - service        plain, psi, phi, rpsblast, megablast (lower case)

    Apart from an assertion that program is one of the five names listed
    above, this function does no checking of the validity of the
    parameters and passes the values to the server as is.  Parameters
    left as None are omitted from the request.

    The search is submitted with a "Put" command, then the server is
    polled with "Get" commands until the results are ready.  The result
    text is returned wrapped in a StringIO handle.

    More help is available at:
    http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html

    """
    import time

    # NOTE(review): validation by assert is skipped when Python runs with -O
    assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        #('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        #('RESULTS_FILE',...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
        ]
    # Only send the parameters that were actually given a value
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi",
                       message,
                       {"User-Agent":"BiopythonClient"})
    handle = _urlopen(request)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    # The RID parsed from the reply identifies this search in the "Get"
    # requests; rtoe is parsed as well but is not used below.
    rid, rtoe = _parse_qblast_ref_page(handle)
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.  Use a backoff delay from 2 - 120 second wait
    delay = 2.0
    previous = time.time()
    while True:
        current = time.time()
        # Time still to wait so that requests are at least `delay` apart
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current

        # Grow the delay by half for the next round, capped at 120 seconds
        if delay + .5*delay <= 120:
            delay += .5*delay
        else:
            delay = 120

        request = _Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi",
                           message,
                           {"User-Agent":"BiopythonClient"})
        handle = _urlopen(request)
        results = _as_string(handle.read())

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results=="\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        # Extract the text between "Status=" and the end of that line
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i+len("Status="):j].strip()
        if status.upper() == "READY":
            break

    # Hand the result text back as a file-like handle
    return StringIO(results)
"MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG" + \ "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS" + \ "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV" for alignment in ClustalIterator(StringIO(aln_example2 + aln_example1)): print("Alignment with %i records of length %i" \ % (len(alignment), alignment.get_alignment_length())) print("Checking empty file...") assert 0 == len(list(ClustalIterator(StringIO("")))) print("Checking write/read...") alignments = list(ClustalIterator(StringIO(aln_example1))) \ + list(ClustalIterator(StringIO(aln_example2)))*2 handle = StringIO() ClustalWriter(handle).write_file(alignments) handle.seek(0) for i, a in enumerate(ClustalIterator(handle)): assert a.get_alignment_length() == alignments[i].get_alignment_length() handle.seek(0) print("Testing write/read when there is only one sequence...") alignment = alignment[0:1] handle = StringIO() ClustalWriter(handle).write_file([alignment]) handle.seek(0) for i, a in enumerate(ClustalIterator(handle)): assert a.get_alignment_length() == alignment.get_alignment_length() assert len(a) == 1
self.xml_generator.endElement("property") elif isinstance(value, (int, float, basestring)): attr = {"name": key, "value": str(value)} self.xml_generator.startElement( "property", AttributesImpl(attr)) self.xml_generator.endElement("property") if __name__ == "__main__": print("Running quick self test") from SAP.Bio import SeqIO import sys with open("Tests/SeqXML/protein_example.xml", "r") as fileHandle: records = list(SeqIO.parse(fileHandle, "seqxml")) from SAP.Bio._py3k import StringIO stringHandle = StringIO() SeqIO.write(records, stringHandle, "seqxml") SeqIO.write(records, sys.stdout, "seqxml") print("") stringHandle.seek(0) records = list(SeqIO.parse(stringHandle, "seqxml")) SeqIO.write(records, sys.stdout, "seqxml") print("")
VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE LQAEVNKALA EMRADGTVEK ISVKWFGADI TK---- LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK---- LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ---- LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK---- LRDKVNGALK TLRENGTYNE IYKKWFGTEP K----- LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP--- LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG--- """ from SAP.Bio._py3k import StringIO handle = StringIO(phylip_text) count = 0 for alignment in PhylipIterator(handle): for record in alignment: count = count + 1 print(record.id) #print str(record.seq) assert count == 8 expected = """mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre alnkafaemradgtyeklak kyfdfdvygg""".replace(" ", "").replace("\n", "").upper() assert str(record.seq).replace("-", "") == expected
#just the generic Alphabet (default for fasta files) raise ValueError("Need a DNA, RNA or Protein alphabet") if __name__ == "__main__": from SAP.Bio._py3k import StringIO print("Quick self test") print("") print("Repeated names without a TAXA block") handle = StringIO("""#NEXUS [TITLE: NoName] begin data; dimensions ntax=4 nchar=50; format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; matrix CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X ; end; """) for a in NexusIterator(handle): print(a) for r in a: print("%r %s %s" % (r.seq, r.name, r.id)) print("Done") print("") print("Repeated names with a TAXA block")
V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV *.: . * . * *: : """ from SAP.Bio._py3k import StringIO alignments = list(ClustalIterator(StringIO(aln_example1))) assert 1 == len(alignments) assert alignments[0]._version == "1.81" alignment = alignments[0] assert 2 == len(alignment) assert alignment[0].id == "gi|4959044|gb|AAD34209.1|AF069" assert alignment[1].id == "gi|671626|emb|CAA85685.1|" assert str(alignment[0].seq) == \ "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \ "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \ "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \ "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \ "VPTTRAQRRA" alignments = list(ClustalIterator(StringIO(aln_example2))) assert 1 == len(alignments)
elif isinstance(value, (int, float, basestring)): attr = {"name": key, "value": str(value)} self.xml_generator.startElement("property", AttributesImpl(attr)) self.xml_generator.endElement("property") if __name__ == "__main__": print("Running quick self test") from SAP.Bio import SeqIO import sys with open("Tests/SeqXML/protein_example.xml", "r") as fileHandle: records = list(SeqIO.parse(fileHandle, "seqxml")) from SAP.Bio._py3k import StringIO stringHandle = StringIO() SeqIO.write(records, stringHandle, "seqxml") SeqIO.write(records, sys.stdout, "seqxml") print("") stringHandle.seek(0) records = list(SeqIO.parse(stringHandle, "seqxml")) SeqIO.write(records, sys.stdout, "seqxml") print("")