def test_save_and_load(self):
    states = "NR"
    alphabet = "AGTC"
    p_initial = array([1.0, 0.0])
    p_transition = array([[0.75, 0.25],
                          [0.25, 0.75]])
    p_emission = array([[0.45, 0.36, 0.06, 0.13],
                        [0.24, 0.18, 0.12, 0.46]])
    markov_model_save = MarkovModel.MarkovModel(states, alphabet, p_initial,
                                                p_transition, p_emission)
    handle = StringIO()
    MarkovModel.save(markov_model_save, handle)
    handle.seek(0)
    markov_model_load = MarkovModel.load(handle)
    self.assertEqual(''.join(markov_model_load.states), states)
    self.assertEqual(''.join(markov_model_load.alphabet), alphabet)
    self.assertTrue(array_equal(markov_model_load.p_initial, p_initial))
    self.assertTrue(array_equal(markov_model_load.p_transition, p_transition))
    self.assertTrue(array_equal(markov_model_load.p_emission, p_emission))
def test_write_species(self):
    """Test writing species from annotation tags."""
    record = SeqIO.read("SwissProt/sp016", "swiss")
    self.assertEqual(record.annotations["organism"], "Homo sapiens (Human)")
    self.assertEqual(record.annotations["ncbi_taxid"], ["9606"])
    handle = StringIO()
    SeqIO.write(record, handle, "seqxml")
    handle.seek(0)
    output = handle.getvalue()
    self.assertTrue("Homo sapiens (Human)" in output)
    self.assertTrue("9606" in output)
    if '<species name="Homo sapiens (Human)" ncbiTaxID="9606"/>' in output:
        # Good, but don't get this (do we?)
        pass
    elif '<species name="Homo sapiens (Human)" ncbiTaxID="9606"></species>' in output:
        # Not as concise, but fine (seen on C Python)
        pass
    elif '<species ncbiTaxID="9606" name="Homo sapiens (Human)"></species>' in output:
        # Jython uses a different attribute order
        pass
    elif '<species ncbiTaxID="9606" name="Homo sapiens (Human)"/>' in output:
        # This would be fine too, but don't get this (do we?)
        pass
    else:
        raise ValueError("Missing expected <species> tag: %r" % output)
def do_comparison(good_record, test_record):
    """Compare two records to see if they are the same.

    This compares the two GenBank records line by line, and will raise an
    AssertionError if two lines do not match, showing the non-matching lines.
    """
    good_handle = StringIO(good_record)
    test_handle = StringIO(test_record)
    while True:
        good_line = good_handle.readline()
        test_line = test_handle.readline()
        if not good_line and not test_line:
            break
        if not good_line:
            raise AssertionError("Extra info in Test: %r" % test_line)
        if not test_line:
            raise AssertionError("Extra info in Expected: %r" % good_line)
        test_normalized = " ".join(x for x in test_line.split() if x)
        good_normalized = " ".join(x for x in good_line.split() if x)
        assert test_normalized == good_normalized, \
            "Expected does not match Test.\nExpect: %r\nTest: %r\n" % (
                good_line, test_line)
def test_000_write_invalid_but_parsed_locus_line(self):
    """Make sure we survive writing slightly invalid LOCUS lines we could parse."""
    # grab a valid file
    with open(path.join('GenBank', 'NC_005816.gb'), 'r') as handle:
        lines = handle.readlines()
    # futz with the molecule type to make it lower case
    invalid_line = ("LOCUS       NC_005816               9609 bp    dna     "
                    "circular BCT 21-JUL-2008\n")
    lines[0] = invalid_line
    fake_handle = StringIO("".join(lines))
    # Make sure parsing this actually raises a warning
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        rec = SeqIO.read(fake_handle, 'genbank')
        self.assertEqual(len(caught), 1)
        self.assertEqual(caught[0].category, BiopythonParserWarning)
        self.assertEqual(str(caught[0].message),
                         "Non-upper case molecule type in LOCUS line: dna")
    out_handle = StringIO()
    ret = SeqIO.write([rec], out_handle, 'genbank')
    self.assertEqual(ret, 1)
    out_handle.seek(0)
    out_lines = out_handle.readlines()
    self.assertEqual(out_lines[0], invalid_line)
def test_fasta_out(self):
    """Check FASTQ to FASTA output."""
    records = SeqIO.parse("Quality/example.fastq", "fastq")
    h = StringIO()
    SeqIO.write(records, h, "fasta")
    with open("Quality/example.fasta") as expected:
        self.assertEqual(h.getvalue(), expected.read())
def test_widget(self):
    """Try widget derived functionality."""
    test_widget = BasicChromosome.ChromosomeSegment()
    expected_string = "chr_percent = 0.25"
    # trick to write the properties to a string
    save_stdout = sys.stdout
    new_stdout = StringIO()
    sys.stdout = new_stdout
    test_widget.dumpProperties()
    properties = new_stdout.getvalue()
    sys.stdout = save_stdout
    self.assertTrue(expected_string in properties,
                    "Unexpected results from dumpProperties:\n%s" % properties)
    properties = test_widget.getProperties()
    self.assertEqual(properties["label_size"], 6,
                     "Unexpected results from getProperties: %s" % properties)
    test_widget.setProperties({"start_x_position": 12})
    self.assertEqual(test_widget.start_x_position, 12,
                     "setProperties doesn't seem to work right: %s"
                     % test_widget.start_x_position)
def test_qual_negative(self):
    """Check QUAL negative scores mapped to PHRED zero."""
    data = """>1117_10_107_F3
23 31 -1 -1 -1 29 -1 -1 20 32 -1 18 25 7 -1 6 -1 -1 -1 30 -1 20 13 7 -1 -1 21 30 -1 24 -1 22 -1 -1 22 14 -1 12 26 21 -1 5 -1 -1 -1 20 -1 -1 12 28
>1117_10_146_F3
20 33 -1 -1 -1 29 -1 -1 28 28 -1 7 16 5 -1 30 -1 -1 -1 14 -1 4 13 4 -1 -1 11 13 -1 5 -1 7 -1 -1 10 16 -1 4 12 15 -1 8 -1 -1 -1 16 -1 -1 10 4
>1117_10_1017_F3
33 33 -1 -1 -1 27 -1 -1 17 16 -1 28 24 11 -1 6 -1 -1 -1 29 -1 8 29 24 -1 -1 8 8 -1 20 -1 13 -1 -1 8 13 -1 28 10 24 -1 10 -1 -1 -1 4 -1 -1 7 6
>1117_11_136_F3
16 22 -1 -1 -1 33 -1 -1 30 27 -1 27 28 32 -1 29 -1 -1 -1 27 -1 18 9 6 -1 -1 23 16 -1 26 -1 5 7 -1 22 7 -1 18 14 8 -1 8 -1 -1 -1 11 -1 -1 4 24"""
    h = StringIO(data)
    h2 = StringIO()
    self.assertEqual(4, SeqIO.convert(h, "qual", h2, "fastq"))
    self.assertEqual(h2.getvalue(), """@1117_10_107_F3
??????????????????????????????????????????????????
+
8@!!!>!!5A!3:(!'!!!?!5.(!!6?!9!7!!7/!-;6!&!!!5!!-=
@1117_10_146_F3
??????????????????????????????????????????????????
+
5B!!!>!!==!(1&!?!!!/!%.%!!,.!&!(!!+1!%-0!)!!!1!!+%
@1117_10_1017_F3
??????????????????????????????????????????????????
+
BB!!!<!!21!=9,!'!!!>!)>9!!))!5!.!!).!=+9!+!!!%!!('
@1117_11_136_F3
??????????????????????????????????????????????????
+
17!!!B!!?<!<=A!>!!!<!3*'!!81!;!&(!7(!3/)!)!!!,!!%9
""")
def test_fastq_2000(self):
    """Read and write back simple example with upper case 2000bp read."""
    data = "@%s\n%s\n+\n%s\n" \
           % ("id descr goes here", "ACGT" * 500, "!@a~" * 500)
    handle = StringIO()
    self.assertEqual(1, SeqIO.write(SeqIO.parse(StringIO(data), "fastq"),
                                    handle, "fastq"))
    self.assertEqual(data, handle.getvalue())
def test_fastq_1000(self):
    """Read and write back simple example with mixed case 1000bp read."""
    data = "@%s\n%s\n+\n%s\n" \
           % ("id descr goes here", "ACGTNncgta" * 100, "abcd!!efgh" * 100)
    handle = StringIO()
    self.assertEqual(1, SeqIO.write(SeqIO.parse(StringIO(data), "fastq"),
                                    handle, "fastq"))
    self.assertEqual(data, handle.getvalue())
def test_generated(self):
    """Write and read back odd SeqRecord objects."""
    record1 = SeqRecord(Seq("ACGT" * 500, generic_dna), id="Test",
                        description="Long " * 500,
                        letter_annotations={"phred_quality": [40, 30, 20, 10] * 500})
    record2 = SeqRecord(MutableSeq("NGGC" * 1000), id="Mut",
                        description="very " * 1000 + "long",
                        letter_annotations={"phred_quality": [0, 5, 5, 10] * 1000})
    record3 = SeqRecord(UnknownSeq(2000, character="N"), id="Unk",
                        description="l" + ("o" * 1000) + "ng",
                        letter_annotations={"phred_quality": [0, 1] * 1000})
    record4 = SeqRecord(Seq("ACGT" * 500), id="no_descr",
                        description="", name="",
                        letter_annotations={"phred_quality": [40, 50, 60, 62] * 500})
    record5 = SeqRecord(Seq("", generic_dna), id="empty_p",
                        description="(could have been trimmed lots)",
                        letter_annotations={"phred_quality": []})
    record6 = SeqRecord(Seq(""), id="empty_s",
                        description="(could have been trimmed lots)",
                        letter_annotations={"solexa_quality": []})
    record7 = SeqRecord(Seq("ACNN" * 500), id="Test_Sol",
                        description="Long " * 500,
                        letter_annotations={"solexa_quality": [40, 30, 0, -5] * 500})
    record8 = SeqRecord(Seq("ACGT"), id="HighQual",
                        description="With very large qualities that even "
                                    "Sanger FASTQ can't hold!",
                        letter_annotations={"solexa_quality": [0, 10, 100, 1000]})
    # TODO - Record with no identifier?
    records = [record1, record2, record3, record4,
               record5, record6, record7, record8]
    # TODO - Have a Biopython defined "DataLossWarning"?
    warnings.simplefilter('ignore', BiopythonWarning)
    # TODO - Include phd output?
    for format in ["fasta", "fastq", "fastq-solexa", "fastq-illumina", "qual"]:
        handle = StringIO()
        SeqIO.write(records, handle, format)
        handle.seek(0)
        compare_records(records,
                        list(SeqIO.parse(handle, format)),
                        truncation_expected(format))
    warnings.filters.pop()
def check_convert_fails(in_filename, in_format, out_format, alphabet=None):
    qual_truncate = truncation_expected(out_format)
    # We want the SAME error message from parse/write as convert!
    err1 = None
    try:
        records = list(SeqIO.parse(in_filename, in_format, alphabet))
        handle = StringIO()
        if qual_truncate:
            warnings.simplefilter('ignore', UserWarning)
        SeqIO.write(records, handle, out_format)
        if qual_truncate:
            warnings.filters.pop()
        handle.seek(0)
        assert False, "Parse or write should have failed!"
    except ValueError as err:
        err1 = err
    # Now do the conversion...
    try:
        handle2 = StringIO()
        if qual_truncate:
            warnings.simplefilter('ignore', UserWarning)
        SeqIO.convert(in_filename, in_format, handle2, out_format, alphabet)
        if qual_truncate:
            warnings.filters.pop()
        assert False, "Convert should have failed!"
    except ValueError as err2:
        assert str(err1) == str(err2), \
            "Different failures, parse/write:\n%s\nconvert:\n%s" \
            % (err1, err2)
def test_longer_locus_line(self):
    """Check that we can read and write files with longer locus lines."""
    # Create example file from existing file
    with open(path.join("GenBank", "DS830848.gb"), 'r') as inhandle:
        data = inhandle.readlines()
    data[0] = ("LOCUS       AZZZAA021234567891234 2147483647 bp    DNA     "
               "linear   PRI 15-OCT-2018\n")
    # Create memory file from modified genbank file
    in_tmp = StringIO()
    in_tmp.writelines(data)
    in_tmp.seek(0)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        in_tmp.seek(0)
        record = SeqIO.read(in_tmp, 'genbank')
        # Create temporary output memory file
        out_tmp = StringIO()
        SeqIO.write(record, out_tmp, 'genbank')
        # Check that the written file can be read back in
        out_tmp.seek(0)
        record_in = SeqIO.read(out_tmp, 'genbank')
        self.assertEqual(record_in.id, "DS830848.1")
        self.assertEqual(record_in.name, "AZZZAA021234567891234")
        self.assertEqual(len(record_in.seq), 2147483647)
def test_draw_ascii(self):
    """Draw an ASCII-art tree to a file handle."""
    handle = StringIO()
    tree = Phylo.read(EX_APAF, 'phyloxml')
    Phylo.draw_ascii(tree, file=handle)
    Phylo.draw_ascii(tree, file=handle, column_width=120)
    handle.close()
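# For reference (a usage sketch, not part of the test): when no file handle
# is given, Phylo.draw_ascii writes to sys.stdout, so a quick interactive
# check can be as simple as this. The tiny Newick string is made up.
from io import StringIO as _StringIO
from Bio import Phylo as _Phylo

_tree = _Phylo.read(_StringIO("(A:1,(B:1,C:1):1);"), "newick")
_Phylo.draw_ascii(_tree)  # prints the ASCII tree to stdout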
def simple_check(self, base_name, in_variant):
    for out_variant in ["sanger", "solexa", "illumina"]:
        in_filename = "Quality/%s_original_%s.fastq" \
                      % (base_name, in_variant)
        self.assertTrue(os.path.isfile(in_filename))
        # Load the reference output...
        with open("Quality/%s_as_%s.fastq" % (base_name, out_variant),
                  _universal_read_mode) as handle:
            expected = handle.read()
        with warnings.catch_warnings():
            if out_variant != "sanger":
                # Ignore data loss warnings from max qualities
                warnings.simplefilter("ignore", BiopythonWarning)
                warnings.simplefilter("ignore", UserWarning)
            # Check matches using convert...
            handle = StringIO()
            SeqIO.convert(in_filename, "fastq-" + in_variant,
                          handle, "fastq-" + out_variant)
            self.assertEqual(expected, handle.getvalue())
            # Check matches using parse/write
            handle = StringIO()
            SeqIO.write(SeqIO.parse(in_filename, "fastq-" + in_variant),
                        handle, "fastq-" + out_variant)
            self.assertEqual(expected, handle.getvalue())
def __str__(self):
    """Create a string representation of the MarkovModel object."""
    from Bio._py3k import StringIO
    handle = StringIO()
    save(self, handle)
    handle.seek(0)
    return handle.read()
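# A minimal usage sketch (not part of the module): since __str__ round-trips
# through save(), printing a model shows exactly the text MarkovModel.save()
# would write to a handle. The parameters mirror the test fixture above.
from numpy import array
from Bio import MarkovModel

mm = MarkovModel.MarkovModel("NR", "AGTC",
                             array([1.0, 0.0]),
                             array([[0.75, 0.25], [0.25, 0.75]]),
                             array([[0.45, 0.36, 0.06, 0.13],
                                    [0.24, 0.18, 0.12, 0.46]]))
print(mm)  # same text as MarkovModel.save(mm, handle) would produce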
def loop(self, filename, format):
    original_records = list(SeqIO.parse(filename, format))
    # now open a connection to load the database
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          user=DBUSER, passwd=DBPASSWD,
                                          host=DBHOST, db=TESTDB)
    db_name = "test_loop_%s" % filename  # new namespace!
    db = server.new_database(db_name)
    count = db.load(original_records)
    self.assertEqual(count, len(original_records))
    server.commit()
    # Now read them back...
    biosql_records = [db.lookup(name=rec.name)
                      for rec in original_records]
    # And check they agree
    self.assertTrue(compare_records(original_records, biosql_records))
    # Now write to a handle...
    handle = StringIO()
    SeqIO.write(biosql_records, handle, "gb")
    # Now read them back...
    handle.seek(0)
    new_records = list(SeqIO.parse(handle, "gb"))
    # And check they still agree
    self.assertEqual(len(new_records), len(original_records))
    for old, new in zip(original_records, new_records):
        # TODO - remove this hack because we don't yet write these:
        for key in ["comment", "references", "db_source"]:
            if key in old.annotations and key not in new.annotations:
                del old.annotations[key]
        self.assertTrue(compare_record(old, new))
    # Done
    handle.close()
    server.close()
def test_save(self):
    trieobj = trie.trie()
    trieobj["foo"] = 1
    k = trieobj.keys()
    self.assertEqual(k, ["foo"])
    v = trieobj.values()
    self.assertEqual(v, [1])
    self.assertEqual(trieobj.get("bar", 99), 99)
    trieobj["hello"] = "55a"
    self.assertEqual(trieobj.get_approximate("foo", 0), [("foo", 1, 0)])
    self.assertEqual(trieobj.get_approximate("foo", 1), [("foo", 1, 0)])
    self.assertEqual(trieobj.get_approximate("foa", 0), [])
    self.assertEqual(trieobj.get_approximate("foa", 1), [("foo", 1, 1)])
    x = sorted(trieobj.get_approximate("foa", 2))
    self.assertEqual(x, [("foo", 1, 1), ("foo", 1, 2), ("foo", 1, 2)])
    # foo  foo-  foo-
    # foa  f-oa  fo-a
    # mismatch a->o
    # insertion after f, deletion of o
    # insertion after o, deletion of o
    x = trieobj.get_approximate("foo", 4)
    y = {}
    for z in x:
        y[z] = y.get(z, 0) + 1
    x = sorted(y.items())
    self.assertEqual(x, [(("foo", 1, 0), 1), (("hello", "55a", 4), 6)])
    h = StringIO()
    trie.save(h, trieobj)
    h.seek(0)
    trieobj = trie.load(h)
    k = trieobj.keys()
    self.assertTrue("foo" in k)
    self.assertTrue("hello" in k)
    self.assertEqual(repr(trieobj["foo"]), "1")
    self.assertEqual(repr(trieobj["hello"]), "'55a'")
def test_write_read(self):
    """Checking write/read."""
    alignments = (list(ClustalIterator(StringIO(aln_example1)))
                  + list(ClustalIterator(StringIO(aln_example2))) * 2)
    handle = StringIO()
    self.assertEqual(3, ClustalWriter(handle).write_file(alignments))
    handle.seek(0)
    for i, a in enumerate(ClustalIterator(handle)):
        self.assertEqual(a.get_alignment_length(),
                         alignments[i].get_alignment_length())
def test_format_branch_length(self):
    """Custom format string for Newick branch length serialization."""
    tree = Phylo.read(StringIO("A:0.1;"), "newick")
    mem_file = StringIO()
    Phylo.write(tree, mem_file, "newick", format_branch_length="%.0e")
    # Py2.5 compat: Windows with Py2.5- represents this as 1e-001;
    # on all other platforms it's 1e-01
    self.assertTrue(mem_file.getvalue().strip()
                    in ["A:1e-01;", "A:1e-001;"])
def test_format_phylip(self):
    dm = DistanceMatrix(self.names, self.matrix)
    handle = StringIO()
    dm.format_phylip(handle)
    lines = handle.getvalue().splitlines()
    self.assertEqual(len(lines), len(dm) + 1)
    self.assertTrue(lines[0].endswith(str(len(dm))))
    for name, line in zip(self.names, lines[1:]):
        self.assertTrue(line.startswith(name))
def test_no_name(self):
    """Test FASTA record with no identifier."""
    handle = StringIO(">\nACGT")
    record = SeqIO.read(handle, "fasta")
    handle.close()
    self.assertEqual(str(record.seq), "ACGT")
    self.assertEqual("", record.id)
    self.assertEqual("", record.name)
    self.assertEqual("", record.description)
def read_longer_than_maxsize():
    with open(path.join("GenBank", "DS830848.gb"), 'r') as inhandle:
        data2 = inhandle.readlines()
    data2[0] = ("LOCUS       AZZZAA02123456789 " + str(sys.maxsize + 1)
                + " bp    DNA     linear   PRI 15-OCT-2018\n")
    long_in_tmp = StringIO()
    long_in_tmp.writelines(data2)
    long_in_tmp.seek(0)
    record = SeqIO.read(long_in_tmp, 'genbank')
def test_write_read_single(self):
    """Testing write/read when there is only one sequence."""
    alignment = next(ClustalIterator(StringIO(aln_example1)))
    # Now take just the first row as a new alignment:
    alignment = alignment[0:1]
    handle = StringIO()
    ClustalWriter(handle).write_file([alignment])
    handle.seek(0)
    for i, a in enumerate(ClustalIterator(handle)):
        self.assertEqual(a.get_alignment_length(),
                         alignment.get_alignment_length())
        self.assertEqual(len(a), 1)
def test_locus_line_topology(self):
    """Test if chromosome topology is conserved."""
    record = SeqIO.read('GenBank/DS830848.gb', 'genbank')
    self.assertEqual(record.annotations['topology'], 'linear')
    out_handle = StringIO()
    SeqIO.write([record], out_handle, 'genbank')
    first_line = out_handle.getvalue().split('\n')[0]
    self.assertIn('linear', first_line)
    with open('GenBank/DS830848.gb', 'r') as fh:
        orig_first_line = fh.readline().strip()
    self.assertEqual(first_line, orig_first_line)
def test_write(self):
    correct_output_a = """graph [
  directed 1
  node [
    id 0
    label "1"
    a 1
  ]
  node [
    id 1
    label "2"
  ]
  edge [
    source 0
    target 1
    x "x"
  ]
  edge [
    source 1
    target 0
    label "zzzz"
  ]
]"""
    correct_output_b = """graph [
  directed 1
  node [
    id 0
    label "2"
  ]
  node [
    id 1
    label "1"
    a 1
  ]
  edge [
    source 1
    target 0
    x "x"
  ]
  edge [
    source 0
    target 1
    label "zzzz"
  ]
]"""
    out = StringIO()
    writer = GmlWriter(out)
    graph = DiGraph()
    graph.add_node(1, {'a': 1})
    graph.add_edge(1, 2, {'x': 'x'})
    graph.add_edge(2, 1, "zzzz")
    writer.write(graph)
    self.assertIn(out.getvalue(), set([correct_output_a, correct_output_b]))
def test_newick_write(self):
    """Write a Newick tree and read it back, keeping internal node labels."""
    # Tree with internal node labels
    mem_file = StringIO()
    tree = Phylo.read(StringIO("(A,B,(C,D)E)F;"), "newick")
    Phylo.write(tree, mem_file, "newick")
    mem_file.seek(0)
    tree2 = Phylo.read(mem_file, "newick")
    # Sanity check
    self.assertEqual(tree2.count_terminals(), 4)
    # Check internal node labels were retained
    internal_names = set(c.name
                         for c in tree2.get_nonterminals()
                         if c is not None)
    self.assertEqual(internal_names, set(("E", "F")))
def test_genbank_date_datetime(self):
    """Check if datetime objects are handled correctly."""
    sequence_object = Seq("ATGC", generic_dna)
    record = SeqRecord(sequence_object,
                       id='123456789', name='UnitTest',
                       description='Test case for date parsing')
    record.annotations["date"] = datetime(2000, 2, 2)
    handle = StringIO()
    SeqIO.write(record, handle, 'genbank')
    handle.seek(0)
    gb = SeqIO.read(handle, "gb")
    self.assertEqual(gb.annotations["date"], "02-FEB-2000")
def _write_parse_and_compare(self, read1_records):
    handle = StringIO()
    SeqIO.write(read1_records, handle, "seqxml")
    handle.seek(0)
    read2_records = list(SeqIO.parse(handle, "seqxml"))
    self.assertEqual(len(read1_records), len(read2_records))
    for record1, record2 in zip(read1_records, read2_records):
        assert_equal_records(self, record1, record2)
def test_genbank_date_default(self):
    """Check if default date is handled correctly."""
    sequence_object = Seq("ATGC", generic_dna)
    # check if default value is inserted correctly
    record = SeqRecord(sequence_object,
                       id='123456789', name='UnitTest',
                       description='Test case for date parsing')
    handle = StringIO()
    SeqIO.write(record, handle, 'genbank')
    handle.seek(0)
    gb = SeqIO.read(handle, "gb")
    self.assertEqual(gb.annotations["date"], "01-JAN-1980")
def test_genbank_date_correct(self):
    """Check if user provided date is inserted correctly."""
    sequence_object = Seq("ATGC", generic_dna)
    record = SeqRecord(sequence_object,
                       id='123456789', name='UnitTest',
                       description='Test case for date parsing')
    record.annotations["date"] = "24-DEC-2015"
    handle = StringIO()
    SeqIO.write(record, handle, 'genbank')
    handle.seek(0)
    gb = SeqIO.read(handle, "gb")
    self.assertEqual(gb.annotations["date"], "24-DEC-2015")
def get(self, offset):
    """Return SeqRecord."""
    # Should be overridden for binary file formats etc:
    return self._parse(StringIO(_bytes_to_string(self.get_raw(offset))))
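# Hedged usage sketch (the filename and record id are hypothetical): this
# get()/get_raw() pair backs Bio.SeqIO.index(), which gives dict-like random
# access to records in a sequence file without loading it all into memory.
from Bio import SeqIO

index = SeqIO.index("example.fasta", "fasta")  # hypothetical file
record = index["some_id"]           # parsed on demand via get()
raw = index.get_raw("some_id")      # raw bytes via get_raw()
index.close()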
def check_simple_write_read(alignments, indent=" "):
    # print(indent+"Checking we can write and then read back these alignments")
    for format in test_write_read_align_with_seq_count:
        records_per_alignment = len(alignments[0])
        for a in alignments:
            if records_per_alignment != len(a):
                records_per_alignment = None
        # Can we expect this format to work?
        if not records_per_alignment \
                and format not in test_write_read_alignment_formats:
            continue
        print(indent + "Checking can write/read as '%s' format" % format)
        # Going to write to a handle...
        handle = StringIO()
        try:
            c = AlignIO.write(alignments, handle=handle, format=format)
            assert c == len(alignments)
        except ValueError as e:
            # This is often expected to happen, for example when we try and
            # write sequences of different lengths to an alignment file.
            print(indent + "Failed: %s" % str(e))
            # Carry on to the next format:
            continue
        # First, try with the seq_count
        if records_per_alignment:
            handle.flush()
            handle.seek(0)
            try:
                alignments2 = list(
                    AlignIO.parse(handle=handle, format=format,
                                  seq_count=records_per_alignment))
            except ValueError as e:
                # This is BAD. We can't read our own output.
                # I want to see the output when called from the test harness,
                # run_tests.py (which can be funny about new lines on Windows)
                handle.seek(0)
                raise ValueError("%s\n\n%s\n\n%s"
                                 % (str(e), repr(handle.read()),
                                    repr(alignments)))
            simple_alignment_comparison(alignments, alignments2, format)
        if format in test_write_read_alignment_formats:
            # Don't need the seq_count
            handle.flush()
            handle.seek(0)
            try:
                alignments2 = list(AlignIO.parse(handle=handle, format=format))
            except ValueError as e:
                # This is BAD. We can't read our own output.
                # I want to see the output when called from the test harness,
                # run_tests.py (which can be funny about new lines on Windows)
                handle.seek(0)
                raise ValueError("%s\n\n%s\n\n%s"
                                 % (str(e), repr(handle.read()),
                                    repr(alignments)))
            simple_alignment_comparison(alignments, alignments2, format)
        if len(alignments) > 1:
            # Try writing just one Alignment (not a list)
            handle = StringIO()
            AlignIO.write(alignments[0:1], handle, format)
            assert handle.getvalue() == alignments[0].format(format)
import os.path
import unittest
import shutil
import tempfile

from Bio._py3k import StringIO
from Bio import File

data = """This
is
a multi-line
file"""

### UndoHandle
h = File.UndoHandle(StringIO(data))
print(h.readline())  # 'This'
print(h.peekline())  # 'is'
print(h.readline())  # 'is'
h.saveline("saved")
print(h.peekline())  # 'saved'
h.saveline("another")
print(h.readline())  # 'another'
print(h.readline())  # 'saved'

# Test readlines after saveline
h.saveline("saved again")
lines = h.readlines()
print(repr(lines[0]))  # 'saved again'
print(repr(lines[1]))  # 'a multi-line'
def testParse(self):
    f = open("./SCOP/dir.cla.scop.txt_test")
    try:
        cla = f.read()
        f.close()
        f = open("./SCOP/dir.des.scop.txt_test")
        des = f.read()
        f.close()
        f = open("./SCOP/dir.hie.scop.txt_test")
        hie = f.read()
    finally:
        f.close()
    scop = Scop(StringIO(cla), StringIO(des), StringIO(hie))
    cla_out = StringIO()
    scop.write_cla(cla_out)
    lines = zip(cla.rstrip().split('\n'),
                cla_out.getvalue().rstrip().split('\n'))
    for expected_line, line in lines:
        self.assertTrue(self._compare_cla_lines(expected_line, line))
    des_out = StringIO()
    scop.write_des(des_out)
    self.assertEqual(des_out.getvalue(), des)
    hie_out = StringIO()
    scop.write_hie(hie_out)
    self.assertEqual(hie_out.getvalue(), hie)
    domain = scop.getDomainBySid("d1hbia_")
    self.assertEqual(domain.sunid, 14996)
    domains = scop.getDomains()
    self.assertEqual(len(domains), 14)
    self.assertEqual(domains[4].sunid, 14988)
    dom = scop.getNodeBySunid(-111)
    self.assertEqual(dom, None)
    dom = scop.getDomainBySid("no such domain")
    self.assertEqual(dom, None)
asis             549 TCTTCTTACTCTTAGGAGGATGGGCGCTAGAAAGAGTTTTAAGAGGGTGT    598
asis             311 --------------------------------------------------    311

asis             599 GAAAGGGGGTTAATAGC    615
asis             311 -----------------    311


#---------------------------------------
#---------------------------------------"""

from Bio._py3k import StringIO

alignments = list(EmbossIterator(StringIO(pair_example)))
assert len(alignments) == 1
assert len(alignments[0]) == 2
assert [r.id for r in alignments[0]] \
    == ["IXI_234", "IXI_235"]

alignments = list(EmbossIterator(StringIO(simple_example)))
assert len(alignments) == 1
assert len(alignments[0]) == 4
assert [r.id for r in alignments[0]] \
    == ["IXI_234", "IXI_235", "IXI_236", "IXI_237"]

alignments = list(EmbossIterator(StringIO(pair_example + simple_example)))
assert len(alignments) == 2
assert len(alignments[0]) == 2
assert len(alignments[1]) == 4
def test_stringio(self):
    s = StringIO()
    with File.as_handle(s) as handle:
        self.assertIs(s, handle)
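# Hedged sketch (not part of the test suite): File.as_handle also accepts a
# path string, in which case it opens the file itself and closes it on
# exiting the with-block, unlike the pass-through case tested above. A
# temporary file stands in for a real data file here.
import tempfile
from Bio import File

with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as tmp:
    tmp.write("hello\n")
with File.as_handle(tmp.name) as handle:
    print(handle.readline())  # 'hello'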
def test_read_write_clustal(self):
    """Test the base alignment stuff."""
    path = os.path.join(os.getcwd(), "Clustalw", "opuntia.aln")
    alignment = AlignIO.read(path, "clustal",
                             alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))
    self.assertEqual(len(alignment), 7)
    seq_record = alignment[0]
    self.assertEqual(seq_record.description, "gi|6273285|gb|AF191659.1|AF191")
    self.assertEqual(seq_record.seq, Seq("TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCCATTGATTTAGTGTACCAGA"))
    seq_record = alignment[1]
    self.assertEqual(seq_record.description, "gi|6273284|gb|AF191658.1|AF191")
    self.assertEqual(seq_record.seq, "TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATA--------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA")
    seq_record = alignment[2]
    self.assertEqual(seq_record.description, "gi|6273287|gb|AF191661.1|AF191")
    self.assertEqual(seq_record.seq, "TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTCAAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA")
    seq_record = alignment[3]
    self.assertEqual(seq_record.description, "gi|6273286|gb|AF191660.1|AF191")
    self.assertEqual(seq_record.seq, "TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTATAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA")
    seq_record = alignment[4]
    self.assertEqual(seq_record.description, "gi|6273290|gb|AF191664.1|AF191")
    self.assertEqual(seq_record.seq, "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA")
    seq_record = alignment[5]
    self.assertEqual(seq_record.description, "gi|6273289|gb|AF191663.1|AF191")
    self.assertEqual(seq_record.seq, "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTATACCAGA")
    seq_record = alignment[6]
    self.assertEqual(seq_record.description, "gi|6273291|gb|AF191665.1|AF191")
    self.assertEqual(seq_record.seq, "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA")
    self.assertEqual(alignment.get_alignment_length(), 156)
    align_info = AlignInfo.SummaryInfo(alignment)
    consensus = align_info.dumb_consensus()
    self.assertIsInstance(consensus, Seq)
    self.assertEqual(consensus, "TATACATTAAAGXAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTXCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA")
    dictionary = align_info.replacement_dictionary(["N"])
    self.assertEqual(len(dictionary), 16)
    self.assertAlmostEqual(dictionary[("A", "A")], 1395.0, places=1)
    self.assertAlmostEqual(dictionary[("A", "C")], 3.0, places=1)
    self.assertAlmostEqual(dictionary[("A", "G")], 13.0, places=1)
    self.assertAlmostEqual(dictionary[("A", "T")], 6.0, places=1)
    self.assertAlmostEqual(dictionary[("C", "A")], 3.0, places=1)
    self.assertAlmostEqual(dictionary[("C", "C")], 271.0, places=1)
    self.assertAlmostEqual(dictionary[("C", "G")], 0, places=1)
    self.assertAlmostEqual(dictionary[("C", "T")], 16.0, places=1)
    self.assertAlmostEqual(dictionary[("G", "A")], 5.0, places=1)
    self.assertAlmostEqual(dictionary[("G", "C")], 0, places=1)
    self.assertAlmostEqual(dictionary[("G", "G")], 480.0, places=1)
    self.assertAlmostEqual(dictionary[("G", "T")], 0, places=1)
self.assertAlmostEqual(dictionary[("T", "A")], 6.0, places=1) self.assertAlmostEqual(dictionary[("T", "C")], 12.0, places=1) self.assertAlmostEqual(dictionary[("T", "G")], 0, places=1) self.assertAlmostEqual(dictionary[("T", "T")], 874.0, places=1) matrix = align_info.pos_specific_score_matrix(consensus, ["N"]) self.assertEqual(str(matrix), """\ A C G T T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 1.0 0.0 0.0 6.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 X 4.0 0.0 3.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 4.0 A 4.0 0.0 0.0 0.0 T 0.0 0.0 0.0 3.0 A 3.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 C 1.0 6.0 0.0 0.0 A 6.0 0.0 0.0 1.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 X 0.0 3.0 0.0 4.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 2.0 0.0 5.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 1.0 0.0 6.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 T 0.0 0.0 0.0 7.0 G 1.0 0.0 6.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 """) matrix = align_info.pos_specific_score_matrix(chars_to_ignore=["N"]) self.assertEqual(str(matrix), """\ A C G T T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 1.0 0.0 0.0 6.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 X 4.0 0.0 3.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 
0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 4.0 A 4.0 0.0 0.0 0.0 T 0.0 0.0 0.0 3.0 A 3.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 C 1.0 6.0 0.0 0.0 A 6.0 0.0 0.0 1.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 X 0.0 3.0 0.0 4.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 2.0 0.0 5.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 1.0 0.0 6.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 T 0.0 0.0 0.0 7.0 G 1.0 0.0 6.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 """) second_seq = alignment[1].seq matrix = align_info.pos_specific_score_matrix(second_seq, ["N"]) self.assertEqual(str(matrix), """\ A C G T T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 1.0 0.0 0.0 6.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 4.0 0.0 3.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 
0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 4.0 A 4.0 0.0 0.0 0.0 - 0.0 0.0 0.0 3.0 - 3.0 0.0 0.0 0.0 - 0.0 0.0 0.0 1.0 - 1.0 0.0 0.0 0.0 - 0.0 0.0 0.0 1.0 - 1.0 0.0 0.0 0.0 - 0.0 0.0 0.0 1.0 - 1.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 C 1.0 6.0 0.0 0.0 A 6.0 0.0 0.0 1.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 3.0 0.0 4.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 2.0 0.0 5.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 1.0 0.0 6.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 T 0.0 0.0 0.0 7.0 G 1.0 0.0 6.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 """) value = align_info.information_content(5, 50, chars_to_ignore=["N"]) self.assertAlmostEqual(value, 88.42, places=2) value = align_info.information_content(chars_to_ignore=["N"]) self.assertAlmostEqual(value, 287.55, places=2) e_freq = {"G": 0.25, "C": 0.25, "A": 0.25, "T": 0.25} e_freq_table = FreqTable.FreqTable(e_freq, FreqTable.FREQ, IUPAC.unambiguous_dna) value = align_info.information_content(e_freq_table=e_freq_table, chars_to_ignore=["N"]) self.assertAlmostEqual(value, 287.55, places=2) self.assertEqual(align_info.get_column(1), "AAAAAAA") self.assertAlmostEqual(align_info.ic_vector[1], 2.00, places=2) self.assertEqual(align_info.get_column(7), "TTTATTT") self.assertAlmostEqual(align_info.ic_vector[7], 1.41, places=2) handle = StringIO() AlignInfo.print_info_content(align_info, fout=handle) self.assertEqual(handle.getvalue(), """\ 0 T 2.000 1 A 2.000 2 T 2.000 3 A 2.000 4 C 2.000 5 A 2.000 6 T 2.000 7 T 1.408 8 A 2.000 9 A 2.000 10 A 2.000 11 G 2.000 12 A 1.015 13 A 2.000 14 G 2.000 15 G 2.000 16 G 2.000 17 G 2.000 18 G 2.000 19 A 2.000 20 T 2.000 21 G 2.000 22 C 2.000 23 G 2.000 24 G 2.000 25 A 2.000 26 T 2.000 27 A 2.000 28 A 2.000 29 A 2.000 30 T 2.000 31 G 2.000 32 G 2.000 33 A 2.000 34 A 2.000 35 A 2.000 36 G 2.000 37 G 2.000 38 C 2.000 39 G 2.000 40 A 2.000 41 A 2.000 42 A 2.000 43 G 2.000 44 A 2.000 45 A 2.000 46 A 2.000 47 G 2.000 48 A 2.000 49 A 2.000 50 T 2.000 51 A 2.000 52 T 2.000 53 A 2.000 54 T 2.000 55 A 2.000 56 - 0.682 57 - 0.682 58 - 0.333 59 - 0.333 60 - -0.115 61 - -0.115 62 - -0.115 63 - -0.115 64 - -0.115 65 - -0.115 66 A 2.000 67 T 2.000 68 A 2.000 69 T 2.000 
70 A 2.000 71 T 2.000 72 T 2.000 73 T 2.000 74 C 1.408 75 A 1.408 76 A 2.000 77 A 2.000 78 T 2.000 79 T 2.000 80 T 1.015 81 C 2.000 82 C 2.000 83 T 2.000 84 T 2.000 85 A 2.000 86 T 2.000 87 A 2.000 88 T 2.000 89 A 2.000 90 C 1.137 91 C 2.000 92 C 2.000 93 A 2.000 94 A 2.000 95 A 2.000 96 T 2.000 97 A 2.000 98 T 2.000 99 A 2.000 100 A 2.000 101 A 2.000 102 A 2.000 103 A 2.000 104 T 2.000 105 A 2.000 106 T 2.000 107 C 2.000 108 T 2.000 109 A 2.000 110 A 2.000 111 T 2.000 112 A 2.000 113 A 2.000 114 A 2.000 115 T 2.000 116 T 2.000 117 A 2.000 118 G 2.000 119 A 2.000 120 T 2.000 121 G 2.000 122 A 2.000 123 A 2.000 124 T 2.000 125 A 2.000 126 T 2.000 127 C 2.000 128 A 2.000 129 A 2.000 130 A 2.000 131 G 2.000 132 A 2.000 133 A 2.000 134 T 2.000 135 C 2.000 136 C 1.408 137 A 2.000 138 T 2.000 139 T 2.000 140 G 2.000 141 A 2.000 142 T 2.000 143 T 2.000 144 T 2.000 145 A 2.000 146 G 2.000 147 T 2.000 148 G 1.408 149 T 2.000 150 A 2.000 151 C 2.000 152 C 2.000 153 A 2.000 154 G 2.000 155 A 2.000 """)
def from_string(cls, treetext):
    """Instantiate the parser from the given tree string."""
    handle = StringIO(treetext)
    return cls(handle)
def test_invalid_format(self):
    """Check convert file format checking."""
    self.assertRaises(ValueError, TogoWS.convert,
                      StringIO("PLACEHOLDER"),
                      "genbank", "invalid_for_testing")
    self.assertRaises(ValueError, TogoWS.convert,
                      StringIO("PLACEHOLDER"),
                      "invalid_for_testing", "fasta")
print(is_blank_line('', allow_spaces=1))  # 1
print(is_blank_line('', allow_spaces=0))  # 1
print(is_blank_line(string.whitespace, allow_spaces=1))  # 1
print(is_blank_line('hello'))  # 0
print(is_blank_line('hello', allow_spaces=1))  # 0
print(is_blank_line('hello', allow_spaces=0))  # 0
print(is_blank_line(string.whitespace, allow_spaces=0))  # 0

### safe_readline
print("Running tests on safe_readline")

data = """This
file"""

h = File.UndoHandle(StringIO(data))

safe_readline = ParserSupport.safe_readline
print(safe_readline(h))  # "This"
print(safe_readline(h))  # "file"
try:
    safe_readline(h)
except ValueError:
    print("correctly failed")
else:
    print("ERROR, should have failed")

### safe_peekline
print("Running tests on safe_peekline")
safe_peekline = ParserSupport.safe_peekline
def qblast(program, database, sequence,
           auto_format=None, composition_based_statistics=None,
           db_genetic_code=None, endpoints=None, entrez_query='(none)',
           expect=10.0, filter=None, gapcosts=None, genetic_code=None,
           hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
           matrix_name=None, nucl_penalty=None, nucl_reward=None,
           other_advanced=None, perc_ident=None, phi_pattern=None,
           query_file=None, query_believe_defline=None, query_from=None,
           query_to=None, searchsp_eff=None, service=None, threshold=None,
           ungapped_alignment=None, word_size=None,
           alignments=500, alignment_view=None, descriptions=500,
           entrez_links_new_window=None, expect_low=None, expect_high=None,
           format_entrez_query=None, format_object=None, format_type='XML',
           ncbi_gi=None, results_file=None, show_overview=None,
           megablast=None):
    """Do a BLAST search using the QBLAST server at NCBI.

    Supports all parameters of the qblast API for Put and Get.

    Some useful parameters:

    program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
    database       Which database to search against (e.g. "nr").
    sequence       The sequence to search.
    ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
    descriptions   Number of descriptions to show. Def 500.
    alignments     Number of alignments to show. Def 500.
    expect         An expect value cutoff. Def 10.0.
    matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
    filter         "none" turns off filtering. Default no filtering.
    format_type    "HTML", "Text", "ASN.1", or "XML". Def. "XML".
    entrez_query   Entrez query to limit Blast search.
    hitlist_size   Number of hits to return. Default 50.
    megablast      TRUE/FALSE whether to use Mega BLAST algorithm (blastn only)
    service        plain, psi, phi, rpsblast, megablast (lower case)

    This function does no checking of the validity of the parameters
    and passes the values to the server as is. More help is available at:
    http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html
    """
    import time

    assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        # ('PSSM', pssm), - Is it possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        # ('RESULTS_FILE', ...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi",
                       message,
                       {"User-Agent": "BiopythonClient"})
    handle = _urlopen(request)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    rid, rtoe = _parse_qblast_ref_page(handle)
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.
    # Use an exponential backoff delay from 2 to 120 seconds.
    delay = 2.0
    previous = time.time()
    while True:
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current
        if delay + .5 * delay <= 120:
            delay += .5 * delay
        else:
            delay = 120

        request = _Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi",
                           message,
                           {"User-Agent": "BiopythonClient"})
        handle = _urlopen(request)
        results = _as_string(handle.read())

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
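# A minimal usage sketch (requires network access; the query sequence is a
# made-up placeholder): run a nucleotide BLAST against "nt" and hand the XML
# result handle to the NCBIXML parser.
from Bio.Blast import NCBIWWW, NCBIXML

result_handle = NCBIWWW.qblast("blastn", "nt", "ACGGGGTCTCGAAAGT")
blast_record = NCBIXML.read(result_handle)
print(blast_record.query)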
def get_raw_check(self, filename, format, alphabet, comp):
    # Also checking the key_function here
    if comp:
        h = gzip.open(filename, "rb")
        raw_file = h.read()
        h.close()
        h = gzip_open(filename, format)
        id_list = [rec.id.lower()
                   for rec in SeqIO.parse(h, format, alphabet)]
        h.close()
    else:
        h = open(filename, "rb")
        raw_file = h.read()
        h.close()
        id_list = [rec.id.lower()
                   for rec in SeqIO.parse(filename, format, alphabet)]
    if format in ["sff"]:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', BiopythonParserWarning)
            rec_dict = SeqIO.index(filename, format, alphabet,
                                   key_function=lambda x: x.lower())
    else:
        rec_dict = SeqIO.index(filename, format, alphabet,
                               key_function=lambda x: x.lower())
    self.assertEqual(set(id_list), set(rec_dict))
    self.assertEqual(len(id_list), len(rec_dict))
    for key in id_list:
        self.assertTrue(key in rec_dict)
        self.assertEqual(key, rec_dict[key].id.lower())
        self.assertEqual(key, rec_dict.get(key).id.lower())
        raw = rec_dict.get_raw(key)
        self.assertTrue(raw.strip())
        self.assertTrue(raw in raw_file)
        rec1 = rec_dict[key]
        # Following isn't very elegant, but it lets me test the
        # __getitem__ SFF code is working.
        if format in SeqIO._BinaryFormats:
            handle = BytesIO(raw)
        else:
            handle = StringIO(_bytes_to_string(raw))
        if format == "sff":
            rec2 = SeqIO.SffIO._sff_read_seq_record(
                handle,
                rec_dict._proxy._flows_per_read,
                rec_dict._proxy._flow_chars,
                rec_dict._proxy._key_sequence,
                rec_dict._proxy._alphabet,
                trim=False)
        elif format == "sff-trim":
            rec2 = SeqIO.SffIO._sff_read_seq_record(
                handle,
                rec_dict._proxy._flows_per_read,
                rec_dict._proxy._flow_chars,
                rec_dict._proxy._key_sequence,
                rec_dict._proxy._alphabet,
                trim=True)
        elif format == "uniprot-xml":
            self.assertTrue(raw.startswith(_as_bytes("<entry ")))
            self.assertTrue(raw.endswith(_as_bytes("</entry>")))
            # Currently the __getitem__ method uses this
            # trick too, but we hope to fix that later
            raw = """<?xml version='1.0' encoding='UTF-8'?>
<uniprot xmlns="http://uniprot.org/uniprot"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://uniprot.org/uniprot
http://www.uniprot.org/support/docs/uniprot.xsd">
%s
</uniprot>
""" % _bytes_to_string(raw)
            handle = StringIO(raw)
            rec2 = SeqIO.read(handle, format, alphabet)
        else:
            rec2 = SeqIO.read(handle, format, alphabet)
        self.assertEqual(True, compare_record(rec1, rec2))
    rec_dict.close()
    del rec_dict
def test_pair_example3(self):
    alignments = list(EmbossIterator(StringIO(pair_example3)))
    self.assertEqual(len(alignments), 1)
    self.assertEqual(len(alignments[0]), 2)
    self.assertEqual([r.id for r in alignments[0]],
                     ["asis", "asis"])
def test_simple_example(self):
    alignments = list(EmbossIterator(StringIO(simple_example)))
    self.assertEqual(len(alignments), 1)
    self.assertEqual(len(alignments[0]), 4)
    self.assertEqual([r.id for r in alignments[0]],
                     ["IXI_234", "IXI_235", "IXI_236", "IXI_237"])
def parse_str(self, string):
    return self.parse(StringIO(string))
                            self.xml_generator.endElement("property")
                elif isinstance(value, (int, float, basestring)):
                    attr = {"name": key, "value": str(value)}
                    self.xml_generator.startElement(
                        "property", AttributesImpl(attr))
                    self.xml_generator.endElement("property")


if __name__ == "__main__":
    print("Running quick self test")

    from Bio import SeqIO
    import sys

    with open("Tests/SeqXML/protein_example.xml", "r") as fileHandle:
        records = list(SeqIO.parse(fileHandle, "seqxml"))

    from Bio._py3k import StringIO
    stringHandle = StringIO()

    SeqIO.write(records, stringHandle, "seqxml")
    SeqIO.write(records, sys.stdout, "seqxml")
    print("")

    stringHandle.seek(0)
    records = list(SeqIO.parse(stringHandle, "seqxml"))
    SeqIO.write(records, sys.stdout, "seqxml")
    print("")
def test_parse_qblast_ref_page(self):
    with open("Blast/html_msgid_29_blastx_001.html", "r") as f:
        handle = StringIO(f.read())
    self.assertRaises(ValueError,
                      NCBIWWW._parse_qblast_ref_page,
                      handle)
def from_string(cls, treetext):
    """Instantiate the Newick Tree class from the given string."""
    handle = StringIO(treetext)
    return cls(handle)
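# Hedged usage sketch: in Bio.Phylo.NewickIO this classmethod lives on the
# Parser class, so a tree can be built straight from a string; parse()
# yields the trees found in the handle.
from Bio.Phylo.NewickIO import Parser

trees = list(Parser.from_string("(A,(B,C))D;").parse())
print(trees[0])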
def test_read(self):
    """Test read method."""
    h = File.UndoHandle(StringIO("some text"))
    h.saveline("more text")
    self.assertEqual(h.read(), 'more textsome text')
def test_empty_file_read(self):
    self.assertEqual([], list(NexusIterator(StringIO())))
def _do_segmentation(cnarr, method, threshold, variants=None,
                     skip_low=False, skip_outliers=10, min_weight=0,
                     save_dataframe=False, rlibpath=None):
    """Infer copy number segments from the given coverage table."""
    if not len(cnarr):
        return cnarr

    filtered_cn = cnarr.copy()
    # Filter out bins with no or near-zero sequencing coverage
    if skip_low:
        filtered_cn = filtered_cn.drop_low_coverage(verbose=False)
    # Filter by distance from rolling quantiles
    if skip_outliers:
        filtered_cn = drop_outliers(filtered_cn, 50, skip_outliers)
    # Filter by bin weights
    if min_weight:
        weight_too_low = (filtered_cn["weight"] < min_weight).fillna(True)
    else:
        weight_too_low = (filtered_cn["weight"] == 0).fillna(True)
    n_weight_too_low = weight_too_low.sum() if len(weight_too_low) else 0
    if n_weight_too_low:
        filtered_cn = filtered_cn[~weight_too_low]
        if min_weight:
            logging.debug("Dropped %d bins with weight below %s",
                          n_weight_too_low, min_weight)
        else:
            logging.debug("Dropped %d bins with zero weight",
                          n_weight_too_low)

    if len(filtered_cn) != len(cnarr):
        msg = ("Dropped %d / %d bins"
               % (len(cnarr) - len(filtered_cn), len(cnarr)))
        if cnarr["chromosome"].iat[0] == cnarr["chromosome"].iat[-1]:
            msg += " on chromosome " + str(cnarr["chromosome"].iat[0])
        logging.info(msg)
    if not len(filtered_cn):
        return filtered_cn

    seg_out = ""
    if method == 'haar':
        segarr = haar.segment_haar(filtered_cn, threshold)
    elif method == 'none':
        segarr = none.segment_none(filtered_cn)
    elif method.startswith('hmm'):
        segarr = hmm.segment_hmm(filtered_cn, method, threshold)
    elif method in ('cbs', 'flasso'):
        # Run R scripts to calculate copy number segments
        rscript = {'cbs': cbs.CBS_RSCRIPT,
                   'flasso': flasso.FLASSO_RSCRIPT,
                   }[method]
        filtered_cn['start'] += 1  # Convert to 1-indexed coordinates for R
        with tempfile.NamedTemporaryFile(suffix='.cnr', mode="w+t") as tmp:
            # TODO tabio.write(filtered_cn, tmp, 'seg')
            filtered_cn.data.to_csv(tmp, index=False, sep='\t',
                                    float_format='%.6g', mode="w+t")
            tmp.flush()
            script_strings = {
                'probes_fname': tmp.name,
                'sample_id': cnarr.sample_id,
                'threshold': threshold,
                'rlibpath': ('.libPaths(c("%s"))' % rlibpath
                             if rlibpath else ''),
            }
            with core.temp_write_text(rscript % script_strings,
                                      mode='w+t') as script_fname:
                seg_out = core.call_quiet('~/conda/cnvkit_r/bin/Rscript',
                                          '--vanilla', script_fname)
        # Convert R dataframe contents (SEG) to a proper CopyNumArray
        # NB: Automatically shifts 'start' back from 1- to 0-indexed
        segarr = tabio.read(StringIO(seg_out.decode()), "seg", into=CNA)
        if method == 'flasso':
            # Merge adjacent bins with same log2 value into segments
            if 'weight' in filtered_cn:
                segarr['weight'] = filtered_cn['weight']
            else:
                segarr['weight'] = 1.0
            segarr = squash_by_groups(segarr, segarr['log2'], by_arm=True)
    else:
        raise ValueError("Unknown method %r" % method)

    segarr.meta = cnarr.meta.copy()
    if variants and not method.startswith('hmm'):
        # Re-segment the variant allele freqs within each segment
        newsegs = [haar.variants_in_segment(subvarr, segment,
                                            0.01 * threshold)
                   for segment, subvarr in variants.by_ranges(segarr)]
        segarr = segarr.as_dataframe(pd.concat(newsegs))
        segarr['baf'] = variants.baf_by_ranges(segarr)

    segarr = transfer_fields(segarr, cnarr)
    if save_dataframe:
        return segarr, seg_out
    else:
        return segarr
def test_write_alignment(self):
    # Default causes no interleave (columns <= 1000)
    records = [SeqRecord(Seq("ATGCTGCTGA" * 90, alphabet=ambiguous_dna),
                         id=_id)
               for _id in ["foo", "bar", "baz"]]
    a = MultipleSeqAlignment(records, alphabet=ambiguous_dna)
    handle = StringIO()
    NexusWriter(handle).write_alignment(a)
    handle.seek(0)
    data = handle.read()
    self.assertIn("ATGCTGCTGA" * 90, data)

    # Default causes interleave (columns > 1000)
    records = [SeqRecord(Seq("ATGCTGCTGA" * 110, alphabet=ambiguous_dna),
                         id=_id)
               for _id in ["foo", "bar", "baz"]]
    a = MultipleSeqAlignment(records, alphabet=ambiguous_dna)
    handle = StringIO()
    NexusWriter(handle).write_alignment(a)
    handle.seek(0)
    data = handle.read()
    self.assertNotIn("ATGCTGCTGA" * 90, data)
    self.assertIn("ATGCTGCTGA" * 7, data)

    # Override interleave: True
    records = [SeqRecord(Seq("ATGCTGCTGA" * 9, alphabet=ambiguous_dna),
                         id=_id)
               for _id in ["foo", "bar", "baz"]]
    a = MultipleSeqAlignment(records, alphabet=ambiguous_dna)
    handle = StringIO()
    NexusWriter(handle).write_alignment(a, interleave=True)
    handle.seek(0)
    data = handle.read()
    self.assertNotIn("ATGCTGCTGA" * 9, data)
    self.assertIn("ATGCTGCTGA" * 7, data)

    # Override interleave: False
    records = [SeqRecord(Seq("ATGCTGCTGA" * 110, alphabet=ambiguous_dna),
                         id=_id)
               for _id in ["foo", "bar", "baz"]]
    a = MultipleSeqAlignment(records, alphabet=ambiguous_dna)
    handle = StringIO()
    NexusWriter(handle).write_alignment(a, interleave=False)
    handle.seek(0)
    data = handle.read()
    self.assertIn("ATGCTGCTGA" * 110, data)
def qblast( program, database, sequence, url_base=NCBI_BLAST_URL, auto_format=None, composition_based_statistics=None, db_genetic_code=None, endpoints=None, entrez_query="(none)", expect=10.0, filter=None, gapcosts=None, genetic_code=None, hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None, matrix_name=None, nucl_penalty=None, nucl_reward=None, other_advanced=None, perc_ident=None, phi_pattern=None, query_file=None, query_believe_defline=None, query_from=None, query_to=None, searchsp_eff=None, service=None, threshold=None, ungapped_alignment=None, word_size=None, short_query=None, alignments=500, alignment_view=None, descriptions=500, entrez_links_new_window=None, expect_low=None, expect_high=None, format_entrez_query=None, format_object=None, format_type="XML", ncbi_gi=None, results_file=None, show_overview=None, megablast=None, template_type=None, template_length=None, ): """BLAST search using NCBI's QBLAST server or a cloud service provider. Supports all parameters of the old qblast API for Put and Get. Please note that NCBI uses the new Common URL API for BLAST searches on the internet (http://ncbi.github.io/blast-cloud/dev/api.html). Thus, some of the parameters used by this function are not (or are no longer) officially supported by NCBI. Although they are still functioning, this may change in the future. The Common URL API (http://ncbi.github.io/blast-cloud/dev/api.html) allows doing BLAST searches on cloud servers. To use this feature, please set ``url_base='http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi'`` and ``format_object='Alignment'``. For more details, please see https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast Some useful parameters: - program blastn, blastp, blastx, tblastn, or tblastx (lower case) - database Which database to search against (e.g. "nr"). - sequence The sequence to search. - ncbi_gi TRUE/FALSE whether to give 'gi' identifier. - descriptions Number of descriptions to show. Def 500. - alignments Number of alignments to show. Def 500. - expect An expect value cutoff. Def 10.0. - matrix_name Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45). - filter "none" turns off filtering. Default no filtering - format_type "HTML", "Text", "ASN.1", or "XML". Def. "XML". - entrez_query Entrez query to limit Blast search - hitlist_size Number of hits to return. Default 50 - megablast TRUE/FALSE whether to use MEga BLAST algorithm (blastn only) - short_query TRUE/FALSE whether to adjust the search parameters for a short query sequence. Note that this will override manually set parameters like word size and e value. Turns off when sequence length is > 30 residues. Default: None. - service plain, psi, phi, rpsblast, megablast (lower case) This function does no checking of the validity of the parameters and passes the values to the server as is. More help is available at: https://ncbi.github.io/blast-cloud/dev/api.html """ import time programs = ["blastn", "blastp", "blastx", "tblastn", "tblastx"] if program not in programs: raise ValueError("Program specified is %s. Expected one of %s" % (program, ", ".join(programs))) # SHORT_QUERY_ADJUST throws an error when using blastn (wrong parameter # assignment from NCBIs side). 
    # Thus we set the (known) parameters directly:
    if short_query and program == "blastn":
        short_query = None
        # We only use the 'short-query' parameters for short sequences:
        if len(sequence) < 31:
            expect = 1000
            word_size = 7
            nucl_reward = 1
            filter = None
            lcase_mask = None
            warnings.warn(
                '"SHORT_QUERY_ADJUST" is incorrectly implemented '
                "(by NCBI) for blastn. We bypass the problem by "
                "manually adjusting the search parameters. Thus, "
                "results may slightly differ from web page "
                "searches.", BiopythonWarning)

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ("AUTO_FORMAT", auto_format),
        ("COMPOSITION_BASED_STATISTICS", composition_based_statistics),
        ("DATABASE", database),
        ("DB_GENETIC_CODE", db_genetic_code),
        ("ENDPOINTS", endpoints),
        ("ENTREZ_QUERY", entrez_query),
        ("EXPECT", expect),
        ("FILTER", filter),
        ("GAPCOSTS", gapcosts),
        ("GENETIC_CODE", genetic_code),
        ("HITLIST_SIZE", hitlist_size),
        ("I_THRESH", i_thresh),
        ("LAYOUT", layout),
        ("LCASE_MASK", lcase_mask),
        ("MEGABLAST", megablast),
        ("MATRIX_NAME", matrix_name),
        ("NUCL_PENALTY", nucl_penalty),
        ("NUCL_REWARD", nucl_reward),
        ("OTHER_ADVANCED", other_advanced),
        ("PERC_IDENT", perc_ident),
        ("PHI_PATTERN", phi_pattern),
        ("PROGRAM", program),
        # ("PSSM", pssm), - Is it possible to use PSI-BLAST via this API?
        ("QUERY", sequence),
        ("QUERY_FILE", query_file),
        ("QUERY_BELIEVE_DEFLINE", query_believe_defline),
        ("QUERY_FROM", query_from),
        ("QUERY_TO", query_to),
        # ("RESULTS_FILE", ...), - Can we use this parameter?
        ("SEARCHSP_EFF", searchsp_eff),
        ("SERVICE", service),
        ("SHORT_QUERY_ADJUST", short_query),
        ("TEMPLATE_TYPE", template_type),
        ("TEMPLATE_LENGTH", template_length),
        ("THRESHOLD", threshold),
        ("UNGAPPED_ALIGNMENT", ungapped_alignment),
        ("WORD_SIZE", word_size),
        ("CMD", "Put"),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request(url_base, message, {"User-Agent": "BiopythonClient"})
    handle = _urlopen(request)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    rid, rtoe = _parse_qblast_ref_page(handle)
    parameters = [
        ("ALIGNMENTS", alignments),
        ("ALIGNMENT_VIEW", alignment_view),
        ("DESCRIPTIONS", descriptions),
        ("ENTREZ_LINKS_NEW_WINDOW", entrez_links_new_window),
        ("EXPECT_LOW", expect_low),
        ("EXPECT_HIGH", expect_high),
        ("FORMAT_ENTREZ_QUERY", format_entrez_query),
        ("FORMAT_OBJECT", format_object),
        ("FORMAT_TYPE", format_type),
        ("NCBI_GI", ncbi_gi),
        ("RID", rid),
        ("RESULTS_FILE", results_file),
        ("SERVICE", service),
        ("SHOW_OVERVIEW", show_overview),
        ("CMD", "Get"),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.
    # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
    # 1. Do not contact the server more often than once every 10 seconds.
    # 2. Do not poll for any single RID more often than once a minute.
    # 3. Use the URL parameter email and tool, so that the NCBI
    #    can contact you if there is a problem.
    # 4. Run scripts weekends or between 9 pm and 5 am Eastern time
    #    on weekdays if more than 50 searches will be submitted.
    # --
    # Could start with a 10s delay, but expect most short queries
    # will take longer thus at least 70s with delay. Therefore,
    # start with 20s delay, thereafter once a minute.
    delay = 20  # seconds
    while True:
        current = time.time()
        # qblast._previous is a function attribute recording when the
        # last request was sent, shared across calls:
        wait = qblast._previous + delay - current
        if wait > 0:
            time.sleep(wait)
            qblast._previous = current + wait
        else:
            qblast._previous = current
        # Delay by at least 60 seconds only if running the request
        # against the public NCBI API:
        if delay < 60 and url_base == NCBI_BLAST_URL:
            # Wasn't a quick return, must wait at least a minute
            delay = 60
        request = _Request(url_base, message, {"User-Agent": "BiopythonClient"})
        handle = _urlopen(request)
        results = _as_string(handle.read())

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break
    return StringIO(results)
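# A hedged end-to-end sketch of calling qblast and parsing the XML reply
# with Bio.Blast.NCBIXML (requires network access; the query sequence and
# hitlist size below are arbitrary examples):
from Bio.Blast import NCBIWWW, NCBIXML

result_handle = NCBIWWW.qblast("blastp", "nr",
                               "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ",
                               hitlist_size=5)
blast_record = NCBIXML.read(result_handle)  # format_type defaults to "XML"
for alignment in blast_record.alignments:
    print(alignment.title)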
@classmethod
def from_string(cls, treetext):
    """Instantiate the class from a tree string, wrapping it in a handle."""
    handle = StringIO(treetext)
    return cls(handle)
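# Hedged usage sketch, assuming this classmethod lives on a parser class
# such as Bio.Phylo.NewickIO.Parser (the tree string is an example):
from Bio.Phylo import NewickIO

parser = NewickIO.Parser.from_string("(A:0.1,(B:0.2,C:0.3):0.5);")
tree = next(parser.parse())
print(tree.count_terminals())  # 3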
def test_newick_read_scinot(self):
    """Parse Newick branch lengths in scientific notation."""
    tree = Phylo.read(StringIO("(foo:1e-1,bar:0.1)"), 'newick')
    clade_a = tree.clade[0]
    self.assertEqual(clade_a.name, 'foo')
    self.assertAlmostEqual(clade_a.branch_length, 0.1)
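# Quick interactive check mirroring the assertion above: "1e-1" and "0.1"
# should parse to the same branch length (a sketch, not part of the test):
from Bio import Phylo
from Bio._py3k import StringIO

tree = Phylo.read(StringIO("(foo:1e-1,bar:0.1)"), "newick")
print(tree.clade[0].branch_length == tree.clade[1].branch_length)  # True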
>>><<<

579 residues in 3 query sequences
45119 residues in 180 library sequences
 Scomplib [34.26]
 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008
 Total Scan time: 0.020 Total Display time: 0.010

Function used was FASTA [version 34.26 January 12, 2007]
"""

from Bio._py3k import StringIO

alignments = list(FastaM10Iterator(StringIO(simple_example)))
assert len(alignments) == 4, len(alignments)
assert len(alignments[0]) == 2
for a in alignments:
    print("Alignment %i sequences of length %i"
          % (len(a), a.get_alignment_length()))
    for r in a:
        print("%s %s %i" % (r.seq, r.id, r.annotations["original_length"]))
    # print(a.annotations)
print("Done")

import os

path = "../../Tests/Fasta/"
files = sorted(f for f in os.listdir(path)
               if os.path.splitext(f)[-1] == ".m10")
for filename in files:
def CifAtomIterator(handle):
    """Return SeqRecord objects for each chain in an mmCIF file.

    The sequences are derived from the 3D structure (ATOM records),
    not the SEQRES lines in the PDB file header.

    Unrecognised three letter amino acid codes (e.g. "CSD") from HETATM
    entries are converted to "X" in the sequence.

    In addition to information from the PDB header (which is the same for
    all records), the following chain specific information is placed in
    the annotation:

    record.annotations["residues"] = List of residue ID strings
    record.annotations["chain"] = Chain ID (typically A, B, ...)
    record.annotations["model"] = Model ID (typically zero)

    Where amino acids are missing from the structure, as indicated by
    residue numbering, the sequence is filled in with 'X' characters to
    match the size of the missing region, and None is included as the
    corresponding entry in the list record.annotations["residues"].

    This function uses the Bio.PDB module to do most of the hard work. The
    annotation information could be improved but this extra parsing should
    be done in parse_pdb_header, not this module.

    This gets called internally via Bio.SeqIO for the atom based
    interpretation of the PDB file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.cif", "cif-atom"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    Equivalently,

    >>> with open("PDB/1A8O.cif") as handle:
    ...     for record in CifAtomIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    """
    # TODO - Add record.annotations to the doctest, esp the residues (not working?)

    # Only import parser when needed, to avoid/delay NumPy dependency in SeqIO
    from Bio.PDB.MMCIFParser import MMCIFParser

    # The PdbAtomIterator uses UndoHandle to peek at the first line and get the
    # PDB ID. The equivalent for mmCIF is the _entry.id field. AFAIK, the mmCIF
    # format does not constrain the order of fields, so we need to parse the
    # entire file using MMCIF2Dict. We copy the contents of the handle into a
    # StringIO buffer first, so that both MMCIF2Dict and MMCIFParser can
    # consume the handle.
    buffer = StringIO()
    shutil.copyfileobj(handle, buffer)

    buffer.seek(0)
    mmcif_dict = MMCIF2Dict(buffer)
    if "_entry.id" in mmcif_dict:
        pdb_id = mmcif_dict["_entry.id"]
        if isinstance(pdb_id, list):
            pdb_id = pdb_id[0]
    else:
        warnings.warn("Could not find the '_entry.id' field; can't determine "
                      "PDB ID.", BiopythonParserWarning)
        pdb_id = '????'

    buffer.seek(0)
    struct = MMCIFParser().get_structure(pdb_id, buffer)
    for record in AtomIterator(pdb_id, struct):
        yield record
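# A short sketch of inspecting the gap filling described above: positions
# filled with 'X' have None in the "residues" annotation (file path as in
# the doctest; behaviour assumed from the docstring):
from Bio import SeqIO

record = next(SeqIO.parse("PDB/1A8O.cif", "cif-atom"))
placeholders = [i for i, res in enumerate(record.annotations["residues"])
                if res is None]
print("%s: %i unresolved positions" % (record.id, len(placeholders)))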
def test_phenotype_IO(self):
    """Test basic functionalities of phenotype IO methods."""
    p1 = phenotype.read(SMALL_JSON_PLATE, "pm-json")
    p2 = next(phenotype.parse(SMALL_CSV_PLATES, "pm-csv"))

    handle = StringIO()

    c = phenotype.write([p1, p2], handle, "pm-json")
    self.assertEqual(c, 2)

    handle.flush()
    handle.seek(0)
    # Now ready to read back from the handle...
    try:
        records = list(phenotype.parse(handle, "pm-json"))
    except ValueError as e:
        # This is BAD. We can't read our own output.
        # I want to see the output when called from the test harness,
        # run_tests.py (which can be funny about new lines on Windows)
        handle.seek(0)
        # Note: records is unbound if parse raised, so show the input
        # plates instead of the (unavailable) parsed records:
        raise ValueError("%s\n\n%s\n\n%s"
                         % (str(e), repr(handle.read()), repr([p1, p2])))

    self.assertEqual(p1, records[0])

    handle.close()
    handle = StringIO()
    self.assertRaises(TypeError, phenotype.write, p1, handle, 1)
    self.assertRaises(ValueError, phenotype.write, p1, handle, "PM-JSON")
    self.assertRaises(ValueError, phenotype.write, p1, handle, "pm-csv")
    handle.close()
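# A minimal round-trip sketch with the public phenotype API (the JSON
# plate file name is an assumption, standing in for SMALL_JSON_PLATE above):
from Bio import phenotype
from Bio._py3k import StringIO

plate = phenotype.read("phenotype/Plate.json", "pm-json")
handle = StringIO()
assert phenotype.write([plate], handle, "pm-json") == 1
handle.seek(0)
assert phenotype.read(handle, "pm-json") == plate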
        # just the generic Alphabet (default for fasta files)
        raise ValueError("Need a DNA, RNA or Protein alphabet")


if __name__ == "__main__":
    from Bio._py3k import StringIO

    print("Quick self test")
    print("")
    print("Repeated names without a TAXA block")
    handle = StringIO("""#NEXUS
[TITLE: NoName]

begin data;
dimensions ntax=4 nchar=50;
format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG";

matrix
CYS1_DICDI   -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ----
ALEU_HORVU   MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG
CATH_HUMAN   ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK----
CYS1_DICDI   -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X
;
end;
""")
    for a in NexusIterator(handle):
        print(a)
        for r in a:
            print("%r %s %s" % (r.seq, r.name, r.id))
    print("Done")
    print("")
    print("Repeated names with a TAXA block")
    count = 0
    for record in iterator:
        count += 1
        print_record(record)
    assert count == 1
    print(str(record.__class__))

    if os.path.isfile(faa_filename):
        print("--------")
        print("FastaIterator (multiple sequences)")
        iterator = FastaIterator(open(faa_filename, "r"),
                                 alphabet=generic_protein,
                                 title2ids=genbank_name_function)
        count = 0
        for record in iterator:
            count += 1
            print_record(record)
            break
        assert count > 0
        print(str(record.__class__))

    from Bio._py3k import StringIO

    print("--------")
    print("FastaIterator (empty input file)")
    # Just to make sure no errors happen
    iterator = FastaIterator(StringIO(""))
    count = 0
    for record in iterator:
        count += 1
    assert count == 0

    print("Done")
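# The same empty-input guarantee via the top-level API (a sketch):
from Bio import SeqIO
from Bio._py3k import StringIO

assert list(SeqIO.parse(StringIO(""), "fasta")) == []
print("Empty FASTA handle yields no records")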