# Tutorial-style walk-through: translation, codon tables, MutableSeq and
# UnknownSeq. `coding_dna` and `my_seq` are not defined in this fragment --
# presumably they are Seq objects created earlier in the script (TODO confirm).

# Translate with to_stop=True (translation ends at the first stop codon).
print(coding_dna.translate(to_stop=True))

from Bio.Data import CodonTable
# Fetch two codon tables by name and inspect the mitochondrial one.
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]
print(mito_table)
print(mito_table.stop_codons)
print(mito_table.start_codons)
# forward_table is indexed by codon string.
print(mito_table.forward_table["ACG"])

# NOTE(review): if my_seq is an immutable Seq this item assignment is expected
# to raise TypeError (it looks like a deliberate demonstration) -- confirm.
my_seq[1] = "N"
# Convert to an editable sequence ...
mutable_seq = my_seq.tomutable()
# or
from Bio.Seq import MutableSeq
# ... or build one directly from a string.
mutable_seq = MutableSeq('GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA')
# In-place edits: overwrite index 5, delete index 4, then remove an 'A'
# (presumably the first occurrence, as with list.remove -- verify).
mutable_seq[5] = "A"
print(mutable_seq)
del mutable_seq[4]
mutable_seq.remove('A')
print(mutable_seq)
# Convert back to a Seq once editing is done.
new_seq = mutable_seq.toseq()
print(new_seq)

from Bio.Seq import UnknownSeq
# A sequence of known length (10) but unknown content; the second form sets
# the placeholder character explicitly.
unk = UnknownSeq(10)
print(unk)
unk = UnknownSeq(10, character="A")
print(unk)
unk_protein = unk.translate()
def clean_seqs(fasta_in, fasta_out=None, filter_include_expression=None,
               filter_exclude_expression=None, bp_ranges=None,
               start_date=None, end_date=None, ungap=None):
    """Write a filtered, upper-cased copy of a FASTA file.

    Args:
        fasta_in: object with a ``name`` attribute holding the path of the
            input FASTA file (only ``fasta_in.name`` is read).
        fasta_out: output path. When falsy, a path next to the input is
            built from the input basename, ``"_cleaned"`` and a summary of
            the active options.
        filter_include_expression: regex; when given, only records whose id
            or description matches are kept.
        filter_exclude_expression: regex; when given, records whose id or
            description matches are dropped.
        bp_ranges: iterable of ``(start, end)`` pairs, 1-based with an
            inclusive end; the slices are concatenated in order. When empty,
            the whole sequence is kept.
        start_date: ``datetime.datetime``; records carrying an ISO date
            (``YYYY-MM-DD``) in their description or id earlier than this
            are dropped.
        end_date: ``datetime.datetime``; records carrying an ISO date later
            than this are dropped.
        ungap: gap character passed to ``Seq.ungap``; ``None`` disables
            ungapping.

    Raises:
        IOError: if the output path already exists.
    """
    iso_date_re = re.compile(r'(\d{4}-\d{2}-\d{2})')
    bp_ranges = bp_ranges or []

    # Describe the active options in the default output file name.
    name_parts = [
        "_".join(str(t[0]) + "-" + str(t[1]) + "bp" for t in bp_ranges),
        "starting_" + start_date.strftime("%Y-%m-%d") if start_date else "",
        "ending_" + end_date.strftime("%Y-%m-%d") if end_date else "",
        "only_subset_by_filter" if filter_include_expression else "",
        "excluding_some_by_filter" if filter_exclude_expression else "",
    ]
    output_summary_string = "_".join(s for s in name_parts if s)
    if output_summary_string:
        output_summary_string = "_" + output_summary_string

    in_fasta_basename = os.path.splitext(os.path.basename(fasta_in.name))[0]
    out_basedir = os.path.realpath(os.path.dirname(fasta_in.name))
    out_filepath = fasta_out or os.path.join(
        out_basedir,
        in_fasta_basename + "_cleaned" + output_summary_string + ".fasta")
    if os.path.exists(out_filepath):
        raise IOError("%s already exists; skipping..." % out_filepath)

    # Always bind both names so later checks never hit an undefined variable.
    filter_include_re = (re.compile(filter_include_expression)
                         if filter_include_expression else None)
    filter_exclude_re = (re.compile(filter_exclude_expression)
                         if filter_exclude_expression else None)

    with open(out_filepath, "w") as handle:
        # A separate name keeps the ``fasta_out`` argument intact.
        writer = FastaIO.FastaWriter(handle, wrap=80)
        writer.write_header()
        for record in SeqIO.parse(fasta_in.name, "fasta"):
            searchable = (record.id, record.description)

            should_output = True
            if filter_include_re is not None:
                should_output = any(
                    filter_include_re.search(field) for field in searchable)
            if filter_exclude_re is not None and any(
                    filter_exclude_re.search(field) for field in searchable):
                should_output = False

            if start_date or end_date:
                # A record is rejected when any ISO date found in its
                # description or id falls outside the requested window.
                for field in searchable:
                    match = iso_date_re.search(field)
                    if not match:
                        continue
                    seq_date = datetime.datetime.strptime(
                        match.group(0), "%Y-%m-%d")
                    if start_date and seq_date < start_date:
                        should_output = False
                    if end_date and seq_date > end_date:
                        should_output = False

            if not should_output:
                continue

            if not bp_ranges:
                record.seq = MutableSeq(
                    str(record.seq).upper(), DNAAlphabet())
            else:
                output_seq = MutableSeq("", DNAAlphabet())
                for start, end in bp_ranges:
                    # Ranges are 1-based; Biopython slices are 0-based with
                    # an exclusive upper bound, so only ``start`` shifts.
                    # Both ends are clamped to the sequence limits.
                    start = max(start - 1, 0)
                    end = min(end, len(record))
                    output_seq += record.seq[start:end]
                record.seq = Seq(str(output_seq).upper(), DNAAlphabet())

            if ungap is not None:
                record.seq = Seq(
                    str(record.seq).upper(), DNAAlphabet()).ungap(ungap)

            # Set the id to the description, which is the ID in the case of
            # GISAID, and remove the description.
            record.id = record.description.replace(" ", CHARACTER_TO_USE)
            record.description = ""
            writer.write_record(record)
def genome_generator():
    """Build and return the fixed "1234" genome used by the tests."""
    test_genome = MutableSeq("1234", TestAlphabet())
    return test_genome
def test_reverse_complement_mutable_seq(self):
    """Reverse-complementing a record that wraps a MutableSeq gives CAGT."""
    record = SeqRecord(MutableSeq("ACTG"))
    flipped = record.reverse_complement()
    self.assertEqual("CAGT", str(flipped.seq))
def setUp(self):
    """Create the Organism fixture from a "1111" genome and test_fitness."""
    self.organism = Organism(
        MutableSeq("1111", TestAlphabet()),
        test_fitness,
    )
# Fragment of a per-strain coverage-masking script. covInfo, lowerCov,
# maskingInfo, strainName and fastaToChangeName are defined outside this
# fragment; nesting below is reconstructed from a whitespace-collapsed
# source -- NOTE(review): confirm the enclosing loop/indent level.

# Coverage limits for this strain: fields 1 and 2 of covInfo.
lowerCheck = int(covInfo[1])
upperCov = int(float(covInfo[2]))
# Only ever lower the existing lower limit, never raise it.
if lowerCheck < lowerCov:
    #print('Lower < 10')
    lowerCov = lowerCheck
# Append this strain's limits to the running report text.
maskingInfo = maskingInfo + strainName + ":\tlowLimit=" + str(
    lowerCov) + "\tupperLimit=" + str(upperCov) + "\n"
strainDict = {}
genomeDict = {}
#fastaToChangeName = strainName+".fasta"
# NOTE(review): this handle is not closed anywhere in the visible fragment.
fastaToChange = open(fastaToChangeName, 'r')
# Load every FASTA record twice: the original seq keyed by record id, and an
# editable MutableSeq copy keyed by the id as a string.
for seq_record in SeqIO.parse(fastaToChange, "fasta"):
    strainDict[seq_record.id] = seq_record.seq
    idStr = str(seq_record.id)
    seqStr = str(seq_record.seq)
    genomeDict[idStr] = MutableSeq(seqStr, IUPAC.IUPACAmbiguousDNA())
# Count the "N" characters already present across all loaded sequences.
Ncount = 0
for key in genomeDict:
    Ncount = Ncount + genomeDict[key].count("N")
# Counters initialised to zero; they are not updated within this fragment.
lenToMaskUpper = 0
lenToMaskLower = 0
existingN = 0
mito = 0
# Read the strain's bedgraph file fully into memory.
strainBed = strainName + ".bedgraph"
# NOTE(review): this handle is not closed anywhere in the visible fragment.
bed = open(strainBed, 'r')
lines = bed.readlines()
for line in lines:
    currentLine = line.strip('\n')
    # Whitespace-separated fields; field 0 is the chromosome name.
    info = currentLine.split()
    chrom = info[0]
    # Field 1 minus one -- presumably converting a start coordinate to a
    # 0-based index (verify against the bedgraph producer).
    chromstart = int(info[1]) - 1
"AATCGTGGCTATTACTGGGATGGAGGTCACTGGCGCGACCACGGCTGGTGGAAACAACAT" +
"TATGAATGGCGAGGCAATCGCTGGCACCTACACGGACCGCCGCCACCGCCGCGCCACCAT" +
"AAGAAAGCTCCTCATGATCATCACGGCGGTCATGGTCCAGGCAAACATCACCGCTAA",
generic_dna)
# The lines above close a multi-line sequence literal whose opening (and the
# assignment target, presumably `gene`) lies before this fragment.

# Translate with the "Bacterial" table, first as-is and then with cds=True.
print(gene.translate(table="Bacterial"))
print(gene.translate(table="Bacterial", cds=True))

## View the codon tables
from Bio.Data import CodonTable
# Tables can be fetched by name or by numeric id.
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
mito_table = CodonTable.unambiguous_dna_by_id[2]
print(standard_table)
print(mito_table.start_codons)
print(mito_table.stop_codons)
print(mito_table.forward_table["ACG"])

## Mutable sequence objects
from Bio.Seq import MutableSeq
# NOTE(review): IUPAC is not imported within this fragment -- presumably it
# is imported earlier in the script; verify.
mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)
print(mutable_seq)
# In-place edits: overwrite index 5, remove a "T" (presumably the first
# occurrence, as with list.remove), then reverse the sequence.
mutable_seq[5] = "C"
print(mutable_seq)
mutable_seq.remove("T")
print(mutable_seq)
mutable_seq.reverse()
print(mutable_seq)
# Convert back to a Seq once editing is done.
new_seq = mutable_seq.toseq()
print(new_seq)
def test_tomutable(self):
    """Every example converts to an equal MutableSeq instance."""
    for original in self._examples:
        converted = MutableSeq(original)
        self.assertIsInstance(converted, MutableSeq)
        self.assertEqual(converted, original)
def create_clusters_from_bowtie(self):
    """Populate ``otu_info`` and ``cluster_by_otu`` from the OTU and BowTie files.

    Each line of ``self.otu_txt`` is an OTU id followed by its
    whitespace-separated member ids. Every BowTie record then becomes a
    single-member cluster filed under its OTU.

    The 'offset' field is actually 'abundance'
    The 'ref' field is actually 'cycle' offset
    """
    with open(self.otu_txt) as otu_file:
        for raw_line in otu_file:
            otu_name, members = raw_line.strip().split(None, 1)
            for member_id in members.split():
                self.otu_info[member_id] = otu_name
            self.cluster_by_otu[otu_name] = {}

    for read in BowTieReader(self.input_bowtie, False):
        read_id = read['ID']
        otu_name = self.otu_info[read['ID']]

        read_seq = read['seq']
        seq_len = len(read_seq)
        editable_seq = MutableSeq(read_seq)
        abundance = int(read['offset'])  # 'offset' holds the abundance
        phred = [ord(ch) - 33 for ch in read['qual']]
        first_cycle = int(read['ref'])  # 'ref' holds the cycle offset

        cluster = {
            'dirty': True,
            'cids': [read_id],
            'len': seq_len,
            'seq': editable_seq,
            'size': abundance,
            'qual': phred,
            'cycle': range(first_cycle, first_cycle + seq_len),
        }
        self.cluster_by_otu[otu_name][read_id] = cluster
def parse_vcf(varfile):
    """Yield ``(pos, alts)`` for every data row of a tab-separated VCF file.

    Args:
        varfile: path of the VCF file.

    Yields:
        ``(pos, alts)`` where ``pos`` is column 2 (POS) converted to a
        0-based index and ``alts`` is column 5 (ALT) split on commas.
    """
    # The with-block closes the file once the generator is exhausted or
    # discarded; previously the handle was never closed.
    with open(varfile) as handle:
        for line in csv.reader(handle, "excel-tab"):
            # Blank rows come back as [] and an empty first field has no
            # first character; skip those together with '#' header lines.
            if not line or not line[0] or line[0][0] == "#":
                continue
            pos = int(line[1]) - 1
            var = line[4].split(',')
            yield pos, var


# Usage: <script> reference.fasta variants.vcf
# For each FASTA record, substitute single-base ALT alleles into the
# sequence, write the result to stdout and report counts on stderr.
for seq_record in SeqIO.parse(sys.argv[1], 'fasta'):
    # sys.stderr.write is used so the script runs on Python 2 and 3.
    sys.stderr.write("Seq ID = %s, Length = %d\n"
                     % (seq_record.id, len(seq_record)))
    seq = MutableSeq(str(seq_record.seq))
    n = 0
    # NOTE(review): the CHROM column is ignored, so every variant in the
    # VCF is applied to every FASTA record -- confirm inputs are
    # single-sequence.
    for pos, var in parse_vcf(sys.argv[2]):
        alt = var[0]
        # Keep only single-allele, single-base substitutions. "." (no
        # alternate allele) and "*" (spanning deletion) are placeholders,
        # not bases, and must never be written into the sequence.
        if len(var) > 1 or len(alt) != 1 or alt in ".*":
            continue
        seq[pos] = alt
        n += 1
    SeqIO.write(SeqRecord(Seq(str(seq)), id=seq_record.id), sys.stdout, 'fasta')
    sys.stderr.write("Total variants = %d\n" % n)
class StringMethodTests(unittest.TestCase): _examples = [ # These are length 9, a multiple of 3 for translation tests: Seq("ACGTGGGGT"), Seq("ACGUGGGGU"), Seq("GG"), Seq("A"), UnknownSeq(1), UnknownSeq(1, character="n"), UnknownSeq(1, character="N"), UnknownSeq(12, character="N"), UnknownSeq(12, character="X"), UnknownSeq(12), ] for seq in _examples[:]: if not isinstance(seq, UnknownSeq): _examples.append(MutableSeq(seq)) _start_end_values = [0, 1, 2, 1000, -1, -2, -999, None] def _test_method(self, method_name, start_end=False): """Check this method matches the plain string's method.""" self.assertIsInstance(method_name, str) for example1 in self._examples: if not hasattr(example1, method_name): # e.g. MutableSeq does not support transcribe continue str1 = str(example1) for example2 in self._examples: if not hasattr(example2, method_name): # e.g. MutableSeq does not support transcribe continue str2 = str(example2) try: i = getattr(example1, method_name)(str2) except ValueError: i = ValueError try: j = getattr(str1, method_name)(str2) except ValueError: j = ValueError self.assertEqual(i, j, "%r.%s(%r)" % (example1, method_name, str2)) try: i = getattr(example1, method_name)(example2) except ValueError: i = ValueError try: j = getattr(str1, method_name)(str2) except ValueError: j = ValueError self.assertEqual( i, j, "%r.%s(%r)" % (example1, method_name, example2)) if start_end: for start in self._start_end_values: try: i = getattr(example1, method_name)(str2, start) except ValueError: i = ValueError try: j = getattr(str1, method_name)(str2, start) except ValueError: j = ValueError self.assertEqual( i, j, "%r.%s(%r, %s)" % (example1, method_name, str2, start)) for end in self._start_end_values: try: i = getattr(example1, method_name)(str2, start, end) except ValueError: i = ValueError try: j = getattr(str1, method_name)(str2, start, end) except ValueError: j = ValueError self.assertEqual( i, j, "%r.%s(%r, %s, %s)" % (example1, method_name, str2, start, end), ) def 
test_str_count(self): """Check matches the python string count method.""" self._test_method("count", start_end=True) self.assertEqual(Seq("AC777GT").count("7"), 3) self.assertRaises(TypeError, Seq("AC777GT").count, 7) self.assertRaises(TypeError, Seq("AC777GT").count, None) def test_count_overlap(self): """Check count_overlap exception matches python string count method.""" self.assertEqual(Seq("AC777GT").count("77"), 1) self.assertEqual(Seq("AC777GT").count_overlap("77"), 2) self.assertEqual(Seq("AC777GT").count_overlap("7"), 3) self.assertRaises(TypeError, Seq("AC777GT").count_overlap, 7) self.assertRaises(TypeError, Seq("AC777GT").count_overlap, None) def test_str_count_overlap_GG(self): """Check our count_overlap method using GG.""" # Testing with self._examples expected = [ 3, 3, 1, 0, # Seq() Tests 0, 0, 0, 0, 0, 0, # UnknownSeq() Tests 3, 3, 1, 0, # MutableSeq() Tests ] assert len(self._examples) == len(expected) for seq, exp in zip(self._examples, expected): # Using search term GG as a string self.assertEqual(seq.count_overlap("GG"), exp) self.assertEqual(seq.count_overlap("G" * 5), 0) # Using search term GG as a Seq self.assertEqual(seq.count_overlap(Seq("GG")), exp) self.assertEqual(seq.count_overlap(Seq("G" * 5)), 0) def test_count_overlap_start_end_GG(self): """Check our count_overlap method using GG with variable ends and starts.""" # Testing Seq() and MutableSeq() with variable start and end arguments start_end_exp = [ (1, 7, 3), (3, None, 3), (3, 6, 2), (4, 6, 1), (4, -1, 2), (-5, None, 2), (-5, 7, 2), (7, -5, 0), (-100, None, 3), (None, 100, 3), (-100, 1000, 3), ] testing_seq = "GTAGGGGAG" for start, end, exp in start_end_exp: self.assertEqual( Seq(testing_seq).count_overlap("GG", start, end), exp) self.assertEqual( MutableSeq(testing_seq).count_overlap("GG", start, end), exp) # Testing Seq() and MutableSeq() with a more heterogeneous sequenece self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG"), 5) 
self.assertEqual(MutableSeq("GGGTGGTAGGG").count_overlap("GG"), 5) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", 2, 8), 1) self.assertEqual( MutableSeq("GGGTGGTAGGG").count_overlap("GG", 2, 8), 1) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", -11, 6), 3) self.assertEqual( MutableSeq("GGGTGGTAGGG").count_overlap("GG", -11, 6), 3) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", 7, 2), 0) self.assertEqual( MutableSeq("GGGTGGTAGGG").count_overlap("GG", 7, 2), 0) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", -2, -10), 0) # Testing UnknownSeq() with variable start and end arguments char_start_end_exp = [ ("N", 1, 7, 0), ("N", 1, 7, 0), ("N", -4, None, 0), ("N", -4, None, 0), ("X", 1, 7, 0), ] for char, start, end, exp in char_start_end_exp: self.assertEqual( UnknownSeq(12, character=char).count_overlap("GG", start, end), exp) self.assertEqual( UnknownSeq(12, character="X").count_overlap("GG", 1, 7), 0) # Testing UnknownSeq() with some more cases including unusual edge cases substr_start_end_exp = [ ("G", 100, 105, 0), ("G", -1, 4, 0), ("G", 4, -1, 0), ("G", -8, -2, 0), ("G", -2, -8, 0), ("G", 8, 2, 0), ("G", 2, 8, 0), ("GG", 8, 2, 0), ("GG", 2, 8, 0), ("GG", -5, -1, 0), ("GG", 1, 5, 0), ("GGG", None, None, 0), ("GGGGGGGGG", None, None, 0), ("GGG", 1, 2, 0), ] for substr, start, end, exp in substr_start_end_exp: self.assertEqual( UnknownSeq(7, character="N").count_overlap(substr, start, end), exp) self.assertEqual( UnknownSeq(7, character="N").count_overlap("GG", 1), 0) def test_str_count_overlap_NN(self): """Check our count_overlap method using NN.""" # Testing with self._examples expected = [ 0, 0, 0, 0, # Seq() Tests 0, 0, 0, 11, 0, 0, # UnknownSeq() Tests 0, 0, 0, 0, # MutableSeq() Tests ] assert len(self._examples) == len(expected) for seq, exp in zip(self._examples, expected): # Using search term NN as a string self.assertEqual(seq.count_overlap("NN"), exp) self.assertEqual(seq.count_overlap("N" * 13), 0) # Using search term NN 
as a Seq self.assertEqual(seq.count_overlap(Seq("NN")), exp) self.assertEqual(seq.count_overlap(Seq("N" * 13)), 0) def test_count_overlap_start_end_NN(self): """Check our count_overlap method using NN with variable ends and starts.""" # Testing Seq() and MutableSeq() with variable start and end arguments start_end_exp = [ (1, 7, 0), (3, None, 0), (3, 6, 0), (4, 6, 0), (4, -1, 0), (-5, None, 0), (-5, 7, 0), (7, -5, 0), (-100, None, 0), (None, 100, 0), (-100, 1000, 0), ] testing_seq = "GTAGGGGAG" for start, end, exp in start_end_exp: self.assertEqual( Seq(testing_seq).count_overlap("NN", start, end), exp) self.assertEqual( MutableSeq(testing_seq).count_overlap("NN", start, end), exp) # Testing Seq() and MutableSeq() with a more heterogeneous sequenece self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN"), 0) self.assertEqual(MutableSeq("GGGTGGTAGGG").count_overlap("NN"), 0) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", 2, 8), 0) self.assertEqual( MutableSeq("GGGTGGTAGGG").count_overlap("NN", 2, 8), 0) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", -11, 6), 0) self.assertEqual( MutableSeq("GGGTGGTAGGG").count_overlap("NN", -11, 6), 0) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", 7, 2), 0) self.assertEqual( MutableSeq("GGGTGGTAGGG").count_overlap("NN", 7, 2), 0) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", -10, -2), 0) # Testing UnknownSeq() with variable start and end arguments char_start_end_exp = [ ("N", 1, 7, 5), ("N", 1, 7, 5), ("N", -4, None, 3), ("N", -4, None, 3), ("X", 1, 7, 0), ] for char, start, end, exp in char_start_end_exp: self.assertEqual( UnknownSeq(12, character=char).count_overlap("NN", start, end), exp) self.assertEqual( UnknownSeq(12, character="X").count_overlap("NN", 1, 7), 0) # Testing UnknownSeq() with some more cases including unusual edge cases substr_start_end_exp = [ ("N", 100, 105, 0), ("N", -1, 4, 0), ("N", 4, -1, 2), ("N", -8, -2, 5), ("N", -2, -8, 0), ("N", 8, 2, 0), ("N", 2, 8, 5), ("NN", 8, 
2, 0), ("NN", 2, 8, 4), ("NN", -5, -1, 3), ("NN", 1, 5, 3), ("NNN", None, None, 5), ("NNNNNNNNN", None, None, 0), ("NNN", 1, 2, 0), ] for substr, start, end, exp in substr_start_end_exp: self.assertEqual( UnknownSeq(7, character="N").count_overlap(substr, start, end), exp) self.assertEqual( UnknownSeq(7, character="N").count_overlap("NN", 1), 5) def test_str_find(self): """Check matches the python string find method.""" self._test_method("find", start_end=True) self.assertEqual(Seq("AC7GT").find("7"), 2) self.assertRaises(TypeError, Seq("AC7GT").find, 7) self.assertRaises(TypeError, Seq("ACGT").find, None) def test_str_rfind(self): """Check matches the python string rfind method.""" self._test_method("rfind", start_end=True) self.assertEqual(Seq("AC7GT").rfind("7"), 2) self.assertRaises(TypeError, Seq("AC7GT").rfind, 7) self.assertRaises(TypeError, Seq("ACGT").rfind, None) def test_str_index(self): """Check matches the python string index method.""" self._test_method("index", start_end=True) self.assertEqual(Seq("AC7GT").index("7"), 2) self.assertRaises(TypeError, Seq("AC7GT").index, 7) self.assertRaises(TypeError, Seq("ACGT").index, None) self.assertEqual(MutableSeq("AC7GT").index("7"), 2) self.assertRaises(TypeError, MutableSeq("AC7GT").index, 7) self.assertRaises(TypeError, MutableSeq("ACGT").index, None) def test_str_rindex(self): """Check matches the python string rindex method.""" self._test_method("rindex", start_end=True) self.assertEqual(Seq("AC7GT").rindex("7"), 2) self.assertRaises(TypeError, Seq("AC7GT").rindex, 7) self.assertRaises(TypeError, Seq("ACGT").rindex, None) self.assertEqual(MutableSeq("AC7GT").rindex("7"), 2) self.assertRaises(TypeError, MutableSeq("AC7GT").rindex, 7) self.assertRaises(TypeError, MutableSeq("ACGT").rindex, None) def test_str_startswith(self): """Check matches the python string startswith method.""" self._test_method("startswith", start_end=True) self.assertTrue("ABCDE".startswith(("ABE", "OBE", "ABC"))) 
self.assertRaises(TypeError, Seq("ACGT").startswith, None) self.assertRaises(TypeError, MutableSeq("ACGT").startswith, None) # Now check with a tuple of sub sequences for example1 in self._examples: subs = tuple(example1[start:start + 2] for start in range(0, len(example1) - 2, 3)) subs_str = tuple(str(s) for s in subs) self.assertEqual( str(example1).startswith(subs_str), example1.startswith(subs)) self.assertEqual( str(example1).startswith(subs_str), example1.startswith(subs_str)) # strings! self.assertEqual( str(example1).startswith(subs_str, 3), example1.startswith(subs, 3)) self.assertEqual( str(example1).startswith(subs_str, 2, 6), example1.startswith(subs, 2, 6), ) def test_str_endswith(self): """Check matches the python string endswith method.""" self._test_method("endswith", start_end=True) self.assertTrue("ABCDE".endswith(("ABE", "OBE", "CDE"))) self.assertRaises(TypeError, Seq("ACGT").endswith, None) # Now check with a tuple of sub sequences for example1 in self._examples: subs = tuple(example1[start:start + 2] for start in range(0, len(example1) - 2, 3)) subs_str = tuple(str(s) for s in subs) self.assertEqual( str(example1).endswith(subs_str), example1.endswith(subs)) self.assertEqual( str(example1).startswith(subs_str), example1.startswith(subs_str)) # strings! 
self.assertEqual( str(example1).endswith(subs_str, 3), example1.endswith(subs, 3)) self.assertEqual( str(example1).endswith(subs_str, 2, 6), example1.endswith(subs, 2, 6)) def test_str_strip(self): """Check matches the python string strip method.""" self._test_method("strip") s = Seq(" ACGT ") m = MutableSeq(" ACGT ") self.assertEqual(s.strip(), "ACGT") self.assertRaises(TypeError, s.strip, 7) self.assertEqual(s, " ACGT ") self.assertEqual(m.strip(), "ACGT") self.assertRaises(TypeError, m.strip, 7) self.assertEqual(m, " ACGT ") self.assertEqual(m.strip(inplace=True), "ACGT") self.assertEqual(m, "ACGT") def test_str_lstrip(self): """Check matches the python string lstrip method.""" self._test_method("lstrip") s = Seq(" ACGT ") m = MutableSeq(" ACGT ") self.assertEqual(s.lstrip(), "ACGT ") self.assertRaises(TypeError, s.lstrip, 7) self.assertEqual(s, " ACGT ") self.assertEqual(m.lstrip(), "ACGT ") self.assertRaises(TypeError, m.lstrip, 7) self.assertEqual(m, " ACGT ") self.assertEqual(m.lstrip(inplace=True), "ACGT ") self.assertEqual(m, "ACGT ") def test_str_rstrip(self): """Check matches the python string rstrip method.""" self._test_method("rstrip") s = Seq(" ACGT ") m = MutableSeq(" ACGT ") self.assertEqual(s.rstrip(), " ACGT") self.assertRaises(TypeError, s.rstrip, 7) self.assertEqual(s, " ACGT ") self.assertEqual(m.rstrip(), " ACGT") self.assertRaises(TypeError, m.rstrip, 7) self.assertEqual(m, " ACGT ") self.assertEqual(m.rstrip(inplace=True), " ACGT") self.assertEqual(m, " ACGT") def test_str_split(self): """Check matches the python string split method.""" self._test_method("split") self.assertEqual(Seq("AC7GT").split("7"), "AC7GT".split("7")) self.assertRaises(TypeError, Seq("AC7GT").split, 7) self.assertEqual(MutableSeq("AC7GT").split("7"), "AC7GT".split("7")) self.assertRaises(TypeError, MutableSeq("AC7GT").split, 7) def test_str_rsplit(self): """Check matches the python string rsplit method.""" self._test_method("rsplit") 
self.assertEqual(Seq("AC7GT").rsplit("7"), "AC7GT".rsplit("7")) self.assertRaises(TypeError, Seq("AC7GT").rsplit, 7) self.assertEqual(MutableSeq("AC7GT").rsplit("7"), "AC7GT".rsplit("7")) self.assertRaises(TypeError, MutableSeq("AC7GT").rsplit, 7) def test_str_length(self): """Check matches the python string __len__ method.""" for example1 in self._examples: str1 = str(example1) self.assertEqual(len(example1), len(str1)) def test_str_upper(self): """Check matches the python string upper method.""" for example1 in self._examples: str1 = str(example1) self.assertEqual(example1.upper(), str1.upper()) def test_str_lower(self): """Check matches the python string lower method.""" for example1 in self._examples: str1 = str(example1) self.assertEqual(example1.lower(), str1.lower()) def test_str_encode(self): """Check matches the python string encode method.""" for example1 in self._examples: str1 = str(example1) self.assertEqual(bytes(example1), str1.encode("ascii")) def test_str_hash(self): for example1 in self._examples: if isinstance(example1, MutableSeq): continue with warnings.catch_warnings(): # Silence change in behaviour warning warnings.simplefilter("ignore", BiopythonWarning) self.assertEqual( hash(str(example1)), hash(example1), "Hash mismatch, %r for %r vs %r for %r" % (hash(str(example1)), id(example1), hash(example1), example1), ) def test_str_comparison(self): for example1 in self._examples: for example2 in self._examples: with warnings.catch_warnings(): self.assertEqual( str(example1) == str(example2), example1 == example2, "Checking %r == %r" % (example1, example2), ) self.assertEqual( str(example1) != str(example2), example1 != example2, "Checking %r != %r" % (example1, example2), ) self.assertEqual( str(example1) < str(example2), example1 < example2, "Checking %r < %r" % (example1, example2), ) self.assertEqual( str(example1) <= str(example2), example1 <= example2, "Checking %r <= %r" % (example1, example2), ) self.assertEqual( str(example1) > 
str(example2), example1 > example2, "Checking %r > %r" % (example1, example2), ) self.assertEqual( str(example1) >= str(example2), example1 >= example2, "Checking %r >= %r" % (example1, example2), ) def test_str_getitem(self): """Check slicing and indexing works like a string.""" for example1 in self._examples: str1 = str(example1) for i in self._start_end_values: if i is not None and abs(i) < len(example1): self.assertEqual(example1[i], str1[i]) self.assertEqual(example1[:i], str1[:i]) self.assertEqual(example1[i:], str1[i:]) for j in self._start_end_values: self.assertEqual(example1[i:j], str1[i:j]) for step in range(-3, 4): if step == 0: with self.assertRaises(ValueError) as cm: example1[i:j:step] self.assertEqual(str(cm.exception), "slice step cannot be zero") else: self.assertEqual(example1[i:j:step], str1[i:j:step]) def test_tomutable(self): """Check creating a MutableSeq object.""" for example1 in self._examples: mut = MutableSeq(example1) self.assertIsInstance(mut, MutableSeq) self.assertEqual(mut, example1) def test_toseq(self): """Check creating a Seq object.""" for example1 in self._examples: seq = Seq(example1) self.assertIsInstance(seq, Seq) self.assertEqual(seq, example1) def test_the_complement(self): """Check obj.complement() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue comp = example1.complement() str1 = str(example1) if "U" in str1 or "u" in str1: mapping = str.maketrans("ACGUacgu", "UGCAugca") else: # Default to DNA, e.g. 
complement("A") -> "T" not "U" mapping = str.maketrans("ACGTacgt", "TGCAtgca") self.assertEqual(str1.translate(mapping), comp) def test_the_reverse_complement(self): """Check obj.reverse_complement() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue comp = example1.reverse_complement() str1 = str(example1) if "U" in str1 or "u" in str1: mapping = str.maketrans("ACGUacgu", "UGCAugca") else: # Defaults to DNA, so reverse_complement("A") --> "T" not "U" mapping = str.maketrans("ACGTacgt", "TGCAtgca") self.assertEqual(str1.translate(mapping)[::-1], comp) def test_the_transcription(self): """Check obj.transcribe() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue tran = example1.transcribe() str1 = str(example1) if len(str1) % 3 != 0: # TODO - Check for or silence the expected warning? continue self.assertEqual(str1.replace("T", "U").replace("t", "u"), tran) def test_the_back_transcription(self): """Check obj.back_transcribe() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue tran = example1.back_transcribe() str1 = str(example1) self.assertEqual(str1.replace("U", "T").replace("u", "t"), tran) def test_the_translate(self): """Check obj.translate() method.""" mapping = "" for example1 in self._examples: if len(example1) % 3 != 0: # TODO - Check for or silence the expected warning? 
continue tran = example1.translate() # Try with positional vs named argument: self.assertEqual(example1.translate(11), example1.translate(table=11)) # TODO - check the actual translation, and all the optional args def test_the_translation_of_stops(self): """Check obj.translate() method with stop codons.""" misc_stops = "TAATAGTGAAGAAGG" nuc = Seq(misc_stops) self.assertEqual("***RR", nuc.translate()) self.assertEqual("***RR", nuc.translate(1)) self.assertEqual("***RR", nuc.translate("SGC0")) self.assertEqual("**W**", nuc.translate(table=2)) self.assertEqual("**WRR", nuc.translate(table="Yeast Mitochondrial")) self.assertEqual("**WSS", nuc.translate(table=5)) self.assertEqual("**WSS", nuc.translate(table=9)) self.assertEqual("**CRR", nuc.translate(table="Euplotid Nuclear")) self.assertEqual("***RR", nuc.translate(table=11)) self.assertEqual("***RR", nuc.translate(table="11")) self.assertEqual("***RR", nuc.translate(table="Bacterial")) self.assertEqual("**GRR", nuc.translate(table=25)) self.assertEqual("", nuc.translate(to_stop=True)) self.assertEqual("O*ORR", nuc.translate(table=special_table)) self.assertEqual("*QWRR", nuc.translate(table=Chilodonella_uncinata_table)) nuc = MutableSeq(misc_stops) self.assertEqual("***RR", nuc.translate()) self.assertEqual("***RR", nuc.translate(1)) self.assertEqual("***RR", nuc.translate("SGC0")) self.assertEqual("**W**", nuc.translate(table=2)) self.assertEqual("**WRR", nuc.translate(table="Yeast Mitochondrial")) self.assertEqual("**WSS", nuc.translate(table=5)) self.assertEqual("**WSS", nuc.translate(table=9)) self.assertEqual("**CRR", nuc.translate(table="Euplotid Nuclear")) self.assertEqual("***RR", nuc.translate(table=11)) self.assertEqual("***RR", nuc.translate(table="11")) self.assertEqual("***RR", nuc.translate(table="Bacterial")) self.assertEqual("**GRR", nuc.translate(table=25)) self.assertEqual("", nuc.translate(to_stop=True)) self.assertEqual("O*ORR", nuc.translate(table=special_table)) self.assertEqual("*QWRR", 
nuc.translate(table=Chilodonella_uncinata_table)) # These test the Bio.Seq.translate() function - move these?: self.assertEqual( "*QWRR", translate(str(nuc), table=Chilodonella_uncinata_table)) self.assertEqual("O*ORR", translate(str(nuc), table=special_table)) self.assertEqual("", translate(str(nuc), to_stop=True)) self.assertEqual("***RR", translate(str(nuc), table="Bacterial")) self.assertEqual("***RR", translate(str(nuc), table="11")) self.assertEqual("***RR", translate(str(nuc), table=11)) self.assertEqual("**W**", translate(str(nuc), table=2)) self.assertEqual(Seq("TAT").translate(), "Y") self.assertEqual(Seq("TAR").translate(), "*") self.assertEqual(Seq("TAN").translate(), "X") self.assertEqual(Seq("NNN").translate(), "X") self.assertEqual(Seq("TAt").translate(), "Y") self.assertEqual(Seq("TaR").translate(), "*") self.assertEqual(Seq("TaN").translate(), "X") self.assertEqual(Seq("nnN").translate(), "X") self.assertEqual(Seq("tat").translate(), "Y") self.assertEqual(Seq("tar").translate(), "*") self.assertEqual(Seq("tan").translate(), "X") self.assertEqual(Seq("nnn").translate(), "X") def test_the_translation_of_invalid_codons(self): """Check obj.translate() method with invalid codons.""" for codon in ["TA?", "N-N", "AC_", "Ac_"]: msg = "Translating %s should fail" % codon nuc = Seq(codon) with self.assertRaises(TranslationError, msg=msg): nuc.translate() nuc = MutableSeq(codon) with self.assertRaises(TranslationError, msg=msg): nuc.translate() def test_the_translation_of_ambig_codons(self): """Check obj.translate() method with ambiguous codons.""" for ambig_values in [ambiguous_dna_values, ambiguous_rna_values]: ambig = set(ambig_values.keys()) ambig.remove("X") for c1 in ambig: for c2 in ambig: for c3 in ambig: values = { str(Seq(a + b + c).translate()) for a in ambig_values[c1] for b in ambig_values[c2] for c in ambig_values[c3] } t = Seq(c1 + c2 + c3).translate() if t == "*": self.assertEqual(values, set("*")) elif t == "X": self.assertGreater( 
len(values), 1, "translate('%s') = '%s' not '%s'" % (c1 + c2 + c3, t, ",".join(values)), ) elif t == "Z": self.assertEqual(values, set("EQ")) elif t == "B": self.assertEqual(values, set("DN")) elif t == "J": self.assertEqual(values, set("LI")) else: self.assertEqual(values, set(t)) # TODO - Use the Bio.Data.IUPACData module for the # ambiguous protein mappings? def test_init_typeerror(self): """Check Seq __init__ gives TypeError exceptions.""" self.assertRaises(TypeError, Seq, ("A", "C", "G", "T")) self.assertRaises(TypeError, Seq, ["A", "C", "G", "T"]) self.assertRaises(TypeError, Seq, 1) self.assertRaises(TypeError, Seq, 1.0) def test_MutableSeq_init_typeerror(self): """Check MutableSeq __init__ gives TypeError exceptions.""" self.assertRaises(TypeError, MutableSeq, ("A", "C", "G", "T")) self.assertRaises(TypeError, MutableSeq, ["A", "C", "G", "T"]) self.assertRaises(TypeError, MutableSeq, 1) self.assertRaises(TypeError, MutableSeq, 1.0) def test_join_Seq_TypeError(self): """Checks that a TypeError is thrown for all non-iterable types.""" # No iterable types which contain non-accepted types either. spacer = Seq("NNNNN") self.assertRaises(TypeError, spacer.join, 5) self.assertRaises(TypeError, spacer.join, ["ATG", "ATG", 5, "ATG"]) def test_join_UnknownSeq_TypeError_iter(self): """Checks that a TypeError is thrown for all non-iterable types.""" # No iterable types which contain non-accepted types either. spacer = UnknownSeq(5, character="-") self.assertRaises(TypeError, spacer.join, 5) self.assertRaises(TypeError, spacer.join, ["ATG", "ATG", 5, "ATG"]) def test_join_MutableSeq_TypeError_iter(self): """Checks that a TypeError is thrown for all non-iterable types.""" # No iterable types which contain non-accepted types either. 
spacer = MutableSeq("MMMMM") self.assertRaises(TypeError, spacer.join, 5) self.assertRaises(TypeError, spacer.join, ["ATG", "ATG", 5, "ATG"]) def test_join_Seq(self): """Checks if Seq join correctly concatenates sequence with the spacer.""" spacer = Seq("NNNNN") self.assertEqual( "N" * 15, spacer.join([Seq("NNNNN"), Seq("NNNNN")]), ) spacer1 = Seq("") spacers = [spacer1, Seq("NNNNN"), Seq("GGG")] example_strings = ["ATG", "ATG", "ATG", "ATG"] example_strings_seqs = ["ATG", "ATG", Seq("ATG"), "ATG"] # strings with empty spacer str_concatenated = spacer1.join(example_strings) self.assertEqual(str_concatenated, "".join(example_strings)) for spacer in spacers: seq_concatenated = spacer.join(example_strings_seqs) self.assertEqual(seq_concatenated, str(spacer).join(example_strings)) # Now try single sequence arguments, should join the letters for target in example_strings + example_strings_seqs: self.assertEqual( str(spacer).join(str(target)), str(spacer.join(target))) def test_join_UnknownSeq(self): """Checks if UnknownSeq join correctly concatenates sequence with the spacer.""" spacer1 = UnknownSeq(5, character="-") spacer2 = UnknownSeq(0, character="-") spacers = [spacer1, spacer2] self.assertEqual( "-" * 15, spacer1.join( [UnknownSeq(5, character="-"), UnknownSeq(5, character="-")]), ) self.assertEqual( "N" * 5 + "-" * 10, spacer1.join([Seq("NNNNN"), UnknownSeq(5, character="-")]), ) example_strings = ["ATG", "ATG", "ATG", "ATG"] example_strings_seqs = ["ATG", "ATG", Seq("ATG"), "ATG"] # strings with empty spacer str_concatenated = spacer2.join(example_strings) self.assertEqual(str_concatenated, "".join(example_strings)) for spacer in spacers: seq_concatenated = spacer.join(example_strings_seqs) self.assertEqual(seq_concatenated, str(spacer).join(example_strings)) # Now try single sequence arguments, should join the letters for target in example_strings + example_strings_seqs: self.assertEqual( str(spacer).join(str(target)), str(spacer.join(target))) def 
test_join_MutableSeq_mixed(self): """Check MutableSeq objects can be joined.""" spacer = MutableSeq("NNNNN") self.assertEqual( "N" * 15, spacer.join([MutableSeq("NNNNN"), MutableSeq("NNNNN")]), ) self.assertRaises( TypeError, spacer.join([Seq("NNNNN"), MutableSeq("NNNNN")]), ) def test_join_Seq_with_file(self): """Checks if Seq join correctly concatenates sequence from a file with the spacer.""" filename = "Fasta/f003" seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")] seqlist_as_strings = [str(_) for _ in seqlist] spacer = Seq("NNNNN") spacer1 = Seq("") # seq objects with spacer seq_concatenated = spacer.join(seqlist) # seq objects with empty spacer seq_concatenated1 = spacer1.join(seqlist) ref_data = ref_data1 = "" ref_data = str(spacer).join(seqlist_as_strings) ref_data1 = str(spacer1).join(seqlist_as_strings) self.assertEqual(seq_concatenated, ref_data) self.assertEqual(seq_concatenated1, ref_data1) with self.assertRaises(TypeError): spacer.join(SeqIO.parse(filename, "fasta")) def test_join_UnknownSeq_with_file(self): """Checks if UnknownSeq join correctly concatenates sequence from a file with the spacer.""" filename = "Fasta/f003" seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")] seqlist_as_strings = [str(_) for _ in seqlist] spacer = UnknownSeq(0, character="-") spacer1 = UnknownSeq(5, character="-") # seq objects with spacer seq_concatenated = spacer.join(seqlist) # seq objects with empty spacer seq_concatenated1 = spacer1.join(seqlist) ref_data = ref_data1 = "" ref_data = str(spacer).join(seqlist_as_strings) ref_data1 = str(spacer1).join(seqlist_as_strings) self.assertEqual(seq_concatenated, ref_data) self.assertEqual(seq_concatenated1, ref_data1) with self.assertRaises(TypeError): spacer.join(SeqIO.parse(filename, "fasta")) def test_join_MutableSeq(self): """Checks if MutableSeq join correctly concatenates sequence with the spacer.""" # Only expect it to take Seq objects and/or strings in an iterable! 
spacer1 = MutableSeq("") spacers = [ spacer1, MutableSeq("NNNNN"), MutableSeq("GGG"), ] example_strings = ["ATG", "ATG", "ATG", "ATG"] example_strings_seqs = ["ATG", "ATG", Seq("ATG"), "ATG"] # strings with empty spacer str_concatenated = spacer1.join(example_strings) self.assertEqual(str_concatenated, "".join(example_strings)) for spacer in spacers: seq_concatenated = spacer.join(example_strings_seqs) self.assertEqual(seq_concatenated, str(spacer).join(example_strings)) def test_join_MutableSeq_with_file(self): """Checks if MutableSeq join correctly concatenates sequence from a file with the spacer.""" filename = "Fasta/f003" seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")] seqlist_as_strings = [str(_) for _ in seqlist] spacer = MutableSeq("NNNNN") spacer1 = MutableSeq("") # seq objects with spacer seq_concatenated = spacer.join(seqlist) # seq objects with empty spacer seq_concatenated1 = spacer1.join(seqlist) ref_data = ref_data1 = "" ref_data = str(spacer).join(seqlist_as_strings) ref_data1 = str(spacer1).join(seqlist_as_strings) self.assertEqual(seq_concatenated, ref_data) self.assertEqual(seq_concatenated1, ref_data1) with self.assertRaises(TypeError): spacer.join(SeqIO.parse(filename, "fasta")) def test_equality(self): """Test equality when mixing types.""" self.assertEqual(Seq("6"), "6") self.assertNotEqual(Seq("6"), 6) self.assertEqual(Seq(""), "") self.assertNotEqual(Seq(""), None) self.assertEqual(Seq("None"), "None") self.assertNotEqual(Seq("None"), None) self.assertEqual(MutableSeq("6"), "6") self.assertNotEqual(MutableSeq("6"), 6) self.assertEqual(MutableSeq(""), "") self.assertNotEqual(MutableSeq(""), None) self.assertEqual(MutableSeq("None"), "None") self.assertNotEqual(MutableSeq("None"), None) self.assertEqual(UnknownSeq(1, character="6"), "6") self.assertNotEqual(UnknownSeq(1, character="6"), 6) self.assertEqual(UnknownSeq(0), "") self.assertNotEqual(UnknownSeq(0), None)
def setUp(self):
    """Create the organism and ambiguity schema shared by the tests."""
    test_alphabet = TestAlphabet()
    self.organism = Organism(
        MutableSeq("11*22*33*", test_alphabet),
        test_fitness,
    )
    # The schema is built from the same alphabet's ambiguity table.
    self.ambig_info = Schema(test_alphabet.alphabet_matches)
# Read the tab-separated recombinant regions: sequence id, start, stop
# (1-based, inclusive) and queue them on the existing seq_list.
with open(args.rc_regions) as infile1:
    RCseqs = csv.reader(infile1, delimiter='\t')
    for row in RCseqs:
        seq = row[0]
        RCstart = row[1]
        RCstop = row[2]
        seq_list.append([seq, RCstart, RCstop])

# Read fasta file:
fasta_seqs = list(SeqIO.parse(args.fasta, "fasta"))

# Mask recombinant regions:
masked_seq = []
for record in fasta_seqs:
    seq = MutableSeq(str(record.seq))
    seq_length = len(seq)
    for region in seq_list:
        # region[0] is the sequence id in the recombinant regions list
        if record.id != region[0]:
            continue
        # 1-based positions are 1 less in 0-based indexing; the stop needs no
        # shift because the upper bound of a Python slice is exclusive.
        # Both ends are clamped to the sequence so that an out-of-range region
        # cannot lengthen the sequence (slice assignment with a longer
        # replacement grows it) or wrap around through a negative index.
        start_mask = max(int(region[1]) - 1, 0)
        end_mask = min(int(region[2]), seq_length)
        if end_mask > start_mask:
            seq[start_mask:end_mask] = args.maskchar * (end_mask - start_mask)
    masked_seq.append(SeqRecord(Seq(str(seq)), record.id, description=""))

for record in masked_seq:
    print("Number of characters masked in sequence " + record.id + ": " +
          str(str(record.seq).count(args.maskchar)))

# Write masked sequences to file:
SeqIO.write(masked_seq, args.out, "fasta")
def MutableSeqFromFile(filename, alphabet):
    """Return the contents of *filename* as a lower-case MutableSeq.

    The whole file is read, surrounding whitespace is stripped and the text
    is lower-cased before being wrapped in a MutableSeq with *alphabet*.
    """
    # Use a context manager so the handle is closed even if reading fails.
    with open(filename) as handle:
        sequence_str = handle.read().strip()
    return MutableSeq(sequence_str.lower(), alphabet)
def count_one_fraction(alignment, refname, debug, start_offset, end_trail):
    """Count the mutations found in every reference/read pair of an alignment.

    Expected/allowed mutations are not considered here: everything is
    counted and filtering is left to the caller.

    The alignment file is parsed as FASTA in groups of two records
    (reference first, read second). For each pair the read is trimmed to the
    reference, the DNA-level and protein-level differences are called, and
    the counts are accumulated.

    Returns a dict keyed by the protein-level result, each value being::

        {'dna': {dna_errors: count},
         'dna_hgvs': {dna_hgvs: count},
         'total': count}

    Any exception raised while calling mutations is re-raised after a
    coloured diff of the offending read has been printed.
    """
    # A regular dictionary: an entry is created the first time a protein
    # mutation is encountered.
    one_lane_counts = {}

    # Keep only the sequences and make them mutable (important for gap shifts).
    for pair in AlignIO.parse(alignment, "fasta",
                              alphabet=IUPAC.ambiguous_dna, seq_count=2):
        # both read and ref are MutableSeq
        ref = pair[0].seq.tomutable()
        read = pair[1].seq.tomutable()
        read = MutableSeq(str(read).replace('N', '.'), read.alphabet)
        readname = pair[1].id

        # trim sequencing read to reference
        ref, read = trim_read(ref, read)

        dna_errors, dna_hgvs, prot_errors = None, None, None
        try:
            # errors = a tuple
            dna_errors = find_DNA_diff(read, ref, debug, start_offset,
                                       end_trail)
            # string according to HGVS format (ish)
            dna_hgvs = find_DNA_hgvs(read, ref, refname, debug, start_offset,
                                     end_trail)
            prot_errors = find_protein_diff(read, ref, debug, start_offset,
                                            end_trail)
        except Exception:
            # Show which read failed before propagating the error.
            if not dna_errors:
                print(dna_errors)
            print_coloured_diff(readname, read, ref, debug)
            raise

        entry = one_lane_counts.get(prot_errors)
        if entry is None:
            entry = one_lane_counts[prot_errors] = {
                'dna': defaultdict(int),
                'dna_hgvs': defaultdict(int),
                'total': 0,
            }
        entry['total'] += 1
        entry['dna'][dna_errors] += 1
        entry['dna_hgvs'][dna_hgvs] += 1

    # count the mutations seen more often than the threshold
    threshold = 10
    n = sum(1 for entry in one_lane_counts.values()
            if entry['total'] > threshold)

    print(
        'Found {0} total protein mutations, of which {1} have more than {2} counts'
        .format(len(one_lane_counts), n, threshold))

    return one_lane_counts
def __init__(self, seqFile, format="fasta"):
    """Load every record of *seqFile* and append it with a mutable sequence.

    Arguments:
     - seqFile -- file name or handle passed to SeqIO.parse.
     - format -- SeqIO format name (default "fasta").
    """
    for seq in SeqIO.parse(seqFile, format):
        # str() is the supported way to get the sequence text; the older
        # Seq.tostring() method is deprecated/removed in Biopython.
        seq.seq = MutableSeq(str(seq.seq))
        self.append(seq)
def test_generated(self):
    """Write and read back odd SeqRecord objects."""
    long_dna = SeqRecord(
        Seq("ACGT" * 500, generic_dna),
        id="Test",
        description="Long " * 500,
        letter_annotations={"phred_quality": [40, 30, 20, 10] * 500},
    )
    mutable = SeqRecord(
        MutableSeq("NGGC" * 1000),
        id="Mut",
        description="very " * 1000 + "long",
        letter_annotations={"phred_quality": [0, 5, 5, 10] * 1000},
    )
    unknown = SeqRecord(
        UnknownSeq(2000, character="N"),
        id="Unk",
        description="l" + ("o" * 1000) + "ng",
        letter_annotations={"phred_quality": [0, 1] * 1000},
    )
    no_description = SeqRecord(
        Seq("ACGT" * 500),
        id="no_descr",
        description="",
        name="",
        letter_annotations={"phred_quality": [40, 50, 60, 62] * 500},
    )
    empty_phred = SeqRecord(
        Seq("", generic_dna),
        id="empty_p",
        description="(could have been trimmed lots)",
        letter_annotations={"phred_quality": []},
    )
    empty_solexa = SeqRecord(
        Seq(""),
        id="empty_s",
        description="(could have been trimmed lots)",
        letter_annotations={"solexa_quality": []},
    )
    long_solexa = SeqRecord(
        Seq("ACNN" * 500),
        id="Test_Sol",
        description="Long " * 500,
        letter_annotations={"solexa_quality": [40, 30, 0, -5] * 500},
    )
    high_quality = SeqRecord(
        Seq("ACGT"),
        id="HighQual",
        description="With very large qualities that even Sanger FASTQ can't hold!",
        letter_annotations={"solexa_quality": [0, 10, 100, 1000]},
    )
    # TODO - Record with no identifier?
    records = [
        long_dna,
        mutable,
        unknown,
        no_description,
        empty_phred,
        empty_solexa,
        long_solexa,
        high_quality,
    ]
    output_formats = ("fasta", "fastq", "fastq-solexa", "fastq-illumina",
                      "qual")
    for fmt in output_formats:
        handle = StringIO()
        with warnings.catch_warnings():
            # TODO - Have a Biopython defined "DataLossWarning?"
            warnings.simplefilter("ignore", BiopythonWarning)
            SeqIO.write(records, handle, fmt)
        # Rewind and check that what was written parses back to the same data.
        handle.seek(0)
        reloaded = list(SeqIO.parse(handle, fmt))
        compare_records(records, reloaded, truncation_expected(fmt))
def __init__(self, seq, alphaproperty=None, insertprob=None, deleteprob=None, mualphabet=None, muprob=None, mupos=None, delpos=None, inpos=None, verbose=False):
    """Create a MuGen object around a reference sequence.

    Arguments:
     - seq -- a string, Seq or MutableSeq holding the reference.
     - alphaproperty -- alphabet used when *seq* is a string.
     - insertprob, deleteprob, muprob -- dicts mapping a letter to a
       probability in [0, 1]; letters that are missing get probability 0.
     - mualphabet -- dict mapping a letter to the letters it may mutate to.
     - mupos, delpos, inpos -- positions (indices into the reference) at
       which a substitution, deletion or insertion is requested.
     - verbose -- bool; when True, warnings are printed.

    Any CustomException raised during validation is printed and the
    process exits with status 2.
    """

    def checked_positions(positions, label):
        # Return *positions* as a list after checking that each one is a
        # valid index of the reference.
        if not positions:
            return []
        if set(positions).issubset(set(range(len(self.ref)))):
            return list(positions)
        raise CustomException(
            "ERROR: %s positions exceed the range of the reference or are "
            "not positive integers!" % label)

    def random_targets(letter):
        # Non-specified mutations could happen to any letter but the letter
        # itself and 'N'.
        return ''.join(self.alphabet - {letter, 'N'})

    def checked_probabilities(probs, noun):
        # Return a letter -> probability dict covering the whole alphabet.
        if not probs:
            # If no probability is given, it is set to zero everywhere.
            return {key: 0 for key in self.alphabet}
        if set(probs.keys()) != self.alphabet:
            if self.verbose:
                print("WARNING: Missing/Invalid letter(s) in %s probability!\n"
                      "Probabilities are set to zero for missing letters! "
                      "Invalid letters are ignored!" % noun)
        checked = {}
        # .items() works on both Python 2 and Python 3 (iteritems does not).
        for key, value in probs.items():
            if value >= 0 and value <= 1:
                checked[key] = value
            else:
                raise CustomException(
                    "ERROR: %s probability must be >=0 and <=1!"
                    % noun.capitalize())
        for key in self.alphabet - set(checked):
            checked[key] = 0
        return checked

    try:
        # History of the changes made to the reference; needed for writing
        # the haplotypes in the format of haplotyping software.
        self.occureddel = list()
        self.occuredmu = list()
        self.occuredins = list()
        # Inserted alleles, kept to be able to get them back when needed.
        self.inserted_allele = list()
        # Substituted (alternative) alleles.
        self.alt_allele = list()

        if not isinstance(verbose, bool):
            raise CustomException(
                "ERROR: verbose must be set to either True or False. "
                "Default is to False")
        self.verbose = verbose

        if isinstance(seq, str):
            if alphaproperty is None:
                if self.verbose:
                    print("WARNING: No alphabet type is specified for the sequence string!")
                self.alphaproperty = Alphabet()
            else:
                self.alphaproperty = alphaproperty
            self.seq = MutableSeq(seq, self.alphaproperty)
        elif isinstance(seq, Seq):
            self.alphaproperty = seq.alphabet
            self.seq = seq.tomutable()
        elif isinstance(seq, MutableSeq):
            self.alphaproperty = seq.alphabet
            self.seq = copy.deepcopy(seq)
        else:
            raise CustomException(
                "ERROR: Should provide a Seq or MutableSeq object, \n  "
                "or a string sequence!")

        self.alphabet = set(str(self.seq))
        self.ref = str(self.seq)

        # Changes requested at explicit positions.
        self.delpos = checked_positions(delpos, "Deletion")
        self.inpos = checked_positions(inpos, "Insertion")
        self.mupos = checked_positions(mupos, "Mutation")

        if not mualphabet:
            if self.verbose:
                print("WARNING: You have specified no mutation alphabet! "
                      "Mutations are set to random letters!")
            self.mualphabet = {key: random_targets(key)
                               for key in self.alphabet}
        else:
            mualphabet = {str(k): str(v) for k, v in mualphabet.items()}
            for key, value in mualphabet.items():
                if len(key) != 1:
                    raise CustomException(
                        "ERROR: the mutation alphabet deals with point "
                        "mutations! Only single letters are allowed as keys!")
                elif key in set(''.join(value)):
                    raise CustomException(
                        "ERROR: Wrong mutation values specified! A letter "
                        "could just be substituted with a different letter "
                        "for mutation!")
            keys = set(mualphabet.keys())
            targets = set(''.join(mualphabet.values()))
            if keys == self.alphabet and targets <= self.alphabet:
                self.mualphabet = dict(mualphabet)
            elif keys < self.alphabet and targets < self.alphabet:
                if self.verbose:
                    print("WARNING: Mutation is not specified for some "
                          "letters! Those mutations are set to random letters!")
                # Whatever has been specified is kept intact.
                self.mualphabet = dict(mualphabet)
                for key in self.alphabet - keys:
                    self.mualphabet[key] = random_targets(key)
            else:
                if self.verbose:
                    print("WARNING: Mutation alphabet is not compatible with "
                          "sequence alphabet! Both alphabets are updated "
                          "and\nunspecified mutations are set to random letters!")
                # As mutation may introduce novel alleles in the sequence,
                # the alphabet is updated first; the specified mutation
                # alphabet itself is kept intact.
                new_mualphabet = dict()
                for key, value in mualphabet.items():
                    self.alphabet.add(key)
                    self.alphabet |= (set(''.join(value)) - self.alphabet)
                    new_mualphabet[key] = value
                for key in self.alphabet - set(new_mualphabet.keys()):
                    new_mualphabet[key] = random_targets(key)
                self.mualphabet = new_mualphabet

        # Validated after the mutation alphabet because that step may have
        # extended self.alphabet.
        self.insertprob = checked_probabilities(insertprob, "insertion")
        self.deleteprob = checked_probabilities(deleteprob, "deletion")
        self.muprob = checked_probabilities(muprob, "mutation")
    except CustomException as instance:
        print(instance)
        sys.exit(2)
    else:
        if self.verbose:
            print("MuGen object successfully created.\nWARNING: MuGen sequence is case sensitive!")
else: print("huh? ERROR") t = Seq.Seq("T", IUPAC.ambiguous_dna) u = s + t print(str(u.alphabet)) from Bio.Seq import MutableSeq import array print print("Testing MutableSeq") print("==================") print("Testing creating MutableSeqs in multiple ways") string_seq = MutableSeq("TCAAAAGGATGCATCATG", IUPAC.ambiguous_dna) array_seq = MutableSeq(array.array(array_indicator, "TCAAAAGGATGCATCATG"), IUPAC.ambiguous_dna) converted_seq = s.tomutable() for test_seq in [string_seq]: print(repr(test_seq)) print(str(test_seq)) print(len(test_seq)) print(repr(test_seq.toseq())) print(test_seq[0]) print(repr(test_seq[1:5])) test_seq[1:3] = "GAT" print "Set slice with string:", repr(test_seq)
def probmu(self):
    """Apply random changes to the reference using the stored probabilities.

    Operates on a MuGen object: rebuilds self.seq (a MutableSeq) from
    self.ref by making random insertions, deletions and substitutions
    according to insertprob, deleteprob and muprob. Sites listed in mupos,
    inpos or delpos are copied unchanged. The occured* / *_allele lists are
    reset and refilled with the changes that were made. Returns None.
    """
    self.occuredmu = list()
    self.occureddel = list()
    self.occuredins = list()
    self.inserted_allele = list()
    self.alt_allele = list()

    # Built once instead of on every iteration.
    fixed_sites = set(self.mupos) | set(self.inpos) | set(self.delpos)

    progeny = []
    for site, base in enumerate(self.ref):
        if site in fixed_sites:
            # No change is made at indel/mutation positions
            progeny.append(base)
            continue
        prob = {'ins': self.insertprob.get(base),
                'del': self.deleteprob.get(base),
                'sub': self.muprob.get(base)}
        # An error occurs randomly: insertion or deletion or substitution
        error = random.choice(['ins', 'del', 'sub', 'sub'])
        # The probability that this error is not corrected by the
        # replication machinery is determined by insertprob, deleteprob
        # and muprob.
        rnd = float(int(random.random() * 100000)) / 100000
        if rnd < prob.get(error):
            if error == 'sub':
                # Substitute the letter with one from the mutation alphabet
                new_base = random.choice(self.mualphabet.get(base))
                progeny.append(new_base)
                self.occuredmu.append(site)
                self.alt_allele.append(new_base)
            elif error == 'ins':
                # Insert a random letter right after the letter
                inserted = random.choice(list(self.alphabet))
                progeny.append(base)
                progeny.append(inserted)
                self.occuredins.append(site)
                self.inserted_allele.append(base + inserted)
            else:
                # Delete the letter in the progeny by just not adding it
                self.occureddel.append(site)
        else:
            # No change is induced at the site in the progeny sequence
            progeny.append(base)
    self.seq = MutableSeq(''.join(progeny), self.alphaproperty)

    # Sort the occured change positions in ascending order. sorted() is used
    # because zip() returns an iterator without .sort() on Python 3.
    ins_pairs = sorted(zip(self.occuredins, self.inserted_allele),
                       key=lambda tup: tup[0])
    self.occuredins = [site for site, _ in ins_pairs]
    self.inserted_allele = [allele for _, allele in ins_pairs]

    alt_pairs = sorted(zip(self.occuredmu, self.alt_allele),
                       key=lambda tup: tup[0])
    self.occuredmu = [site for site, _ in alt_pairs]
    self.alt_allele = [allele for _, allele in alt_pairs]

    self.occureddel.sort()

    if self.verbose:
        print("WARNING: If indel/mutation positions are specified, "
              "MuGen.probmu() makes no change at those sites. \n  "
              "Use MuGen.posmu() or Mugen.hapchanger() to apply changes at those sites!")
        print("Changes made to the haplotype!")
import gffutils import rjvbio.seq import Bio.SeqIO, Bio.SeqRecord from Bio.Seq import MutableSeq from Bio.Seq import Seq if conf.out == 'STDOUT': fout = sys.stdout else: fout = open(conf.out, 'wb') db = gffutils.FeatureDB(conf.gffdb) if conf.featuretypes == 'ALL': conf.featuretypes = None for rec in Bio.SeqIO.parse(conf.inpfasta, 'fasta'): seqid = rec.id.strip() seq = MutableSeq(str(rec.seq)) length = len(seq) for feature in db.all_features(limit=[seqid, 0, length], completely_within=False, featuretype=conf.featuretypes): start = min(feature.start - 1, feature.end) end = max(feature.start - 1, feature.end) flength = end - start seq[start:end] = 'N' * flength assert len(seq) == length newrec = Bio.SeqRecord.SeqRecord(seq, id=seqid, description='') Bio.SeqIO.write(newrec, fout, "fasta") if conf.out != 'STDOUT':
def posmu(self):
    """Apply the position-specified changes to the reference.

    Operates on a MuGen object: rebuilds self.seq (a MutableSeq) from
    self.ref by making an insertion at every site in inpos, a deletion at
    every site in delpos and a substitution (drawn from the mutation
    alphabet) at every site in mupos. When a site appears in more than one
    list, substitution overrides deletion, which overrides insertion. The
    occured* / *_allele lists are reset and refilled. Returns None.
    """
    self.occuredmu = list()
    self.occureddel = list()
    self.occuredins = list()
    self.inserted_allele = list()
    self.alt_allele = list()

    # Type of change per reference site; later assignments win, which gives
    # the priority documented above.
    change = [None] * len(self.ref)
    for site in self.inpos:
        change[site] = 'ins'
    for site in self.delpos:
        change[site] = 'del'
    for site in self.mupos:
        change[site] = 'sub'

    progeny = []
    for site, error in enumerate(change):
        base = self.ref[site]
        if error is None:
            progeny.append(base)
        elif error == 'sub':
            # Substitute the letter with one from the mutation alphabet
            new_base = random.choice(self.mualphabet.get(base))
            progeny.append(new_base)
            self.occuredmu.append(site)
            self.alt_allele.append(new_base)
        elif error == 'ins':
            # Insert a random letter right after the letter
            inserted = random.choice(list(self.alphabet))
            progeny.append(base)
            progeny.append(inserted)
            self.occuredins.append(site)
            self.inserted_allele.append(base + inserted)
        else:
            # Delete the letter in the progeny by just not adding it
            self.occureddel.append(site)
    self.seq = MutableSeq(''.join(progeny), self.alphaproperty)

    # Sort the occured change positions. sorted() is used because zip()
    # returns an iterator without .sort() on Python 3.
    ins_pairs = sorted(zip(self.occuredins, self.inserted_allele),
                       key=lambda tup: tup[0])
    self.occuredins = [site for site, _ in ins_pairs]
    self.inserted_allele = [allele for _, allele in ins_pairs]

    alt_pairs = sorted(zip(self.occuredmu, self.alt_allele),
                       key=lambda tup: tup[0])
    self.occuredmu = [site for site, _ in alt_pairs]
    self.alt_allele = [allele for _, allele in alt_pairs]

    self.occureddel.sort()

    if self.verbose:
        print("WARNING: if there are overlaps between deletion, insertion "
              "and mutation positions, \n  "
              "just one of the changes takes place with the following priority: \n  "
              "1)Mutation 2)Deletion 3)Insertion. \n")
        print("Changes made to the haplotype!")
def setUp(self):
    """Create a one-letter organism and the mutator under test."""
    self.alphabet = TestAlphabet()
    self.org = Organism(MutableSeq("2", self.alphabet), test_fitness)
    self.test_mutator = TestMutator()
def hapchanger(self):
    """Apply both the specified and the random changes to the reference.

    Operates on a MuGen object: rebuilds self.seq (a MutableSeq) from
    self.ref. Sites in mupos, inpos or delpos (checked in that order)
    receive the requested substitution, insertion or deletion; every other
    site may be changed at random according to insertprob, deleteprob and
    muprob. The occured* / *_allele lists are reset and refilled.
    Returns None.
    """
    self.occuredmu = list()
    self.occureddel = list()
    self.occuredins = list()
    self.inserted_allele = list()
    self.alt_allele = list()

    # Built once instead of on every iteration.
    mutation_sites = set(self.mupos)
    insertion_sites = set(self.inpos)
    deletion_sites = set(self.delpos)

    progeny = []
    for site, base in enumerate(self.ref):
        if site in mutation_sites:
            error = 'sub'
        elif site in insertion_sites:
            error = 'ins'
        elif site in deletion_sites:
            error = 'del'
        else:
            # No change is specified at the position: make a random change
            # according to the probability model.
            prob = {'ins': self.insertprob.get(base),
                    'del': self.deleteprob.get(base),
                    'sub': self.muprob.get(base)}
            # An error occurs randomly: insertion, deletion or substitution
            error = random.choice(['ins', 'del', 'sub', 'sub'])
            # The probability that this error is not corrected by the
            # replication machinery is determined by insertprob, deleteprob
            # and muprob.
            rnd = float(int(random.random() * 100000)) / 100000
            if not rnd < prob.get(error):
                progeny.append(base)
                continue

        if error == 'sub':
            # Substitute the letter with one from the mutation alphabet
            new_base = random.choice(self.mualphabet.get(base))
            progeny.append(new_base)
            self.occuredmu.append(site)
            self.alt_allele.append(new_base)
        elif error == 'ins':
            # Make an insertion right after the site
            progeny.append(base)
            inserted = random.choice(list(self.alphabet))
            progeny.append(inserted)
            self.occuredins.append(site)
            self.inserted_allele.append(base + inserted)
        else:
            # Deleted letters are simply not added to the progeny
            self.occureddel.append(site)
    self.seq = MutableSeq(''.join(progeny), self.alphaproperty)

    # Sort the occured change positions. sorted() is used because zip()
    # returns an iterator without .sort() on Python 3.
    ins_pairs = sorted(zip(self.occuredins, self.inserted_allele),
                       key=lambda tup: tup[0])
    self.occuredins = [site for site, _ in ins_pairs]
    self.inserted_allele = [allele for _, allele in ins_pairs]

    alt_pairs = sorted(zip(self.occuredmu, self.alt_allele),
                       key=lambda tup: tup[0])
    self.occuredmu = [site for site, _ in alt_pairs]
    self.alt_allele = [allele for _, allele in alt_pairs]

    self.occureddel.sort()

    if self.verbose:
        print("Changes made to the haplotype!")
def viterbi(self, sequence, state_alphabet):
    """Calculate the most probable state path using the Viterbi algorithm.

    Follows the description of the algorithm on pgs 55-57 of Durbin et al,
    decoding the state path for a given sequence of emissions. Positions
    are numbered 0 .. L-1 here rather than 1 .. L as in the book.

    Arguments:
     - sequence -- A Seq object with the emission sequence that we
       want to decode.
     - state_alphabet -- The alphabet of the possible state sequences
       that can be generated.

    Returns a tuple of the decoded state path (a Seq) and the log
    probability of that path.
    """
    # work in log space for initial, transition and emission probabilities
    log_initial = self._log_transform(self.initial_prob)
    log_trans = self._log_transform(self.transition_prob)
    log_emission = self._log_transform(self.emission_prob)

    viterbi_probs = {}
    pred_state_seq = {}
    state_letters = state_alphabet.letters
    seq_length = len(sequence)

    # --- recursion over sequence positions and candidate states
    for pos in range(seq_length):
        for cur_state in state_letters:
            # e_{l}(x_{i})
            emission_part = log_emission[(cur_state, sequence[pos])]

            if pos == 0:
                # the first position uses the initial probability instead
                # of looking back to previous states
                viterbi_probs[(cur_state, pos)] = (emission_part +
                                                   log_initial[cur_state])
                continue

            # score of arriving from each allowed (pos-1)-th state
            candidates = {}
            for prev_state in self.transitions_to(cur_state):
                # a_{kl}
                trans_part = log_trans[(prev_state, cur_state)]
                # v_{k}(i - 1)
                viterbi_part = viterbi_probs[(prev_state, pos - 1)]
                candidates[prev_state] = viterbi_part + trans_part
            best = max(candidates.values())

            # v_{k}(i)
            viterbi_probs[(cur_state, pos)] = (emission_part + best)

            # remember the first predecessor that achieves the maximum
            for prev_state in candidates:
                if candidates[prev_state] == best:
                    pred_state_seq[(pos - 1, cur_state)] = prev_state
                    break

    # --- termination: v_{k}(L) for every state
    all_probs = {}
    for state in state_letters:
        all_probs[state] = viterbi_probs[(state, seq_length - 1)]
    state_path_prob = max(all_probs.values())

    # the last state with the maximal score is where the traceback starts
    last_state = ''
    for state, prob in all_probs.items():
        if prob == state_path_prob:
            last_state = state
    assert last_state != '', "Didn't find the last state to trace from!"

    # --- traceback: walk backwards in time, each step picking the stored
    # most probable predecessor of the current state
    traceback_seq = MutableSeq('', state_alphabet)
    state = last_state
    traceback_seq.append(state)
    for pos in range(seq_length - 1, 0, -1):
        state = pred_state_seq[(pos - 1, state)]
        traceback_seq.append(state)

    # the path was collected end-to-start; put it in the proper orientation
    traceback_seq.reverse()

    return traceback_seq.toseq(), state_path_prob
# Emission scores for states 'O' and 'P'.
markovBuilder.set_emission_score('O', 'C', .33)
markovBuilder.set_emission_score('O', 'G', .33)
markovBuilder.set_emission_score('O', 'S', .33)
markovBuilder.set_emission_score('P', 'A', .67)
markovBuilder.set_emission_score('P', 'T', .33)

# Initialise the Hidden Markov Model
markovModel = markovBuilder.get_markov_model()

# The 3 sequences to be aligned
seq1 = Seq('ATGA', arrayDNA())
seq2 = Seq('A CCA', arrayDNA())
seq3 = Seq('ACAST', arrayDNA())

# State path for each sequence
seq1State = MutableSeq('MNOP', arrayState())
seq2State = MutableSeq('MDIOP', arrayState())
seq3State = MutableSeq('MNIOP', arrayState())

seq = [seq1, seq2, seq3]
states = [seq1State, seq2State, seq3State]

# Train the Hidden Markov Model with the sequences above
trainer = Trainer.KnownStateTrainer(markovModel)
for i in range(len(seq)):
    trainingseq = Trainer.TrainingSequence(seq[i], states[i])
    # NOTE(review): trainedhmm is rebound on every pass with a single
    # training sequence, so only the last call's result is kept - confirm
    # this is intended rather than training on all sequences at once.
    trainedhmm = trainer.train([trainingseq])

# Another example query
testSeq = Seq('ATSA', arrayDNA())
testState = MutableSeq('MNOP', arrayState())
def setUp(self):
    """Create the alphabet, genome and organism shared by the tests."""
    alphabet = TestAlphabet()
    genome = MutableSeq("1234", alphabet)
    self.alphabet = alphabet
    self.genome = genome
    self.organism = Organism.Organism(genome, fitness_calculator)
str_light_chain_one, str_light_chain_two, "ATGCGTATCGATCGCGATACGATTAGGCGGAT" ] def u_crc32(seq): #NOTE - On Python 2 crc32 could return a signed int, but on Python 3 it is #always unsigned #Docs suggest should use crc32(x) & 0xffffffff for consistency. return crc32(seq) & 0xffffffff for i, seq_str in enumerate(examples): print "Example %i, length %i, %s..." % (i + 1, len(seq_str), seq_str[:10]) #Avoid cross platforms with printing floats by doing conversion explicitly def simple_LCC(s): return "%0.2f" % lcc_simp(s) def windowed_LCC(s): return ", ".join(["%0.2f" % v for v in lcc_mult(s, 20)]) for checksum in [u_crc32, crc64, gcg, seguid, simple_LCC, windowed_LCC]: #First using a string: value = checksum(seq_str) print " %s = %s" % (checksum.__name__, value) #Secondly check it works with a Seq object assert value == checksum(Seq(seq_str, single_letter_alphabet)) #Finally check it works with a MutableSeq object assert value == checksum(MutableSeq(seq_str, single_letter_alphabet))
def test_count_overlap_start_end_GG(self):
    """Check count_overlap for "GG" under a variety of start/end bounds."""
    # Seq() and MutableSeq(): rows are (start, end, expected count).
    testing_seq = "GTAGGGGAG"
    start_end_exp = [
        (1, 7, 3),
        (3, None, 3),
        (3, 6, 2),
        (4, 6, 1),
        (4, -1, 2),
        (-5, None, 2),
        (-5, 7, 2),
        (7, -5, 0),
        (-100, None, 3),
        (None, 100, 3),
        (-100, 1000, 3),
    ]
    for start, end, exp in start_end_exp:
        for seq_class in (Seq, MutableSeq):
            observed = seq_class(testing_seq).count_overlap("GG", start, end)
            self.assertEqual(observed, exp)

    # Seq() and MutableSeq() with a more heterogeneous sequence:
    # rows are (positional bounds passed after "GG", expected count).
    mixed_seq = "GGGTGGTAGGG"
    bounds_exp = [
        ((), 5),
        ((2, 8), 1),
        ((-11, 6), 3),
        ((7, 2), 0),
    ]
    for bounds, exp in bounds_exp:
        for seq_class in (Seq, MutableSeq):
            observed = seq_class(mixed_seq).count_overlap("GG", *bounds)
            self.assertEqual(observed, exp)
    # This reversed negative range is only checked against Seq().
    self.assertEqual(Seq(mixed_seq).count_overlap("GG", -2, -10), 0)

    # UnknownSeq() with variable start and end arguments:
    # rows are (character, start, end, expected count).
    char_start_end_exp = [
        ("N", 1, 7, 0),
        ("N", 1, 7, 0),
        ("N", -4, None, 0),
        ("N", -4, None, 0),
        ("X", 1, 7, 0),
    ]
    for char, start, end, exp in char_start_end_exp:
        unknown = UnknownSeq(12, character=char)
        self.assertEqual(unknown.count_overlap("GG", start, end), exp)
    self.assertEqual(
        UnknownSeq(12, character="X").count_overlap("GG", 1, 7), 0)

    # UnknownSeq() with further cases, including unusual edge cases:
    # rows are (substring, start, end, expected count).
    substr_start_end_exp = [
        ("G", 100, 105, 0),
        ("G", -1, 4, 0),
        ("G", 4, -1, 0),
        ("G", -8, -2, 0),
        ("G", -2, -8, 0),
        ("G", 8, 2, 0),
        ("G", 2, 8, 0),
        ("GG", 8, 2, 0),
        ("GG", 2, 8, 0),
        ("GG", -5, -1, 0),
        ("GG", 1, 5, 0),
        ("GGG", None, None, 0),
        ("GGGGGGGGG", None, None, 0),
        ("GGG", 1, 2, 0),
    ]
    for substr, start, end, exp in substr_start_end_exp:
        unknown = UnknownSeq(7, character="N")
        self.assertEqual(unknown.count_overlap(substr, start, end), exp)
    self.assertEqual(
        UnknownSeq(7, character="N").count_overlap("GG", 1), 0)
def search_mutated_feature(vcf_record, gbk_dico):
    '''
    Locate the feature hit by a variant and classify the mutation.

    - Search if mutation is located within a coding sequence
    - determine if mutation is synonymous or not using a MutableSeq record
      (copy of the original record with mutation)

    :param vcf_record: variant record; its ``CHROM``, ``POS`` (1-based) and
        ``ALT`` attributes are read.
    :param gbk_dico: mapping from ``CHROM`` name to the annotated record whose
        ``features`` and ``seq`` are inspected.
    :return: dict with the keys ``mut_location`` (feature type, or
        "Intergenic"), ``mut_type`` ('-', 'INDEL', 'F', 'S', or the value
        returned by ``extract_mutation``), ``orf_name`` and ``gene``.
    '''
    from copy import copy
    from Bio.Seq import MutableSeq
    from Bio.Alphabet import generic_dna

    # Shallow copy: only the ``seq`` attribute is rebound below, so the record
    # stored in gbk_dico keeps its own, unmodified sequence.
    record_alt = copy(gbk_dico[vcf_record.CHROM])
    record_alt.seq = MutableSeq(str(record_alt.seq), generic_dna)

    # POS is 1-based, whereas both feature membership tests and sequence
    # indexing are 0-based; use the same 0-based value for both so a variant
    # on the last base of a feature is found and one on the base just before
    # a feature is not wrongly attributed to it.
    position = int(vcf_record.POS) - 1

    results = {
        "mut_location": "Intergenic",
        "mut_type": '-',
        "orf_name": '-',
        "gene": '-'
    }

    for feature in record_alt.features:
        if feature.type == "source" or position not in feature:
            continue

        results["mut_location"] = feature.type
        if feature.type == 'mobile_element':
            results["orf_name"] = feature.qualifiers["mobile_element_type"][0]
        elif feature.type == 'CDS':
            results["orf_name"] = feature.qualifiers["locus_tag"][0]
        else:
            results["orf_name"] = (
                "Unknown locus for feature: %s" % feature.type)
        results["gene"] = feature.qualifiers.get("gene", ['-'])[0]

        if feature.type == 'CDS':
            alt_allele = vcf_record.ALT[0]
            # NOTE(review): only a multi-base ALT is flagged as an indel; a
            # deletion encoded as a multi-base REF with a single-base ALT is
            # not detected here - confirm whether that case can occur.
            if len(alt_allele) > 1:
                results["mut_type"] = 'INDEL'
                # Keep scanning the remaining features instead of returning.
                continue

            aa_seq_ref = str(feature.extract(record_alt.seq).translate())
            if alt_allele == '*':
                # frameshift
                results["mut_type"] = 'F'
            else:
                # mutate reference sequence, then re-translate the feature
                record_alt.seq[position] = str(alt_allele)
                aa_seq_alt = str(feature.extract(record_alt.seq).translate())
                # check if synonymous or not
                if aa_seq_ref == aa_seq_alt:
                    results["mut_type"] = 'S'
                else:
                    results["mut_type"] = extract_mutation(
                        aa_seq_ref, aa_seq_alt)

        # The first matching feature decides the result.
        return results

    # if no match, return empty results
    return results