def ReadingFrameFinder(DNASTRING):
    """Translate every AUG-initiated open reading frame in ``DNASTRING``.

    Trailing newlines are stripped and every ``T`` is replaced by ``U``.
    From each ``AUG`` the sequence is scanned codon by codon up to the first
    in-frame stop codon (``UAA``, ``UAG`` or ``UGA``); the start..stop slice
    is translated and the protein string is appended to ``FinalizedProt``.
    Start codons with no in-frame stop are ignored.

    ``FinalizedProt``, ``Seq`` and ``generic_rna`` are not defined here and
    must exist in the enclosing scope.  Nothing is returned.
    """
    stop_codons = ("UAA", "UAG", "UGA")
    rna = DNASTRING.rstrip("\n").replace("T", "U")
    rna_len = len(rna)

    possible_genes = []
    for start in range(rna_len):
        if rna[start:start + 3] != "AUG":
            continue
        for pos in range(start, rna_len, 3):
            if rna[pos:pos + 3] in stop_codons:
                possible_genes.append(rna[start:pos + 3])
                break

    # Every ORF runs from a start codon to an in-frame stop codon, so its
    # length is always a multiple of three and no trimming is needed.
    for orf in possible_genes:
        FinalizedProt.append(str(Seq(orf, generic_rna).translate()))
def itercodon(seq, frame, offset, table, reverse=False):
    """Yield ``(index, translation)`` pairs for successive codons of ``seq``.

    ``seq`` must expose ``.seq`` and support ``len()`` and slicing
    (presumably a SeqRecord -- confirm against callers).

    Forward (``reverse=False``): codons start at ``frame`` and step by three
    up to ``len(seq) - offset``.  If the last codon does not end at
    ``len(seq)``, the remainder is padded with ``"N" * (3 - offset)`` and
    yielded under the index of the last full codon.

    Reverse: codon end indices run from ``len(seq)`` down to ``offset`` and
    each window is reverse-complemented first.  If ``offset`` is non-zero,
    the leading ``offset`` bases are left-padded with ``N`` and yielded too.

    NOTE(review): both trailing steps reuse ``i`` from the loop, so they
    raise NameError if the loop body never ran.
    """
    # Convert once; the original rebuilt this string for every codon.
    nucs = str(seq.seq)
    if not reverse:
        for i in range(frame, len(seq) - offset, 3):
            subseq = nucs[i:i + 3]
            assert (len(subseq) % 3 == 0), (str(seq))
            aa = Seq.translate(subseq, table)
            yield i, aa
        if i + 3 != len(seq):
            # Slices ``seq`` itself (not the plain string), as the original did.
            subseq = seq[i + 3:] + "N" * (3 - offset)
            assert (len(subseq) % 3 == 0)
            aa = Seq.translate(subseq, table)
            yield i, aa
    else:
        for i in range(len(seq), offset, -3):
            # the reverse complement
            subseq = Seq.reverse_complement(nucs[i - 3:i])
            assert (len(subseq) % 3 == 0)
            aa = Seq.translate(subseq, table)
            yield i, aa
        if offset:
            subseq = Seq.reverse_complement("N" * (3 - offset) + nucs[:offset])
            assert (len(subseq) % 3 == 0)
            aa = Seq.translate(subseq, table)
            yield i, aa
def aa_table(codonfile, outfile):
    """Tabulate amino-acid counts per position and write them to Excel.

    ``codonfile`` is a tab-separated table indexed by ``hxb2`` with ``codon``
    and ``count`` columns.  For every region in the module-level ``ranges``
    a table is built with one row per position: empty codons are counted as
    ``del``, codons longer than three bases as ``ins`` (their first three
    bases are still translated), and every translated codon under its amino
    acid column.  ``coverage`` is the row sum over ``aa_header``.  The
    concatenated tables are written to ``outfile``.
    """
    codon_counts = pd.read_csv(codonfile, sep="\t", index_col="hxb2").fillna("")
    columns = ["region", "position", "coverage"] + aa_header
    per_region = []
    for region in ranges:
        table = pd.DataFrame(columns=columns, index=ranges[region]).fillna(0)
        table["region"] = region
        table["position"] = np.arange(1, len(table) + 1)
        for hxb2 in table.index:
            if hxb2 not in codon_counts.index:
                continue
            matches = codon_counts.loc[[hxb2]]
            for codon, count in zip(matches["codon"], matches["count"]):
                if codon == "":
                    table.loc[hxb2, "del"] += count
                    continue
                if len(codon) > 3:
                    table.loc[hxb2, "ins"] += count
                # codon[:3] is the whole codon when it has three bases or fewer.
                residue = str(Seq.translate(codon[:3]))
                table.loc[hxb2, residue] += count
        table["coverage"] = table[aa_header].sum(axis=1)
        per_region.append(table)
    pd.concat(per_region).to_excel(outfile, index_label="hxb2")
def translate(filename, out=sys.stdout, log=sys.stderr):
    """
    Translate nucleotide sequences in FASTA file `filename` to all six
    possible frames.  Write amino acid sequences to FASTA file `out`, with
    the frame number appended to the sequence header (reverse-strand frames
    carry a trailing ``'``).  Records containing ``N`` are skipped.
    Log summary statistics (records read, records skipped) to file `log`.
    """
    nreads = 0  # stays 0 for an empty input instead of being unbound
    nskipped = 0
    # Count from 1 so that ``nreads`` is the number of records, not the last
    # zero-based index.
    for nreads, record in enumerate(SeqIO.parse(filename, "fasta"), start=1):
        forward = str(record.seq)
        if 'N' in forward:
            nskipped += 1
            continue
        reverse = str(record.seq.reverse_complement())
        for strand, suffix in ((forward, ""), (reverse, "'")):
            for i in range(3):
                # Largest end index that keeps frame i a whole number of codons.
                j = 3 * ((len(strand) - i) // 3) + i
                print(">%s-%d%s" % (record.id, i, suffix), file=out)
                print(Seq.translate(strand[i:j]), file=out)
    print("nreads", nreads, file=log)
    print("nskipped (N)", nskipped, file=log)
def test_stops(self):
    """Check how ``self.misc_stops`` translates under several codon tables.

    The same expectations are applied to the plain string and to ``Seq``
    objects built with different alphabets.
    """
    candidates = [
        self.misc_stops,
        Seq.Seq(self.misc_stops),
        Seq.Seq(self.misc_stops, Alphabet.generic_nucleotide),
        Seq.Seq(self.misc_stops, Alphabet.DNAAlphabet()),
        Seq.Seq(self.misc_stops, IUPAC.unambiguous_dna),
    ]
    # (expected protein, keyword arguments for Seq.translate), in check order
    expectations = [
        ("***RR", {}),
        ("***RR", {"table": 1}),
        ("***RR", {"table": "SGC0"}),
        ("**W**", {"table": 2}),
        ("**WRR", {"table": "Yeast Mitochondrial"}),
        ("**WSS", {"table": 5}),
        ("**WSS", {"table": 9}),
        ("**CRR", {"table": "Euplotid Nuclear"}),
        ("***RR", {"table": 11}),
        ("***RR", {"table": "Bacterial"}),
    ]
    for nucleotide_seq in candidates:
        for expected, kwargs in expectations:
            translated = Seq.translate(nucleotide_seq, **kwargs)
            self.assertEqual(expected, str(translated))
def add_translations(self):
    '''
    Translate each node's nucleotide sequence into the proteins listed in
    self.proteins (expected to be SeqFeatures) and record amino-acid
    differences to the parent as (parent_aa, position, node_aa) tuples.
    Gap characters are replaced by N before translation.  The root gets
    empty mutation lists.  'translations' is appended to self.dump_attr.
    '''
    from Bio import Seq

    def translate_feature(feature, sequence):
        # Extract the feature from the joined sequence and translate it.
        extracted = str(feature.extract(Seq.Seq("".join(sequence))))
        return Seq.translate(extracted.replace('-', 'N'))

    for node in self.tree.find_clades(order='preorder'):
        if not hasattr(node, "translations"):
            node.translations = {}
            node.aa_mutations = {}
        is_root = node.up is None
        for prot in self.proteins:
            node.translations[prot] = translate_feature(
                self.proteins[prot], node.sequence)
            if is_root:
                node.aa_mutations[prot] = []
                continue
            parent_aa = node.up.translations[prot]
            node.aa_mutations[prot] = [
                (anc, pos, der)
                for pos, (anc, der) in enumerate(
                    zip(parent_aa, node.translations[prot]))
                if anc != der
            ]
    self.dump_attr.append('translations')
def assign_fitness(nodes):
    '''
    Translate every virus sequence and attach its mutational-tolerance
    fitness.  ``nodes`` is either a list of dicts (reads ``node['seq']``,
    writes ``node['tol']``) or a dendropy.Tree (reads ``node.seq``, writes
    ``node.tol``); any other type is left untouched.
    '''
    aa, sites, wt_aa, aa_prob = load_mutational_tolerance()
    aln = AlignIO.read('source-data/H1_H3.fasta', 'fasta')
    # True only for alignment columns in which no sequence has a gap
    aligned = (np.array(aln) != '-').min(axis=0)
    # map alignment positions to sequence positions, keeping aligned columns only
    indices = {}
    for seq in aln:
        not_gap = np.fromstring(str(seq.seq), dtype='S1') != '-'
        indices[seq.name] = (np.cumsum(not_gap) - 1)[aligned]

    # keep only the amino-acid probabilities at aligned H1 positions
    aa_prob = aa_prob[indices['H1'], :]
    # extra column with a small probability for non-canonical amino acids
    extra_column = 1e-5 * np.ones((aa_prob.shape[0], 1))
    aa_prob = np.hstack((aa_prob, extra_column))

    def tolerance(nucleotides):
        return calc_fitness_tolerance(
            Seq.translate(nucleotides), aa_prob, aa, indices['H3'])

    if isinstance(nodes, list):
        for node in nodes:
            node['tol'] = tolerance(node['seq'])
    elif isinstance(nodes, dendropy.Tree):
        for node in nodes.postorder_node_iter():
            node.tol = tolerance(node.seq)
def test_translation_on_proteins(self):
    """Translating a protein must raise TranslationError (function and method)."""
    for protein in protein_seqs:
        self.assertRaises(TranslationError, Seq.translate, protein)
        with self.assertRaises(TranslationError):
            protein.translate()
def test_translation_on_proteins(self):
    """Translating a protein must raise ValueError.

    The function form is checked for every entry; the method form only for
    ``Seq.Seq`` instances.
    """
    for protein in protein_seqs:
        self.assertRaises(ValueError, Seq.translate, protein)
        if not isinstance(protein, Seq.Seq):
            continue
        with self.assertRaises(ValueError):
            protein.translate()
def test_translation_to_stop(self):
    """``to_stop=True`` must equal the full translation cut at the first ``*``."""
    for candidate in self.test_seqs:
        # Trim to a whole number of codons.
        whole_codons = 3 * (len(candidate) // 3)
        trimmed = candidate[:whole_codons]
        if "X" in trimmed:
            continue
        short = Seq.translate(trimmed, to_stop=True)
        until_stop = Seq.translate(trimmed).split("*")[0]
        self.assertEqual(short, until_stop)

    seq = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
    self.assertEqual("VAIVMGRWKGAR", Seq.translate(seq, table=2, to_stop=True))
def test_translation_to_stop(self):
    """``to_stop=True`` must equal the full translation cut at the first ``*``.

    Only ``Seq.Seq`` entries without ``X`` are compared.
    """
    for candidate in self.test_seqs:
        trimmed = candidate[:3 * (len(candidate) // 3)]
        if not isinstance(trimmed, Seq.Seq) or 'X' in str(trimmed):
            continue
        short = Seq.translate(trimmed, to_stop=True)
        full = Seq.translate(trimmed)
        self.assertEqual(str(short), str(full.split('*')[0]))

    seq = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
    self.assertEqual("VAIVMGRWKGAR", Seq.translate(seq, table=2, to_stop=True))
def calc_total_subst(start_codon, end_codon):
    """
    Classify the single-base steps on every mutation pathway between two codons.

    Each pathway is one ordering of the differing positions, applied one base
    at a time starting from ``start_codon``.  A step is synonymous when the
    amino acid is unchanged and nonsynonymous otherwise.

    :param Bio.Seq.Seq start_codon: 3bp codon
    :param Bio.Seq.Seq end_codon: 3bp codon
    :return tuple (float, float): (synonymous steps / all steps,
        nonsynonymous steps / all steps), pooled over all pathways;
        (0.0, 0.0) when the codons are identical.
    :raises ValueError: if a pathway does not end at ``end_codon``.
    """
    total_syn = 0.0
    total_nonsyn = 0.0
    total_subs = 0.0

    upper_start_codon = start_codon.upper()
    upper_end_codon = end_codon.upper()
    start_aa = Seq.translate(upper_start_codon)

    # find positions where the codons differ
    diff_pos = [
        pos for pos, nucstr1 in enumerate(str(upper_start_codon))
        if nucstr1 != str(upper_end_codon[pos])
    ]

    # Traverse all possible pathways from start_codon to end_codon where
    # each stage of a pathway mutates by 1 base.
    for pathway in itertools.permutations(diff_pos):
        print(str(upper_start_codon) + " " + str(upper_end_codon) + " " + ",".join([str(x) for x in pathway]))
        # Every pathway starts from the start codon.  Without this reset all
        # pathways after the first began at the end codon and counted only
        # no-op "synonymous" steps.
        last_codon = upper_start_codon
        last_aa = start_aa
        for mut_pos in pathway:
            mut_nuc = upper_end_codon[mut_pos]
            mut_codon = last_codon[:mut_pos] + mut_nuc + last_codon[mut_pos+1:]
            mut_aa = Seq.translate(mut_codon)
            total_subs += 1
            if str(last_aa) == str(mut_aa):
                total_syn += 1
            else:
                total_nonsyn += 1
            last_codon = mut_codon
            last_aa = mut_aa
        if str(last_codon) != str(upper_end_codon):
            raise ValueError("Pathway does not yield end codon " + str(last_codon))

    if total_subs:
        ave_syn = total_syn/total_subs
        ave_nonsyn = total_nonsyn/total_subs
    else:
        ave_syn = 0.0
        ave_nonsyn = 0.0
    return ave_syn, ave_nonsyn
def ex4():
    """Translate both sequence sets (up to the first stop) and compare them.

    Prints the all-against-all comparison within histones, within bzips, and
    between the two sets, using ``compute_pair_ex4``.
    """
    histone_dna, bzip_dna = read_sequences()
    histone_proteins = [Seq.translate(dna, to_stop=True) for dna in histone_dna]
    bzip_proteins = [Seq.translate(dna, to_stop=True) for dna in bzip_dna]

    print("histones:")
    compute_all_with_all(histone_proteins, function=compute_pair_ex4)

    print("bzips:")
    compute_all_with_all(bzip_proteins, function=compute_pair_ex4)

    print("bzips x histones")
    compute_all_with_all(histone_proteins, bzip_proteins, function=compute_pair_ex4)
def get_syn_mutations(self, region, mask_constrained=True):
    """Return a boolean array marking synonymous mutations in ``region``.

    Entry ``[ni, pos]`` is True when replacing the nucleotide at ``pos`` of
    the initial sequence with ``alpha[ni]`` leaves the codon's translation
    unchanged and the consensus amino acid of that codon is identical across
    all valid samples.  With ``mask_constrained`` the positions returned by
    ``self.get_constrained(region)`` are forced to False.

    Prints a message and returns None if ``region`` is not an annotated gene
    or protein.  Any exception during the computation drops into ``ipdb``,
    after which None is returned.
    """
    is_coding = (region in self.annotation
                 and self.annotation[region].type in ['gene', 'protein'])
    if not is_coding:
        print(region, "is not a valid protein or gene")
        return None

    try:
        aft = self.get_allele_frequency_trajectories(region)
        if len(aft.mask.shape) == 0:
            aft_valid = np.ones((aft.shape[0], aft.shape[-1]), dtype=bool)
        else:
            masked_counts = [af.mask.sum(axis=0) for af in aft]
            aft_valid = ~np.array(masked_counts, dtype=bool)
        gaps = self.get_gaps_by_codon(region)
        initial_seq = self.get_initial_sequence(region)

        consensi = []
        for af in aft:
            cons = consensus(af)
            cons[gaps] = 'N'
            consensi.append(cons)

        cons_aa = np.array([
            np.fromstring(Seq.translate(''.join(cons.astype('U'))), dtype='S1')
            for cons in consensi
        ])
        # One flag per codon (unique consensus amino acid among valid
        # samples), repeated for its three nucleotide positions.
        codon_conserved = np.array(
            [len(np.unique(col[ind])) == 1
             for ind, col in zip(aft_valid.T[::3], cons_aa.T)],
            dtype=bool)
        no_substitution = np.repeat(codon_conserved, 3)

        syn_muts = np.zeros(aft.shape[1:], dtype=bool)
        for pos in range(aft.shape[-1]):
            ci, rf = divmod(pos, 3)
            codon = ''.join(initial_seq[ci * 3:(ci + 1) * 3].astype("U"))
            for ni, nuc in enumerate(alpha[:4].astype("U")):
                mod_codon = codon[:rf] + nuc + codon[rf + 1:]
                try:
                    unchanged = Seq.translate(codon) == Seq.translate(mod_codon)
                    syn_muts[ni, pos] = unchanged * no_substitution[pos]
                except:
                    syn_muts[ni, pos] = False
        if mask_constrained:
            syn_muts[:, self.get_constrained(region)] = False
        return syn_muts
    except:
        import ipdb
        ipdb.set_trace()
def test_translation_to_stop(self):
    """Translation with ``to_stop=True`` stops before the first stop codon."""
    for record in self.test_seqs:
        codon_aligned = record[:3 * (len(record) // 3)]
        comparable = (isinstance(codon_aligned, Seq.Seq)
                      and 'X' not in str(codon_aligned))
        if comparable:
            truncated = Seq.translate(codon_aligned, to_stop=True)
            first_segment = Seq.translate(codon_aligned).split('*')[0]
            self.assertEqual(str(truncated), str(first_segment))

    seq = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
    self.assertEqual("VAIVMGRWKGAR", Seq.translate(seq, table=2, to_stop=True))
def main() -> None:
    """Collect ORFs from the six-frame translation of the first FASTA record.

    The first sequence of ``args.file`` is converted to RNA; it and its
    reverse complement are translated in all three frames, and every ORF
    that ``find_orfs`` reports is added to a set.  Does nothing when the
    file holds no records.
    """
    args = get_args()
    seqs = [str(rec.seq) for rec in SeqIO.parse(args.file, 'fasta')]
    if not seqs:
        return

    rna = seqs[0].replace('T', 'U')
    orfs = set()
    for strand in (rna, Seq.reverse_complement(rna)):
        for frame in range(3):
            prot = Seq.translate(truncate(strand[frame:], 3), to_stop=False)
            if prot:
                orfs.update(find_orfs(prot))
def translateSeq(cds):
    """Translate ``cds`` as a complete CDS, falling back to the reverse strand.

    The sense strand is tried first with ``cds=True``; on TranslationError
    the reverse complement is tried.  If both fail a message is printed.

    NOTE(review): ``translated``, ``finalCDS`` and ``senseOrAnti`` are
    computed but nothing is returned in the code visible here.
    """
    senseOrAnti = 'sense'
    finalCDS = cds
    try:
        translated = Seq.translate(cds, cds=True)
    except TranslationError:
        try:
            reverseCDS = Seq.reverse_complement(cds)
            translated = Seq.translate(reverseCDS, cds=True)
            finalCDS = reverseCDS
            senseOrAnti = 'anti'
        except TranslationError:
            print('Translation failed in %s' % cds)
def parseFeatureBed(bedFile, regionSeqs):
    """Parse a BED file into ``Feature`` objects and validate their CDS.

    Each non-blank line becomes a ``Feature`` whose ``cds`` is taken from
    ``regionSeqs[feature.region]``.  The CDS is translated with ``cds=True``;
    on TranslationError it is re-translated without that check and a message
    is appended to the module-level ``WARNINGS`` list.

    NOTE(review): ``features`` is filled but not returned in the code
    visible here.
    """
    print('VIVAN: Parsing %s' % bedFile)
    features = {}
    # ``with`` closes the file; the original left the handle open.
    with open(bedFile, 'r') as bed_handle:
        for line in bed_handle:
            if not line.strip():
                continue
            feature = Feature(line)
            regionSeq = regionSeqs[feature.region]
            feature.cds = feature.getCDS(regionSeq)
            try:
                translated = Seq.translate(feature.cds, cds=True)
            except TranslationError as e:
                translated = Seq.translate(feature.cds)
                WARNINGS.append('Translation error in feature : %s\nCDS : %s\nProtein : %s\n%s\n'%(feature.featureLine,feature.cds,translated,e))
            features[feature.name] = feature
def group_by_protein(fasta_file):
    """Group the DNA sequences of a FASTA file by the protein they encode.

    Args:
        fasta_file (str): path to the FASTA file with DNA sequences for a
            gene.

    Returns:
        protein_diversity (dict): maps the file's basename to a dictionary
            keyed by protein sequence.  Each value is a list holding one
            inner list of ``(allele_id, dna_sequence)`` tuples, where
            ``allele_id`` is the text after the last ``_`` of the record id.
            Records that fail translation (table 11, ``cds=True``) are
            skipped.
    """
    basename = os.path.basename(fasta_file)
    by_protein = {}
    protein_diversity = {basename: by_protein}

    for record in SeqIO.parse(fasta_file, 'fasta'):
        allele_id = record.id.split('_')[-1]
        sequence = str(record.seq)
        try:
            protein = Seq.translate(sequence, table=11, cds=True)
        except Exception:
            continue
        # First sighting creates [[]]; alleles always go in the inner list.
        by_protein.setdefault(protein, [[]])[0].append((allele_id, sequence))

    return protein_diversity
def read_reference(fname, genemap):
    """Load a reference sequence and translate each feature of a gene map.

    ``fname`` is read as a single-record FASTA file; if that fails the file
    is read as plain text with all lines joined.  ``genemap`` is a
    tab-separated annotation file: columns 4 and 5 are the 1-based start and
    end, column 7 the strand and column 9 a ``;``-separated list of
    ``key value`` attributes.  Lines starting with ``#`` and blank lines are
    skipped.

    Returns ``{"nuc": reference, "translations": {gene_name: protein}}``.
    Features without a ``gene_name`` attribute are stored under ``None``.
    """
    try:
        ref = str(SeqIO.read(fname, 'fasta').seq)
    except Exception:
        with open(fname, 'r') as fh:
            ref = "".join([x.strip() for x in fh])

    translations = {}
    with open(genemap, 'r') as fh:
        for line in fh:
            if not line.strip() or line[0] == '#':
                continue
            entries = [x.strip() for x in line.strip().split('\t')]
            start = int(entries[3])
            end = int(entries[4])
            strand = entries[6]

            attributes = {}
            for field in entries[8].split(';'):
                parts = field.split()
                # A trailing ';' yields an empty field; skip it instead of
                # failing on parts[0].
                if parts:
                    attributes[parts[0]] = ' '.join(parts[1:])

            if 'gene_name' in attributes:
                name = attributes['gene_name'].strip('"')
            else:
                name = None

            location = SeqFeature.FeatureLocation(
                start - 1, end, strand=-1 if strand == '-' else 1)
            translation = Seq.translate(
                SeqFeature.SeqFeature(location).extract(ref))
            translations[name] = str(translation)

    return {"nuc": ref, "translations": translations}
def translationBio(data):
    '''Translate every line of ``data`` with Biopython and print the result.

    Each item is translated with the Standard table, with stop codons
    rendered as the empty string; the proteins are concatenated and printed
    as one string.
    '''
    proteinSeq = ''.join(
        str(Seq.translate(line, table='Standard', stop_symbol='', to_stop=False))
        for line in data
    )
    print(proteinSeq)
def export(self, path = '', extra_attr = ['aa_muts']):
    '''Write the tree and its sequences as JSON files.

    ``path + 'tree.json'`` receives the output of ``tree_to_json`` for the
    root.  ``path + 'sequences.json'`` receives a dictionary with the full
    root nucleotide sequence and root protein translations (gaps translated
    as N, then X shown as '-'), plus, per clade, only the positions whose
    state differs from the root.

    ``extra_attr`` is forwarded to ``tree_to_json`` unchanged; the default
    list is shared between calls and is not modified here.
    '''
    from Bio import Seq
    timetree_fname = path+'tree.json'
    sequence_fname = path+'sequences.json'
    tree_json = tree_to_json(self.tree.root, extra_attr=extra_attr)
    write_json(tree_json, timetree_fname, indent=None)

    elems = {}
    elems['root'] = {}
    elems['root']['nuc'] = "".join(self.tree.root.sequence)
    for prot in self.proteins:
        tmp = str(self.proteins[prot].extract(Seq.Seq(elems['root']['nuc'])))
        elems['root'][prot] = str(Seq.translate(tmp.replace('-', 'N'))).replace('X','-')

    # Built-in zip replaces itertools.izip, which no longer exists on Python 3.
    for node in self.tree.find_clades():
        if hasattr(node, "clade") and hasattr(node, "sequence"):
            elems[node.clade] = {}
            elems[node.clade]['nuc'] = {
                pos: state
                for pos, (state, ancstate) in enumerate(
                    zip(node.sequence, self.tree.root.sequence))
                if state != ancstate}

    for node in self.tree.find_clades():
        if hasattr(node, "clade") and hasattr(node, "translations"):
            for prot in self.proteins:
                elems[node.clade][prot] = {
                    pos: state
                    for pos, (state, ancstate) in enumerate(
                        zip(node.translations[prot], elems['root'][prot]))
                    if state != ancstate}

    write_json(elems, sequence_fname, indent=None)
def add_translations(self):
    '''
    Translate each node's nucleotide sequence into the proteins in
    self.proteins (expected to be SeqFeatures), in order of feature start,
    and record amino-acid differences to the parent as
    (parent_aa, position, node_aa) tuples.  Gaps are replaced by N before
    translation.  'translations' is appended to self.dump_attr.
    '''
    from Bio import Seq

    def feature_start(protein_pair):
        return protein_pair[1].start

    # Proteins ordered by the start position of their SeqFeature.
    sorted_proteins = sorted(self.proteins.items(), key=feature_start)

    for node in self.tree.find_clades(order='preorder'):
        if not hasattr(node, "translations"):
            # An ordered mapping keeps translations in genomic order for
            # downstream assembly.
            node.translations = OrderedDict()
            node.aa_mutations = {}

        for prot, feature in sorted_proteins:
            extracted = str(feature.extract(Seq.Seq("".join(node.sequence))))
            node.translations[prot] = Seq.translate(extracted.replace('-', 'N'))

            if node.up is None:
                node.aa_mutations[prot] = []
                continue
            paired = zip(node.up.translations[prot], node.translations[prot])
            node.aa_mutations[prot] = [
                (anc, pos, der) for pos, (anc, der) in enumerate(paired)
                if anc != der
            ]

    self.dump_attr.append('translations')
def RFLP_digests(self, fasta_infile):
    """Digest and translate every record of a FASTA file.

    For each record the sequence is printed and passed to
    ``self.RFLP_digest``; the returned metadata dictionary is extended with
    the record id, description, nucleotide sequence and length, and the
    Standard-table translation and its length.  Returns a dictionary of
    these metadata keyed by record id.
    """
    digests_by_id = {}
    for fasta_record in SeqIO.parse(fasta_infile, "fasta"):
        record_id = str(fasta_record.id)
        sequence = str(fasta_record.seq)
        description = str(fasta_record.description)
        print(sequence)

        digest_metadata = self.RFLP_digest(sequence)
        digest_metadata['ID'] = record_id
        digest_metadata['Description'] = description
        digest_metadata['Nucleotide UT Sequence'] = sequence
        digest_metadata['Nucleotide UT Sequence Length'] = len(sequence)

        peptide = str(Seq.translate(
            fasta_record.seq,
            table='Standard',
            stop_symbol='*',
            to_stop=False,
            cds=False,
            gap=None,
        ))
        digest_metadata['Peptide UT Sequence'] = peptide
        digest_metadata['Peptide UT Sequence Length'] = len(
            digest_metadata['Peptide UT Sequence'])

        digests_by_id[record_id] = digest_metadata
    return digests_by_id
def add_translations(self):
    """Attach protein translations to every node of ``self.tree``.

    For each protein in ``self.proteins`` the feature is extracted from the
    node's joined sequence, gaps are replaced by ``N``, and the translation
    is stored in ``node.translations[prot]``.
    """
    from Bio import Seq
    for node in self.tree.find_clades():
        if not hasattr(node, "translations"):
            node.translations = {}
        for prot in self.proteins:
            genome = Seq.Seq("".join(node.sequence))
            extracted = str(self.proteins[prot].extract(genome))
            node.translations[prot] = Seq.translate(extracted.replace('-', 'N'))
def test_stops(self):
    """Check how ``self.misc_stops`` translates under several codon tables.

    Applied to both the plain string and the equivalent ``Seq`` object.
    """
    # (expected protein, keyword arguments for Seq.translate), in check order
    cases = (
        ("***RR", {}),
        ("***RR", dict(table=1)),
        ("***RR", dict(table="SGC0")),
        ("**W**", dict(table=2)),
        ("**WRR", dict(table="Yeast Mitochondrial")),
        ("**WSS", dict(table=5)),
        ("**WSS", dict(table=9)),
        ("**CRR", dict(table="Euplotid Nuclear")),
        ("***RR", dict(table=11)),
        ("***RR", dict(table="Bacterial")),
    )
    for nucleotide_seq in [self.misc_stops, Seq.Seq(self.misc_stops)]:
        for expected, options in cases:
            self.assertEqual(expected, Seq.translate(nucleotide_seq, **options))
def translate(config, rc=False):
    """Translate the region ``config['startBase']..config['endBase']``.

    The DNA is read through a read-only memory map of the file returned by
    ``ddna(config)``.  Translation table 4 is used when
    ``mycoplasma(config)`` is true, table 1 otherwise.  With ``rc`` the
    region is reverse-complemented first.

    Returns ``{'seq': nucleotide string, 'trans': protein string}``.
    """
    # table 4 is for mycoplasma ala:
    # http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
    table = 4 if mycoplasma(config) else 1

    fd, fmap = None, None
    try:
        log.debug("Doing translation with table %d, rc: %s", table, rc)
        fd = os.open(ddna(config), os.O_RDONLY)
        fmap = mmap.mmap(fd, 0, mmap.MAP_SHARED, mmap.PROT_READ)
        # By convention (e.g. from the C or NCBI) the DNA is 1
        # indexed; our DDNA is a c style array that is 0 indexed
        startIdx = config['startBase'] - 1
        # The end index here is inclusive but array.slice isn't so we
        # don't need to subtract 1
        endIdx = config['endBase']
        seq = Seq.Seq(fmap[startIdx:endIdx])
        if rc:
            seq = seq.reverse_complement()
        return {
            'seq': str(seq),
            'trans': str(Seq.translate(seq, table))
        }
    finally:
        # close() must be called; the bare attribute reference was a no-op
        # and leaked the mapping.
        if fmap is not None:
            fmap.close()
        # Descriptor 0 is valid, so compare against None, not truthiness.
        if fd is not None:
            os.close(fd)
def getCodonTableInfo(codon_table_dict,ref_cds_coordinates_dict,proteins_pos_list,strain,nucleotide_pos,nucleotide,segment):
    """Add one nucleotide to the codon table of every protein it belongs to.

    ``proteins_pos_list`` holds ``"protein:codon_number"`` strings.  Each
    codon entry in ``codon_table_dict[strain][protein]`` is a list
    ``[codon, pos1, pos2, pos3, amino_acid]``: the first call for a codon
    creates the entry, the second fills ``pos2``, and the third fills
    ``pos3`` and translates the codon.  A fourth nucleotide for the same
    codon writes an error to stderr and exits with status 1.

    ``codon_table_dict`` is modified in place and the same object is
    returned.  ``ref_cds_coordinates_dict`` and ``segment`` are unused.
    """
    for protein_pos in proteins_pos_list:
        protein_fields = protein_pos.split(":")
        protein_name = protein_fields[0]
        protein_codon_number = int(protein_fields[1])

        codons = codon_table_dict[strain].setdefault(protein_name, {})
        entry = codons.get(protein_codon_number)
        if entry is None:
            # first position of the codon
            codons[protein_codon_number] = [nucleotide, str(nucleotide_pos), None, None, None]
        elif entry[2] is None:
            # second position of the codon
            entry[2] = str(nucleotide_pos)
            entry[0] = entry[0] + nucleotide
        elif entry[3] is None:
            # third position: the codon is complete and can be translated
            entry[3] = str(nucleotide_pos)
            entry[0] = entry[0] + nucleotide
            entry[4] = Seq.translate(entry[0], to_stop=False, stop_symbol='*')
        else:
            # str(entry): the entry is a list and cannot be concatenated to
            # the message directly.
            sys.stderr.write(
                "\n[ERROR]: The codon \"" + str(protein_codon_number)
                + "\" is already set in the codon table as, " + str(entry)
                + ". Contact the author because this is a major issue.\n\n")
            sys.exit(1)
    return codon_table_dict
def getProtein(dna, protein):
    """Return the first translation-table number that yields ``protein``.

    Tables 1 to 15 are tried in order, skipping 7 and 8; stop codons are
    rendered as the empty string.  Returns None when no table matches.
    """
    candidate_tables = [number for number in range(1, 16) if number not in (7, 8)]
    for table_id in candidate_tables:
        if Seq.translate(dna, stop_symbol='', table=table_id) == protein:
            return table_id
    return None
def translateDNAtoAA(input_fasta, output_fasta, remove_lower_case = False):
    """Translate a one-sequence-per-line FASTA file from DNA to protein.

    Header lines (starting with ``>``) are copied unchanged.  Every other
    line is stripped of its line terminator, must have a length divisible by
    three (AssertionError otherwise), and is translated up to the first stop
    codon.  With ``remove_lower_case`` all lowercase ASCII letters are
    removed from the sequence before translation (the length check applies
    to the line as read).
    """
    with open(input_fasta, 'r') as f:
        with open(output_fasta, 'w+') as g:
            for line in f:
                if line[0] == '>':
                    g.write(line)
                    continue
                # Strip "\n" and "\r\n" alike; the original kept the "\r" of
                # CRLF lines and cut a base off an unterminated last line.
                sequence = line.rstrip('\r\n')
                assert(len(sequence) % 3 == 0)
                if remove_lower_case:
                    # Version-independent replacement for the Python 2-only
                    # str.translate(None, chars).
                    sequence = ''.join(
                        base for base in sequence
                        if base not in string.ascii_lowercase)
                g.write(Seq.translate(sequence, to_stop = True) + '\n')
def insertion_is_synonymous_match(self, insertion, mtn):
    """Assert that ``mtn`` describes an insertion synonymous with ``insertion``.

    An ``indels.Insertion`` is rebuilt from ``mtn["InsertedCodonsText"]`` at
    nucleotide position ``mtn["NAPosition"] + 3 - 1`` with the gene and
    genotype of ``insertion``.  The inserted text must be non-empty and both
    mutated genes must translate to the same protein.
    """
    position = mtn["NAPosition"] + 3 - 1
    inserted_text = mtn["InsertedCodonsText"]
    recovered_insertion = indels.Insertion(
        nt_ins=inserted_text,
        nt_pos=position,
        gene=insertion.gene,
        genotype=insertion.genotype,
    )
    self.assertGreater(len(inserted_text), 0)

    # Compare at the amino-acid level so synonymous differences are accepted.
    expected_protein = bioseq.translate(insertion.mutated_gene)
    recovered_protein = bioseq.translate(recovered_insertion.mutated_gene)
    self.assertEqual(expected_protein, recovered_protein)
def extract_proteome():
    """Write the proteome FASTA: translated CDS features plus contaminants.

    Every ``CDS`` feature of every GenBank file listed in
    ``Arguments.genome`` is translated and written with a running index, the
    scaffold id, locus tag and product.  Proteins from the fixed cRAP FASTA
    file are appended, followed by the job's custom contaminants when
    ``record[8]`` is truthy.  The result is written to
    ``ProteomeFastaPath``.  Returns True.

    Reads the module-level names ``Arguments``, ``ProteomeFastaPath`` and
    ``record``; none of them is rebound here.
    """
    print('EXTRAINDO PROTEOMA ...')
    fasta_entries = []  # joined once at the end instead of repeated +=
    ProteomeCDSIndex = 0
    # The output file is opened first, as before; ``with`` closes every
    # handle even when parsing fails.
    with open(ProteomeFastaPath, 'w') as ProteomeFastaHandle:
        for FilePath in SplitFilePathString(Arguments.genome):
            with open(FilePath) as FileHandle:
                for Scaffold in SeqIO.parse(FileHandle, 'genbank'):
                    print(Scaffold.id)
                    for Feature in Scaffold.features:
                        if Feature.type != 'CDS':
                            continue
                        ProteomeCDSIndex += 1
                        CDSSeq = Feature.location.extract(Scaffold)
                        CDSProtSeq = Seq.translate(CDSSeq)
                        if 'locus_tag' in Feature.qualifiers:
                            LocusTag = Feature.qualifiers['locus_tag'][0]
                        else:
                            LocusTag = 'MISSING_LOCUS_TAG'
                        if 'product' in Feature.qualifiers:
                            Product = Feature.qualifiers['product'][0]
                        else:
                            Product = 'MISSING_PRODUCT'
                        fasta_entries.append('>{0}|{0} {1} {2} {3}\n{4}\n'.format(
                            str(ProteomeCDSIndex),
                            Scaffold.id,
                            LocusTag, Product.replace("'", ""),
                            str(CDSProtSeq)))

        # add crap contaminant proteins
        with open('/home/cdtec/Frederico/ms6/bin/crap.fasta') as crapProteinsHandler:
            crapProteinsParser = SeqIO.parse(crapProteinsHandler, 'fasta')
            for crapProteinIndex, crapProtein in enumerate(crapProteinsParser):
                ProteomeCDSIndex += 1
                fasta_entries.append('>CONTAMINANT_CRAP_{0}|{0} {1} {2}\n{3}\n'.format(
                    str(ProteomeCDSIndex),
                    crapProteinIndex,
                    crapProtein.description.replace("'", ""),
                    str(crapProtein.seq)))

        # add custom contaminant proteins
        if record[8]:
            with open('/home/cdtec/Frederico/ms6/jobs_data/%s/contaminants.fasta'%Arguments.job_id) as customContaminantProteinsHandler:
                customContaminantProteinsParser = SeqIO.parse(customContaminantProteinsHandler, 'fasta')
                for customContaminantIndex, customContaminant in enumerate(customContaminantProteinsParser):
                    ProteomeCDSIndex += 1
                    fasta_entries.append('>CONTAMINANT_CUSTOM_{0}|{0} {1} {2}\n{3}\n'.format(
                        str(ProteomeCDSIndex),
                        customContaminantIndex,
                        customContaminant.description.replace("'", ""),
                        str(customContaminant.seq)))

        ProteomeFastaHandle.write(''.join(fasta_entries))
    return True
def test_stops(self):
    """Translate ``self.misc_stops`` with several codon tables.

    Runs the same checks on the raw string and on ``Seq`` objects created
    with different alphabets.
    """
    inputs = [self.misc_stops,
              Seq.Seq(self.misc_stops),
              Seq.Seq(self.misc_stops, Alphabet.generic_nucleotide),
              Seq.Seq(self.misc_stops, Alphabet.DNAAlphabet()),
              Seq.Seq(self.misc_stops, IUPAC.unambiguous_dna)]
    # table argument -> expected protein, in check order
    by_table = (
        (1, "***RR"),
        ("SGC0", "***RR"),
        (2, "**W**"),
        ('Yeast Mitochondrial', "**WRR"),
        (5, "**WSS"),
        (9, "**WSS"),
        ('Euplotid Nuclear', "**CRR"),
        (11, "***RR"),
        ('Bacterial', "***RR"),
    )
    for nucleotide_seq in inputs:
        # default table first, then each explicit table
        self.assertEqual("***RR", str(Seq.translate(nucleotide_seq)))
        for table, expected in by_table:
            self.assertEqual(expected, str(Seq.translate(nucleotide_seq, table=table)))
def test_translation(self):
    """``Seq.translate(seq)`` and ``seq.translate()`` must give the same repr.

    Only ``Seq.Seq`` entries without ``X`` are compared, after trimming to a
    whole number of codons.
    """
    for candidate in self.test_seqs:
        trimmed = candidate[:3 * (len(candidate) // 3)]
        if not isinstance(trimmed, Seq.Seq) or "X" in str(trimmed):
            continue
        expected = Seq.translate(trimmed)
        self.assertEqual(repr(expected), repr(trimmed.translate()))
def mutationType(single_mutations):
    """Find mutations type (R/S) for single mutation.

    Each entry is a list whose element 0 is the germline sequence and
    element 2 the mutated one.  ``'silent'`` is appended when both translate
    to the same protein, ``'replacement'`` when they differ, and
    ``'unknown'`` when either contains ``-`` or ``N``.  The number of
    entries is printed first.  The list is modified in place and returned.
    """
    from Bio import Seq
    print(len(single_mutations))
    for mutation in single_mutations:
        germline = mutation[0]
        mutated = mutation[2]
        ambiguous = any(
            symbol in sequence
            for sequence in (germline, mutated)
            for symbol in ('-', 'N')
        )
        if ambiguous:
            mutation.append('unknown')
        elif Seq.translate(germline) == Seq.translate(mutated):
            mutation.append('silent')
        else:
            mutation.append('replacement')
    return single_mutations
def test_translation_using_tables_with_ambiguous_stop_codons(self):
    """Check for error and warning messages.

    Here, 'ambiguous stop codons' means codons of unambiguous sequence
    but with a context sensitive encoding as STOP or an amino acid.
    Thus, these codons appear within the codon table in the forward
    table as well as in the list of stop codons.
    """
    seq = "ATGGGCTGA"
    # Translating to the first stop cannot work with such a table.
    self.assertRaises(ValueError, Seq.translate, seq, table=28, to_stop=True)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        Seq.translate(seq, table=28)
        message = str(caught[-1].message)
    self.assertTrue(message.startswith("This table contains"))
    self.assertTrue(message.endswith("be translated as amino acid."))
def test_translation_on_proteins(self):
    """Check translation fails on a protein.

    Sequences whose length is not a multiple of three must additionally
    emit a BiopythonWarning.
    """
    for protein in protein_seqs:
        if len(protein) % 3 == 0:
            with self.assertRaises(TranslationError):
                Seq.translate(protein)
            with self.assertRaises(TranslationError):
                protein.translate()
            continue
        with self.assertWarns(BiopythonWarning), \
                self.assertRaises(TranslationError):
            Seq.translate(protein)
        with self.assertWarns(BiopythonWarning), \
                self.assertRaises(TranslationError):
            protein.translate()
def __init__(self, string):
    """Store the lower-cased primary sequence, translating nucleotides first.

    If ``is_nucleotide`` accepts the lower-cased input it is kept as
    ``self.nucleotide``, BiopythonWarning is silenced through the warnings
    filter, and the translation (lower-cased) is used instead.
    ``self.primary`` is the sequence split at ``*``; ``self.secondary`` and
    ``self.structures`` start empty.
    """
    sequence = string.lower()
    if is_nucleotide(sequence):
        self.nucleotide = sequence
        warnings.simplefilter('ignore', BiopythonWarning)
        sequence = Seq.translate(sequence).lower()
    self.primary = sequence.split('*')
    self.secondary = []
    self.structures = []
def deletion_is_synonymous_match(self, deletion, mtn):
    """Assert that ``mtn`` describes a deletion synonymous with ``deletion``.

    An ``indels.Deletion`` is rebuilt at nucleotide position
    ``mtn["NAPosition"] - 1``, spanning as many nucleotides as there are
    ``-`` characters in ``mtn["Control"]``, with the gene and genotype of
    ``deletion``.  The count must be positive and both mutated genes must
    translate to the same protein.
    """
    position = mtn["NAPosition"] - 1
    gap_count = mtn["Control"].count('-')
    recovered_deletion = indels.Deletion(
        nt_pos=position,
        gene=deletion.gene,
        genotype=deletion.genotype,
        nt_count=gap_count,
        orig_nt=None,
    )
    self.assertGreater(gap_count, 0)

    # Translate to amino-acids to account for synonymous mutations.
    expected_protein = bioseq.translate(deletion.mutated_gene)
    recovered_protein = bioseq.translate(recovered_deletion.mutated_gene)
    self.assertEqual(expected_protein, recovered_protein)
def calc_total_poss_subst(codon):
    """Count the possible synonymous and nonsynonymous point mutations.

    Every position of ``codon`` (``Utility.NUC_PER_CODON`` positions) is
    replaced in turn by each of A, C, T and G that differs from the current
    base; the mutant is synonymous when its translation equals that of the
    original codon (case-insensitive).

    Returns ``(synonymous_count, nonsynonymous_count)`` as floats.
    """
    total_poss_syn = 0.0
    total_poss_nonsyn = 0.0
    orig_aa = str(Seq.translate(codon)).upper()

    for codon_pos in range(0, Utility.NUC_PER_CODON):
        current_base = str(codon[codon_pos]).upper()
        for mut_str in ("A", "C", "T", "G"):
            if mut_str == current_base:
                continue
            mut_codon = codon[:codon_pos] + Seq.Seq(mut_str) + codon[codon_pos+1:]
            mut_aa = str(Seq.translate(mut_codon)).upper()
            if mut_aa == orig_aa:
                total_poss_syn += 1
            else:
                total_poss_nonsyn += 1

    return total_poss_syn, total_poss_nonsyn
def translateDNAtoAA(input_fasta, output_fasta):
    """Translate a one-sequence-per-line FASTA file from DNA to protein.

    Header lines (starting with ``>``) are copied unchanged.  Every other
    line is stripped of its line terminator, must have a length divisible by
    three (AssertionError otherwise), and is translated up to the first stop
    codon.
    """
    with open(input_fasta, 'r') as f:
        with open(output_fasta, 'w+') as g:
            for line in f:
                if line[0] == '>':
                    g.write(line)
                    continue
                # Accept "\n", "\r\n" and an unterminated last line; the
                # original assumed exactly one trailing "\n".
                sequence = line.rstrip('\r\n')
                assert(len(sequence) % 3 == 0)
                g.write(Seq.translate(sequence, to_stop = True) + '\n')
def check_fragments(oligo_file, design_fasta):
    """Check that every ordered oligo encodes one of the designed blocks.

    Both files are read as alternating name/sequence lines.  The name must
    contain a scaffold id (``4AC0`` or ``2uxo``) and a block tag (``B0`` or
    ``B1``).  For designs that pair selects the amino-acid window to
    compare; for oligos it selects the flanking adapters stripped (after
    lower-casing) before translation.  Oligo translations with no matching
    design block, and design blocks left unmatched, are reported on stderr;
    ``done`` is written to stdout at the end.

    Raises:
        Exception: if a name contains no known scaffold/block combination.
    """
    # (scaffold, block tag, window start, window end) in the design sequence
    design_windows = (
        ('4AC0', 'B0', 77, 117),
        ('4AC0', 'B1', 99, 138),
        ('2uxo', 'B0', 62, 100),
        ('2uxo', 'B1', 136, 176),
    )
    # (scaffold, block tag, 5' adapter, 3' adapter) around the oligo payload
    oligo_adapters = (
        ('4AC0', 'B0', 'gtgacccgtccctgggtctcaagat', 'gccttgagaccgggcagaggtcgac'),
        ('4AC0', 'B1', 'tgcccgctgtcttcaggtctcaagta', 'catttgagacctgtagcccggcagtg'),
        ('2uxo', 'B0', 'cgatcgtgcccacctggtctccactg', 'gttctgagaccagttggagcccgcac'),
        ('2uxo', 'B1', 'ctggtgcgtcgtctggtctctggat', 'cgttggagaccggcgaacacttccc'),
    )

    # Both substrings must be tested.  The original ``'4AC0' and 'B0' in pdb``
    # only checked the block tag, so 2uxo entries were handled as 4AC0.
    design_aa_list = []
    with open(design_fasta, 'r') as f:
        for pdb, seq in izip_longest(f, f, fillvalue=None):
            for scaffold, block_tag, lo, hi in design_windows:
                if scaffold in pdb and block_tag in pdb:
                    design_aa_list.append(seq[lo:hi])
                    break
            else:
                raise Exception('Unrecognized design name')

    fragment_list = []
    with open(oligo_file, 'r') as o:
        for pdb, seq in izip_longest(o, o, fillvalue=None):
            for scaffold, block_tag, five_prime, three_prime in oligo_adapters:
                if scaffold in pdb and block_tag in pdb:
                    seq_lower = seq.lower()
                    seq_no_5p = seq_lower.split(five_prime)[1]
                    fragment_list.append(seq_no_5p.split(three_prime)[0])
                    break
            else:
                raise Exception('Unrecognized oligo name')

    missing_list = []
    for item in fragment_list:
        aa_fragment = Seq.translate(item)
        if aa_fragment in design_aa_list:
            # Each design block may be matched by one oligo only.
            design_aa_list.remove(aa_fragment)
        else:
            missing_list.append(aa_fragment)

    if missing_list:
        sys.stderr.write('Error: The following oligo sequences do not match a design amino acid sequence\n')
        for miss in missing_list:
            sys.stderr.write('{0}\n'.format(miss))
    if design_aa_list:
        sys.stderr.write('Error: The following design sequences do not match an oligo sequence\n')
        for design in design_aa_list:
            sys.stderr.write('{0}\n'.format(design))
    sys.stdout.write('done\n')
def get_syn_mutations(self, region, mask_constrained = True):
    """Return a boolean array marking synonymous mutations in ``region``.

    Entry ``[ni, pos]`` is True when replacing the nucleotide at ``pos`` of
    the initial sequence with ``alpha[ni]`` leaves the codon's translation
    unchanged and the consensus amino acid of that codon is identical across
    all valid samples.  With ``mask_constrained`` the positions returned by
    ``self.get_constrained(region)`` are forced to False.

    Prints a message and returns None if ``region`` is not an annotated gene
    or protein.  Any exception during the computation drops into ``pdb``,
    after which None is returned.
    """
    if region in self.annotation and self.annotation[region].type in ['gene', 'protein']:
        try:
            aft = self.get_allele_frequency_trajectories(region)
            if len(aft.mask.shape) == 0:
                aft_valid = np.ones((aft.shape[0], aft.shape[-1]), dtype=bool)
            else:
                # Logical NOT of a boolean array is ``~``; NumPy rejects the
                # unary minus on booleans.
                aft_valid = ~np.array([af.mask.sum(axis=0) for af in aft], dtype=bool)
            gaps = self.get_gaps_by_codon(region)
            initial_seq = self.get_initial_sequence(region)
            consensi = []
            for af in aft:
                tmp = consensus(af)
                tmp[gaps]='N'
                consensi.append(tmp)

            cons_aa = np.array([np.fromstring(Seq.translate(''.join(cons)),
                                              dtype='|S1') for cons in consensi])
            # One flag per codon, repeated for its three nucleotide positions.
            no_substitution = np.repeat(
                np.array([len(np.unique(col[ind]))==1
                          for ind, col in zip(aft_valid.T[::3], cons_aa.T)],
                         dtype=bool), 3)

            syn_muts = np.zeros(aft.shape[1:], dtype=bool)
            for pos in range(aft.shape[-1]):
                ci = pos//3
                rf = pos%3
                codon = ''.join(initial_seq[ci*3:(ci+1)*3])
                for ni,nuc in enumerate(alpha[:4]):
                    mod_codon = codon[:rf] + nuc + codon[rf+1:]
                    try:
                        syn_muts[ni,pos] = (Seq.translate(codon)==Seq.translate(mod_codon))\
                                            *no_substitution[pos]
                    except:
                        syn_muts[ni,pos] = False
            if mask_constrained:
                syn_muts[:,self.get_constrained(region)] = False
            return syn_muts
        except:
            import pdb; pdb.set_trace()
    else:
        print("{0} is not a valid protein or gene".format(region))
        return None
def get_protein_seq(self, transcript_id):
    """Return the translated CDS of a transcript, without a trailing stop.

    The coding region is the slice ``cds_start - 1 : cds_stop`` of the
    transcript sequence (``cds_start`` is therefore treated as 1-based),
    translated with ``Seq.translate``.  A single trailing ``*`` is removed.

    Args:
        transcript_id: identifier passed to ``self.get_transcript`` and
            ``self.get_transcript_seq``.

    Returns:
        The protein sequence, or None when the transcript record or its
        sequence is missing/empty or the record has no truthy
        ``cds_start``.
    """
    gaf_record = self.get_transcript(transcript_id)
    tx_seq = self.get_transcript_seq(transcript_id)
    if not gaf_record or not tx_seq:
        return None
    if "cds_start" not in gaf_record or not gaf_record["cds_start"]:
        return None

    cds = tx_seq[gaf_record["cds_start"] - 1 : gaf_record["cds_stop"]]
    prot_seq = Seq.translate(cds)
    # Guard against an empty translation (CDS shorter than one codon):
    # indexing [-1] on it would raise IndexError.
    if prot_seq and prot_seq[-1] == "*":
        prot_seq = prot_seq[:-1]
    return prot_seq
def create_sequence_dbs_for_GAF(gaf, transcripts_file, output_dir):
    """Build Shove stores of transcript and protein sequences for a GAF db.

    Every FASTA record in *transcripts_file* whose name is a transcript id
    present in *gaf* is written to the transcript store.  If its GAF record
    has a truthy ``cds_start``, the slice ``cds_start - 1 : cds_stop`` is
    translated and written to the protein store (a single trailing ``*`` is
    removed).  Transcripts whose translation contains ``*`` anywhere other
    than as the final character are dropped from both stores.

    Args:
        gaf: nested mapping ``gaf[g][k] -> list of records``; each record
            is indexed by ``'transcript_id'`` (and ``'cds_start'`` /
            ``'cds_stop'`` when coding).
        transcripts_file: FASTA file parsed with ``SeqIO.parse``.
        output_dir: directory in which both ``.shove`` stores are created.

    Returns:
        Tuple ``(transcripts_to_remove, protein_seqs_url)``: the names of
        the dropped transcripts and the ``file://`` URL of the protein store.
    """
    from Bio import SeqIO
    from Bio import Seq
    import os

    print("Indexing GAF db by transcript id...\n")
    # transcript id -> (position in list, first-level key, second-level key)
    gaf_transcript_idx = dict()
    for g in gaf:
        for k in gaf[g].keys():
            for ctr, t in enumerate(gaf[g][k]):
                gaf_transcript_idx[t['transcript_id']] = (ctr, g, k)

    fh_transcripts = SeqIO.parse(transcripts_file, 'fasta')
    transcripts_shlv = Shove("file://" + os.path.join(output_dir, 'GAF_transcript_seqs.fa.shove'))
    protein_seqs_url = "file://" + os.path.join(output_dir, 'GAF_protein_seqs.fa.shove')
    proteins_shlv = Shove(protein_seqs_url)

    print("Writing transcript and protein shove dbs...")
    transcripts_to_remove = list()
    try:
        for j, transcript in enumerate(fh_transcripts):
            if j % 1000 == 0:
                print(j)  # progress indicator

            if transcript.name not in gaf_transcript_idx:
                continue
            ctr, g, k = gaf_transcript_idx[transcript.name]
            gaf_record = gaf[g][k][ctr]

            raw_seq = str(transcript.seq)
            transcripts_shlv[transcript.name] = raw_seq

            if 'cds_start' not in gaf_record or not gaf_record['cds_start']:
                continue

            prot_seq = Seq.translate(raw_seq[gaf_record['cds_start']-1:gaf_record['cds_stop']])
            # endswith() instead of [-1] so an empty translation does not
            # raise IndexError.
            if prot_seq.endswith('*'):
                prot_seq = prot_seq[:-1]
            elif prot_seq.find('*') != -1:
                # skip small number (n=12) transcripts with incorrect CDS coordinates
                transcripts_to_remove.append(transcript.name)
                continue

            proteins_shlv[transcript.name] = prot_seq

        for t in transcripts_to_remove:
            del transcripts_shlv[t]
    finally:
        # Close both stores even when parsing or translation fails.
        transcripts_shlv.close()
        proteins_shlv.close()

    return transcripts_to_remove, protein_seqs_url
def applyBias(multiple_mutations, multiple_group, bias):
    """Determine types for multi mutations using pre-set bias.

    Each entry of *multiple_mutations* is a list whose element 0 is the
    starting sequence and element 2 the mutated sequence.  For the first
    not-yet-handled entry of every group, all orderings of the mismatching
    positions are replayed one substitution at a time; each step is
    labelled 'silent' or 'replacement' by comparing the translations
    before and after, or 'unkown' when either sequence contains '-' or
    'N'.  The first ordering with the most labels equal to *bias* wins,
    and its labels are appended, in order, to the entries that share the
    group value in *multiple_group*.

    Args:
        multiple_mutations: list of mutable mutation records (modified
            in place by appending one label to each).
        multiple_group: group value for each record, parallel to
            *multiple_mutations*.
        bias: the label to maximise ('silent', 'replacement', ...).

    Returns:
        *multiple_mutations*, the same list object that was passed in.
    """
    from itertools import permutations
    from Bio import Seq
    from collections import Counter

    counted = [0] * len(multiple_mutations)
    # enumerate() gives the record's own position directly, instead of
    # re-finding it with list.index() (quadratic and equality-dependent).
    for position, mutation in enumerate(multiple_mutations):
        if counted[position] == 1:
            continue

        original_seq, observed_seq = mutation[0], mutation[2]
        mismatch_positions = [i for i in range(len(original_seq))
                              if original_seq[i] != observed_seq[i]]

        type_list = []
        type_count = []
        for order in permutations(mismatch_positions):
            types = []
            germline = original_seq
            for site in order:
                mutated = germline[:site] + observed_seq[site] + germline[site + 1:]
                if ('-' not in germline and 'N' not in germline
                        and '-' not in mutated and 'N' not in mutated):
                    if Seq.translate(germline) == Seq.translate(mutated):
                        types.append('silent')
                    else:
                        types.append('replacement')
                else:
                    # Spelling kept as-is: this label is part of the output.
                    types.append('unkown')
                germline = mutated
            type_list.append(types)
            type_count.append(Counter(types)[bias])

        # First ordering with the highest number of `bias` labels.
        best_types = type_list[type_count.index(max(type_count))]

        group = multiple_group[position]
        members = [i for i, g in enumerate(multiple_group) if g == group]
        for rank, idx in enumerate(members):
            counted[idx] = 1
            multiple_mutations[idx].append(best_types[rank])
    return multiple_mutations
def translate(nuc):
    """Translate nucleotide sequence to amino acid, keeping gap codons.

    Gap characters ('-') are replaced by 'N' before translation.  Codons
    that were entirely gaps ('---') in *nuc* are reported as '-' in the
    result.  If translation fails, a message is printed and every codon
    is reported as 'X' (or '-' for all-gap codons).

    Args:
        nuc: nucleotide sequence as a string.

    Returns:
        The amino-acid sequence as a string.
    """
    from Bio import Seq
    try:
        # returns string when argument is a string, Bio.Seq otherwise
        tmp_aa = Seq.translate(nuc.replace('-', 'N'))
    except Exception:
        print("translation failed",nuc)
        # Parenthesised: `'X'*len(nuc)//3` would evaluate `str // int`
        # and raise TypeError inside the fallback.
        tmp_aa = 'X' * (len(nuc) // 3)

    return "".join(
        '-' if nuc[i*3:(i+1)*3] == '---' else aa
        for i, aa in enumerate(tmp_aa)
    )
def translate(seq):
    """Translate *seq* in all six reading frames.

    Returns a dict with keys 'First Frame', 'Second Frame' and
    'Third Frame' (translations of *seq* starting at offsets 0, 1 and 2)
    and the same three keys prefixed with 'Complement ' for the reverse
    complement of *seq*.
    """
    frame_labels = ('First Frame', 'Second Frame', 'Third Frame')
    r = {}

    # Forward strand.
    for offset, label in enumerate(frame_labels):
        r[label] = Seq.translate(seq[offset:])

    # Reverse-complement strand.
    revcomp = Seq.reverse_complement(seq)
    for offset, label in enumerate(frame_labels):
        r['Complement ' + label] = Seq.translate(revcomp[offset:])

    return r
def translate_sequence(input_seq):
    """Translate *input_seq* even when its length is not a multiple of 3.

    Bio.Seq.translate complains about a trailing partial codon.  To avoid
    that, the input is padded with 'N' up to the next multiple of three
    before translating, and the residue produced by the padded codon is
    then dropped from the result.
    """
    trailing_bases = len(input_seq) % 3
    if not trailing_bases:
        return Seq.translate(input_seq)

    # One trailing base needs 'NN', two trailing bases need 'N'.
    padded_seq = ''.join([input_seq, 'N' * (3 - trailing_bases)])
    # Remove the last residue: it comes from the artificially completed codon.
    return Seq.translate(padded_seq)[:-1]
def test_translation_using_cds(self):
    """Check ``Seq.translate(..., table=2, cds=True)`` on one valid and
    three invalid coding sequences.

    The valid sequence must translate to "MAIVMGRWKGAR"; each invalid one
    must raise TranslationError.
    """
    valid_cds = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
    self.assertEqual("MAIVMGRWKGAR", Seq.translate(valid_cds, table=2, cds=True))

    invalid_cds = (
        "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCG",    # not multiple of three
        "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA",   # no stop codon
        "GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG",   # no start codon
    )
    for seq in invalid_cds:
        with self.assertRaises(TranslationError):
            Seq.translate(seq, table=2, cds=True)
def translate_en_orf6():
    """Write six-frame translations of every FASTA file in ``tgt_list``.

    For each file name ``tgt`` (relative to ``path_base``) the records are
    read as FASTA and, per record, six protein records are written to
    ``<name without last extension>_translate.fasta``: the translations of
    the sequence at offsets 0, 1, 2 (ORF1-ORF3) and of its reverse
    complement at offsets 0, 1, 2 (ORF4-ORF6).  The ORF tag is appended to
    both ``id`` and ``name``; the description is copied unchanged.  The
    output file name is printed before it is written.
    """
    for tgt in tgt_list:
        # `with` closes the input handle too (it was previously leaked) and
        # closes the output handle even if parsing/translation fails.
        with open(path_base + tgt) as input_handle:
            output_fname = path_base + '.'.join(tgt.split('.')[:-1]) + '_translate.fasta'
            print(output_fname)
            with open(output_fname, 'w') as output_handle:
                for record in SeqIO.parse(input_handle, 'fasta'):
                    # Compute the reverse complement once per record.
                    strands = (record.seq, record.reverse_complement().seq)
                    frames = []
                    for strand_index, strand_seq in enumerate(strands):
                        for offset in range(3):
                            tag = 'ORF%d|' % (3 * strand_index + offset + 1)
                            frames.append(SeqRecord(
                                Seq.translate(strand_seq[offset:]),
                                id=record.id + tag,
                                name=record.name + tag,
                                description=record.description,
                            ))
                    SeqIO.write(frames, output_handle, "fasta")
print s.complement() assert False, "Complement shouldn't work on a protein!" except ValueError : pass try : print s.reverse_complement() assert False, "Reverse complement shouldn't work on a protein!" except ValueError : pass print print "Translating" print "===========" for nucleotide_seq in test_seqs: try : expected = Seq.translate(nucleotide_seq) print "%s\n-> %s" \ % (repr(nucleotide_seq) , repr(expected)) except (ValueError, TranslationError), e : expected = None print "%s\n-> %s" \ % (repr(nucleotide_seq) , str(e)) #Now test the Seq object's method if isinstance(nucleotide_seq, Seq.Seq) : try : assert repr(expected) == repr(nucleotide_seq.translate()) except (ValueError, TranslationError) : assert expected is None #Now check translate(..., to_stop=True) try : short = Seq.translate(nucleotide_seq, to_stop=True)
def translate(nuc, to_stop=False):
    """Translate nucleotide sequence to amino acid.

    Thin wrapper around ``Bio.Seq.translate``; *to_stop* is forwarded
    unchanged.
    """
    from Bio import Seq

    # returns string when argument is a string, Bio.Seq otherwise
    protein = Seq.translate(nuc, to_stop=to_stop)
    return protein
def access_mixed_aa(file_name):
    """(str) -> (list, list, list, list).

    Read a multi-FASTA nucleotide file and translate it codon by codon.
    Gap characters ('-') are treated as 'N'.

    Codons containing a letter that is a key of ``AMBICODON`` are expanded
    with ``nearbyPermutations`` and translated with ``getaalist``; the
    distinct amino acids are joined with '/' (e.g. 'I/L').  All other
    codons are translated directly.

    Returns four lists:
        aa: amino-acid code per processed codon, over all records.
        nucleotide_idx: for codons with more than one possible amino acid,
            the position derived from the codon's end position and the
            index of the ambiguous letter within the codon.
        nucl_codon: the corresponding codons.
        seqids: the corresponding record ids.
    """
    from Bio import SeqIO

    aa = []
    nucleotide_idx = []
    nucl_codon = []
    seqids = []
    n = 3  # codon length

    for seq_record in SeqIO.parse(file_name, 'fasta'):
        seq_id = seq_record.id
        seqline = str(seq_record.seq).replace("-", "N")
        ambi_nucl = AMBICODON.keys()

        # Walk the codons in sequence order; `key` is the 1-based position
        # of the codon's last nucleotide.
        for start in range(0, len(seqline), n):
            codon = seqline[start:start + n]
            key = start + n

            if list_overlap(codon, ambi_nucl):
                d, e, f = codon
                m = [d, e, f]
                # Distinct ambiguous letters present in this codon, eg. ['R'].
                items = list(set(m).intersection(ambi_nucl))
                indexm = m.index(items[0])

                # NOTE(review): the body runs once per distinct ambiguous
                # letter, so a codon with two different ambiguous letters
                # appends two entries to `aa` and applies the position
                # offset twice.  Behaviour preserved -- confirm intent.
                for _ in items:
                    codonlist = list(nearbyPermutations(codon))
                    val = getaalist(codonlist)
                    # Drop duplicate amino acids (eg. ['D', 'D']) and join
                    # the rest, yielding e.g. 'I/L'.
                    val = str("/".join(list(set(val))))
                    if "/" in val:
                        # Shift from the codon end to the ambiguous letter:
                        # index 2 -> key, 1 -> key - 1, 0 -> key - 2.
                        key = key - (2 - indexm)
                        nucleotide_idx.append(key)
                        nucl_codon.append(codon)
                        seqids.append(seq_id)
                    aa.append(val)
            else:
                aa1 = Seq(codon, IUPAC.unambiguous_dna).translate()
                aa.append(str(aa1))

    return aa, nucleotide_idx, nucl_codon, seqids
from pileup_user import read_fa_file
from Bio import Seq

# Report the amino acid at index 274 of the translated reference, before
# and after forcing the nucleotide at index 822 to "T".
reference = read_fa_file("data/reference.fa")
print("Original sequence: " + Seq.translate(reference)[274])

# Index 822 == 3 * 274, i.e. the first base of the codon inspected above.
mutated_reference = "".join(
    "T" if i == 822 else x for i, x in enumerate(reference)
)
print("Read sequence: " + Seq.translate(mutated_reference)[274])