def cutOligos(GeneName, cutsite, DNA):
    """Print oligos for cloning and screening a CRISPR cut site.

    Prints three oligos: the gRNA cloning oligo ("cut"), a colony-screening
    primer ("Lcolony") and a primer upstream of the cut site ("Lup").

    :param GeneName: label appended to each oligo name.
    :param cutsite: 20-mer guide sequence.
    :param DNA: locus sequence (+/- a few kb) containing the cut site on
        either strand.

    Any argument that is empty or None is requested interactively; supplied
    arguments are used as given (they used to be overwritten by prompts).
    """
    # The current cut site architecture is:
    #   YtRNAp-HDV ribozyme - 20nt - gRNA
    if not GeneName:
        GeneName = input("Name, using quotes: ")
    if not cutsite:
        cutsite = input("20-mer cut sequence, using quotes: ")
    if not DNA:
        DNA = input("Locus sequence +/- a few kb, using quotes: ")

    if DNA.find(cutsite) == -1:
        # Cut site not on the given strand: look on the antisense strand.
        DNA = Seq(DNA).reverse_complement()
        if DNA.find(cutsite) == -1:
            print("WARNING: Guide 20-mer sequence not found in locus sequence.")
            return

    # find() gives the start of the 20-mer; +16 is the cut position,
    # 3 nt before the last base of the 20-mer.
    index = DNA.find(cutsite) + 16

    if index < 520:
        # A negative slice start would wrap around to the end of the locus.
        print("WARNING: less than 520 nt upstream of the cut site; "
              "Lup primer is truncated.")
    # This primer covers the 30 nt located 520-490 nt upstream of the cut.
    Lup = DNA[max(index - 520, 0):max(index - 490, 0)]

    cutSequence = Seq("cgggtggcgaatgggacttt") + cutsite + Seq("gttttagagctagaaatagc")
    seqprimer = Seq("gacttt") + cutsite

    print("cut" + GeneName + " " + cutSequence)
    print("Lcolony" + GeneName + " " + seqprimer)
    print("Lup" + GeneName + " " + Lup)
def ReadingFrameFinder(DNASTRING):
    """Translate every AUG-to-stop stretch found in DNASTRING.

    The input is stripped of trailing newlines and T is replaced by U. For
    every AUG, the first in-frame UAA/UAG/UGA downstream closes a candidate
    gene. Each candidate is translated and the protein string is appended to
    the FinalizedProt list, which is defined outside this function.
    Nothing is returned.
    """
    rna = DNASTRING.rstrip("\n").replace("T", 'U')
    total = len(rna)
    stop_codons = ("UAA", "UAG", "UGA")

    candidates = []
    for start in xrange(0, total):
        if rna[start:start + 3] != "AUG":
            continue
        # Walk codon by codon from the start codon to the first stop.
        for pos in xrange(start, total, 3):
            if rna[pos:pos + 3] in stop_codons:
                candidates.append(rna[start:pos + 3])
                break

    for orf in candidates:
        # Drop any trailing partial codon before translating.
        usable = len(orf) - len(orf) % 3
        protein = Seq(orf[:usable], generic_rna).translate()
        FinalizedProt.append(str(protein))
def assign_fitness(nodes):
    '''
    Translate every virus sequence and attach its mutational-tolerance
    fitness. `nodes` is either a list of dicts (result stored under 'tol')
    or a dendropy.Tree (result stored as the node attribute `tol`).
    '''
    aa, sites, wt_aa, aa_prob = load_mutational_tolerance()
    aln = AlignIO.read('source-data/H1_H3.fasta', 'fasta')

    # True for alignment columns in which no sequence has a gap
    aligned = (np.array(aln) != '-').min(axis=0)

    # map alignment positions to sequence positions, keeping aligned columns only
    indices = {
        seq.name: (np.cumsum(np.fromstring(str(seq.seq), dtype='S1') != '-') - 1)[aligned]
        for seq in aln
    }

    # reduce the amino-acid probabilities to the aligned H1 positions
    aa_prob = aa_prob[indices['H1'], :]
    # extra column with a small probability for non-canonical amino acids
    extra_column = 1e-5 * np.ones((aa_prob.shape[0], 1))
    aa_prob = np.hstack((aa_prob, extra_column))

    def tolerance(nucleotides):
        return calc_fitness_tolerance(Seq.translate(nucleotides), aa_prob, aa, indices['H3'])

    if isinstance(nodes, list):
        for node in nodes:
            node['tol'] = tolerance(node['seq'])
    elif isinstance(nodes, dendropy.Tree):
        for node in nodes.postorder_node_iter():
            node.tol = tolerance(node.seq)
def test_reverse_complement_on_proteins(self):
    """Reverse complement must raise ValueError for protein sequences."""
    for protein in protein_seqs:
        # module-level function first, then the bound method
        self.assertRaises(ValueError, Seq.reverse_complement, protein)
        self.assertRaises(ValueError, protein.reverse_complement)
def test_translation_on_proteins(self):
    """Translation must raise ValueError for protein sequences."""
    for protein in protein_seqs:
        self.assertRaises(ValueError, Seq.translate, protein)
        if not isinstance(protein, Seq.Seq):
            # only Seq objects are checked through the method
            continue
        self.assertRaises(ValueError, protein.translate)
def test_translation_to_stop(self):
    """to_stop=True must equal the full translation cut at the first '*'."""
    for candidate in self.test_seqs:
        # trim to whole codons
        whole_codons = 3 * (len(candidate) // 3)
        trimmed = candidate[:whole_codons]
        if not isinstance(trimmed, Seq.Seq) or 'X' in str(trimmed):
            continue
        truncated = Seq.translate(trimmed, to_stop=True)
        full = Seq.translate(trimmed)
        self.assertEqual(str(truncated), str(full.split('*')[0]))

    seq = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
    translated = Seq.translate(seq, table=2, to_stop=True)
    self.assertEqual("VAIVMGRWKGAR", translated)
def test_back_transcription_of_proteins(self):
    """Back-transcription must raise ValueError for protein sequences."""
    for protein in protein_seqs:
        self.assertRaises(ValueError, Seq.back_transcribe, protein)
        if not isinstance(protein, Seq.Seq):
            # only Seq objects are checked through the method
            continue
        self.assertRaises(ValueError, protein.back_transcribe)
def rc_kmers(self, kmers):
    """Collapse k-mers with their reverse complements.

    :param kmers: iterable of k-mers (hashable; they are used as dict keys).
    :return: ``(keys, res)``. ``keys`` lists, in input order, the k-mers
        whose reverse complement had not been kept earlier. ``res`` maps
        every k-mer to its representative: itself when it was kept,
        otherwise its reverse complement.
    """
    res = {}
    keys = []
    # Mirrors `keys` for O(1) membership tests; scanning the list made the
    # loop quadratic in the number of k-mers.
    kept = set()
    for s in kmers:
        rc = Seq.reverse_complement(s)  # computed once per k-mer
        if rc in kept:
            res[s] = rc
        else:
            keys.append(s)
            kept.add(s)
            res[s] = s
    return keys, res
def delGene(geneName, cutsite):
    """Print oligos for a CRISPR-mediated chromosomal deletion.

    Looks up the chromosomal locus and the region to delete in
    ``genomicData[geneName]`` and prints oligos for cloning the Cas9-gRNA
    vector and for building the donor DNA that removes the region.
    Primers Lup + Rdown produce a ~1 kb band if the deletion succeeded.
    Part of yCRISPRv3.

    :param geneName: key into ``genomicData``; also used to label oligos.
    :param cutsite: 20-mer guide sequence expected inside the deleted region.
    :return: tuple ``(Ldown, Rup)`` of donor-DNA primers.
    """
    locus = genomicData[geneName][0]
    deletion = genomicData[geneName][1]

    deletion = Seq(deletion)

    if deletion.find(cutsite) == -1:
        if deletion.reverse_complement().find(cutsite) == -1:
            print ("WARNING: Guide 20-mer sequence not found in deletion region.")

    locus = Seq(locus)

    # index is the start position of the deletion within the locus.
    index = locus.find(deletion)
    if index == -1:
        print ("WARNING: Deletion region not found in locus sequence.")

    # Remove the deleted region; in newlocus, index points at the first
    # nucleotide after the deletion junction.
    newlocus = locus[0:index] + locus[index + len(deletion):]

    Lup = newlocus[index - 500:index - 470]
    Rdown = newlocus[index + 469:index + 499].reverse_complement()

    Rtemp1 = newlocus[:index].reverse_complement()
    Rtemp2 = newlocus[index:].reverse_complement()

    rPrimer, rLength = getPrimer(Rtemp1)
    lPrimer, lLength = getPrimer(newlocus[index:])

    Rup = getOverhang(Rtemp2, rLength) + rPrimer
    Ldown = getOverhang(newlocus[:index], lLength) + lPrimer

    cutSequence = Seq("cgggtggcgaatgggacttt") + cutsite + Seq("gttttagagctagaaatagc")
    seqprimer = Seq("gacttt") + cutsite

    # The labels previously used an undefined name `GeneName`; the
    # parameter is `geneName`.
    print("cut" + geneName + " " + cutSequence)
    print("seq" + geneName + " " + seqprimer)
    print("Lup" + geneName + "del" + " " + Lup)
    print("Rup" + geneName + "del" + " " + Rup)
    print("Ldown" + geneName + "del" + " " + Ldown)
    print("Rdown" + geneName + "del" + " " + Rdown)

    return Ldown, Rup
def calc_total_subst(start_codon, end_codon):
    """
    Returns the fraction of synonymous and nonsynonymous single-base steps
    over all mutational pathways from start_codon to end_codon.

    If several positions differ between the codons, every ordering of the
    differing positions is one pathway; each step of a pathway changes one
    position to the end codon's base. Steps are pooled over all pathways.

    :param Bio.Seq.Seq start_codon: 3bp codon
    :param Bio.Seq.Seq end_codon: 3bp codon
    :return tuple (float, float): (synonymous steps / total steps,
        nonsynonymous steps / total steps); (0.0, 0.0) for identical codons
    :raises ValueError: if a pathway does not end at end_codon
    """
    total_syn = 0.0
    total_nonsyn = 0.0
    total_subs = 0.0
    upper_start_codon = start_codon.upper()
    upper_end_codon = end_codon.upper()
    start_aa = Seq.translate(upper_start_codon)

    # find positions where the codons differ
    diff_pos = [pos for pos, nucstr1 in enumerate(str(upper_start_codon))
                if nucstr1 != str(upper_end_codon[pos])]

    # Traverse all possible pathways from start_codon to end_codon where
    # each stage of a pathway mutates by 1 base.
    for pathway in itertools.permutations(diff_pos):
        print(str(upper_start_codon) + " " + str(upper_end_codon) + " " +
              ",".join([str(x) for x in pathway]))
        # Every pathway starts from the start codon. Without this reset the
        # second and later pathways began at the end codon and counted only
        # no-op "synonymous" steps.
        last_codon = upper_start_codon
        last_aa = start_aa
        for mut_pos in pathway:
            mut_nuc = upper_end_codon[mut_pos]
            mut_codon = last_codon[:mut_pos] + mut_nuc + last_codon[mut_pos+1:]
            mut_aa = Seq.translate(mut_codon)

            total_subs += 1
            if str(last_aa) == str(mut_aa):
                total_syn += 1
            else:
                total_nonsyn += 1

            last_codon = mut_codon
            last_aa = mut_aa

        if str(last_codon) != str(upper_end_codon):
            raise ValueError("Pathway does not yield end codon " + str(last_codon))

    if total_subs:
        ave_syn = total_syn/total_subs
        ave_nonsyn = total_nonsyn/total_subs
    else:
        ave_syn = 0.0
        ave_nonsyn = 0.0

    return ave_syn, ave_nonsyn
def test_reverse_complement(self):
    """Seq.reverse_complement() must agree with the sequence methods."""
    candidates = copy.copy(test_seqs)
    del candidates[21]
    for nucleotide_seq in candidates:
        if isinstance(nucleotide_seq.alphabet, Alphabet.ProteinAlphabet):
            continue
        if not isinstance(nucleotide_seq, Seq.Seq):
            continue
        expected = Seq.reverse_complement(nucleotide_seq)
        by_method = nucleotide_seq.reverse_complement()
        complement = nucleotide_seq.complement()
        self.assertEqual(repr(expected), repr(by_method))
        self.assertEqual(repr(expected[::-1]), repr(complement))
        self.assertEqual(str(nucleotide_seq.complement()),
                         str(Seq.reverse_complement(nucleotide_seq))[::-1])
        self.assertEqual(str(nucleotide_seq.reverse_complement()),
                         str(Seq.reverse_complement(nucleotide_seq)))
def export(self, path = '', extra_attr = None):
    """Write the tree and the sequence differences to JSON files.

    Writes ``<path>tree.json`` (tree with ``extra_attr`` attached) and
    ``<path>sequences.json`` (root nucleotide and protein sequences, plus
    per-clade positions that differ from the root).

    :param path: prefix prepended to both output file names.
    :param extra_attr: node attributes passed to ``tree_to_json``;
        defaults to ``['aa_muts']``.
    """
    from Bio import Seq

    if extra_attr is None:
        # fresh list per call instead of a shared mutable default
        extra_attr = ['aa_muts']

    timetree_fname = path+'tree.json'
    sequence_fname = path+'sequences.json'
    tree_json = tree_to_json(self.tree.root, extra_attr=extra_attr)
    write_json(tree_json, timetree_fname, indent=None)

    elems = {}
    elems['root'] = {}
    elems['root']['nuc'] = "".join(self.tree.root.sequence)
    for prot in self.proteins:
        tmp = str(self.proteins[prot].extract(Seq.Seq(elems['root']['nuc'])))
        # gaps become N so translation yields X, which is written as '-'
        elems['root'][prot] = str(Seq.translate(tmp.replace('-', 'N'))).replace('X','-')

    # The builtin zip replaces itertools.izip, which does not exist on
    # Python 3; the pairs produced are the same.
    for node in self.tree.find_clades():
        if hasattr(node, "clade") and hasattr(node, "sequence"):
            elems[node.clade] = {}
            elems[node.clade]['nuc'] = {pos:state for pos, (state, ancstate) in
                            enumerate(zip(node.sequence, self.tree.root.sequence)) if state!=ancstate}
    for node in self.tree.find_clades():
        if hasattr(node, "clade") and hasattr(node, "translations"):
            # setdefault: a node may have translations but no sequence entry
            clade_elems = elems.setdefault(node.clade, {})
            for prot in self.proteins:
                clade_elems[prot] = {pos:state for pos, (state, ancstate) in
                                enumerate(zip(node.translations[prot], elems['root'][prot])) if state!=ancstate}

    write_json(elems, sequence_fname, indent=None)
def translationBio(data):
    '''Translate every entry of data with Biopython and print the result.

    Each entry is translated with the standard table, stop codons are
    dropped (stop_symbol=''), and the pieces are concatenated in order.
    The concatenated protein is printed and also returned.
    '''
    proteinSeq = ''
    for line in data:
        proteinSeq += Seq.translate(line, table='Standard', stop_symbol='', to_stop=False)
    # Call form prints the same text on Python 2 and Python 3.
    print(proteinSeq)
    return proteinSeq
def translateDNAtoAA(input_fasta, output_fasta, remove_lower_case = False):
    """Translate a one-line-per-record nucleotide FASTA to amino acids.

    Header lines (starting with '>') are copied unchanged. Every other line
    is translated up to the first stop codon and written on its own line.

    :param input_fasta: path of the nucleotide FASTA to read.
    :param output_fasta: path of the protein FASTA to write.
    :param remove_lower_case: if True, lower-case letters are removed from
        each sequence line before translation.
    :raises AssertionError: if a sequence line is not a whole number of codons.
    """
    with open(input_fasta, 'r') as f:
        with open(output_fasta, 'w+') as g:
            for line in f:
                if line[0] == '>':
                    g.write(line)
                    continue
                # Strip the line ending explicitly: line[:-1] left a '\r'
                # on CRLF files and removed the last base of a final line
                # without a newline.
                seq = line.rstrip('\r\n')
                assert len(seq) % 3 == 0, "sequence length is not a multiple of 3"
                if remove_lower_case:
                    # works on Python 2 and 3, unlike str.translate(None, ...)
                    seq = ''.join(base for base in seq
                                  if base not in string.ascii_lowercase)
                g.write(Seq.translate(seq, to_stop = True) + '\n')
def translate(config, rc=False):
    """Return a region of the DDNA file and its protein translation.

    :param config: mapping with 1-indexed inclusive 'startBase' and
        'endBase'; also passed to ``mycoplasma`` and ``ddna``.
    :param rc: if True, the region is reverse-complemented first.
    :return: dict with 'seq' (nucleotides) and 'trans' (translation).
    """
    table = 1
    if mycoplasma(config):
        # table 4 is for mycoplasma ala:
        # http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
        table = 4

    fd, fmap = None, None
    try:
        log.debug("Doing translation with table %d, rc: %s", table, rc)
        fd = os.open(ddna(config), os.O_RDONLY)
        fmap = mmap.mmap(fd, 0, mmap.MAP_SHARED, mmap.PROT_READ)
        # By convention (e.g. from the C or NCBI) the DNA is 1 indexed;
        # our DDNA is a C-style array that is 0 indexed.
        startIdx = config['startBase'] - 1
        # The end index is inclusive but slicing is exclusive, so there is
        # no need to subtract 1.
        endIdx = config['endBase']
        seq = Seq.Seq(fmap[startIdx:endIdx])
        if rc:
            seq = seq.reverse_complement()
        return {
            'seq': str(seq),
            'trans': str(Seq.translate(seq, table))
        }
    finally:
        # `fmap.close` without parentheses never closed the mapping, and a
        # truthiness test would skip a valid file descriptor 0.
        if fmap is not None:
            fmap.close()
        if fd is not None:
            os.close(fd)
def add_translations(self):
    '''
    Translate each node's nucleotide sequence into the proteins listed in
    self.proteins (expected to be SeqFeatures), and record the amino-acid
    differences between each node and its parent.
    '''
    from Bio import Seq

    # Proteins ordered by the start position of their SeqFeature.
    def feature_start(protein_pair):
        return protein_pair[1].start
    sorted_proteins = sorted(self.proteins.items(), key=feature_start)

    for node in self.tree.find_clades(order='preorder'):
        if not hasattr(node, "translations"):
            # OrderedDict keeps genomic order of the translations for
            # downstream assembly.
            node.translations = OrderedDict()
            node.aa_mutations = {}

        for prot, feature in sorted_proteins:
            genome = Seq.Seq("".join(node.sequence))
            coding = str(feature.extract(genome)).replace('-', 'N')
            node.translations[prot] = Seq.translate(coding)

            if node.up is None:
                node.aa_mutations[prot] = []
                continue
            parent_child = zip(node.up.translations[prot], node.translations[prot])
            node.aa_mutations[prot] = [(a, pos, d)
                                       for pos, (a, d) in enumerate(parent_child)
                                       if a != d]

    self.dump_attr.append('translations')
def add_translations(self):
    """Translate each node's sequence into every protein of self.proteins.

    Results are stored in node.translations, keyed by protein name; gaps
    are replaced by N before translating.
    """
    from Bio import Seq
    for node in self.tree.find_clades():
        if not hasattr(node, "translations"):
            node.translations = {}
        for prot in self.proteins:
            genome = Seq.Seq("".join(node.sequence))
            coding = str(self.proteins[prot].extract(genome)).replace('-', 'N')
            node.translations[prot] = Seq.translate(coding)
def mutationType(single_mutations):
    """Find mutations type (R/S) for single mutation.

    Each entry is a list whose element 0 is the germline sequence and
    element 2 the mutated sequence. 'silent', 'replacement' or 'unknown'
    (when either sequence contains '-' or 'N') is appended to the entry.
    The number of entries is printed, and the same list is returned.
    """
    from Bio import Seq
    print(len(single_mutations))
    for mutation in single_mutations:
        germline = mutation[0]
        mutated = mutation[2]
        if '-' in germline or 'N' in germline or '-' in mutated or 'N' in mutated:
            # ambiguous or gapped codons cannot be classified
            mutation.append('unknown')
        elif Seq.translate(germline) == Seq.translate(mutated):
            mutation.append('silent')
        else:
            mutation.append('replacement')
    return single_mutations
def test_stops(self):
    """Stop codons must translate as expected under each codon table."""
    # (expected protein, keyword arguments for Seq.translate)
    expectations = [
        ("***RR", {}),
        ("***RR", {"table": 1}),
        ("***RR", {"table": "SGC0"}),
        ("**W**", {"table": 2}),
        ("**WRR", {"table": 'Yeast Mitochondrial'}),
        ("**WSS", {"table": 5}),
        ("**WSS", {"table": 9}),
        ("**CRR", {"table": 'Euplotid Nuclear'}),
        ("***RR", {"table": 11}),
        ("***RR", {"table": 'Bacterial'}),
    ]
    inputs = [
        self.misc_stops,
        Seq.Seq(self.misc_stops),
        Seq.Seq(self.misc_stops, Alphabet.generic_nucleotide),
        Seq.Seq(self.misc_stops, Alphabet.DNAAlphabet()),
        Seq.Seq(self.misc_stops, IUPAC.unambiguous_dna),
    ]
    for nucleotide_seq in inputs:
        for expected, kwargs in expectations:
            self.assertEqual(expected, str(Seq.translate(nucleotide_seq, **kwargs)))
def calc_total_poss_subst(codon):
    """Count the possible single-base substitutions of a codon.

    Every position is mutated to each of the other three bases; a change
    that keeps the amino acid is synonymous, otherwise nonsynonymous.

    :return: tuple (synonymous count, nonsynonymous count), as floats
    """
    syn_count = 0.0
    nonsyn_count = 0.0
    reference_aa = str(Seq.translate(codon)).upper()

    for position in range(0, Utility.NUC_PER_CODON):
        current_base = str(codon[position]).upper()
        for base in ("A", "C", "T", "G"):
            replacement = Seq.Seq(base)
            if str(replacement).upper() != current_base:
                variant = codon[:position] + replacement + codon[position+1:]
                variant_aa = str(Seq.translate(variant)).upper()
                if variant_aa == reference_aa:
                    syn_count += 1
                else:
                    nonsyn_count += 1

    return syn_count, nonsyn_count
def reverse_complement(sequence):
    """
    Return the reverse complement of a unicode sequence, as unicode.

    Convenience wrapper: the sequence is converted with str() for
    BioPython's reverse_complement and the result is converted back.
    """
    plain = str(sequence)
    flipped = Seq.reverse_complement(plain)
    return unicode(flipped)
def __init__(self, string):
    """Store a sequence, translating it first if it is nucleotide.

    The lower-cased input is kept as `nucleotide` when is_nucleotide()
    accepts it, and is then translated. `primary` holds the protein text
    split at '*'.
    """
    text = string.lower()
    if is_nucleotide(text):
        self.nucleotide = text
        # silence Biopython warnings before translating
        warnings.simplefilter('ignore', BiopythonWarning)
        text = Seq.translate(text).lower()
    self.primary = text.split('*')
    self.secondary = []
    self.structures = []
def oligos_1():
    """Print the forward and reverse oligos, upper-cased.

    The insert is loxp + barcode + kozak + start codon + lox71. The forward
    oligo carries the first overhang in front of it; the reverse oligo is
    the reverse complement of the insert followed by the second overhang.
    """
    fwd_overhang, rev_overhang = 'CATG', 'ACAA'
    bc = 'GATGATTGA'
    kozak = 'gccacc'
    start = 'atg'
    insert = loxp + bc + kozak + start + lox71
    fwd = fwd_overhang + insert
    rev = seq.reverse_complement(insert + rev_overhang)
    print(fwd.upper())
    print(rev.upper())
def translateDNAtoAA(input_fasta, output_fasta):
    """Translate a one-line-per-record nucleotide FASTA to amino acids.

    Header lines (starting with '>') are copied unchanged. Every other line
    is translated up to the first stop codon and written on its own line.

    :param input_fasta: path of the nucleotide FASTA to read.
    :param output_fasta: path of the protein FASTA to write.
    :raises AssertionError: if a sequence line is not a whole number of codons.
    """
    with open(input_fasta, 'r') as f:
        with open(output_fasta, 'w+') as g:
            for line in f:
                if line[0] == '>':
                    g.write(line)
                    continue
                # Strip the line ending explicitly: line[:-1] assumed a
                # single '\n', so CRLF files failed the length check and a
                # final line without a newline lost its last base.
                seq = line.rstrip('\r\n')
                assert len(seq) % 3 == 0, "sequence length is not a multiple of 3"
                g.write(Seq.translate(seq, to_stop = True) + '\n')
def _match_design(name, table):
    """Return the first row of table whose scaffold and block tags both occur in name."""
    for row in table:
        if row[0] in name and row[1] in name:
            return row
    return None


def check_fragments(oligo_file, design_fasta):
    """Check that every ordered oligo encodes one designed block, and vice versa.

    Both files are read as alternating name/sequence lines. For each design
    the amino-acid window of its block is extracted; for each oligo the
    fragment between its flanking adapters is extracted and translated.
    Mismatches in either direction are reported on stderr, and 'done' is
    written to stdout.

    :param oligo_file: path to the oligo file (name line, sequence line).
    :param design_fasta: path to the design FASTA (name line, sequence line).
    :raises Exception: if a name matches no known scaffold/block pair.
    """
    # (scaffold tag, block tag, window start, window end)
    design_windows = (
        ('4AC0', 'B0', 77, 117),
        ('4AC0', 'B1', 99, 138),
        ('2uxo', 'B0', 62, 100),
        ('2uxo', 'B1', 136, 176),
    )
    # (scaffold tag, block tag, 5' adapter, 3' adapter)
    oligo_flanks = (
        ('4AC0', 'B0', 'gtgacccgtccctgggtctcaagat', 'gccttgagaccgggcagaggtcgac'),
        ('4AC0', 'B1', 'tgcccgctgtcttcaggtctcaagta', 'catttgagacctgtagcccggcagtg'),
        ('2uxo', 'B0', 'cgatcgtgcccacctggtctccactg', 'gttctgagaccagttggagcccgcac'),
        ('2uxo', 'B1', 'ctggtgcgtcgtctggtctctggat', 'cgttggagaccggcgaacacttccc'),
    )

    # The scaffold tag must be tested with `in` as well: the old form
    # `'4AC0' and 'B0' in pdb` only checked 'B0', so 2uxo entries were
    # treated as 4AC0.
    design_aa_list = []
    with open(design_fasta, 'r') as f:
        for pdb, seq in izip_longest(f, f, fillvalue=None):
            window = _match_design(pdb, design_windows)
            if window is None:
                raise Exception('Unrecognized design name')
            design_aa_list.append(seq[window[2]:window[3]])

    fragment_list = []
    with open(oligo_file, 'r') as o:
        for pdb, seq in izip_longest(o, o, fillvalue=None):
            flanks = _match_design(pdb, oligo_flanks)
            if flanks is None:
                raise Exception('Unrecognized oligo name')
            seq_lower = seq.lower()
            seq_no_5p = seq_lower.split(flanks[2])[1]
            fragment_list.append(seq_no_5p.split(flanks[3])[0])

    missing_list = []
    for item in fragment_list:
        aa_fragment = Seq.translate(item)
        if aa_fragment in design_aa_list:
            # each design may be matched by one oligo only
            design_aa_list.remove(aa_fragment)
        else:
            missing_list.append(aa_fragment)

    if missing_list:
        sys.stderr.write('Error: The following oligo sequences do not match a design amino acid sequence\n')
        for miss in missing_list:
            sys.stderr.write('{0}\n'.format(miss))
    if design_aa_list:
        sys.stderr.write('Error: The following design sequences do not match an oligo sequence\n')
        for design in design_aa_list:
            sys.stderr.write('{0}\n'.format(design))
    sys.stdout.write('done\n')
def get_sgrna(self):
    """Return a DataFrame of all possible sgRNAs in self.genome.

    Every 20-mer followed by an NGG PAM (forward strand) and every CCN
    followed by a 20-mer (reverse strand; reported reverse-complemented) is
    listed with the columns seqname, start, cut, end, sgrna and pam. The
    table is built once and cached on ``self.sgrna``.
    """
    if not hasattr(self, 'sgrna'):
        # Patterns are wrapped in a lookahead so overlapping sites are
        # reported; a plain finditer() resumes after each 23-nt match and
        # skipped every site overlapping the previous one. Group numbers
        # are the same as in the non-lookahead patterns.
        ngg = re.compile( '(?=([atgcATGC]{20})([atgcATGC](GG|gg|Gg|gG)))' )
        ccn = re.compile( '(?=((CC|cc|Cc|cC)[atgcATGC])([atgcATGC]{20}))' )
        site_len = 23  # 20-nt protospacer + 3-nt PAM
        columns = ['seqname', 'start', 'cut', 'end', 'sgrna', 'pam']
        sgrna = list()
        for chromosome in self.genome:
            sequence = str(chromosome.seq)
            # Lookahead matches are zero-width, so positions are derived
            # from x.start() instead of x.end().
            sglist = [
                {
                    'seqname': chromosome.id,
                    'start': x.start(),
                    'cut': x.start() + site_len - 6,
                    'end': x.start() + site_len - 3,
                    'sgrna': x.group(1),
                    'pam': x.group(2)
                }
                for x in ngg.finditer(sequence)
            ]
            sglist.extend(
                {
                    'seqname': chromosome.id,
                    'start': x.start() + 3,
                    'cut': x.start() + 6,
                    'end': x.start() + site_len,
                    'sgrna': Seq.reverse_complement(x.group(3)),
                    'pam': Seq.reverse_complement(x.group(1))
                }
                for x in ccn.finditer(sequence)
            )
            sgrna.append( pd.DataFrame( sglist, columns = columns ) )
        if sgrna:
            self.sgrna = pd.concat(sgrna, axis = 0, ignore_index = True)
        else:
            # pd.concat() raises on an empty list (genome without chromosomes)
            self.sgrna = pd.DataFrame(columns = columns)
    return self.sgrna
def get_syn_mutations(self, region, mask_constrained = True):
    """Return a boolean array marking synonymous single-base mutations.

    For every position of `region` and each of the first four entries of
    `alpha`, the entry is True when mutating the initial sequence to that
    base keeps the amino acid and the consensus amino acid of the codon
    is the same in all valid samples.

    :param region: annotation name; must be of type 'gene' or 'protein'.
    :param mask_constrained: if True, positions returned by
        ``self.get_constrained(region)`` are set to False.
    :return: boolean array with the shape ``aft.shape[1:]``, or None if the
        region is not a gene/protein or an error occurred.
    """
    if region in self.annotation and self.annotation[region].type in ['gene', 'protein']:
        try:
            aft = self.get_allele_frequency_trajectories(region)
            if len(aft.mask.shape) == 0:
                aft_valid = np.ones((aft.shape[0], aft.shape[-1]), dtype=bool)
            else:
                # `~` is logical not; unary minus on a boolean array is a
                # TypeError in current NumPy.
                aft_valid = ~np.array([af.mask.sum(axis=0) for af in aft], dtype=bool)
            gaps = self.get_gaps_by_codon(region)
            initial_seq = self.get_initial_sequence(region)
            consensi = []
            for af in aft:
                tmp = consensus(af)
                tmp[gaps]='N'
                consensi.append(tmp)

            cons_aa = np.array([np.fromstring(Seq.translate(''.join(cons)),
                               dtype='|S1') for cons in consensi])
            # True (repeated for the 3 codon positions) where all valid
            # samples share the same consensus amino acid
            no_substitution = np.repeat(np.array([len(np.unique(col[ind]))==1
                                for ind, col in zip(aft_valid.T[::3], cons_aa.T)], dtype=bool), 3)

            syn_muts = np.zeros(aft.shape[1:], dtype=bool)
            for pos in range(aft.shape[-1]):
                ci = pos//3   # codon index
                rf = pos%3    # position within the codon
                codon = ''.join(initial_seq[ci*3:(ci+1)*3])
                for ni,nuc in enumerate(alpha[:4]):
                    mod_codon = codon[:rf] + nuc + codon[rf+1:]
                    try:
                        syn_muts[ni,pos] = (Seq.translate(codon)==Seq.translate(mod_codon))\
                                            *no_substitution[pos]
                    except Exception:
                        # untranslatable codon or out-of-range position
                        syn_muts[ni,pos] = False
            if mask_constrained:
                syn_muts[:,self.get_constrained(region)] = False
            return syn_muts
        except Exception:
            # NOTE(review): debugging hook kept from the original; consider
            # logging and re-raising instead of dropping into pdb.
            import pdb; pdb.set_trace()
    else:
        print("{0} is not a valid protein or gene".format(region))
        return None
def getSequences(geneName):
    """Fetch a gene and its surrounding locus from the InterMine service.

    Uses the first row returned by the 'Gene_GenomicDNA' template for
    `geneName` in S. cerevisiae.

    :param geneName: gene identifier to look up.
    :return: tuple ``(geneSeq, locusSeq)``; locusSeq is trimmed to the gene
        +/- 1 kbp when the gene is found in it, otherwise returned whole.
    :raises ValueError: if the query returns no rows.
    """
    from intermine.webservice import Service

    # NOTE(review): `service` (lower case) is not defined in this function;
    # presumably a module-level Service instance - verify.
    template = service.get_template('Gene_GenomicDNA')

    rows = template.rows(
        E = {"op": "LOOKUP", "value": geneName, "extra_value": "S. cerevisiae"}
    )

    for row in rows:
        geneSeq = Seq(row["sequence.residues"])
        locusSeq = Seq(row["chromosome.residues.locus"])

        index = locusSeq.find(geneSeq)
        if index != -1:
            # Reduce the locus to +/- 1 kbp around the gene. The slice end
            # used to be the Seq object itself, which is not a valid index.
            start = max(index - 1000, 0)
            end = index + len(geneSeq) + 1000
            locusSeq = locusSeq[start:end]
        # only the first row is used
        return geneSeq, locusSeq

    raise ValueError("No sequence found for gene " + str(geneName))
def orf_reader(infile):
    """Read a tab-separated ORF table into a dict.

    Lines starting with '#' are skipped. Column 0 is the ORF name, column 1
    a signed integer (negative means reverse strand) and column 4 the
    sequence.

    :param infile: path to the ORF table.
    :return: dict mapping ORF name to ``[oriented_sequence, sequence]``;
        the first element is reverse-complemented when column 1 is negative.
    """
    orfs = {}
    # `with` closes the file; the handle was previously never closed.
    with open(infile, "r") as handle:
        for line in handle:
            if line[0] != "#":
                line_array = line.split("\t")
                if int(line_array[1]) < 0:
                    orfs[line_array[0]] = [Seq.reverse_complement(line_array[4]), line_array[4]]
                else:
                    orfs[line_array[0]] = [line_array[4], line_array[4]]
    return orfs
def translate(seq):
    """Translate seq in three frames, then its reverse complement in three.

    Returns a dict keyed by frame label; the 'Complement ...' entries are
    the frames of the reverse-complemented sequence.
    """
    frame_labels = ('First Frame', 'Second Frame', 'Third Frame')
    r = {}
    for offset, label in enumerate(frame_labels):
        r[label] = Seq.translate(seq[offset:])
    flipped = Seq.reverse_complement(seq)
    for offset, label in enumerate(frame_labels):
        r['Complement ' + label] = Seq.translate(flipped[offset:])
    return r
def setUp(self):
    """Build the nucleotide fixtures: nine Seq objects, then two MutableSeq."""
    immutable_texts = (
        "TCAAAAGGATGCATCATG",
        "ATGAAACTG",
        "ATGAARCTG",
        "AWGAARCKG",  # Note no U or T
        "".join(ambiguous_rna_values),
        "".join(ambiguous_dna_values),
        "AUGAAACUG",
        "ATGAAACTGWN",
        "AUGAAACUGWN",
    )
    mutable_texts = ("ATGAAACTG", "AUGaaaCUG")
    self.test_seqs = [Seq.Seq(text) for text in immutable_texts]
    self.test_seqs.extend(Seq.MutableSeq(text) for text in mutable_texts)
def setUp(self):
    # Fixture: one BED-style row, its transcript sequence wrapped in a
    # SeqRecord, and a dict index keyed by the record id.
    # The row is written space-separated and re-joined with tabs.
    self.bed_row = "\t".join(
        "TRIAE_CS42_1AL_TGACv1_000002_AA0000030.1 0 3539 TRIAE_CS42_1AL_TGACv1_000002_AA0000030.1|m.13 0 + 2 2969 0 1 3539 0"
        .split())
    # Transcript sequence literal; newlines are removed when the Seq is built below.
    self.sequence = """ATCGAGCAGATTGGCCGCAACCTACAACTCCCACGGCCCAAGCACTCTCTCTCTCTCTTTCCCTCTCACC CTCGCCTCCGCTCCCCCATTTCCGAAGTACTCGCGAGCCAGCGGCCTCCAGCTCACCACCGTTTCCGCCG CGCGCAGATCCGCCCAATCCGTGCAGCCTCAGGCCACCGCTCTGGTTCCGTGACATGTGGCGAGGTGGTG GCGCAGACGCTGATGCAGGAGGCGCTCGCGAGGCTGAGGAGCACAACAATGTCGAGGAAGAGGAAGGGAG TGAGGATGGAGATCGGGACCTGCAGAATAAACGTCCTAAAGTGGGTGCTTTTGGCGAAGAAAGCTCTGGT GTTAATGCATCCTTCTTTGGATATGAAGCACCACATTTGCATGCTTTTGCTGAACATGACCATTTGAAGC TGTCACATGGTCCAGAAAATGAATTGGATTTTGGTTTGTCGCTTATCTCAAATGATGGTGGGAATGATAT TCCAAGGGAGACCAACAGTCATGGTGTCTGTGATGTAGAAAGATCAGGTGGAACAAATGCAGAAGATCTT GAAATAAGAATGGACCTATCTGATGATCTCTTGCACCTGATATTCTCCTTCTTATGCCAGAAGGATTTAT GTAGAGCAGGGGCTGCCTGCAAACAGTGGCAGTCTGCTAGTATGCATGAGGATTTCTGGAAATATTTGAA GTTTGAGAACACCAGAATATCTCTGCAGAACTTTGTTAATATTTGCCACCGTTATCAGAATGTGACAAAT CTCAATTTGTCTGGTGTCTTAAGTGCAGAAAGCCTAGTGATTGAAGCAATAACATTCTTAAGGCATCTTA AGACCTTGATAATGGGCAAGGGACAACTGGGAGAAACATTTTTTCAGGCTTTGGCTGAATGCCCATTGTT AAATACTTTAACAGTCAGTGATGCATCCCTTGGTAGTGGCATTCAAGAGGTAACTGTTAATCATGATGGA TTGCATGAACTTCAAATTGTGAAGTGTCGTGCACTCAGAGTATCTATCAGATGCCACCAACTTCGAATAC TGTCTCTGAGGAGAACTGGCATGGCTCATGTATCACTCAATTGTCCTCAGTTGCTTGAATTGGATTTTCA GTCCTGCCATAAGCTTTCTGACACTGCAATTCGTCAAGCAGCGACAGCCTGTCCACTGTTAGCGTCACTA GATATGTCATCCTGCTCGTGTGTTACTGATGAGACATTGCGTGAGATAGCTAATGCATGTCAAAATCTTT CTGTTCTTGATGCATCTAACTGCCCCAACATTTCTTTCGAGTCGGTAAAGCTTCCAATGTTGGTAGACTT GAGACTATCAAGTTGTGAGGGAATCACATCTGCTTCAATGGGTGCAGTATGTTTTAGTCGTATACTTGAG GCGTTGCAACTTGATAATTGTAGCCTGTTGACATCTGTGTCTTTGGATCTGCCACATCTCAAGAATATTA GTCTTGTACACCTCCGCAAGTTTGCTGATTTAAATCTGCGAAGCCCTGTGCTTTCTTACATAAAAGTTTC CAGATGCTCAGCACTTCGTTGTGTTACCATAACATCAAATGCTCTTAAGAAACTGGTGCTTCAAAAACAA GAGAGCCTATGTAATTTATCATTGCAATGCCACAATTTAATTGATGTTGATCTTAGTGATTGCGAGTCAT TGACAAATGAGATCTGCAAAGTTCTCAGTGACGGAGGGGGTTGCCCCATGCTCAGGTCATTAATTCTTGA 
TAATTGTGAGAGTTTGAGTGTCGTGGAACTGAATAATAGTTCTTTGGTTAATCTCTCACTTGCTGGTTGC CGTTCCATGACATTCCTGAAACTTGCATGCCCAAAGCTTCAAGTGGTGATTCTTGATGGTTGTGATCATC TTGAAAGAGCATCATTTTGCCCGGTTGGTCTTGAATCCCTAAACCTTGGAATTTGTCCAAAGTTGAGTGT TCTACGCATAGAGGCCCCAAATATGTCTATATTGGAGCTGAAGGGCTGTGGTGTCCTTTCTGAGGCTTCA ATTAATTGTCCTTGCTTGATATCTTTAGATGCCTCTTTCTGCAGACAGTTTATGGATGATTCGCTGTCCC AAACAGCAGAAGCATGCCCTCTTATTGAACATCTTATATTGTCTTCATGTTTATCCATTGACGTCCGTGG ATTGTCTTCTCTGCATTGCCTTCAGAAGCTGGCCTTGCTTGACCTATCATATACATTTTTGATGAACTTG AAGCCGGTTTTTGACAGTTGTCTGCAGTTGAAGGTCTTGAAACTTTCAGCTTGCAAGTATCTCAGTGATT CATCTTTGGAACCACTCTACAGAGAGGGTGCTCTACCGATGCTCGTTGAGCTAGATCTGTCCTACTCGTC CATTGGGCAGACTGCAATAGAAGAGCTTCTCGCGTGCTGTACAAATTTGGTTAATGTGAACCTAAACGGA TGTACGAACTTGCATGAATTGGTATGTGGATCAGACTATTGCCGGTCCGGTGACATGCCAATTGATGCTT TCCCCCCTGATTCTGCACCAGACAAGACCAAAGAGATCAGGGAGAGTTCGGATTGTCAGCTTGAAGTTCT CAGTTGTACTGGCTGTCCAAATATTAAGAAAGTTGTTATTCCTTCAACGGCCAACTATCTGAATTTGTCT AAGATCAACCTTAATTTGTCTGCAAACTTGAAGGAAGTAGATTTGAAGTGCTCCAATCTTTACAATTTAA ATTTGAGCAATTGTAACTCACTGGAGATTCTGAAGCTTGATTGCCCAAGATTGGCTAACCTCCAACTTTT GGCATGCACAATGTTGCAAGAGGATGAACTGAAATCTGCACTATCCTTTTGCGGTGCATTGGAGATCCTC AATGTGCACTCTTGTCCACAAATAAACACGCTGGATTTTGGCAGGCTACAGGCTGTTTGCCCAACTCTTA AGCGCATCCAGAGCAGCCCCATCGCATAGTATGAAGGATTCTGGTCTTCTTAATGGACTCGAGTAAATAG TCCAGATTTGAAACAGAAAAGGCCATGTCGTACTCTTGTACATATGCAGCACCGCCAATATATTGTATGG CTGCATGTATTAGGGAGCCAGGGCTGACATGAAACCTGTTCTTCCAATCGATTTCTTGTGTTGAATCTAG TTGAAACATGGAAACCGCACTTCCTAGTTTGTATTTGCTTTTGAGGTGCAGTGATGGAGTAAGCAGATCT GTATTTATATGAATGAATAACCATCTTGTTTGGATCGTCGATGTTGTATGCTTCATTGATGACATGGGGT GCTAAGTTTGACTGAAATTACACCAGGTTCTATGGTTCTCTCATAAGGTGCAGTGATTCTGCGGTCTTTA TTAATCTGTCTCAACTGTGACGATGCAACTGAGACGTTTCCATCTGCCGGCTGCTGATGCTGTGAACTCT TGGTAAAAAACCTGGTGTACTTGATCCAAGAGCATTCGTTGGGTCACTTGTATCCTTGAAAATTGAGTAA CTAATAAATGCTGTTGTGTAAAAAAAAGGGGCTTTCTTT"""
    # Record carrying the sequence under the transcript id used in bed_row.
    self.seq = SeqRecord.SeqRecord(
        Seq.Seq(self.sequence.replace("\n", "")),
        id="TRIAE_CS42_1AL_TGACv1_000002_AA0000030.1")
    # Minimal id -> SeqRecord index.
    self.index = dict()
    self.index["TRIAE_CS42_1AL_TGACv1_000002_AA0000030.1"] = self.seq
def Tm_NN(seq, check=True, strict=True, c_seq=None, shift=0, nn_table=DNA_NN3,
          tmm_table=DNA_TMM1, imm_table=DNA_IMM1, de_table=DNA_DE1,
          dnac1=25, dnac2=25, selfcomp=False, Na=50, K=0, Tris=0, Mg=0,
          dNTPs=0, saltcorr=5):
    """Return the Tm using nearest neighbor thermodynamics.

    Arguments:
     - seq: The primer/probe sequence as string or Biopython sequence object.
       For RNA/DNA hybridizations seq must be the RNA sequence.
     - check: If true, seq and c_seq are passed through the module's _check
       function before the calculation. Default=True.
     - strict: Passed to the module's _key_error function when a
       thermodynamic value is missing from a table. Default=True.
     - c_seq: Complementary sequence. The sequence of the template/target in
       3'->5' direction. c_seq is necessary for mismatch correction and
       dangling-ends correction. Both corrections will automatically be
       applied if mismatches or dangling ends are present. Default=None.
     - shift: Shift of the primer/probe sequence on the template/target
       sequence, e.g.::

                           shift=0       shift=1        shift= -1
        Primer (seq):      5' ATGC...    5'  ATGC...    5' ATGC...
        Template (c_seq):  3' TACG...    3' CTACG...    3'  ACG...

       The shift parameter is necessary to align seq and c_seq if they have
       different lengths or if they should have dangling ends. Default=0
     - nn_table: Thermodynamic NN values, eight tables are implemented:
       For DNA/DNA hybridizations:

        - DNA_NN1: values from Breslauer et al. (1986)
        - DNA_NN2: values from Sugimoto et al. (1996)
        - DNA_NN3: values from Allawi & SantaLucia (1997) (default)
        - DNA_NN4: values from SantaLucia & Hicks (2004)

       For RNA/RNA hybridizations:

        - RNA_NN1: values from Freier et al. (1986)
        - RNA_NN2: values from Xia et al. (1998)
        - RNA_NN3: values from Chen et al. (2012)

       For RNA/DNA hybridizations:

        - R_DNA_NN1: values from Sugimoto et al. (1995)

       Use the module's maketable method to make a new table or to update
       one of the implemented tables.
     - tmm_table: Thermodynamic values for terminal mismatches.
       Default: DNA_TMM1 (SantaLucia & Peyret, 2001)
     - imm_table: Thermodynamic values for internal mismatches, may include
       inosine mismatches. Default: DNA_IMM1 (Allawi & SantaLucia, 1997-1998;
       Peyret et al., 1999; Watkins & SantaLucia, 2005)
     - de_table: Thermodynamic values for dangling ends:

        - DNA_DE1: for DNA. Values from Bommarito et al. (2000). Default
        - RNA_DE1: for RNA. Values from Turner & Mathews (2010)

     - dnac1: Concentration of the higher concentrated strand [nM]. Typically
       this will be the primer (for PCR) or the probe. Default=25.
     - dnac2: Concentration of the lower concentrated strand [nM]. In PCR this
       is the template strand which concentration is typically very low and
       may be ignored (dnac2=0). In oligo/oligo hybridization experiments,
       dnac1 equals dnac2. Default=25.
       MELTING and Primer3Plus use k = [Oligo(Total)]/4 by default. To mimic
       this behaviour, you have to divide [Oligo(Total)] by 2 and assign this
       concentration to dnac1 and dnac2. E.g., Total oligo concentration of
       50 nM in Primer3Plus means dnac1=25, dnac2=25.
     - selfcomp: Is the sequence self-complementary? Default=False. If 'True'
       the primer is thought binding to itself, thus dnac2 is not considered.
     - Na, K, Tris, Mg, dNTPs: See method 'Tm_GC' for details. Defaults: Na=50,
       K=0, Tris=0, Mg=0, dNTPs=0.
     - saltcorr: See method 'Tm_GC'. Default=5. 0 means no salt correction.

    """
    seq = str(seq)
    if not c_seq:
        # c_seq must be provided by user if dangling ends or mismatches should
        # be taken into account. Otherwise take perfect complement.
        c_seq = Seq.Seq(seq).complement()
    c_seq = str(c_seq)
    if check:
        seq = _check(seq, 'Tm_NN')
        c_seq = _check(c_seq, 'Tm_NN')
    # Working copies; they are trimmed as end effects are accounted for,
    # while seq and c_seq stay unchanged for the initiation terms.
    tmp_seq = seq
    tmp_cseq = c_seq
    delta_h = 0
    delta_s = 0
    d_h = 0  # Names for indexes
    d_s = 1  # 0 and 1

    # Dangling ends?
    if shift or len(seq) != len(c_seq):
        # Align both sequences using the shift parameter
        # ('.' marks a position without a partner base)
        if shift > 0:
            tmp_seq = '.' * shift + seq
        if shift < 0:
            tmp_cseq = '.' * abs(shift) + c_seq
        if len(tmp_cseq) > len(tmp_seq):
            tmp_seq += (len(tmp_cseq) - len(tmp_seq)) * '.'
        if len(tmp_cseq) < len(tmp_seq):
            tmp_cseq += (len(tmp_seq) - len(tmp_cseq)) * '.'
        # Remove 'over-dangling' ends
        while tmp_seq.startswith('..') or tmp_cseq.startswith('..'):
            tmp_seq = tmp_seq[1:]
            tmp_cseq = tmp_cseq[1:]
        while tmp_seq.endswith('..') or tmp_cseq.endswith('..'):
            tmp_seq = tmp_seq[:-1]
            tmp_cseq = tmp_cseq[:-1]
        # Now for the dangling ends
        if tmp_seq.startswith('.') or tmp_cseq.startswith('.'):
            left_de = tmp_seq[:2] + '/' + tmp_cseq[:2]
            try:
                delta_h += de_table[left_de][d_h]
                delta_s += de_table[left_de][d_s]
            except KeyError:
                _key_error(left_de, strict)
            tmp_seq = tmp_seq[1:]
            tmp_cseq = tmp_cseq[1:]
        if tmp_seq.endswith('.') or tmp_cseq.endswith('.'):
            # the right-end key is built from the reversed pair, c_seq first
            right_de = tmp_cseq[-2:][::-1] + '/' + tmp_seq[-2:][::-1]
            try:
                delta_h += de_table[right_de][d_h]
                delta_s += de_table[right_de][d_s]
            except KeyError:
                _key_error(right_de, strict)
            tmp_seq = tmp_seq[:-1]
            tmp_cseq = tmp_cseq[:-1]

    # Now for terminal mismatches
    left_tmm = tmp_cseq[:2][::-1] + '/' + tmp_seq[:2][::-1]
    if left_tmm in tmm_table:
        delta_h += tmm_table[left_tmm][d_h]
        delta_s += tmm_table[left_tmm][d_s]
        tmp_seq = tmp_seq[1:]
        tmp_cseq = tmp_cseq[1:]
    right_tmm = tmp_seq[-2:] + '/' + tmp_cseq[-2:]
    if right_tmm in tmm_table:
        delta_h += tmm_table[right_tmm][d_h]
        delta_s += tmm_table[right_tmm][d_s]
        tmp_seq = tmp_seq[:-1]
        tmp_cseq = tmp_cseq[:-1]

    # Now everything 'unusual' at the ends is handled and removed and we can
    # look at the initiation.
    # One or several of the following initiation types may apply:

    # Type: General initiation value
    delta_h += nn_table['init'][d_h]
    delta_s += nn_table['init'][d_s]

    # Type: Duplex with no (allA/T) or at least one (oneG/C) GC pair
    if SeqUtils.GC(seq) == 0:
        delta_h += nn_table['init_allA/T'][d_h]
        delta_s += nn_table['init_allA/T'][d_s]
    else:
        delta_h += nn_table['init_oneG/C'][d_h]
        delta_s += nn_table['init_oneG/C'][d_s]

    # Type: Penalty if 5' end is T
    # (the same term is also added when seq ends with A)
    if seq.startswith('T'):
        delta_h += nn_table['init_5T/A'][d_h]
        delta_s += nn_table['init_5T/A'][d_s]
    if seq.endswith('A'):
        delta_h += nn_table['init_5T/A'][d_h]
        delta_s += nn_table['init_5T/A'][d_s]

    # Type: Different values for G/C or A/T terminal basepairs
    ends = seq[0] + seq[-1]
    AT = ends.count('A') + ends.count('T')
    GC = ends.count('G') + ends.count('C')
    delta_h += nn_table['init_A/T'][d_h] * AT
    delta_s += nn_table['init_A/T'][d_s] * AT
    delta_h += nn_table['init_G/C'][d_h] * GC
    delta_s += nn_table['init_G/C'][d_s] * GC

    # Finally, the 'zipping'
    # Each neighbor pair is looked up in the internal-mismatch table first,
    # then in the nearest-neighbor table, trying the reversed key each time.
    for basenumber in range(len(tmp_seq) - 1):
        neighbors = tmp_seq[basenumber:basenumber + 2] + '/' + \
            tmp_cseq[basenumber:basenumber + 2]
        if neighbors in imm_table:
            delta_h += imm_table[neighbors][d_h]
            delta_s += imm_table[neighbors][d_s]
        elif neighbors[::-1] in imm_table:
            delta_h += imm_table[neighbors[::-1]][d_h]
            delta_s += imm_table[neighbors[::-1]][d_s]
        elif neighbors in nn_table:
            delta_h += nn_table[neighbors][d_h]
            delta_s += nn_table[neighbors][d_s]
        elif neighbors[::-1] in nn_table:
            delta_h += nn_table[neighbors[::-1]][d_h]
            delta_s += nn_table[neighbors[::-1]][d_s]
        else:
            # We haven't found the key...
            _key_error(neighbors, strict)

    # Concentration term; the 1e-9 factor converts the nM inputs to mol/l.
    k = (dnac1 - (dnac2 / 2.0)) * 1e-9
    if selfcomp:
        k = dnac1 * 1e-9
        delta_h += nn_table['sym'][d_h]
        delta_s += nn_table['sym'][d_s]
    R = 1.987  # universal gas constant in Cal/degrees C*Mol
    if saltcorr:
        corr = salt_correction(Na=Na, K=K, Tris=Tris, Mg=Mg, dNTPs=dNTPs,
                               method=saltcorr, seq=seq)
    # Method 5 corrects the entropy, methods 1-4 add to Tm, and methods 6-7
    # correct 1/Tm.
    if saltcorr == 5:
        delta_s += corr
    melting_temp = (1000 * delta_h) / (delta_s + (R * (math.log(k)))) - 273.15
    if saltcorr in (1, 2, 3, 4):
        melting_temp += corr
    if saltcorr in (6, 7):
        # Tm = 1/(1/Tm + corr)
        melting_temp = (1 / (1 / (melting_temp + 273.15) + corr) - 273.15)

    return melting_temp
if "AO=" in t[0:3]: ao = int(t.split(",")[0][3:]) if (ao + ro > 0 and float(ao) / (ao + ro) >= args.minab and "," not in vals[3] and "," not in vals[4]): if vals[0] not in vcf: vcf[vals[0]] = [] vcf[vals[0]].append(vals) chroms = refFile.references for chrom in chroms: chromSeq = refFile.fetch(chrom) if chrom not in vcf: rec = SeqRecord.SeqRecord(Seq.Seq(chromSeq), id=chrom, name="", description="") SeqIO.write(rec, outFile, "fasta") continue sys.stderr.write(chrom + "\n") var = vcf[chrom] varPos = [0] + [int(v[1]) - 1 for v in var] varRefLen = [0] + [len(v[3]) for v in var] refSeqs = [ chromSeq[varPos[i - 1] + varRefLen[i - 1]:varPos[i]] + var[i - 1][4] for i in range(1, len(varPos)) ] + [chromSeq[varPos[-1] + varRefLen[-1]:]] newSeq = "".join(refSeqs) rec = SeqRecord.SeqRecord(Seq.Seq(newSeq),
import copy import unittest import warnings from Bio import BiopythonWarning from Bio import Seq from Bio.Data.IUPACData import ( ambiguous_dna_complement, ambiguous_rna_complement, ambiguous_dna_values, ambiguous_rna_values, ) from Bio.Data.CodonTable import TranslationError, standard_dna_table test_seqs = [ Seq.Seq("TCAAAAGGATGCATCATG"), Seq.Seq("T"), Seq.Seq("ATGAAACTG"), Seq.Seq("ATGAARCTG"), Seq.Seq("AWGAARCKG"), # Note no U or T Seq.Seq("".join(ambiguous_rna_values)), Seq.Seq("".join(ambiguous_dna_values)), Seq.Seq("AWGAARCKG"), Seq.Seq("AUGAAACUG"), Seq.Seq("ATGAAA-CTG"), Seq.Seq("ATGAAACTGWN"), Seq.Seq("AUGAAA==CUG"), Seq.Seq("AUGAAACUGWN"), Seq.Seq("AUGAAACTG"), # U and T Seq.MutableSeq("ATGAAACTG"), Seq.MutableSeq("AUGaaaCUG"),
def test_translation_wrong_type(self):
    """Check that the IUPAC complement mapping is rejected as ``table``.

    ``ambiguous_dna_complement`` is passed where a translation table is
    expected, and ``translate`` must answer with ``ValueError``.
    """
    sequence = Seq.Seq("ATCGTA")
    self.assertRaises(
        ValueError, sequence.translate, table=ambiguous_dna_complement
    )
def test_translation_of_gapped_string_with_gap_char_given(self):
    """Translate a gapped plain string with an explicit ``gap`` argument.

    ``gap="-"`` yields a gapped protein; a list as ``gap`` must raise
    ``TypeError`` and the two-character gap ``"-*"`` must raise
    ``ValueError``.
    """
    gapped = "GTG---GCCATTGTAATGGGCCGC"
    self.assertEqual("V-AIVMGR", Seq.translate(gapped, gap="-"))
    with self.assertRaises(TypeError):
        Seq.translate(gapped, gap=[])
    with self.assertRaises(ValueError):
        Seq.translate(gapped, gap="-*")
def test_append_nucleotides(self):
    """Appending one ``Seq("A")`` must leave ``test_chars`` with 5 items."""
    extra = Seq.Seq("A")
    self.test_chars.append(extra)
    self.assertEqual(5, len(self.test_chars))
def test_not_equal_comparsion(self):
    """Two sequences that differ by one trailing base compare as unequal."""
    shorter = Seq.Seq("TCAAA")
    longer = Seq.Seq("TCAAAA")
    self.assertNotEqual(shorter, longer)
def setUp(self):
    """Create the sequence fixtures read by the tests of this case.

    Sets ``s`` plus the ``dna``, ``rna``, ``nuc``, ``protein`` and
    ``test_chars`` lists on the test instance.
    """
    # Short local handles for the two constructors used throughout.
    immutable = Seq.Seq
    mutable = Seq.MutableSeq

    self.s = immutable("TCAAAAGGATGCATCATG")
    self.dna = [
        immutable("ATCG"),
        immutable("gtca"),
        mutable("GGTCA"),
        immutable("CTG-CA"),
    ]
    self.rna = [
        immutable("AUUUCG"),
        mutable("AUUCG"),
        immutable("uCAg"),
        mutable("UC-AG"),
        immutable("U.CAG"),
    ]
    self.nuc = [immutable("ATCG")]
    self.protein = [
        immutable("ATCGPK"),
        immutable("atcGPK"),
        immutable("T.CGPK"),
        immutable("T-CGPK"),
        immutable("MEDG-KRXR*"),
        mutable("ME-K-DRXR*XU"),
        immutable("MEDG-KRXR@"),
        immutable("ME-KR@"),
        immutable("MEDG.KRXR@"),
    ]
    # Mix of plain strings and Seq objects.
    self.test_chars = ["-", immutable("-"), immutable("*"), "-X@"]
def test_concatenation_of_seq(self):
    """``Seq + Seq`` must compare equal to the concatenated plain text."""
    suffix = Seq.Seq("T")
    joined = self.s + suffix
    expected_text = str(self.s) + "T"
    self.assertEqual(expected_text, joined)
    self.assertEqual(self.s + Seq.Seq("T"), "TCAAAAGGATGCATCATGT")
def test_translation_of_stops(self):
    """Translate TA?/NNN style codons in upper, mixed and lower case.

    Each (codon, expected residue) pair is checked in the listed order.
    """
    expectations = (
        ("TAT", "Y"),
        ("TAR", "*"),
        ("TAN", "X"),
        ("NNN", "X"),
        ("TAt", "Y"),
        ("TaR", "*"),
        ("TaN", "X"),
        ("nnN", "X"),
        ("tat", "Y"),
        ("tar", "*"),
        ("tan", "X"),
        ("nnn", "X"),
    )
    for codon, residue in expectations:
        self.assertEqual(Seq.translate(codon), residue)
def test_stops(self):
    """Translate ``misc_stops`` with the default and several named tables.

    The checks run once for the plain string and once for the same text
    wrapped in a ``Seq``.
    """
    # (expected protein, ``table`` argument), in the order they are checked.
    per_table = (
        ("***RR", 1),
        ("***RR", "SGC0"),
        ("**W**", 2),
        ("**WRR", "Yeast Mitochondrial"),
        ("**WSS", 5),
        ("**WSS", 9),
        ("**CRR", "Euplotid Nuclear"),
        ("***RR", 11),
        ("***RR", "Bacterial"),
    )
    candidates = [self.misc_stops, Seq.Seq(self.misc_stops)]
    for nucleotide_seq in candidates:
        # Default table first, then every explicit table.
        self.assertEqual("***RR", Seq.translate(nucleotide_seq))
        for expected, table in per_table:
            self.assertEqual(
                expected, Seq.translate(nucleotide_seq, table=table)
            )
def test_translation_with_codon_table_as_table_argument(self):
    """Pass the imported ``standard_dna_table`` object itself as ``table``."""
    protein = Seq.translate("GTGGCCATTGTAATGGGCCGC", table=standard_dna_table)
    self.assertEqual("VAIVMGR", protein)
def test_translation_of_invalid_codon(self):
    """Each malformed codon must make ``translate`` raise TranslationError."""
    bad_codons = ("TA?", "N-N", "AC_", "Ac_")
    for codon in bad_codons:
        self.assertRaises(TranslationError, Seq.translate, codon)
def test_append_proteins(self):
    """Appending three more ``Seq`` items must bring ``test_chars`` to 7."""
    for residues in ("K", "K-", "K@"):
        self.test_chars.append(Seq.Seq(residues))
    self.assertEqual(7, len(self.test_chars))
def test_translation_of_string(self):
    """The module-level ``translate`` accepts a plain ``str``."""
    protein = Seq.translate("GTGGCCATTGTAATGGGCCGC")
    self.assertEqual("VAIVMGR", protein)
def setUp(self):
    """Create the ``dna``, ``rna``, ``nuc`` and ``protein`` fixture lists.

    Every list ends with a plain string next to the Seq/MutableSeq objects.
    """
    # Short local handles for the two constructors used throughout.
    immutable = Seq.Seq
    mutable = Seq.MutableSeq

    self.dna = [
        immutable("ATCG"),
        immutable("gtca"),
        mutable("GGTCA"),
        immutable("CTG-CA"),
        "TGGTCA",
    ]
    self.rna = [
        immutable("AUUUCG"),
        mutable("AUUCG"),
        immutable("uCAg"),
        mutable("UC-AG"),
        immutable("U.CAG"),
        "UGCAU",
    ]
    self.nuc = [immutable("ATCG"), "UUUTTTACG"]
    self.protein = [
        immutable("ATCGPK"),
        immutable("atcGPK"),
        immutable("T.CGPK"),
        immutable("T-CGPK"),
        immutable("MEDG-KRXR*"),
        mutable("ME-K-DRXR*XU"),
        "TEDDF",
    ]
def test_gapped_seq_no_gap_char_given(self):
    """A gapped Seq translated with ``gap=None`` raises TranslationError."""
    gapped = Seq.Seq("ATG---AAACTG")
    with self.assertRaises(TranslationError):
        gapped.translate(gap=None)
def setUp(self):
    """Build a ``Seq`` and a ``MutableSeq`` from the same bytes literal."""
    raw_bytes = b"TCAAAAGGATGCATCATG"
    self.s = Seq.Seq(raw_bytes)
    self.mutable_s = Seq.MutableSeq(raw_bytes)
def add_sequence(self, key, sequence):
    """Wrap ``sequence`` in ``sq.Seq`` and store it under ``key``.

    The result is written to ``self.chain_sequences``; an existing entry
    for ``key`` is replaced.
    """
    wrapped = sq.Seq(sequence)
    self.chain_sequences[key] = wrapped
def setUp(self):
    """Provide the single ``Seq`` fixture ``self.s``."""
    reference = "TCAAAAGGATGCATCATG"
    self.s = Seq.Seq(reference)
if min(int(r[1]), int(r[2])) < interval_start: warnings += "NOUPSTREAM," interval_end = interval_start - 1 else: # In case there are multiple called genes in the interval, and since we replaced # interval_end in a previous iteration, we need to make sure we allow # the replaced one to still be the lowest. interval_end = min(interval_end, int(r[1]) - 1) interval_end = min(interval_end, int(r[2]) - 1) else: if max(int(r[1]), int(r[2])) > interval_end: warnings += "NOUPSTREAM," interval_start = interval_end + 1 else: interval_start = max(interval_start, int(r[1]) + 1) interval_start = max(interval_start, int(r[2]) + 1) # Get the actual sequence from what is left. # BUT: Note that the start and stop locations start at 1, while the # array indexes start at 0! startidx = interval_start - 1 stopidx = interval_end - 1 seq = contigseq[startidx:stopidx + 1] if seq.lower().count("n") > options.gapwarn: warnings += "CONTAINSGAP" # We need to do the reverse complement if we are on the "-" strand... if strand == "-": seq = str(Seq.Seq(seq).reverse_complement()) print("%s\t%s\t%s" % (gene, warnings, seq)) con.close()
def test_translation_with_bad_table_argument(self):
    """An empty dict passed as ``table`` must raise ``ValueError``."""
    empty_table = {}
    self.assertRaises(
        ValueError,
        Seq.translate,
        "GTGGCCATTGTAATGGGCCGC",
        table=empty_table,
    )
def test_translation_of_leucine(self):
    """Every listed ambiguous codon must translate to ``"J"``."""
    ambiguous_codons = (
        "WTA",
        "MTY",
        "MTT",
        "MTW",
        "MTM",
        "MTH",
        "MTA",
        "MTC",
        "HTA",
    )
    for codon in ambiguous_codons:
        translated = Seq.translate(codon)
        self.assertEqual("J", translated)
def test_translation_of_asparagine(self):
    """Every listed ambiguous codon must translate to ``"B"``."""
    ambiguous_codons = ("RAY", "RAT", "RAC")
    for codon in ambiguous_codons:
        translated = Seq.translate(codon)
        self.assertEqual("B", translated)
def setUp(self):
    """Build the sequence and annotation fixtures used by the tests.

    Creates four ``SeqRecord`` objects (``seq1`` .. ``seq4``), stores them
    in ``self.index`` keyed by record id, and creates four tab-joined
    annotation lines (``bed1`` .. ``bed4``), one per record id.
    """

    # The sequence literals are wrapped only for layout: every "\n" is
    # removed before the Seq is built. Continuation lines must stay at
    # column 0, because only newlines (not spaces) are stripped.
    seq1 = """CCGAAGAAGAACAAATTCCTTGCTGAATCATGGCGAAGTTGAAGCTCTACTCTTACTGGA
GAAGCTCATGTGCTCATCGCGTCCGTATCGCCCTCACTTTAAAAGGGCTTGATTATGAAT
ATATACCGGTTAATTTGCTCAAAGGGGATCAATCCGATTCAGATTTCAAGAAGATCAATC
CAATGGGCACTGTACCAGCGCTTGTTGATGGTGATGTTGTGATTAATGACTCTTTCGCAA
TAATAATGTACCTGGATGATAAGTATCCGGAGCCACCGCTGTTACCAAGTGACTACCATA
AACGGGCGGTAAATTACCAGGCGACGAGTATTGTCATGTCTGGTATACAGCCTCATCAAA
ATATGGCTCTTTTTGTGAGAAGATGAGATTAATAGGTATCTCGAGGACAAGATAAATGCT
GAGGAGAAAACTGCTTGGATTACTAATGCTATCACAAAAGGATTCACAGGTTTATAACGA
CCTGTCTGATAATGTCTCATATGTCCTTCAGCTCTCGAGAAACTGTTGGTGAGTTGCGCT
GGAAAATACGCGACTGGTGATGAAGTTTACTTGGCTGATCTTTTCCTAGCACCACAGATC
CACGCAGCATTCAACAGATTCCATATTAACATGGAACCATTCCCGACTCTTGCAAGGTTT
TACGAGTCATACAACGAACTGCCTGCATTTCAAAATGCAGTCCCGGAGAAGCAACCAGAT
ACTCCTTCCACCATCTGATTCTGTGAACCGTAAGCTTCTCTCAGTCTCAGCTCAATAAAA
TCTC"""

    self.seq1 = SeqRecord.SeqRecord(Seq.Seq(seq1.replace("\n", "")),
                                    id="CLASS_2.159")

    seq2 = """ACAAAACAAAGTAATCGCGAAAACACACAACAATCGCTGGACTCTGCTACTGCGAAGAAC
AACAAATTCCTTGTTTATCATGGCGAATTCCGGCGAAGAGAAGTTGAAGCTCTACTCTTA
CTGGAGAAGCTCGTGTGCTCATCGTGTCCGTATCGCCCTCGCTTTGAAAGGGCTTGATTA
TGAGTATATACCAGTGAATTTGCTCAAGGGTGATCAATTCGATTCAGTTTATCGTTTTGA
TCTTCAAGATTTCAAGAAGATCAATCCAATGGGAACTGTACCAGCTCTGGTGGATGGAGA
TGTTGTGATTAATGATTCTTTTGCGATAATAATGTATCTGGATGAGAAGTACCCTGAGCC
ACCTTTGTTACCTCGTGACCTCCATAAACGAGCTGTGAATTACCAGGCAATGAGTATTGT
CTTGTCTGGCATACAGCCTCATCAAAATCTGGCTGTTATTAGGTATATCGAGGAAAAGAT
AAATGTGGAGGAGAAGACTGCCTGGGTTAATAATGCTATCACAAAAGGATTTACAGCTCT
CGAGAAACTGTTGGTGAATTGCGCTGGGAAACATGCGACTGGTGATGAAATTTACCTGGC
TGATCTCTTTCTAGCACCACAGATCCACGGAGCAATCAACAGATTCCAGATTAACATGGA
ACCGTACCCAACTCTTGCAAAATGTTACGAATCATACAACGAACTGCCTGCGTTTCAAAA
TGCACTACCGGAAAAGCAGCCAGATGCTCCTTCTTCCACCATCTGATTCTGTGAACCCAT
AAGCTACTCTCACTTTAATAAAACCTCAG"""

    self.seq2 = SeqRecord.SeqRecord(Seq.Seq(seq2.replace("\n", "")),
                                    id="CLASS_2.160")

    seq3 = """GATGCCCTTAGTTTCTCTACTTGTATCATACAATAAAGGTCACAGATTTTGAAATTTGCAAAGATATATC
ATACATTCTCAGAGGAAGCCTTTGTCTCTAAGACTCTGGACCGTCTCCTTAACCGCATCTTCAACCGCAG
TAAAAACCAGCCCGAGCTCAATCAATCGCTTAGCCGCATCATTACACGACGTAAGCCCAGGTTGAGTCTC
TTTATCAAACTTGTGAACAGCAAACTCAGGGAAGAGCTTAGACACAAGAGCAGCAAACTCACTGAACTGA
TAAATCCCATTAGTGCATAAAAACCGTCCAGAAGCGTCAGGTGTTTCGAATAGCATCACATGACCTTTAG
CCACATCTTTCACGTGAACCACACCGAGCCAGTGATGCTCTTGCGTCTCGGTCGAGCCTTGTAAAAGCTG
TAGCAGAACAGCACAGCTTGCGTTTAGGTTCGGTTGCAGAAGCGGTCCGAGACATGTTGATGGATGAATC
GTCACAATGTTGGTTCCATGCTTCTCCGAAAATTCCCAAGCTGCTTTCTCAGCTAATGTCTTCGAAATTG
GATACCATTTCTAAAATCAAAATTATATACCAATCGTTACAAATATCATATAAATCATCACTAACCATTT
TAAACCGAAATTTAACGAACCTGCCTCGACTTGCAAAAATCAAGATCGGACCACGACGACTCATCGACGG
GAACTTTTTCCGGCCAATTAGGGTTAGGAACCAATGCGGAGATAGATGACGTGATCACCACGCGTCTCAC
ATTAAACCTCTTAGCAGCTTCCAACACATTGATCGTTCCCTTAACCGCCGGTTCGACCAGCTCCTTCTCT
GGATCTACCGGTGGATCCAACGTACAAGGTGACGCCACGTGGAACACTCCCGCACATCCATCAATAGCTC
TGGAGATTGCATCAGAGTCTAAGAGATCCGCTTCAAAGATCTTGATCTTAGAATCGGATCCGGGTAGTTG
CAGAAGATGAGTCGGGTCGGATCCTGGGTAAATCGAAGCGTGGATTTTAGTATATCCTTTCTCAATTAAC
GTTCGGATTATCCAAGATCCGATGAAACCATTAGCTCCGGTTACACACACTGTCTCTTTCGCCATTGTTG
ATCAATAAGCGCTCACTGAGAATTTTTTTGTTCTCTCTCTCTATCGCAATTTATCTCAGAAGATAAGAAA
AAAAAAACATCTTTCCAGTAAAAAAGGATCCTTTGTTTTTTTCTTACACGTAAAAAATGGATTTTTTTTT
CTCTCTTAAAGATATAATGCGTTGATACAAAAGCGTAACGTTGACATGATATTATCCACTAGTTTTATAG
ACTTTTCAAAAAAAGGAGAGAATTTTCAATTCTTCAGTAGTCAAATAGATGAAGACCGCCGGAGCGCCGC
CGCAGAGAGGTGGTTCCTCTTCCTCCTCCGCCGTATACTTTAACTGGTCTTCATCATCTTGTTCTTACGA
TAGCTGTAGAGTTTTGGTGGTGAAGATGGGAGGAAAAAGCAAGAAGCCTCATCAATCTTCTTCTTTTAAG
GAGTCAGAGCCAGAACCACCGAGAATCAAATCCAATGTTAAGCATAACTTGCAGCTTCTCAAGTTATGGA
AGGAGTTTCAGAGCAGAGGATCTGGCATGGCTAAGCCAGCGACTAGTTACAGGAAGAAGAAAGTAGAGAA
AGACGAGTTACCGGATGATAGCGAGCTCTACCGGGATCCTACAAATACGCTTTACTACACGAACCAAGGT
CTATTGGATGACGCAGTTCCGGTTTTGCTTGTTGATGGTTATAATGTGTGTGGATATTGGATGAAGTTAA
AGAAACATTTCATGAAAGGAAGGCTTGACGTTGCTCGGCAGAAGTTAGTTGATGAACTTGTGTCCTTCAG
TATGGTTAAAGAGGTTAAGGTAGTGGTTGTGTTTGATGCTCTCATGTCTGGTCTTCCTACTCACAAGGAA
GACTTTGCAGGTGTTGATGTGATTTTCTCAGGAGAAACTTGTGCTGACGCTTGGATTGAAAAGGAGGTGG
TTGCATTGAGAGAAGATGGATGCCCCAAGGTTTGGGTTGTAACATCTGATGTCTGTCAACAACAAGCAGC
ACATGGAGCGGTATTGGGGCATCATATCGATGTTATAAACTCGTTATGTTCATATCTTGTTTTTGATTTT
GGTGACTGATTCTTGACAGGGAGCTTATATTTGGAGTAGCAAGGCATTGGTTTCTGAGATTAAATCGATG
CATAAGGAGGTTGAGAAAATGATGCAAGAAACAAGGTCAACATCTTTCCAAGGGAGATTGCTTAAACACA
ATCTTGATTCTGAAGTCGTTGATGCTCTTAAAGATCTTAGAGACAAATTATCAGAAAACGAAACAAAGAG
ATGACAAAAAGACCAATCCGGATTATATAAACAATTAACAAGGCTTGGTCTCTCCATGTAACTTCTGTCC
CAAGTAAGTAAGCTAATCTGACTTGTAAAAAACAGAGGCTGCAGAGGAAACGAGGGAGATAGAGAGAGAG
AGAGCTCAAATGCTTTGTTATTGTTGTATTTGTGTCTGAATTCTTTTTGACTAATCTATATATAGATTCG
TTTTCTTTGGTCCAAACATATGGTTAAAAGATAGTTCTGAATTTTTCTTTTAGCTTCATGCATAAGAATC
ATCTTAACCTAATAACCTATGTTTATTATTTTACAATAATGTAAAAATGTAAATTTTTAGTTGAATAATG
AACCAAATTTTTATGTAAAAAAACTTGGATGTTTATTTTCAAACACAAACATCAGTAACACTTGAAGCAG
TAGAGAGAATTGGAGGCAGAGCAAGTCTACAAATTTGCAGATAGTTCCAGGGTTTGAGCTGTTTGTTCTG
GTCAGTCTCCAATCAATCAAAGCATATGGTTTATCGAGAATGGATAGAGATTCAAGAGAAGATTGAAGAA
CTGAGTTTGCAAAGGCTTATCAATGCCTTCGACTTCGAGTTGAGATTGAAGAAAAGGTAAAGAAATAGCA
AGTGATCTTTTGAAAATAGATCTCATATATTAATGACTTTCCATGTCTGTATTTGCTGAAGTTGATCTGA
ATTTGCATATTGTTCATGTCAATGGATTGTCTGCTGTTACTAAATTTAACTTTGTGTCAGCACTCTTTAC
GTTTTGAATTGTCGAACCATTCACTTGTTCAGTTATTATTTGGTCTATCCATCCTTATATGTTGTTCTCT
GTTTAGATAAGGACAAAGAATAGACACCAGAGGAACTGAACCAAACAGCTGAGGCAGTTGGATATGGTGC
GGTGAAGTAAGTATACGTATCATCTCTATTCTACTGGTCACATGTCATGAGCAGGGAAATTACAGCCGTT
TATCAGAAAGTCTGGCAAAGACATAGATGAGCTGAAACAGACGGTTGAGGAAGCTTACACCAACTTGTTA
CCGAGCGTACTGTGCGAGTACCTCTACAGATTATCTGAACACTACACGGACTAGCGTACCATGAAATTTG
TGGATTGGCCTCTGCAGCTTTGTTTGAAATTCACTATAGCTTAGATGGCGAATTGGATTTAGACATGGAC
TTCCGGATTGTATGTTGTCTTTGAGTCTCAAGGGATTGATTAATGTGATGATATTTATACACCATAGCTG
AAATGAAATTTGTACTTAAAACTGATGGATAATTAATAACAGA"""

    self.seq3 = SeqRecord.SeqRecord(Seq.Seq(seq3.replace("\n", "")),
                                    id="PRJEB7093_DN.7194.1")

    seq4 = """GATGCCCTTAGTTTCTCTACTTGTATCATACAATAAAGGTCACAGATTTTGAAATTTGCA
AAGATATATCATACATTCTCAGAGGAAGCCTTTGTCTCTAAGACTCTGGACCGTCTCCTT
AACCGCATCTTCAACCGCAGTAAAAACCAGCCCGAGCTCAATCAATCGCTTAGCCGCATC
ATTACACGACGTAAGCCCAGGTTGAGTCTCTTTATCAAACTTGTGAACAGCAAACTCAGG
GAAGAGCTTAGACACAAGAGCAGCAAACTCACTGAACTGATAAATCCCATTAGTGCATAA
AAACCGTCCAGAAGCGTCAGGTGTTTCGAATAGCATCACATGACCTTTAGCCACATCTTT
CACGTGAACCACACCGAGCCAGTGATGCTCTTGCGTCTCGGTCGAGCCTTGTAAAAGCTG
TAGCAGAACAGCACAGCTTGCGTTTAGGTTCGGTTGCAGAAGCGGTCCGAGACATGTTGA
TGGATGAATCGTCACAATGTTGGTTCCATGCTTCTCCGAAAATTCCCAAGCTGCTTTCTC
AGCTAATGTCTTCGAAATTGGATACCATTTCTAAAATCAAAATTATATACCAATCGTTAC
AAATATCATATAAATCATCACTAACCATTTTAAACCGAAATTTAACGAACCTGCCTCGAC
TTGCAAAAATCAAGATCGGACCACGACGACTCATCGACGGGAACTTTTTCCGGCCAATTA
GGGTTAGGAACCAATGCGGAGATAGATGACGTGATCACCACGCGTCTCACATTAAACCTC
TTAGCAGCTTCCAACACATTGATCGTTCCCTTAACCGCCGGTTCGACCAGCTCCTTCTCT
GGATCTACCGGTGGATCCAACGTACAAGGTGACGCCACGTGGAACACTCCCGCACATCCA
TCAATAGCTCTGGAGATTGCATCAGAGTCTAAGAGATCCGCTTCAAAGATCTTGATCTTA
GAATCGGATCCGGGTAGTTGCAGAAGATGAGTCGGGTCGGATCCTGGGTAAATCGAAGCG
TGGATTTTAGTATATCCTTTCTCAATTAACGTTCGGATTATCCAAGATCCGATGAAACCA
TTAGCTCCGGTTACACACACTGTCTCTTTCGCCATTGTTGATCAATAAGCGCTCACTGAG
AATTTTTTTGTTCTCTCTCTCTATCGCAATTTATCTCAGAAGATAAGAAAAAAAAAACAT
CTTTCCAGTAAAAAAGGATCCTTTGTTTTTTTCTTACACGTAAAAAATGGATTTTTTTTT
CTCTCTTAAAGATATAATGCGTTGATACAAAAGCGTAACGTTGACATGATATTATCCACT
AGTTTTATAGACTTTTCAAAAAAAGGAGAGAATTTTCAATTCTTCAGTAGTCAAATAGAT
GAAGACCGCCGGAGCGCCGCCGCAGAGAGGTGGTTCCTCTTCCTCCTCCGCCGTATACTT
TAACTGGTCTTCATCATCTTGTTCTTACGATAGCTGTAGAGTTTTGGTGGTGAAGATGGG
AGGAAAAAGCAAGAAGCCTCATCAATCTTCTTCTTTTAAGGAGTCAGAGCCAGAACCACC
GAGAATCAAATCCAATGTTAAGCATAACTTGCAGCTTCTCAAGTTATGGAAGGAGTTTCA
GAGCAGAGGATCTGGCATGGCTAAGCCAGCGACTAGTTACAGGAAGAAGAAAGTAGAGAA
AGACGAGTTACCGGATGATAGCGAGCTCTACCGGGATCCTACAAATACGCTTTACTACAC
GAACCAAGGTCTATTGGATGACGCAGTTCCGGTTTTGCTTGTTGATGGTTATAATGTGTG
TGGATATTGGATGAAGTTAAAGAAACATTTCATGAAAGGAAGGCTTGACGTTGCTCGGCA
GAAGTTAGTTGATGAACTTGTGTCCTTCAGTATGGTTAAAGAGGTTAAGGTAGTGGTTGT
GTTTGATGCTCTCATGTCTGGTCTTCCTACTCACAAGGAAGACTTTGCAGGTGTTGATGT
GATTTTCTCAGGAGAAACTTGTGCTGACGCTTGGATTGAAAAGGAGGTGGTTGCATTGAG
AGAAGATGGATGCCCCAAGGTTTGGGTTGTAACATCTGATGTCTGTCAACAACAAGCAGC
ACATGGAGCGGGAGCTTATATTTGGAGTAGCAAGGCATTGGTTTCTGAGATTAAATCGAT
GCATAAGGAGGTTGAGAAAATGATGCAAGAAACAAGGTCAACATCTTTCCAAGGGAGATT
GCTTAAACACAATCTTGATTCTGAAGTCGTTGATGCTCTTAAAGATCTTAGAGACAAATT
ATCAGAAAACGAAACAAAGAGATGACAAAAAGACCAATCCGGATTATATAAACAATTAAC
AAGGCTTGGTCTCTCCATGTAACTTCTGTCCCAAGTAAGTAAGCTAATCTGACTTGTAAA
AAACAGAGGCTGCAGAGGAAACGAGGGAGATAGAGAGAGAGAGAGCTCAAATGCTTTGTT
ATTGTTGTATTTGTGTCTGAATTCTTTTTGACTAATCTATATATAGATTCGTTTTCTTTG
GTCCAAACATATGGTTAAAAGATAGTTCTGAATTTTTCTTTTAGCTTCATGCATAAGAAT
CATCTTAACCTAATAACCTATGTTTATTATTTTACAATAATGTAAAAATGTAAATTTTTA
GTTGAATAATGAACCAAATTTTTATGTAAAAAAACTTGGATGTTTATTTTCAAACACAAA
CATCAGTAACACTTGAAGCAGTAGAGAGAATTGGAGGCAGAGCAAGTCTACAAATTTGCA
GATAGTTCCAGGGTTTGAGCTGTTTGTTCTGGTCAGTCTCCAATCAATCAAAGCATATGG
TTTATCGAGAATGGATAGAGATTCAAGAGAAGATTGAAGAACTGAGTTTGCAAAGGCTTA
TCAATGCCTTCGACTTCGAGTTGAGATTGAAGAAAAGGTAAAGAAATAGCAAGTGATCTT
TTGAAAATAGATCTCATATATTAATGACTTTCCATGTCTGTATTTGCTGAAGTTGATCTG
AATTTGCATATTGTTCATGTCAATGGATTGTCTGCTGTTACTAAATTTAACTTTGTGTCA
GCACTCTTTACGTTTTGAATTGTCGAACCATTCACTTGTTCAGTTATTATTTGGTCTATC
CATCCTTATATGTTGTTCTCTGTTTAGATAAGGACAAAGAATAGACACCAGAGGAACTGA
ACCAAACAGCTGAGGCAGTTGGATATGGTGCGGTGAAGTAAGTATACGTATCATCTCTAT
TCTACTGGTCACATGTCATGAGCAGGGAAATTACAGCCGTTTATCAGAAAGTCTGGCAAA
GACATAGATGAGCTGAAACAGACGGTTGAGGAAGCTTACACCAACTTGTTACCGAGCGTA
CTGTGCGAGTACCTCTACAGATTATCTGAACACTACACGGACTAGCGTACCATGAAATTT
GTGGATTGGCCTCTGCAGCTTTGTTTGAAATTCACTATAGCTTAGATGGCGAATTGGATT
TAGACATGGACTTCCGGATTGTATGTTGTCTTTGAGTCTCAAGGGATTGATTAATGTGAT
GATATTTATACACCATAGCTGAAATGAAATTTGTACTTAAAACTGATGGATAATTAATAA
CAGA"""

    self.seq4 = SeqRecord.SeqRecord(Seq.Seq(seq4.replace("\n", "")),
                                    id="PRJEB7093_DN.7194.2")

    # Look-up table from record id to the SeqRecord built above.
    self.index = dict()
    self.index[self.seq1.id] = self.seq1
    self.index[self.seq2.id] = self.seq2
    self.index[self.seq3.id] = self.seq3
    self.index[self.seq4.id] = self.seq4

    # Each annotation line is whitespace-split and re-joined with tabs, so
    # the spacing inside the literals does not matter.
    # NOTE(review): the 12 fields look like BED12 records whose first
    # column is one of the record ids above; confirm against the parser
    # under test.
    self.bed1 = "\t".join(
        """CLASS_2.159 0 784 ID=CLASS_2.159|m.24650 0 + 29 386 0 1 784 0"""
        .split())
    self.bed2 = "\t".join(
        "CLASS_2.160 0 809 ID=CLASS_2.160|m.34763 0 + 1 766 0 1 809 0"
        .split())
    self.bed3 = "\t".join(
        "PRJEB7093_DN.7194.1 0 3683 ID=PRJEB7093_DN.7194.1|m.16659 0 - 641 1115 0 1 3683 0"
        .split())
    self.bed4 = "\t".join(
        "PRJEB7093_DN.7194.2 0 3604 ID=PRJEB7093_DN.7194.2|m.16657 0 - 641 1115 0 1 3604 0"
        .split())
def test_translation_of_glutamine(self):
    """Every listed ambiguous codon must translate to ``"Z"``."""
    ambiguous_codons = ("SAR", "SAG", "SAA")
    for codon in ambiguous_codons:
        translated = Seq.translate(codon)
        self.assertEqual("Z", translated)
def test_translation_extra_stop_codon(self):
    """With ``table=2`` and ``cds=True`` this sequence must be rejected.

    ``translate`` is expected to raise ``TranslationError``.
    """
    coding_text = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAGTAG"
    self.assertRaises(
        TranslationError, Seq.translate, coding_text, table=2, cds=True
    )
def test_translation_incomplete_codon(self):
    """Translating a 20-base string must emit a ``BiopythonWarning``."""
    partial = "GTGGCCATTGTAATGGGCCG"
    self.assertWarns(BiopythonWarning, Seq.translate, partial)