def randomize(self, nucleotideSeq):
    """Shuffle the nucleotide characters of a sequence among their own positions.

    The input is lower-cased and wrapped in a ``Seq``.  Every position whose
    character appears in ``NucleotidePermutationRandomization.Nucleotides``
    takes part in the shuffle; every other character keeps its position.

    Returns a tuple ``(permutationsCount, identity, shuffled)``:

    * ``permutationsCount`` -- factorial of the number of shuffled positions
      divided by the product of the factorials of each nucleotide's count;
    * ``identity`` -- fraction of positions holding the same character before
      and after the shuffle;
    * ``shuffled`` -- the resulting sequence as a plain string.

    An empty input raises ``ZeroDivisionError`` when the identity is computed.
    """
    sequence = Seq(nucleotideSeq.lower(), generic_dna)
    shuffled_symbols = NucleotidePermutationRandomization.Nucleotides

    # Number of distinct arrangements of the characters that get shuffled.
    denominator = 1
    shuffled_total = 0
    for symbol in shuffled_symbols:
        occurrences = sum(1 for ch in sequence if ch == symbol)
        denominator *= factorial(occurrences)
        shuffled_total += occurrences
    permutationsCount = factorial(shuffled_total) // denominator

    # Indices that take part in the shuffle, and the characters found there.
    movable = [idx for idx, ch in enumerate(sequence) if ch in shuffled_symbols]
    original_chars = list(sequence)
    pool = [original_chars[idx] for idx in movable]
    random.shuffle(pool)

    # Write the shuffled characters back into the positions they came from.
    result = list(original_chars)
    for idx, ch in zip(movable, pool):
        result[idx] = ch

    unchanged = sum(before == after for before, after in zip(sequence, result))
    identity = unchanged / len(result)
    assert (identity >= 0.0)
    assert (identity <= 1.0)

    return (permutationsCount, identity, ''.join(result))
def ConcatenatingSeq():
    """Print a series of Seq concatenation and case-conversion examples.

    Takes no arguments and returns nothing; all results go to stdout.
    """
    peptide = Seq("EVRNAK", IUPAC.protein)
    nucleotides = Seq("ACGT", IUPAC.unambiguous_dna)
    # The original author marked adding these two as-is as an "#error";
    # both are switched to the generic alphabet before being joined.
    for seq in (peptide, nucleotides):
        seq.alphabet = generic_alphabet
    print(peptide + nucleotides)

    # Join a list of sequences, first with a loop and then with sum().
    parts = [Seq(text, generic_dna) for text in ("ACGT", "AACC", "GGTT")]
    concatenated = Seq("", generic_dna)
    for part in parts:
        concatenated += part
    print('concatenated=', concatenated)
    print('con=', sum(parts, Seq("", generic_dna)))

    # Case conversion and substring membership.
    mixed_case = Seq("acgtACGT", generic_dna)
    print('unper=', mixed_case.upper())
    print('lower=', mixed_case.lower())
    print("GTAC" in mixed_case)
    print("GTAC" in mixed_case.upper())
def add_16db_seqs(focus_file, org, dna, rna, lower):
    """Write every record of a FASTA file to stdout as single-line FASTA.

    focus_file -- path of the FASTA file to read.
    org        -- accepted but not used by this function.
    dna        -- if true, back-transcribe each sequence.
    rna        -- if true, transcribe each sequence (applied before ``dna``).
    lower      -- if true, lower-case each sequence after any conversion.

    The number of records written is reported on stderr and returned.
    """
    written = 0
    with open(focus_file, "r") as handle:
        for written, (title, sequence) in enumerate(SimpleFastaParser(handle), 1):
            if rna:
                sequence = Seq(sequence).transcribe()
            if dna:
                sequence = Seq(sequence).back_transcribe()
            if lower:
                sequence = sequence.lower()
            sys.stdout.write(">%s\n%s\n" % (title, sequence))
    sys.stderr.write("focusDB seqeunces: {}\n".format(written))
    return written
def new_silvadb_for_org(count, org, silva, lower, dna, rna):
    """Write the SILVA records whose header contains ``org`` to stdout.

    count -- number of sequences already written elsewhere; only used for
             the combined total reported on stderr.
    org   -- substring that must occur in a record's header line.
    silva -- path of the SILVA FASTA file; a ``.gz``/``.gzip`` extension
             makes it be opened with ``gzip.open``.
    lower -- if true, lower-case each sequence after any conversion.
    dna   -- if true, back-transcribe each sequence.
    rna   -- if true, transcribe each sequence (applied before ``dna``).

    Progress and counts go to stderr; nothing is returned.
    """
    sys.stderr.write("extracting {org} sequences\n".format(org=org))

    extension = os.path.splitext(silva)[-1]
    opener = gzip.open if extension in ['.gz', '.gzip'] else open

    matched = 0
    with opener(silva, "rt") as handle:
        for title, sequence in SimpleFastaParser(handle):
            if org not in title:
                continue
            if rna:
                sequence = Seq(sequence).transcribe()
            if dna:
                sequence = Seq(sequence).back_transcribe()
            if lower:
                sequence = sequence.lower()
            sys.stdout.write(">%s\n%s\n" % (title, sequence))
            matched += 1

    combined = count + matched
    sys.stderr.write("SILVA seqeunces: {}\n".format(matched))
    sys.stderr.write("Combined sequences: {totalcount}\n".format(totalcount=combined))
from Bio.Alphabet import IUPAC, generic_dna
from Bio.Seq import Seq

# A mixed-case sequence: show its repr, then both case conversions.
dna_seq = Seq("acgtACGT", generic_dna)
print(repr(dna_seq))
for change_case in (dna_seq.upper, dna_seq.lower):
    print(change_case())

# Substring test against the original and against the upper-cased copy.
print("TAC" in dna_seq)
print("TAC" in dna_seq.upper())

# Same repr check for a sequence with the strict unambiguous DNA alphabet,
# before and after lower-casing.
strict_dna_seq = Seq("ACGT", IUPAC.unambiguous_dna)
print(repr(strict_dna_seq))
print(repr(strict_dna_seq.lower()))
print(my_seq[::-1]) #reverse fasta_format_string = ">Name\n%s\n" % my_seq print(fasta_format_string) ##连接 list_of_seqs = [Seq("ACGT", generic_dna), Seq("AACC", generic_dna), Seq("GGTT", generic_dna)] concatenated = Seq("", generic_dna) for s in list_of_seqs: concatenated += s print(concatenated) print(sum(list_of_seqs, Seq("", generic_dna))) ##changing case dna_seq = Seq("acgtACGT", generic_dna) print(dna_seq.upper()) print(dna_seq.lower()) print("GTAC" in dna_seq) print("GTAC" in dna_seq.upper()) ##互补序列 my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna) print(my_seq) print(my_seq.complement()) print(my_seq.reverse_complement()) ##转录 coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna) messenger_rna = coding_dna.transcribe() print(messenger_rna) ##反转录 print(messenger_rna.back_transcribe())
except: print("字母表不兼容,连接失败!") #连接后字母表的变化 from Bio.Alphabet import generic_nucleotide nuc_seq = Seq("GATCGATGC", generic_nucleotide) dna_seq = Seq("ACGT", IUPAC.unambiguous_dna) print(nuc_seq.alphabet, "+", dna_seq.alphabet, "=", (nuc_seq + dna_seq).alphabet) #大小写更改 from Bio.Alphabet import generic_dna dna_seq = Seq("acgtACGT", generic_dna) print("原始序列:", dna_seq) print("大写序列:", dna_seq.upper()) print("小写序列:", dna_seq.lower()) #转录 print("\n###############\n5. 转录/逆转录\n---------------") coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna) print("编码链DNA:", coding_dna) template_dna = coding_dna.reverse_complement() print("模板链DNA:", template_dna) messenger_rna = coding_dna.transcribe() print(" mRNA:", messenger_rna, messenger_rna.alphabet) #从模板链去做一个真正的生物学上的转录,需要两步: print(" 真实步骤:", template_dna.reverse_complement().transcribe()) #mRNA 逆向转录为 DNA 编码链 print(" 逆转录:", messenger_rna.back_transcribe())
from Bio.Alphabet import single_letter_alphabet
from Bio.Seq import Seq

# A sequence created without an explicit alphabet.
seq = Seq("ACGT")
print("Sequence: %s" % seq)
print("Alphabet: %s" % seq.alphabet)
print(seq)

# A sequence created with an explicit alphabet.
test_seq = Seq('AGTATCGAATCGA', single_letter_alphabet)
print(test_seq)
print(test_seq.alphabet)

# Indexing, then printing one letter per line.
seq = Seq('ACGTTCGCA')
print(seq[0])
print(seq)
for i, letter in enumerate(seq):
    print(letter)

# Concatenation followed by case conversion.
seq2 = Seq('AAATTT')
seq = seq + seq2
print(seq.lower())
print(seq.upper())

# Surrounding whitespace, before and after strip().
seq3 = Seq(' ACGT ')
print(seq3)
print(seq3.strip())
# NOTE(review): my_seq is bound earlier in the file, outside this fragment.
# Basic inspection: whole sequence, length, single letters and a slice.
print(my_seq)
print(len(my_seq))
print(my_seq[0])
print(my_seq[1])
print(my_seq[0:3])

# The calls below are bare expression statements: when run as a script their
# results are discarded (they are only echoed in an interactive session).
my_seq.count("G")
my_seq.count("C")
my_seq.count("A")
my_seq.count("T")
my_seq.lower()
my_seq.upper()
my_seq.complement()
my_seq.reverse_complement()
my_seq.transcribe()
my_seq.translate()

from Bio.SeqUtils import GC
# GC content as returned by GC()
print(GC(my_seq))
# AT content, computed as 100 minus the GC value
print(100-GC(my_seq))
from Bio.Alphabet import IUPAC
from Bio.Alphabet import generic_dna
from Bio import SeqIO
# Seq is used below but was not imported anywhere in the visible script,
# which raises NameError; a repeated import is harmless if the file
# already imports it further up.
from Bio.Seq import Seq

# BIOPYTHON TUTORIAL
# Earlier experiments, left commented out by the original author:
#
# x = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
# y = print(str(x))
# print(repr(y))
# for letter, index in enumerate(x):
#     print("%i %s" % (letter, index))
# print(len(x))
# print(repr(x[0:12]))  # repr for print string + seq type
# fasta_format_string = ">Name\n%s\n" % x
# print(fasta_format_string)
# prot = Seq("EVRNAK" , IUPAC.protein)
# dna = Seq("ATGCCC" , IUPAC.unambiguous_dna)
# print(str(prot) + str(dna))
# for e in SeqIO.parse("hans.fasta", "fasta"):  # import FASTA sequences (SeqIO required)
#     print(e)

# Lower-case a strict-alphabet DNA sequence, then complement the result,
# printing the repr of each step.
dnas = Seq("ACGT", IUPAC.unambiguous_dna)
krass = dnas.lower()
print(repr(krass))
oki = krass.complement()
print(repr(oki))
from Bio.Seq import Seq

# Print the mixed-case sequence upper-cased, then lower-cased.
tatabox_seq = Seq("tataaaggcAATATGCAGTAG")
upper_case = tatabox_seq.upper()
print(upper_case)
lower_case = tatabox_seq.lower()
print(lower_case)
# 4.4.3.case.py
from Bio.Seq import Seq

tatabox_seq = Seq("tataaaggcAATATGCAGTAG")

# Output noted by the original author:
#   TATAAAGGCAATATGCAGTAG
#   tataaaggcaatatgcagtag
for change_case in (tatabox_seq.upper, tatabox_seq.lower):
    print(change_case())
# Drug način za štetje nukleotidov je, da si pripravimo slovar, na primer: # In[2]: freq = {} for x in my_seq: freq[x] = my_seq.count(x) print(freq) # izpiše slovar print('A:', freq['A']) # izpiše, koliko A je v zaporedju # Pogosto želimo, so vsi nukleotidi napisani bodisi z velikimi ali malimi črkami. Za ta namen lahko uporabimo `upper` ali `lower`: # In[3]: print(my_seq.lower()) # Zaporedje, ki ga analiziramo, lahko definiramo direktno, lahko pa si nastavimo, da nas računalnik sam vpraša po njem. Pri tem je pomembno, da ga pretvorimo v ustrezen objekt! # In[4]: # na spodnji način bo my_seq niz (string) my_seq = input('Vpiši nukleotidno zaporedje: ') print(my_seq) # tako pa bo kot "sequence object" (seveda moramo prej uvoziti ustrezen modul - glej zgoraj) my_seq = Seq(input('Vpiši nukleotidno zaporedje: ')) print(my_seq) # --- # ## Naloga #
# Characters accepted by the initial filter and by the strict upper-case check.
_KAMINOAN_ANY_CASE_BASES = frozenset("ACGTacgt")
_KAMINOAN_UPPER_BASES = frozenset("ACGT")
# Minimum mt.Tm_NN value required of the overlap spanning each junction.
_KAMINOAN_OVERLAP_TM = 47


def _kaminoan_grow_primer(template, start_len, min_tm, from_end=False):
    """Return ``(primer, tm)``: the shortest primer of at least ``start_len``
    bases taken from the start (or, with ``from_end``, the end) of
    ``template`` whose ``mt.Tm_NN`` value is not below ``min_tm``."""
    length = start_len
    while True:
        if from_end:
            primer = Seq(template[len(template) - length:len(template)])
        else:
            primer = Seq(template[0:length])
        tm = mt.Tm_NN(primer)
        if not tm < min_tm:
            return primer, tm
        if length >= len(template):
            # Growing further cannot add bases; the original loop never
            # terminated (or wrapped around the slice) at this point.
            raise ValueError(
                "fragment of %d bp is too short to reach a primer Tm of %s"
                % (len(template), min_tm))
        length += 1


def _kaminoan_junction(upstream, downstream, up_len, down_len, grow_upstream_first):
    """Return ``(upstream_tail, downstream_head)`` as Seq objects whose
    concatenation has ``mt.Tm_NN`` of at least ``_KAMINOAN_OVERLAP_TM``.

    Starts from the last ``up_len`` bases of ``upstream`` and the first
    ``down_len`` bases of ``downstream`` and lengthens the two sides
    alternately, one base at a time, re-checking Tm after every step."""
    up_part = Seq(upstream[len(upstream) - up_len:len(upstream)])
    down_part = Seq(downstream[0:down_len])
    grow_upstream = grow_upstream_first
    while mt.Tm_NN(up_part + down_part) < _KAMINOAN_OVERLAP_TM:
        if grow_upstream:
            if up_len >= len(upstream):
                raise ValueError("upstream fragment is too short to build the overlap")
            up_len += 1
            up_part = Seq(upstream[len(upstream) - up_len:len(upstream)])
        else:
            if down_len >= len(downstream):
                raise ValueError("downstream fragment is too short to build the overlap")
            down_len += 1
            down_part = Seq(downstream[0:down_len])
        grow_upstream = not grow_upstream
    return up_part, down_part


def Kaminoan(frag_list, Dtemp = In_temp, OL = In_length, ty = Format, n = Name):
    """Design overlap primers for assembling ``frag_list`` into a circular plasmid.

    frag_list -- list of fragment strings, in assembly order.
    Dtemp     -- minimum ``mt.Tm_NN`` value for the annealing part of each primer.
    OL        -- starting length of the annealing part of each primer
                 (the original docstring asks for an integer between 9 and 25).
    ty        -- output format passed to ``SeqIO.write``: "genbank" or "fasta".
    n         -- record name, also used as the output file name.

    Fragments containing characters other than A/C/G/T (either case) are
    dropped silently.  If any remaining fragment contains a character that is
    not upper-case A/C/G/T, a message is printed and ``None`` is returned.

    Otherwise returns ``(output, totalsequence)``: a pandas DataFrame with one
    row of forward/reverse primers per fragment, and a SeqRecord of the
    concatenated fragments with one ``misc_feature`` per fragment.  The record
    is written to the file ``n`` when ``ty`` is "genbank" or "fasta".

    Each forward primer is the (lower-cased) tail of the preceding fragment
    followed by the head of its own fragment; each reverse primer is the
    reverse complement of the head of the following fragment (lower-cased)
    followed by the reverse complement of its own fragment's tail.  The first
    and last fragments are treated as neighbours.

    Raises ValueError when a fragment is too short for a primer or overlap to
    reach its target Tm (the original code did not terminate in that case).
    """
    output = pd.DataFrame(columns = ["Seq_Length", "Fragments", "FW_Primer", "FW_length", "FW_inital_temp", "FW_final_temp", "RV_Primer", "RV_length", "RV_inital_temp", "RV_final_temp"])

    # Drop fragments containing anything other than A/C/G/T in either case.
    frag_list = [frag for frag in frag_list
                 if all(base in _KAMINOAN_ANY_CASE_BASES for base in frag)]

    # Length warnings, and collection of every character that is not an
    # upper-case A/C/G/T as (fragment index, position, character).
    noncon = []
    for i, frag in enumerate(frag_list):
        if len(frag) < 60:
            print("fragment ", i, " is ", len(frag), " bp, attempting PCR of fragments smaller than 60bp is not recommended")
        if len(frag) < 100:
            print("fragment ", i, " is ", len(frag), " bp long, consider just annealing oligos rather than PCR for short fragments such as these")
        noncon.extend((i, j, base) for j, base in enumerate(frag)
                      if base not in _KAMINOAN_UPPER_BASES)
    if noncon:
        # Report the first offending fragment together with the full list.
        print("Fragment ", noncon[0][0], " contains non-conventional (non-ACTG) characters and cannot be processed. \nViolating fragments, positions and characters are:", noncon)
        return None

    last = len(frag_list) - 1
    for i, frag in enumerate(frag_list):
        # --- forward primer -------------------------------------------------
        if i == 0:
            # Junction between the last fragment and the first one.
            C_FWpast, C_FWnow = _kaminoan_junction(
                frag_list[-1], frag, 8, 7, grow_upstream_first=False)
            fw_overhang = C_FWpast
        else:
            # Tail of the previous fragment, sized in the previous iteration.
            fw_overhang = prev_tail
        FW_INI, mtFW_INI = _kaminoan_grow_primer(frag, OL, Dtemp)
        FW_TOT = fw_overhang.lower() + FW_INI
        mtFW_TOT = mt.Tm_NN(FW_TOT)

        # --- reverse primer -------------------------------------------------
        if i == last:
            # The last fragment closes the circle onto fragment 0, so its
            # overhang is the head of fragment 0.  The original used
            # C_FWpast here, i.e. the last fragment's own tail, which
            # duplicates that tail instead of overlapping fragment 0.
            rv_overhang = C_FWnow
        else:
            prev_tail, next_head = _kaminoan_junction(
                frag, frag_list[i + 1], 7, 8, grow_upstream_first=True)
            rv_overhang = next_head
        RV_INI, mtRV_INI = _kaminoan_grow_primer(frag, OL, Dtemp, from_end=True)
        RV_TOT = rv_overhang.reverse_complement().lower() + RV_INI.reverse_complement()
        mtRV_TOT = mt.Tm_NN(RV_TOT)

        # say where its going:
        fragments = "Inserts %0.0f between %0.0f and %0.0f" % (i, i-1, i+1)
        output.loc[i] = [len(frag), fragments, str(FW_TOT), len(FW_TOT), mtFW_INI, mtFW_TOT, str(RV_TOT), len(RV_TOT), mtRV_INI, mtRV_TOT]

    total = "".join(str(frag) for frag in frag_list)
    if len(total) > 15000:
        print("WARNING: total plasmid lengths of 15kb may result in cloning problems, consult Huang et al (2017)")

    totalsequence = SeqRecord(Seq(str(total), IUPAC.unambiguous_dna), id="pKam"+date, name=n, description="plasmid cloned in silico using Kaminoan in vivo cloner on the " + date)
    # Annotate each fragment's span on the assembled record.
    bp = 0
    for frag in frag_list:
        my_feature = sf.SeqFeature(sf.FeatureLocation(bp, bp+len(frag)), type="misc_feature")
        totalsequence.features.append(my_feature)
        bp += len(frag)

    # generate output map:
    if ty in ("genbank", "fasta"):
        SeqIO.write(totalsequence, n, ty)
    else:
        print("variable ty must be specified as either genbank (default) or fasta in string format. \n" + ty + " is not an acceptable input for this variable")
    return output, totalsequence
from Bio.Alphabet import IUPAC, generic_dna
from Bio.Seq import Seq
from Bio.SeqUtils import GC

# The IUPAC argument declares the alphabet of the sequence given before it;
# it can be a DNA or a protein alphabet.
my_seq = Seq("CGATGCATGCTAGTC", IUPAC.ambiguous_dna)
print(my_seq)
print(my_seq.alphabet)

# Alternative way: print every letter with its index.
for position, base in enumerate(my_seq):
    print("%i %s" % (position, base))

print(len(my_seq))          # sequence length
print(my_seq[2])            # nucleotide at a specific position
print(my_seq.count("TG"))   # motif count
print(GC(my_seq))           # GC value
print(my_seq[2:7])          # slicing a sequence
print(my_seq[0::3])         # every third character, starting at position 0
print(my_seq[::-1])         # read in reverse
print(my_seq.reverse_complement().complement())  # same as above

print(str(my_seq))
print(my_seq)               # same as above

# Format the sequence into a string laid out as a FASTA record.
fasta_format_string = ">Name\n%s\n" % my_seq
print(fasta_format_string)

lowered = my_seq.lower()
print(lowered)              # lower case
print(my_seq.lower().upper())  # upper case
from Bio.Alphabet import IUPAC, generic_dna, generic_nucleotide
from Bio.Seq import Seq

# Two sequences with different alphabets, and the alphabet of their sum.
nuc_seq = Seq("GATCGATGC", generic_nucleotide)
dna_seq = Seq("ACGT", IUPAC.unambiguous_dna)
print(nuc_seq)
print(dna_seq)
combined = nuc_seq + dna_seq
print(combined.alphabet)

# Join a list of sequences, first with a loop and then with sum().
list_of_seqs = [Seq(text, generic_dna) for text in ("ACGT", "AACC", "GGTT")]
concatenated = Seq("", generic_dna)
for s in list_of_seqs:
    concatenated += s
print(concatenated, ' ', concatenated.alphabet)
print(sum(list_of_seqs, Seq("", generic_dna)))

print(concatenated.lower())
# NOTE(review): dna_seq, protein_seq and generic_alphabet are bound earlier in
# the file, outside this fragment.  The bare expressions throughout this
# fragment only display a value in an interactive session; run as a script,
# their results are discarded.
dna_seq.alphabet = generic_alphabet
protein_seq + dna_seq

# Concatenating a list of sequences with sum()
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
list_of_seqs = [Seq("ACGT", generic_dna), Seq("AACC", generic_dna), Seq("GGTT", generic_dna)]
sum(list_of_seqs, Seq("", generic_dna))

# Changing case
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
dna_seq = Seq("acgtACGT", generic_dna)
dna_seq
dna_seq.upper()
dna_seq.lower()

# Transcription
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna)
coding_dna
messenger_rna = coding_dna.transcribe()
messenger_rna

# Translation
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna)
##连接 list_of_seqs = [ Seq("ACGT", generic_dna), Seq("AACC", generic_dna), Seq("GGTT", generic_dna) ] concatenated = Seq("", generic_dna) for s in list_of_seqs: concatenated += s print(concatenated) print(sum(list_of_seqs, Seq("", generic_dna))) ##changing case dna_seq = Seq("acgtACGT", generic_dna) print(dna_seq.upper()) print(dna_seq.lower()) print("GTAC" in dna_seq) print("GTAC" in dna_seq.upper()) ##互补序列 my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna) print(my_seq) print(my_seq.complement()) print(my_seq.reverse_complement()) ##转录 coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna) messenger_rna = coding_dna.transcribe() print(messenger_rna) ##反转录
from Bio.Seq import Seq

string = "AGTACACTGGT2"
my_seq = Seq("AGTACACTGGT")

# Label/value pairs, printed one per line in this order.
report = (
    ("Sequence:", my_seq),
    ("Lower:", my_seq.lower()),
    ("Complement:", my_seq.complement()),
    ("Reverse complement:", my_seq.reverse_complement()),
)
for label, value in report:
    print(label, value)

# str.isdigit() on the plain string that ends in a digit.
print(string.isdigit())
# NOTE(review): Seq, IUPAC and myseq are bound earlier in the file, outside
# this fragment.
#
# Every print below passes exactly one argument, so the line is valid and
# produces the same text under both the Python 2 print statement and the
# Python 3 print function.  Multi-value prints are joined with "%s %s",
# matching the single space the Python 2 statement put between items.
from Bio.Alphabet import generic_nucleotide

nucseq = Seq('GATCGATGC', generic_nucleotide)
dnaseq = Seq('ACGT', IUPAC.unambiguous_dna)
print("%s %s" % (nucseq.alphabet, dnaseq.alphabet))
# parent + child = parent type
print("%s %s" % (nucseq + dnaseq, (nucseq + dnaseq).alphabet))

from Bio.Alphabet import generic_dna

list_seqs = [Seq('ACGT', generic_dna), Seq('CCGG', generic_dna), Seq('TTATT', generic_dna)]
concatenated = Seq('', generic_dna)
for seq in list_seqs:
    concatenated += seq
print(concatenated)
# the built-in sum gives the same result as the loop above
print(sum(list_seqs, Seq('', generic_dna)))

# Seq shares many methods with str, except join
print("%s %s" % (myseq.upper(), myseq.lower()))
print("%s %s" % ('AA' in myseq, 'AA' in myseq.upper()))
myseq = myseq.upper()
print(myseq.complement())
print(myseq.reverse_complement())
# print(myprot.complement())

# Transcription
myrna = myseq.transcribe()  # replace T with U
print("%s %s" % (myrna, myrna.alphabet))

# real biological transcription
mytemplate = myseq.reverse_complement()
myrna = mytemplate.reverse_complement().transcribe()
print(myrna)
print(myrna.back_transcribe())  # replace U with T (RNA -> DNA)