def Tm_feature(data, pam_audit=True, learn_options=None):
    """Build five melting-temperature features per sequence in ``data["30mer"]``.

    Every value is computed with ``Tm.Tm_staluc(..., rna=True)``: one for the
    whole sequence and one for each of four ``seq[start:stop]`` slices.

    Args:
        data: DataFrame with a ``"30mer"`` column of sequence strings.
        pam_audit: When true, a row whose ``seq[25:27]`` is not ``"GG"`` is
            skipped; its five feature values stay at the initial 1.0.
        learn_options: Optional mapping.  If it contains ``"Tm segments"``,
            that value supplies the four ``(start, stop)`` slices; otherwise
            ``(19, 24)``, ``(11, 19)``, ``(6, 11)`` and ``(4, 24)`` are used.

    Returns:
        DataFrame indexed like ``data`` with the columns
        ``"Tm global_30merTrue"``, ``"5mer_end_True"``, ``"8mer_middle_True"``,
        ``"5mer_start_True"`` and ``"Tm global_spacer_True"``.
    """
    if learn_options is None or "Tm segments" not in learn_options:
        segments = [(19, 24), (11, 19), (6, 11), (4, 24)]
    else:
        segments = learn_options["Tm segments"]

    sequence = data["30mer"].values
    # Start from ones so that rows skipped by the audit keep a 1.0 placeholder.
    featarray = np.ones((sequence.shape[0], 5))
    rna = True

    for i, seq in enumerate(sequence):
        if pam_audit and seq[25:27] != "GG":
            # The audit skips the row.  The original code had a `raise`
            # directly after this `continue`; it could never execute and has
            # been removed.
            continue

        featarray[i, 0] = Tm.Tm_staluc(seq, rna=rna)  # 30mer
        # Columns 1-4, in order: 5 nts immediately proximal of the NGG PAM,
        # 8-mer, 5-mer, 20-spacer.
        for k in range(4):
            segment = segments[k]
            featarray[i, k + 1] = Tm.Tm_staluc(
                seq[segment[0]:segment[1]], rna=rna)

    feat = pd.DataFrame(
        featarray,
        index=data.index,
        columns=[
            "Tm global_30mer%s" % rna,
            "5mer_end_%s" % rna,
            "8mer_middle_%s" % rna,
            "5mer_start_%s" % rna,
            "Tm global_spacer_%s" % rna,
        ],
    )
    return feat
def get_primer(seq, direction, name):
    """Pick a primer from one end of ``seq`` and describe it.

    The candidate starts at 15 bases and grows by one base for as long as its
    ``mt.Tm_staluc`` value is at or below the module-level ``PRIMER_TM`` and
    the length counter is at most 65.

    Args:
        seq: Template sequence.
        direction: ``"fwd"`` takes the primer from the start of ``seq``;
            ``"rev"`` takes it from the end, reverse-complements it with
            ``revcomplement`` and lower-cases it.
        name: Label embedded in the primer name.

    Returns:
        ``[primer_name, primer_seq, primer_tm, primer_length]`` where
        ``primer_name`` is ``"{PRIMER_NUM}_{name}_{direction}"``.

    Raises:
        ValueError: If ``direction`` is neither ``"fwd"`` nor ``"rev"``.
            (Previously this case failed with ``UnboundLocalError``.)

    Side effects:
        Increments the module-level ``PRIMER_NUM`` counter on success.
    """
    # Tm_NN: Calculation based on nearest neighbor thermodynamics. Several
    # tables for DNA/DNA, DNA/RNA and RNA/RNA hybridizations are included.
    # Correction for mismatches, dangling ends, salt concentration and other
    # additives are available.
    # Tm_staluc is the 'old' NN calculation and is kept for compatibility.
    # It is, however, recommended to use Tm_NN instead, since Tm_staluc may be
    # deprecated in the future. Also, Tm_NN has many more options. Using
    # Tm_staluc and Tm_NN with default parameters gives (essentially) the
    # same results.
    global PRIMER_NUM

    # Validate first so a bad direction neither consumes a primer number nor
    # falls through to code that expects primer_seq to exist.
    if direction not in ("fwd", "rev"):
        raise ValueError(
            "direction must be 'fwd' or 'rev', got {!r}".format(direction))

    primer_length = 15  # minimum primer length
    # NOTE(review): when seq is shorter than the counter the slice stops
    # growing while the counter keeps increasing, so the reported length can
    # exceed len(primer_seq) -- confirm whether callers rely on this.
    if direction == "fwd":
        while (mt.Tm_staluc(seq[0:primer_length]) <= PRIMER_TM
               and primer_length <= 65):
            primer_length += 1
        primer_seq = seq[0:primer_length]
    else:
        while (mt.Tm_staluc(seq[-primer_length:]) <= PRIMER_TM
               and primer_length <= 65):
            primer_length += 1
        primer_seq = revcomplement(seq[-primer_length:]).lower()

    primer_tm = mt.Tm_staluc(primer_seq)
    primer_seq = str(primer_seq)
    primer_name = "{}_{}_{}".format(PRIMER_NUM, name, direction)
    PRIMER_NUM += 1
    return [primer_name, primer_seq, primer_tm, primer_length]
def Temper(sequence):
    """Return ``Tm.Tm_staluc`` values for a sequence and three of its slices.

    The result maps ``'T20'`` to the value for the whole ``sequence``,
    ``'T7'`` to ``sequence[:7]``, ``'T8'`` to ``sequence[7:15]`` and ``'T5'``
    to ``sequence[15:20]``.
    """
    fragments = (
        ('T20', sequence),
        ('T7', sequence[:7]),
        ('T8', sequence[7:15]),
        ('T5', sequence[15:20]),
    )
    return {label: Tm.Tm_staluc(fragment) for label, fragment in fragments}
def gene_feature(Y):
    """Compute per-row features of the gene named in ``Y["Target gene"]``.

    Each distinct gene is looked up once through ``util.get_gene_sequence``
    and four numbers are derived from its sequence: its length,
    ``SeqUtil.GC(seq)``, ``Tm.Tm_staluc(seq, rna=False)`` and
    ``SeqUtil.molecular_weight(seq, "DNA")``.  Every row that names the gene
    receives those values.

    Returns a DataFrame indexed like ``Y["Target gene"]`` with the columns
    "gene length", "gene GC content", "gene temperature" and
    "gene molecular weight".
    """
    gene_names = Y["Target gene"]
    names = gene_names.values

    # One row per input row, one column per feature; rows start at zero.
    features = np.zeros((names.shape[0], 4))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        rows = names == gene
        features[rows] = (
            len(seq),
            SeqUtil.GC(seq),
            Tm.Tm_staluc(seq, rna=False),
            SeqUtil.molecular_weight(seq, "DNA"),
        )

    column_names = [
        "gene length",
        "gene GC content",
        "gene temperature",
        "gene molecular weight",
    ]
    return pd.DataFrame(
        data=features, index=gene_names.index, columns=column_names)
def Tm_feature(data, pam_audit=True, learn_options=None):
    """
    assuming "30mer" is a column of `data`
    get melting temperature features from:
        0-the 30-mer ("global Tm")
        1-the Tm (melting temperature) of the DNA:RNA hybrid from positions
          16 - 20 of the sgRNA, i.e. the 5nts immediately proximal of the
          NGG PAM
        2-the Tm of the DNA:RNA hybrid from position 8 - 15 (i.e. 8 nt)
        3-the Tm of the DNA:RNA hybrid from position 3 - 7  (i.e. 5 nt)

    Features 1-3 are taken from ``seq[start:stop]`` slices.  The slices come
    from ``learn_options["Tm segments"]`` when that key is present and default
    to ``(19, 24)``, ``(11, 19)`` and ``(6, 11)`` otherwise.  All values are
    computed with ``Tm.Tm_staluc(..., rna=False)``.

    Returns a DataFrame indexed like ``data`` with the columns
    "Tm global_False", "5mer_end_False", "8mer_middle_False" and
    "5mer_start_False".  An empty ``data`` yields an empty frame with those
    columns.

    Raises ``Exception`` when ``pam_audit`` is true and ``seq[25:27]`` of some
    row is not "GG".
    """
    if learn_options is None or "Tm segments" not in learn_options:
        segments = [(19, 24), (11, 19), (6, 11)]
    else:
        segments = learn_options["Tm segments"]

    sequence = data["30mer"].values
    featarray = np.ones((sequence.shape[0], 4))

    # Set before the loop: the column names below need it even when `data`
    # has no rows (it used to be assigned inside the loop, which made empty
    # input fail with UnboundLocalError).
    rna = False

    for i, seq in enumerate(sequence):
        if pam_audit and seq[25:27] != "GG":
            raise Exception(f"expected GG but found {seq[25:27]}")
        featarray[i, 0] = Tm.Tm_staluc(seq, rna=rna)  # 30mer Tm
        featarray[i, 1] = Tm.Tm_staluc(
            seq[segments[0][0]:segments[0][1]], rna=rna
        )  # 5nts immediately proximal of the NGG PAM
        featarray[i, 2] = Tm.Tm_staluc(
            seq[segments[1][0]:segments[1][1]], rna=rna
        )  # 8-mer
        featarray[i, 3] = Tm.Tm_staluc(
            seq[segments[2][0]:segments[2][1]], rna=rna
        )  # 5-mer

    feat = pd.DataFrame(
        featarray,
        index=data.index,
        columns=[
            f"Tm global_{rna}",
            f"5mer_end_{rna}",
            f"8mer_middle_{rna}",
            f"5mer_start_{rna}",
        ],
    )
    return feat
def tm_nn(self, dnaconc=500, saltconc=50):
    """
    Nearest-neighbour DNA/DNA melting temperature of ``self.sequence``.

    dnaconc - float, [DNA] nM (passed to Tm_staluc as ``dnac``)
    saltconc - float, [salt] mM (passed to Tm_staluc as ``saltc``)
    @return float rounded to one decimal place, or 0 when ``self.sequence``
            is empty or otherwise falsy
    """
    if self.sequence:
        melting = TM.Tm_staluc(self.sequence, dnac=dnaconc, saltc=saltconc)
        return round(melting, 1)
    return 0
from Bio.SeqUtils import MeltingTemp as MT
import xlwt

# Script: read primers from PRIMER_FILE and write each primer sequence with
# its Tm_staluc value into the 'Result' sheet of primerout.xls.
PRIMER_FILE = '../../samples/primers.txt'

# w is the name of a newly created workbook.
w = xlwt.Workbook()
# ws is the name of a new sheet in this workbook.
ws = w.add_sheet('Result')
# These two lines write the titles of the columns.
ws.write(0, 0, 'Primer Sequence')
ws.write(0, 1, 'Tm')

# The context manager closes the input file even if a line fails to process
# (the handle was previously left open).
with open(PRIMER_FILE) as primer_file:
    for index, line in enumerate(primer_file):
        # For each line in the input file, write the primer sequence and the
        # Tm.  The primer is the line without its first 3 and last 4
        # characters, with spaces removed.
        prm = line[3:len(line) - 4].replace(' ', '')
        ws.write(index + 1, 0, prm)
        # The Tm is stored as text formatted to two decimal places.
        ws.write(index + 1, 1, '{0:.2f}'.format(MT.Tm_staluc(prm)))

# Save the spreadsheet into a file.
w.save('primerout.xls')
def melting_temp(s):
    """Return ``Tm.Tm_staluc(x, rna=False) / 1e3`` for every item of ``s[1:]``.

    The first element of ``s`` is skipped.  The result is a NumPy array with
    one entry per remaining element.
    """
    temperatures = []
    for item in s[1:]:
        temperatures.append(Tm.Tm_staluc(item, rna=False))
    return np.array(temperatures) / 1e3
# Demo of assorted Biopython helpers.  Every print uses the single-argument
# call form, which behaves the same on Python 2 and Python 3 (the bare
# `print x` statement is a SyntaxError on Python 3).
print(IUPACData.extended_protein_letters)
print(IUPACData.ambiguous_dna_letters)
print(IUPACData.unambiguous_dna_letters)
print(IUPACData.ambiguous_rna_letters)
print(IUPACData.unambiguous_rna_letters)
print(IUPACData.ambiguous_dna_complement)  # dictionary of complements
# ...and a lot more

from Bio.Data import CodonTable
print(CodonTable.generic_by_id[2])

# SeqUtils: several functions to deal with DNA and protein sequences.
# DNA utils
import Bio.SeqUtils as SeqUtils
print(SeqUtils.GC('gacgatcggtattcgtag'))  # GC content
from Bio.SeqUtils import MeltingTemp
print(MeltingTemp.Tm_staluc('tgcagtacgtatcgt'))  # DNA/RNA melting temperature

# Checksum functions: a short alphanumeric string signature of a file or
# sequence, usually written in the description of the sequence.
# gcg is an easy, weak, widely used checksum (crc32 and crc64 are better).
from Bio.SeqUtils import CheckSum
myseq = 'acaagatgccattgtcccccggcctcctgctgctgct'
print(CheckSum.gcg(myseq))
print(CheckSum.crc32(myseq))
print(CheckSum.crc64(myseq))
print(CheckSum.seguid(myseq))

# Protein utils
from Bio.SeqUtils import ProtParam
myprot = ProtParam.ProteinAnalysis('MLTNK')
print(myprot.count_amino_acids())
print(myprot.get_amino_acids_percent())
print(myprot.molecular_weight())