def gui_CountMotif(self):
    """Write a per-sequence motif count table to an HTML file and open it.

    One table row is produced for every entry of ``self.sequences``: the
    first cell is the 1-based sequence number in bold, followed by the
    overlapping occurrence count of every motif in ``self.motifs[1:]``.
    ``self.motifs`` itself is passed as the table header, so its first
    entry is presumably the caption of the numbering column (TODO confirm
    against the code that builds ``self.motifs``).

    The rendered table is written to ``HTMLFILE`` and the file is then
    handed to ``webbrowser.open``.
    """
    # Raw string: the backslashes of the Windows path must stay literal
    # instead of being parsed as (invalid) escape sequences.
    HTMLFILE = r'E:\MY CODES\PYTHON\DNA-Toolkit\Html-Output\motifCount.html'

    t = html.Table(header_row=self.motifs)
    for i, dna in enumerate(self.sequences):
        my_dna = Seq(dna, generic_dna)
        # Row label: number of the sequence in the table.
        countMotifs = ['<b>' + str(i + 1) + '</b>']
        # Index 0 of self.motifs is skipped; only the remaining entries
        # are counted.
        for pattern in self.motifs[1:]:
            countMotifs.append(my_dna.count_overlap(pattern))
        t.rows.append(countMotifs)

    # The file is closed (and therefore flushed) before the browser is
    # asked to display it.
    with open(HTMLFILE, 'w') as f:
        f.write(str(t))
        f.write('<p>')
    webbrowser.open(HTMLFILE)
def get_features_for_sequence(dna_seq):
    """
    For the given sequence, it returns as features the relative frequencies
    of occurrence (fractions between 0 and 1, not percentages) of every
    combination returned by generate_possible_sequences(3), i.e. the
    combinations of 1, 2 and 3 consecutive nucleotides (letters).

    E.g. for sequence ACGGT, we have:
    - A: 1/5 = 0.2; C: 1/5 = 0.2; G: 2/5 = 0.4; T: 1/5 = 0.2
      (we divide by 5 because there are 5 letters in the sequence)
    - AA: 0; AC: 1/4 = 0.25; AG: 0; AT: 0; CC: 0; CG: 1/4 = 0.25, etc
      (we divide by 4 because there are 4 overlapping combinations of 2 letters)
    - AAA: 0; AAC: 0; ... ACG: 1/3 = 0.33; ... CGG: 1/3, etc
      (we divide by 3 because there are 3 overlapping combinations of 3 letters)

    The feature map will be:
    {'A': 0.2, 'C': 0.2, 'G': 0.4, 'T': 0.2, 'AA': 0, 'AC': 0.25, 'AG': 0,
     'AT': 0, ..., 'TTT': 0}

    A sequence that is too short to contain a combination of a given length
    gets the frequency 0.0 for it instead of raising ZeroDivisionError.

    :param dna_seq: the sequence to analyse
    :return: a dictionary mapping each combination to its frequency
    """
    seq_list = generate_possible_sequences(3)
    seq = Seq(dna_seq)
    seq_length = len(dna_seq)

    feature_map = {}
    for combination in seq_list:
        width = len(combination)
        # Number of overlapping windows of this width in the sequence.
        # Any other width keeps the divisor 1, so the raw count is returned.
        if 1 <= width <= 3:
            divide_number = seq_length - width + 1
        else:
            divide_number = 1

        if divide_number > 0:
            feature_map[combination] = (
                seq.count_overlap(combination) / divide_number)
        else:
            # No window of this width fits into the sequence.
            feature_map[combination] = 0.0
    return feature_map
def gui_CountMotif(self):
    """Write a per-sequence motif count table to ``htmlOutput/motifCount.html``.

    One table row is produced for every entry of ``self.sequences``: the
    first cell is the 1-based sequence number in bold, followed by the
    overlapping occurrence count of every motif in ``self.motifs[1:]``.
    ``self.motifs`` itself is passed as the table header, so its first
    entry is presumably the caption of the numbering column (TODO confirm
    against the code that builds ``self.motifs``).

    A ``Done`` banner is printed once the file has been written.
    """
    # Import the html.py library used to create the table.
    import lib.HTML as html

    HTMLFILE = 'htmlOutput/motifCount.html'

    t = html.Table(header_row=self.motifs)
    for i, dna in enumerate(self.sequences):
        my_dna = Seq(dna, generic_dna)
        # Row label: number of the sequence in the table.
        countMotifs = ['<b>' + str(i + 1) + '</b>']
        # Index 0 of self.motifs is skipped; only the remaining entries
        # are counted.
        for pattern in self.motifs[1:]:
            countMotifs.append(my_dna.count_overlap(pattern))
        t.rows.append(countMotifs)

    # The context manager guarantees the file is flushed and closed.
    with open(HTMLFILE, 'w') as f:
        f.write(str(t))
        f.write('<p>')
    print('Done' + '-' * 79)
def get_tf_idf_for_sequence(dna_seq, idf_map, size):
    """
    Compute a TF-IDF weight for every 2- and 3-letter combination of a
    sequence.

    The term frequency is the overlapping count of the combination divided
    by the number of overlapping windows of that length in the sequence
    (0.0 when the sequence is too short to contain such a window).

    :param dna_seq: the current sequence
    :param idf_map: a dictionary which was previously populated for the
        entire corpus, keyed by subsequence (combination: AA, AC, ... TTT).
        Every non-zero value v is turned into log2(size / v) here, so the
        stored value is presumably the number of corpus sequences that
        contain the combination rather than a finished IDF -- TODO confirm
        against the code that fills the map.
    :param size: the total number of sequences
    :return: a dictionary mapping each combination to tf * idf; the weight
        is 0 when the map value for the combination is 0
    """
    seq_list = ExtractFeatureStrategy.generate_possible_sequences(3)
    # Remove single letters from the list (they do not bring relevant
    # information in the context of tf-idf, as they occur in all sequences).
    # NOTE(review): this assumes the first four entries are the single
    # letters -- confirm against generate_possible_sequences.
    seq_list = seq_list[4:]
    seq = Seq(dna_seq)
    seq_length = len(dna_seq)

    feature_map = {}
    for combination in seq_list:
        width = len(combination)
        # Number of overlapping windows of this width in the sequence.
        # Any other width keeps the divisor 1, so the raw count is used.
        if width in (2, 3):
            divide_number = seq_length - width + 1
        else:
            divide_number = 1

        if divide_number > 0:
            tf = seq.count_overlap(combination) / divide_number
        else:
            # No window of this width fits into the sequence.
            tf = 0.0

        idf = idf_map[combination]
        if idf != 0:
            idf = math.log2(size / idf)
        feature_map[combination] = tf * idf
    return feature_map
def count_sequences_containing_subsequence(subsequence, sequences):
    """
    Return how many of the given sequences (e.g. an entire corpus) contain
    the subsequence at least once.

    Each sequence contributes at most 1 to the result, no matter how many
    times the subsequence occurs inside it.
    """
    return sum(
        1
        for candidate in sequences
        if Seq(candidate).count_overlap(subsequence) > 0
    )
def terminal_CountMotif(self):
    """Print the overlapping motif counts of every sequence.

    For each entry of ``self.sequences`` a dictionary is printed that maps
    every motif in ``self.motifs[1:]`` to a one-element list holding the
    number of overlapping occurrences of that motif in the sequence.
    """
    counts_by_motif = {}
    motifs_to_count = self.motifs[1:]  # index 0 of self.motifs is skipped
    for raw_sequence in self.sequences:
        sequence = Seq(raw_sequence, generic_dna)
        for motif in motifs_to_count:
            # Number of repeats of the motif, stored as a one-element list.
            counts_by_motif[motif] = [sequence.count_overlap(motif)]
        print(counts_by_motif)
def terminal_FrequencyMotifs(self):
    """Print, for every sequence, the percentage frequency of each motif.

    For each entry of ``self.sequences`` a ``Sequence <n>`` header is
    printed, followed by one line per motif in ``self.motifs[1:]`` whose
    length is between 1 and 5. The percentage is the overlapping count of
    the motif divided by ``self.numNmersMotifsDic[i][len(motif) - 1]``
    (presumably the per-length totals filled in by ``numNmersMotifs`` --
    TODO confirm it is always called first), shown with two decimals.

    A total of 0 is reported as ``0.00`` instead of raising
    ZeroDivisionError. Motifs of any other length are skipped.
    """
    countMotifs = {}
    for i, dna in enumerate(self.sequences):
        my_dna = Seq(dna, generic_dna)
        print('Sequence', i + 1)
        for pattern in self.motifs[1:]:
            countMotifs[pattern] = my_dna.count_overlap(pattern)

        for key, count in countMotifs.items():
            if not 1 <= len(key) <= 5:
                continue
            # Slot k - 1 holds the divisor used for motifs of length k.
            total = self.numNmersMotifsDic[i][len(key) - 1]
            ratio = count / total if total else 0.0
            # Calculate the percentage and show 2 decimals.
            percentage = '%.2f' % (ratio * 100)
            print('Frequency of', key, '==>', percentage, '%')
def numNmersMotifs(self):
    """Store, per sequence, the summed motif counts grouped by motif length.

    For the sequence at index ``i`` the overlapping counts of all motifs in
    ``self.motifs[1:]`` are added up separately for motif lengths 1 to 5,
    and the resulting five totals are stored as a list in
    ``self.numNmersMotifsDic[i]`` (slot ``k - 1`` holds the total for
    length ``k``). Motifs of any other length do not contribute.
    """
    motif_counts = {}
    for seq_index, raw_sequence in enumerate(self.sequences):
        sequence = Seq(raw_sequence, generic_dna)
        for motif in self.motifs[1:]:
            motif_counts[motif] = sequence.count_overlap(motif)

        # One running total per motif length 1..5.
        totals_by_length = [0] * 5
        for motif, hits in motif_counts.items():
            motif_length = len(motif)
            if 1 <= motif_length <= 5:
                totals_by_length[motif_length - 1] += hits

        self.numNmersMotifsDic[seq_index] = totals_by_length
def kmerFreq(isoform):
    """Build one tab-separated line of k-mer counts or frequencies.

    ``isoform`` is a whitespace-separated annotation record. The fields read
    here are: 0 = key into ``records``, 1 and 2 = slice bounds of the gene
    on that record, 3 = isoform name, 5 = strand, 9 = exon count,
    10 = comma-separated exon lengths, 11 = comma-separated exon start
    offsets relative to field 1. (This matches the BED12 column order;
    presumably the input is BED12 -- TODO confirm.)

    The exons are cut out of the upper-cased gene sequence, concatenated and
    reverse-complemented when the strand is ``'-'``. Every k-mer of the
    module-level ``kmer_dict`` is then counted in that spliced sequence
    (overlapping if ``args.overlap`` is set) and stored at the index
    ``kmer_dict`` assigns to it. Unless ``args.count`` is set, the counts are
    converted to occurrences per 1000 bases of spliced sequence; an empty
    spliced sequence keeps all values at 0.0 instead of raising
    ZeroDivisionError.

    Returns the isoform name followed by the ``4 ** args.repeat`` values,
    tab-separated and terminated by a newline.
    """
    sline = isoform.rstrip().split()
    chrom = sline[0]
    iso = sline[3]
    strand = sline[5]
    exonCnt = int(sline[9])
    exonlen = [int(length) for length in sline[10].rstrip(',').split(',')]
    exonS = [int(s) for s in sline[11].rstrip(',').split(',')]

    gene_seq = records[chrom].seq[int(sline[1]):int(sline[2])].upper()
    gene_seq_str = str(gene_seq)

    # Concatenate the exon slices; offsets are relative to the gene start.
    spliced_seq = Seq(''.join(
        gene_seq_str[exonS[i]:(exonS[i] + exonlen[i])]
        for i in range(exonCnt)))
    if strand == '-':
        spliced_seq = spliced_seq.reverse_complement()
    spliced_length = len(spliced_seq)

    kmer_freq = [0.0] * (4 ** args.repeat)
    count_kmer = spliced_seq.count_overlap if args.overlap else spliced_seq.count
    for kmer in kmer_dict:
        kmer_freq[kmer_dict[kmer]] = float(count_kmer(kmer))

    # Normalise to occurrences per 1000 bases. With an empty spliced
    # sequence every count is already 0.0, so there is nothing to scale.
    if not args.count and spliced_length:
        kmer_freq = [cnt / spliced_length * 1000 for cnt in kmer_freq]

    return iso + '\t' + '\t'.join(str(freq) for freq in kmer_freq) + '\n'
def makeDataset():
    """Write ``dataset_disp.csv`` from ``exons_final.txt`` and ``introns_final.txt``.

    The output starts with the header ``seq,ie,gc,nstart,maxnstop,maxdisp``
    and then holds one row per input line: first every line of the exon
    file labelled ``1``, then every line of the intron file labelled ``0``.

    Each row is terminated by a newline. The ``maxdisp`` column is left
    empty because the code that computed it is disabled.
    """
    with open('dataset_disp.csv', 'w') as out:
        out.write('seq,ie,gc,nstart,maxnstop,maxdisp\n')
        _write_dataset_rows(out, 'exons_final.txt', '1')
        _write_dataset_rows(out, 'introns_final.txt', '0')


def _write_dataset_rows(out, path, label):
    """Append one CSV row per line of ``path`` to ``out``, tagged with ``label``."""
    with open(path) as f:
        for line in f:
            # Strip only the line terminator, so that a final line without
            # one keeps its last base and the terminator never reaches Seq.
            sequence = line.rstrip('\n')
            my_seq = Seq(sequence, IUPAC.unambiguous_dna)

            # I/E label
            out.write(sequence + ',' + label + ',')
            # GC
            out.write(str(GC(my_seq)) + ',')
            # N_ATG (start codon)
            out.write(str(my_seq.count_overlap('ATG')) + ',')
            # max(N_TAA, N_TAG, N_TGA) (stop codons)
            max_stop = max(my_seq.count_overlap(codon)
                           for codon in ('TAA', 'TAG', 'TGA'))
            out.write(str(max_stop) + ',')
            # maxdisp (max disparity in translation of the default frame) is
            # not computed; the field stays empty and the row ends here.
            out.write('\n')
def gui_FrequencyMotifs(self):
    """Write a per-sequence motif frequency table to an HTML file and open it.

    One table row is produced for every entry of ``self.sequences``: the
    first cell is the 1-based sequence number in bold, followed by one
    percentage cell (two decimals plus ``%``) for each motif in
    ``self.motifs[1:]`` whose length is between 1 and 5. The percentage is
    the overlapping count of the motif divided by
    ``self.numNmersMotifsDic[i][len(motif) - 1]`` (presumably the per-length
    totals filled in by ``numNmersMotifs`` -- TODO confirm it is always
    called first). A total of 0 yields ``0.00%`` instead of raising
    ZeroDivisionError.

    ``self.motifs`` is passed as the table header. The rendered table is
    written to ``HTMLFILE`` and the file is then handed to
    ``webbrowser.open``.
    """
    # Raw string: the backslashes of the Windows path must stay literal
    # instead of being parsed as (invalid) escape sequences.
    HTMLFILE = r'E:\MY CODES\PYTHON\DNA-Toolkit\Html-Output\motifFrequency.html'

    countMotifs = {}
    t = html.Table(header_row=self.motifs)
    for i, dna in enumerate(self.sequences):
        my_dna = Seq(dna, generic_dna)
        # Row label: number of the sequence in the table.
        frequencyMotifsList = ['<b>' + str(i + 1) + '</b>']
        for pattern in self.motifs[1:]:
            countMotifs[pattern] = my_dna.count_overlap(pattern)

        # NOTE(review): motifs outside lengths 1..5 get no cell, which would
        # leave the row shorter than the header -- confirm this cannot occur.
        for key, count in countMotifs.items():
            if not 1 <= len(key) <= 5:
                continue
            # Slot k - 1 holds the divisor used for motifs of length k.
            total = self.numNmersMotifsDic[i][len(key) - 1]
            ratio = count / total if total else 0.0
            # Calculate the percentage and show 2 decimals.
            percentage = '%.2f' % (ratio * 100)
            frequencyMotifsList.append(percentage + '%')
        t.rows.append(frequencyMotifsList)

    # The file is closed (and therefore flushed) before the browser is
    # asked to display it.
    with open(HTMLFILE, 'w') as f:
        f.write(str(t))
        f.write('<p>')
    webbrowser.open(HTMLFILE)
from Bio.Seq import Seq

# Load the complete content of the input file as one sequence.
with open("data/rosalind_ini.txt", "r") as sequence_file:
    sequence = Seq(sequence_file.read())

# count_overlap is used because Python's str.count() only counts
# non-overlapping occurrences; in some biological situations the
# overlapping count is necessary.
# Prints the four counts for A, C, G and T on one line, space-separated.
print(*(sequence.count_overlap(base) for base in "ACGT"))
def gui_FrequencyMotifs(self):
    """Write a per-sequence motif frequency table to ``htmlOutput/motifFrequency.html``.

    One table row is produced for every entry of ``self.sequences``: the
    first cell is the 1-based sequence number in bold, followed by one
    percentage cell (two decimals plus ``%``) for each motif in
    ``self.motifs[1:]`` whose length is between 1 and 5. The percentage is
    the overlapping count of the motif divided by
    ``self.numNmersMotifsDic[i][len(motif) - 1]`` (presumably the per-length
    totals filled in by ``numNmersMotifs`` -- TODO confirm it is always
    called first). A total of 0 yields ``0.00%`` instead of raising
    ZeroDivisionError.

    ``self.motifs`` is passed as the table header. A ``Done`` banner is
    printed once the file has been written.
    """
    # Import the html.py library used to create the table.
    import lib.HTML as html

    HTMLFILE = 'htmlOutput/motifFrequency.html'

    countMotifs = {}
    t = html.Table(header_row=self.motifs)
    for i, dna in enumerate(self.sequences):
        my_dna = Seq(dna, generic_dna)
        # Row label: number of the sequence in the table.
        frequencyMotifsList = ['<b>' + str(i + 1) + '</b>']
        for pattern in self.motifs[1:]:
            countMotifs[pattern] = my_dna.count_overlap(pattern)

        # NOTE(review): motifs outside lengths 1..5 get no cell, which would
        # leave the row shorter than the header -- confirm this cannot occur.
        for key, count in countMotifs.items():
            if not 1 <= len(key) <= 5:
                continue
            # Slot k - 1 holds the divisor used for motifs of length k.
            total = self.numNmersMotifsDic[i][len(key) - 1]
            ratio = count / total if total else 0.0
            # Calculate the percentage and show 2 decimals.
            percentage = '%.2f' % (ratio * 100)
            frequencyMotifsList.append(percentage + '%')
        t.rows.append(frequencyMotifsList)

    # The context manager guarantees the file is flushed and closed.
    with open(HTMLFILE, 'w') as f:
        f.write(str(t))
        f.write('<p>')
    print('Done' + '-' * 79)