コード例 #1
0
    def gui_CountMotif(self):
        """Write a per-sequence motif count table to an HTML file and open it.

        One table row is built per entry of ``self.sequences``: the first cell
        is the 1-based sequence number in bold, followed by the overlapping
        occurrence count of every motif in ``self.motifs[1:]``.  The table uses
        ``self.motifs`` as its header row, so ``self.motifs[0]`` is presumably
        the caption of the sequence-number column (TODO confirm with caller).
        """
        # Raw string: the same path as before, but without depending on
        # unrecognised escape sequences (\M, \P, \D, \H, \m) being kept as-is.
        HTMLFILE = r'E:\MY CODES\PYTHON\DNA-Toolkit\Html-Output\motifCount.html'

        t = html.Table(header_row=self.motifs)

        for i, dna in enumerate(self.sequences):
            my_dna = Seq(dna, generic_dna)
            # Leading cell: number of the sequence in the table.
            countMotifs = ['<b>' + str(i + 1) + '</b>']
            # Number of (overlapping) repeats of each pattern.
            countMotifs.extend(
                my_dna.count_overlap(pattern) for pattern in self.motifs[1:])
            t.rows.append(countMotifs)

        # The file must be closed (and therefore flushed) before the browser
        # is asked to load it, otherwise the page may still be empty.
        with open(HTMLFILE, 'w') as f:
            f.write(str(t))
            f.write('<p>')

        webbrowser.open(HTMLFILE)
コード例 #2
0
def get_features_for_sequence(dna_seq):
    """
        For the given sequence, it returns as features the frequencies of occurrence (as fractions between 0 and 1) for the
        combinations of consecutive nucleotides (letters) returned by generate_possible_sequences(3) - presumably every
        combination of 1, 2 and 3 letters (confirm against that helper).
        E.g. for sequence ACGGT, we have:
            - A: 1/5 = 0.2; C: 1/5 = 0.2; G: 2/5 = 0.4; T: 1/5 = 0.2 (we divide by 5 because there are 5 letters in the sequence)
            - AA: 0; AC: 1/4 = 0.25; AG: 0; AT: 0; CC: 0; CG = 1/4 = 0.25, etc (we divide by 4 because there are 4 overlapping combinations of 2 letters)
            - AAA: 0; AAC: 0; ... ACG: 1/3 = 0.33; ... CGG: 1/3, etc (we divide by 3 because there are 3 overlapping combinations of 3 letters)
            The feature map will be: {'A': 0.2, 'C': 0.2, 'G': 0.4, 'T': 0.2, 'AA': 0, 'AC': 0.25, 'AG': 0, 'AT': 0, ..., 'TTT': 0}

        A combination that is longer than the sequence (or any combination when the sequence is empty) cannot occur,
        so its frequency is reported as 0.0 instead of raising ZeroDivisionError.

        :param dna_seq: the sequence as a string of letters
        :return: a dictionary mapping each combination to its frequency in dna_seq
    """
    seq_list = generate_possible_sequences(3)
    seq = Seq(dna_seq)
    feature_map = {}

    for combination in seq_list:
        width = len(combination)
        # Number of overlapping windows of this width in the sequence.
        # Widths other than 1-3 keep the original divisor of 1.
        if width in (1, 2, 3):
            divide_number = len(dna_seq) - width + 1
        else:
            divide_number = 1

        if divide_number > 0:
            feature_map[combination] = seq.count_overlap(
                combination) / divide_number
        else:
            # Sequence too short to hold even one window of this width.
            feature_map[combination] = 0.0

    return feature_map
コード例 #3
0
    def gui_CountMotif(self):
        """Write a per-sequence motif count table to htmlOutput/motifCount.html.

        One table row is built per entry of ``self.sequences``: the first cell
        is the 1-based sequence number in bold, followed by the overlapping
        occurrence count of every motif in ``self.motifs[1:]``.  The table uses
        ``self.motifs`` as its header row, so ``self.motifs[0]`` is presumably
        the caption of the sequence-number column (TODO confirm with caller).
        Prints a 'Done' banner once the file has been written.
        """
        # Import html.py library for creating the table
        import lib.HTML as html

        HTMLFILE = 'htmlOutput/motifCount.html'

        t = html.Table(header_row=self.motifs)

        for i, dna in enumerate(self.sequences):
            my_dna = Seq(dna, generic_dna)
            # Leading cell: number of the sequence in the table.
            countMotifs = ['<b>' + str(i + 1) + '</b>']
            # Number of (overlapping) repeats of each pattern.
            countMotifs.extend(
                my_dna.count_overlap(pattern) for pattern in self.motifs[1:])
            t.rows.append(countMotifs)

        # Use a context manager so the file is flushed and closed even if a
        # write fails; the original left the handle open.
        with open(HTMLFILE, 'w') as f:
            f.write(str(t))
            f.write('<p>')

        print('Done' + '-' * 79)
コード例 #4
0
def get_tf_idf_for_sequence(dna_seq, idf_map, size):
    """
    Compute a TF-IDF value for every 2- and 3-letter combination of the given sequence.

    :param dna_seq: the current sequence
    :param idf_map: a dictionary which was previously populated, containing the IDF for each subsequence (combination: AA, AC, ... TTT),
                    computed for the entire corpus. The stored value is used as the denominator of log2(size / value), so it is
                    presumably the number of sequences containing the combination (confirm against the code that fills it).
    :param size: the total number of sequences
    :return: a dictionary mapping each combination to tf * idf, where tf is the overlapping occurrence count divided by the
             number of windows of that length in dna_seq (0.0 when the sequence is too short to hold one window)
    """
    seq_list = ExtractFeatureStrategy.generate_possible_sequences(3)
    # remove single letters from the list (they do not bring relevant information in the context of tf-idf, as they occur in all sequences)
    seq_list = seq_list[4:]
    seq = Seq(dna_seq)
    feature_map = {}

    for combination in seq_list:
        width = len(combination)
        # Number of overlapping windows of this width in the sequence.
        # Widths other than 2-3 keep the original divisor of 1.
        if width in (2, 3):
            divide_number = len(dna_seq) - width + 1
        else:
            divide_number = 1

        if divide_number > 0:
            tf = seq.count_overlap(combination) / divide_number
        else:
            # Sequence shorter than the combination: it cannot occur, and
            # dividing by the zero window count would raise ZeroDivisionError.
            tf = 0.0

        idf = idf_map[combination]
        # A stored value of 0 is left as 0 (the feature becomes 0) rather
        # than dividing by it.
        if idf != 0:
            idf = math.log2(size / idf)
        feature_map[combination] = tf * idf

    return feature_map
コード例 #5
0
def count_sequences_containing_subsequence(subsequence, sequences):
    """
    Return how many of the given sequences (e.g. an entire corpus) contain the subsequence at least once.
    Each sequence contributes at most 1 to the result, no matter how often the subsequence occurs in it.
    """
    return sum(
        1
        for candidate in sequences
        if Seq(candidate).count_overlap(subsequence) > 0
    )
コード例 #6
0
    def terminal_CountMotif(self):
        """Print, once per sequence, a dict of overlapping motif counts.

        Keys are the motifs in ``self.motifs[1:]`` (index 0 is skipped); each
        value is a one-element list holding the count for the current sequence.
        """
        counts_by_motif = {}

        for sequence in self.sequences:
            my_dna = Seq(sequence, generic_dna)

            for motif in self.motifs[1:]:
                # Number of overlapping repeats of the motif, wrapped in a list.
                counts_by_motif[motif] = [my_dna.count_overlap(motif)]

            print(counts_by_motif)
コード例 #7
0
    def terminal_FrequencyMotifs(self):
        """Print the percentage frequency of each motif for every sequence.

        For sequence ``i`` the overlapping count of each motif in
        ``self.motifs[1:]`` is divided by ``self.numNmersMotifsDic[i][k - 1]``,
        where ``k`` is the motif length (1 to 5), and printed as a percentage
        with two decimals.  ``numNmersMotifsDic`` is presumably the per-length
        total of motif counts for that sequence (confirm against the code that
        fills it).  Motifs whose length is outside 1-5 are not printed.
        """
        for i, dna in enumerate(self.sequences):
            my_dna = Seq(dna, generic_dna)
            print('Sequence', i + 1)

            countMotifs = {
                pattern: my_dna.count_overlap(pattern)
                for pattern in self.motifs[1:]
            }

            for key, count in countMotifs.items():
                # Only lengths 1-5 have a slot in numNmersMotifsDic[i].
                if not 1 <= len(key) <= 5:
                    continue
                total = self.numNmersMotifsDic[i][len(key) - 1]
                # A zero total would raise ZeroDivisionError; report 0.00
                # for that motif instead.
                share = (count / total) * 100 if total else 0
                # Calculate percentage and show 2 decimals
                percentage = '%.2f' % share
                print('Frequency of', key, '==>', percentage, '%')
コード例 #8
0
    def numNmersMotifs(self):
        """Store, per sequence, the summed motif counts grouped by motif length.

        For sequence index ``i``, ``self.numNmersMotifsDic[i]`` becomes a list
        of five totals: slot ``k - 1`` is the sum of the overlapping counts of
        all distinct motifs of length ``k`` (1 to 5) in ``self.motifs[1:]``.
        Motifs of any other length are ignored.
        """
        for index, sequence in enumerate(self.sequences):
            my_dna = Seq(sequence, generic_dna)

            # Keyed by motif, so a motif listed twice is only summed once.
            occurrences = {
                motif: my_dna.count_overlap(motif)
                for motif in self.motifs[1:]
            }

            totals = [0] * 5
            for motif, hits in occurrences.items():
                if 1 <= len(motif) <= 5:
                    totals[len(motif) - 1] += hits

            self.numNmersMotifsDic[index] = totals
コード例 #9
0
ファイル: Kmer.py プロジェクト: CrescentLuo/Amphisbaena
def kmerFreq(isoform):
    """Return one tab-separated output line of k-mer values for an isoform.

    ``isoform`` is a whitespace-separated record.  The fields used are:
    0 chromosome key into the global ``records``, 1 and 2 start/end of the
    region, 3 isoform name, 5 strand, 9 exon count, 10 comma-separated exon
    lengths and 11 comma-separated exon starts relative to field 1 (this
    looks like BED12 - TODO confirm).

    The exons are cut out of the upper-cased region, concatenated, and
    reverse-complemented when the strand is '-'.  Every k-mer in the global
    ``kmer_dict`` is counted in that spliced sequence (overlapping when
    ``args.overlap`` is set) and stored at the index ``kmer_dict`` gives it.
    Unless ``args.count`` is set, counts are scaled to occurrences per 1000
    bases of spliced sequence.

    Returns the isoform name followed by the ``4 ** args.repeat`` values,
    tab-separated and terminated by a newline.
    """
    width = args.repeat
    fields = isoform.rstrip().split()
    chrom = fields[0]
    exon_total = int(fields[9])
    exon_sizes = [int(size) for size in fields[10].rstrip(',').split(',')]
    exon_offsets = [int(offset) for offset in fields[11].rstrip(',').split(',')]
    strand = fields[5]

    region = records[chrom].seq[int(fields[1]):int(fields[2])].upper()
    region_text = str(region)

    # Concatenate the exon slices in the order they are listed.
    spliced = Seq(''.join(
        region_text[exon_offsets[n]:exon_offsets[n] + exon_sizes[n]]
        for n in range(exon_total)))
    if strand == '-':
        spliced = spliced.reverse_complement()
    spliced_length = len(spliced)

    values = [0.0] * (4 ** width)
    name = fields[3]
    count_kmer = spliced.count_overlap if args.overlap else spliced.count
    for kmer, slot in kmer_dict.items():
        values[slot] = float(count_kmer(kmer))

    if not args.count:
        values = [hits / spliced_length * 1000 for hits in values]

    return name + '\t' + '\t'.join(str(value) for value in values) + '\n'
コード例 #10
0
ファイル: dataset.py プロジェクト: katmh/helix
def _write_dataset_rows(out, path, label):
    """Append one CSV row per sequence line of the file at *path* to *out*.

    Each row is ``seq,label,gc,nstart,maxnstop,`` followed by a newline; the
    final ``maxdisp`` field is left empty because its computation is disabled.
    Blank lines in the input are skipped.
    """
    with open(path) as f:
        for line in f:
            # Strip only the line terminator, so a final line without a
            # trailing newline does not lose its last base.
            sequence = line.rstrip('\n')
            if not sequence:
                continue

            my_seq = Seq(sequence, IUPAC.unambiguous_dna)

            # max(N_TAA, N_TAG, N_TGA) (stop codons)
            max_stop = max(my_seq.count_overlap('TAA'),
                           my_seq.count_overlap('TAG'),
                           my_seq.count_overlap('TGA'))

            fields = [
                sequence,
                label,                                # I/E label
                str(GC(my_seq)),                      # GC
                str(my_seq.count_overlap('ATG')),     # N_ATG (start codon)
                str(max_stop),
                '',                                   # maxdisp (not computed)
            ]
            # Terminate every row; without the newline all rows would be
            # concatenated onto a single line of the CSV.
            out.write(','.join(fields) + '\n')


def makeDataset():
    """Build dataset_disp.csv from exons_final.txt and introns_final.txt.

    Writes the header ``seq,ie,gc,nstart,maxnstop,maxdisp`` and then one row
    per input sequence: exon sequences are labelled 1, intron sequences 0.
    """
    with open('dataset_disp.csv', 'w') as out:
        out.write('seq,ie,gc,nstart,maxnstop,maxdisp\n')
        _write_dataset_rows(out, 'exons_final.txt', '1')
        _write_dataset_rows(out, 'introns_final.txt', '0')
コード例 #11
0
    def gui_FrequencyMotifs(self):
        """Write a per-sequence motif frequency table to HTML and open it.

        One table row is built per entry of ``self.sequences``: the first cell
        is the 1-based sequence number in bold, followed by one percentage
        cell (two decimals plus '%') for each motif in ``self.motifs[1:]``
        whose length is between 1 and 5.  The percentage is the motif's
        overlapping count divided by ``self.numNmersMotifsDic[i][k - 1]``,
        where ``k`` is the motif length; that value is presumably the
        per-length total of motif counts for the sequence (confirm against
        the code that fills it).
        """
        # Raw string: the same path as before, but without depending on
        # unrecognised escape sequences (\M, \P, \D, \H, \m) being kept as-is.
        HTMLFILE = r'E:\MY CODES\PYTHON\DNA-Toolkit\Html-Output\motifFrequency.html'

        t = html.Table(header_row=self.motifs)

        for i, dna in enumerate(self.sequences):
            my_dna = Seq(dna, generic_dna)
            # Leading cell: number of the sequence in the table.
            frequencyMotifsList = ['<b>' + str(i + 1) + '</b>']

            countMotifs = {
                pattern: my_dna.count_overlap(pattern)
                for pattern in self.motifs[1:]
            }

            for key, count in countMotifs.items():
                # Only lengths 1-5 have a slot in numNmersMotifsDic[i].
                if not 1 <= len(key) <= 5:
                    continue
                total = self.numNmersMotifsDic[i][len(key) - 1]
                # A zero total would raise ZeroDivisionError; show 0.00%
                # for that motif instead.
                share = (count / total) * 100 if total else 0
                # Calculate percentage and show 2 decimals
                frequencyMotifsList.append(('%.2f' % share) + '%')

            t.rows.append(frequencyMotifsList)

        # The file must be closed (and therefore flushed) before the browser
        # is asked to load it, otherwise the page may still be empty.
        with open(HTMLFILE, 'w') as f:
            f.write(str(t))
            f.write('<p>')

        webbrowser.open(HTMLFILE)
コード例 #12
0
from Bio.Seq import Seq

with open("data/rosalind_ini.txt", "r") as sequence_file:
    sequence = Seq(sequence_file.read())

# count_overlap is used because Python's str.count() only counts
# non-overlapping matches; some biological situations need an overlapping count.
print(*(sequence.count_overlap(nucleotide) for nucleotide in "ACGT"))
コード例 #13
0
    def gui_FrequencyMotifs(self):
        """Write a per-sequence motif frequency table to htmlOutput/motifFrequency.html.

        One table row is built per entry of ``self.sequences``: the first cell
        is the 1-based sequence number in bold, followed by one percentage
        cell (two decimals plus '%') for each motif in ``self.motifs[1:]``
        whose length is between 1 and 5.  The percentage is the motif's
        overlapping count divided by ``self.numNmersMotifsDic[i][k - 1]``,
        where ``k`` is the motif length; that value is presumably the
        per-length total of motif counts for the sequence (confirm against
        the code that fills it).  Prints a 'Done' banner once the file has
        been written.
        """
        # Import html.py library for creating the table
        import lib.HTML as html

        HTMLFILE = 'htmlOutput/motifFrequency.html'

        t = html.Table(header_row=self.motifs)

        for i, dna in enumerate(self.sequences):
            my_dna = Seq(dna, generic_dna)
            # Leading cell: number of the sequence in the table.
            frequencyMotifsList = ['<b>' + str(i + 1) + '</b>']

            countMotifs = {
                pattern: my_dna.count_overlap(pattern)
                for pattern in self.motifs[1:]
            }

            for key, count in countMotifs.items():
                # Only lengths 1-5 have a slot in numNmersMotifsDic[i].
                if not 1 <= len(key) <= 5:
                    continue
                total = self.numNmersMotifsDic[i][len(key) - 1]
                # A zero total would raise ZeroDivisionError; show 0.00%
                # for that motif instead.
                share = (count / total) * 100 if total else 0
                frequencyMotifsList.append(('%.2f' % share) + '%')

            t.rows.append(frequencyMotifsList)

        # Use a context manager so the file is flushed and closed even if a
        # write fails; the original left the handle open.
        with open(HTMLFILE, 'w') as f:
            f.write(str(t))
            f.write('<p>')

        print('Done' + '-' * 79)