Esempio n. 1
0
def frequent_words_with_mismatches(dna, k, d):
    """Return the k-mers with the highest approximate-occurrence count in dna.

    Every length-k window of ``dna`` is expanded with ``neighbours(window, d)``
    and each resulting pattern becomes a candidate. A candidate's score is
    ``len(approximate_pattern_matching(candidate, dna, d))``. The helpers
    ``neighbours``, ``PatternToNumber``, ``NumberToPattern`` and
    ``approximate_pattern_matching`` are defined outside this block;
    presumably ``d`` is the maximum number of mismatches -- confirm against
    their definitions.

    Args:
        dna: Sequence to search.
        k: Length of the patterns.
        d: Passed through to ``neighbours`` and
            ``approximate_pattern_matching``.

    Returns:
        A list of the top-scoring patterns, ordered by ascending
        ``PatternToNumber`` index. An empty list is returned when ``dna``
        yields no candidate at all (e.g. it is shorter than ``k``); the
        previous implementation returned every one of the 4**k patterns in
        that case because all scores were 0.
    """
    # Track only the indices that were actually produced instead of
    # allocating and scanning dense arrays of 4**k entries.
    candidate_indices = set()
    for start in range(len(dna) - k + 1):
        for pattern in neighbours(dna[start:start + k], d):
            candidate_indices.add(PatternToNumber(pattern))

    if not candidate_indices:
        return []

    # Score candidates in ascending index order so the result keeps the
    # same ordering the dense-array scan produced.
    scores = {}
    for idx in sorted(candidate_indices):
        pattern = NumberToPattern(idx, k)
        scores[idx] = len(approximate_pattern_matching(pattern, dna, d))

    max_value = max(scores.values())
    return [
        NumberToPattern(idx, k)
        for idx, score in scores.items()
        if score == max_value
    ]
Esempio n. 2
0
def clump_finding(dna, k, L, t):
    """Return the k-mers that occur at least t times in some length-L window.

    The first window ``dna[0:L]`` is counted with ``computing_frequencies``;
    the window is then slid one position at a time, decrementing the count of
    the k-mer that leaves and incrementing the count of the k-mer that enters.
    ``computing_frequencies``, ``PatternToNumber`` and ``NumberToPattern`` are
    defined outside this block; ``computing_frequencies`` is assumed to return
    a mutable sequence indexed by ``PatternToNumber`` values -- confirm
    against its definition.

    Fixes relative to the previous version:
      * the loops used ``range(4**k - 1)``, so the highest index was never
        checked in the first window and ``clump_array`` was one slot too
        short, raising IndexError when that index reached ``t`` while sliding;
      * patterns reaching ``t`` in the first window were appended directly and
        could be appended again after the sliding phase, giving duplicates.

    Args:
        dna: Sequence to search.
        k: Length of the patterns.
        L: Window length.
        t: Minimum number of occurrences inside a window.

    Returns:
        A list of distinct patterns, ordered by ascending ``PatternToNumber``
        index.
    """
    frequency_array = computing_frequencies(dna[0:L], k)

    # Indices whose count reached t in at least one window so far.
    clump_indices = {
        i for i, freq in enumerate(frequency_array) if freq >= t
    }

    for start in range(1, len(dna) - L + 1):
        # k-mer that just left the window on the left.
        leaving = PatternToNumber(dna[start - 1:start - 1 + k])
        frequency_array[leaving] -= 1

        # k-mer that just entered the window on the right; it is the only
        # count that can newly reach t at this step.
        entering = PatternToNumber(dna[start + L - k:start + L])
        frequency_array[entering] += 1

        if frequency_array[entering] >= t:
            clump_indices.add(entering)

    return [NumberToPattern(i, k) for i in sorted(clump_indices)]
Esempio n. 3
0
def finding_frequent_patterns_sorting(dna, k):
    """Return the most frequent k-mers of dna using a sort-based count.

    Each length-k window is converted to a number with ``PatternToNumber``,
    the numbers are sorted, and equal neighbours are counted as runs. The
    numbers whose run length equals the maximum are converted back with
    ``NumberToPattern``. Both helpers are defined outside this block.

    Args:
        dna: Sequence to search.
        k: Length of the patterns.

    Returns:
        A list of the most frequent patterns, each appearing once, ordered by
        ascending ``PatternToNumber`` value. An empty list is returned when
        ``dna`` has no window of length ``k``; the previous version raised
        ValueError from ``max()`` on an empty list in that case.
    """
    indices = sorted(
        PatternToNumber(dna[start:start + k])
        for start in range(len(dna) - k + 1)
    )
    if not indices:
        return []

    # run_lengths[pos] is the number of equal values ending at position pos,
    # so the last element of each run carries that value's total count.
    run_lengths = [1] * len(indices)
    for pos in range(1, len(indices)):
        if indices[pos] == indices[pos - 1]:
            run_lengths[pos] = run_lengths[pos - 1] + 1

    max_count = max(run_lengths)
    return [
        NumberToPattern(idx, k)
        for idx, length in zip(indices, run_lengths)
        if length == max_count
    ]
def profile_kmer(dna, k, profile):
    """Return the k-mer occurring in dna with the highest profile probability.

    All 4**k patterns produced by ``NumberToPattern`` are considered; those
    that occur as a length-k window of ``dna`` are scored with
    ``pattern_probability(kmer, profile)``. Both helpers are defined outside
    this block.

    Ties on probability are resolved in favour of the lexicographically
    greatest k-mer, matching the previous implementation, which sorted
    ``(probability, pattern)`` tuples and scanned them from the end.

    Fix relative to the previous version: the descending scan used
    ``range(len - 1, 0, -1)`` and therefore never examined the lowest-ranked
    pattern, returning None when that pattern was the only one present in
    ``dna``.

    Args:
        dna: Sequence to search.
        k: Length of the patterns.
        profile: Passed through to ``pattern_probability``.

    Returns:
        The best-scoring k-mer, or None when no enumerated pattern occurs in
        ``dna`` (e.g. ``dna`` is shorter than ``k``).
    """
    # Set membership replaces rescanning dna once per enumerated pattern.
    windows = {dna[j:j + k] for j in range(len(dna) - k + 1)}

    best = None
    for i in range(4**k):
        kmer = NumberToPattern(i, k)
        if kmer not in windows:
            continue
        # Comparing (probability, kmer) tuples reproduces the original
        # ordering, including its tie-breaking on the pattern text.
        key = (pattern_probability(kmer, profile), kmer)
        if best is None or key > best:
            best = key

    if best is None:
        return None
    return best[1]
def median_string(dna_list, k):
    """Return the k-mer with the smallest total distance to dna_list.

    Every pattern ``NumberToPattern(i, k)`` for ``i`` in ``range(4**k)`` is
    scored as the sum of ``min_hamming_distance(pattern, seq)`` over the
    sequences in ``dna_list``. Both helpers are defined outside this block.

    Args:
        dna_list: Iterable of sequences.
        k: Length of the patterns.

    Returns:
        The first pattern, in enumeration order, whose score is strictly
        lower than every earlier one; ``''`` if no score is below infinity.
    """
    candidates = [NumberToPattern(number, k) for number in range(4**k)]

    best_pattern = ''
    best_score = math.inf
    for candidate in candidates:
        total = sum(min_hamming_distance(candidate, seq) for seq in dna_list)
        # Strict comparison keeps the earliest pattern on ties.
        if total < best_score:
            best_pattern, best_score = candidate, total

    return best_pattern