def frequent_words_with_mismatches_and_revcomps(text, k, d):
  """Return the k-mers with the highest combined approximate count in text.

  A k-mer's score is approximate_pattern_count(text, pattern, d) plus
  approximate_pattern_count(text, reverse_complement(pattern), d).

  Args:
    text: sequence to scan; sliced into windows of length k.
    k: pattern length; the lookup tables hold 4**k entries.
    d: mismatch budget, passed through to neighbors() and
      approximate_pattern_count().

  Returns:
    List of patterns (as produced by number_to_pattern) whose score equals
    the maximum score, in increasing index order.
  """
  table_size = 4**k
  close = [0] * table_size
  frequency_array = [0] * table_size

  # Collect the candidate patterns worth scoring.
  for i in range(len(text) - k + 1):
    for pattern in neighbors(text[i:i+k], d):
      close[pattern_to_number(pattern)] = 1
      # The score below also counts occurrences of the reverse complement,
      # so a pattern can score without being a neighbor of any window
      # itself. Marking the reverse complement of every neighbor keeps such
      # patterns from being left at a score of 0.
      # NOTE(review): relies on reverse_complement() being its own inverse.
      close[pattern_to_number(reverse_complement(pattern))] = 1

  for i in range(table_size):
    if close[i] == 1:
      pattern = number_to_pattern(i, k)
      frequency_array[i] = (
          approximate_pattern_count(text, pattern, d)
          + approximate_pattern_count(text, reverse_complement(pattern), d))

  max_count = max(frequency_array)

  # Each index maps to one pattern, so no duplicate check is needed.
  return [number_to_pattern(i, k)
          for i in range(table_size)
          if frequency_array[i] == max_count]
# Example #2
0
def count_frequency_array(sequence, klen):
    """Count every window of length klen in sequence and report the maxima.

    Args:
        sequence: sequence to scan; sliced into windows of length klen.
        klen: window (k-mer) length; the frequency array has 4**klen slots.

    Returns:
        A tuple (frequency_array, frequent_positions, frequent_patterns):
        frequency_array[n] is the number of windows whose
        pattern_to_number() value is n; frequent_positions lists the indices
        holding the maximum count; frequent_patterns is those indices
        converted with number_to_pattern().
    """
    frequency_array = [0] * (4**klen)

    # "+ 1" so the window ending at the last character is counted as well.
    for start in range(len(sequence) - klen + 1):
        num = pattern_to_number(sequence[start:start + klen])
        frequency_array[num] += 1

    max_count = max(frequency_array)
    frequent_positions = [
        i for i, count in enumerate(frequency_array) if count == max_count
    ]

    # Convert indices back to patterns. klen is passed because an index
    # alone does not determine the pattern length.
    # NOTE(review): assumes number_to_pattern(index, k) - confirm signature.
    frequent_patterns = [number_to_pattern(i, klen) for i in frequent_positions]

    return frequency_array, frequent_positions, frequent_patterns
def better_clump_finding(genome, k, t, L):
  """Find k-mers that reach a count of at least t in some length-L window.

  The counts for the first window come from computing_frequencies(); each
  later window is derived from the previous one by decrementing the k-mer
  that starts at the old window's first position and incrementing the k-mer
  that ends at the new window's last position.

  Args:
    genome: sequence to scan.
    k: pattern length; the lookup tables hold 4**k entries.
    t: minimum count for a k-mer to be reported.
    L: window length.

  Returns:
    List of patterns (as produced by number_to_pattern) flagged in at least
    one window, in increasing index order.
  """
  table_size = 4**k
  clump = [0] * table_size

  # Seed the flags from the first window.
  counts = computing_frequencies(genome[:L], k)
  for code in range(table_size):
    if counts[code] >= t:
      clump[code] = 1

  # Slide the window one position at a time, updating counts in place.
  for window_start in range(1, len(genome) - L + 1):
    leaving = pattern_to_number(genome[window_start - 1:window_start - 1 + k])
    counts[leaving] -= 1

    entering = pattern_to_number(genome[window_start + L - k:window_start + L])
    counts[entering] += 1

    # Only the k-mer that just gained an occurrence can newly reach t.
    if counts[entering] >= t:
      clump[entering] = 1

  found = []
  for code, flagged in enumerate(clump):
    if flagged != 1:
      continue
    kmer = number_to_pattern(code, k)
    if kmer in found:
      continue
    found.append(kmer)

  return found
def prob_3010_4():
    """Read an index and k from the dataset file and write the decoded pattern.

    Reads "data/dataset_3010_4.txt", whose first line is parsed as the
    integer index and second line as the integer k, then writes
    str(number_to_pattern(index, k)) to "out.txt".
    """
    # Context managers make sure both handles are closed, even on error.
    with open("data/dataset_3010_4.txt") as fin:
        lines = fin.read().splitlines()
    number = int(lines[0])
    k = int(lines[1])

    with open("out.txt", "w") as fout:
        fout.write(str(number_to_pattern(number, k)))
def clump_finding(genome, k, t, L):
  """Find k-mers that reach a count of at least t in some length-L window.

  Every window of length L is counted from scratch with
  cf.computing_frequencies().

  Args:
    genome: sequence to scan.
    k: pattern length; the lookup tables hold 4**k entries.
    t: minimum count for a k-mer to be reported.
    L: window length.

  Returns:
    List of patterns (as produced by number_to_pattern) flagged in at least
    one window, in increasing index order.
  """
  table_size = 4**k
  clump = [0] * table_size

  for window_start in range(len(genome) - L + 1):
    counts = cf.computing_frequencies(genome[window_start:window_start + L], k)
    for code in range(table_size):
      if counts[code] >= t:
        clump[code] = 1

  return [number_to_pattern(code, k)
          for code in range(table_size)
          if clump[code] == 1]