def match(short, long):
    mindis = -1
    minpos = 0
    for i in range(len(long) - len(short) + 1):
        dis = leve.hamming(short, long[i:i + len(short)])
        if mindis == -1 or dis < mindis:
            mindis = dis
            minpos = i
    # tail
    for i in range(
            len(long) - len(short) + 1,
            len(long) - (len(short) // 2) + 1):
        common = len(long) - i
        # ceil(dis*LEN_SHORT/LEN_COMMON)
        dis = (leve.hamming(short[:common], long[i:i + common]) * len(short) +
               common - 1) // common
        if mindis == -1 or dis < mindis:
            mindis = dis
            minpos = i
    # head
    for i in range(-(len(short) // 2), 0):
        common = len(short) + i
        # ceil(dis*LEN_SHORT/LEN_COMMON)
        dis = (leve.hamming(short[-i:], long[:common]) * len(short) + common -
               1) // common
        if mindis == -1 or dis < mindis:
            mindis = dis
            minpos = i
    return mindis, minpos
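A minimal usage sketch for match() above, assuming `leve` is the python-Levenshtein module (`import Levenshtein as leve`). A negative position means the short string overhangs the head of the long one; for partial overlaps the distance is scaled up to the full short length.

import Levenshtein as leve

print(match("ACGT", "TTACGTTT"))    # (0, 2)  -- exact occurrence at offset 2
print(match("AAACG", "ACGTTTTT"))   # (0, -2) -- 'ACG' overhangs the head of the long string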
Example #2
def vanalysis(read):

  hold_v = v_key.findall(read)
  
  if hold_v:
    if len(hold_v) > 1:
      counts['multiple_v_matches'] += 1
      return

    v_match = v_seqs.index(hold_v[0][0]) # Assigns V
    temp_end_v = hold_v[0][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
    
    v_seq_start = hold_v[0][1]      
    end_v_v_dels = get_v_deletions( read, v_match, temp_end_v, v_regions )      
    if end_v_v_dels: # If the number of deletions has been found
      return v_match, end_v_v_dels[0], end_v_v_dels[1], v_seq_start
      
  else:
    
    hold_v1 = half1_v_key.findall(read)
    
    if hold_v1:
      for i in range(len(hold_v1)):
        indices = [y for y, x in enumerate(half1_v_seqs) if x == hold_v1[i][0] ]
        for k in indices:
          if len(v_seqs[k]) == len(read[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[half1_v_seqs.index(hold_v1[i][0])])]):
            if lev.hamming( v_seqs[k], read[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
              counts['verr2'] += 1
              v_match = k
              temp_end_v = hold_v1[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
              end_v_v_dels = get_v_deletions( read, v_match, temp_end_v, v_regions )
              if end_v_v_dels:
                v_seq_start = hold_v1[i][1]  
                return v_match, end_v_v_dels[0], end_v_v_dels[1], v_seq_start
      counts['foundv1notv2'] += 1
      return
    
    else:
      
      hold_v2 = half2_v_key.findall(read)
      if hold_v2:
        for i in range(len(hold_v2)):
          indices = [y for y, x in enumerate(half2_v_seqs) if x == hold_v2[i][0] ]
          for k in indices:
            if len(v_seqs[k]) == len(read[hold_v2[i][1]-v_half_split:hold_v2[i][1]-v_half_split+len(v_seqs[half2_v_seqs.index(hold_v2[i][0])])]):
              if lev.hamming( v_seqs[k], read[hold_v2[i][1]-v_half_split:hold_v2[i][1]+len(v_seqs[k])-v_half_split] ) <= 1:
                counts['verr1'] += 1
                v_match = k
                temp_end_v = hold_v2[i][1] + jump_to_end_v[v_match] - v_half_split - 1 # Finds where the end of a full V would be
                end_v_v_dels = get_v_deletions( read, v_match, temp_end_v, v_regions )
                if end_v_v_dels:
                  v_seq_start = hold_v2[i][1] - v_half_split      
                  return v_match, end_v_v_dels[0], end_v_v_dels[1], v_seq_start
        counts['foundv2notv1'] += 1
        return
              
      else:
        counts['no_vtags_found'] += 1
        return
Example #4
def janalysis(read, end_of_v):
  
  hold_j = j_key.findall(read)
  
  if hold_j:
    if len(hold_j) > 1:
      counts['multiple_j_matches'] += 1
      return
  
    j_match = j_seqs.index(hold_j[0][0]) # Assigns J
    temp_start_j = hold_j[0][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
    
    j_seq_end = hold_j[0][1] + len(hold_j[0][0])      
        
    start_j_j_dels = get_j_deletions( read, j_match, temp_start_j, j_regions, end_of_v )
    
    if start_j_j_dels: # If the number of deletions has been found
      return j_match, start_j_j_dels[0], start_j_j_dels[1], j_seq_end
          
  else:
    
    hold_j1 = half1_j_key.findall(read)
    if hold_j1:
      for i in range(len(hold_j1)):
        indices = [y for y, x in enumerate(half1_j_seqs) if x == hold_j1[i][0] ]
        for k in indices:
          if len(j_seqs[k]) == len(read[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[half1_j_seqs.index(hold_j1[i][0])])]):
            if lev.hamming( j_seqs[k], read[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
              counts['jerr2'] += 1
              j_match = k
              temp_start_j = hold_j1[i][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
              j_seq_end = hold_j1[i][1] + len(hold_j1[i][0]) + j_half_split                                              
              start_j_j_dels = get_j_deletions( read, j_match, temp_start_j, j_regions, end_of_v )
              if start_j_j_dels:
                return j_match, start_j_j_dels[0], start_j_j_dels[1], j_seq_end
      counts['foundj1notj2'] += 1
      return              
            
    else:        
      hold_j2 = half2_j_key.findall(read)
      if hold_j2:
        for i in range(len(hold_j2)):
          indices = [y for y, x in enumerate(half2_j_seqs) if x == hold_j2[i][0] ]
          for k in indices:
            if len(j_seqs[k]) == len(read[hold_j2[i][1]-j_half_split:hold_j2[i][1]-j_half_split+len(j_seqs[half2_j_seqs.index(hold_j2[i][0])])]):
              if lev.hamming( j_seqs[k], read[hold_j2[i][1]-j_half_split:hold_j2[i][1]+len(j_seqs[k])-j_half_split] ) <= 1:
                counts['jerr1'] += 1
                j_match = k
                temp_start_j = hold_j2[i][1] - jump_to_start_j[j_match] - j_half_split # Finds where the start of a full J would be
                j_seq_end = hold_j2[i][1] + len(hold_j2[i][0])                                                
                start_j_j_dels = get_j_deletions( read, j_match, temp_start_j, j_regions, end_of_v )
                if start_j_j_dels:
                  return j_match, start_j_j_dels[0], start_j_j_dels[1], j_seq_end
        counts['foundj2notj1'] += 1
        return
      
      else:
         counts['no_j_assigned'] += 1
         return
Example #5
def janalysis(read):
  
  hold_j = j_key.findall(read)
  
  if hold_j:
    if len(hold_j) > 1:
      counts['multiple_j_matches'] += 1
      return
  
    j_match = j_seqs.index(hold_j[0][0]) # Assigns J
    temp_start_j = hold_j[0][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
    
    j_seq_end = hold_j[0][1] + len(hold_j[0][0])      
        
    start_j_j_dels = get_j_deletions( read, j_match, temp_start_j, j_regions )
    
    if start_j_j_dels: # If the number of deletions has been found
      return j_match, start_j_j_dels[0], start_j_j_dels[1], j_seq_end
          
  else:
    
    hold_j1 = half1_j_key.findall(read)
    if hold_j1:
      for i in range(len(hold_j1)):
        indices = [y for y, x in enumerate(half1_j_seqs) if x == hold_j1[i][0] ]
        for k in indices:
          if len(j_seqs[k]) == len(read[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[half1_j_seqs.index(hold_j1[i][0])])]):
            if lev.hamming( j_seqs[k], read[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
              counts['jerr2'] += 1
              j_match = k
              temp_start_j = hold_j1[i][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
              j_seq_end = hold_j1[i][1] + len(hold_j1[i][0]) + j_half_split                                              
              start_j_j_dels = get_j_deletions( read, j_match, temp_start_j, j_regions )
              if start_j_j_dels:
                return j_match, start_j_j_dels[0], start_j_j_dels[1], j_seq_end
      counts['foundj1notj2'] += 1
      return              
            
    else:        
      hold_j2 = half2_j_key.findall(read)
      if hold_j2:
        for i in range(len(hold_j2)):
          indices = [y for y, x in enumerate(half2_j_seqs) if x == hold_j2[i][0] ]
          for k in indices:
            if len(j_seqs[k]) == len(read[hold_j2[i][1]-j_half_split:hold_j2[i][1]-j_half_split+len(j_seqs[half2_j_seqs.index(hold_j2[i][0])])]):
              if lev.hamming( j_seqs[k], read[hold_j2[i][1]-j_half_split:hold_j2[i][1]+len(j_seqs[k])-j_half_split] ) <= 1:
                counts['jerr1'] += 1
                j_match = k
                temp_start_j = hold_j2[i][1] - jump_to_start_j[j_match] - j_half_split # Finds where the start of a full J would be
                j_seq_end = hold_j2[i][1] + len(hold_j2[i][0])                                                
                start_j_j_dels = get_j_deletions( read, j_match, temp_start_j, j_regions )
                if start_j_j_dels:
                  return j_match, start_j_j_dels[0], start_j_j_dels[1], j_seq_end
        counts['foundj2notj1'] += 1
        return
      
      else:
         counts['no_j_assigned'] += 1
         return
Example #6
def test():
    levenshtein = levenshtein_recur

    Start = 0
    Stop = 565
    limit = 10
    list_1 = ''.join(
        [chr(random.randrange(Start, Stop)) for iter in range(limit)])
    list_2 = ''.join(
        [chr(random.randrange(Start, Stop)) for iter in range(limit)])
    print(list_1)
    print(list_2)

    print("distance between 'cat', 'chello'", levenshtein('cat', 'chello'))
    print("distance between '', 'chello'", levenshtein('', 'chello'))
    print("distance between 'cat', ''", levenshtein('cat', ''))
    print("distance between 'cat', 'chello'", levenshtein('cat', 'chello'))
    print("distance between 'cat', 'cate'", levenshtein('cat', 'cate'))
    print("distance between 'cat', 'ca'", levenshtein('cat', 'ca'))
    print("distance between 'cat', 'cad'", levenshtein('cat', 'cad'))

    begin = time.time()
    print("distance between list_1, list_2",
          Levenshtein.distance(list_1, list_2))
    end = time.time()
    print(f"Total runtime of the Levenshtein.distance is {end - begin}")

    begin = time.time()
    print("distance between list_1, list_2", levenshtein_recur(list_1, list_2))
    end = time.time()
    print(f"Total runtime of the levenshtein_recur is {end - begin}")

    print("hamming distance between 'cat', 'cad'", hammingDist('cat', 'cad'))
    print("hamming distance between 'cat', 'cad'",
          Levenshtein.hamming('cat', 'cad'))

    Start = 0
    Stop = 565
    limit = 100
    list_1 = ''.join(
        [chr(random.randrange(Start, Stop)) for iter in range(limit)])
    list_2 = ''.join(
        [chr(random.randrange(Start, Stop)) for iter in range(limit)])
    print(list_1)
    print(list_2)

    begin = time.time()
    print("hamming distance between list_1, list_2",
          Levenshtein.hamming(list_1, list_2))
    end = time.time()
    print(f"Total runtime of the Levenshtein.hamming is {end - begin}")

    begin = time.time()
    print("hamming distance between list_1, list_2",
          hammingDist(list_1, list_2))
    end = time.time()
    print(f"Total runtime of the hammingDist is {end - begin}")
Example #7
def simple(len_motifs_sup, l_max, L_Left, L_UP, epsilon, theta, top_k, output_filename):
	l = L_Left
	N = {}
	
	len_motifs_laplacesup  = copy.deepcopy(len_motifs_sup)
	#len_motifs_consolidatesup = copy.deepcopy(len_motifs_laplacesup)
	#len_motifs_laplacesup ={}
	len_motifs_consolidatesup ={}
	alphabet = 'agct'

	while l <= L_UP:
		scale =((l_max- l + 1) * (L_UP - L_Left+1))/epsilon
		allsequence =itertools.product(alphabet,repeat = l)
		Seq_l = len_motifs_sup[l]
		for x in allsequence:
			seql = ''.join(x)
			if seql in Seq_l.keys():
				len_motifs_laplacesup[l][seql] = len_motifs_sup[l][seql] + np.random.laplace(0,scale,1)[0]
			else:
				len_motifs_sup[l][seql] = 0
				len_motifs_laplacesup[l][seql] = np.random.laplace(0,scale,1)[0]
		Seq_l = len_motifs_sup[l]
		s = random.choice(list(Seq_l))  # pick a random pivot sequence
		Bucket = {}
		for each_seq1 in Seq_l:
			i = Levenshtein.hamming(each_seq1,s)
			Bucket.setdefault(i, []).append(each_seq1)
		Bucket_key = Bucket.keys()
		for i in Bucket_key:
			for each_seq1 in Bucket[i]:
				len_motifs_consolidatesup.setdefault(l,{})[each_seq1]= 0
				if i >= theta:
					for j in range(i - theta, min(i + theta, l) + 1):
						if j in Bucket:
							for each_seq2 in Bucket[j]:
								if 0 <= Levenshtein.hamming(each_seq1, each_seq2) <= theta:
									len_motifs_consolidatesup[l][each_seq1] = round(float(len_motifs_consolidatesup[l][each_seq1])) + round(float(len_motifs_sup[l][each_seq2]))
				else:
					for j in range(0, min(i + theta, l) + 1):
						if j in Bucket:
							for each_seq2 in Bucket[j]:
								if 0 <= Levenshtein.hamming(each_seq1, each_seq2) <= theta:
									len_motifs_consolidatesup[l][each_seq1] = round(float(len_motifs_consolidatesup[l][each_seq1])) + round(float(len_motifs_sup[l][each_seq2]))

		N = TopN(N,len_motifs_consolidatesup[l],top_k)
		l += 1
        
	f = open(output_filename, 'w')
	for i in sorted(N, key=N.get, reverse=True):
		f.write(i + ':' + str(N[i]) + '\n')
	f.close()
	return len_motifs_consolidatesup
Example #8
def j_analysis(rc, hold_j, j_seqs, half1_j_seqs, half2_j_seqs, jump_to_start_j,
               j_regions, half1_j_key, half2_j_key):

    j_match = None

    if hold_j:
        j_match = j_seqs.index(hold_j[0][0])  # Assigns J
        temp_start_j = hold_j[0][1] - jump_to_start_j[
            j_match]  # Finds where the start of a full J would be
        if get_j_deletions(rc, j_match, temp_start_j, j_regions):
            [start_j, deletions_j] = get_j_deletions(rc, j_match, temp_start_j,
                                                     j_regions)
        found_j_match = 1
    else:
        found_j_match = 0
        hold_j1 = half1_j_key.findall(rc)
        hold_j2 = half2_j_key.findall(rc)

        for i in range(len(hold_j1)):
            indices = [
                y for y, x in enumerate(half1_j_seqs) if x == hold_j1[i][0]
            ]
            for k in indices:
                if len(j_seqs[k]) == len(
                        rc[hold_j1[i][1]:hold_j1[i][1] +
                           len(j_seqs[half1_j_seqs.index(hold_j1[i][0])])]):
                    if lev.hamming(
                            j_seqs[k], rc[hold_j1[i][1]:hold_j1[i][1] +
                                          len(j_seqs[k])]) <= 1:
                        j_match = k
                        temp_start_j = hold_j1[i][1] - jump_to_start_j[
                            j_match]  # Finds where the start of a full J would be
                        found_j_match += 1
        for i in range(len(hold_j2)):
            indices = [
                y for y, x in enumerate(half2_j_seqs) if x == hold_j2[i][0]
            ]
            for k in indices:
                if len(j_seqs[k]) == len(
                        rc[hold_j2[i][1] - j_half_split:hold_j2[i][1] -
                           j_half_split +
                           len(j_seqs[half2_j_seqs.index(hold_j2[i][0])])]):
                    if lev.hamming(
                            j_seqs[k],
                            rc[hold_j2[i][1] - j_half_split:hold_j2[i][1] +
                               len(j_seqs[k]) - j_half_split]) <= 1:
                        j_match = k
                        temp_start_j = hold_j2[i][1] - jump_to_start_j[
                            j_match] - j_half_split  # Finds where the start of a full J would be
                        found_j_match += 1

    if j_match is not None:
        return j_match, temp_start_j, found_j_match
    else:
        return [None, None, None]
Example #9
def trim5(seq):
    for base in range(the_minimum_length_of_adapter5, len(seq)):  # range end is exclusive
        if len(seq[:base]) < len(adapter5):
            seq_part = seq[:base]
            ada_part = adapter5[-base:]
            if Levenshtein.hamming(seq_part, ada_part) <= 1:
                return True
        else:
            seq_part = seq[base - len(adapter5):base]
            ada_part = adapter5
            if Levenshtein.hamming(seq_part, ada_part) <= 1:
                return True
    return False
Example #10
def trim3(seq):
    read_trimed = ''
    for base in range(len(seq) - the_minimum_length_of_adapter3, -1, -1):
        if len(seq[base:]) < len(adapter3):
            seq_part = seq[base:]
            ada_part = adapter3[:len(seq_part)]
            if Levenshtein.hamming(seq_part, ada_part) <= 2:
                read_trimed = seq[:base]
        else:
            seq_part = seq[base:base + len(adapter3)]
            ada_part = adapter3
            if Levenshtein.hamming(seq_part, ada_part) <= 2:
                read_trimed = seq[:base]
    return read_trimed
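A usage sketch for trim5()/trim3() above; the module globals they read (adapter5, adapter3, the_minimum_length_of_adapter5/3) are not shown in the source, so the values below are illustrative assumptions.

import Levenshtein

adapter5 = "AGATCGGAAG"              # assumed 5' adapter
the_minimum_length_of_adapter5 = 5   # assumed minimum overlap
adapter3 = "CTGTCTCTTA"              # assumed 3' adapter
the_minimum_length_of_adapter3 = 5

read = "GGAAG" + "ACGTACGTACGT" + "CTGTCTCTTA"
print(trim5(read))   # True -- the read starts with a suffix of adapter5
print(trim3(read))   # 'GGAAGACGTACGTACGT' -- the 3' adapter is removed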
Example #11
def find_best_match(TAG_seq, tags, maximum_distance):
    """
    Find the best match from the list of tags.

    Compares the Levenshtein distance between tags and the trimmed sequences.
    The tag and the sequence must have the same length.
    If no matches found returns 'unmapped'.
    We add 1
    Args:
        TAG_seq (string): Sequence from R1 already start trimmed
        tags (dict): A dictionary with the TAGs as keys and TAG Names as values.
        maximum_distance (int): Maximum distance given by the user.

    Returns:
        best_match (string): The TAG name that will be used for counting.
    """
    best_match = 'unmapped'
    best_score = maximum_distance
    for tag, name in tags.items():
        score = Levenshtein.hamming(tag, TAG_seq[:len(tag)])
        if score == 0:
            #Best possible match
            return (name)
        elif score <= best_score:
            best_score = score
            best_match = name
            return (best_match)
    return (best_match)
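A usage sketch for find_best_match() with an illustrative tag dictionary. Note that the early return inside the elif means the function reports the first tag within maximum_distance in dictionary order, rather than scanning all tags for the minimum score.

import Levenshtein

tags = {"ACGTACGT": "tag_A", "TTTTCCCC": "tag_B"}
print(find_best_match("ACGTACGAGGG", tags, maximum_distance=1))  # 'tag_A' (one mismatch)
print(find_best_match("GGGGGGGGGGG", tags, maximum_distance=1))  # 'unmapped'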
Example #12
def hamming_distance(first, second):
    ''' returns the edit distance/hamming distances between
    its two arguements '''

    # dist = sum([not a == b for a, b in zip(first, second)])
    # return dist
    return Levenshtein.hamming(first, second)
Example #13
def check_clusters(seq_list, all_clusters, cutoff, unique):
    print('Checking for unique input sequences')
    seqs = {}
    for seq in seq_list:
        seqs[seq[0]] = seqs.get(seq[0], 0) + 1

    # Sequences in seq_list are unique
    if unique:            
        for k, v in seqs.items():
            if v != 1:
                print('Error: sequence %s appears in seq_list more than once.' % k)
            
    # There is a one-to-one correspondence between sequences in seq_list and sequences in all_clusters
    print('Checking for one-to-one correspondence between input and output sequences.')
    for cluster, max_len, min_len in all_clusters:
        for seq, id in cluster:
            if seq not in seqs:
                print('Error: sequence %s is in all_clusters but not in seq_list.' % seq)
            else:
                seqs[seq] += 1

    for k, v in seqs.items():
        if v < 2:
            print('Error: sequence %s appears in seq_list but not in all_clusters.' % k)
        elif unique and v > 2:
            print('Error: sequence %s appears in all_clusters more than once.' % k)
            
    # The cluster forms a connected network with each sequence in a cluster having a nearest neighbour within the cutoff distance
    print('Checking cluster membership.')
    t0 = time.time()
    i = 0
    for cluster, max_len, min_len in all_clusters:
        # push each cluster through get_clusters and check it results in a single cluster
        # one could argue that this isn't strictly an independent check, but the underlying algorithm is in scipy
        # this does check that merging across chunks has happened correctly
        if len(cluster) > 1:
            res = get_clusters(cluster, cutoff)
            if len(res) != 1:
                print('Error: cluster with sequence %s (id %s) is partitioned into %d clusters by further application of get_cluster.' % (cluster[0][0], cluster[0][1], len(res)))

        i += 1
        t1 = time.time()
        if t1 - t0 > 10:
            print('Checking cluster %d\n' % i)
            t0 = time.time()

    # No clusters are mergeable
    print('Checking that clusters are distinct.')
    for i in range(len(all_clusters)):
        c1 = all_clusters[i][0]
        for c2, max_len, min_len in all_clusters[i+1:]:
            for s1, i1 in c1:
                for s2, i2 in c2:
                    cut = int(cutoff * min(len(s1), len(s2)))
                    if hamming:
                        if len(s1) == len(s2) and ld.hamming(s1, s2) <= cut:
                            print('Error: sequences %s (id %s) and %s (id %s) are in different clusters but are within the cutoff distance.' % (s1, i1, s2, i2))
                    else:
                        if ld.distance(s1, s2, cut) <= cut:
                            print('Error: sequences %s (id %s) and %s (id %s) are in different clusters but are within the cutoff distance.' % (s1, i1, s2, i2))
Example #14
def hamming1_align(peptide_and_proteins):
    peptide, protein_list = peptide_and_proteins

    # I and L are considered the same in this alignment
    query = peptide.replace('I', 'L')
    query_length = len(query)
    match_list = []
    for protein in protein_list:
        subject = protein['seq'].replace('I', 'L')
        subject_length = len(subject)

        # First, find candidate locations by pigeonhole principle:
        # if hamming distance is 1, the left or right half must be exact match
        # Then, calculate hamming distance at candidate locations and return those equal to 1
        query_left = query[:query_length // 2]
        query_right = query[query_length // 2:]
        left_index = [x.start() for x in re.finditer(query_left, subject)]
        right_index = [x.start() for x in re.finditer(query_right, subject)]
        right_index = [(x - query_length // 2) for x in right_index]
        candidate_index = left_index + right_index
        candidate_index = [
            x for x in candidate_index
            if x >= 0 and (x + query_length) <= subject_length
        ]
        hamming1_index = [
            x for x in candidate_index
            if Levenshtein.hamming(query, subject[x:(x + query_length)]) == 1
        ]

        if hamming1_index:
            match_list += [{
                'protein': protein,
                'match_index': index
            } for index in hamming1_index]

    return peptide, match_list
Example #15
def apm(sequence, pattern, max_mismatch):
    motif_size = len(pattern)
    list_motif_mism = []
    for i in range(0, len(sequence) - motif_size + 1):
        if Levenshtein.hamming(pattern,sequence[i:i + motif_size]) <= max_mismatch:
            list_motif_mism.append(sequence[i:i + motif_size])
    return list_motif_mism
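A usage sketch for apm(): every window of len(pattern) within max_mismatch Hamming distance of the pattern is collected.

import Levenshtein

print(apm("ACGTAAGT", "ACGT", 1))   # ['ACGT', 'AAGT'] -- an exact hit plus a one-mismatch hit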
Example #16
def calculateD(example):
    '''
    Compute several string distances between `example` and each request
    template.
    :param example:
    :param request_template:
    :return: the weighted distance against each template; the weighting here
        is simple, a plain average
    '''
    # sim = {'hamming':0,'distance':0,'Leven':0,..}
    sim_all = []
    # if example in request_template:
    #     return
    for request_M in request_template:
        if example != request_M['request_data']:
            sim = {'hamming': 0, 'distance': 0, 'Leven': 0,'jaro':0,'jaro_winkler':0,'function':request_M['function'],'sum':0}
            sim['distance'] = 1/Levenshtein.distance(example, request_M['request_data'])
            sim['Leven'] = Levenshtein.ratio(example, request_M['request_data'])
            sim['jaro'] = Levenshtein.jaro(example,request_M['request_data'])
            sim['jaro_winkler'] = Levenshtein.jaro_winkler(example,request_M['request_data'])
            try:
                sim['hamming'] = 1/Levenshtein.hamming(example, request_M['request_data'])
            except ValueError:
                sim['hamming'] = 0

            sim['sum'] = (sim['hamming']+sim['distance']+sim['Leven']+sim['jaro']+sim['jaro_winkler'])/5
            sim_all.append(sim)
        else:
            return [{'hamming': 1, 'distance': 1, 'Leven': 1,'jaro':1,'jaro_winkler':1,'function':request_M['function'],'sum':1}]
        # print(sim)
    return sim_all
Example #17
 def match_head(self, line, keywords):
     dist_list = []
     len_list = []
     for target in keywords:
         t_len = min(len(target), len(line))
         dist = Levenshtein.hamming(line[:t_len], target[:t_len])
         dist_list.append(dist)
         len_list.append(t_len)
     npdist = np.array(dist_list)
     min_idx = np.array(dist_list).argmin()
     if (dist_list[min_idx] <= 1):
         if (dist_list[min_idx] == 1):
             nplen = np.array(len_list)
             min_val = np.array(dist_list).min()
             min_idx_list = np.where(npdist == min_val)
             if min_idx_list[0].size > 1:
                 min_len_list = nplen[list(min_idx_list[0])]
                 min_idx = min_idx_list[0][min_len_list.argmax()]
         if (dist_list[min_idx] > 0):
             logging.warning(
                 f"将[{line}]开头纠正为[{keywords[min_idx]}],距离{dist_list[min_idx]}"
             )
         return keywords[min_idx]
     else:
         return None
Example #18
def find_best_match_shift(TAG_seq, tags, maximum_distance):
    """
    Find the best match from the list of tags with sliding window.

    Compares the Levenshtein distance between tags and the trimmed sequences.
    The tag and the sequence must have the same length.
    If no matches found returns 'unmapped'.
    We add 1
    Args:
        TAG_seq (string): Sequence from R1 already start trimmed
        tags (dict): A dictionary with the TAGs as keys and TAG Names as values.
        maximum_distance (int): Maximum distance given by the user.

    Returns:
        best_match (string): The TAG name that will be used for counting.
    """
    best_match = "unmapped"
    best_score = maximum_distance
    shifts = range(0, len(TAG_seq) - len(max(tags, key=len)))

    for shift in shifts:
        for tag, name in tags.items():
            score = Levenshtein.hamming(tag, TAG_seq[shift:len(tag) + shift])
            if score == 0:
                # Best possible match
                return name
            elif score <= best_score:
                best_score = score
                best_match = name
                return best_match
    return best_match
Example #19
def superhamming(inpA, inpB, max_cutoff):

    if len(inpA) > len(inpB):
        A = inpB
        B = inpA
    else:
        A = inpA
        B = inpB

    len_diff = len(B) - len(A)

    best_ham = 100
    bestA = ""
    bestB = ""

    top_buffer = len_diff + max_cutoff
    bot_buffer = max_cutoff

    for i in range(top_buffer + 1):
        ii = top_buffer - i
        top_string = "." * i + A + ii * "."
        # print ""
        # print top_string

        for j in range(bot_buffer + 1):
            jj = bot_buffer - j
            bot_string = "." * j + B + jj * "."
            # print bot_string, Levenshtein.hamming(top_string, bot_string)
            ham = Levenshtein.hamming(top_string, bot_string)
            if ham < best_ham:
                best_ham = ham
                bestA = top_string
                bestB = bot_string

    return (best_ham)
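A usage sketch for superhamming(): the shorter string is padded with '.' characters over all shifts allowed by max_cutoff, so overhanging real bases count as mismatches.

import Levenshtein

print(superhamming("ACGT", "TTACGTT", 1))   # 3 -- 'ACGT' matches exactly; the 3 overhanging bases mismatch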
Example #20
def get_exp_mismatch_matrix(k, _lambda):
    """
    Compute the mismatch mixing matrix for A(k) with an _lambda-exponentially
    decaying mixing coefficient in the Hamming distance (number of character
    mismatches).

    Eg: 'AAAA' and 'AABB' have a Hamming distance of 2, thus a mixing coefficient
        of _lambda**2

    PARAMETERS:
    - k: the length of strings in the alphabet
    - _lambda: the exponential parameter of the decay per mismatches

    RETURNS:
    - (4^k, 4^k) mixing matrix
    """

    words = get_words(k)
    N = len(words)

    exp_mismatch_matrix = np.zeros((N, N))
    for i in range(N):
        exp_mismatch_matrix[i,i] = 1
        for j in range(i+1, N):
            exp_mismatch_matrix[i,j] = _lambda**Levenshtein.hamming(words[i], words[j])
            exp_mismatch_matrix[j,i] = exp_mismatch_matrix[i,j]

    return exp_mismatch_matrix
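A usage sketch; get_words() is not shown in the source, so it is assumed here to enumerate all 4**k k-mers over the DNA alphabet in a fixed order.

import itertools
import numpy as np
import Levenshtein

def get_words(k):
    # assumed helper: all 4**k DNA words in lexicographic product order
    return [''.join(p) for p in itertools.product('ACGT', repeat=k)]

M = get_exp_mismatch_matrix(2, 0.5)
print(M.shape)    # (16, 16)
print(M[0, 1])    # 0.5 -- 'AA' vs 'AC' differ at one position, coefficient 0.5**1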
Example #21
def test_cell_distances(whitelist, collapsing_threshold):
    """Tests cell barcode distances to validate provided cell barcode collapsing threshold
    
    Function needs the given whitelist as well as the threshold.
    If the value is too high, it will rerun until an acceptable value is found.
    
    Args:
        whitelist (set): Whitelist barcode set
        collapsing_threshold (int): Value of threshold

    Returns:
        collapsing_threshold (int): Valid threshold
    """
    ok = False
    while not ok:
        print('Testing cell barcode collapsing threshold of {}'.format(
            collapsing_threshold))
        all_comb = combinations(whitelist, 2)
        for comb in all_comb:
            if Levenshtein.hamming(comb[0], comb[1]) <= collapsing_threshold:
                collapsing_threshold -= 1
                print('Value is too high, reducing it by 1')
                break
        else:
            ok = True
    print('Using {} for cell barcode collapsing threshold'.format(
        collapsing_threshold))
    return (collapsing_threshold)
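A usage sketch for test_cell_distances(); the barcodes are illustrative. With two whitelist barcodes at Hamming distance 2, an initial threshold of 3 is lowered step by step until no pair collides.

from itertools import combinations
import Levenshtein

whitelist = {"AAAAAA", "AATTAA"}           # two barcodes at Hamming distance 2
print(test_cell_distances(whitelist, 3))   # lowers the threshold twice, returns 1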
Example #22
def match_short_data(long_data, short_data):
    long = long_data['s']
    short = short_data['s']
    mindis = -1
    minpos = 0
    for i in range(len(long)-len(short)+1):
        dis = leve.hamming(short, long[i:i+len(short)])
        if mindis == -1 or dis < mindis:
            mindis = dis
            minpos = i
    # # tail
    # for i in range(len(long)-len(short)+1, len(long)-(len(short)//2)+1):
    #     common = len(long)-i
    #     # ceil(dis*LEN_SHORT/LEN_COMMON)
    #     dis = (leve.hamming(short[:common],
    #                         long[i:i+common])*len(short)+common-1)//common
    #     if mindis == -1 or dis < mindis:
    #         mindis = dis
    #         minpos = i
    # # head
    # for i in range(-(len(short)//2), 0):
    #     common = len(short)+i
    #     # ceil(dis*LEN_SHORT/LEN_COMMON)
    #     dis = (leve.hamming(short[-i:], long[:common])
    #            * len(short)+common-1)//common
    #     if mindis == -1 or dis < mindis:
    #         mindis = dis
    #         minpos = i
    match = {
        'name': short_data['name'],
        'pos': minpos,
        'dis': mindis,
        's': short_data['s']
    }
    return match
Example #23
def check_index_distance(indexes):
    print("# Check hamming distance for indexes:", file=sys.stderr)
    for index1 in indexes:
        print(index1, end=' ', file=sys.stderr)
        for index2 in indexes:
            print(lv.hamming(index1, index2), end=' ', file=sys.stderr)
        print(file=sys.stderr)
Example #25
def loop_formacth3(str3, longstr):
    dict_num = {}
    for i in range(0, (len(longstr) - 8)):
        sim2 = Levenshtein.hamming(str3, longstr[i:(i + 8)])
        dict_num[longstr[i:(i + 8)]] = sim2
    dict_num = sorted(dict_num.items(), key=lambda x: x[1], reverse=False)
    # print(dict_num[0])
    return dict_num[0]
Example #26
def loop_formacth2(str2, longstr):
    dict_num = {}
    for i in range(10, 117):
        sim2 = Levenshtein.hamming(str2, longstr[i:(i + 33)])
        dict_num[longstr[i:(i + 33)]] = sim2
    dict_num = sorted(dict_num.items(), key=lambda x: x[1], reverse=False)
    # print(dict_num[0])
    return dict_num[0]
Example #27
def get_dists(x):
    (lowrow, highrow, seq_list, hamming) = x
    dists = []
    for i in range(lowrow, highrow):
        for j in range(i+1, len(seq_list)):
            if hamming:
                dists.append(ld.hamming(seq_list[i][0], seq_list[j][0]))
            else:
                dists.append(ld.distance(seq_list[i][0], seq_list[j][0]))
    return (dists)
Example #28
def loop_formacth1(str1, longstr):
    dict_num = {}
    for i in range(10, 116):
        # print(longstr[i:(i+34)])
        # print(str1)
        sim1 = Levenshtein.hamming(str1, longstr[i:(i + 34)])
        dict_num[longstr[i:(i + 34)]] = sim1
    dict_num = sorted(dict_num.items(), key=lambda x: x[1], reverse=False)
    # print(dict_num[0])
    return dict_num[0]
Example #29
def add_items(edges, verts, colours, idprefix, seqs, seq_counts,
              first_sample_seen, cutoff, highlightcolour, cluster):
    # Check the cluster spans at least two samples
    samples_hit = []
    for id in cluster:
        for colour in colours:
            if colour[0] in id:
                if colour[0] not in samples_hit:
                    samples_hit.append(colour[0])
                    if len(samples_hit) > 1:
                        break

    if len(samples_hit) < 2:
        return 0

    newedges = len(edges)
    for i in range(len(cluster)):
        id1 = cluster[i]
        for id2 in cluster[i + 1:]:
            if len(seqs[id1]) == len(seqs[id2]):
                limit = int(len(seqs[id1]) * cutoff)
                hd = ld.hamming(seqs[id1], seqs[id2])
                if hd <= limit:
                    edge = {}
                    edge['Source'] = id1
                    edge['Target'] = id2
                    edge['Hamming'] = hd
                    if hd == 1:
                        edge['Color'] = 'black'
                    else:
                        edge['Color'] = 'white'
                    edges.append(edge)

    for id in cluster:
        found = False
        for edge in edges[newedges:]:
            if edge['Source'] == id or edge['Target'] == id:
                found = True
                break
        if not found:
            print('Warning: vertex %s is not connected.' % id)

        vert = {}
        vert['Id'] = id
        if id[:len(idprefix)] == idprefix:
            vert['color'] = 'black'
            if seqs[id] in first_sample_seen:
                vert['color'] = colours[first_sample_seen[seqs[id]]][1]
        else:
            vert['color'] = highlightcolour

        vert['size'] = 2 + 2 * math.log(seq_counts[seqs[id]])
        verts.append(vert)

    return 1
Example #30
def check_bc(seq, bc_len):
    bc = seq[0:bc_len]
    print(bc_len, bc)
    min_dist = 100
    min_bc = None
    for b in bc_lens[bc_len]:
        dist = lv.hamming(b, bc)
        if dist < min_dist:
            min_dist = dist
            min_bc = b
    return bc_len, min_dist, min_bc
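A usage sketch; bc_lens is a module global not shown in the source, assumed here to map a barcode length to the list of known barcodes of that length.

import Levenshtein as lv

bc_lens = {4: ["ACGT", "TTTT"]}    # assumed structure
print(check_bc("ACGAGGGG", 4))     # (4, 1, 'ACGT') -- nearest barcode at distance 1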
Example #31
def smooth_step(seq1, seq2):
    mylist, myseq = [], []
    num = len(seq2) - len(seq1) + 1
    for i in range(num):
        misnum = Levenshtein.hamming(seq1, seq2[i:i + len(seq1)])
        myseq.append(seq2[i:])
        mylist.append(misnum)
    if min(mylist) <= 5:
        return min(mylist), myseq[mylist.index(min(mylist))][:len(seq1)]
    else:
        return 100, "NNNNNNNNNN"
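A usage sketch for smooth_step(): seq1 slides along seq2 and the best-matching window is returned, or (100, 'NNNNNNNNNN') when every window has more than 5 mismatches.

import Levenshtein

print(smooth_step("ACGT", "TTACGTTT"))            # (0, 'ACGT')
print(smooth_step("ACGTACGTAC", "GGGGGGGGGGGG"))  # (100, 'NNNNNNNNNN')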
Example #32
 def seq_ngrams_query(self, query_seq, max_mismatch=2):
     '''
     Return a list of tuples of matches, each with a reference to a read that
     contains the query sequence query_seq, and the position at which it was
     found in that read's umi_well_seq.
     '''
     if len(query_seq) < FastqReadNgramHash.ngram_length:
         sys.stderr.write(
             f'Cannot query a FastqReadNgramHash with a query string shorter than the ngram length ({FastqReadNgramHash.ngram_length})\n'
         )
     match_umi_well_seqs = {}
     num_ngrams = len(query_seq) - FastqReadNgramHash.ngram_length + 1
     for offset in range(num_ngrams):
         ngram = query_seq[offset:(offset +
                                   FastqReadNgramHash.ngram_length)]
         if ngram in self.ngram_hash:
             new_matches = self.ngram_hash[ngram]
         else:
             new_matches = []
         for (match_umi_well_seq, match_offset) in new_matches:
              if match_umi_well_seq in match_umi_well_seqs:
                  match_umi_well_seqs[match_umi_well_seq].update(
                      {match_offset: offset})
              else:
                  match_umi_well_seqs[match_umi_well_seq] = {
                      match_offset: offset
                  }
     final_matches = []
     for match_umi_well_seq, offsets in match_umi_well_seqs.items():
         if (num_ngrams - len(offsets) <= max_mismatch):
             sorted_match_offsets = sorted(offsets.keys())
             # print(query_seq)
             # print(fastq_read.umi_well_seq)
             # print('\n'.join([f'{match_offset}: {offsets[match_offset]}' for match_offset in sorted_match_offsets]))
             best_run = _longest_consecutive_run(sorted_match_offsets)
              if best_run is None:
                 continue
             # print(best_run)
             # print(offsets[sorted_match_offsets[_longest_consecutive_run(sorted_match_offsets)]])
              best_match_start = sorted_match_offsets[best_run] - offsets[
                  sorted_match_offsets[best_run]]
             # print(f'best_match_start: {best_match_start}')
             if best_match_start > FastqReadNgramHash.seq_length - len(
                     query_seq) or best_match_start < 0:
                 continue
             best_match_dist = Levenshtein.hamming(
                 query_seq,
                 match_umi_well_seq[best_match_start:best_match_start +
                                    len(query_seq)])
             # print(f'fastq_read: {fastq_read}\nbest_match_start: {best_match_start}\nbest_match_dist: {best_match_dist}')
             final_matches.append(
                 (match_umi_well_seq, best_match_start, best_match_dist))
             # print('#########')
     return (final_matches)
Example #34
def get_exp_mismatch_matrix(words, _lambda):
    N = len(words)

    exp_mismatch_matrix = np.zeros((N, N))
    for i in range(N):
        exp_mismatch_matrix[i, i] = 1
        for j in range(i + 1, N):
            exp_mismatch_matrix[i, j] = _lambda**Levenshtein.hamming(
                words[i], words[j])
            exp_mismatch_matrix[j, i] = exp_mismatch_matrix[i, j]

    return exp_mismatch_matrix
Example #35
def can_merge(A, B, off):
    res = ''
    if off < 0:
        A, B = B, A
        off = -off
    common = min(len(A) - off, len(B))
    # if common <= 100:  # Threshold
    #     return False
    error_rate = leve.hamming(A[off:off + common], B[:common]) / common
    if error_rate > 0.2:  # Threshold
        return False
    return True
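A usage sketch for can_merge(), assuming `leve` is `import Levenshtein as leve`; an offset of 4 means B starts 4 bases into A.

import Levenshtein as leve

print(can_merge("AAAACGTT", "CGTTGGGG", 4))   # True  -- the 4-base overlap is identical
print(can_merge("AAAACGTT", "TTTTGGGG", 4))   # False -- overlap error rate 0.5 > 0.2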
Example #36
def v_analysis( rc, hold_v, v_seqs, half1_v_seqs, half2_v_seqs, jump_to_end_v, v_regions, half1_v_key, half2_v_key, error0_count, error1_count ):

    # rc is a string of record.seq as input

    v_match = None

    if hold_v:                
        v_match = v_seqs.index(hold_v[0][0]) # Assigns V
        temp_end_v = hold_v[0][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
        if get_v_deletions( rc, v_match, temp_end_v, v_regions ): # If the number of deletions has been found
            [ end_v, deletions_v] = get_v_deletions( rc, v_match, temp_end_v, v_regions )
        found_v_match = 1
        error0_count += 1
    else:
        found_v_match = 0
        hold_v1 = half1_v_key.findall(rc)
        hold_v2 = half2_v_key.findall(rc)
        for i in range(len(hold_v1)):
            indices = [y for y, x in enumerate(half1_v_seqs) if x == hold_v1[i][0] ]
            for k in indices:
                if len(v_seqs[k]) == len(rc[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[half1_v_seqs.index(hold_v1[i][0])])]):
                    if lev.hamming( v_seqs[k], str(rc)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                        v_match = k
                        temp_end_v = hold_v1[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                        found_v_match += 1
                        error1_count += 1
        for i in range(len(hold_v2)):
            indices = [y for y, x in enumerate(half2_v_seqs) if x == hold_v2[i][0] ]
            for k in indices:
                if len(v_seqs[k]) == len(rc[hold_v2[i][1]-v_half_split:hold_v2[i][1]-v_half_split+len(v_seqs[half2_v_seqs.index(hold_v2[i][0])])]):
                    if lev.hamming( v_seqs[k], rc[hold_v2[i][1]-v_half_split:hold_v2[i][1]+len(v_seqs[k])-v_half_split] ) <= 1:
                        v_match = k
                        temp_end_v = hold_v2[i][1] + jump_to_end_v[v_match] - v_half_split - 1 # Finds where the end of a full V would be
                        found_v_match += 1
                        error1_count += 1

    if v_match is not None:
        return v_match, temp_end_v, found_v_match, error0_count, error1_count
    else:
        return [None, None, None, error0_count, error1_count]
Example #37
def j_analysis( rc, hold_j, j_seqs, half1_j_seqs, half2_j_seqs, jump_to_start_j, j_regions, half1_j_key, half2_j_key ):

    j_match = None
    
    if hold_j:
        j_match = j_seqs.index(hold_j[0][0]) # Assigns J
        temp_start_j = hold_j[0][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
        if get_j_deletions( rc, j_match, temp_start_j, j_regions ):
            [ start_j, deletions_j] = get_j_deletions( rc, j_match, temp_start_j, j_regions )
        found_j_match = 1
    else:
        found_j_match = 0
        hold_j1 = half1_j_key.findall(rc)
        hold_j2 = half2_j_key.findall(rc)
        
        for i in range(len(hold_j1)):
            indices = [y for y, x in enumerate(half1_j_seqs) if x == hold_j1[i][0] ]
            for k in indices:
                if len(j_seqs[k]) == len(rc[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[half1_j_seqs.index(hold_j1[i][0])])]):
                    if lev.hamming( j_seqs[k], rc[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                        j_match = k
                        temp_start_j = hold_j1[i][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                        found_j_match += 1
        for i in range(len(hold_j2)):
            indices = [y for y, x in enumerate(half2_j_seqs) if x == hold_j2[i][0] ]
            for k in indices:
                if len(j_seqs[k]) == len(rc[hold_j2[i][1]-j_half_split:hold_j2[i][1]-j_half_split+len(j_seqs[half2_j_seqs.index(hold_j2[i][0])])]):
                    if lev.hamming( j_seqs[k], rc[hold_j2[i][1]-j_half_split:hold_j2[i][1]+len(j_seqs[k])-j_half_split] ) <= 1:
                        j_match = k
                        temp_start_j = hold_j2[i][1] - jump_to_start_j[j_match] - j_half_split # Finds where the start of a full J would be
                        found_j_match += 1

    if j_match is not None:
        return j_match, temp_start_j, found_j_match
    else:
        return [None, None, None]
Example #38
def find_merge_point(x):
    cs1, c, cutoff, hamming = x
    c2, c2_max_length, c2_min_length = c
    merges = []
    for s2, i2 in c2:
        s2_len = len(s2)
        for i in range(len(cs1)):
            if i not in merges:
                (c1, c1_max_length, c1_min_length) = cs1[i]
                for s1, i1 in cs1[i][0]:
                    cut = int(cutoff * min(len(s1), s2_len))
                    if hamming:
                        if c1_min_length != c1_max_length or c2_min_length != c2_max_length:
                            print('Error: variable length clusters found.')
                        if c1_min_length == c2_min_length and ld.hamming(s1, s2) <= cut:
                            merges.append(i)
                            break
                    else:
                        if c1_min_length - c2_max_length <= cut and c2_min_length - c1_max_length <= cut and ld.distance(s1, s2, cut) <= cut:
                            merges.append(i)
                            break

    return merges
Example #39
def main(argv):
    parser = argparse.ArgumentParser(description='Plot the minimum distance between sequences in the file, using Hamming distance between sequences of the same length.')
    parser.add_argument('infile', help='input file (FASTA)')
    parser.add_argument('outprefix', help='prefix for output files')
    parser.add_argument('-l', '--limit', help='limit to at most this many sequences, drawn at random without replacement')    
    parser.add_argument('-v', '--verbose', help='display progress', action='store_true')    
    parser.add_argument('-i', '--interactive', help='display charts interactively', action='store_true')    
    parser.add_argument('-g', '--length_lims', help='axis limits for CDR length plots <xmin,xmax,ymin,ymax>')
    parser.add_argument('-d', '--dist_lims', help='axis limits for minimum distance plots <xmin,xmax,ymin,ymax>')
    parser.add_argument('-c', '--csv', help='produce comma separated variable output instead of plots', action='store_true')
    args = parser.parse_args()

    length_lims = None
    dist_lims = None
    
    def split_lims(lims):
        ll = lims.split(',')
        if len(ll) != 4:
            print('Error: limit specifier "%s" should contain four numbers separated by commas.' % lims)
            exit()
        else:
            for i in range(len(ll)):
                ll[i] = float(ll[i])
            return ll
    
    if args.length_lims:
        length_lims = split_lims(args.length_lims)        

    if args.dist_lims:
        dist_lims = split_lims(args.dist_lims)        

    # move seqs into dict, indexed by length. Eliminate duplicates.
    seq_list = []
    seen_seqs = {}
    for seq_record in SeqIO.parse(args.infile, "fasta"):
        seq = str(seq_record.seq)
        if seq not in seen_seqs:
            seq_list.append(seq)
            seen_seqs[seq] = 1
        
    print('%d unique sequences.' % len(seen_seqs))

    if args.limit and len(seq_list) > int(args.limit):
        if args.verbose:
            print('Limiting to a sample of %d' % int(args.limit))
        seq_list = random.sample(seq_list, int(args.limit))

    seqs = {}
    for seq in seq_list:
        length = len(seq)
        if length in seqs:
            seqs[length].append(seq)
        else:
            seqs[length] = [seq]
            
    maxval = max(seqs.keys()) + 1
    minval = min(seqs.keys())
    sizes = np.zeros(maxval)
    for k, v in seqs.items():
        sizes[k] = len(v)

    sizes = sizes / sizes.sum()

    # plot length distribution

    if not args.csv:                    
        plt.figure()
        plt.bar(range(maxval), sizes, 1/1.5, color='blue')
        plt.xlabel('CDR3 length')
        plt.ylabel('Frequency')
        if length_lims:
            plt.axis(length_lims)
        plt.savefig(args.outprefix + '_length_distribution.pdf')        
        if args.interactive:
            plt.show()
    else:
        with open(args.outprefix + '_length_distribution.csv', 'w', newline='') as fo:
            writer = csv.writer(fo)
            writer.writerow(['Length'] + list(range(minval, maxval)))
            writer.writerow(['Frequency'] + list(sizes[minval:maxval]))
            
    # Calculate min distances
    
    mindists = []
    for seq_length, seq_list in seqs.items():
        if args.verbose:
            print('Processing sequences of length %d' % seq_length)
        for i in range(len(seq_list)-1):
            mindist = 9999
            for j in range(i+1, len(seq_list)):
                h = Levenshtein.hamming(seq_list[i], seq_list[j])
                mindist = min(mindist, h)
            mindists.append(float(mindist)/seq_length)
            
    if not args.csv:                    
        plt.figure()
        plt.hist(mindists, bins=50)
        plt.xlabel('Minimum distance')
        plt.ylabel('Occurrences')
        if dist_lims:
            plt.axis(dist_lims)
        plt.savefig(args.outprefix + '_min_dist.pdf')        
        if args.interactive:
            plt.show()
    else:
        freq, bins = np.histogram(mindists, 50)
        with open(args.outprefix + '_min_dist.csv', 'w', newline='') as fo:
            writer = csv.writer(fo)
            writer.writerow(['Distance'] + list(bins))
            writer.writerow(['Occurrences'] + list(freq))
Example #40
def analysis( Sequence_Reads, with_statistics=True, with_reverse_complement_search=True):
    import numpy as np
    import decimal as dec
    import string
    import operator as op
    import collections as coll
    from Bio import SeqIO
    from acora import AcoraBuilder
    from time import time
    from string import Template
    from operator import itemgetter, attrgetter
    import Levenshtein as lev

    v_half_split, j_half_split = [10,6] # Do not change - V tags are split at position 10, J at position 6, to look for half tags if no full tag is found.

    ################

    print('Commencing analysis on a total of', len(Sequence_Reads), 'file(s)')

    ## Create .txt file to store f=(v_index,j_index,v_deletions,j_deletions,nt_insert)
    analysis_file = open("DecombinatorResults.txt", "w")
    analysis_file.close()
    results = "DecombinatorResults.txt" # Name the .txt file to write to

    ################
    print ('Importing known V, D and J gene segments and tags...')

    handle = open("human_TRBV_region.fasta", "rU")
    v_genes = list(SeqIO.parse(handle, "fasta"))
    handle.close()

    handle = open("human_TRBJ_region.fasta", "rU")
    j_genes = list(SeqIO.parse(handle, "fasta"))
    handle.close()

    v_regions = []
    for j in range(0, len(v_genes)):
        v_regions.append(str(v_genes[j].seq).upper())

    j_regions = []
    for j in range(0, len(j_genes)):
        j_regions.append(str(j_genes[j].seq).upper())

    ##############
    ## Build keyword tries of V and J tags for fast assignment
    v_seqs, half1_v_seqs, half2_v_seqs, jump_to_end_v = get_v_tags(open("tags_trbv.txt", "r"), v_half_split)
    j_seqs, half1_j_seqs, half2_j_seqs, jump_to_start_j = get_j_tags(open("tags_trbj.txt", "r"), j_half_split)

    v_builder = AcoraBuilder()
    for i in range(0,len(v_seqs)):
        v_builder.add(str(v_seqs[i])) # Add all V tags to keyword trie

    v_key = v_builder.build()

    j_builder = AcoraBuilder()
    for i in range(0,len(j_seqs)):
        j_builder.add(str(j_seqs[i])) # Add all J tags to keyword trie

    j_key = j_builder.build()

    ##############
    ## Build keyword tries for first and second halves of both V and J tags
    v_half1_builder = AcoraBuilder()
    for i in range(0,len(half1_v_seqs)):
        v_half1_builder.add(str(half1_v_seqs[i]))
    half1_v_key = v_half1_builder.build()

    v_half2_builder = AcoraBuilder()
    for i in range(0,len(half2_v_seqs)):
        v_half2_builder.add(str(half2_v_seqs[i]))
    half2_v_key = v_half2_builder.build()

    j_half1_builder = AcoraBuilder()
    for i in range(0,len(half1_j_seqs)):
        j_half1_builder.add(str(half1_j_seqs[i]))
    half1_j_key = j_half1_builder.build()

    j_half2_builder = AcoraBuilder()
    for i in range(0,len(half2_j_seqs)):
        j_half2_builder.add(str(half2_j_seqs[i]))
    half2_j_key = j_half2_builder.build()

    ###############
    ## Initialise variables
    assigned_count = 0 # this will just increase by one every time we correctly assign a seq read with all desired variables
    seq_count = 0 # this will simply track the number of sequences analysed in file
    t0 = time() # Begin timer

    ###############
    ## Open .txt file created at the start of analysis
    analysis_file = open(results, "a")
    stemplate = Template('$v $j $del_v $del_j $nt_insert') # Creates stemplate, a holder, for f. Each line will have the 5 variables separated by a space

    ###############
    ## Begin analysing sequences

    for i in range(len(Sequence_Reads)):
        
        print('Importing sequences from', Sequence_Reads[i], 'and assigning V and J regions...')
        handle = open(Sequence_Reads[i], "r")
        
        for record in SeqIO.parse(handle, "fastq"):
            
            found_seq_match = 0
            seq_count += 1
            
            hold_v = v_key.findall(str(record.seq))
            hold_j = j_key.findall(str(record.seq))

            if hold_v:                
                v_match = v_seqs.index(hold_v[0][0]) # Assigns V
                temp_end_v = hold_v[0][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                result_v = get_v_deletions( record.seq, v_match, temp_end_v, v_regions )
                if result_v: # If the number of deletions has been found
                    end_v, deletions_v = result_v
            else:
                found_v_match = 0
                hold_v1 = half1_v_key.findall(str(record.seq))
                hold_v2 = half2_v_key.findall(str(record.seq))
                for i in range(len(hold_v1)):
                    indices = [y for y, x in enumerate(half1_v_seqs) if x == hold_v1[i][0] ]
                    for k in indices:
                        if len(v_seqs[k]) == len(str(record.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[half1_v_seqs.index(hold_v1[i][0])])]):
                            if lev.hamming( v_seqs[k], str(record.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                                v_match = k
                                temp_end_v = hold_v1[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                found_v_match += 1
                for i in range(len(hold_v2)):
                    indices = [y for y, x in enumerate(half2_v_seqs) if x == hold_v2[i][0] ]
                    for k in indices:
                        # shift back by v_half_split: the matched half-tag starts midway through the full tag
                        if len(v_seqs[k]) == len(str(record.seq)[hold_v2[i][1]-v_half_split:hold_v2[i][1]-v_half_split+len(v_seqs[k])]):
                            if lev.hamming( v_seqs[k], str(record.seq)[hold_v2[i][1]-v_half_split:hold_v2[i][1]-v_half_split+len(v_seqs[k])] ) <= 1:
                                v_match = k
                                temp_end_v = hold_v2[i][1] + jump_to_end_v[v_match] - v_half_split - 1 # Finds where the end of a full V would be
                                found_v_match += 1

            if hold_j:
                j_match = j_seqs.index(hold_j[0][0]) # Assigns J
                temp_start_j = hold_j[0][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                result_j = get_j_deletions( record.seq, j_match, temp_start_j, j_regions )
                if result_j:
                    start_j, deletions_j = result_j
            else:
                found_j_match = 0
                hold_j1 = half1_j_key.findall(str(record.seq))
                hold_j2 = half2_j_key.findall(str(record.seq))
                for i in range(len(hold_j1)):
                    indices = [y for y, x in enumerate(half1_j_seqs) if x == hold_j1[i][0] ]
                    for k in indices:
                        if len(j_seqs[k]) == len(str(record.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[half1_j_seqs.index(hold_j1[i][0])])]):
                            if lev.hamming( j_seqs[k], str(record.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                                j_match = half1_j_seqs.index(hold_j1[i][0])
                                temp_start_j = hold_j1[i][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                                found_j_match += 1
                for i in range(len(hold_j2)):
                    indices = [y for y, x in enumerate(half2_j_seqs) if x == hold_j2[i][0] ]
                    for k in indices:
                        # shift back by j_half_split: the matched half-tag starts midway through the full tag
                        if len(j_seqs[k]) == len(str(record.seq)[hold_j2[i][1]-j_half_split:hold_j2[i][1]-j_half_split+len(j_seqs[k])]):
                            if lev.hamming( j_seqs[k], str(record.seq)[hold_j2[i][1]-j_half_split:hold_j2[i][1]-j_half_split+len(j_seqs[k])] ) <= 1:
                                j_match = half2_j_seqs.index(hold_j2[i][0])
                                temp_start_j = hold_j2[i][1] - jump_to_start_j[j_match] - j_half_split # Finds where the start of a full J would be
                                found_j_match += 1

            # The four V/J evidence combinations (full tag or verified half-tag on
            # each side) all produce the same output, so they collapse to one test.
            # Unpacking the results here also ensures deletions_v/deletions_j are
            # fresh for the half-tag paths rather than stale from a previous read.
            if (hold_v or found_v_match == 1) and (hold_j or found_j_match == 1):
                result_v = get_v_deletions( record.seq, v_match, temp_end_v, v_regions )
                result_j = get_j_deletions( record.seq, j_match, temp_start_j, j_regions )
                if result_v and result_j:
                    end_v, deletions_v = result_v
                    start_j, deletions_j = result_j
                    f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                    print >> analysis_file, f_seq # Write the classification of the sequence to the results file
                    assigned_count += 1
                    found_seq_match = 1

            if found_seq_match == 0 and with_reverse_complement_search:
                
                #####################
                # REVERSE COMPLEMENT
                #####################
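                ## record.reverse_complement() is Biopython's SeqRecord method;
                ## e.g. Seq("AGGT").reverse_complement() -> Seq("ACCT")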

                record_reverse = record.reverse_complement()
                hold_v = v_key.findall(str(record_reverse.seq))
                hold_j = j_key.findall(str(record_reverse.seq))

                if hold_v:                
                    v_match = v_seqs.index(hold_v[0][0]) # Assigns V
                    temp_end_v = hold_v[0][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                    result_v = get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions )
                    if result_v: # If the number of deletions has been found
                        end_v, deletions_v = result_v
                else:
                    found_v_match = 0
                    hold_v1 = half1_v_key.findall(str(record_reverse.seq))
                    hold_v2 = half2_v_key.findall(str(record_reverse.seq))
                    for i in range(len(hold_v1)):
                        indices = [y for y, x in enumerate(half1_v_seqs) if x == hold_v1[i][0] ]
                        for k in indices:
                            if len(v_seqs[k]) == len(str(record_reverse.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[half1_v_seqs.index(hold_v1[i][0])])]):
                                if lev.hamming( v_seqs[k], str(record_reverse.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                                    v_match = k
                                    temp_end_v = hold_v1[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                    found_v_match += 1
                    for i in range(len(hold_v2)):
                        indices = [y for y, x in enumerate(half2_v_seqs) if x == hold_v2[i][0] ]
                        for k in indices:
                            # shift back by v_half_split: the matched half-tag starts midway through the full tag
                            if len(v_seqs[k]) == len(str(record_reverse.seq)[hold_v2[i][1]-v_half_split:hold_v2[i][1]-v_half_split+len(v_seqs[k])]):
                                if lev.hamming( v_seqs[k], str(record_reverse.seq)[hold_v2[i][1]-v_half_split:hold_v2[i][1]-v_half_split+len(v_seqs[k])] ) <= 1:
                                    v_match = k
                                    temp_end_v = hold_v2[i][1] + jump_to_end_v[v_match] - v_half_split - 1 # Finds where the end of a full V would be
                                    found_v_match += 1

                if hold_j:
                    j_match = j_seqs.index(hold_j[0][0]) # Assigns J
                    temp_start_j = hold_j[0][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                    result_j = get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions )
                    if result_j:
                        start_j, deletions_j = result_j
                else:
                    found_j_match = 0
                    hold_j1 = half1_j_key.findall(str(record_reverse.seq))
                    hold_j2 = half2_j_key.findall(str(record_reverse.seq))
                    for i in range(len(hold_j1)):
                        indices = [y for y, x in enumerate(half1_j_seqs) if x == hold_j1[i][0] ]
                        for k in indices:
                            if len(j_seqs[k]) == len(str(record_reverse.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[half1_j_seqs.index(hold_j1[i][0])])]):
                                if lev.hamming( j_seqs[k], str(record_reverse.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                                    j_match = half1_j_seqs.index(hold_j1[i][0])
                                    temp_start_j = hold_j1[i][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                                    found_j_match += 1
                    for i in range(len(hold_j2)):
                        indices = [y for y, x in enumerate(half2_j_seqs) if x == hold_j2[i][0] ]
                        for k in indices:
                            # shift back by j_half_split: the matched half-tag starts midway through the full tag
                            if len(j_seqs[k]) == len(str(record_reverse.seq)[hold_j2[i][1]-j_half_split:hold_j2[i][1]-j_half_split+len(j_seqs[k])]):
                                if lev.hamming( j_seqs[k], str(record_reverse.seq)[hold_j2[i][1]-j_half_split:hold_j2[i][1]-j_half_split+len(j_seqs[k])] ) <= 1:
                                    j_match = half2_j_seqs.index(hold_j2[i][0])
                                    temp_start_j = hold_j2[i][1] - jump_to_start_j[j_match] - j_half_split # Finds where the start of a full J would be
                                    found_j_match += 1

                # Same collapsed output test as above, on the reverse complement.
                if (hold_v or found_v_match == 1) and (hold_j or found_j_match == 1):
                    result_v = get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions )
                    result_j = get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions )
                    if result_v and result_j:
                        end_v, deletions_v = result_v
                        start_j, deletions_j = result_j
                        f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record_reverse.seq[end_v+1:start_j]))
                        print >> analysis_file, f_seq # Write the classification of the sequence to the results file
                        assigned_count += 1
                        found_seq_match = 1
        handle.close()
    analysis_file.close()

    if with_statistics:
        timed = time() - t0
        print seq_count, 'sequences were analysed'
        print assigned_count, 'sequences were successfully assigned'
        print 'Time taken =', timed, 'seconds'
    if True:
        # investigate most frequently repeated tweets in each class
        c_in = Counter(in_class_lines)
        c_out = Counter(out_class_lines)

    # some hard-coded display routines for playing with the data...
    if False:
        plt.figure()
        plt.ion()
        if False:  # histogram of tweet lengths
            lengths_in_class = [len(s) for s in in_class_lines]
            lengths_out_class = [len(s) for s in out_class_lines]
            plt.title("Histogram of tweet lengths for classes in " + args.table)
            plt.xlabel("Bins of tweet lengths")
            plt.ylabel("Counts")
            tweet_lengths = (0, 140)
            filename_pattern = "histogram_tweet_lengths_{}.png"
        # note - tried counting spaces with s.count(" ") but this seems to mirror
        # tweet-length
        if True:  # counting number of capital letters
            lengths_in_class = [Levenshtein.hamming(s, s.lower()) for s in in_class_lines]
            lengths_out_class = [Levenshtein.hamming(s, s.lower()) for s in out_class_lines]
            plt.title("Histogram of number of capitals for classes in " + args.table)
            tweet_lengths = (0, 40)
            filename_pattern = "nbr_capitals_{}.png"
        plt.hist(lengths_in_class, range=tweet_lengths, color="blue", label="in-class", histtype="step")
        plt.hist(lengths_out_class, range=tweet_lengths, color="green", label="out-class", histtype="step")
        UPPER_LEFT = 2
        plt.legend(loc=UPPER_LEFT)
        plt.savefig(filename_pattern.format(args.table))
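
# The hamming-vs-lower trick above counts capital letters: only uppercased
# positions differ from the lowercased copy, e.g.
#   Levenshtein.hamming("Hello World", "hello world")  ->  2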
#     Jaro Distance
#     Jaro-Winkler Distance
#     Match Rating Approach Comparison
#     Hamming Distance

# Phonetic encoding:
#     American Soundex
#     Metaphone
#     NYSIIS (New York State Identification and Intelligence System)
#     Match Rating Codex
import jellyfish
print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish'))  # 2; edit distance
print(jellyfish.jaro_distance('jellyfish', 'smellyfish'))  # 0.89629629629629637
print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs'))  # 1; edit distance that also counts transpositions
print(jellyfish.metaphone('Jellyfish'))  # 'JLFX'
print(jellyfish.soundex('Jellyfish'))  # 'J412'
print(jellyfish.nysiis('Jellyfish'))  # 'JALYF'
print(jellyfish.match_rating_codex('Jellyfish'))  # 'JLLFSH'

##################################################################
## Levenshtein
import Levenshtein
print(Levenshtein.hamming('hello', 'helol'))  # 2; Hamming distance; str1 and str2 must be the same length; counts the positions at which corresponding characters differ
print(Levenshtein.distance('hello', 'helol'))  # 2; edit (Levenshtein) distance; the minimum number of operations (insert, delete, substitute) needed to turn one string into the other
print(Levenshtein.distance('hello world asdf', 'helolaaaa world asdf'))  # 5
print(Levenshtein.ratio('hello', 'helol'))  # 0.8; Levenshtein ratio, r = (sum - ldist) / sum, where sum is the combined length of str1 and str2 and ldist is a weighted edit distance
# Note: ldist is not the edit distance printed above; there every operation costs 1, whereas here insertions and deletions still cost 1 but a substitution costs 2
# Why the weighting: for ratio('a', 'c'), sum = 2, and with unit costs the ratio would be (2 - 1) / 2 = 0.5 even though 'a' and 'c' share nothing; charging 2 per substitution gives the sensible 0.0
print(Levenshtein.jaro('hello', 'helol'))  # 0.9333333333333332; Jaro similarity, originally developed for census record linkage
print(Levenshtein.jaro_winkler('hello', 'helol'))  # 0.9533333333333333; Jaro-Winkler similarity
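
# A quick worked check of the ratio formula above (a sketch; the arithmetic
# follows the weighted-cost rule described in the comments):
# 'hello' vs 'helol': sum = 5 + 5 = 10 and ldist = 2 (delete one 'l', insert it
# elsewhere), so r = (10 - 2) / 10 = 0.8
print(Levenshtein.ratio('hello', 'helol') == 0.8)  # True
# 'a' vs 'c': the lone substitution costs 2, so r = (2 - 2) / 2 = 0.0
print(Levenshtein.ratio('a', 'c') == 0.0)  # True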
Example #43
      
      elif pc_dist > 20:
        
        # need to check if the distance is being artificially inflated by DCR finding a tag up/down-stream of the genuine ones
        # can try trimming either end of the longer sequence to see if the distance then drops low enough
        
        dist1 = 0
        dist2 = 0
        dist3 = 0
        dist4 = 0
        
        if len(seq) > len(proto_seq):
          
          diff = len(seq) - len(proto_seq)
          
          dist1 = lev.hamming(seq[:len(proto_seq)], proto_seq)   # trim 3'
          
          dist2 = lev.hamming(seq[diff:], proto_seq)             # trim 5'
          
          if float(dist1)/len(proto_seq) * 100 <= 20 or float(dist2)/len(proto_seq) * 100 <= 20:
                    
            count_match += 1

            #### OUTPUT THIS RECORD! ######
            
            dcr_collapsed[proto_dcr][d] += 1
            
          else:

            count_diff_len_discarded += 1
            
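# A self-contained sketch of the trim-and-compare check above (a hypothetical
# helper, not part of the original pipeline; assumes the python-Levenshtein
# package, imported as lev elsewhere in these examples):
import Levenshtein as lev

def trimmed_pc_dist(seq, proto_seq):
    # Make seq the longer of the two so we only trim in one direction.
    if len(seq) < len(proto_seq):
        seq, proto_seq = proto_seq, seq
    diff = len(seq) - len(proto_seq)
    dist1 = lev.hamming(seq[:len(proto_seq)], proto_seq)  # trim the 3' end
    dist2 = lev.hamming(seq[diff:], proto_seq)            # trim the 5' end
    return min(dist1, dist2) * 100.0 / len(proto_seq)     # percent distance

# e.g. one extra base at the 5' end no longer inflates the distance:
print(trimmed_pc_dist('TACGT', 'ACGT'))  # 0.0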
Example #44
def DupRemover(inputbam, output, bedout, chunk, quality, multi):
	start = time.time()
	# pre-processing: generate unique names for the temporary files, in case multiple instances of the script are run at the same time.

	mm = hashlib.sha1()
	mm.update(inputbam.split("/")[-1]+str(random.random()))
	tempbed = ".temp_bed-"+mm.hexdigest()
	tempbed0 = ".0temp_bed-"+mm.hexdigest()
	tempsam = ".temp_sam-"+mm.hexdigest()
	ticker = False
	ticker2 = True
#	logfile = "cleaned/"+inputbam.split(".bam")[0]+"_cleaned.log"
	logfile = output.split(".bam")[0]+".log"

	if multi:
		with open(logfile, 'a') as ff:
			msg = "Working on file %s\n" %inputbam
			ff.write("%s: %s" % (time.strftime("%X %x"), msg))
	else:
		printStatus("Working on the file: %s" %inputbam)
	
	output = output.split(".bam")[0]+".sam"

	# check if the input file is from bbmap or bowtie

	cmd0 = "bedtools bamtobed -i %s | head > %s" %(inputbam, tempbed0)
	subprocess.check_call(cmd0, shell=True)

	xx = open(tempbed0, 'r')

	# This routine tries to work out whether the input file came from bowtie2 or bbmap (bbmap generates some oddly formatted .sam files).
	# It may not work for other aligners, although it might well.

	first_fields = xx.next().strip().split(" ")
	if len(first_fields) == 1:
		cmd = "bedtools bamtobed -i %s | sed 's/:/ /g' | sort -k1,1 -k2,2n -k3,3n -k11,11 -k12,12n -k13,13 > %s" %(inputbam, tempbed)
	elif len(first_fields) == 2:
		ticker2 = False # marks as "bbmap" output
		cmd = "bedtools bamtobed -i %s | awk '{print $1,$2,$3,$4,$6,$7}' | tr [:blank:] \\\\t | sed 's/:/ /g' | sort -k1,1 -k2,2n -k3,3n -k11,11 -k12,12n -k13,13 > %s" %(inputbam, tempbed)
	elif len(first_fields) == 3:
		ticker2 = False # marks as "bbmap" output
		cmd = "bedtools bamtobed -i %s | awk '{print $1,$3,$4,$5,$7,$8}' | tr [:blank:] \\\\t | sed 's/:/ /g' | sort -k1,1 -k2,2n -k3,3n -k11,11 -k12,12n -k13,13 > %s" %(inputbam, tempbed)
	else:
		print "Unrecognized .bam formatting."
		sys.exit(1)
	xx.close()
	os.remove(tempbed0)

	if multi:
		with open(logfile, 'a') as ff:
			msg = "Generating some temp files.\n"
			ff.write("%s: %s" % (time.strftime("%X %x"), msg))
	else:
		printStatus("Generating some temp files.")
	cmd2 = "samtools view -h %s > %s" %(inputbam, tempsam)
	if multi:
		with open(logfile, 'a') as ff:
			msg = "Running command: \n\t%s\n"%cmd
			ff.write("%s: %s" % (time.strftime("%X %x"), msg))
	else:
		printStatus("Running command: \n\t%s"%cmd) 
	subprocess.check_call(cmd, shell=True)
	if multi:
		with open(logfile, 'a') as ff:
			msg = "Running command: \n\t%s"%cmd2
			ff.write("%s: %s\n" % (time.strftime("%X %x"), msg))	
	else:
		printStatus("Running command: \n\t%s"%cmd2) 
	subprocess.check_call(cmd2, shell=True)

	if multi:
		with open(logfile, 'a') as ff:
			msg = "Done. Starting to work on the temp files.\n"
			ff.write("%s: %s" % (time.strftime("%X %x"), msg))	
	else:
		printStatus("Done. Starting to work on the temp files.")

	aa = open(tempbed, 'r')

	seq_set = set()
	name_set = set()
	sam_set = set()
	counter = 0 # reads that are kept
	counter2 = 2 # total number of reads; starts at 2 because the first two lines are consumed below
	counter3 = 0 # reads that are below the quality threshold. 
	multiplier = 1

	# Initiate the whole thing with the first two lines.
	line1 = aa.next()
	line2 = aa.next()

	line1_bcode = line1.split(" ")[-1].split("\t")[0]
	line2_bcode = line2.split(" ")[-1].split("\t")[0]
	line1_qual = int(line1.split("\t")[-2])
	line1_strand = line1.strip().split("\t")[-1]
	line2_strand = line2.strip().split("\t")[-1]
	line1_begin = line1.split("\t")[0] + "," + line1.split("\t")[1]
	line1_end = line1.split("\t")[0] + "," + line1.split("\t")[2]
	line2_begin = line2.split("\t")[0] + "," + line2.split("\t")[1]
	line2_end = line2.split("\t")[0] + "," + line2.split("\t")[2]
	levent = Levenshtein.hamming(line1_bcode, line2_bcode)
	
	#comparing consecutive lines with each other. 

	if line1_strand == line2_strand:
		if line1_strand == "+":
			if line1_begin != line2_begin:
				if bedout:
					if line1_qual >= quality:
						seq_set.add(":".join(line1.split(" ")))
				if line1_qual >= quality:	
					name_set.add(":".join(line1.split(" ")).split("\t")[3])
					counter += 1
				else:
					counter3 +=1
			else:	
				if levent > 1:
					if bedout:
						if line1_qual >= quality:
							seq_set.add(":".join(line1.split(" ")))
					if line1_qual >= quality:
						name_set.add(":".join(line1.split(" ")).split("\t")[3])
						counter += 1
					else:
						counter3 += 1

		elif line1_strand =="-":
			if line1_end != line2_end:
				if line1_qual >= quality:
					if bedout:
						seq_set.add(":".join(line1.split(" ")))
				if line1_qual >= quality:
					name_set.add(":".join(line1.split(" ")).split("\t")[3])
					counter += 1
				else:
					counter3 += 1
			else:	
				if levent > 1:
					if line1_qual >= quality:
						if bedout:
							seq_set.add(":".join(line1.split(" ")))
					if line1_qual >= quality:
						name_set.add(":".join(line1.split(" ")).split("\t")[3])
						counter += 1
					else:
						counter3 += 1
		else:
			printStatus("The strand is neither + or - Something must be wrong, exiting")
			aa.close()
			os.remove(tempbed)
			os.remove(tempsam)
			sys.exit(1)
	else:		
		if line1_qual >= quality:
			if bedout:
				seq_set.add(":".join(line1.split(" ")))
		if line1_qual >= quality:
			name_set.add(":".join(line1.split(" ")).split("\t")[3])
			counter += 1
		else:
			counter3 += 1

	#switching lines, so we can iteratively work with consecutive lines
		
	line1 = line2

	while True:
		try:
			line1_bcode = line1.split(" ")[-1].split("\t")[0]
			line1_qual = int(line1.split("\t")[-2])
			line2 = aa.next()
			counter2 += 1 #this counter holds the total number of lines in the file. 
			line2_bcode = line2.split(" ")[-1].split("\t")[0]
			line1_strand = line1.strip().split("\t")[-1]
			line2_strand = line2.strip().split("\t")[-1]
			line1_begin = line1.split("\t")[0] + "," + line1.split("\t")[1]
			line1_end = line1.split("\t")[0] + "," + line1.split("\t")[2]
			line2_begin = line2.split("\t")[0] + "," + line2.split("\t")[1]
			line2_end = line2.split("\t")[0] + "," + line2.split("\t")[2]			
			levent = Levenshtein.hamming(line1_bcode, line2_bcode)
			
			if line1_strand == line2_strand:
				if line1_strand == "+":
					if line1_begin != line2_begin:
						if line1_qual >= quality:
							if bedout:
								seq_set.add(":".join(line1.split(" ")))
							name_set.add(":".join(line1.split(" ")).split("\t")[3])
							counter += 1
						else:
							counter3 += 1
					else:	
						if levent > 1:
							if line1_qual >= quality:
								if bedout:
									seq_set.add(":".join(line1.split(" ")))
								name_set.add(":".join(line1.split(" ")).split("\t")[3])
								counter += 1
							else:
								counter3 += 1

				elif line1_strand =="-":
					if line1_end != line2_end:
						if line1_qual >= quality:
							if bedout:
								seq_set.add(":".join(line1.split(" ")))
							name_set.add(":".join(line1.split(" ")).split("\t")[3])
							counter += 1
						else:
							counter3 += 1
					else:	
						if levent > 1:
							if line1_qual >= quality:
								if bedout:
									seq_set.add(":".join(line1.split(" ")))
								name_set.add(":".join(line1.split(" ")).split("\t")[3])
								counter += 1
							else:
								counter3 += 1
				else:
					printStatus("The strand is neither + or - Something must be wrong, exiting.")
					aa.close()
					os.remove(tempbed)
					os.remove(tempsam)
					sys.exit(1)
			else:		
				if line1_qual >= quality:
					if bedout:
						seq_set.add(":".join(line1.split(" ")))
				if line1_qual >= quality:
					name_set.add(":".join(line1.split(" ")).split("\t")[3])
					counter += 1
				else:
					counter3 += 1

			line1 = line2

			# Every 100000 unique reads (or a multiple of 100000, as set by the chunk variable), the set that holds unique reads is flushed to disk.

			if bedout:
				if counter == multiplier * chunk:
					for i in seq_set:
						with open(bedout, 'a') as ff:
							ff.write(i)
					multiplier += 1
					seq_set = set()

		except StopIteration:
			if bedout:
				for i in seq_set:
					with open(bedout, 'a') as ff:
						ff.write(i)
			if multi:
				with open(logfile, 'a') as ff:
					msg ="%s of %s reads reads were removed. %s reads were removed because they were below the set quality threshold: Q%s.\n" %(counter2-counter, counter2, counter3, quality)
					msg2 = "%%%s of the reads reads were removed. Of the reads thet were removed %%%s were below the set quality threshold\n" %(round(((counter2-counter)/counter2)*100), round((counter3/(counter2-counter))*100))
					ff.write("%s: %s" % (time.strftime("%X %x"), msg))
					ff.write("%s: %s" % (time.strftime("%X %x"), msg2))
			else:
				printStatus(" %s%% of the reads were removed, they were either PCR-duplicates or were below the quality threshold." %(round((1-(counter/(counter+counter2)))*100)))
				printStatus("%s of %s reads reads were removed. %s of the reads were removed because they were below the set quality threshold: Q%s.\n" %(counter2-counter, counter2, counter3, quality))
				printStatus("%%%s of the reads reads were removed. Of the reads thet were removed %%%s were below the set quality threshold" %(round(((counter2-counter)/counter2)*100), round((counter3/(counter2-counter))*100)))
			break
	aa.close()

	bb = open(tempsam, 'r')
	list_of_stuff = ['@SQ', '@PG', '@HD']
	counter3 = 0
	multiplier2 = 1
	if not multi:
		printStatus("Generating the .sam file now.")

	# Capturing the header from the .bam file.

	while True:
		line = bb.next()
		if line.split("\t")[0] in list_of_stuff:
			with open(output, 'a') as ff:
				ff.write(line)
		else:
			if line.split("\t")[0] in name_set:
				sam_set.add(line)
				counter3 += 1
			break


	while True:
		try:
			sam_line = bb.next()
			if ticker2: # If input was from bowtie
				if sam_line.split("\t")[0] in name_set:
					sam_set.add(sam_line)
					counter3 += 1 # counts matched lines; used below to trigger the periodic flush (should equal counter)
			else: #if input was from BBMap
				if sam_line.split(" ")[0] in name_set:
					sam_set.add(sam_line.split(" ")[0]+"\t"+"\t".join(sam_line.split("\t")[1:]))
					counter3 += 1 # counts matched lines; used below to trigger the periodic flush (should equal counter)

			if counter3 == multiplier2*chunk:
				for i in sam_set:
					with open(output, 'a') as ff:
						ff.write(i)
				multiplier2 += 1
				sam_set = set()

		except StopIteration:
			for i in sam_set:
				with open(output, 'a') as ff:
					ff.write(i)
			sam_set = set()
			break
	bb.close()

#cleaning up

	os.remove(tempbed)
	os.remove(tempsam)

	if not multi:
		printStatus("Generating a bam file now.")
	cmd_bam = "samtools view -Sb %s | samtools sort - %s" %(output, output.split(".sam")[0])
	subprocess.check_call(cmd_bam, shell=True)
	if not multi:
		printStatus("Generating the index for that bam.")
	cmd_sam = "samtools index %s" %output.split(".sam")[0]+".bam"
	subprocess.check_call(cmd_sam, shell=True)
	os.remove(output)

	if multi:
		end = time.time()
		elapsed = round(end-start)
		with open(logfile, 'a') as ff:
				msg ="Done! Elapsed time: %s seconds.\n" %elapsed
				ff.write("%s: %s" % (time.strftime("%X %x"), msg))
Example #45
def analysis(fastqs, vfasta, jfasta, vtags, jtags, rev_comp=False,
                verbose=False, sep=" "):
    if verbose:
        sys.stderr.write('>> Analyzing %d file(s)\n' % len(fastqs))
        sys.stderr.write(">> Importing known V, and J gene segments and tags\n")

    # get the sequences per region
    v_genes = list(SeqIO.parse(nopen(vfasta), "fasta"))
    j_genes = list(SeqIO.parse(nopen(jfasta), "fasta"))
    # XXX
    # classes to parse fasta, fastq, and method to reverse complement
    # get rid of biopython
    v_regions = [str(v_genes[i].seq.upper()) for i, v in enumerate(v_genes)]
    j_regions = [str(j_genes[i].seq.upper()) for i, v in enumerate(j_genes)]

    v_seqs, vleft_seqs, vright_seqs, v_ends = get_tags(vtags)
    j_seqs, jleft_seqs, jright_seqs, j_starts = get_tags(jtags)

    # full sequences
    builder = AcoraBuilder(v_seqs)
    v_key = builder.build()
    builder = AcoraBuilder(j_seqs)
    j_key = builder.build()
    
    # half sequences
    builder = AcoraBuilder(vleft_seqs)
    vleft_key = builder.build()
    builder = AcoraBuilder(vright_seqs)
    vright_key = builder.build()
    builder = AcoraBuilder(jleft_seqs)
    jleft_key = builder.build()
    builder = AcoraBuilder(jright_seqs)
    jright_key = builder.build()
    
    # correctly assigned sequences
    assigned_count = 0
    # number of sequences analysed
    seq_count = 0
    # begin clock
    t0 = time()
    
    # XXX
    stemplate = Template('$v $j $del_v $del_j $nt_insert')

    for fastq in fastqs:
        if verbose:
            sys.stderr.write(">> Starting %s...\n" % fastq)
        for i, record in enumerate(SeqIO.parse(nopen(fastq), "fastq")):
            # if i == 50:
            #     sys.exit()
            found_seq_match = 0
            seq_count += 1
            hold_v = v_key.findall(str(record.seq))
            hold_j = j_key.findall(str(record.seq))

            if hold_v:
                # the index position of the found sequence among known (v_seqs)
                v_match = v_seqs.index(hold_v[0][0])
                
                # new variable names
                # do not like lists for this task
                match_idx = v_seqs.index(hold_v[0][0])
                match_start_idx = hold_v[0][1]
                vseq_end = v_ends[match_idx] - 1
                end_of_v = match_start_idx + vseq_end
                
                # Finds where the end of a full V would be
                temp_end_v = hold_v[0][1] + v_ends[v_match] - 1
                
                # If the number of deletions has been found
                result_v = get_v_deletions(record.seq, v_match, temp_end_v, v_regions)
                if result_v:
                    end_v, deletions_v = result_v
            else:
                found_v_match = 0
                hold_v1 = vleft_key.findall(str(record.seq))
                hold_v2 = vright_key.findall(str(record.seq))
                for i in range(len(hold_v1)):
                    indices = [y for y, x in enumerate(vleft_seqs) if x == hold_v1[i][0] ]
                    for k in indices:
                        if len(v_seqs[k]) == len(str(record.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[vleft_seqs.index(hold_v1[i][0])])]):
                            if lev.hamming( v_seqs[k], str(record.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                                v_match = k
                                # Finds where the end of a full V would be
                                temp_end_v = hold_v1[i][1] + v_ends[v_match] - 1
                                found_v_match += 1
                for i in range(len(hold_v2)):
                    indices = [y for y, x in enumerate(vright_seqs) if x == hold_v2[i][0] ]
                    for k in indices:
                        if len(v_seqs[k]) == len(str(record.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[vright_seqs.index(hold_v2[i][0])])]):
                            if lev.hamming( v_seqs[k], str(record.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[k])] ) <= 1:
                                v_match = k
                                # Finds where the end of a full V would be
                                temp_end_v = hold_v2[i][1] + v_ends[v_match] - 1
                                found_v_match += 1

            if hold_j:
                # Assigns J
                j_match = j_seqs.index(hold_j[0][0])
                # Finds where the start of a full J would be
                temp_start_j = hold_j[0][1] - j_starts[j_match]
                result_j = get_j_deletions( record.seq, j_match, temp_start_j, j_regions )
                if result_j:
                    start_j, deletions_j = result_j
            else:
                found_j_match = 0
                hold_j1 = jleft_key.findall(str(record.seq))
                hold_j2 = jright_key.findall(str(record.seq))
                for i in range(len(hold_j1)):
                    indices = [y for y, x in enumerate(jleft_seqs) if x == hold_j1[i][0] ]
                    for k in indices:
                        if len(j_seqs[k]) == len(str(record.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[jleft_seqs.index(hold_j1[i][0])])]):
                            if lev.hamming( j_seqs[k], str(record.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                                j_match = jleft_seqs.index(hold_j1[i][0])
                                # Finds where the start of a full J would be
                                temp_start_j = hold_j1[i][1] - j_starts[j_match]
                                found_j_match += 1
                for i in range(len(hold_j2)):
                    indices = [y for y, x in enumerate(jright_seqs) if x == hold_j2[i][0] ]
                    for k in indices:
                        if len(j_seqs[k]) == len(str(record.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[jright_seqs.index(hold_j2[i][0])])]):
                            if lev.hamming( j_seqs[k], str(record.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[k])] ) <= 1:
                                j_match = jright_seqs.index(hold_j2[i][0])
                                # Finds where the start of a full J would be
                                temp_start_j = hold_j2[i][1] - j_starts[j_match] - 6
                                found_j_match += 1

            # All four V/J evidence combinations print the same record, so they
            # collapse into a single test (mirroring the reverse-complement branch below).
            if (hold_v or found_v_match == 1) and (hold_j or found_j_match == 1):
                f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                print f_seq
                assigned_count += 1
                found_seq_match = 1
            
            #####################
            # REVERSE COMPLEMENT
            #####################
            if found_seq_match == 0 and rev_comp:

                record_reverse = record.reverse_complement()
                hold_v = v_key.findall(str(record_reverse.seq))
                hold_j = j_key.findall(str(record_reverse.seq))

                if hold_v:
                    # Assigns V
                    v_match = v_seqs.index(hold_v[0][0])
                    # Finds where the end of a full V would be
                    temp_end_v = hold_v[0][1] + v_ends[v_match] - 1
                    # If the number of deletions has been found
                    result_v = get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions )
                    if result_v:
                        end_v, deletions_v = result_v
                else:
                    found_v_match = 0
                    hold_v1 = vleft_key.findall(str(record_reverse.seq))
                    hold_v2 = vright_key.findall(str(record_reverse.seq))
                    for i in range(len(hold_v1)):
                        indices = [y for y, x in enumerate(vleft_seqs) if x == hold_v1[i][0] ]
                        for k in indices:
                            if len(v_seqs[k]) == len(str(record_reverse.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[vleft_seqs.index(hold_v1[i][0])])]):
                                if lev.hamming( v_seqs[k], str(record_reverse.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                                    v_match = k
                                    # Finds where the end of a full V would be
                                    temp_end_v = hold_v1[i][1] + v_ends[v_match] - 1
                                    found_v_match += 1
                    for i in range(len(hold_v2)):
                        indices = [y for y, x in enumerate(vright_seqs) if x == hold_v2[i][0] ]
                        for k in indices:
                            if len(v_seqs[k]) == len(str(record_reverse.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[vright_seqs.index(hold_v2[i][0])])]):
                                if lev.hamming( v_seqs[k], str(record_reverse.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[k])] ) <= 1:
                                    v_match = k
                                    # Finds where the end of a full V would be
                                    temp_end_v = hold_v2[i][1] + v_ends[v_match] - 1
                                    found_v_match += 1

                if hold_j:
                    # Assigns J
                    j_match = j_seqs.index(hold_j[0][0])
                    # Finds where the start of a full J would be
                    temp_start_j = hold_j[0][1] - j_starts[j_match]
                    result_j = get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions )
                    if result_j:
                        start_j, deletions_j = result_j
                else:
                    found_j_match = 0
                    hold_j1 = jleft_key.findall(str(record_reverse.seq))
                    hold_j2 = jright_key.findall(str(record_reverse.seq))
                    for i in range(len(hold_j1)):
                        indices = [y for y, x in enumerate(jleft_seqs) if x == hold_j1[i][0] ]
                        for k in indices:
                            if len(j_seqs[k]) == len(str(record_reverse.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[jleft_seqs.index(hold_j1[i][0])])]):
                                if lev.hamming( j_seqs[k], str(record_reverse.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                                    j_match = jleft_seqs.index(hold_j1[i][0])
                                    # Finds where the start of a full J would be
                                    temp_start_j = hold_j1[i][1] - j_starts[j_match]
                                    found_j_match += 1
                    for i in range(len(hold_j2)):
                        indices = [y for y, x in enumerate(jright_seqs) if x == hold_j2[i][0] ]
                        for k in indices:
                            if len(j_seqs[k]) == len(str(record_reverse.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[jright_seqs.index(hold_j2[i][0])])]):
                                if lev.hamming( j_seqs[k], str(record_reverse.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[k])] ) <= 1:
                                    j_match = jright_seqs.index(hold_j2[i][0])
                                    # Finds where the start of a full J would be
                                    temp_start_j = hold_j2[i][1] - j_starts[j_match] - 6
                                    found_j_match += 1

                if (hold_v and hold_j) or \
                        (hold_v and found_j_match == 1) or \
                        (found_v_match == 1 and hold_j) or \
                        (found_v_match == 1 and found_j_match == 1):
                    
                    f_seq = stemplate.substitute(v = v_match, j = j_match, 
                                del_v = deletions_v, del_j = deletions_j, 
                                nt_insert = str(record_reverse.seq[end_v + 1:start_j]))
                    fields = (v_match, j_match, deletions_v, deletions_j,
                                record_reverse.seq[end_v + 1:start_j])
                    assigned_count += 1
                    found_seq_match = 1
                    print sep.join(map(str, fields))
    if verbose:
        t = time() - t0
        sys.stderr.write('%d sequences were analysed\n' % seq_count)
        sys.stderr.write('%d sequences were successfully assigned\n' % assigned_count)
        sys.stderr.write('%s seconds elapsed\n' % t)
Example #46
import Levenshtein

def apm(motif_list, pattern, max_mismatch):
    # Approximate pattern matching: return the indices of motifs whose Hamming
    # distance from pattern is at most max_mismatch (motifs must match pattern's length).
    list_indices = []
    for i in range(0, len(motif_list)):
        if Levenshtein.hamming(pattern, motif_list[i]) <= max_mismatch:
            list_indices.append(i)
    return list_indices
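
# Example usage of apm (a sketch; the motifs and pattern are invented):
# 'ACGT' matches itself (0 mismatches) and 'ACGA' (1 mismatch) but not 'TTTT'.
print(apm(['ACGT', 'ACGA', 'TTTT'], 'ACGT', 1))  # [0, 1]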
Example #47
        for word, col_num in otsu.cut_to_lines(rotate=False, show=0):
            _word = word
            word = word.resize((48, 48), Image.BICUBIC).convert('1')
            data = ''.join(str(p) for p in word.getdata()).replace('255', '1')
            m5 = md5(data).hexdigest()
            if m5 not in samples:
                # keep the /tmp/cut directory open to make entering characters easier
                path = '/tmp/cut/%s.%s_%s.png' % (line_num, col_num, m5)
                word.save(path)

                min_distance = len(data)
                maybe = None
                for key, value in samples.items():
                    binary_string = value[-2]
                    try:
                        distance = Levenshtein.hamming(binary_string, data)
                    except Exception:
                        # stale sample with a different length; drop it and move on
                        del samples[key]
                        continue
                    if min_distance > distance:
                        maybe = value
                        min_distance = distance
                maychar = maybe[-1]
                print 'maybe:', maychar, min_distance
                char = raw_input('input(press RETURN to accept %s):' % maychar)
                if char == '':
                    char = maychar

                os.remove(path)
                os.system('clear')
                samples[m5] = [word.tostring(), data, char]
                pickle.dump(samples, open(pickle_file, 'wb'))
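
# A minimal sketch of the nearest-sample lookup above (illustrative data only):
# each sample maps a key to [raw_image, binary_pixel_string, label]; the best
# match is the sample whose pixel string has the smallest Hamming distance.
import Levenshtein
demo_samples = {'k1': ['raw1', '0011', 'A'], 'k2': ['raw2', '1100', 'B']}
data = '0010'
best = min(demo_samples.values(), key=lambda v: Levenshtein.hamming(v[-2], data))
print(best[-1])  # 'A' -- one bit from '0011', three bits from '1100'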