def match(short, long):
    """Find the offset at which ``short`` best aligns against ``long``.

    Three families of alignments are scored with ``leve.hamming``:

    * full overlaps: ``short`` lies entirely inside ``long`` (offset >= 0);
    * tail overlaps: ``short`` starts inside ``long`` and runs past its end;
    * head overlaps: ``short`` starts before ``long`` (negative offset).

    For partial overlaps only the ``common`` overlapping characters are
    compared and the mismatch count is scaled to the full length of
    ``short`` as ``ceil(dis * len(short) / common)``. Partial overlaps are
    limited to at least ``len(short) // 2`` characters by the loop bounds.

    Overlaps that would be empty, or that would not fit inside ``long``, are
    skipped; previously they raised ``ZeroDivisionError`` or compared slices
    of different lengths.

    Returns:
        ``(mindis, minpos)``: the smallest score and the first offset that
        reached it. ``(-1, 0)`` when no alignment could be scored.
    """
    n_short = len(short)
    n_long = len(long)
    mindis = -1
    minpos = 0

    # Full overlaps.
    for i in range(n_long - n_short + 1):
        dis = leve.hamming(short, long[i:i + n_short])
        if mindis == -1 or dis < mindis:
            mindis = dis
            minpos = i

    # Tail overlaps: never start before position 0 of ``long``.
    for i in range(max(n_long - n_short + 1, 0), n_long - (n_short // 2) + 1):
        common = n_long - i
        if common < 1:
            # Nothing overlaps (happens when len(short) < 2).
            continue
        # ceil(dis * LEN_SHORT / LEN_COMMON)
        dis = (leve.hamming(short[:common], long[i:i + common]) * n_short
               + common - 1) // common
        if mindis == -1 or dis < mindis:
            mindis = dis
            minpos = i

    # Head overlaps.
    for i in range(-(n_short // 2), 0):
        common = n_short + i
        if common > n_long:
            # ``long`` is too short to cover the overlapping part.
            continue
        # ceil(dis * LEN_SHORT / LEN_COMMON)
        dis = (leve.hamming(short[-i:], long[:common]) * n_short
               + common - 1) // common
        if mindis == -1 or dis < mindis:
            mindis = dis
            minpos = i

    return mindis, minpos
def vanalysis(read):
    """Assign a V tag to ``read`` and locate the end of its V region.

    The full-length tag index ``v_key`` is tried first. Only when it reports
    no hit at all are the half-tag indexes tried (``half1_v_key`` first, then
    ``half2_v_key``); a half-tag hit is accepted when the complete tag is
    within a Hamming distance of 1 of the read at the implied position.
    Outcomes are tallied in the module-level ``counts`` mapping.

    Returns:
        ``(v_match, end_v_v_dels[0], end_v_v_dels[1], v_seq_start)`` when
        ``get_v_deletions`` returns a truthy value, otherwise ``None``.
    """

    def rescue(half_hits, half_seqs, second_half, error_counter):
        """Try to extend half-tag hits into full tags with <= 1 mismatch."""
        for hit in half_hits:
            same_half = [idx for idx, half in enumerate(half_seqs)
                         if half == hit[0]]
            for k in same_half:
                # A second-half hit sits v_half_split bases into the tag.
                start = hit[1] - v_half_split if second_half else hit[1]
                window = read[start:start + len(v_seqs[same_half[0]])]
                if len(v_seqs[k]) != len(window):
                    continue
                if lev.hamming(v_seqs[k],
                               read[start:start + len(v_seqs[k])]) > 1:
                    continue
                counts[error_counter] += 1
                # Where the end of a full V would be.
                temp_end_v = start + jump_to_end_v[k] - 1
                end_v_v_dels = get_v_deletions(read, k, temp_end_v, v_regions)
                if end_v_v_dels:
                    return k, end_v_v_dels[0], end_v_v_dels[1], start
        return None

    full_hits = v_key.findall(read)
    if full_hits:
        if len(full_hits) > 1:
            counts['multiple_v_matches'] += 1
            return None
        first_hit = full_hits[0]
        v_match = v_seqs.index(first_hit[0])  # Assigns V
        v_seq_start = first_hit[1]
        # Where the end of a full V would be.
        temp_end_v = v_seq_start + jump_to_end_v[v_match] - 1
        end_v_v_dels = get_v_deletions(read, v_match, temp_end_v, v_regions)
        if end_v_v_dels:  # the number of deletions has been found
            return v_match, end_v_v_dels[0], end_v_v_dels[1], v_seq_start
        return None

    half1_hits = half1_v_key.findall(read)
    if half1_hits:
        result = rescue(half1_hits, half1_v_seqs, False, 'verr2')
        if result is None:
            counts['foundv1notv2'] += 1
        return result

    half2_hits = half2_v_key.findall(read)
    if half2_hits:
        result = rescue(half2_hits, half2_v_seqs, True, 'verr1')
        if result is None:
            counts['foundv2notv1'] += 1
        return result

    counts['no_vtags_found'] += 1
    return None
def vanalysis(read):
    """Assign a V tag to ``read`` and locate the end of its V region.

    Search order: the full-length index ``v_key``; if it finds nothing, the
    first-half index ``half1_v_key``; if that finds nothing, the second-half
    index ``half2_v_key``. Half-tag hits are accepted when the whole tag is
    within a Hamming distance of 1 of the read. Every outcome is recorded in
    the module-level ``counts`` mapping.

    Returns:
        ``(v_match, end_v_v_dels[0], end_v_v_dels[1], v_seq_start)`` when
        ``get_v_deletions`` returns a truthy value, otherwise ``None``.
    """

    def extend_half_hits(half_hits, half_seqs, is_second_half, tally_key):
        """Return the first half-tag hit that completes into a usable V."""
        for hit in half_hits:
            tag_ids = [pos for pos, half in enumerate(half_seqs)
                       if half == hit[0]]
            for k in tag_ids:
                # Second-half hits start v_half_split bases into the tag.
                if is_second_half:
                    tag_start = hit[1] - v_half_split
                else:
                    tag_start = hit[1]
                expected = read[tag_start:tag_start + len(v_seqs[tag_ids[0]])]
                if len(v_seqs[k]) != len(expected):
                    continue
                candidate = read[tag_start:tag_start + len(v_seqs[k])]
                if lev.hamming(v_seqs[k], candidate) > 1:
                    continue
                counts[tally_key] += 1
                # Where the end of a full V would be.
                temp_end_v = tag_start + jump_to_end_v[k] - 1
                end_v_v_dels = get_v_deletions(read, k, temp_end_v, v_regions)
                if end_v_v_dels:
                    return k, end_v_v_dels[0], end_v_v_dels[1], tag_start
        return None

    hold_v = v_key.findall(read)
    if hold_v:
        if len(hold_v) > 1:
            counts['multiple_v_matches'] += 1
            return None
        tag_hit = hold_v[0]
        v_match = v_seqs.index(tag_hit[0])  # Assigns V
        v_seq_start = tag_hit[1]
        # Where the end of a full V would be.
        temp_end_v = v_seq_start + jump_to_end_v[v_match] - 1
        end_v_v_dels = get_v_deletions(read, v_match, temp_end_v, v_regions)
        if not end_v_v_dels:
            return None
        return v_match, end_v_v_dels[0], end_v_v_dels[1], v_seq_start

    hold_v1 = half1_v_key.findall(read)
    if hold_v1:
        found = extend_half_hits(hold_v1, half1_v_seqs, False, 'verr2')
        if found is None:
            counts['foundv1notv2'] += 1
        return found

    hold_v2 = half2_v_key.findall(read)
    if hold_v2:
        found = extend_half_hits(hold_v2, half2_v_seqs, True, 'verr1')
        if found is None:
            counts['foundv2notv1'] += 1
        return found

    counts['no_vtags_found'] += 1
    return None
def janalysis(read, end_of_v):
    """Assign a J tag to ``read`` and locate the start of its J region.

    The full-length tag index ``j_key`` is tried first. Only when it reports
    no hit are the half-tag indexes tried (``half1_j_key`` first, then
    ``half2_j_key``); a half-tag hit is accepted when the complete tag is
    within a Hamming distance of 1 of the read at the implied position.
    Outcomes are tallied in the module-level ``counts`` mapping.

    Args:
        read: the read sequence to search.
        end_of_v: forwarded unchanged to ``get_j_deletions``.

    Returns:
        ``(j_match, start_j_j_dels[0], start_j_j_dels[1], j_seq_end)`` when
        ``get_j_deletions`` returns a truthy value, otherwise ``None``.
    """

    def rescue(half_hits, half_seqs, second_half, error_counter):
        """Try to extend half-tag hits into full tags with <= 1 mismatch."""
        for hit in half_hits:
            same_half = [idx for idx, half in enumerate(half_seqs)
                         if half == hit[0]]
            for k in same_half:
                if second_half:
                    # A second-half hit sits j_half_split bases into the tag.
                    start = hit[1] - j_half_split
                    j_seq_end = hit[1] + len(hit[0])
                else:
                    start = hit[1]
                    j_seq_end = hit[1] + len(hit[0]) + j_half_split
                window = read[start:start + len(j_seqs[same_half[0]])]
                if len(j_seqs[k]) != len(window):
                    continue
                if lev.hamming(j_seqs[k],
                               read[start:start + len(j_seqs[k])]) > 1:
                    continue
                counts[error_counter] += 1
                # Where the start of a full J would be.
                temp_start_j = start - jump_to_start_j[k]
                start_j_j_dels = get_j_deletions(
                    read, k, temp_start_j, j_regions, end_of_v)
                if start_j_j_dels:
                    return k, start_j_j_dels[0], start_j_j_dels[1], j_seq_end
        return None

    full_hits = j_key.findall(read)
    if full_hits:
        if len(full_hits) > 1:
            counts['multiple_j_matches'] += 1
            return None
        first_hit = full_hits[0]
        j_match = j_seqs.index(first_hit[0])  # Assigns J
        # Where the start of a full J would be.
        temp_start_j = first_hit[1] - jump_to_start_j[j_match]
        j_seq_end = first_hit[1] + len(first_hit[0])
        start_j_j_dels = get_j_deletions(
            read, j_match, temp_start_j, j_regions, end_of_v)
        if start_j_j_dels:  # the number of deletions has been found
            return j_match, start_j_j_dels[0], start_j_j_dels[1], j_seq_end
        return None

    half1_hits = half1_j_key.findall(read)
    if half1_hits:
        result = rescue(half1_hits, half1_j_seqs, False, 'jerr2')
        if result is None:
            counts['foundj1notj2'] += 1
        return result

    half2_hits = half2_j_key.findall(read)
    if half2_hits:
        result = rescue(half2_hits, half2_j_seqs, True, 'jerr1')
        if result is None:
            # This used to bump the V counter 'foundv2notv1' (copy-paste
            # slip), inflating the V statistics with J failures. ``.get``
            # keeps this safe if ``counts`` has no entry for the key yet.
            counts['foundj2notj1'] = counts.get('foundj2notj1', 0) + 1
        return result

    counts['no_j_assigned'] += 1
    return None
def janalysis(read):
    """Assign a J tag to ``read`` and locate the start of its J region.

    The full-length tag index ``j_key`` is tried first. Only when it reports
    no hit are the half-tag indexes tried (``half1_j_key`` first, then
    ``half2_j_key``); a half-tag hit is accepted when the complete tag is
    within a Hamming distance of 1 of the read at the implied position.
    Outcomes are tallied in the module-level ``counts`` mapping.

    Returns:
        ``(j_match, start_j_j_dels[0], start_j_j_dels[1], j_seq_end)`` when
        ``get_j_deletions`` returns a truthy value, otherwise ``None``.
    """

    def rescue(half_hits, half_seqs, second_half, error_counter):
        """Try to extend half-tag hits into full tags with <= 1 mismatch."""
        for hit in half_hits:
            same_half = [idx for idx, half in enumerate(half_seqs)
                         if half == hit[0]]
            for k in same_half:
                if second_half:
                    # A second-half hit sits j_half_split bases into the tag.
                    start = hit[1] - j_half_split
                    j_seq_end = hit[1] + len(hit[0])
                else:
                    start = hit[1]
                    j_seq_end = hit[1] + len(hit[0]) + j_half_split
                window = read[start:start + len(j_seqs[same_half[0]])]
                if len(j_seqs[k]) != len(window):
                    continue
                if lev.hamming(j_seqs[k],
                               read[start:start + len(j_seqs[k])]) > 1:
                    continue
                counts[error_counter] += 1
                # Where the start of a full J would be.
                temp_start_j = start - jump_to_start_j[k]
                start_j_j_dels = get_j_deletions(
                    read, k, temp_start_j, j_regions)
                if start_j_j_dels:
                    return k, start_j_j_dels[0], start_j_j_dels[1], j_seq_end
        return None

    full_hits = j_key.findall(read)
    if full_hits:
        if len(full_hits) > 1:
            counts['multiple_j_matches'] += 1
            return None
        first_hit = full_hits[0]
        j_match = j_seqs.index(first_hit[0])  # Assigns J
        # Where the start of a full J would be.
        temp_start_j = first_hit[1] - jump_to_start_j[j_match]
        j_seq_end = first_hit[1] + len(first_hit[0])
        start_j_j_dels = get_j_deletions(
            read, j_match, temp_start_j, j_regions)
        if start_j_j_dels:  # the number of deletions has been found
            return j_match, start_j_j_dels[0], start_j_j_dels[1], j_seq_end
        return None

    half1_hits = half1_j_key.findall(read)
    if half1_hits:
        result = rescue(half1_hits, half1_j_seqs, False, 'jerr2')
        if result is None:
            counts['foundj1notj2'] += 1
        return result

    half2_hits = half2_j_key.findall(read)
    if half2_hits:
        result = rescue(half2_hits, half2_j_seqs, True, 'jerr1')
        if result is None:
            # This used to bump the V counter 'foundv2notv1' (copy-paste
            # slip), inflating the V statistics with J failures. ``.get``
            # keeps this safe if ``counts`` has no entry for the key yet.
            counts['foundj2notj1'] = counts.get('foundj2notj1', 0) + 1
        return result

    counts['no_j_assigned'] += 1
    return None
def test():
    """Print sample distances and wall-clock timings for the distance helpers.

    Exercises ``levenshtein_recur`` on a few fixed word pairs, then times
    ``Levenshtein.distance`` against ``levenshtein_recur`` on two random
    10-character strings, and ``Levenshtein.hamming`` against ``hammingDist``
    on two random 100-character strings. Everything is written to stdout.
    """
    levenshtein = levenshtein_recur

    def random_text(length, start=0, stop=565):
        # Random code points in [start, stop), joined into one string.
        return ''.join(
            [chr(random.randrange(start, stop)) for _ in range(length)])

    def timed(caption, label, func, left, right):
        # Print the result of func(left, right) and how long the call took.
        begin = time.time()
        print(caption, func(left, right))
        end = time.time()
        print(f"Total runtime of the {label} is {end - begin}")

    list_1 = random_text(10)
    list_2 = random_text(10)
    print(list_1)
    print(list_2)

    word_pairs = (
        ('cat', 'chello'),
        ('', 'chello'),
        ('cat', ''),
        ('cat', 'chello'),
        ('cat', 'cate'),
        ('cat', 'ca'),
        ('cat', 'cad'),
    )
    for left, right in word_pairs:
        print(f"distance between {left!r}, {right!r}",
              levenshtein(left, right))

    timed("distance between list_1, list_2", "Levenshtein.distance",
          Levenshtein.distance, list_1, list_2)
    timed("distance between list_1, list_2", "levenshtein_recur",
          levenshtein_recur, list_1, list_2)

    print("hamming distance between 'cat', 'cad'", hammingDist('cat', 'cad'))
    print("hamming distance between 'cat', 'cad'",
          Levenshtein.hamming('cat', 'cad'))

    list_1 = random_text(100)
    list_2 = random_text(100)
    print(list_1)
    print(list_2)

    timed("hamming distance between list_1, list_2", "Levenshtein.hamming",
          Levenshtein.hamming, list_1, list_2)
    timed("hamming distance between list_1, list_2", "hammingDist",
          hammingDist, list_1, list_2)
def simple(len_motifs_sup, l_max, L_Left, L_UP, epsilon, theta, top_k, output_filename):
    """Consolidate motif supports per length and write the top motifs to a file.

    For every length ``l`` in ``L_Left..L_UP``:

    1. every ``l``-mer over ``'agct'`` missing from ``len_motifs_sup[l]`` is
       added there with support 0 (the input mapping is modified in place),
       and a Laplace-perturbed copy of each support is stored in a local
       deep copy;
    2. a random pivot sequence is drawn and all sequences are bucketed by
       their Hamming distance to it;
    3. each sequence's consolidated support is the sum of the rounded
       supports of all sequences within Hamming distance ``deta`` of it; only
       buckets ``i - deta .. i + deta`` can hold such neighbours, so only
       those are scanned;
    4. ``TopN`` folds the consolidated supports into the running result.

    Finally ``motif:support`` lines are written to ``output_filename`` in
    descending order of support.

    Args:
        len_motifs_sup: mapping ``length -> {sequence: support}``.
        l_max, L_Left, L_UP, epsilon: used to derive the Laplace scale.
        theta: unused by this function.
        top_k: forwarded to ``TopN``.
        output_filename: path of the text file to write.

    Returns:
        Mapping ``length -> {sequence: consolidated support}``.

    NOTE(review): ``deta`` is read from module scope. The Laplace-perturbed
    supports are computed but never read afterwards -- consolidation uses the
    raw ``len_motifs_sup`` values; confirm that this is intended.
    """
    l = L_Left
    N = {}
    len_motifs_laplacesup = copy.deepcopy(len_motifs_sup)
    len_motifs_consolidatesup = {}
    alphabet = 'agct'

    while l <= L_UP:
        scale = ((l_max - l + 1) * (L_UP - L_Left + 1)) / epsilon
        Seq_l = len_motifs_sup[l]
        for letters in itertools.product(alphabet, repeat=l):
            seql = ''.join(letters)
            if seql in Seq_l:
                len_motifs_laplacesup[l][seql] = (
                    Seq_l[seql] + np.random.laplace(0, scale, 1)[0])
            else:
                Seq_l[seql] = 0
                len_motifs_laplacesup[l][seql] = np.random.laplace(0, scale, 1)[0]

        # random.randint(0, len) could return len itself and index past the
        # end; random.choice always picks a valid element.
        pivot = random.choice(list(Seq_l))

        # Bucket every sequence by its Hamming distance to the pivot.
        buckets = {}
        for seq in Seq_l:
            buckets.setdefault(Levenshtein.hamming(seq, pivot), []).append(seq)

        consolidated = len_motifs_consolidatesup.setdefault(l, {})
        for i, members in buckets.items():
            # The low-distance branch used to stop at bucket 1 (digit one
            # instead of the length ``l``), silently dropping neighbours that
            # live in buckets 2 .. i + deta.
            first_bucket = max(i - deta, 0)
            last_bucket = min(i + deta, l)
            for each_seq1 in members:
                total = 0
                for j in range(first_bucket, last_bucket + 1):
                    for each_seq2 in buckets.get(j, ()):
                        if Levenshtein.hamming(each_seq1, each_seq2) <= deta:
                            total = (round(float(total))
                                     + round(float(Seq_l[each_seq2])))
                consolidated[each_seq1] = total

        N = TopN(N, len_motifs_consolidatesup[l], top_k)
        l += 1

    # Write in sorted order directly: round-tripping through dict() loses the
    # ordering on interpreters whose dicts are unordered, and the file handle
    # was never closed.
    with open(output_filename, 'w') as f:
        for motif, support in sorted(N.items(), key=lambda t: t[1], reverse=True):
            f.write(motif + ':' + str(support) + '\n')
    return len_motifs_consolidatesup
def j_analysis(rc, hold_j, j_seqs, half1_j_seqs, half2_j_seqs, jump_to_start_j,
               j_regions, half1_j_key, half2_j_key):
    """Assign a J tag to ``rc`` from full-tag hits or, failing that, half tags.

    When ``hold_j`` is non-empty its first hit decides the tag and the match
    counts as found only if ``get_j_deletions`` returns a truthy value. When
    ``hold_j`` is empty, both half-tag indexes are searched and every tag
    that lies within a Hamming distance of 1 of the read increments the
    found counter; the last such tag wins.

    Args:
        rc: read sequence.
        hold_j: hits of the full-length J index, each indexable as
            ``(tag, position)``.
        j_seqs, half1_j_seqs, half2_j_seqs: full tags and their two halves,
            index-aligned.
        jump_to_start_j: per-tag offset subtracted from the hit position.
        j_regions: forwarded to ``get_j_deletions``.
        half1_j_key, half2_j_key: half-tag indexes exposing ``findall``.

    Returns:
        ``(j_match, temp_start_j, found_j_match)`` when a tag was assigned,
        otherwise ``[None, None, None]``.

    NOTE(review): ``j_half_split`` is read from module scope.
    """
    j_match = None
    temp_start_j = None
    # Initialised up front: it used to be unbound (UnboundLocalError on
    # return) when a full tag was found but its deletions were not.
    found_j_match = 0

    if hold_j:
        j_match = j_seqs.index(hold_j[0][0])  # Assigns J
        # Where the start of a full J would be.
        temp_start_j = hold_j[0][1] - jump_to_start_j[j_match]
        if get_j_deletions(rc, j_match, temp_start_j, j_regions):
            found_j_match = 1
    else:
        searches = (
            (half1_j_key.findall(rc), half1_j_seqs, False),
            (half2_j_key.findall(rc), half2_j_seqs, True),
        )
        for half_hits, half_seqs, second_half in searches:
            for hit in half_hits:
                same_half = [idx for idx, half in enumerate(half_seqs)
                             if half == hit[0]]
                for k in same_half:
                    # A second-half hit sits j_half_split bases into the tag.
                    start = hit[1] - j_half_split if second_half else hit[1]
                    window = rc[start:start + len(j_seqs[same_half[0]])]
                    if len(j_seqs[k]) != len(window):
                        continue
                    if lev.hamming(j_seqs[k],
                                   rc[start:start + len(j_seqs[k])]) <= 1:
                        j_match = k
                        # Where the start of a full J would be.
                        temp_start_j = start - jump_to_start_j[j_match]
                        found_j_match += 1

    if j_match is not None:
        return j_match, temp_start_j, found_j_match
    return [None, None, None]
def trim5(seq):
    """Report whether a 5' adapter (or its tail) ends somewhere in ``seq``.

    Every end position from ``the_minimum_length_of_adapter5`` up to, but not
    including, ``len(seq)`` is tried. If the prefix ending there is shorter
    than ``adapter5`` it is compared with the adapter's last characters;
    otherwise the full-adapter-length window ending there is compared with
    the whole adapter. One mismatch is tolerated.

    Returns:
        ``True`` at the first position that matches, ``False`` otherwise.

    NOTE(review): ``adapter5`` and ``the_minimum_length_of_adapter5`` are
    read from module scope.
    """
    adapter_length = len(adapter5)
    # range() excludes its upper bound, so len(seq) itself is never tried.
    for end in range(the_minimum_length_of_adapter5, len(seq)):
        prefix = seq[:end]
        if len(prefix) < adapter_length:
            read_piece = prefix
            adapter_piece = adapter5[-end:]
        else:
            read_piece = seq[end - adapter_length:end]
            adapter_piece = adapter5
        if Levenshtein.hamming(read_piece, adapter_piece) <= 1:
            return True
    return False
def trim3(seq):
    """Cut ``seq`` at the left-most position where the 3' adapter matches.

    Start positions are scanned from
    ``len(seq) - the_minimum_length_of_adapter3`` down to 0. At each one the
    remaining suffix (or, when it is long enough, its first
    ``len(adapter3)`` characters) is compared with the equally long prefix
    of ``adapter3``; up to two mismatches are tolerated. Each match
    overwrites the previous one, so the smallest matching start wins.

    Returns:
        ``seq`` up to the winning start position, or ``''`` when no position
        matched.

    NOTE(review): ``adapter3`` and ``the_minimum_length_of_adapter3`` are
    read from module scope.
    """
    trimmed = ''
    adapter_length = len(adapter3)
    start = len(seq) - the_minimum_length_of_adapter3
    while start >= 0:
        suffix = seq[start:]
        if len(suffix) < adapter_length:
            read_piece = suffix
            adapter_piece = adapter3[:len(suffix)]
        else:
            read_piece = seq[start:start + adapter_length]
            adapter_piece = adapter3
        if Levenshtein.hamming(read_piece, adapter_piece) <= 2:
            trimmed = seq[:start]
        start -= 1
    return trimmed
def find_best_match(TAG_seq, tags, maximum_distance):
    """
    Find the best match from the list of tags.

    Compares the Hamming distance between each tag and the equally long
    prefix of the trimmed sequence. A perfect match is returned immediately;
    otherwise the tag with the smallest distance not exceeding
    ``maximum_distance`` wins (the first such tag on ties). Tags longer than
    the sequence cannot be compared and are skipped.

    Previously the first tag within ``maximum_distance`` was returned
    straight away, so a closer tag later in ``tags`` was never considered.

    Args:
        TAG_seq (string): Sequence from R1 already start trimmed
        tags (dict): A dictionary with the TAGs as keys and TAG Names as values.
        maximum_distance (int): Maximum distance given by the user.

    Returns:
        best_match (string): The TAG name that will be used for counting,
            or 'unmapped' when no tag is close enough.
    """
    best_match = 'unmapped'
    best_score = None
    for tag, name in tags.items():
        candidate = TAG_seq[:len(tag)]
        if len(candidate) != len(tag):
            # Read shorter than the tag: hamming() needs equal lengths.
            continue
        score = Levenshtein.hamming(tag, candidate)
        if score == 0:
            # Best possible match
            return name
        if score <= maximum_distance and (best_score is None or score < best_score):
            best_score = score
            best_match = name
    return best_match
def hamming_distance(first, second):
    '''
    Return the Hamming distance between the two arguments.

    Thin wrapper that delegates to ``Levenshtein.hamming``.
    '''
    distance = Levenshtein.hamming(first, second)
    return distance
def check_clusters(seq_list, all_clusters, cutoff, unique):
    """Sanity-check a clustering result and print every inconsistency found.

    Four checks are run, each reporting problems with ``print``:

    1. (only if ``unique``) every sequence occurs once in ``seq_list``;
    2. every sequence of ``seq_list`` occurs in ``all_clusters`` and vice
       versa (and, if ``unique``, at most once);
    3. re-running ``get_clusters`` on any multi-member cluster yields a
       single cluster;
    4. no two sequences from different clusters lie within the cutoff
       distance of each other.

    Args:
        seq_list: iterable of records whose first element is the sequence.
        all_clusters: list of ``(cluster, max_len, min_len)`` where
            ``cluster`` is a list of ``(sequence, id)`` pairs.
        cutoff: fraction of the shorter sequence length used as the
            distance limit.
        unique: whether sequences are expected to be unique.

    Returns:
        None.

    NOTE(review): the distance metric is selected by a module-level
    ``hamming`` flag, not by a parameter.
    """
    print('Checking for unique input sequences')
    # Occurrences in seq_list. The lookup used to be keyed on the whole
    # record instead of the sequence, so every count stayed at 1 and
    # duplicates were never reported.
    input_counts = {}
    for record in seq_list:
        input_counts[record[0]] = input_counts.get(record[0], 0) + 1

    # Sequences in seq_list are unique
    if unique:
        for k, v in input_counts.items():
            if v != 1:
                print('Error: sequence %s appears in seq_list more than once.' % k)

    # There is a one-to-one correspondence between sequences in seq_list and sequences in all_clusters
    print('Checking for one-to-one correspondence between input and output sequences.')
    cluster_counts = dict.fromkeys(input_counts, 0)
    for cluster, max_len, min_len in all_clusters:
        for seq, seq_id in cluster:
            if seq not in cluster_counts:
                print('Error: sequence %s is in all_clusters but not in seq_list.' % seq)
            else:
                cluster_counts[seq] += 1

    for k, v in cluster_counts.items():
        if v < 1:
            print('Error: sequence %s appears in seq_list but not in all_clusters.' % k)
        elif unique and v > 1:
            print('Error: sequence %s appears in all_clusters more than once.' % k)

    # The cluster forms a connected network with each sequence in a cluster having a nearest neighbour within the cutoff distance
    print('Checking cluster membership.')
    t0 = time.time()
    checked = 0
    for cluster, max_len, min_len in all_clusters:
        # push each cluster through get_clusters and check it results in a single cluster
        # one could argue that this isn't strictly an independent check, but the underlying algorithm is in scipy
        # this does check that merging across chunks has happened correctly
        if len(cluster) > 1:
            res = get_clusters(cluster, cutoff)
            if len(res) != 1:
                # The format string has three placeholders; only two values
                # used to be supplied, which raised TypeError.
                print('Error: cluster with sequence %s (id %s) is partitioned into %d clusters by further application of get_cluster.'
                      % (cluster[0][0], cluster[0][1], len(res)))
        checked += 1
        t1 = time.time()
        if t1 - t0 > 10:
            # Progress report roughly every 10 seconds.
            print('Checking cluster %d\n' % checked)
            t0 = time.time()

    # No clusters are mergeable
    print('Checking that clusters are distinct.')
    for i in range(len(all_clusters)):
        c1 = all_clusters[i][0]
        for c2, max_len, min_len in all_clusters[i + 1:]:
            for s1, i1 in c1:
                for s2, i2 in c2:
                    cut = int(cutoff * min(len(s1), len(s2)))
                    if hamming:
                        too_close = len(s1) == len(s2) and ld.hamming(s1, s2) <= cut
                    else:
                        too_close = ld.distance(s1, s2, cut) <= cut
                    if too_close:
                        print('Error: sequences %s (id %s) and %s (id %s) are in different clusters but are within the cutoff distance.' % (s1, i1, s2, i2))
def hamming1_align(args):
    """Find every place where a peptide matches a protein with one mismatch.

    ``I`` and ``L`` are treated as the same residue. Candidate positions come
    from the pigeonhole principle: if the Hamming distance is 1, the left or
    the right half of the peptide must match exactly. The Hamming distance is
    then computed at each candidate and positions where it equals 1 are kept.

    The argument is a single ``(peptide, protein_list)`` tuple, as before;
    the tuple is now unpacked in the body because tuple parameters in the
    signature are a syntax error on Python 3. Candidate search uses a
    zero-width lookahead on the escaped half so overlapping occurrences
    (e.g. ``AA`` inside ``AAA``) are not missed.

    Args:
        args: ``(peptide, protein_list)`` where each protein is a mapping
            with a ``'seq'`` entry.

    Returns:
        ``(peptide, match_list)``; each match is
        ``{'protein': protein, 'match_index': index}``.
    """
    peptide, protein_list = args
    # I and L are considered the same in this alignment
    query = peptide.replace('I', 'L')
    query_length = len(query)
    half = query_length // 2
    left_pattern = re.compile('(?=' + re.escape(query[:half]) + ')')
    right_pattern = re.compile('(?=' + re.escape(query[half:]) + ')')

    match_list = []
    for protein in protein_list:
        subject = protein['seq'].replace('I', 'L')
        last_start = len(subject) - query_length
        candidate_index = [m.start() for m in left_pattern.finditer(subject)]
        # A right-half hit implies the peptide starts ``half`` residues earlier.
        candidate_index += [m.start() - half
                            for m in right_pattern.finditer(subject)]
        for index in candidate_index:
            if index < 0 or index > last_start:
                continue
            if Levenshtein.hamming(query, subject[index:index + query_length]) == 1:
                match_list.append({'protein': protein, 'match_index': index})
    return peptide, match_list
def apm(sequence, pattern, max_mismatch):
    """Collect every window of ``sequence`` that approximately matches ``pattern``.

    Each window of ``len(pattern)`` characters is compared with ``pattern``
    and kept when their Hamming distance is at most ``max_mismatch``.

    Returns:
        The matching windows, in order of position (duplicates included).
    """
    width = len(pattern)
    windows = (sequence[start:start + width]
               for start in range(0, len(sequence) - width + 1))
    return [window for window in windows
            if Levenshtein.hamming(pattern, window) <= max_mismatch]
def calculateD(example):
    '''
    Compute several similarity scores between ``example`` and each template.

    For every entry of the module-level ``request_template`` a dict is built
    holding the reciprocal Levenshtein distance, the Levenshtein ratio, the
    Jaro and Jaro-Winkler similarities, the reciprocal Hamming distance
    (0 when ``Levenshtein.hamming`` raises ``ValueError``) and their plain
    average under ``'sum'`` -- the weighting is deliberately simple.

    :param example: the request string to compare
    :return: a list with one score dict per template; if a template's
        ``'request_data'`` equals ``example``, a single-element list with
        every score set to 1 is returned as soon as it is encountered
    '''
    scores = []
    for template in request_template:
        reference = template['request_data']
        if example == reference:
            # Exact match: stop immediately with perfect scores.
            return [{'hamming': 1, 'distance': 1, 'Leven': 1,'jaro':1,'jaro_winkler':1,'function':template['function'],'sum':1}]

        inverse_distance = 1/Levenshtein.distance(example, reference)
        ratio = Levenshtein.ratio(example, reference)
        jaro = Levenshtein.jaro(example, reference)
        jaro_winkler = Levenshtein.jaro_winkler(example, reference)
        try:
            inverse_hamming = 1/Levenshtein.hamming(example, reference)
        except ValueError:
            inverse_hamming = 0
        average = (inverse_hamming+inverse_distance+ratio+jaro+jaro_winkler)/5
        scores.append({
            'hamming': inverse_hamming,
            'distance': inverse_distance,
            'Leven': ratio,
            'jaro': jaro,
            'jaro_winkler': jaro_winkler,
            'function': template['function'],
            'sum': average,
        })
    return scores
def match_head(self, line, keywords):
    """Match the start of ``line`` against ``keywords`` allowing one mismatch.

    Each keyword is compared with ``line`` over their common prefix length
    using the Hamming distance. The keyword with the smallest distance is
    chosen if that distance is at most 1. When the best distance is exactly
    1 and several keywords share it, the one with the longest compared
    prefix wins. A warning is logged whenever the chosen keyword is not an
    exact match.

    Args:
        line: text whose beginning is examined.
        keywords: sequence of candidate keywords.

    Returns:
        The chosen keyword, or ``None`` when nothing is close enough or
        ``keywords`` is empty (an empty list used to make ``argmin`` raise
        ``ValueError``).
    """
    if not keywords:
        return None

    dist_list = []
    len_list = []
    for target in keywords:
        t_len = min(len(target), len(line))
        dist_list.append(Levenshtein.hamming(line[:t_len], target[:t_len]))
        len_list.append(t_len)

    npdist = np.array(dist_list)
    min_idx = npdist.argmin()
    if dist_list[min_idx] > 1:
        return None

    if dist_list[min_idx] == 1:
        # Break ties between one-mismatch candidates by compared length.
        tied = np.where(npdist == dist_list[min_idx])[0]
        if tied.size > 1:
            min_idx = tied[np.array(len_list)[tied].argmax()]

    if dist_list[min_idx] > 0:
        logging.warning(
            f"将[{line}]开头纠正为[{keywords[min_idx]}],距离{dist_list[min_idx]}"
        )
    return keywords[min_idx]
def find_best_match_shift(TAG_seq, tags, maximum_distance):
    """
    Find the best match from the list of tags with sliding window.

    For every shift at which the longest tag still fits inside ``TAG_seq``,
    each tag is compared (Hamming distance) with the equally long window
    starting at that shift. A perfect match is returned immediately;
    otherwise the tag with the smallest distance not exceeding
    ``maximum_distance`` wins (the first one found on ties).

    Fixes over the previous version: the first acceptable tag was returned
    straight away instead of the best one; the last valid shift was never
    examined; an empty ``tags`` raised ``ValueError``.

    Args:
        TAG_seq (string): Sequence from R1 already start trimmed
        tags (dict): A dictionary with the TAGs as keys and TAG Names as values.
        maximum_distance (int): Maximum distance given by the user.

    Returns:
        best_match (string): The TAG name that will be used for counting,
            or "unmapped" when no tag is close enough.
    """
    best_match = "unmapped"
    if not tags:
        return best_match

    best_score = None
    longest_tag = max(len(tag) for tag in tags)
    # +1 so the window that ends exactly at the end of TAG_seq is included.
    for shift in range(len(TAG_seq) - longest_tag + 1):
        for tag, name in tags.items():
            score = Levenshtein.hamming(tag, TAG_seq[shift:len(tag) + shift])
            if score == 0:
                # Best possible match
                return name
            if score <= maximum_distance and (best_score is None or score < best_score):
                best_score = score
                best_match = name
    return best_match
def superhamming(inpA, inpB, max_cutoff):
    """Smallest Hamming distance between the inputs over a set of shifts.

    The shorter input is padded with ``'.'`` to every placement inside a
    frame of ``len(longer) + max_cutoff`` characters; the longer input is
    padded likewise to the same frame width. All pairs of padded strings are
    compared with ``Levenshtein.hamming`` and the smallest value is kept.

    Returns:
        The smallest distance found, capped at 100 (also the result when
        there is nothing to compare).
    """
    if len(inpA) > len(inpB):
        shorter, longer = inpB, inpA
    else:
        shorter, longer = inpA, inpB

    top_buffer = len(longer) - len(shorter) + max_cutoff
    bot_buffer = max_cutoff

    padded_short = ["." * lead + shorter + "." * (top_buffer - lead)
                    for lead in range(top_buffer + 1)]
    padded_long = ["." * lead + longer + "." * (bot_buffer - lead)
                   for lead in range(bot_buffer + 1)]

    best_ham = 100
    for top_string in padded_short:
        for bot_string in padded_long:
            best_ham = min(best_ham, Levenshtein.hamming(top_string, bot_string))
    return best_ham
def get_exp_mismatch_matrix(k, _lambda):
    """
    Compute the mismatch mixing matrix for A(k) with an _lambda-exponentially
    decaying mixing coefficient in the Hamming distance (number of character
    mismatches).

    Eg: 'AAAA' and 'AABB' have a Hamming distance of 2, thus a mixing
    coefficient of _lambda**2

    PARAMETERS:
        - k: the length of strings in the alphabet (passed to ``get_words``)
        - _lambda: the exponential parameter of the decay per mismatch

    RETURNS:
        - square symmetric matrix with one row/column per word returned by
          ``get_words(k)`` and ones on the diagonal
    """
    words = get_words(k)
    size = len(words)
    # Start from the identity: every word has coefficient 1 with itself.
    mixing = np.eye(size)
    for row in range(size):
        for col in range(row + 1, size):
            coefficient = _lambda ** Levenshtein.hamming(words[row], words[col])
            mixing[row, col] = coefficient
            mixing[col, row] = coefficient
    return mixing
def test_cell_distances(whitelist, collapsing_threshold):
    """Lower the collapsing threshold until no two barcodes are within it.

    The threshold is tested against every pair of whitelist barcodes. As
    soon as one pair has a Hamming distance less than or equal to the
    threshold, the threshold is reduced by 1 and the test starts over.
    Progress is printed to stdout.

    Args:
        whitelist (set): Whitelist barcode set
        collapsing_threshold (int): Value of threshold

    Returns:
        collapsing_threshold (int): Valid threshold
    """
    while True:
        print('Testing cell barcode collapsing threshold of {}'.format(
            collapsing_threshold))
        too_high = any(
            Levenshtein.hamming(first, second) <= collapsing_threshold
            for first, second in combinations(whitelist, 2))
        if not too_high:
            break
        collapsing_threshold -= 1
        print('Value is too high, reducing it by 1')
    print('Using {} for cell barcode collapsing threshold'.format(
        collapsing_threshold))
    return collapsing_threshold
def match_short_data(long_data, short_data):
    """Locate the best full-overlap placement of a short read in a long one.

    Every window of ``long_data['s']`` with the length of ``short_data['s']``
    is compared with the short sequence using ``leve.hamming``; the first
    window with the smallest distance wins. Partial (head/tail) overlaps are
    not considered.

    Returns:
        A dict with the short read's ``'name'`` and ``'s'``, the winning
        offset under ``'pos'`` and its distance under ``'dis'``. When the
        long sequence is shorter than the short one, ``'pos'`` is 0 and
        ``'dis'`` is -1.
    """
    long = long_data['s']
    short = short_data['s']
    width = len(short)

    best_dis = -1
    best_pos = 0
    distances = [leve.hamming(short, long[start:start + width])
                 for start in range(len(long) - width + 1)]
    if distances:
        best_dis = min(distances)
        # index() returns the first occurrence, i.e. the left-most window.
        best_pos = distances.index(best_dis)

    return {
        'name': short_data['name'],
        'pos': best_pos,
        'dis': best_dis,
        's': short_data['s'],
    }
def check_index_distance(indexes):
    """Write the pairwise Hamming distances between indexes to stderr.

    After a header line, one line is written per index: the index itself
    followed by its distance to every index (itself included), separated by
    single spaces.

    The Python 2 ``print >> sys.stderr`` statements were replaced by
    ``sys.stderr.write``: on Python 3 the old form raises ``TypeError``,
    while the text produced here is the same on both versions.
    """
    sys.stderr.write("# Check hamming distance for indexes:\n")
    for index1 in indexes:
        fields = [str(index1)]
        fields.extend(str(lv.hamming(index1, index2)) for index2 in indexes)
        sys.stderr.write(" ".join(fields) + "\n")
def apm(sequence, pattern, max_mismatch):
    """Approximate pattern matching by sliding window.

    Slides a window of ``len(pattern)`` characters over ``sequence`` and
    keeps each window whose Hamming distance to ``pattern`` does not exceed
    ``max_mismatch``.

    Returns:
        List of matching windows in positional order (duplicates included).
    """
    motif_size = len(pattern)
    hits = []
    last_start = len(sequence) - motif_size
    start = 0
    while start <= last_start:
        window = sequence[start:start + motif_size]
        if Levenshtein.hamming(pattern, window) <= max_mismatch:
            hits.append(window)
        start += 1
    return hits
def loop_formacth3(str3, longstr):
    """Return the 8-character window of ``longstr`` closest to ``str3``.

    Windows starting at 0 up to (but not including) ``len(longstr) - 8`` are
    scored with ``Levenshtein.hamming``; identical windows are scored once.

    Returns:
        ``(window, distance)`` for the smallest distance.

    Raises:
        IndexError: when ``longstr`` is too short to provide any window.
    """
    scores = {}
    for start in range(0, len(longstr) - 8):
        window = longstr[start:(start + 8)]
        scores[window] = Levenshtein.hamming(str3, window)
    ranked = sorted(scores.items(), key=lambda item: item[1])
    return ranked[0]
def loop_formacth2(str2, longstr):
    """Return the 33-character window of ``longstr`` closest to ``str2``.

    Windows starting at positions 10 through 116 are scored with
    ``Levenshtein.hamming``; identical windows are scored once.

    Returns:
        ``(window, distance)`` for the smallest distance.
    """
    scores = {}
    for start in range(10, 117):
        window = longstr[start:(start + 33)]
        scores[window] = Levenshtein.hamming(str2, window)
    ranked = sorted(scores.items(), key=lambda item: item[1])
    return ranked[0]
def get_dists(x):
    """Compute pairwise distances for a band of rows of the distance matrix.

    Args:
        x: ``(lowrow, highrow, seq_list, hamming)``. Rows ``lowrow`` up to
            (not including) ``highrow`` are processed; each row ``i`` is
            paired with every later entry ``j > i`` of ``seq_list``. The
            sequence is the first element of each ``seq_list`` entry.
            ``hamming`` selects ``ld.hamming`` when truthy, ``ld.distance``
            otherwise.

    Returns:
        Flat list of distances in row-major order.
    """
    lowrow, highrow, seq_list, hamming = x
    pairs = ((i, j)
             for i in range(lowrow, highrow)
             for j in range(i + 1, len(seq_list)))
    if hamming:
        return [ld.hamming(seq_list[i][0], seq_list[j][0]) for i, j in pairs]
    return [ld.distance(seq_list[i][0], seq_list[j][0]) for i, j in pairs]
def loop_formacth1(str1, longstr):
    """Return the 34-character window of ``longstr`` closest to ``str1``.

    Windows starting at positions 10 through 115 are scored with
    ``Levenshtein.hamming``; identical windows are scored once.

    Returns:
        ``(window, distance)`` for the smallest distance.
    """
    scores = {}
    for start in range(10, 116):
        window = longstr[start:(start + 34)]
        scores[window] = Levenshtein.hamming(str1, window)
    ranked = sorted(scores.items(), key=lambda item: item[1])
    return ranked[0]
def add_items(edges, verts, colours, idprefix, seqs, seq_counts, first_sample_seen,
              cutoff, highlightcolour, cluster):
    """Append the edges and vertices of one cluster to a graph description.

    The cluster is skipped unless its ids mention at least two different
    sample names (first element of each ``colours`` entry). Otherwise an
    edge is added for every pair of equal-length sequences whose Hamming
    distance is at most ``int(len * cutoff)``, and a vertex is added for
    every id. A warning is printed for ids that end up without any edge.

    Args:
        edges, verts: output lists, extended in place.
        colours: sequence of ``(sample_name, colour)`` entries.
        idprefix: ids starting with this prefix are coloured by sample
            (black when the sequence is not in ``first_sample_seen``).
        seqs: mapping id -> sequence.
        seq_counts: mapping sequence -> count, used for the vertex size.
        first_sample_seen: mapping sequence -> index into ``colours``.
        cutoff: fraction of the sequence length allowed as distance.
        highlightcolour: colour for ids that do not start with ``idprefix``.
        cluster: list of ids.

    Returns:
        1 when the cluster was added, 0 when it was skipped.

    The warning now uses the call form of ``print`` (a syntax error on
    Python 3 before), and connectivity is tracked in a set instead of
    re-scanning the new edges for every vertex.
    """
    # Check the cluster spans at least two samples
    samples_hit = set()
    for seq_id in cluster:
        for colour in colours:
            if colour[0] in seq_id:
                samples_hit.add(colour[0])
        if len(samples_hit) > 1:
            break
    if len(samples_hit) < 2:
        return 0

    connected = set()  # ids touched by an edge created in this call
    for i, id1 in enumerate(cluster):
        for id2 in cluster[i + 1:]:
            if len(seqs[id1]) != len(seqs[id2]):
                continue
            limit = int(len(seqs[id1]) * cutoff)
            hd = ld.hamming(seqs[id1], seqs[id2])
            if hd <= limit:
                edges.append({
                    'Source': id1,
                    'Target': id2,
                    'Hamming': hd,
                    'Color': 'black' if hd == 1 else 'white',
                })
                connected.add(id1)
                connected.add(id2)

    for seq_id in cluster:
        if seq_id not in connected:
            print('Warning: vertex %s is not connected.' % seq_id)

        vert = {'Id': seq_id}
        if seq_id[:len(idprefix)] == idprefix:
            vert['color'] = 'black'
            if seqs[seq_id] in first_sample_seen:
                vert['color'] = colours[first_sample_seen[seqs[seq_id]]][1]
        else:
            vert['color'] = highlightcolour
        vert['size'] = 2 + 2 * math.log(seq_counts[seqs[seq_id]])
        verts.append(vert)

    return 1
def check_bc(seq, bc_len):
    """Find the known barcode closest to the first ``bc_len`` bases of ``seq``.

    The prefix is printed together with ``bc_len`` and compared (Hamming
    distance) with every barcode stored under ``bc_lens[bc_len]``; the first
    barcode with the smallest distance wins, provided that distance is
    below 100.

    Returns:
        ``(bc_len, min_dist, min_bc)``; ``(bc_len, 100, None)`` when no
        barcode has a distance below 100.

    NOTE(review): ``bc_lens`` is read from module scope.
    """
    bc = seq[0:bc_len]
    print(bc_len, bc)

    candidates = list(bc_lens[bc_len])
    distances = [lv.hamming(candidate, bc) for candidate in candidates]
    if distances and min(distances) < 100:
        # index() picks the first candidate that reached the minimum.
        winner = distances.index(min(distances))
        return bc_len, distances[winner], candidates[winner]
    return bc_len, 100, None
def smooth_step(seq1, seq2, max_mismatch=5):
    """Find the window of ``seq2`` that best matches ``seq1``.

    Every window of ``len(seq1)`` characters in ``seq2`` is compared with
    ``seq1`` using the Hamming distance; the first window with the smallest
    distance wins.

    Args:
        seq1: the pattern.
        seq2: the sequence searched.
        max_mismatch: largest distance still accepted (default 5, the value
            that used to be hard-coded).

    Returns:
        ``(distance, window)`` for the best window when its distance is at
        most ``max_mismatch``; otherwise ``(100, "NNNNNNNNNN")``. The
        sentinel is also returned when ``seq2`` is shorter than ``seq1``
        (this used to raise ``ValueError`` from ``min()`` of an empty list).
    """
    width = len(seq1)
    best_dist = None
    best_start = 0
    for start in range(len(seq2) - width + 1):
        dist = Levenshtein.hamming(seq1, seq2[start:start + width])
        if best_dist is None or dist < best_dist:
            best_dist = dist
            best_start = start

    if best_dist is not None and best_dist <= max_mismatch:
        return best_dist, seq2[best_start:best_start + width]
    return 100, "NNNNNNNNNN"
def seq_ngrams_query(self, query_seq, max_mismatch=2):
    '''
    Return a list of matches for ``query_seq`` found through the n-gram hash.

    Each n-gram of the query is looked up in ``self.ngram_hash``, whose
    values are iterables of ``(umi_well_seq, offset)`` pairs. Sequences that
    share enough n-grams with the query (at most ``max_mismatch`` missing)
    are aligned at the start implied by ``_longest_consecutive_run`` of the
    matching offsets and scored with the Hamming distance.

    Matches used to be collected in an undefined ``match_fastq_reads`` while
    the final loop read the (always empty) ``match_umi_well_seqs``; both now
    use the same dictionary.

    Returns:
        A list of ``(umi_well_seq, match_start, hamming_distance)`` tuples;
        empty when the query is shorter than the n-gram length (a message is
        written to stderr in that case).
    '''
    ngram_length = FastqReadNgramHash.ngram_length
    if len(query_seq) < ngram_length:
        sys.stderr.write(
            f'Cannot query a FastqReadNgramHash with a query string shorter than the ngram length ({FastqReadNgramHash.ngram_length})\n'
        )
        return []

    # umi_well_seq -> {offset in that sequence: offset in the query}
    match_umi_well_seqs = {}
    num_ngrams = len(query_seq) - ngram_length + 1
    for offset in range(num_ngrams):
        ngram = query_seq[offset:(offset + ngram_length)]
        if ngram not in self.ngram_hash:
            continue
        for (match_umi_well_seq, match_offset) in self.ngram_hash[ngram]:
            match_umi_well_seqs.setdefault(
                match_umi_well_seq, {})[match_offset] = offset

    final_matches = []
    for match_umi_well_seq, offsets in match_umi_well_seqs.items():
        if num_ngrams - len(offsets) > max_mismatch:
            continue
        sorted_match_offsets = sorted(offsets.keys())
        best_run = _longest_consecutive_run(sorted_match_offsets)
        if best_run is None:
            continue
        run_offset = sorted_match_offsets[best_run]
        # Position in the matched sequence where the query would start.
        best_match_start = run_offset - offsets[run_offset]
        if (best_match_start > FastqReadNgramHash.seq_length - len(query_seq)
                or best_match_start < 0):
            continue
        best_match_dist = Levenshtein.hamming(
            query_seq,
            match_umi_well_seq[best_match_start:best_match_start + len(query_seq)])
        final_matches.append(
            (match_umi_well_seq, best_match_start, best_match_dist))
    return final_matches
def add_items(edges, verts, colours, idprefix, seqs, seq_counts, first_sample_seen,
              cutoff, highlightcolour, cluster):
    """Append the edges and vertices of one cluster to a graph description.

    The cluster is skipped unless its ids mention at least two different
    sample names (first element of each ``colours`` entry). Otherwise an
    edge is added for every pair of equal-length sequences whose Hamming
    distance is at most ``int(len * cutoff)``, and a vertex is added for
    every id. A warning is printed for ids that end up without any edge.

    Args:
        edges, verts: output lists, extended in place.
        colours: sequence of ``(sample_name, colour)`` entries.
        idprefix: ids starting with this prefix are coloured by sample
            (black when the sequence is not in ``first_sample_seen``).
        seqs: mapping id -> sequence.
        seq_counts: mapping sequence -> count, used for the vertex size.
        first_sample_seen: mapping sequence -> index into ``colours``.
        cutoff: fraction of the sequence length allowed as distance.
        highlightcolour: colour for ids that do not start with ``idprefix``.
        cluster: list of ids.

    Returns:
        1 when the cluster was added, 0 when it was skipped.

    The warning now uses the call form of ``print`` (a syntax error on
    Python 3 before), and connectivity is tracked in a set instead of
    re-scanning the new edges for every vertex.
    """
    # Check the cluster spans at least two samples
    samples_hit = set()
    for seq_id in cluster:
        for colour in colours:
            if colour[0] in seq_id:
                samples_hit.add(colour[0])
        if len(samples_hit) > 1:
            break
    if len(samples_hit) < 2:
        return 0

    connected = set()  # ids touched by an edge created in this call
    for i, id1 in enumerate(cluster):
        for id2 in cluster[i + 1:]:
            if len(seqs[id1]) != len(seqs[id2]):
                continue
            limit = int(len(seqs[id1]) * cutoff)
            hd = ld.hamming(seqs[id1], seqs[id2])
            if hd <= limit:
                edges.append({
                    'Source': id1,
                    'Target': id2,
                    'Hamming': hd,
                    'Color': 'black' if hd == 1 else 'white',
                })
                connected.add(id1)
                connected.add(id2)

    for seq_id in cluster:
        if seq_id not in connected:
            print('Warning: vertex %s is not connected.' % seq_id)

        vert = {'Id': seq_id}
        if seq_id[:len(idprefix)] == idprefix:
            vert['color'] = 'black'
            if seqs[seq_id] in first_sample_seen:
                vert['color'] = colours[first_sample_seen[seqs[seq_id]]][1]
        else:
            vert['color'] = highlightcolour
        vert['size'] = 2 + 2*math.log(seq_counts[seqs[seq_id]])
        verts.append(vert)

    return 1
def get_exp_mismatch_matrix(words, _lambda):
    """Return the symmetric matrix ``_lambda ** hamming(words[i], words[j])``.

    The diagonal is 1 and the result has shape ``(len(words), len(words))``.
    """
    count = len(words)
    # Start from the identity so the diagonal is already in place.
    matrix = np.eye(count)
    for row in range(count):
        for col in range(row + 1, count):
            weight = _lambda ** Levenshtein.hamming(words[row], words[col])
            matrix[row, col] = weight
            matrix[col, row] = weight
    return matrix
def can_merge(A, B, off):
    """Tell whether ``A`` and ``B`` overlap well enough at offset ``off``.

    ``off`` is the position in ``A`` at which ``B`` starts; a negative
    offset means ``A`` starts inside ``B`` instead, so the roles are
    swapped.  The overlapping stretch is compared with Hamming distance
    and the pair is accepted when the error rate does not exceed 0.2.

    Returns False when the two sequences do not overlap at all (the
    original divided by zero or compared slices of unequal length there).
    """
    if off < 0:
        A, B = B, A
        off = -off
    common = min(len(A) - off, len(B))
    # No overlap -> nothing to compare, so nothing to merge.
    if common <= 0:
        return False
    # A minimum-overlap threshold (common <= 100) was disabled in the original.
    error_rate = leve.hamming(A[off:off + common], B[:common]) / common
    return error_rate <= 0.2  # Threshold
def v_analysis( rc, hold_v, v_seqs, half1_v_seqs, half2_v_seqs, jump_to_end_v, v_regions, half1_v_key, half2_v_key, error0_count, error1_count ):
    """Assign a V segment to the read ``rc``.

    If ``hold_v`` is non-empty its first entry is used: ``hold_v[0][0]`` is
    looked up in ``v_seqs`` and ``hold_v[0][1]`` is used as its position
    (presumably the result of a full-tag keyword search - confirm against
    the caller).  Otherwise both half-tag matchers are run on ``rc`` and a
    candidate is accepted when the full tag is within Hamming distance 1 of
    the corresponding window of the read.

    Reads the module-level name ``v_half_split`` (it is neither a parameter
    nor assigned here).

    Returns ``(v_match, temp_end_v, found_v_match, error0_count,
    error1_count)`` when a V was assigned, otherwise the list
    ``[None, None, None, error0_count, error1_count]``.
    """
    # rc is a string of record.seq as input
    v_match = None
    if hold_v:
        v_match = v_seqs.index(hold_v[0][0]) # Assigns V
        temp_end_v = hold_v[0][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
        if get_v_deletions( rc, v_match, temp_end_v, v_regions ): # If the number of deletions has been found
            # end_v / deletions_v are not used again inside this function.
            [ end_v, deletions_v] = get_v_deletions( rc, v_match, temp_end_v, v_regions )
        # NOTE(review): the original indentation was lost.  These two
        # statements are placed at the `if hold_v` level so that
        # found_v_match is always bound before the return below; they may
        # originally have been inside the deletions check - confirm.
        found_v_match = 1
        error0_count += 1
    else:
        found_v_match = 0
        hold_v1 = half1_v_key.findall(rc)
        hold_v2 = half2_v_key.findall(rc)
        # First-half tag hits: the window starts at the hit position.
        for i in range(len(hold_v1)):
            indices = [y for y, x in enumerate(half1_v_seqs) if x == hold_v1[i][0] ]
            for k in indices:
                # Skip windows truncated by the end of the read.
                if len(v_seqs[k]) == len(rc[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[half1_v_seqs.index(hold_v1[i][0])])]):
                    if lev.hamming( v_seqs[k], str(rc)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                        v_match = k
                        temp_end_v = hold_v1[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                        found_v_match += 1
                        error1_count += 1
        # Second-half tag hits: the window starts v_half_split before the hit.
        for i in range(len(hold_v2)):
            indices = [y for y, x in enumerate(half2_v_seqs) if x == hold_v2[i][0] ]
            for k in indices:
                if len(v_seqs[k]) == len(rc[hold_v2[i][1]-v_half_split:hold_v2[i][1]-v_half_split+len(v_seqs[half2_v_seqs.index(hold_v2[i][0])])]):
                    if lev.hamming( v_seqs[k], rc[hold_v2[i][1]-v_half_split:hold_v2[i][1]+len(v_seqs[k])-v_half_split] ) <= 1:
                        v_match = k
                        temp_end_v = hold_v2[i][1] + jump_to_end_v[v_match] - v_half_split - 1 # Finds where the end of a full V would be
                        found_v_match += 1
                        error1_count += 1
    if v_match is not None:
        return v_match, temp_end_v, found_v_match, error0_count, error1_count
    else:
        return [None, None, None, error0_count, error1_count]
def j_analysis( rc, hold_j, j_seqs, half1_j_seqs, half2_j_seqs, jump_to_start_j, j_regions, half1_j_key, half2_j_key ):
    """Assign a J segment to the read ``rc``.

    If ``hold_j`` is non-empty its first entry is used: ``hold_j[0][0]`` is
    looked up in ``j_seqs`` and ``hold_j[0][1]`` is used as its position
    (presumably the result of a full-tag keyword search - confirm against
    the caller).  Otherwise both half-tag matchers are run on ``rc`` and a
    candidate is accepted when the full tag is within Hamming distance 1 of
    the corresponding window of the read.

    Reads the module-level name ``j_half_split`` (it is neither a parameter
    nor assigned here).

    Returns ``(j_match, temp_start_j, found_j_match)`` when a J was
    assigned, otherwise the list ``[None, None, None]``.
    """
    j_match = None
    if hold_j:
        j_match = j_seqs.index(hold_j[0][0]) # Assigns J
        temp_start_j = hold_j[0][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
        if get_j_deletions( rc, j_match, temp_start_j, j_regions ):
            # start_j / deletions_j are not used again inside this function.
            [ start_j, deletions_j] = get_j_deletions( rc, j_match, temp_start_j, j_regions )
        # NOTE(review): the original indentation was lost.  This statement
        # is placed at the `if hold_j` level so that found_j_match is always
        # bound before the return below; it may originally have been inside
        # the deletions check - confirm.
        found_j_match = 1
    else:
        found_j_match = 0
        hold_j1 = half1_j_key.findall(rc)
        hold_j2 = half2_j_key.findall(rc)
        # First-half tag hits: the window starts at the hit position.
        for i in range(len(hold_j1)):
            indices = [y for y, x in enumerate(half1_j_seqs) if x == hold_j1[i][0] ]
            for k in indices:
                # Skip windows truncated by the end of the read.
                if len(j_seqs[k]) == len(rc[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[half1_j_seqs.index(hold_j1[i][0])])]):
                    if lev.hamming( j_seqs[k], rc[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                        j_match = k
                        temp_start_j = hold_j1[i][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                        found_j_match += 1
        # Second-half tag hits: the window starts j_half_split before the hit.
        for i in range(len(hold_j2)):
            indices = [y for y, x in enumerate(half2_j_seqs) if x == hold_j2[i][0] ]
            for k in indices:
                if len(j_seqs[k]) == len(rc[hold_j2[i][1]-j_half_split:hold_j2[i][1]-j_half_split+len(j_seqs[half2_j_seqs.index(hold_j2[i][0])])]):
                    if lev.hamming( j_seqs[k], rc[hold_j2[i][1]-j_half_split:hold_j2[i][1]+len(j_seqs[k])-j_half_split] ) <= 1:
                        j_match = k
                        temp_start_j = hold_j2[i][1] - jump_to_start_j[j_match] - j_half_split # Finds where the start of a full J would be
                        found_j_match += 1
    if j_match is not None:
        return j_match, temp_start_j, found_j_match
    else:
        return [None, None, None]
def find_merge_point(x):
    """Return the indices of clusters in ``cs1`` that cluster ``c`` can merge with.

    ``x`` is the tuple ``(cs1, c, cutoff, hamming)``:

    * ``cs1`` - sequence of ``(members, max_length, min_length)`` clusters,
      where ``members`` yields ``(sequence, index)`` pairs;
    * ``c`` - a single cluster of the same shape;
    * ``cutoff`` - fraction of the shorter sequence length allowed as distance;
    * ``hamming`` - truthy to compare with Hamming distance (clusters must
      then share one length), falsy to use ``ld.distance``.

    A cluster of ``cs1`` is reported once, as soon as any of its members is
    within the cutoff of any member of ``c``.  Indices are returned in the
    order in which they were found.
    """
    cs1, c, cutoff, hamming = x
    c2, c2_max_length, c2_min_length = c

    merges = []
    merged = set()  # same content as ``merges``, for O(1) membership tests

    for s2, _i2 in c2:
        s2_len = len(s2)
        for i, (members, c1_max_length, c1_min_length) in enumerate(cs1):
            if i in merged:
                continue
            for s1, _i1 in members:
                cut = int(cutoff * min(len(s1), s2_len))
                if hamming:
                    if c1_min_length != c1_max_length or c2_min_length != c2_max_length:
                        # Parenthesised form prints identically on Python 2 and 3.
                        print('Error: variable length clusters found.')
                    matched = (c1_min_length == c2_min_length
                               and ld.hamming(s1, s2) <= cut)
                else:
                    # The length-range tests rule out pairs that cannot be
                    # within ``cut`` edits before the distance is computed.
                    matched = (c1_min_length - c2_max_length <= cut
                               and c2_min_length - c1_max_length <= cut
                               and ld.distance(s1, s2, cut) <= cut)
                if matched:
                    merges.append(i)
                    merged.add(i)
                    break

    return merges
def main(argv):
    """Plot (or export as CSV) the length and minimum-distance distributions of a FASTA file.

    Duplicate sequences are dropped, the remainder optionally sub-sampled
    (``--limit``) and grouped by length.  Two outputs are produced under
    ``outprefix``: the relative frequency of each sequence length, and a
    50-bin histogram of per-sequence minimum Hamming distance divided by
    the sequence length.

    NOTE(review): ``argv`` is never read - ``parse_args()`` is called
    without arguments and therefore uses ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description='Plot the minimum distance between sequences in the file, uusing Hamming distance between sequences of the same length.')
    parser.add_argument('infile', help='input file (FASTA)')
    parser.add_argument('outprefix', help='prefix for output files')
    parser.add_argument('-l', '--limit', help='limit to at most this many sequences, drawn at random without replacement')
    parser.add_argument('-v', '--verbose', help='display progress', action='store_true')
    parser.add_argument('-i', '--interactive', help='display charts interactively', action='store_true')
    parser.add_argument('-g', '--length_lims', help='axis limits for CDR length plots <xmin,xmax,ymin,ymax>')
    # NOTE(review): the help text below repeats the one for --length_lims,
    # although the value is applied to the minimum-distance plot.
    parser.add_argument('-d', '--dist_lims', help='axis limits for CDR length plots <xmin,xmax,ymin,ymax>')
    parser.add_argument('-c', '--csv', help='produce comma separated variable output instead of plots', action='store_true')
    args = parser.parse_args()

    length_lims = None
    dist_lims = None

    def split_lims(lims):
        # Parse "xmin,xmax,ymin,ymax" into a list of four floats; exits on a
        # wrong number of fields.
        ll = lims.split(',')
        if len(ll) != 4:
            print('Error: limit specifier "%s" should contain four numbers separated by commas.' % lims)
            exit()
        else:
            for i in range(len(ll)):
                ll[i] = float(ll[i])
        return ll

    if args.length_lims:
        length_lims = split_lims(args.length_lims)

    if args.dist_lims:
        dist_lims = split_lims(args.dist_lims)

    # move seqs into dict, indexed by length. Eliminate duplicates.

    seq_list = []
    seen_seqs = {}
    for seq_record in SeqIO.parse(args.infile, "fasta"):
        seq = str(seq_record.seq)
        if seq not in seen_seqs:
            seq_list.append(seq)
            seen_seqs[seq] = 1

    print '%d unique sequences.' % len(seen_seqs)

    if args.limit and len(seq_list) > int(args.limit):
        if args.verbose:
            print 'Limiting to a sample of %d' % int(args.limit)
        seq_list = random.sample(seq_list, int(args.limit))

    # length -> list of unique sequences of that length
    seqs = {}
    for seq in seq_list:
        length = len(seq)
        if length in seqs:
            seqs[length].append(seq)
        else:
            seqs[length] = [seq]

    maxval = max(seqs.keys()) + 1
    minval = min(seqs.keys())

    # sizes[k] = fraction of sequences having length k
    sizes = np.zeros(maxval)
    for k, v in seqs.items():
        sizes[k] = len(v)

    sizes = sizes / sizes.sum()

    # plot length distribution

    if not args.csv:
        plt.figure()
        plt.bar(range(maxval), sizes, 1/1.5, color='blue')
        plt.xlabel('CDR3 length')
        plt.ylabel('Frequency')
        if length_lims:
            plt.axis(length_lims)
        plt.savefig(args.outprefix + '_length_distribution.pdf')
        if args.interactive:
            plt.show()
    else:
        # NOTE(review): 'wb' with csv.writer and list + range() look like
        # Python 2 idioms (as do the print statements) - confirm the target
        # interpreter before porting.
        with open(args.outprefix + '_length_distribution.csv', 'wb') as fo:
            writer = csv.writer(fo)
            writer.writerow(['Length'] + range(minval, maxval))
            writer.writerow(['Frequency'] + list(sizes[minval:maxval]))

    # Calculate min distances

    mindists = []

    for seq_length, seq_list in seqs.items():
        if args.verbose:
            print 'Processing sequences of length %d' % seq_length
        # Each sequence is compared only with the sequences that follow it
        # in its length group; the last one in a group adds no entry.
        for i in range(len(seq_list)-1):
            mindist = 9999
            for j in range(i+1, len(seq_list)):
                h = Levenshtein.hamming(seq_list[i], seq_list[j])
                mindist = min(mindist, h)
            # Normalise by the sequence length.
            mindists.append(float(mindist)/seq_length)

    if not args.csv:
        plt.figure()
        plt.hist(mindists, bins=50)
        plt.xlabel('Minimum distance')
        plt.ylabel('Occurrences')
        if dist_lims:
            plt.axis(dist_lims)
        plt.savefig(args.outprefix + '_min_dist.pdf')
        if args.interactive:
            plt.show()
    else:
        freq, bins = np.histogram(mindists, 50)
        with open(args.outprefix + '_min_dist.csv', 'wb') as fo:
            writer = csv.writer(fo)
            writer.writerow(['Distance'] + list(bins))
            writer.writerow(['Occurrences'] + list(freq))
def analysis( Sequence_Reads, with_statistics=True, with_reverse_complement_search=True):
    """Assign V and J segments to every read of the given FASTQ files.

    For each record the full V and J tags are searched first; when a full
    tag is missing, the two half-tag matchers are tried and a candidate is
    accepted if the full tag is within Hamming distance 1 of the read
    window.  When both deletions lookups succeed a line
    ``v j del_v del_j nt_insert`` is appended to
    ``DecombinatorResults.txt``.  If nothing was written for a record and
    ``with_reverse_complement_search`` is true, the same procedure is
    repeated on the reverse complement.

    ``Sequence_Reads`` is a list of FASTQ file names.  The reference FASTA
    and tag files are opened by fixed names from the working directory.
    When ``with_statistics`` is true a summary is printed at the end.

    NOTE(review): found_v_match / found_j_match are only assigned in the
    ``else`` branches (no full tag), and end_v / deletions_v / start_j /
    deletions_j only after a successful full-tag deletions lookup, yet they
    are read in the output branches.  They may therefore carry values from
    an earlier record or be unbound - confirm intended behaviour.
    """
    import numpy as np
    import decimal as dec
    import string
    import operator as op
    import collections as coll
    from Bio import SeqIO
    from acora import AcoraBuilder
    from time import time, clock
    from string import Template
    from operator import itemgetter, attrgetter
    import Levenshtein as lev

    v_half_split, j_half_split = [10,6] # Do not change - V tags are split at position 10, J at position 6, to look for half tags if no full tag is found.

    ################
    print 'Commencing analysis on a total of', len(Sequence_Reads), 'file(s)'

    ## Create .txt file to store f=(v_index,j_index,v_deletions,j_deletions,nt_insert)
    analysis_file = open("DecombinatorResults.txt", "w")
    analysis_file.close()
    results = "DecombinatorResults.txt" # Name the .txt file to write to

    ################
    print ('Importing known V, D and J gene segments and tags...')

    handle = open("human_TRBV_region.fasta", "rU")
    v_genes = list(SeqIO.parse(handle, "fasta"))
    handle.close()

    handle = open("human_TRBJ_region.fasta", "rU")
    j_genes = list(SeqIO.parse(handle, "fasta"))
    handle.close()

    v_regions = []
    for j in range(0, len(v_genes)):
        v_regions.append(string.upper(v_genes[j].seq))

    j_regions = []
    for j in range(0, len(j_genes)):
        j_regions.append(string.upper(j_genes[j].seq))

    ##############
    ## Build keyword tries of V and J tags for fast assignment
    v_seqs, half1_v_seqs, half2_v_seqs, jump_to_end_v = get_v_tags(open("tags_trbv.txt", "rU"), v_half_split)
    j_seqs, half1_j_seqs, half2_j_seqs, jump_to_start_j = get_j_tags(open("tags_trbj.txt", "rU"), j_half_split)

    v_builder = AcoraBuilder()
    for i in range(0,len(v_seqs)):
        v_builder.add(str(v_seqs[i])) # Add all V tags to keyword trie

    v_key = v_builder.build()

    j_builder = AcoraBuilder()
    for i in range(0,len(j_seqs)):
        j_builder.add(str(j_seqs[i])) # Add all J tags to keyword trie

    j_key = j_builder.build()

    ##############
    ## Build keyword tries for first and second halves of both V and J tags
    v_half1_builder = AcoraBuilder()
    for i in range(0,len(half1_v_seqs)):
        v_half1_builder.add(str(half1_v_seqs[i]))
    half1_v_key = v_half1_builder.build()

    v_half2_builder = AcoraBuilder()
    for i in range(0,len(half2_v_seqs)):
        v_half2_builder.add(str(half2_v_seqs[i]))
    half2_v_key = v_half2_builder.build()

    j_half1_builder = AcoraBuilder()
    for i in range(0,len(half1_j_seqs)):
        j_half1_builder.add(str(half1_j_seqs[i]))
    half1_j_key = j_half1_builder.build()

    j_half2_builder = AcoraBuilder()
    for i in range(0,len(half2_j_seqs)):
        j_half2_builder.add(str(half2_j_seqs[i]))
    half2_j_key = j_half2_builder.build()

    ###############
    ## Initialise variables
    assigned_count = 0 # this will just increase by one every time we correctly assign a seq read with all desired variables
    seq_count = 0 # this will simply track the number of sequences analysed in file
    t0 = time() # Begin timer

    ###############
    ## Open .txt file created at the start of analysis
    analysis_file = open(results, "a")
    stemplate = Template('$v $j $del_v $del_j $nt_insert') # Creates stemplate, a holder, for f. Each line will have the 5 variables separated by a space

    ###############
    ## Begin analysing sequences
    for i in range(len(Sequence_Reads)):

        print 'Importing sequences from', Sequence_Reads[i],' and assigning V and J regions...'
        handle = open(Sequence_Reads[i], "rU")

        for record in SeqIO.parse(handle, "fastq"):

            found_seq_match = 0
            seq_count += 1

            hold_v = v_key.findall(str(record.seq))
            hold_j = j_key.findall(str(record.seq))

            if hold_v:
                v_match = v_seqs.index(hold_v[0][0]) # Assigns V
                temp_end_v = hold_v[0][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                if get_v_deletions( record.seq, v_match, temp_end_v, v_regions ): # If the number of deletions has been found
                    [ end_v, deletions_v] = get_v_deletions( record.seq, v_match, temp_end_v, v_regions )
            else:
                # No full V tag: fall back to the half tags.
                # NOTE(review): the inner loops reuse the name `i` of the
                # enclosing file loop.
                found_v_match = 0
                hold_v1 = half1_v_key.findall(str(record.seq))
                hold_v2 = half2_v_key.findall(str(record.seq))
                for i in range(len(hold_v1)):
                    indices = [y for y, x in enumerate(half1_v_seqs) if x == hold_v1[i][0] ]
                    for k in indices:
                        if len(v_seqs[k]) == len(str(record.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[half1_v_seqs.index(hold_v1[i][0])])]):
                            if lev.hamming( v_seqs[k], str(record.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                                v_match = k
                                temp_end_v = hold_v1[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                found_v_match += 1
                # NOTE(review): the second-half window starts at the hit
                # position itself; v_half_split is not subtracted - confirm.
                for i in range(len(hold_v2)):
                    indices = [y for y, x in enumerate(half2_v_seqs) if x == hold_v2[i][0] ]
                    for k in indices:
                        if len(v_seqs[k]) == len(str(record.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[half2_v_seqs.index(hold_v2[i][0])])]):
                            if lev.hamming( v_seqs[k], str(record.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[k])] ) <= 1:
                                v_match = k
                                temp_end_v = hold_v2[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                found_v_match += 1

            if hold_j:
                j_match = j_seqs.index(hold_j[0][0]) # Assigns J
                temp_start_j = hold_j[0][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                if get_j_deletions( record.seq, j_match, temp_start_j, j_regions ):
                    [ start_j, deletions_j] = get_j_deletions( record.seq, j_match, temp_start_j, j_regions )
            else:
                # No full J tag: fall back to the half tags.
                found_j_match = 0
                hold_j1 = half1_j_key.findall(str(record.seq))
                hold_j2 = half2_j_key.findall(str(record.seq))
                for i in range(len(hold_j1)):
                    indices = [y for y, x in enumerate(half1_j_seqs) if x == hold_j1[i][0] ]
                    for k in indices:
                        if len(j_seqs[k]) == len(str(record.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[half1_j_seqs.index(hold_j1[i][0])])]):
                            if lev.hamming( j_seqs[k], str(record.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                                # NOTE(review): takes the first index of the
                                # half tag rather than the verified k.
                                j_match = half1_j_seqs.index(hold_j1[i][0])
                                temp_start_j = hold_j1[i][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                                found_j_match += 1
                for i in range(len(hold_j2)):
                    indices = [y for y, x in enumerate(half2_j_seqs) if x == hold_j2[i][0] ]
                    for k in indices:
                        if len(j_seqs[k]) == len(str(record.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[half2_j_seqs.index(hold_j2[i][0])])]):
                            if lev.hamming( j_seqs[k], str(record.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[k])] ) <= 1:
                                j_match = half2_j_seqs.index(hold_j2[i][0])
                                # The literal 6 equals j_half_split as set above.
                                temp_start_j = hold_j2[i][1] - jump_to_start_j[j_match] - 6 # Finds where the start of a full J would be
                                found_j_match += 1

            # Four combinations of full-tag / half-tag assignment; each
            # writes the same record when both deletions lookups succeed.
            if hold_v and hold_j:
                if get_v_deletions( record.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record.seq, j_match, temp_start_j, j_regions ):
                    f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                    print >> analysis_file, f_seq # Write to analysis_file (text file) the classification of the sequence
                    assigned_count += 1
                    found_seq_match = 1
            elif hold_v and found_j_match == 1:
                if get_v_deletions( record.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record.seq, j_match, temp_start_j, j_regions ):
                    f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                    print >> analysis_file, f_seq
                    assigned_count += 1
                    found_seq_match = 1
            elif found_v_match == 1 and hold_j:
                if get_v_deletions( record.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record.seq, j_match, temp_start_j, j_regions ):
                    f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                    print >> analysis_file, f_seq
                    assigned_count += 1
                    found_seq_match = 1
            elif found_v_match == 1 and found_j_match == 1:
                if get_v_deletions( record.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record.seq, j_match, temp_start_j, j_regions ):
                    f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                    print >> analysis_file, f_seq
                    assigned_count += 1
                    found_seq_match = 1

            if found_seq_match == 0 and with_reverse_complement_search == True:

                #####################
                # REVERSE COMPLEMENT
                #####################
                # Same procedure as above, applied to the reverse complement.

                record_reverse = record.reverse_complement()
                hold_v = v_key.findall(str(record_reverse.seq))
                hold_j = j_key.findall(str(record_reverse.seq))

                if hold_v:
                    v_match = v_seqs.index(hold_v[0][0]) # Assigns V
                    temp_end_v = hold_v[0][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                    if get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions ): # If the number of deletions has been found
                        [ end_v, deletions_v] = get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions )
                else:
                    found_v_match = 0
                    hold_v1 = half1_v_key.findall(str(record_reverse.seq))
                    hold_v2 = half2_v_key.findall(str(record_reverse.seq))
                    for i in range(len(hold_v1)):
                        indices = [y for y, x in enumerate(half1_v_seqs) if x == hold_v1[i][0] ]
                        for k in indices:
                            if len(v_seqs[k]) == len(str(record_reverse.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[half1_v_seqs.index(hold_v1[i][0])])]):
                                if lev.hamming( v_seqs[k], str(record_reverse.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                                    v_match = k
                                    temp_end_v = hold_v1[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                    found_v_match += 1
                    for i in range(len(hold_v2)):
                        indices = [y for y, x in enumerate(half2_v_seqs) if x == hold_v2[i][0] ]
                        for k in indices:
                            if len(v_seqs[k]) == len(str(record_reverse.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[half2_v_seqs.index(hold_v2[i][0])])]):
                                if lev.hamming( v_seqs[k], str(record_reverse.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[k])] ) <= 1:
                                    v_match = k
                                    temp_end_v = hold_v2[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                    found_v_match += 1

                if hold_j:
                    j_match = j_seqs.index(hold_j[0][0]) # Assigns J
                    temp_start_j = hold_j[0][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                    if get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions ):
                        [ start_j, deletions_j] = get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions )
                else:
                    found_j_match = 0
                    hold_j1 = half1_j_key.findall(str(record_reverse.seq))
                    hold_j2 = half2_j_key.findall(str(record_reverse.seq))
                    for i in range(len(hold_j1)):
                        indices = [y for y, x in enumerate(half1_j_seqs) if x == hold_j1[i][0] ]
                        for k in indices:
                            if len(j_seqs[k]) == len(str(record_reverse.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[half1_j_seqs.index(hold_j1[i][0])])]):
                                if lev.hamming( j_seqs[k], str(record_reverse.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                                    j_match = half1_j_seqs.index(hold_j1[i][0])
                                    temp_start_j = hold_j1[i][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                                    found_j_match += 1
                    for i in range(len(hold_j2)):
                        indices = [y for y, x in enumerate(half2_j_seqs) if x == hold_j2[i][0] ]
                        for k in indices:
                            if len(j_seqs[k]) == len(str(record_reverse.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[half2_j_seqs.index(hold_j2[i][0])])]):
                                if lev.hamming( j_seqs[k], str(record_reverse.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[k])] ) <= 1:
                                    j_match = half2_j_seqs.index(hold_j2[i][0])
                                    temp_start_j = hold_j2[i][1] - jump_to_start_j[j_match] - 6 # Finds where the start of a full J would be
                                    found_j_match += 1

                if hold_v and hold_j:
                    if get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions ):
                        f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record_reverse.seq[end_v+1:start_j]))
                        print >> analysis_file, f_seq # Write to analysis_file (text file) the classification of the sequence
                        assigned_count += 1
                        found_seq_match = 1
                elif hold_v and found_j_match == 1:
                    if get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions ):
                        f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record_reverse.seq[end_v+1:start_j]))
                        print >> analysis_file, f_seq
                        assigned_count += 1
                        found_seq_match = 1
                elif found_v_match == 1 and hold_j:
                    if get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions ):
                        f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record_reverse.seq[end_v+1:start_j]))
                        print >> analysis_file, f_seq
                        assigned_count += 1
                        found_seq_match = 1
                elif found_v_match == 1 and found_j_match == 1:
                    if get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions ):
                        f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record_reverse.seq[end_v+1:start_j]))
                        print >> analysis_file, f_seq
                        assigned_count += 1
                        found_seq_match = 1

        handle.close()

    analysis_file.close()

    if with_statistics == True:
        timed = time() - t0
        print seq_count, 'sequences were analysed'
        print assigned_count, ' sequences were successfully assigned'
        print 'Time taken =', timed, 'seconds'
# Exploratory fragment: count repeated lines per class and (optionally) plot
# a per-class histogram.  Relies on in_class_lines, out_class_lines, args,
# Counter, plt and Levenshtein being defined by the surrounding code.
if True:
    # investigate most frequently repeated tweets in each class
    c_in = Counter(in_class_lines)
    c_out = Counter(out_class_lines)

# some hard-coded display routines for playing with the data...
# NOTE(review): the original indentation was lost; everything down to
# plt.savefig is assumed to sit under this disabled `if False:` - confirm.
if False:
    plt.figure()
    plt.ion()
    if False:
        # histogram of tweet lengths
        lengths_in_class = [len(s) for s in in_class_lines]
        lengths_out_class = [len(s) for s in out_class_lines]
        plt.title("Histogram of tweet lengths for classes in " + args.table)
        plt.xlabel("Bins of tweet lengths")
        plt.ylabel("Counts")
        tweet_lengths = (0, 140)
        filename_pattern = "histogram_tweet_lengths_{}.png"
    # note - tried counting spaces with s.count(" ") but this seems to mirror
    # tweet-length
    if True:
        # counting number of capital letters
        # (Hamming distance between a line and its lower-cased form, i.e.
        # the number of positions changed by lower-casing)
        lengths_in_class = [Levenshtein.hamming(s, s.lower()) for s in in_class_lines]
        lengths_out_class = [Levenshtein.hamming(s, s.lower()) for s in out_class_lines]
        plt.title("Histogram of number of capitals for classes in " + args.table)
        tweet_lengths = (0, 40)
        filename_pattern = "nbr_capitals_{}.png"
    # Both classes are drawn as outlines on the same axes.
    plt.hist(lengths_in_class, range=tweet_lengths, color="blue", label="in-class", histtype="step")
    plt.hist(lengths_out_class, range=tweet_lengths, color="green", label="out-class", histtype="step")
    UPPER_LEFT = 2
    plt.legend(loc=UPPER_LEFT)
    plt.savefig(filename_pattern.format(args.table))
# Jaro Distance # Jaro-Winkler Distance # Match Rating Approach Comparison # Hamming Distance # Phonetic encoding: # American Soundex # Metaphone # NYSIIS (New York State Identification and Intelligence System) # Match Rating Codex import jellyfish print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish')) # 2; 编辑距离 print(jellyfish.jaro_distance('jellyfish', 'smellyfish')) # 0.89629629629629637 print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')) # 1; 编辑距离, 带翻转的 print(jellyfish.metaphone('Jellyfish')) # 'JLFX' print(jellyfish.soundex('Jellyfish')) # 'J412' print(jellyfish.nysiis('Jellyfish')) # 'JALYF' print(jellyfish.match_rating_codex('Jellyfish')) # 'JLLFSH' ################################################################## ## Lenvenshtein import Levenshtein print(Levenshtein.hamming('hello', 'helol')) # 2; 计算汉明距离; 要求 str1 和 str2 必须长度一致; 是描述两个等长字串之间对应位置上不同字符的个数 print(Levenshtein.distance('hello', 'helol')) # 2; 计算编辑距离(也成 Levenshtein 距离); 是描述由一个字串转化成另一个字串最少的操作次数, 在其中的操作包括插入 & 删除 & 替换 print(Levenshtein.distance('hello world asdf', 'helolaaaa world asdf')) # 5 print(Levenshtein.ratio('hello', 'helol')) # 0.8; 计算莱文斯坦比; 计算公式 r = (sum - ldist) / sum, 其中 sum 是指 str1 和 str2 字串的长度总和, ldist 是类编辑距离 # 注意: 这里的类编辑距离不是 2 中所说的编辑距离, 2 中三种操作中每个操作+1, 而在此处, 删除、插入依然+1, 但是替换+2 # 这样设计的目的: ratio('a', 'c'), sum=2, 按 2 中计算为(2-1)/2 = 0.5,' a','c'没有重合, 显然不合算, 但是替换操作+2, 就可以解决这个问题 print(Levenshtein.jaro('hello', 'helol')) # 0.9333333333333332; 计算 jaro 距离; 用于健康普查 print(Levenshtein.jaro_winkler('hello', 'helol')) # 0.9533333333333333; 计算 Jaro – Winkler 距离
elif pc_dist > 20: # need to check if the distance is being artificially inflated by DCR finding a tag up/down-stream of the genuine ones # can try trimming either end of the longer sequence to see if the distance then drops low enough dist1 = 0 dist2 = 0 dist3 = 0 dist4 = 0 if len(seq) > len(proto_seq): diff = len(seq) - len(proto_seq) dist1 = lev.hamming(seq[:len(proto_seq)], proto_seq) # trim 3' dist2 = lev.hamming(seq[diff:], proto_seq) # trim 5' if dist1/len(proto_seq) * 100 <= 20 or dist2/len(proto_seq) * 100 <= 20: count_match += 1 #### OUTPUT THIS RECORD! ###### dcr_collapsed[proto_dcr][d] += 1 else: count_diff_len_discarded += 1
def DupRemover(inputbam, output, bedout, chunk, quality, multi):
    """Write a copy of ``inputbam`` without duplicate or low-quality reads.

    The BAM file is converted to a sorted BED file and a SAM file (via
    ``bedtools`` / ``samtools`` shell pipelines).  Consecutive BED lines are
    compared pairwise: a line is kept when it differs from the next one in
    strand or start/end coordinate, or when the Hamming distance between
    their barcodes is greater than 1, and only if its quality field is at
    least ``quality``.  The names of kept reads select the SAM records
    written to the output, which is finally converted to a sorted, indexed
    BAM.

    Parameters
    ----------
    inputbam : path of the BAM file to process.
    output   : output path; the text before ".bam" is reused for the log,
               the intermediate ".sam" and the final BAM.
    bedout   : optional path; when truthy, kept BED lines are appended to it.
    chunk    : number of kept reads after which buffered lines are flushed.
    quality  : minimum quality for a read to be kept.
    multi    : when truthy, progress goes to the log file instead of
               ``printStatus``.

    NOTE(review): print statements and ``.next()`` indicate Python 2; under
    Python 2 the ``/`` in the percentage messages is integer division -
    confirm the reported percentages are as intended.
    """
    start = time.time()
    #pre-processing. Generating unique names for temporary files, in case multiple instances of the script is run at the same time.
    mm = hashlib.sha1()
    mm.update(inputbam.split("/")[-1]+str(random.random()))
    tempbed = ".temp_bed-"+mm.hexdigest()
    tempbed0 = ".0temp_bed-"+mm.hexdigest()
    tempsam = ".temp_sam-"+mm.hexdigest()
    ticker = False
    ticker2 = True
    # logfile = "cleaned/"+inputbam.split(".bam")[0]+"_cleaned.log"
    logfile = output.split(".bam")[0]+".log"
    if multi:
        with open(logfile, 'a') as ff:
            msg = "Working on file %s\n" %inputbam
            ff.write("%s: %s" % (time.strftime("%X %x"), msg))
    else:
        printStatus("Working on the file: %s" %inputbam)
    output = output.split(".bam")[0]+".sam"
    # check if the input file is from bbmap or bowtie
    # NOTE(review): file names are interpolated into shell command strings
    # run with shell=True; names containing shell metacharacters are unsafe.
    cmd0 = "bedtools bamtobed -i %s | head > %s" %(inputbam, tempbed0)
    subprocess.check_call(cmd0, shell=True)
    xx = open(tempbed0, 'r')
    # This routine tries to understand if the input file came from bowtie2 or bbmap (because bbmap generate some funny .sam files).
    # It may not work for other aligners. Although I guess it might also work.
    # NOTE(review): every branch calls xx.next() again, so each test looks
    # at a different line of the sample file - confirm.
    if len(xx.next().strip().split(" ")) == 1:
        cmd = "bedtools bamtobed -i %s | sed 's/:/ /g' | sort -k1,1 -k2,2n -k3,3n -k11,11 -k12,12n -k13,13 > %s" %(inputbam, tempbed)
    elif len(xx.next().strip().split(" ")) == 2:
        ticker2 = False # marks as "bbmap" output
        cmd = "bedtools bamtobed -i %s | awk '{print $1,$2,$3,$4,$6,$7}' | tr [:blank:] \\\\t | sed 's/:/ /g' | sort -k1,1 -k2,2n -k3,3n -k11,11 -k12,12n -k13,13 > %s" %(inputbam, tempbed)
    elif len(xx.next().strip().split(" ")) == 3:
        ticker2 = False# marks as "bbmap" output
        cmd = "bedtools bamtobed -i %s | awk '{print $1,$3,$4,$5,$7,$8}' | tr [:blank:] \\\\t | sed 's/:/ /g' | sort -k1,1 -k2,2n -k3,3n -k11,11 -k12,12n -k13,13 > %s" %(inputbam, tempbed)
    else:
        print "Unrecognized .bam formatting."
        sys.exit(1)
    xx.close()
    os.remove(tempbed0)
    if multi:
        with open(logfile, 'a') as ff:
            msg = "Generating some temp files.\n"
            ff.write("%s: %s" % (time.strftime("%X %x"), msg))
    else:
        printStatus("Generating some temp files.")
    cmd2 = "samtools view -h %s > %s" %(inputbam, tempsam)
    if multi:
        with open(logfile, 'a') as ff:
            msg = "Running command: \n\t%s\n"%cmd
            ff.write("%s: %s" % (time.strftime("%X %x"), msg))
    else:
        printStatus("Running command: \n\t%s"%cmd)
    subprocess.check_call(cmd, shell=True)
    if multi:
        with open(logfile, 'a') as ff:
            msg = "Running command: \n\t%s"%cmd2
            ff.write("%s: %s\n" % (time.strftime("%X %x"), msg))
    else:
        printStatus("Running command: \n\t%s"%cmd2)
    subprocess.check_call(cmd2, shell=True)
    if multi:
        with open(logfile, 'a') as ff:
            msg = "Done. Starting to work on the temp files.\n"
            ff.write("%s: %s" % (time.strftime("%X %x"), msg))
    else:
        printStatus("Done. Starting to work on the temp files.")
    aa = open(tempbed, 'r')
    seq_set = set()   # kept BED lines waiting to be written to bedout
    name_set = set()  # names of kept reads, used to filter the SAM file
    sam_set = set()   # kept SAM lines waiting to be written to output
    counter = 0 # reads that are kept
    counter2 = 2 # total number of reads
    counter3 = 0 # reads that are below the quality threshold.
    multiplier = 1
    # Initiate the whole thing with the first two lines.
    line1 = aa.next()
    line2 = aa.next()
    line1_bcode = line1.split(" ")[-1].split("\t")[0]
    line2_bcode = line2.split(" ")[-1].split("\t")[0]
    line1_qual = int(line1.split("\t")[-2])
    line1_strand = line1.strip().split("\t")[-1]
    line2_strand = line2.strip().split("\t")[-1]
    line1_begin = line1.split("\t")[0] + "," + line1.split("\t")[1]
    line1_end = line1.split("\t")[0] + "," + line1.split("\t")[2]
    line2_begin = line2.split("\t")[0] + "," + line2.split("\t")[1]
    line2_end = line2.split("\t")[0] + "," + line2.split("\t")[2]
    levent = Levenshtein.hamming(line1_bcode, line2_bcode)
    #comparing consecutive lines with each other.
    # "+" reads are compared by start coordinate, "-" reads by end coordinate.
    # ":".join(line.split(" ")) undoes the sed 's/:/ /g' applied above.
    if line1_strand == line2_strand:
        if line1_strand == "+":
            if line1_begin != line2_begin:
                if bedout:
                    if line1_qual >= quality:
                        seq_set.add(":".join(line1.split(" ")))
                if line1_qual >= quality:
                    name_set.add(":".join(line1.split(" ")).split("\t")[3])
                    counter += 1
                else:
                    counter3 +=1
            else:
                if levent > 1:
                    if bedout:
                        if line1_qual >= quality:
                            seq_set.add(":".join(line1.split(" ")))
                    if line1_qual >= quality:
                        name_set.add(":".join(line1.split(" ")).split("\t")[3])
                        counter += 1
                    else:
                        counter3 += 1
        elif line1_strand =="-":
            if line1_end != line2_end:
                if line1_qual >= quality:
                    if bedout:
                        seq_set.add(":".join(line1.split(" ")))
                if line1_qual >= quality:
                    name_set.add(":".join(line1.split(" ")).split("\t")[3])
                    counter += 1
                else:
                    counter3 += 1
            else:
                if levent > 1:
                    if line1_qual >= quality:
                        if bedout:
                            seq_set.add(":".join(line1.split(" ")))
                    if line1_qual >= quality:
                        name_set.add(":".join(line1.split(" ")).split("\t")[3])
                        counter += 1
                    else:
                        counter3 += 1
        else:
            printStatus("The strand is neither + or - Something must be wrong, exiting")
            aa.close()
            os.remove(tempbed)
            os.remove(tempsam)
            sys.exit(1)
    else:
        if line1_qual >= quality:
            if bedout:
                seq_set.add(":".join(line1.split(" ")))
        if line1_qual >= quality:
            name_set.add(":".join(line1.split(" ")).split("\t")[3])
            counter += 1
        else:
            counter3 += 1
    #switching lines, so we can iteratively work with consecutive lines
    line1 = line2
    # NOTE(review): when aa.next() raises StopIteration the line currently
    # held in line1 (the last line of the file) has not been evaluated and
    # is not added by the handler - confirm this is intended.
    while True:
        try:
            line1_bcode = line1.split(" ")[-1].split("\t")[0]
            line1_qual = int(line1.split("\t")[-2])
            line2 = aa.next()
            counter2 += 1 #this counter holds the total number of lines in the file.
            line2_bcode = line2.split(" ")[-1].split("\t")[0]
            line1_strand = line1.strip().split("\t")[-1]
            line2_strand = line2.strip().split("\t")[-1]
            line1_begin = line1.split("\t")[0] + "," + line1.split("\t")[1]
            line1_end = line1.split("\t")[0] + "," + line1.split("\t")[2]
            line2_begin = line2.split("\t")[0] + "," + line2.split("\t")[1]
            line2_end = line2.split("\t")[0] + "," + line2.split("\t")[2]
            levent = Levenshtein.hamming(line1_bcode, line2_bcode)
            if line1_strand == line2_strand:
                if line1_strand == "+":
                    if line1_begin != line2_begin:
                        if line1_qual >= quality:
                            if bedout:
                                seq_set.add(":".join(line1.split(" ")))
                            name_set.add(":".join(line1.split(" ")).split("\t")[3])
                            counter += 1
                        else:
                            counter3 += 1
                    else:
                        if levent > 1:
                            if line1_qual >= quality:
                                if bedout:
                                    seq_set.add(":".join(line1.split(" ")))
                                name_set.add(":".join(line1.split(" ")).split("\t")[3])
                                counter += 1
                            else:
                                counter3 += 1
                elif line1_strand =="-":
                    if line1_end != line2_end:
                        if line1_qual >= quality:
                            if bedout:
                                seq_set.add(":".join(line1.split(" ")))
                            name_set.add(":".join(line1.split(" ")).split("\t")[3])
                            counter += 1
                        else:
                            counter3 += 1
                    else:
                        if levent > 1:
                            if line1_qual >= quality:
                                if bedout:
                                    seq_set.add(":".join(line1.split(" ")))
                                name_set.add(":".join(line1.split(" ")).split("\t")[3])
                                counter += 1
                            else:
                                counter3 += 1
                else:
                    printStatus("The strand is neither + or - Something must be wrong, exiting.")
                    aa.close()
                    os.remove(tempbed)
                    os.remove(tempsam)
                    sys.exit(1)
            else:
                if line1_qual >= quality:
                    if bedout:
                        seq_set.add(":".join(line1.split(" ")))
                if line1_qual >= quality:
                    name_set.add(":".join(line1.split(" ")).split("\t")[3])
                    counter += 1
                else:
                    counter3 += 1
            line1 = line2
            # Every 100000 unique reads (or multiples of 100000 as determined by the chunk variable), the set that holds unique reads are flushed.
            if bedout:
                if counter == multiplier * chunk:
                    for i in seq_set:
                        with open(bedout, 'a') as ff:
                            ff.write(i)
                    multiplier += 1
                    seq_set = set()
        except StopIteration:
            # End of the BED file: flush what is left and report the totals.
            if bedout:
                for i in seq_set:
                    with open(bedout, 'a') as ff:
                        ff.write(i)
            if multi:
                with open(logfile, 'a') as ff:
                    msg ="%s of %s reads reads were removed. %s reads were removed because they were below the set quality threshold: Q%s.\n" %(counter2-counter, counter2, counter3, quality)
                    msg2 = "%%%s of the reads reads were removed. Of the reads thet were removed %%%s were below the set quality threshold\n" %(round(((counter2-counter)/counter2)*100), round((counter3/(counter2-counter))*100))
                    ff.write("%s: %s" % (time.strftime("%X %x"), msg))
                    ff.write("%s: %s" % (time.strftime("%X %x"), msg2))
            else:
                printStatus(" %s%% of the reads were removed, they were either PCR-duplicates or were below the quality threshold." %(round((1-(counter/(counter+counter2)))*100)))
                printStatus("%s of %s reads reads were removed. %s of the reads were removed because they were below the set quality threshold: Q%s.\n" %(counter2-counter, counter2, counter3, quality))
                printStatus("%%%s of the reads reads were removed. Of the reads thet were removed %%%s were below the set quality threshold" %(round(((counter2-counter)/counter2)*100), round((counter3/(counter2-counter))*100)))
            break
    aa.close()
    bb = open(tempsam, 'r')
    list_of_stuff = ['@SQ', '@PG', '@HD']
    counter3 = 0
    multiplier2 = 1
    if not multi:
        printStatus("Generating the .sam file now.")
    # Capturing the header from the .bam file.
    # The loop stops at the first line whose first field is not one of the
    # header tags above; that line is treated as the first read.
    while True:
        line = bb.next()
        if line.split("\t")[0] in list_of_stuff:
            with open(output, 'a') as ff:
                ff.write(line)
        else:
            if line.split("\t")[0] in name_set:
                sam_set.add(line)
                counter3 += 1
            break
    while True:
        try:
            sam_line = bb.next()
            if ticker2: # If input was from bowtie
                if sam_line.split("\t")[0] in name_set:
                    sam_set.add(sam_line)
                    counter3 += 1 #this is a useless counter. Should be the same as counter
            else: #if input was from BBMap
                if sam_line.split(" ")[0] in name_set:
                    # Keep only the part of the read name before the first space.
                    sam_set.add(sam_line.split(" ")[0]+"\t"+"\t".join(sam_line.split("\t")[1:]))
                    counter3 += 1 #this is a useless counter. Should be the same as counter
            if counter3 == multiplier2*chunk:
                for i in sam_set:
                    with open(output, 'a') as ff:
                        ff.write(i)
                multiplier2 += 1
                sam_set = set()
        except StopIteration:
            for i in sam_set:
                with open(output, 'a') as ff:
                    ff.write(i)
            sam_set = set()
            break
    bb.close()
    #cleaning up
    os.remove(tempbed)
    os.remove(tempsam)
    if not multi:
        printStatus("Generating a bam file now.")
    cmd_bam = "samtools view -Sb %s | samtools sort - %s" %(output, output.split(".sam")[0])
    subprocess.check_call(cmd_bam, shell=True)
    if not multi:
        printStatus("Generating the index for that bam.")
    cmd_sam = "samtools index %s" %output.split(".sam")[0]+".bam"
    subprocess.check_call(cmd_sam, shell=True)
    os.remove(output)
    if multi:
        end = time.time()
        elapsed = round(end-start)
        with open(logfile, 'a') as ff:
            msg ="Done! Elapsed time: %s seconds.\n" %elapsed
            ff.write("%s: %s" % (time.strftime("%X %x"), msg))
def _half_tag_hits(text, half_key, half_seqs, full_seqs):
    """Return ``(tag_index, position)`` pairs for usable half-tag hits.

    ``half_key.findall(text)`` yields ``(half_tag, position)`` hits.  Each hit
    is expanded to every full tag that shares this half, and a pair is kept
    when the full tag lies within one mismatch (Hamming distance) of the read
    window starting at ``position``.
    """
    hits = []
    for hit in half_key.findall(text):
        half, pos = hit[0], hit[1]
        # The read window is measured with the length of the first tag that
        # shares this half; only tags of exactly that length are compared.
        # This also guarantees lev.hamming never sees unequal-length strings.
        window_len = len(text[pos:pos + len(full_seqs[half_seqs.index(half)])])
        for k, candidate in enumerate(half_seqs):
            if candidate != half or len(full_seqs[k]) != window_len:
                continue
            if lev.hamming(full_seqs[k], text[pos:pos + len(full_seqs[k])]) <= 1:
                hits.append((k, pos))
    return hits


def analysis(fastqs, vfasta, jfasta, vtags, jtags, rev_comp=False,
             verbose=False, sep=" "):
    """Assign V and J gene segments to the reads of one or more FASTQ files.

    For every read the V and the J tag are located: an exact tag hit is used
    when there is one, otherwise exactly one half-tag hit whose full tag is
    within one mismatch is required.  The deletions are then resolved with
    ``get_v_deletions`` / ``get_j_deletions``.  For each read where all of
    this succeeds one line is printed to stdout::

        v_index <sep> j_index <sep> deletions_v <sep> deletions_j <sep> insert

    A read is only reported when both deletion look-ups succeed for *that*
    read, so values from an earlier read are never reused.

    Args:
        fastqs: iterable of FASTQ paths, each opened through ``nopen``.
        vfasta: FASTA file with the V gene regions.
        jfasta: FASTA file with the J gene regions.
        vtags: V tag file handed to ``get_tags``.
        jtags: J tag file handed to ``get_tags``.
        rev_comp: when true, a read that cannot be assigned as given is
            retried on its reverse complement.
        verbose: when true, progress and summary lines go to stderr.
        sep: field separator of the printed result lines.

    Returns:
        None.  Results are written to stdout.
    """
    if verbose:
        sys.stderr.write('>> Analyzing %d file(s)\n' % len(fastqs))
        sys.stderr.write(">> Importing known V, and J gene segments and tags\n")

    # Upper-cased sequence of every known V / J region.
    v_genes = list(SeqIO.parse(nopen(vfasta), "fasta"))
    j_genes = list(SeqIO.parse(nopen(jfasta), "fasta"))
    v_regions = [str(gene.seq.upper()) for gene in v_genes]
    j_regions = [str(gene.seq.upper()) for gene in j_genes]

    # get_tags returns four parallel lists; presumably full tags, their left
    # and right halves and a per-tag offset -- confirm against get_tags.
    v_seqs, vleft_seqs, vright_seqs, v_ends = get_tags(vtags)
    j_seqs, jleft_seqs, jright_seqs, j_starts = get_tags(jtags)

    # Keyword matchers for the full tags and for each half.
    v_key = AcoraBuilder(v_seqs).build()
    j_key = AcoraBuilder(j_seqs).build()
    vleft_key = AcoraBuilder(vleft_seqs).build()
    vright_key = AcoraBuilder(vright_seqs).build()
    jleft_key = AcoraBuilder(jleft_seqs).build()
    jright_key = AcoraBuilder(jright_seqs).build()

    def locate_v(text):
        """Return ``(v_index, expected_end_of_v)`` for the read, or None."""
        exact = v_key.findall(text)
        if exact:
            v_match = v_seqs.index(exact[0][0])
            return v_match, exact[0][1] + v_ends[v_match] - 1
        # NOTE(review): right-half hits are compared and positioned from the
        # hit itself, without stepping back over the left half -- confirm
        # this is intended.
        candidates = (_half_tag_hits(text, vleft_key, vleft_seqs, v_seqs) +
                      _half_tag_hits(text, vright_key, vright_seqs, v_seqs))
        if len(candidates) != 1:
            # No hit, or an ambiguous one: the read cannot be assigned.
            return None
        v_match, pos = candidates[0]
        return v_match, pos + v_ends[v_match] - 1

    def locate_j(text):
        """Return ``(j_index, expected_start_of_j)`` for the read, or None."""
        exact = j_key.findall(text)
        if exact:
            j_match = j_seqs.index(exact[0][0])
            return j_match, exact[0][1] - j_starts[j_match]
        # The index of the tag that actually passed the mismatch check is
        # recorded (not the first tag that merely shares the half).
        candidates = [
            (k, pos - j_starts[k])
            for k, pos in _half_tag_hits(text, jleft_key, jleft_seqs, j_seqs)
        ]
        # NOTE(review): the 6 is presumably the length of the left half-tag
        # -- confirm against the tag file / get_tags.
        candidates += [
            (k, pos - j_starts[k] - 6)
            for k, pos in _half_tag_hits(text, jright_key, jright_seqs, j_seqs)
        ]
        if len(candidates) != 1:
            return None
        return candidates[0]

    def classify(seq):
        """Return the five result fields for ``seq`` or None if unassigned."""
        text = str(seq)
        v_hit = locate_v(text)
        j_hit = locate_j(text)
        if v_hit is None or j_hit is None:
            return None
        v_match, temp_end_v = v_hit
        j_match, temp_start_j = j_hit
        # Each look-up runs once; a falsy result means the deletions could
        # not be determined and the read stays unassigned.
        v_found = get_v_deletions(seq, v_match, temp_end_v, v_regions)
        if not v_found:
            return None
        j_found = get_j_deletions(seq, j_match, temp_start_j, j_regions)
        if not j_found:
            return None
        end_v, deletions_v = v_found
        start_j, deletions_j = j_found
        return (v_match, j_match, deletions_v, deletions_j,
                seq[end_v + 1:start_j])

    assigned_count = 0  # reads that were successfully assigned
    seq_count = 0       # reads that were analysed
    t0 = time()

    for fastq in fastqs:
        if verbose:
            sys.stderr.write(">> Starting %s...\n" % fastq)
        for record in SeqIO.parse(nopen(fastq), "fastq"):
            seq_count += 1
            fields = classify(record.seq)
            if fields is None and rev_comp:
                fields = classify(record.reverse_complement().seq)
            if fields is not None:
                assigned_count += 1
                # Single-argument print() behaves the same on Python 2 and 3.
                print(sep.join(map(str, fields)))

    if verbose:
        t = time() - t0
        sys.stderr.write('%d sequences were analysed\n' % seq_count)
        sys.stderr.write('%d sequences were successfully assigned\n' % assigned_count)
        sys.stderr.write('%s seconds elapsed\n' % t)
def apm(motif_list, pattern, max_mismatch):
    """Approximate pattern matching over a list of motifs.

    Returns the positions (in order) of every entry of ``motif_list`` whose
    Hamming distance to ``pattern`` is at most ``max_mismatch``.
    """
    return [
        position
        for position, motif in enumerate(motif_list)
        if Levenshtein.hamming(pattern, motif) <= max_mismatch
    ]
# Interactively label every glyph returned by otsu.cut_to_lines(): glyphs whose
# fingerprint is not yet in `samples` are shown to the user together with the
# label of the closest stored sample, and the answer is stored and pickled.
for word, col_num in otsu.cut_to_lines(rotate=False, show=0):
    _word = word  # handle on the glyph image before it is rescaled
    # Rescale to 48x48, convert to mode '1' and flatten the pixels into one
    # string (every '255' rewritten to '1') so glyphs can be compared.
    word = word.resize((48, 48), Image.BICUBIC).convert('1')
    data = ''.join(str(p) for p in word.getdata()).replace('255', '1')
    m5 = md5(data)
    if m5 not in samples:
        # Keep the /tmp/cut directory open in a viewer to make input easier.
        path = '/tmp/cut/%s.%s_%s.png' % (line_num, col_num, m5)
        word.save(path)

        # Find the stored sample with the smallest Hamming distance.
        min_distance = len(data)
        maybe = None
        # Iterate over a snapshot: unusable samples are deleted on the way.
        for key, value in list(samples.items()):
            binary_string = value[-2]
            try:
                distance = Levenshtein.hamming(binary_string, data)
            except (TypeError, ValueError):
                # The stored sample cannot be compared (e.g. different
                # length): drop it and skip it, so a stale distance is never
                # attributed to it.
                del samples[key]
                continue
            if min_distance > distance:
                maybe = value
                min_distance = distance

        # With no comparable sample (e.g. an empty store) there is nothing to
        # suggest; fall back to an empty suggestion instead of failing.
        maychar = maybe[-1] if maybe is not None else ''
        print('maybe: %s %s' % (maychar, min_distance))
        char = raw_input('input(press RETURN to accept %s):' % maychar)
        if char == '':
            char = maychar
        os.remove(path)
        os.system('clear')
        samples[m5] = [word.tostring(), data, char]
        # Persist after every newly labelled glyph; the file is closed
        # deterministically by the context manager.
        with open(pickle_file, 'wb') as handle:
            pickle.dump(samples, handle)