def get_nn(self, q, nonn="None"):
    """Return the indexed read closest to ``q`` in Hamming distance.

    The query is cut into ``self.tau + 1`` pieces. Every read that shares
    at least one piece with ``q`` (same piece position) becomes a candidate.
    Among the candidates, the read with the smallest Hamming distance wins;
    ties go to the read with the higher ``self.priority``.

    :param q: query string; must have length ``self.l``
    :param nonn: when equal to the string ``"None"``, a best candidate that
        is farther than ``self.tau`` from ``q`` is reported as ``None``
    :return: the nearest read, or ``None`` when there is no candidate (or
        the best one is too far and ``nonn == "None"``)
    """
    from Levenshtein import hamming

    assert (len(q) == self.l)

    candidates = set()
    for piece_no in range(self.tau + 1):
        begin = piece_no * self.piece_len
        piece = q[begin:begin + self.piece_lens[piece_no]]
        bucket = self.sets_interval[piece_no]
        if piece in bucket:
            candidates.update(bucket[piece])

    if not candidates:
        return None

    def rank(idx):
        # Smaller distance first; negated priority so higher priority wins.
        return (hamming(self.reads[idx], q), -self.priority[idx])

    nearest = self.reads[min(candidates, key=rank)]
    if hamming(nearest, q) > self.tau and nonn == "None":
        return None
    return nearest
def hamming_graph_naive(reads, tau=1, **kwargs):
    """Construct hamming(tau) graph using naive O(N**2 d) algorithm.

    Every pair of reads is compared; pairs whose Hamming distance is at
    most ``tau`` are connected by an edge weighted with that distance.

    :param reads: non-empty sequence of equal-length strings
    :param tau: maximum Hamming distance for two reads to be connected
    :param kwargs: extra vertex attributes, ``name=values`` with one value
        per read
    :return: undirected ``igraph.Graph`` with a ``read`` vertex attribute
        and a ``weight`` edge attribute
    """
    import igraph as ig
    import numpy as np
    from Levenshtein import hamming

    read_len = len(reads[0])
    for read in reads:
        assert (len(read) == read_len)

    N = len(reads)
    m = np.zeros((N, N), dtype=int)
    for i in range(N):
        for j in range(i):
            dist = hamming(reads[i], reads[j])
            m[i, j] = m[j, i] = dist if dist <= tau else 0

    # Be careful! Zero elements are not interpreted as zero-length edges,
    # so identical reads (distance 0) end up unconnected.
    g = ig.Graph.Weighted_Adjacency(m.tolist(),
                                    mode="UNDIRECTED",
                                    attr="weight",
                                    loops=False)
    g.vs["read"] = reads
    # .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
    for attr_name, attr_data in kwargs.items():
        g.vs[attr_name] = attr_data
    return g
def filter(msg, filterWords, englishWords, hammingDistance, levenshteinDistance):
    """Censor ``msg`` when it is a near-miss spelling of a filtered word.

    The message is first stripped of non-alphanumeric characters. For each
    filter word of the same length, the stripped message matches when its
    Hamming distance is at most ``hammingDistance`` or, failing that, its
    Levenshtein distance is at most ``levenshteinDistance``. For a filter
    word of different length, only the Levenshtein check is applied, and
    only when the length difference itself is within ``levenshteinDistance``.
    A match is censored only if the stripped message is not a proper
    English word according to ``WordChecker.check_word_exists_in``.

    :param msg: original message
    :param filterWords: iterable of words to filter
    :param englishWords: word collection passed to ``WordChecker``
    :param hammingDistance: maximum Hamming distance counted as a match
    :param levenshteinDistance: maximum Levenshtein distance counted as a match
    :return: result of ``generateRandomAsteriskString()`` when censored,
        otherwise the original ``msg`` unchanged
    """
    # Filter special characters from string
    filteredMsg = ''.join(e for e in msg if e.isalnum())

    for word in filterWords:
        # abs() must wrap the length difference, not the comparison result.
        length_gap = abs(len(word) - len(filteredMsg))
        if length_gap == 0:
            # Hamming distance is only defined for equal lengths; fall back
            # to Levenshtein when the Hamming check does not match.
            is_close = (hamming(word, filteredMsg) <= hammingDistance
                        or distance(word, filteredMsg) <= levenshteinDistance)
        elif length_gap <= levenshteinDistance:
            is_close = distance(word, filteredMsg) <= levenshteinDistance
        else:
            # The edit distance can never be smaller than the length gap.
            is_close = False

        # Only censor when the message is not itself a proper English word.
        if is_close and not WordChecker.check_word_exists_in(
                englishWords, filteredMsg):
            return generateRandomAsteriskString()

    # Otherwise, return original string
    return msg
def hamming_distance(words: Iterator[str], vocabulary: Dict[str, int]):
    """Corrects the words based on Hamming distances

    For every misspelled word, up to five vocabulary entries with the
    smallest Hamming distance are emitted through ``output`` as one
    tab-separated line. Among those, entries with equal distance are
    ordered by descending frequency. Vocabulary entries whose length
    differs from the word are ranked after every same-length entry.

    Args:
        words (Iterator[str]): Iterator over the misspelled words
        vocabulary (Dict[str,int]) : dictionary holding words and their frequency
    """
    # The vocabulary does not change between words; build the list once.
    vocab_list = list(vocabulary)

    for word in words:
        # A Hamming distance never exceeds the word length, so this value
        # ranks strictly behind every comparable vocabulary entry.
        unreachable = len(word) + 1
        distances = [
            hamming(word, vocab) if len(vocab) == len(word) else unreachable
            for vocab in vocab_list
        ]

        idx = np.array(distances).argsort()[:5]
        # The vocabulary may hold fewer than five entries.
        top = len(idx)
        # Within a run of equal distances, move the more frequent word first.
        for i in range(top):
            for j in range(i + 1, top):
                if distances[idx[i]] == distances[idx[j]]:
                    if vocabulary.get(vocab_list[idx[i]]) < vocabulary.get(vocab_list[idx[j]]):
                        idx[i], idx[j] = idx[j], idx[i]

        suggestions = [vocab_list[i] for i in idx]
        output("{misspelled}\t{corrections}".format(
            misspelled=word,
            corrections="\t".join(suggestions)
        ))  # may cause IO bottleneck
def hamm(str1, str2):
    """
    Hamming distance with option to throw None for strings of different lengths.

    :param str1: string
    :param str2: string
    :return: int or None
    """
    # Compare lengths by value: ``is`` tests object identity, which is only
    # reliable for small cached integers and breaks for long strings.
    return hamming(str1, str2) if len(str1) == len(str2) else None
def checkHamming(barcodes, barcode):
    """Look for a known barcode within Hamming distance 2 of ``barcode``.

    :param barcodes: iterable of reference barcodes
    :param barcode: observed barcode
    :return: ``(True, reference_barcode)`` for the first reference within
        distance 2, otherwise ``(False, barcode)`` with the input unchanged.
        An empty ``barcodes`` yields ``(False, barcode)``.
    """
    for bc in barcodes:
        if hamming(barcode, bc) <= 2:
            return (True, bc)
    # Reached for an empty reference list too, so the result is always defined.
    return (False, barcode)
def checkHamming(barcodes, barcode):
    '''Given a list of barcodes, check that the given barcode is within
    Hamming distance 2 of any of them.

    Returns ``(True, reference_barcode)`` for the first reference barcode
    within distance 2, otherwise ``(False, barcode)`` with the input
    unchanged. An empty list yields ``(False, barcode)``.
    '''
    for bc in barcodes:
        if hamming(barcode, bc) <= 2:
            return (True, bc)
    # Reached for an empty reference list too, so the result is always defined.
    return (False, barcode)
def segment_match(feature_strings, target_segment):
    '''Pick the phoneme whose feature string is nearest to the target's.

    ``feature_strings`` is a list of tuples: item 0 is the phoneme and
    item 1 is its feature string. The Hamming distance between each feature
    string and the feature string of ``target_segment`` is computed; of all
    phonemes sharing the smallest distance, the shortest one is returned
    (the first such one on equal length). Results are memoised in
    ``deparse_cache`` keyed by the target feature string.
    '''
    wanted = feature_string(target_segment)

    # A previously matched segment is answered straight from the cache.
    if wanted in deparse_cache:
        return deparse_cache[wanted]

    # The first candidate seeds the running minimum.
    lowest = hamming(wanted, feature_strings[0][1])
    tied = [feature_strings[0][0]]

    for entry in feature_strings[1:]:
        candidate = hamming(wanted, entry[1])
        if candidate < lowest:
            # Strictly better: start a fresh list of winners.
            lowest = candidate
            tied = [entry[0]]
        elif candidate == lowest:
            tied.append(entry[0])

    # Prefer the shortest phoneme so segments deparse as simply as possible.
    simplest = min(tied, key=len)
    deparse_cache[wanted] = simplest
    return simplest
def main():
    """Copy reads whose tag is within Hamming distance 1 of ``args.tag``.

    Reads come from ``args.input`` and matching reads are written to
    ``args.output``. The tag is the text after the last ``#`` of the read
    identifier, up to the first following ``/``.
    """
    args = get_args()
    fastqs = fastq.FastqReader(args.input)
    out_fastq = fastq.FastqWriter(args.output)
    try:
        for read in fastqs:
            read_tag = read.identifier.split('#')[-1].split('/')[0]
            if hamming(read_tag, args.tag) <= 1:
                out_fastq.write(read)
    finally:
        # Close the writer even when reading or comparing raises, so the
        # output written so far is not left in an unclosed file.
        out_fastq.close()
def check_hamming_distance(self, iList, datatype, d_type, split_line):
    """Fallback match of ``d_type`` against ``datatype`` by Hamming distance.

    The two labels match when they have the same length and differ in at
    most two positions. On a match a notice is printed and the value is
    returned: for ``'indicators of compromise'`` the result of
    ``self.ret_indicators_of_compromise(iList)``, otherwise the elements of
    ``split_line`` after the first, concatenated. Without a match the empty
    string is returned.
    """
    MAX_SPELLING_ERRORS = 2

    # Hamming distance is only computed for equal-length labels.
    if len(d_type) != len(datatype):
        return ''
    if hamming(d_type, datatype) > MAX_SPELLING_ERRORS:
        return ''

    print('\tno regex match on %s, using hamming distance' % datatype)
    if datatype == 'indicators of compromise':
        return self.ret_indicators_of_compromise(iList)
    return ''.join(split_line[1:])
def merge_paths(paths, MIN_DIST=1):
    """Drop the lower-scored path of every pair of near-identical paths.

    ``paths`` is an indexable collection of tuples whose item 0 is the
    sequence and item 1 the score. Every pair of paths whose sequences are
    within Hamming distance ``MIN_DIST`` loses its lower-scored member (the
    earlier one in ``paths`` on equal scores). Survivors are returned in
    ascending score order, one entry per distinct sequence.
    """
    def seq_of(tup):
        return tup[0]

    def score_of(tup):
        return tup[1]

    # Keyed by sequence, inserted in ascending score order.
    survivors = {}
    for tup in sorted(paths, key=score_of):
        survivors[seq_of(tup)] = tup

    total = len(paths)
    for i in range(total):
        for j in range(i + 1, total):
            first, second = paths[i], paths[j]
            if hamming(seq_of(first), seq_of(second)) > MIN_DIST:
                continue
            loser = min((first, second), key=score_of)
            # The loser may already have been removed by an earlier pair.
            survivors.pop(seq_of(loser), None)

    return list(survivors.values())
def hamming_graph_knuth(reads, tau=1, **kwargs):
    """Construct hamming(tau) graph using Knuth's algorithm.

    Each read is cut into ``tau + 1`` pieces. Reads that share an identical
    piece at the same position are collected as candidate pairs, and only
    those pairs are compared with the full Hamming distance. Pairs within
    ``tau`` become edges weighted by their distance.

    :param reads: non-empty sequence of equal-length strings
    :param tau: maximum Hamming distance for two reads to be connected
    :param kwargs: extra vertex attributes, ``name=values`` with one value
        per read
    :return: ``igraph.Graph`` with a ``read`` vertex attribute and a
        ``weight`` edge attribute
    """
    import igraph as ig
    from collections import defaultdict
    from itertools import combinations
    from Levenshtein import hamming

    read_len = len(reads[0])
    for read in reads:
        assert (len(read) == read_len)

    # The last piece absorbs the remainder of the integer division.
    piece_len = read_len // (tau + 1)
    piece_len_last = read_len - piece_len * tau
    piece_lens = [piece_len] * tau + [piece_len_last]

    g = ig.Graph(len(reads))
    g.vs["read"] = reads
    # .items()/.values() exist on both Python 2 and 3; the iter* variants
    # are Python 2 only.
    for attr_name, attr_data in kwargs.items():
        g.vs[attr_name] = attr_data

    edges_for_check = set()
    for j in range(tau + 1):
        begin = j * piece_len
        end = begin + piece_lens[j]
        sets = defaultdict(list)
        for i, read in enumerate(reads):
            sets[read[begin:end]].append(i)
        # Reads sharing this piece are candidates; indices within a bucket
        # are ascending, so each pair is stored as (smaller, larger).
        for v_list in sets.values():
            edges_for_check.update(combinations(v_list, 2))

    for v1, v2 in edges_for_check:
        d = hamming(reads[v1], reads[v2])
        if d <= tau:
            g.add_edge(v1, v2, weight=d)
    return g
def assign_read(params):
    """Assign one read to the consensus barcode nearest in Hamming distance.

    :param params: tuple ``(consensus_bcs, (reads_data, reads_offset),
        (barcodes_data, barcodes_offset))``. The observed barcode is taken
        from ``reads_data[1]`` (stripped), sliced with the module-level
        ``args['barcode_start']`` and ``args['barcode_end']``.
    :return: ``(barcode, reads_offset, barcodes_offset)`` where ``barcode``
        is the single closest consensus barcode, or ``'unassigned'`` when
        the minimum distance is tied or there are no consensus barcodes
    """
    (consensus_bcs,
     (reads_data, reads_offset),
     (barcodes_data, barcodes_offset)) = params

    obs_bc = reads_data[1].strip()[args['barcode_start']:args['barcode_end']]

    min_dist = None
    assignment = []
    for consensus_bc in consensus_bcs:
        dist = hamming(obs_bc, consensus_bc)
        if min_dist is None or dist < min_dist:
            min_dist = dist
            assignment = [consensus_bc]
        elif dist == min_dist:
            # in the case of a tie, remember every equally good barcode
            assignment.append(consensus_bc)

    # return the best unique assignment
    if len(assignment) == 1:
        return (assignment[0], reads_offset, barcodes_offset)
    # or don't assign read (in the case of a tie)
    return ('unassigned', reads_offset, barcodes_offset)
group_cnt += 1 if group_cnt < 39490: continue print index groupname1, groupname2 = row[0], row[1] dna_series = df[(df[0] == groupname1) & (df[1] == groupname2)][2] #前者存数据,后者存结果 dna_list = list(dna_series) #将大组数据存进一个list dna_result_dict = {} for dna in dna_list: isNewKey = True for key in dna_result_dict.keys(): if hamming(dna, key) < 3: #小于3则相似 dna_result_dict[key].append(dna) isNewKey = False break if isNewKey: dna_result_dict[dna] = [dna] #不相似的归到一组 no_match_list = [] group_index = 1 for key, value in dna_result_dict.iteritems(): if len(value) == 1: no_match_list.append(key) # del dna_result_dict[key] else: result_list.append([groupname1, groupname2, group_index, value])
def dist(i, j):
    """Return the Hamming distance between ``reads[i]`` and ``reads[j]``."""
    first = reads[i]
    second = reads[j]
    return hamming(first, second)
def test_zero_differences(self):
    """[hamming-c] no differences"""
    # Identical strings must be at distance zero.
    self.assertEqual(0, hamming('wonderful', 'wonderful'))
priority=original_barcode_mult) print "Index constructed" bad_barcodes = [] barcode_barcode = {} dists = [] for barcode in data_barcodes: if barcode in original_barcodes: barcode_barcode[barcode] = barcode dists.append(0) else: neib = tree.get_nn(barcode, nonn="None") if neib is not None: barcode_barcode[barcode] = neib dists.append(hamming(neib, barcode)) else: bad_barcodes.append(barcode) dists.append(-1) print "Bad-coded reads %d unique barcodes %d" % (sum(barcodes_count[barcode] for barcode in bad_barcodes), len(bad_barcodes)) print "Well-coded reads %d unique barcodes %d" % (sum(barcodes_count[barcode] for barcode in barcode_barcode.iterkeys()), len(barcode_barcode)) dist_hist = defaultdict(int) dist_barcodes = defaultdict(list) for dist, barcode in zip(dists, data_barcodes): dist_hist[dist] += 1
remaining_seqs.pop(max_seq) seqs_in_cluster = 1 for sample_ID in all_samples_reads: if max_seq in all_samples_reads[sample_ID]: all_samples_clusters[sample_ID][max_seq] = all_samples_reads[ sample_ID][max_seq] for next_seq, matches in sorted(remaining_seqs.items(), key=lambda x: x[1], reverse=True): if len(next_seq) != len(max_seq): mismatches = 99 #wrong length else: mismatches = hamming(max_seq, next_seq) if mismatches <= 1 or (next_seq in max_seq): #allow missing bases at ends #add_to_cluster current_cluster += remaining_seqs[next_seq] for sample_ID in all_samples_reads: if next_seq in all_samples_reads[sample_ID]: increment(all_samples_clusters[sample_ID], max_seq, all_samples_reads[sample_ID][next_seq]) remaining_seqs.pop(next_seq) seqs_in_cluster += 1
def even_split_mismatching(self, kmers, kmer_dict, rev_kmer_dict, peptide_length):
    '''Find places where the query k-mers match within the mismatch limit.

    Every ``self.split``-th k-mer of the query is used as an anchor. For
    each position the anchor occurs at (``kmer_dict[kmer]``), the other
    query k-mers, also stepped by ``self.split``, are compared against the
    k-mers stored at the corresponding positions in ``rev_kmer_dict``, and
    the Hamming distances are summed. An alignment is kept when the sum
    does not exceed ``self.max_mismatches`` and every stored k-mer needed
    to spell out ``peptide_length`` positions exists.

    Returns a set of ``(matched_peptide, mismatches, start_position)``
    tuples. With ``self.best_match`` set, the set is returned as soon as an
    alignment with zero mismatches has been added.
    '''
    step = self.split
    # a set, so that the same alignment is never recorded twice
    found = set()

    for anchor in range(0, len(kmers), step):
        # an anchor missing from the index is skipped; a later anchor can
        # still produce a match
        try:
            for hit in kmer_dict[kmers[anchor]]:
                limit = self.max_mismatches + 1
                origin = hit - anchor
                total = 0

                # neighbours to the left of the anchor
                for left in range(0, anchor, step):
                    try:
                        total += hamming(rev_kmer_dict[origin + left], kmers[left])
                        if total >= limit:
                            break
                    except KeyError:
                        # no stored k-mer there: disqualify this alignment
                        total = 100

                # neighbours to the right of the anchor
                for right in range(anchor + step, len(kmers), step):
                    try:
                        total += hamming(rev_kmer_dict[origin + right], kmers[right])
                        if total >= limit:
                            break
                    except KeyError:
                        # no stored k-mer there: disqualify this alignment
                        total = 100

                if total >= limit:
                    continue

                # rebuild the matched peptide from the stored k-mers
                try:
                    matched_peptide = ''.join(
                        [rev_kmer_dict[origin + offset]
                         for offset in range(0, peptide_length, step)])
                except KeyError:
                    continue

                found.add((matched_peptide, total, origin))
                if self.best_match and not total:
                    return found
        except KeyError:
            continue

    return found
def determine_edit_distance(G1, G2):
    """Return the Hamming distance between the string forms of two graphs.

    Each graph is converted with ``graph2str`` and then ``str``.
    """
    # The two serialised strings MUST have the same length.
    serialised = [str(graph2str(graph)) for graph in (G1, G2)]
    return hamming(*serialised)
def test_one_substitution(self):
    """[hamming-c] one difference"""
    # A single substituted character ('e' -> 'i') counts as distance one.
    self.assertEqual(1, hamming('wonderful', 'wondirful'))
def hammng(a, b):
    """Compute the Hamming distance between ``a`` and ``b``.

    Both arguments are passed unchanged to ``hamming``.
    """
    result = hamming(a, b)
    return result
def matched_name_in_snt(name, alias_names, snt, mode):
    """Report whether an entity name is mentioned in an utterance.

    :param name: entity name to look for
    :param alias_names: alternative names, used only in ``'alias'`` mode
    :param snt: the utterance text to search
    :param mode: matching strategy:
        ``'exact'``   - case-insensitive whole-word regex match of ``name``;
        ``'hamming'`` - fuzzy match of ``name`` against word-bounded windows
        of the utterance (names of 4 characters or fewer never match);
        ``'alias'``   - whole-word regex match of each alias, accepted only
        when the character checked after the match start is upper case;
        ``'lemma'``   - whole-word regex match of ``name`` against the
        lower-cased, lemmatised utterance
    :return: ``True`` on a match, ``False`` otherwise (including for an
        unrecognised ``mode``)

    NOTE(review): the name is inserted into the regex patterns unescaped, so
    any regex metacharacters in it are interpreted as pattern syntax.
    NOTE(review): the patterns are non-raw strings containing ``\\W``; newer
    Python versions warn about such invalid escape sequences.
    """
    user_utterance = snt
    domEnt = name
    alias_domEnts = alias_names
    # check 4968
    if mode == 'exact':
        # hard matching: the name at the start, middle or end of the
        # utterance, delimited by non-word characters
        searchObj = re.search("(^{}\W|\W{}\W|\W{}$)".format(domEnt, domEnt, domEnt), user_utterance, re.I)
        if searchObj:
            return True
    if mode == 'hamming':
        # hamming distance; short names are excluded from fuzzy matching
        if len(domEnt) <= 4:
            return False
        # start runs up to len(user_utterance) - len(domEnt) + 1 so that the
        # "-1" window below (one character shorter than the name) can still
        # sit at the very end of the utterance
        for start in range(len(user_utterance) - len(domEnt) + 2):
            # Same-length window. It is only compared when it looks like a
            # properly segmented word: at the start of the utterance followed
            # by a non-alphanumeric character, at the end preceded by one, or
            # in the middle with one on both sides.
            # NOTE(review): the middle case tests start-1 > 0, which excludes
            # start == 1 -- confirm whether start > 0 was intended.
            if ( start == 0 and start + len(domEnt) < len(user_utterance) and\
                 not user_utterance[start + len(domEnt)].isalnum() or\
                 start == len(user_utterance) - len(domEnt) and start > 0 and\
                 not user_utterance[start-1].isalnum() or\
                 (start-1 > 0 and not user_utterance[start-1].isalnum() and\
                 start + len(domEnt) < len(user_utterance) and\
                 not user_utterance[start + len(domEnt)].isalnum()) ):
                # at most one substituted character, and the first character
                # must agree (case-insensitively)
                if hamming(domEnt.lower(), user_utterance.lower()[start: start + len(domEnt)]) <= 1 and user_utterance[start].lower() == domEnt[0].lower():
                    return True
            # +1: window one character longer than the name, same boundary
            # rules; compared with fuzz.ratio (threshold 90) instead of
            # Hamming distance because the lengths differ
            if ( start == 0 and start + len(domEnt) + 1 < len(user_utterance) and\
                 not user_utterance[start + len(domEnt) + 1].isalnum() or\
                 start == len(user_utterance) - len(domEnt) - 1 and start > 0 and\
                 not user_utterance[start-1].isalnum() or\
                 (start-1 > 0 and not user_utterance[start-1].isalnum() and\
                 start + len(domEnt) + 1 < len(user_utterance) and\
                 not user_utterance[start + len(domEnt) + 1].isalnum()) ):
                if fuzz.ratio(domEnt.lower(), user_utterance.lower()[start: start + len(domEnt) + 1]) >= 90 and user_utterance[start].lower() == domEnt[0].lower():
                    return True
            # -1: window one character shorter than the name, same boundary
            # rules and the same fuzz.ratio threshold
            if ( start == 0 and start + len(domEnt) - 1 < len(user_utterance) and\
                 not user_utterance[start + len(domEnt) - 1].isalnum() or\
                 start == len(user_utterance) - len(domEnt) + 1 and start > 0 and\
                 not user_utterance[start-1].isalnum() or\
                 (start-1 > 0 and not user_utterance[start-1].isalnum() and\
                 start + len(domEnt) - 1 < len(user_utterance) and\
                 not user_utterance[start + len(domEnt) - 1].isalnum()) ):
                if fuzz.ratio(domEnt.lower(), user_utterance.lower()[start: start + len(domEnt) - 1]) >= 90 and user_utterance[start].lower() == domEnt[0].lower():
                    return True
    if mode == 'alias':
        # alias matching; note that the loop rebinds domEnt to each alias
        for domEnt in alias_domEnts:
            searchObj = re.search("(^{}\W|\W{}\W|\W{}$)".format(domEnt, domEnt, domEnt), user_utterance, re.I)
            if searchObj:
                # NOTE(review): span(1)[0] + 1 skips the leading \W of the
                # match; when the ^ alternative matched there is no leading
                # \W, so this looks like it tests the alias's second
                # character -- confirm.
                if user_utterance[searchObj.span(1)[0]+1].isupper():
                    return True
                else:
                    # NOTE(review): returns on the first alias found in the
                    # utterance, so later aliases are never tried -- confirm
                    # this is intended.
                    return False
            # if upper then cannot match 5325, 5326, etc, where user types lowercase incomplete entity name
            # but if not upper, then ask will be identified as Ask Restaurant
            # (a Hamming-distance fallback for aliases used to follow here
            # and is disabled)
    if mode == 'lemma':
        lemmatizer = WordNetLemmatizer().lemmatize # lemmatizer function or None
        tokenizer = nltk.tokenize.WordPunctTokenizer().tokenize
        tokenized = tokenizer(user_utterance)
        tokenized_pos = nltk.pos_tag(tokenized)
        # lower-case and lemmatise every token (POS falls back to NOUN when
        # get_wordnet_pos gives nothing), then re-join with single spaces
        tokenized_joined = " ".join([lemmatize(lemmatizer, token.lower(), get_wordnet_pos(pos) or wordnet.NOUN).lower() for token, pos in tokenized_pos])
        # lemma matching
        searchObj = re.search("(^{}\W|\W{}\W|\W{}$)".format(domEnt, domEnt, domEnt), tokenized_joined, re.I)
        if searchObj:
            return True
    return False
bc = dict() first = True with open(args.barcodesFile, 'r') as f: for line in f: barcode, count = line.rstrip().split("\t") count = int(count) if first: bc[barcode] = count first = False else: found = False for b in list(bc): # Hamming distance is only for equal lengths. Different lengths should not occur under current CaTCH design, but may be diagnostic counts or unforseen irregularities. # Explicitly exclude any diagnostic categories from having their distances compared. if len(b) == len(barcode) and b not in ["unknown", "SampleUnknown", "unmatched", "BCUnmatched", "empty", "EmptyVector", "spike", "SpikeIn"]: h = hamming(b, barcode) sh = str(h) hbcstat.update([sh]) if sh in hrmin: hrmin[sh] = min([hrmin[sh], count, bc[b]]) hrmax[sh] = max([hrmax[sh], count, bc[b]]) else: hrmin[sh] = count hrmax[sh] = count if len(b) == len(barcode) and h <= args.hammDist and not found: if bc[b] >= count: # If existing sequence is more abundant, update its count and discard the new sequence # This is the most likely scenario, as the quantifier orders the barcodes by decreasing abundance. bc[b] = bc[b] + count found = True # break