def _strand_edit_stats(target_seq, seq):
    """Score one strand of a candidate site against the target sequence.

    Every ``N`` in *target_seq* is substituted with each of A/T/G/C in turn;
    the Levenshtein distance and edit operations to *seq* are computed for
    each substitution and one of them is selected.

    Returns:
        ``(distance, editops, deletions)`` for the selected substitution,
        where *deletions* counts the ``'delete'`` operations in *editops*.
    """
    scores = {}
    for nuc in "ATGC":
        # str.replace returns a new string; keep it (the original code threw
        # the result away, so the substitution never took effect).
        candidate = target_seq.replace('N', nuc)
        scores[nuc] = (l.distance(candidate, seq), l.editops(candidate, seq))
    # NOTE(review): the original selects the *largest* (distance, editops)
    # pair; kept unchanged -- confirm whether the smallest was intended.
    chosen = max(scores, key=scores.get)
    dist, editops = scores[chosen]
    deletions = sum(1 for op in editops if op[0] == 'delete')
    return dist, editops, deletions


def get_edit_dist(input_file, output_file, target_seq):
    """Write per-site edit-distance statistics as a tab-separated table.

    Each non-blank line of *input_file* is split on whitespace and must hold
    at least four fields: chromosome, location, forward sequence, reverse
    sequence.  Both sequences are scored against *target_seq* and one row is
    written to *output_file*.

    Args:
        input_file: path of the whitespace-delimited input table.
        output_file: path of the TSV report to create (overwritten).
        target_seq: target sequence; ``N`` characters are substituted.

    Raises:
        IndexError: if a non-blank line has fewer than four fields.
    """
    header = ("Chr\tLocation\tForward29\tReverse29\tEdit dist for\t"
              "Edit dist rev\tStep for\tStep rev\tDeletion# for\t"
              "Deletion# rev\tBulge dist for\tBulge dist rev\n")
    # Clean tab-separated row; the previous literal embedded line-continuation
    # residue ("\ " plus indentation) inside the fields.
    row_format = ("%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t"
                  "%d\t%d\t%d\t%d\t%d\t%d\n")
    # Context managers close both files even if a row fails to parse.
    with open(input_file, 'r') as fi, open(output_file, 'w') as fo:
        fo.write(header)
        for line in fi:
            units = line.split()
            if not units:
                continue  # tolerate blank lines
            chrm = units[0]
            loc = units[1]
            forseq = units[2]
            revseq = units[3]
            for_dist, for_editops, for_deletion = _strand_edit_stats(
                target_seq, forseq)
            rev_dist, rev_editops, rev_deletion = _strand_edit_stats(
                target_seq, revseq)
            # Bulge distance: each deletion is weighted two extra units.
            bulge_l = [for_dist + for_deletion * 2,
                       rev_dist + rev_deletion * 2]
            del_l = [for_deletion, rev_deletion]
            best_bulge = min(bulge_l)
            fo.write(row_format % (
                chrm, loc, forseq, revseq,
                for_dist, rev_dist,
                for_editops, rev_editops,
                for_deletion, rev_deletion,
                bulge_l[0], bulge_l[1],
                best_bulge, del_l[bulge_l.index(best_bulge)]))
def main():
    """Print distance, edit operations and matching blocks for a demo pair.

    The reference/query sequences are hard-coded; the query contains an
    ``N`` placeholder.
    """
    reference = 'AATGTGTGTCTGCTGGAAGCTCCTATTCTTCCGCCATTTTCCAGTCCTCCAGAAGTTTCCTGATGGTCCATGTCTGAATTAGACACCCCTCTTCTTTGTTCCAGTTGCACCTGTAATTCTTCAGCATAGTACTTCTTAAACTGTTTTTAA'
    query = 'TTTNCTGATGGTCCATGTCTGTTACTC'

    print(l.distance(reference, query))

    operations = l.editops(reference, query)
    print(operations)

    # Matching blocks are derived from a freshly computed set of edit
    # operations for the same pair.
    blocks = l.matching_blocks(l.editops(reference, query), reference, query)
    print(blocks)
def _strand_edit_stats(target_seq, seq):
    """Return ``(distance, editops, deletions)`` for *seq* vs *target_seq*.

    Each ``N`` of *target_seq* is replaced by A, T, G and C in turn; the
    distance and edit operations are computed per substitution and one
    substitution is selected.  *deletions* is the number of ``'delete'``
    operations in the selected edit script.
    """
    scores = {}
    for nuc in "ATGC":
        # Bind the substituted string: str.replace does not modify in place,
        # and the original code discarded its return value.
        candidate = target_seq.replace('N', nuc)
        scores[nuc] = (l.distance(candidate, seq), l.editops(candidate, seq))
    # NOTE(review): selection by max() is preserved from the original code;
    # confirm whether the minimum-distance substitution was meant.
    chosen = max(scores, key=scores.get)
    dist, editops = scores[chosen]
    deletions = sum(1 for op in editops if op[0] == 'delete')
    return dist, editops, deletions


def get_edit_dist(input_file, output_file, target_seq):
    """Score forward/reverse sequences against a target and write a TSV.

    Reads whitespace-delimited rows (chromosome, location, forward sequence,
    reverse sequence) from *input_file* and writes one tab-separated result
    row per input row to *output_file*, preceded by a header line.

    Args:
        input_file: path of the input table.
        output_file: path of the report to write (overwritten).
        target_seq: target sequence; ``N`` characters are substituted.

    Raises:
        IndexError: if a non-blank line has fewer than four fields.
    """
    header = ("Chr\tLocation\tForward29\tReverse29\tEdit dist for\t"
              "Edit dist rev\tStep for\tStep rev\tDeletion# for\t"
              "Deletion# rev\tBulge dist for\tBulge dist rev\n")
    # Plain tab-separated fields; the earlier format string carried
    # backslash-continuation residue into every row.
    row_format = ("%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t"
                  "%d\t%d\t%d\t%d\t%d\t%d\n")
    # `with` guarantees both handles are closed on any exit path.
    with open(input_file, 'r') as fi, open(output_file, 'w') as fo:
        fo.write(header)
        for line in fi:  # file objects iterate lazily; xreadlines is gone
            units = line.split()
            if not units:
                continue  # skip blank lines instead of crashing
            chrm = units[0]
            loc = units[1]
            forseq = units[2]
            revseq = units[3]
            for_dist, for_editops, for_deletion = _strand_edit_stats(
                target_seq, forseq)
            rev_dist, rev_editops, rev_deletion = _strand_edit_stats(
                target_seq, revseq)
            # Each deletion adds two extra units to the bulge distance.
            bulge_l = [for_dist + for_deletion * 2,
                       rev_dist + rev_deletion * 2]
            del_l = [for_deletion, rev_deletion]
            best_bulge = min(bulge_l)
            fo.write(row_format % (
                chrm, loc, forseq, revseq,
                for_dist, rev_dist,
                for_editops, rev_editops,
                for_deletion, rev_deletion,
                bulge_l[0], bulge_l[1],
                best_bulge, del_l[bulge_l.index(best_bulge)]))
def debugging(folderName):
    """Print Levenshtein diagnostics for fixed windows of three FASTA files.

    Loads ``reference.fasta``, ``interiors.fasta`` and ``GTDic.fasta`` from
    *folderName* via ``IORobot.loadContigsFromFile`` and prints distances,
    edit operations and 20-character excerpts for the ``Segkk0`` /
    ``Segkk1`` entries.  Output goes to stdout; nothing is returned.
    """
    referenceDic = IORobot.loadContigsFromFile(folderName, "reference.fasta")
    interiorsDic = IORobot.loadContigsFromFile(folderName, "interiors.fasta")
    GTDic = IORobot.loadContigsFromFile(folderName, "GTDic.fasta")

    # 12 kb windows from the reference contigs.
    str1 = referenceDic["Segkk0"][2500000:2500000 + 12000]
    str2 = referenceDic["Segkk1"][2500000:2500000 + 12000]
    print(Levenshtein.distance(str1, str2))
    print(Levenshtein.editops(str1, str2))

    # 12 kb windows from the interior contigs.
    str3 = interiorsDic["Segkk0"][7000:7000 + 12000]
    str4 = interiorsDic["Segkk1"][7000:7000 + 12000]
    print(Levenshtein.distance(str1, str4))
    print(Levenshtein.distance(str2, str3))
    print(Levenshtein.editops(str1, str4))
    print("")

    offset = 4000
    print(Levenshtein.editops(str2, str3))
    # 20-character excerpts centred on `offset` for eyeballing differences.
    print(str1[offset - 10:offset + 10])
    print(str2[offset - 10:offset + 10])
    print(str3[offset - 10:offset + 10])
    print(str4[offset - 10:offset + 10])

    str5 = GTDic["Segkk0"][7000:7000 + 12000]
    str6 = GTDic["Segkk1"][7000:7000 + 12000]
    print(str5[offset - 10:offset + 10])
    print(str6[offset - 10:offset + 10])
    print(Levenshtein.editops(str2, str4))
def mappingtxtans(self, speech_results):
    """Locate positions where recognised speech differs from the answer text.

    The answer file path is read from ``config['Answer']['ans_txt']``; the
    line selected by ``self.read_num`` is compared with *speech_results*
    using Levenshtein edit operations.

    Args:
        speech_results: recognised text to compare against the answer line.

    Returns:
        A list of positions in *speech_results* touched by a non-delete
        edit operation (positions equal to the index of the first comma are
        skipped), or the string ``"no different"`` when there are none.
    """
    ans_txt = config['Answer'].get('ans_txt')
    # Context manager: the original never closed the file handle.
    with open(ans_txt) as f:
        text = [line.rstrip() for line in f]
    print(text)

    # Wrap around once past the last line.  The original wrote `== 0` (a
    # comparison with no effect) and used `>`, which let an index equal to
    # len(text) through to the lookup below.
    if self.read_num >= len(text):
        self.read_num = 0
    txt_ans = text[self.read_num]

    com_index = speech_results.find(',')
    index = [
        op[2]
        for op in Levenshtein.editops(txt_ans, speech_results)
        if op[0] != 'delete' and op[2] != com_index
    ]
    if not index:
        return "no different"
    return index
def operations(self):
    """
    Metrics to determine the functionality of the surface segmentation generator

    Every line of every file in ``self.all_files`` is split on ``" | "``;
    field 0 is the word and field 3 the labelled segmentation, which is
    stripped of labels and de-segmented before being compared to the word.

    :return: number of edited words, total words in dataset, number of operations performed, number of deletions,
    number of replacements
    """
    edited_words, total_words, operations, delete, replace = 0, 0, 0, 0, 0
    for file in self.all_files:
        # `with` closes each file; the original leaked one handle per file.
        with open(os.path.join(sys.path[0], file), 'r') as input_file:
            for line in input_file:
                content = line.rstrip('\n').split(" | ")
                orthographic = de_segment(removeLabels(content[3]))
                word = content[0]
                total_words += 1
                if word == orthographic:
                    continue
                edited_words += 1
                edits = LevenshteinDistance.editops(orthographic, word)
                for ed in edits:
                    # Insertions are deliberately not counted (as before).
                    if ed[0] == 'delete':
                        operations += 1
                        delete += 1
                    elif ed[0] == 'replace':
                        operations += 1
                        replace += 1
    return edited_words, total_words, operations, delete, replace
def main(args):
    """Count Levenshtein edits between two MusicXML files rendered to LilyPond.

    Both ``args.true`` and ``args.prediction`` are converted with the
    ``musicxml2ly`` script found under ``args.m2ly_path``, post-processed,
    and compared; the number of edit operations is printed.  When
    ``args.export_pred`` is set, the converted prediction is written next to
    the prediction file with a ``.ly`` suffix.

    Raises:
        OSError: if either input file or the conversion script is missing.
    """
    logging.info('Starting main...')
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    _start_time = time.perf_counter()

    if not os.path.isfile(args.true):
        raise OSError('True file not found: {0}'.format(args.true))
    if not os.path.isfile(args.prediction):
        raise OSError('Prediction file not found: {0}'.format(args.prediction))

    conversion_script = os.path.join(args.m2ly_path, 'musicxml2ly')
    if not os.path.isfile(conversion_script):
        raise OSError('Cannot find musicxml2ly conversion script: {0}'.format(conversion_script))

    conversion_cmd = '{0} --lxml -a -o - '.format(conversion_script)
    true_ly = postprocess_ly(execute(conversion_cmd + args.true))
    pred_ly = postprocess_ly(execute(conversion_cmd + args.prediction))

    edits = Levenshtein.editops(true_ly, pred_ly)
    print('{0}'.format(len(edits)))

    if args.export_pred:
        with codecs.open(args.prediction + '.ly', 'w', 'utf-8') as export_h:
            export_h.write(pred_ly + u'\n')

    _end_time = time.perf_counter()
    logging.info('lilypond_eval.py done in {0:.3f} s'.format(_end_time - _start_time))
def makeCigar(seq, ref):
    """Build a run-length string of alignment symbols for *seq* vs *ref*.

    One symbol is produced per alignment column from the Levenshtein edit
    operations: ``=`` match, ``X`` replace, ``I`` for a ``delete`` operation
    and ``D`` for an ``insert`` operation.  Runs are encoded as
    ``<count><symbol>``.  Inputs longer than 16384 characters are split in
    half and the halves are encoded independently and concatenated.

    Returns:
        The encoded string; ``''`` when both inputs are empty.
    """
    from itertools import groupby

    if not seq and not ref:
        return ''  # nothing to align (previously an IndexError)

    if (len(seq) > 16384) or (len(ref) > 16384):
        # Floor division: slice indices must be integers.
        rmid = len(ref) // 2
        smid = len(seq) // 2
        prox = makeCigar(seq[:smid], ref[:rmid])
        dist = makeCigar(seq[smid:], ref[rmid:])
        return prox + dist

    ops = Levenshtein.editops(seq, ref)
    code = ['='] * len(seq)
    offset = 0  # number of columns inserted so far, shifts later positions
    for op, si, di in ops:
        if op == "replace":
            code[si + offset] = 'X'
        elif op == "insert":
            code.insert(si + offset, 'D')
            offset += 1
        elif op == "delete":
            code[si + offset] = 'I'

    # Run-length encode consecutive identical symbols.
    return ''.join(
        "%d%c" % (sum(1 for _ in run), symbol)
        for symbol, run in groupby(code)
    )
def get_cost(source_word, target_word, action_count=None):
    """Accumulate a weighted cost over the edit operations between two words.

    For every edit operation the affected character is looked up, the run
    of that character around the edit position in *source_word* is measured
    with ``get_consecutive_count``, and ``COST_WEIGHTS[action]`` is divided
    by the run length relative to the source length.

    Returns:
        ``total_cost`` when *action_count* is falsy, otherwise the tuple
        ``(total_cost, action_count)`` with per-action tallies updated.
    """
    edit_ops = lev.editops(source_word, target_word)
    if flag:
        print(edit_ops)

    source_len = len(source_word)
    total_cost = 0
    for edit in edit_ops:
        action = edit[0]
        if action_count:
            action_count[action] += 1
        source_pos, target_pos = edit[1], edit[2]

        # Deleted characters come from the source, everything else from
        # the target.
        if action == 'delete':
            char = source_word[source_pos]
        else:
            char = target_word[target_pos]
        if flag:
            print(char)

        before = source_word[:source_pos]
        if action == 'insert':
            after = source_word[source_pos:]
        else:
            after = source_word[source_pos + 1:]
        if flag:
            print(action, before, after)

        run_before = get_consecutive_count(before[::-1], char)
        run_after = get_consecutive_count(after, char)
        # At least 1: the inserted/replaced/deleted character itself.
        total = run_before + run_after + 1
        if flag:
            print(edit, "total", total, "weight", COST_WEIGHTS[action],
                  "source len", source_len)

        cost = COST_WEIGHTS[action] / (total / source_len)
        total_cost += cost
        if flag:
            print("cost", cost)

    if flag:
        print(total_cost)
    if action_count:
        return total_cost, action_count
    return total_cost
def editOpts(correct, raw, threshold=2):
    """Return edit operations between *correct* and *raw*, merging swaps.

    Two consecutive ``replace`` operations on adjacent positions whose
    characters are exchanged are collapsed into a single
    ``('transposition', idx1, idx2)`` entry.

    Args:
        correct: reference string.
        raw: observed string.
        threshold: maximum number of (merged) operations to report.

    Returns:
        The list of operations, or ``None`` when the strings are identical
        or more than *threshold* operations remain.
    """
    opts = Levenshtein.editops(correct, raw)
    opsLength = len(opts)
    if opsLength == 0:
        return None

    if opsLength == 1:
        transOpts = opts
    else:
        transOpts = []
        idx = 0
        while idx < opsLength - 1:
            opt1 = opts[idx]
            opt2 = opts[idx + 1]
            idx1 = opt1[2]
            idx2 = opt2[2]
            try:
                is_swap = (opt1[0] == opt2[0] == 'replace'
                           and abs(idx1 - idx2) == 1
                           and correct[idx1] == raw[idx2]
                           and correct[idx2] == raw[idx1])
            except IndexError:
                # Positions can fall outside the shorter string; treat the
                # pair as ordinary operations.  (Was a bare `except:`.)
                is_swap = False
            if is_swap:
                transOpts.append(('transposition', idx1, idx2))
                idx += 2
            else:
                transOpts.append(opt1)
                idx += 1
        # A trailing operation that was not consumed as part of a pair.
        if idx == opsLength - 1:
            transOpts.append(opts[idx])

    if len(transOpts) > threshold:
        return None
    return transOpts
def add_bi_query(count, ss, sd):
    """Update bigram-conditioned counts from an aligned string pair.

    Both strings are prefixed with ``^`` and padded with ``_`` according to
    the Levenshtein edit operations so that they have equal length.  For
    each position where the following characters differ (and no ``=`` is
    involved), ``count[sd[i:i+2]][ss[i]][ss[i+1]]`` is incremented and the
    module-level counter ``N`` is bumped.

    Args:
        count: nested mapping updated in place.
        ss: first string of the pair.
        sd: second string of the pair.
    """
    global N
    ss = '^' + ss
    sd = '^' + sd
    i_s = 0
    i_d = 0
    # Edit operations refer to the unpadded strings; i_s / i_d track how
    # far earlier padding has shifted each one.
    for ed in Levenshtein.editops(ss, sd):
        if ed[0] == 'insert':
            ss = string_insert(ss, ed[1] + i_s, '_')
            i_s += 1
        elif ed[0] == 'delete':
            sd = string_insert(sd, ed[2] + i_d, '_')
            i_d += 1
    assert (len(ss) == len(sd))

    # Text is already str on Python 3; the former `'='.decode('utf8')`
    # raised AttributeError there.
    sym = '='
    for i in range(len(ss) - 2):
        if (sd[i + 1] == ss[i + 1] or sym == sd[i] or sym == sd[i + 1]
                or sym == ss[i] or sym == ss[i + 1]):
            continue
        context = sd[i:i + 2]
        if context not in count:
            count[context] = {}
        if ss[i] not in count[context]:
            count[context][ss[i]] = Counter()
        count[context][ss[i]][ss[i + 1]] += 1
        N += 1
def identify_anchor_kmer_in_reference_graph(reference_graph, kmer_to_anchor, leftmost=None, rightmost=None, path_length=None):
    """Pick the reference-graph node that best anchors *kmer_to_anchor*.

    Candidate nodes are restricted to those at or before *rightmost* and at
    or after *leftmost* in topological order.  Among the candidates with
    minimal Levenshtein distance to the k-mer, the one whose topological
    position is closest to the expected position (*path_length* away from
    the bound) is returned.

    :type reference_graph: nx.DiGraph
    :return: the selected node.
    """
    toposort = {v: k for k, v in enumerate(nx.topological_sort(reference_graph))}
    nodes_to_consider = list(reference_graph.nodes())
    # Eager filtering with separately named bounds: the previous lazy
    # filters both closed over one `idx` variable, so when both bounds were
    # given the right-hand filter silently used the left-hand index.
    if rightmost:
        right_idx = toposort[rightmost]
        nodes_to_consider = [n for n in nodes_to_consider if toposort[n] <= right_idx]
    if leftmost:
        left_idx = toposort[leftmost]
        nodes_to_consider = [n for n in nodes_to_consider if toposort[n] >= left_idx]

    node_dists = [(node,
                   Levenshtein.distance(node, kmer_to_anchor),
                   Levenshtein.editops(node, kmer_to_anchor))
                  for node in nodes_to_consider]
    min_dist = min(node_dists, key=itemgetter(1))[1]
    node_dists = [x for x in node_dists if x[1] == min_dist]
    print("Min possible dist is", min_dist)

    if rightmost:
        score_func = lambda x: (x[1] - min_dist) + abs(toposort[x[0]] - (toposort[rightmost] - path_length))
    elif leftmost:
        score_func = lambda x: (x[1] - min_dist) + abs(toposort[x[0]] - (toposort[leftmost] + path_length))
    else:
        # No positional bound: rank by distance only (this path used to
        # fail with an unbound `score_func`).
        score_func = lambda x: x[1] - min_dist
    dist_sorted = sorted(node_dists, key=score_func)
    return dist_sorted[0][0]
def decompose_multiple_alterations(reference_path, alternative_path, kmer_length):
    """Split a multi-edit alternative path into atomic alterations.

    The reference and alternative k-mer paths are converted to sequences
    and compared with Levenshtein edit operations.  Each ``replace`` is
    applied on its own; consecutive ``insert``/``delete`` operations of the
    same kind that share a source or destination position are applied
    together as one group.

    Yields:
        ``(atomic_sequence, atomic_path)`` for each atomic alteration, where
        the sequence is the reference with that alteration applied and the
        path is its k-merisation.
    """
    reference_sequence = ALT.kmerpathToSeq(reference_path, kmer_length)
    multi_alternative_sequence = ALT.kmerpathToSeq(alternative_path, kmer_length)
    edit_ops = Levenshtein.editops(reference_sequence, multi_alternative_sequence)
    if len(edit_ops) > 2:
        logger.info("Multiple alt when considering ref %s vs alt %s", reference_sequence, multi_alternative_sequence)
        logger.info("Globally apply %s", edit_ops)

    start = 0
    while start < len(edit_ops):
        start_e = edit_ops[start]
        # Compare the operation *name*; the original compared the whole
        # tuple to 'replace', so this branch could never run.
        if start_e[0] == 'replace':
            edit_op_to_apply = [start_e]
            start += 1
        else:
            end = start + 1
            while (end < len(edit_ops)
                   and edit_ops[end][0] == start_e[0]
                   and (start_e[1] == edit_ops[end][1] or start_e[2] == edit_ops[end][2])):
                end += 1
            edit_op_to_apply = edit_ops[start:end]
            start = end
            logger.info("Will apply %s", edit_op_to_apply)

        atomic_sequence = Levenshtein.apply_edit(edit_op_to_apply, reference_sequence, multi_alternative_sequence)
        atomic_path = ALT.kmerize(atomic_sequence, kmer_length)
        # record each atomic alteration
        logger.info("Adding atomic alteration for ref %s vs alt %s", reference_sequence, atomic_sequence)
        yield atomic_sequence, atomic_path
def _get_char_errors(s1, s2):
    """Count character-level edit operations between two strings.

    Spaces are removed from both inputs before comparison.

    Returns:
        A dict with the keys ``"delete"``, ``"insert"`` and ``"replace"``
        mapping to the number of operations of each kind.
    """
    compact_1 = s1.replace(' ', '')
    compact_2 = s2.replace(' ', '')
    errors = {"delete": 0, "insert": 0, "replace": 0}
    for operation in Lev.editops(compact_1, compact_2):
        kind = operation[0]
        errors[kind] = errors[kind] + 1
    return errors
def compute_uer_confusion_matrix(predictions_dict, labels_dict, unit_dict):
    """Accumulate a unit confusion matrix from prediction/label edit scripts.

    For every entry of *labels_dict* the prediction and label are joined
    into strings and compared with Levenshtein edit operations.  Rows index
    the source unit; columns index the destination unit, with two extra
    columns for deletions and insertions.  Units left untouched by any
    delete/replace are counted on the diagonal.  The relative positions of
    edit operations (for predictions of at least 40 units) are passed to
    ``plot_edit_ops_histogram``.

    Args:
        predictions_dict: mapping from sample key to predicted units.
        labels_dict: mapping from sample key to reference units.
        unit_dict: mapping whose values are unit symbols; entries whose
            value is ``GO``, ``EOS``, ``MASK`` or ``END`` are ignored.
    """
    slim_dict = {key: val for key, val in unit_dict.items()
                 if val not in ['GO', 'EOS', 'MASK', 'END']}
    vocab_size = len(slim_dict)
    invdict = {v: k for k, v in slim_dict.items()}
    # Two extra columns: deletions (vocab_size) and insertions (vocab_size + 1).
    conf_matrix = np.zeros(shape=(vocab_size, vocab_size + 2))

    edit_ops_indices = []
    # Collected for ad-hoc inspection; not consumed by this function.
    edit_ops_at_word_boundaries = []
    edit_ops_not_at_word_boundaries = []

    # `sample_id` instead of `id`, which shadowed the builtin.
    for sample_id, label in labels_dict.items():
        label_str = ''.join(_strip_extra_chars(label))
        prediction_str = ''.join(_strip_extra_chars(predictions_dict[sample_id]))
        edit_ops = Levenshtein.editops(prediction_str, label_str)

        # A set gives O(1) membership tests in the diagonal pass below.
        seen_positions = set()
        for op in edit_ops:
            opname = op[0]
            if len(prediction_str) >= 40:
                edit_ops_indices.append(op[1] / len(prediction_str))

            if opname in ('delete', 'replace'):
                # Errors located in the source (prediction) string.
                source_unit = prediction_str[op[1]]
                if opname == 'delete':
                    mat_col = vocab_size
                else:
                    dest_unit = label_str[op[2]]
                    mat_col = invdict[dest_unit] - 1
                seen_positions.add(op[1])
                if source_unit == ' ':
                    edit_ops_at_word_boundaries.append(source_unit)
                else:
                    edit_ops_not_at_word_boundaries.append(source_unit)
            elif opname == 'insert':
                # The inserted unit does not exist in the source string.
                source_unit = label_str[op[2]]
                mat_col = vocab_size + 1
            else:
                raise Exception('unknown opname {}'.format(opname))

            mat_row = invdict[source_unit] - 1
            conf_matrix[mat_row, mat_col] += 1

        for idx, symbol in enumerate(prediction_str):
            if idx not in seen_positions:  # correct match
                mat_pos = invdict[symbol] - 1
                conf_matrix[mat_pos, mat_pos] += 1

    plot_edit_ops_histogram(edit_ops_indices)
def _elide_kept(text, keep):
    """Return *text* with the characters of *keep* (matched in order) collapsed to ``...``."""
    pieces = []
    pos = 0
    was_dot = False
    for ch in text:
        # Bounds check first: once every kept character has been consumed
        # the remaining text is emitted verbatim.
        if pos < len(keep) and ch == keep[pos]:
            pos += 1
            if not was_dot:
                pieces.append("...")
            was_dot = True
        else:
            pieces.append(ch)
            was_dot = False
    return "".join(pieces)


def change(m):
    """Summarise what differs between an original and a mutated string.

    Args:
        m: a 5-tuple ``(mfile, sourcefile, pos, orig, mutant)``; only the
            last two items are used.

    Returns:
        ``mutant`` without its last character when the pair has more than
        four matching blocks; otherwise ``"<orig diff>==><mutant diff>"``
        where runs of shared text are abbreviated to ``...``.
    """
    _mfile, _sourcefile, _pos, orig, mutant = m
    eops = Levenshtein.editops(orig, mutant)
    blocks = Levenshtein.matching_blocks(eops, orig, mutant)
    if len(blocks) > 4:
        return mutant[:-1]
    keep = ''.join([orig[x[0]:x[0] + x[2]] for x in blocks])
    # The original scan over `orig` indexed keep[pos] unguarded and raised
    # IndexError when `orig` continued past the last kept character; both
    # sides now share the guarded helper.
    return _elide_kept(orig, keep) + "==>" + _elide_kept(mutant, keep)
def main(args):
    """Print the Levenshtein edit count between two converted MusicXML files.

    ``args.true`` and ``args.prediction`` are rendered to LilyPond with the
    ``musicxml2ly`` script under ``args.m2ly_path``, post-processed and
    compared.  With ``args.export_pred`` set, the converted prediction is
    also saved as ``<prediction>.ly``.

    Raises:
        OSError: if an input file or the conversion script does not exist.
    """
    logging.info('Starting main...')
    # perf_counter() replaces time.clock(), which no longer exists since
    # Python 3.8.
    _start_time = time.perf_counter()

    if not os.path.isfile(args.true):
        raise OSError('True file not found: {0}'.format(args.true))
    if not os.path.isfile(args.prediction):
        raise OSError('Prediction file not found: {0}'.format(args.prediction))

    conversion_script = os.path.join(args.m2ly_path, 'musicxml2ly')
    if not os.path.isfile(conversion_script):
        raise OSError('Cannot find musicxml2ly conversion script: {0}'.format(
            conversion_script))

    conversion_cmd = '{0} --lxml -a -o - '.format(conversion_script)
    true_ly = postprocess_ly(execute(conversion_cmd + args.true))
    pred_ly = postprocess_ly(execute(conversion_cmd + args.prediction))

    edits = Levenshtein.editops(true_ly, pred_ly)
    print('{0}'.format(len(edits)))

    if args.export_pred:
        with codecs.open(args.prediction + '.ly', 'w', 'utf-8') as export_h:
            export_h.write(pred_ly + u'\n')

    _end_time = time.perf_counter()
    logging.info('lilypond_eval.py done in {0:.3f} s'.format(_end_time - _start_time))
def edit_distance_list(url, topN=300):
    """Build insert/replace/delete count features against known websites.

    The subdomain+domain of *url* is compared with the domain of each of
    the first *topN* entries of the module-level ``website`` list.  The
    globals ``mina`` / ``minx`` are updated with the smallest total number
    of edits and the entry that produced it.

    Returns:
        A flat list ``[insert, replace, delete, ...]`` with three counts per
        compared entry.
    """
    global minx
    global mina

    own = tldextract.extract(urlparse(url).netloc)
    url_ext = own.subdomain + own.domain

    mina = 99999999
    minx = 0
    features = []
    for entry in website[:topN]:
        other_netloc = urlparse(urlsimpler(entry[0])).netloc
        comp_domain = tldextract.extract(other_netloc).domain

        kinds = [step[0] for step in Levenshtein.editops(url_ext, comp_domain)]
        insert = kinds.count("insert")
        replace = kinds.count("replace")
        delete = kinds.count("delete")

        weight = insert + replace + delete
        if weight < mina:
            mina, minx = weight, entry
        features.extend([insert, replace, delete])
    return features
def error_statistics(correct_sentences: List[str],
                     written_sentences: List[str],
                     keystroke_stats: schemas.keystroke_stats):
    """
    Use Levenshtein.editops to get types of mistakes and calculate number of
    specific mistakes

    Each word of each correct sentence is compared with the word at the same
    position in the written sentence.  ``delete`` operations increment
    ``keystroke_stats.so``, ``insert`` operations ``keystroke_stats.sa`` and
    everything else ``keystroke_stats.sch`` (followed by a call to ``sch``
    with the positions and both words).  Finally ``keystroke_stats.enc`` is
    set to the sum of the three counters.

    NOTE(review): the inputs are indexed as ``[sentence][word]`` -- confirm
    the elements are sequences of words rather than plain strings.
    """
    for sentence_idx, correct_sentence in enumerate(correct_sentences):
        for word_idx, correct_word in enumerate(correct_sentence):
            written_word = written_sentences[sentence_idx][word_idx]
            differences = Levenshtein.editops(correct_word, written_word)
            for difference in differences:
                diff_type = difference[0]
                # Compare by value: `is` on strings tests object identity
                # and is not guaranteed to hold for equal strings.
                if diff_type == 'delete':
                    keystroke_stats.so += 1
                elif diff_type == 'insert':
                    keystroke_stats.sa += 1
                else:
                    keystroke_stats.sch += 1
                    correct_index = difference[1]
                    written_index = difference[2]
                    sch(correct_index, written_index, correct_word,
                        written_word, keystroke_stats)
    keystroke_stats.enc = keystroke_stats.sa + keystroke_stats.so + keystroke_stats.sch
def get_editops(src_name, tar_name):
    """Return a per-column alignment of two names as two-character strings.

    Walks both names in parallel, consuming Levenshtein edit operations as
    their positions are reached.  Each output element pairs a source
    character with a target character; ``_`` marks the missing side of an
    insertion or deletion.

    Raises:
        Exception: if an edit operation of an unknown kind is encountered.
    """
    ops = Levenshtein.editops(src_name, tar_name)
    src_len, tar_len = len(src_name), len(tar_name)
    op_count = len(ops)

    aligned = []
    next_op = 0
    i, j = 0, 0
    while not (i == src_len and j == tar_len):
        src_char = src_name[i] if i < src_len else ''
        tar_char = tar_name[j] if j < tar_len else ''
        op = ops[next_op] if next_op < op_count else None

        if not (op and op[1] == i and op[2] == j):
            # No edit at this position: the characters are aligned as-is.
            aligned.append(src_char + tar_char)
            i += 1
            j += 1
            continue

        kind = op[0]
        if kind == 'replace':
            aligned.append(src_char + tar_char)
            i += 1
            j += 1
        elif kind == 'insert':
            aligned.append('_' + tar_char)
            j += 1
        elif kind == 'delete':
            aligned.append(src_char + '_')
            i += 1
        else:
            raise Exception(f'Unexpected op {op}')
        next_op += 1
    return aligned
def getCloseWords(wordIn, word_dicts, rules, max_weight, threshold=3, fast=True, debug=False):
    """Find dictionary words within a Levenshtein threshold of *wordIn*.

    *wordIn* is first transformed with ``leven.transIn``.  If the result is
    already in the dictionary an empty list is returned.  Otherwise every
    dictionary word at distance <= *threshold* is collected as a tuple
    ``(word, distance, edit count, weight, 'xxx', 'yyy')``, with the weight
    coming from ``weight_for_leven_edits``.

    Args:
        wordIn: the word to look up.
        word_dicts: tuple ``(dict_words, words_clean, words_freq)``.
        rules, max_weight: forwarded to ``weight_for_leven_edits``.
        threshold: maximum Levenshtein distance for a candidate.
        fast: stop at the first exact match (the result may then be
            incomplete).
        debug: print the word being processed.

    Returns:
        Candidate tuples sorted by weight.
    """
    import Levenshtein
    (dict_words, words_clean, words_freq) = word_dicts
    wordInTrans = leven.transIn(wordIn)
    if debug:
        # print() calls replace Python 2 print statements; text is printed
        # as str instead of UTF-8 encoded bytes.
        print()
        print("getCloseWords for", wordInTrans, "(", wordIn, ")")
        dump(wordIn)

    output_words = []
    if wordInTrans in dict_words:
        # Already a dictionary word: nothing to suggest.
        return output_words

    for word in dict_words:
        lev_distance = Levenshtein.distance(wordInTrans, word)
        if lev_distance <= threshold:
            edits = Levenshtein.editops(wordInTrans, word)
            w = weight_for_leven_edits(wordInTrans, word, edits, rules,
                                       max_weight, debug=False)
            output_words.append(
                (word, lev_distance, len(edits), w, 'xxx', 'yyy'))
            if lev_distance == 0 and fast:
                # In the case of an exact match, cut the search short.
                # Close matches found earlier are kept, so the list is not
                # necessarily complete.
                output_words = sorted(
                    output_words, key=lambda word: int(words_freq[word[0]]))
                return sorted(output_words, key=lambda word: int(word[3]))
    return sorted(output_words, key=lambda word: word[3])
def distance_simliarity(entity_list):
    """Yield pairwise string-similarity measures for every pair of names.

    For each unordered pair from *entity_list* yields
    ``((name1, name2), levenshtein distance, jaro-winkler score,
    edit operations, matching blocks)``.
    """
    for pair in combinations(entity_list, 2):
        first, second = pair
        levenshtein_dist = lvst.distance(first, second)
        jaro_score = lvst.jaro_winkler(first, second)
        operations = lvst.editops(first, second)
        blocks = lvst.matching_blocks(operations, first, second)
        yield ((first, second), levenshtein_dist, jaro_score, operations, blocks)
def get_editops(hyp_phns, ref_phns):
    """Return Levenshtein edit operations between two phone sequences.

    Each distinct phone is mapped to a single character (starting at
    ``'A'``, in sorted phone order) so the sequences can be compared as
    strings.
    """
    inventory = sorted(set(hyp_phns + ref_phns))
    symbol_for = {}
    for offset, phone in enumerate(inventory):
        symbol_for[phone] = chr(65 + offset)

    hyp_encoded = "".join(symbol_for[phone] for phone in hyp_phns)
    ref_encoded = "".join(symbol_for[phone] for phone in ref_phns)
    return lev.editops(hyp_encoded, ref_encoded)
def mapJuliusPronunciationToCabocha(juliusPhonesTxt, cabochaPhonesByWord):
    '''
    Aligns the phones in a julius pronunciation list into phones chunked
    into words by cabocha

    The phonetisation is a little different, so the mapping tries to do so
    gracefully.  Basically, modify cabochaPhonesByWord to have the same
    number of phones as juliusPhonesTxt.  This is done by finding the edits
    necessary to make the two strings the same and then applying those
    edits.  Once the two strings contain the same number of phones, dump
    the phones from julius into the slots for the phones inside of the
    modified cabocha words.

    Args:
        juliusPhonesTxt: the julius phone string that gets sliced per word.
        cabochaPhonesByWord: list of per-word phone strings from cabocha;
            ``":"`` characters are removed before alignment.

    Returns:
        A list with one slice of juliusPhonesTxt per cabocha word.
    '''

    def _buildWordIndicies(cabochaPhonesByWord):
        # Build [start, stop) character ranges for each word, laid end to
        # end.  NOTE(review): these ranges do not count the single space
        # that separates words in the joined string used for the edit
        # operations below -- confirm the offsets are meant to line up.
        startI = 0
        wordIndicies = []
        for word in cabochaPhonesByWord:
            wordIndicies.append([startI, startI + len(word)])
            startI += len(word)
        return wordIndicies

    def _getWordForCharIndex(indiciesForWords, targetI):
        # Return the index of the word whose range contains targetI, or
        # None when no range contains it.
        returnI = None
        for i, indicies in enumerate(indiciesForWords):
            start, stop = indicies
            if targetI >= start and targetI < stop:
                returnI = i
                break
        return returnI

    cabochaPhonesByWord = [
        phones.replace(":", "") for phones in cabochaPhonesByWord
    ]
    cabochaPhonesTxt = " ".join(cabochaPhonesByWord)

    # Mutate cabochaPhonesByWord to contain the same number
    # of phones as juliusPhonesTxt
    edits = Levenshtein.editops(cabochaPhonesTxt, juliusPhonesTxt)
    # The ranges are computed once, before any word is lengthened or
    # shortened by the loop below.
    wordIndicies = _buildWordIndicies(cabochaPhonesByWord)
    for operation, startIndex, _ in edits:
        # NOTE(review): wordI is None when startIndex falls outside every
        # range; indexing the list with None would then raise TypeError.
        wordI = _getWordForCharIndex(wordIndicies, startIndex)
        if operation == 'delete':
            # Shrink the word by one phone (always dropped from the end).
            cabochaPhonesByWord[wordI] = cabochaPhonesByWord[wordI][:-1]
        elif operation == 'insert':
            # Grow the word by one placeholder phone.
            cabochaPhonesByWord[wordI] += '-'
        # 'replace' leaves the word length unchanged, so nothing to do.

    # Chunk juliusPhonesByWord according to the number
    # of phones in the now aligned cabochaPhonesByWord
    juliusPhonesByWord = []
    startI = 0
    for wordNum, phones in enumerate(cabochaPhonesByWord):
        endI = startI + len(phones)
        juliusWordPhones = juliusPhonesTxt[startI:endI]
        juliusPhonesByWord.append(juliusWordPhones)
        startI = endI + 1  # Add 1 space for the space between words
    return juliusPhonesByWord
def choose_best_match(word_meta, word_alternatives):
    """Pick the first alternative word whose edits are all acceptable.

    The word read from *word_meta* is compared with each alternative in
    order.  Every edit operation needed to reach the alternative is checked
    with ``replacing_letter_is_wrong`` / ``deleting_letter_is_wrong``; the
    first alternative that passes every check is chosen.

    Args:
        word_meta: per-character metadata of the word that was read.
        word_alternatives: candidates; each item exposes the candidate under
            the ``'word'`` key.

    Returns:
        The chosen alternative, or the originally read word when no
        alternative passes.
    """
    read_word = co.word_from_meta_array(word_meta)
    chosen_word = read_word

    # Traverse through alternatives, received from elasticsearch
    for word_alt in word_alternatives:
        word_alt_is_wrong = False
        # Work on a copy so rejected alternatives leave word_meta untouched.
        modifying_word_meta = copy.deepcopy(word_meta)

        # Take edit operations from read word to alternative.
        # Check if alternative is better than original.
        # NOTE(review): positions refer to the unedited read word while
        # modifying_word_meta is edited in place -- confirm indices cannot
        # drift after an insert/delete.
        editops = lev.editops(read_word, word_alt['word'])
        for editop in editops:
            (op, source_index, dest_index) = editop
            if op == 'replace':
                if len(modifying_word_meta) <= source_index:
                    print('Asking to replace unknown index in word. Skipping alternative word')
                    word_alt_is_wrong = True
                    break
                if replacing_letter_is_wrong(modifying_word_meta[source_index],
                                             word_alt['word'][dest_index]):
                    word_alt_is_wrong = True
                    break
                modifying_word_meta[source_index]['char'] = word_alt['word'][dest_index]
            elif op == 'delete':
                if len(modifying_word_meta) <= source_index:
                    print('Asking to delete unknown index in word. Skipping alternative word')
                    word_alt_is_wrong = True
                    break
                if deleting_letter_is_wrong(modifying_word_meta[source_index]):
                    word_alt_is_wrong = True
                    break
                del modifying_word_meta[source_index]
            elif op == 'insert':
                modifying_word_meta.insert(
                    source_index, {'char': word_alt['word'][dest_index]})

        if not word_alt_is_wrong:
            # Word alternative passed all the checks, so we replace the original
            chosen_word = word_alt['word']
            if read_word != chosen_word:
                print('Word was corrected ' + read_word + ' with ' + chosen_word)
            break
    return chosen_word
def edit_str(a, b):
    """Describe the leading edit operations between *a* and *b*.

    Each operation is rendered as ``<R|D|I><source index>:<a char>-><b char>``
    on its own line.  Rendering stops after the first insertion or deletion,
    which is followed by a ``***`` marker, because positions are not tracked
    beyond that point.

    Returns:
        The newline-joined description (empty for identical strings).
    """
    arr = []
    op_dict = {'replace': 'R', 'delete': 'D', 'insert': 'I'}
    for op, i, j in Levenshtein.editops(a, b):
        # One-character slices instead of indexing: an insert at the end of
        # `a` (or a delete at the end of `b`) carries a position equal to
        # the string length, which used to raise IndexError.
        arr.append('%s%d:%s->%s' % (op_dict[op], i, a[i:i + 1], b[j:j + 1]))
        # need to track actual insertions/deletions from here on...
        if op in ('insert', 'delete'):
            arr.append('***')
            break
    return '\n'.join(arr)
def wordDistance(sentence1, sentence2):
    '''Get the Levenshtein distance between sentences, in terms of words.

    Both sentences are lower-cased and split on single spaces.  Every
    distinct word is mapped to one unique character so the word sequences
    can be compared with ``Levenshtein.editops``; the operations are then
    translated back into words.

    Returns:
        A dict with ``num_edits``, ``edit_ops`` (one dict per operation with
        ``operation``, ``in_input`` and ``in_output``) and
        ``normalized_dist`` (edits divided by the number of words in
        *sentence1*).
    '''
    s1 = sentence1.lower().split(' ')
    s2 = sentence2.lower().split(' ')
    vocab = sorted(set(s1).union(s2))

    # One code point per word, starting in the Unicode private-use area.
    # The former fixed 62-character alphabet raised IndexError as soon as
    # the two sentences contained more than 62 distinct words.
    word_to_symbol = {word: chr(0xE000 + i) for i, word in enumerate(vocab)}
    symbol_to_word = {symbol: word for word, symbol in word_to_symbol.items()}

    s1_translated = ''.join(word_to_symbol[x] for x in s1)
    s2_translated = ''.join(word_to_symbol[x] for x in s2)

    translated_editops = []
    for operation, src_pos, dst_pos in Levenshtein.editops(s1_translated,
                                                           s2_translated):
        if operation == 'replace':
            in_input = symbol_to_word[s1_translated[src_pos]]
            in_output = symbol_to_word[s2_translated[dst_pos]]
        elif operation == 'insert':
            in_input = ''
            in_output = symbol_to_word[s2_translated[dst_pos]]
        elif operation == 'delete':
            in_input = symbol_to_word[s1_translated[src_pos]]
            in_output = ''
        else:
            raise NotImplementedError
        translated_editops.append({
            'operation': operation,
            'in_input': in_input,
            'in_output': in_output
        })

    return ({
        'num_edits': len(translated_editops),
        'edit_ops': translated_editops,
        # str.split(' ') never returns an empty list, so len(s1) >= 1.
        'normalized_dist': len(translated_editops) / float(len(s1))
    })
def error_distribution(self, src, tgt, errors):
    """Tally character-level edit operations between *src* and *tgt*.

    *errors* is updated in place with keys of the form
    ``("replace", src_char, tgt_char)``, ``("delete", src_char)`` or
    ``("insert", tgt_char)``.  Operations of any other kind are printed
    and not counted.
    """
    for edit in Levenshtein.editops(src, tgt):
        kind = edit[0]
        if kind == "replace":
            key = ("replace", src[edit[1]], tgt[edit[2]])
        elif kind == "delete":
            key = ("delete", src[edit[1]])
        elif kind == "insert":
            key = ("insert", tgt[edit[2]])
        else:
            print(edit)
            continue
        errors[key] += 1
def _get_word_errors(s1, s2):
    """Count word-level edit operations between two sentences.

    Both sentences are split on whitespace and every distinct word is
    mapped to a single character so the word sequences can be compared as
    strings.

    Returns:
        A dict with the keys ``"delete"``, ``"insert"`` and ``"replace"``
        mapping to the number of operations of each kind.
    """
    vocabulary = set(s1.split() + s2.split())
    word2char = {word: code for code, word in enumerate(vocabulary)}

    encoded_1 = ''.join(chr(word2char[word]) for word in s1.split())
    encoded_2 = ''.join(chr(word2char[word]) for word in s2.split())

    errors = {"delete": 0, "insert": 0, "replace": 0}
    for operation in Lev.editops(encoded_1, encoded_2):
        kind = operation[0]
        errors[kind] = errors[kind] + 1
    return errors
def edit_str(a, b):
    """Describe the leading replacements between *a* and *b*.

    Replacements are rendered as ``R<index>:<a char>-><b char>``, one per
    line.  At the first insertion or deletion a ``***`` marker is emitted
    and rendering stops, because positions are not tracked past that point.
    """
    op_dict = {'replace': 'R', 'delete': 'D', 'insert': 'I'}
    lines = []
    for kind, src_pos, dst_pos in Levenshtein.editops(a, b):
        if kind == 'insert' or kind == 'delete':
            lines.append('***')
            break
        # need to track actual insertions/deletions from here on...
        lines.append('%s%d:%s->%s' % (op_dict[kind], src_pos,
                                      a[src_pos], b[dst_pos]))
    return '\n'.join(lines)
def distance_to(self, other):
    '''
    Length-adjusted Levenshtein "distance" to other OTU

    The number of edit operations is divided by the length of this OTU's
    sequence plus the number of ``delete`` operations.

    other: OTU
        distance to this OTU

    returns: float
    '''
    edit_script = Levenshtein.editops(self.sequence, other.sequence)
    deletions = 0
    for step in edit_script:
        if step[0] == 'delete':
            deletions += 1
    # NOTE(review): the denominator is zero when self.sequence is empty --
    # confirm callers never pass empty sequences.
    adjusted_length = len(self.sequence) + deletions
    return len(edit_script) / adjusted_length
def nbeditops(s1, s2):
    """Return ``(deletions, insertions, substitutions)`` between two strings.

    Counts are taken from the Levenshtein edit operations turning *s1*
    into *s2*; operations of any other kind are ignored.
    """
    tally = {'delete': 0, 'insert': 0, 'replace': 0}
    for step in L.editops(s1, s2):
        kind = step[0]
        if kind in tally:
            tally[kind] += 1
    return tally['delete'], tally['insert'], tally['replace']
def encode(cls, fullname, fullname_true):
    """Encode the edits from *fullname* to *fullname_true* per source position.

    Returns a list with one entry per character of *fullname*: ``''`` for
    an unchanged position, ``'--'`` for a deletion, the replacement
    character for a substitution, or ``'+'`` followed by the inserted
    character.  Operations are applied in descending order of operation
    name and ascending source position, so a later operation overwrites an
    earlier one that targets the same position.

    NOTE(review): an insertion at the very end of *fullname* carries a
    source position equal to ``len(fullname)``; confirm callers never hit
    that case, since the assignment would be out of range.
    """
    target = ['' for _ in range(len(fullname))]

    operations = list(Levenshtein.editops(fullname, fullname_true))
    operations.sort(key=lambda item: (item[0], -item[1]), reverse=True)

    for kind, src, dst in operations:
        if kind == 'delete':
            target[src] = '--'
        elif kind == 'replace':
            target[src] = fullname_true[dst]
        elif kind == 'insert':
            target[src] = '+' + fullname_true[dst]
    return target
def _condProbName(name1, name2, edit_count, total_edits, smoothing, cp_memoize):
    """Store the conditional probability of *name1* given *name2*.

    Computes the conditional probability of arriving at name1 by performing
    a series of edit operations on name2: the smoothed relative frequency
    of every edit operation between the names is multiplied (summed in log
    space) and the result is stored in ``cp_memoize[(name1, name2)]``.

    Args:
        edit_count: mapping from edit operation to its observed count.
        total_edits: total number of observed edit operations.
        smoothing: additive constant applied to every relative frequency.
        cp_memoize: dict updated in place.

    Returns:
        *cp_memoize*, after the update.
    """
    temp_count = defaultdict(float)
    # .items() replaces Python 2's iteritems(); float() keeps the relative
    # frequency a true ratio even for integer counts.
    for k, v in edit_count.items():
        temp_count[k] = v / float(total_edits)

    log_cnd_prob = 0.0
    for e in edist.editops(name1, name2):
        # Unseen operations fall back to 0.0 plus smoothing.
        log_cnd_prob += np.log(temp_count[e] + smoothing)

    cp_memoize[(name1, name2)] = np.exp(log_cnd_prob)
    return cp_memoize
def make_improved_old(old, new):
    """Return *old* with only the typo-like edits towards *new* applied.

    The full edit script from *old* to *new* is filtered through
    ``only_typo_editops`` and the surviving operations are applied to
    *old*, so that it looks more like the new version.
    """
    return lev.apply_edit(
        only_typo_editops(lev.editops(old, new)),
        old,
        new,
    )
def _get_editops(source_string: str, destination_string: str):
    """Return the Levenshtein edit operations from source to destination."""
    return Levenshtein.editops(source_string, destination_string)
def correct(match):
    """Return the corrected form of a regex-matched word.

    The lower-cased word is looked up in ``correction_list``.  Words
    without a correction are returned unchanged.  All-uppercase words get
    the correction upper-cased; otherwise the edits between the lower-cased
    word and its correction are applied to the original word.

    Args:
        match: a regex match object; group 0 is the word.
    """
    word = match.group(0)
    normed_word = word.lower()
    # Every path without a correction returned the word untouched, so the
    # former elif/else pair is folded into this single guard.
    if normed_word not in correction_list:
        return word

    new_word = correction_list[normed_word]
    if word.isupper():
        return new_word.upper()
    edits = Levenshtein.editops(normed_word, new_word)
    return Levenshtein.apply_edit(edits, word, new_word)
def _editCounts(name_samp): # to compute probability of edit operations use a subsample of names edit_count = defaultdict(int) p = len(name_samp) total_edits = 0 for i in range(p): for j in range(i + 1, p): if i < j: edits = edist.editops(name_samp[i], name_samp[j]) p = len(edits) lene = p total_edits += len(edits) for k in range(lene): edit_count[edits[k]] += 1 return edit_count, total_edits
def pitch_sequence_edits(true_pitches, pred_pitches):
    """Given two lists of <pitch> elements, computes their edit operations.

    Both sequences are converted to strings by the same ``PitchCoder``
    instance before being compared.

    :param true_pitches: Reference sequence of pitches.

    :param pred_pitches: Predicted sequence of pitches.

    :return: The Levenshtein edits.
    """
    coder = PitchCoder()
    true_code = coder.pitches2string(true_pitches)
    pred_code = coder.pitches2string(pred_pitches)
    return Levenshtein.editops(true_code, pred_code)
def getCloseWords(wordIn, word_dicts, rules, max_weight, threshold=3, fast=True, debug=False):
    """Return dictionary words close to *wordIn*, ordered by edit weight.

    *wordIn* is transformed with ``leven.transIn``; a transformed word that
    is already in the dictionary yields an empty list.  Otherwise each
    dictionary word whose Levenshtein distance is at most *threshold* is
    reported as ``(word, distance, edit count, weight, 'xxx', 'yyy')``,
    the weight being computed by ``weight_for_leven_edits``.

    Args:
        wordIn: the word to look up.
        word_dicts: tuple ``(dict_words, words_clean, words_freq)``.
        rules, max_weight: forwarded to ``weight_for_leven_edits``.
        threshold: maximum Levenshtein distance for a candidate.
        fast: return as soon as an exact match is seen (the list may then
            be incomplete).
        debug: print the word being processed.

    Returns:
        Candidate tuples sorted by weight.
    """
    import Levenshtein
    (dict_words, words_clean, words_freq) = word_dicts
    wordInTrans = leven.transIn(wordIn)
    if debug:
        # Python 3 print() calls; the text is printed as str rather than
        # as UTF-8 encoded bytes.
        print()
        print("getCloseWords for", wordInTrans, "(", wordIn, ")")
        dump(wordIn)

    output_words = []
    if wordInTrans in dict_words:
        # The word is already known: no suggestions needed.
        return output_words

    for word in dict_words:
        lev_distance = Levenshtein.distance(wordInTrans, word)
        if lev_distance <= threshold:
            edits = Levenshtein.editops(wordInTrans, word)
            w = weight_for_leven_edits(wordInTrans, word, edits, rules,
                                       max_weight, debug=False)
            output_words.append(
                (word, lev_distance, len(edits), w, 'xxx', 'yyy'))
            if lev_distance == 0 and fast:
                # In the case of an exact match, cut the search short.
                # We might have got some close matches ahead of time, so
                # this will not create a complete list.
                output_words = sorted(
                    output_words, key=lambda word: int(words_freq[word[0]]))
                return sorted(output_words, key=lambda word: int(word[3]))
    return sorted(output_words, key=lambda word: word[3])
def test_edit_dist(x):
    """Time string edit distance against ``align.Align.edit_dist``.

    Each implementation is run ``int(x)`` times on a fixed pair of
    sequences.  The last result and the elapsed wall-clock time of the
    ``Levenshtein`` loop and of the ``Align.edit_dist`` loop are printed.
    The ``jellyfish`` loop is executed and timed as well, but its numbers
    are not printed.

    :param x: number of repetitions; converted with ``int``.
    """
    s1 = '12012014321231200112211'
    s2 = '1300201231200112211'
    seq1 = [1, 2, 0, 1, 2, 0, 1, 4, 3, 2, 1, 2, 3, 1, 2, 0, 0, 1, 1, 2, 2, 1, 1]
    seq2 = [1, 3, 0, 0, 2, 0, 1, 2, 3, 1, 2, 0, 0, 1, 1, 2, 2, 1, 1]
    pos = np.asarray([[0, 0], [0, 1],     # 0 and 1 are nn
                      [2, 0], [2, 1],     # 2 and 3 are nn
                      [4, 0], [4, 1],     # 4 and 5 are nn
                      [6, 0], [6, 1],     # 6 and 7 are nn
                      [8, 0], [8, 1],     # 8 and 9 are nn
                      [9, 0], [9, 1],     # 10 and 11 are nn
                      [10, 0], [10, 1]],  # 12 and 13 are nn
                     dtype=float)
    # modify this to ensure it is a non-connected k-nn
    nn = distance.ann(pos, 1)[1][:, 1:]
    k = 0
    rp = 1
    # Per-operation cost functions handed to Align.  Kept under its own name
    # so the benchmark result below does not overwrite it.
    weights = {'M': lambda x: 0,
               'I': lambda x: 1,
               'D': lambda x: 1,
               'S': lambda x: 2,
               'P': lambda x: 0.5}
    a = align.Align(weights, rp, nn, k)
    repeats = int(x)
    u, v, w = 0, 0, 0

    t0 = time.time()
    for i in range(0, repeats):
        u = jellyfish.levenshtein_distance(s1, s2)
    t1 = time.time()

    t2 = time.time()
    for i in range(0, repeats):
        v = Levenshtein.editops(s1, s2)
        v = Levenshtein.distance(s1, s2)
    t3 = time.time()

    t4 = time.time()
    for i in range(0, repeats):
        w = a.edit_dist(seq1, seq2)
    t5 = time.time()

    print('editdist dist = %s' % v)
    print('seq edit dist = %s' % w)
    print('editdist runtime is %s seconds' % (t3 - t2))
    # This line used to repeat the "seq edit dist" label although the value
    # printed is the elapsed time of the Align.edit_dist loop.
    print('seq edit runtime is %s seconds' % (t5 - t4))
def compile_channels(self):
    """
    Compiles the list of channels found.

    Channels are grouped by edit distance: each channel joins the existing
    group whose name is closest to it (Levenshtein distance, at most
    ``edit_thresh``), and that group is renamed to the characters the two
    names share.  A channel with no close enough group starts a new group
    named after the channel with the "train_", "valid_" and "test_"
    substrings removed.

    Reads ``self.channels`` and ``self.channel_groups``; updates
    ``self.channel_groups`` and fills ``self.d["logs"]`` with an empty list
    per channel, keyed by group and then by channel.
    """
    group_name_omits = ["train_", "valid_", "test_"]
    edit_thresh = 6

    for channel in self.channels:
        edit_distances = {c: Levenshtein.distance(channel, c)
                          for c in self.channel_groups}

        # Closest existing group; stays None when there are no groups yet or
        # every group is further away than the channel name is long.
        group = None
        min_ed = len(channel)
        for c, d in edit_distances.items():
            if d <= min_ed:
                min_ed = d
                group = c

        if group is None or min_ed > edit_thresh:
            group_name = channel
            for omit in group_name_omits:
                group_name = group_name.replace(omit, "")
            self.channel_groups[group_name] = [channel]
            continue

        # Now we reduce the group to the minimum shared string
        # mb = matching blocks (see Levenshtein docs).
        mb = Levenshtein.matching_blocks(
            Levenshtein.editops(channel, group), channel, group)
        new_group = "".join(group[x[1]:x[1] + x[2]] for x in mb)
        if new_group != group:
            # Move the members under the reduced name.  If a group with that
            # name already exists, merge into it instead of overwriting it,
            # which would silently drop its channels.
            moved = self.channel_groups.pop(group)
            self.channel_groups.setdefault(new_group, []).extend(moved)
        self.channel_groups[new_group].append(channel)

    for group, channels in self.channel_groups.items():
        self.d["logs"][group] = {}
        for channel in channels:
            self.d["logs"][group][channel] = []
    self.logger.info("Channels: %r" % self.d["logs"].keys())
def print_error_analysis():
    """Print a visualization of every mispredicted evaluation example.

    Loads the 'eval' output of the configured run directory, keeps the
    instances whose prediction differs from the expected output, and prints
    one visualization per error using the edit operations from the expected
    output to the prediction.  When ``options.max_examples`` is positive and
    smaller than the number of errors, that many errors are sampled at
    random without replacement.  With ``options.html`` set, the output is
    wrapped in a minimal HTML document.
    """
    options = config.options(read=True)
    output = get_output(options.run_dir, 'eval')

    errors = []
    for inst, pred in zip(output.data, output.predictions):
        if inst['output'] != pred:
            errors.append((inst['input'], pred, inst['output']))

    num_errors = len(errors)
    if 0 < options.max_examples < num_errors:
        indices = np.random.choice(np.arange(num_errors),
                                   size=options.max_examples,
                                   replace=False)
    else:
        indices = range(num_errors)

    if options.html:
        print('<!DOCTYPE html>')
        print('<html><head><title>Error analysis</title><meta charset="utf-8" /></head><body>')

    for idx in indices:
        inp, pred, gold = [unicode(field).strip() for field in errors[idx]]
        print_visualization(inp, pred, gold, lev.editops(gold, pred),
                            html=options.html)

    if options.html:
        print('</body></html>')
def editops(w1, w2):
    """Return a compact string describing the edits that turn w1 into w2.

    Both arguments are UTF-8 encoded byte strings.  They are decoded, the
    edit operations are computed with ``Levenshtein.editops``, and every
    operation is rendered as the first letter of its name followed by the
    character(s) involved (UTF-8 encoded) and a trailing '&':

    * delete  -> 'd' + deleted character of w1
    * insert  -> 'i' + inserted character of w2
    * other   -> first letter + character of w1 + character of w2

    Results are memoised in the module-level ``editops_dict`` keyed by
    ``(w1, w2)``.
    """
    cache_key = (w1, w2)
    if cache_key in editops_dict:
        return editops_dict[cache_key]

    decoded_src = w1.decode('utf-8')
    decoded_dst = w2.decode('utf-8')

    pieces = []
    for opname, src_pos, dst_pos in Levenshtein.editops(decoded_src, decoded_dst):
        if opname == 'delete':
            chars = decoded_src[src_pos].encode('utf-8')
        elif opname == 'insert':
            chars = decoded_dst[dst_pos].encode('utf-8')
        else:
            chars = (decoded_src[src_pos].encode('utf-8')
                     + decoded_dst[dst_pos].encode('utf-8'))
        pieces.append(opname[0] + chars + '&')

    ops_str = ''.join(pieces)
    editops_dict[cache_key] = ops_str
    return ops_str
def get_parts(string1, string2):
    """Return the substrings shared by two strings, with their positions.

    The matching blocks between ``string1`` and ``string2`` are derived from
    their Levenshtein edit operations.  For every non-empty block a tuple
    ``(substring, d1, d2)`` is produced, where ``substring`` is taken from
    ``string1`` and ``d1`` / ``d2`` are ``get_index_distance`` applied to the
    block's start index in ``string1`` / ``string2`` together with the
    respective string length.  An empty list is returned when only the
    terminating block is present.
    """
    length1, length2 = len(string1), len(string2)
    equal_blocks = lev.matching_blocks(lev.editops(string1, string2),
                                       length1, length2)

    # There is always one zero-length 'matching block' at the end, so a
    # single block means the strings share nothing.
    if len(equal_blocks) <= 1:
        return []

    equal_parts = []
    for index1, index2, block_length in equal_blocks:
        if not block_length:
            continue
        # Keep the indexes from both strings so the origin of each block
        # can be tracked.
        equal_parts.append((string1[index1:index1 + block_length],
                            get_index_distance(index1, length=length1),
                            get_index_distance(index2, length=length2)))
    return equal_parts
def differential_encode (self, form_non_tonal, form_tonal, seperator = True) : self.p_src = -1 self.p_dst = -1 self.src = reshaping(form_non_tonal, False) if not self.src : if seperator: return [u"", [token_seperator]] else : return [u"", []] self.chunks = chunking(self.src) self.ret = [u"" for i in range(len(self.chunks))] self.dst = reshaping(form_tonal, False) ops = Levenshtein.editops(self.src, self.dst) self.stat.form_non_tonal[self.src] += 1 self.stat.form_tonal [self.dst] += 1 self.stat.dict_form_tonal.setdefault(self.src, []).append(self.dst) for op in ops : mode, self.p_src, self.p_dst = op if mode == "delete" : self.delete() elif mode == "insert" : self.insert() else : # mode == "replace" self.insert() self.delete() # enlèvement du séparateur du code à la fin du chunk tmp = [] for ret2 in self.ret :
def get_str_simis(self, str1, str2):
    """Return three similarity measures for a pair of strings.

    :return: ``[jaro, ratio, edit_count]`` where ``jaro`` and ``ratio`` come
        from ``Levenshtein.jaro`` / ``Levenshtein.ratio`` and ``edit_count``
        is the number of operations reported by ``Levenshtein.editops``.
    """
    jaro_similarity = Levenshtein.jaro(str1, str2)
    ratio_similarity = Levenshtein.ratio(str1, str2)
    edit_count = len(Levenshtein.editops(str1, str2))
    return [jaro_similarity, ratio_similarity, edit_count]
# Build the gene -> sequence table from the already opened target file.
# Each line is expected to hold the gene name followed by its sequence.
for line in target_f:
    units = line.split()
    gene = units[0]
    seq = units[1]
    target_dic[gene] = seq

# For every gene, read './<gene>.txt' and write './result_<gene>.txt' with
# three extra columns: the edit distance and the distance plus 2x and 4x the
# number of deletions (header labels 'bulge:1', '3' and '5').
for each in target_dic:
    each_file = './' + each + '.txt'
    # 'with' closes both files even if a line fails to parse.
    with open(each_file, 'r') as each_f, \
            open('./result_' + each + '.txt', 'w') as each_out:
        header = each_f.readline()
        each_out.write(header.strip('\n') + '\t' + 'bulge:1' + '\t' + '3' + '\t' + '5' + '\n')
        for line in each_f:
            units = line.split()
            target_seq = units[4]
            value = {}
            for nuc in "ATGC":
                # str.replace returns a new string; the result used to be
                # discarded, so all four candidates were the unmodified
                # sequence with 'N' still in it.
                candidate = target_seq.replace('N', nuc)
                value[nuc] = (l.distance(candidate, target_dic[each]),
                              l.editops(candidate, target_dic[each]))
            # NOTE(review): max() keeps the substitution with the LARGEST
            # distance.  If 'N' is meant as a wildcard, min() may have been
            # intended - confirm before relying on these numbers.
            max_dist, max_editops = value[max(value, key=value.get)]
            step = [each_editop[0] for each_editop in max_editops]
            deletion = step.count('delete')
            each_out.write(line.strip() + '\t' + str(max_dist) + '\t'
                           + str(max_dist + deletion * 2) + '\t'
                           + str(max_dist + deletion * 4) + '\n')
def alignChars( source, target, ErrStats = None, ErrStats_lock = None ):
    """
    alignChars takes a pair of words from parallel corpora that have been
    word aligned.  Errors introduced by the noisy channel (OCR) are revealed
    by finding the sequence of edit operations that map source to target
    using the Levenshtein Edit Distance module.  The edit sequence is used
    to generate character alignments: an empty string is placed opposite
    every inserted or deleted character so both words end up with the same
    number of columns.

    @param source: original word from corrected corpora
    @type source: str
    @param target: OCR output word from uncorrected corpora
    @type target: str
    @param ErrStats: optional statistics collector; when truthy, its
                     updateDistribution method is called with the lengths
                     and edit counts of this word pair
    @param ErrStats_lock: context manager held while ErrStats is updated;
                          required whenever ErrStats is truthy
    @return: character aligned words as a list of tuples with the target
             character first, e.g. [(t_1,s_1),...,(t_n,s_n)]
    @rtype: list
    """
    editops = Levenshtein.editops( source, target )

    sourceArray = list( source )
    targetArray = list( target )

    substituteCount = 0
    insertCount = 0
    deleteCount = 0

    for opName, sourcePos, targetPos in editops:
        if opName == 'insert':
            # The gap goes into the source at the column of the inserted
            # target character.  Every earlier delete has already added a
            # gap column to the target side, so the column is shifted by
            # the number of deletes seen so far.
            sourceArray.insert( targetPos + deleteCount, '' )
            insertCount += 1
        elif opName == 'delete':
            # Symmetric case: shift by the inserts seen so far.
            targetArray.insert( sourcePos + insertCount, '' )
            deleteCount += 1
        elif opName == 'replace':
            substituteCount += 1

    if ErrStats:
        editDist = len( editops )
        sourceLen = len( source )
        targetLen = len( target )
        with ErrStats_lock:
            ErrStats.updateDistribution( 'editDist_correctLen', editDist, sourceLen )
            ErrStats.updateDistribution( 'editDist_errorLen', editDist, targetLen )
            ErrStats.updateDistribution( 'errorLen_correctLen', targetLen, sourceLen )
            ErrStats.updateDistribution( 'errorLen_editDist', targetLen, editDist )
            ErrStats.updateDistribution( 'errorLen_editOps', targetLen, (insertCount, deleteCount, substituteCount) )
            ErrStats.updateDistribution( 'errorLen_insertOp', targetLen, insertCount )
            ErrStats.updateDistribution( 'errorLen_deleteOp', targetLen, deleteCount )
            ErrStats.updateDistribution( 'errorLen_substituteOp', targetLen, substituteCount )
            ErrStats.updateDistribution( 'correctLen_editDist', sourceLen, editDist )
            ErrStats.updateDistribution( 'errorLens', targetLen )
            ErrStats.updateDistribution( 'correctLens', sourceLen )
            ErrStats.updateDistribution( 'editDists', editDist )
            ErrStats.updateDistribution( 'insertEdits', insertCount )
            ErrStats.updateDistribution( 'deleteEdits', deleteCount )
            ErrStats.updateDistribution( 'substituteEdits', substituteCount )

    output = list( zip( targetArray, sourceArray ) )
    return output