def get_edit_dist(input_file, output_file, target_seq):
    fi = open(input_file, 'r')
    fo = open(output_file, 'w')
    fo.write(
        "Chr\tLocation\tForward29\tReverse29\tEdit dist for\tEdit dist rev\tStep for\tStep rev\tDeletion# for\tDeletion# rev\tBulge dist for\tBulge dist rev\n"
    )
    for line in fi.xreadlines():
        units = line.split()
        chrm = units[0]
        loc = units[1]
        forseq = units[2]
        revseq = units[3]

        value = {}
        for nuc in "ATGC":
            target_seq.replace('N', nuc)
            value[nuc] = (l.distance(target_seq,
                                     forseq), l.editops(target_seq, forseq))
        for_max = max(value, key=value.get)
        for_dist, for_editops = value[for_max]
        for_step = []
        for each in for_editops:
            a, b, c = each
            for_step.append(a)
        for_deletion = for_step.count('delete')
        value = {}
        for nuc in "ATGC":
            target_seq.replace('N', nuc)
            value[nuc] = (l.distance(target_seq,
                                     revseq), l.editops(target_seq, revseq))
        rev_max = max(value, key=value.get)
        rev_dist, rev_editops = value[rev_max]
        rev_step = []
        for each in rev_editops:
            a, b, c = each
            rev_step.append(a)
        rev_deletion = rev_step.count('delete')
        #print int(rev_deletion)

        bulge_l = [for_dist + for_deletion * 2, rev_dist + rev_deletion * 2]
        del_l = [for_deletion, rev_deletion]
        fo.write(
            "%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\n" %
            (chrm, loc, forseq, revseq, for_dist, rev_dist, for_editops,
             rev_editops, for_deletion, rev_deletion, bulge_l[0], bulge_l[1],
             min(bulge_l), del_l[bulge_l.index(min(bulge_l))]))
        #print rev_dist+rev_deletion*4
    fi.close()
    fo.close()
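
A minimal sketch of the editops contract the function above relies on: `l.editops` returns a list of (operation, source_pos, dest_pos) triples whose first element is 'replace', 'insert' or 'delete' (assuming the module is imported as `import Levenshtein as l`).

import Levenshtein as l

ops = l.editops("GATTACA", "GCATGCU")
steps = [name for name, src, dst in ops]   # same unpacking as `a, b, c = each` above
print(l.distance("GATTACA", "GCATGCU"), steps.count('delete'))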
Example #2
def main():

    # a = 'GCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAAC'
    # b = 'GAGTCGAGCAGAAGAAGAANGG'

    a = 'AATGTGTGTCTGCTGGAAGCTCCTATTCTTCCGCCATTTTCCAGTCCTCCAGAAGTTTCCTGATGGTCCATGTCTGAATTAGACACCCCTCTTCTTTGTTCCAGTTGCACCTGTAATTCTTCAGCATAGTACTTCTTAAACTGTTTTTAA'
    b = 'TTTNCTGATGGTCCATGTCTGTTACTC'

    print(l.distance(a, b))
    print(l.editops(a, b))
    print(l.matching_blocks(l.editops(a, b), a, b))
Example #4
def debugging(folderName):
    referenceDic = IORobot.loadContigsFromFile(folderName, "reference.fasta")
    interiorsDic = IORobot.loadContigsFromFile(folderName, "interiors.fasta")
    GTDic = IORobot.loadContigsFromFile(folderName, "GTDic.fasta")

    str1 = referenceDic["Segkk0"][2500000:2500000 + 12000]
    str2 = referenceDic["Segkk1"][2500000:2500000 + 12000]
    print Levenshtein.distance(str1, str2)
    print Levenshtein.editops(str1, str2)

    str3 = interiorsDic["Segkk0"][7000:7000 + 12000]
    str4 = interiorsDic["Segkk1"][7000:7000 + 12000]
    print Levenshtein.distance(str1, str4)
    print Levenshtein.distance(str2, str3)
    print Levenshtein.editops(str1, str4)

    print ""
    offset = 4000
    print Levenshtein.editops(str2, str3)
    print str1[offset - 10:offset + 10]
    print str2[offset - 10:offset + 10]
    print str3[offset - 10:offset + 10]
    print str4[offset - 10:offset + 10]

    str5 = GTDic["Segkk0"][7000:7000 + 12000]
    str6 = GTDic["Segkk1"][7000:7000 + 12000]

    print str5[offset - 10:offset + 10]
    print str6[offset - 10:offset + 10]

    print Levenshtein.editops(str2, str4)
Example #5
    def mappingtxtans(self, speech_results):
        ans_txt = config['Answer'].get('ans_txt')
        f = open(ans_txt)
        text = []
        for line in f:
            text.append(line.rstrip())
        print(text)

        if self.read_num > len(text):
            self.read_num = 0

        txt_ans = text[self.read_num]
        speech_ans_list = list(speech_results)
        speech_ans_range = range(len(speech_ans_list))
        e = Levenshtein.editops(txt_ans, speech_results)
        index = []
        com = ','
        com_index = speech_results.find(com)
        e = list(filter(lambda x: x[0] != 'delete', e))
        for item in e:
            if item[2] != com_index:
                index.append(item[2])
        if index == []:
            return "no different"
        else:
            return index
 def operations(self):
     """
     Metrics to determine the functionality of the surface segmentation generator
     :return: number of edited words, total words in dataset, number of operations performed, number of deletions,
     number of replacements
     """
     edited_words, total_words, operations, delete, replace = 0, 0, 0, 0, 0
     for file in self.all_files:
         input_file = open(os.path.join(sys.path[0], file), 'r')
         for line in input_file.readlines():
             content = line.rstrip('\n').split(" | ")
             orthographic = de_segment(removeLabels(content[3]))
             word = content[0]
             total_words += 1
             if not word.__eq__(orthographic):
                 edited_words += 1
                 edits = LevenshteinDistance.editops(orthographic, word)
                 for ed in edits:
                     if ed[0] == 'delete':
                         operations += 1
                         delete += 1
                     elif ed[0] == 'replace':
                         operations += 1
                         replace += 1
     return edited_words, total_words, operations, delete, replace
Example #7
def main(args):
    logging.info('Starting main...')
    _start_time = time.clock()

    if not os.path.isfile(args.true):
        raise OSError('True file not found: {0}'.format(args.true))
    if not os.path.isfile(args.prediction):
        raise OSError('Prediction file not found: {0}'.format(args.prediction))

    conversion_script = os.path.join(args.m2ly_path, 'musicxml2ly')
    if not os.path.isfile(conversion_script):
        raise OSError('Cannot find musicxml2ly conversion script: {0}'.format(conversion_script))

    conversion_cmd = '{0} --lxml -a -o - '.format(conversion_script)

    true_ly = postprocess_ly(execute(conversion_cmd + args.true))
    pred_ly = postprocess_ly(execute(conversion_cmd + args.prediction))

    edits = Levenshtein.editops(true_ly, pred_ly)
    print('{0}'.format(len(edits)))

    if args.export_pred:
        with codecs.open(args.prediction + '.ly', 'w', 'utf-8') as export_h:
            export_h.write(pred_ly + u'\n')

    _end_time = time.clock()
    logging.info('lilypond_eval.py done in {0:.3f} s'.format(_end_time - _start_time))
 def makeCigar(seq, ref):
     if (len(seq) > 16384) or (len(ref) > 16384):
         rmid = len(ref) / 2
         smid = len(seq) / 2
         prox = makeCigar(seq[:smid], ref[:rmid])
         dist = makeCigar(seq[smid:], ref[rmid:])
         return prox + dist
     ops = Levenshtein.editops(seq, ref)
     code = ['=' for i in xrange(len(seq))]
     offset = 0
     for op, si, di in ops:
         if (op == "replace"):
             code[si + offset] = 'X'
         elif (op == "insert"):
             code.insert(si + offset, 'D')
             offset += 1
         elif (op == "delete"):
             code[si + offset] = 'I'  # LM: fixed bug here 2019-04-15
     cigar = ''
     count = 1
     prev = code[0]
     for c in code[1:]:
         if (c == prev):
             count += 1
         else:
             cigar += "%d%c" % (count, prev)
             count = 1
             prev = c
     cigar += "%d%c" % (count, prev)
     return cigar
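
A usage sketch for the CIGAR builder above (Python 2, since the function uses xrange and integer division of lengths); with a single mismatching base the run-length encoding is fully determined.

seq, ref = "ACGT", "ACGA"      # one substitution at the last base
print makeCigar(seq, ref)      # -> "3=1X": three matches, one mismatch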
Example #9
def get_cost(source_word, target_word, action_count=None):
    ops = lev.editops(source_word, target_word)
    if flag: print(ops)
    total_cost = 0
    for op in ops:
        action = op[0]
        if action_count: action_count[action] += 1
        source_pos = op[1]
        target_pos = op[2]
        char = source_word[source_pos] if action == 'delete' else target_word[
            target_pos]
        if flag: print(char)
        part1 = source_word[:source_pos]
        part2 = source_word[source_pos:] if action == 'insert' else source_word[
            source_pos + 1:]  #if pos < len(source_word) else ''
        if flag: print(action, part1, part2)
        consecutive_count_1 = get_consecutive_count(part1[::-1], char)
        consecutive_count_2 = get_consecutive_count(part2, char)
        total = consecutive_count_1 + consecutive_count_2 + 1  #min 1 (denote the inserted/replaced/deleted char)
        source_len = len(source_word)
        if flag:
            print(op, "total", total, "weight", COST_WEIGHTS[action],
                  "source len", source_len)
        cost = COST_WEIGHTS[action] / (
            total / source_len
        )  #if action == 'delete' else COST_WEIGHTS[action]*(total/source_len)
        total_cost += cost
        if flag: print("cost", cost)
    if flag: print(total_cost)
    if not action_count:
        return total_cost
    return total_cost, action_count
Example #10
def editOpts(correct, raw, threshold=2):
    opts = Levenshtein.editops(correct, raw)
    opsLength = len(opts)

    transOpts = []
    if opsLength == 0:
        transOpts = None
    elif opsLength >= 2:
        idx = 0
        while idx < opsLength - 1:
            opt1 = opts[idx]
            opt2 = opts[idx + 1]
            idx1 = opt1[2]
            idx2 = opt2[2]
            try:
                if opt1[0] == opt2[0] == 'replace' and abs(idx1 - idx2) == 1 and correct[idx1] == raw[idx2] and correct[idx2] == raw[idx1]:
                    transOpts.append(('transposition', idx1, idx2))
                    idx += 2
                else:
                    transOpts.append(opt1)
                    idx += 1
            except:
                transOpts.append(opt1)
                idx += 1

        if idx == opsLength - 1:
            transOpts.append(opts[idx])
    else:
        transOpts = opts

    if transOpts and len(transOpts) > threshold:
        return None
    else:
        return transOpts
Example #11
def add_bi_query(count, ss, sd):
    global N
    ss = '^' + ss
    sd = '^' + sd
    i_s = 0
    i_d = 0
    eds = Levenshtein.editops(ss, sd)
    for ed in eds:
        if ed[0] == 'insert':
            ss = string_insert(ss, ed[1] + i_s, '_')
            i_s += 1
        if ed[0] == 'delete':
            sd = string_insert(sd, ed[2] + i_d, '_')
            i_d += 1
    assert (len(ss) == len(sd))
    sym = '='.decode('utf8')
    for i in range(len(ss) - 2):
        if sd[i + 1] == ss[i + 1] or sym == sd[i] or sym == sd[
                i + 1] or sym == ss[i] or sym == ss[i + 1]:
            continue
        if sd[i:i + 2] not in count:
            count[sd[i:i + 2]] = {}
        if ss[i] not in count[sd[i:i + 2]]:
            count[sd[i:i + 2]][ss[i]] = Counter()
        count[sd[i:i + 2]][ss[i]][ss[i + 1]] += 1
        N += 1
Example #12
def identify_anchor_kmer_in_reference_graph(reference_graph, kmer_to_anchor, leftmost=None, rightmost=None, path_length=None):
	"""

	:type reference_graph: nx.DiGraph
	"""
	toposort = {v: k for k, v in enumerate(nx.topological_sort(reference_graph))}
	# print "Righmost is ",rightmost,toposort[rightmost]
	nodes_to_consider = reference_graph.nodes()
	if rightmost:
		idx = toposort[rightmost]
		nodes_to_consider = ifilter(lambda x: toposort[x] <= idx, nodes_to_consider)
	# print "Max is ", idx
	if leftmost:
		idx = toposort[leftmost]
		nodes_to_consider = ifilter(lambda x: toposort[x] >= idx, nodes_to_consider)
	# print "Min is ", idx
	nodes_to_consider = list(nodes_to_consider)

	node_dists = [(node, Levenshtein.distance(node, kmer_to_anchor), Levenshtein.editops(node, kmer_to_anchor)) for node in
				  nodes_to_consider]
	# print "Will search anchor in ",list(node_dists)
	min_dist = min(node_dists, key=itemgetter(1))[1]
	node_dists = [x for x in node_dists if x[1] == min_dist]
	print "Min possible dist is", min_dist
	if rightmost:
		score_func = lambda x: (x[1] - min_dist) + abs(toposort[x[0]] - (toposort[rightmost] - path_length))
	elif leftmost:
		score_func = lambda x: (x[1] - min_dist) + abs(toposort[x[0]] - (toposort[leftmost] + path_length))
	dist_sorted = sorted(node_dists, key=score_func)
	# identify the rightmost node with minimal distance
	return dist_sorted[0][0]
Example #13
def decompose_multiple_alterations(reference_path, alternative_path, kmer_length):
	reference_sequence = ALT.kmerpathToSeq(reference_path, kmer_length)
	multi_alternative_sequence = ALT.kmerpathToSeq(alternative_path, kmer_length)

	edit_ops = Levenshtein.editops(reference_sequence, multi_alternative_sequence)
	if len(edit_ops) > 2:
		logger.info("Multiple alt when considering ref %s vs alt %s", reference_sequence, multi_alternative_sequence)
		logger.info("Globally apply %s", edit_ops)
	start, end = 0, 0
	while start < len(edit_ops):
		if edit_ops[start][0] == 'replace':
			atomic_sequence = Levenshtein.apply_edit([edit_ops[start]], reference_sequence, multi_alternative_sequence)
			# print atomic_sequence
			atomic_path = ALT.kmerize(atomic_sequence, kmer_length)
			start += 1
		else:
			start_e = edit_ops[start]
			end = start + 1
			while (end < len(edit_ops)
				   and edit_ops[end][0] == start_e[0]
				   and (start_e[1] == edit_ops[end][1] or start_e[2] == edit_ops[end][2])):
				end += 1
			edit_op_to_apply = edit_ops[start:end]
			start = end
			logger.info("Will apply %s", edit_op_to_apply)
			atomic_sequence = Levenshtein.apply_edit(edit_op_to_apply, reference_sequence, multi_alternative_sequence)
			atomic_path = ALT.kmerize(atomic_sequence, kmer_length)
		# record each atomic alteration
		logger.info("Adding atomic alteration for ref %s vs alt %s", reference_sequence, atomic_sequence)
		yield atomic_sequence, atomic_path
Example #14
 def _get_char_errors(s1, s2):
     s1, s2, = s1.replace(' ', ''), s2.replace(' ', '')
     ops = Lev.editops(s1, s2)
     errors = {"delete": 0, "insert": 0, "replace": 0}
     for x in ops:
         errors[x[0]] += 1
     return errors
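
The helper above reduces to a few lines; a self-contained sketch of the same per-operation counting, for a pair where a single dropped character forces a single delete.

import Levenshtein as Lev

s1, s2 = "hello world", "helo world"
ops = Lev.editops(s1.replace(' ', ''), s2.replace(' ', ''))
errors = {"delete": 0, "insert": 0, "replace": 0}
for op, _, _ in ops:
    errors[op] += 1
print(errors)   # -> {'delete': 1, 'insert': 0, 'replace': 0}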
Example #15
def compute_uer_confusion_matrix(predictions_dict, labels_dict, unit_dict):

    slim_dict = {key:val for key, val in unit_dict.items() if val not in ['GO', 'EOS', 'MASK', 'END']}
    vocab_size = len(slim_dict)
    invdict = {v: k for k, v in slim_dict.items()}

    conf_matrix = np.zeros(shape=(vocab_size, vocab_size + 2))  # plus deletions, insertions
    edit_ops_indices = []
    edit_ops_at_word_boundaries = []
    edit_ops_not_at_word_boundaries = []

    for (id, label) in labels_dict.items():
        label_str = ''.join(_strip_extra_chars(label))
        prediction_str = ''.join(_strip_extra_chars(predictions_dict[id]))
        edit_ops = Levenshtein.editops(prediction_str, label_str)

        seen_positions = []
        for op in edit_ops:
            opname = op[0]
            if len(prediction_str) >= 40:
                edit_ops_indices.append(op[1] / len(prediction_str))  # store all errors in the source (prediction) string

            if opname == 'delete':
                source_unit = prediction_str[op[1]]
                mat_col = vocab_size
                seen_positions.append(op[1])

                if source_unit == ' ':
                    edit_ops_at_word_boundaries.append(source_unit)
                else:
                    edit_ops_not_at_word_boundaries.append(source_unit)

            elif opname == 'insert':
                source_unit = label_str[op[2]]  # the inserted unit does not exist in the source string
                mat_col = vocab_size + 1
            elif opname == 'replace':
                source_unit = prediction_str[op[1]]
                dest_unit = label_str[op[2]]
                mat_col = invdict[dest_unit] - 1
                seen_positions.append(op[1])

                if source_unit == ' ':
                    edit_ops_at_word_boundaries.append(source_unit)
                else:
                    edit_ops_not_at_word_boundaries.append(source_unit)

            else:
                raise Exception('unknown opname {}'.format(opname))

            mat_row = invdict[source_unit] - 1
            conf_matrix[mat_row, mat_col] += 1


        for idx, symbol in enumerate(prediction_str):
            if idx not in seen_positions:  # correct match
                mat_pos = invdict[symbol] - 1
                conf_matrix[mat_pos, mat_pos] += 1

    # plot_confusion_matrix(conf_matrix, invdict)
    plot_edit_ops_histogram(edit_ops_indices)
Example #16
def change(m):
    (mfile, sourcefile, pos, orig, mutant) = m
    eops = Levenshtein.editops(orig, mutant)
    blocks = Levenshtein.matching_blocks(eops, orig, mutant)
    if len(blocks) > 4:
        return mutant[:-1]
    keep = ''.join([orig[x[0]:x[0] + x[2]] for x in blocks])
    notKeep = ""
    pos = 0
    wasDot = False
    for c in range(0, len(orig)):
        if orig[c] == keep[pos]:
            pos += 1
            if not wasDot:
                notKeep += "..."
                wasDot = True
        else:
            notKeep += orig[c]
            wasDot = False
    notKeep += "==>"
    pos = 0
    wasDot = False
    for c in range(0, len(mutant)):
        if (pos < len(keep)) and mutant[c] == keep[pos]:
            pos += 1
            if not wasDot:
                notKeep += "..."
                wasDot = True
        else:
            notKeep += mutant[c]
            wasDot = False
    return notKeep
def edit_distance_list(url, topN=300):
    o = urlparse(url)
    ext = (tldextract.extract(o.netloc))
    url_ext = ext.subdomain + (ext.domain)

    features = []
    global minx
    global mina
    mina = 99999999
    minx = 0
    for x in website[:topN]:
        comp = urlsimpler(x[0])
        compo = urlparse(comp)
        comp_ext = (tldextract.extract(compo.netloc))
        comp_domain = comp_ext.domain
        ed = Levenshtein.editops(url_ext, comp_domain)
        insert = 0
        replace = 0
        delete = 0
        for g in ed:
            if (g[0] == "insert"):
                insert += 1
            if (g[0] == "replace"):
                replace += 1
            if (g[0] == "delete"):
                delete += 1
        weight = insert + replace + delete
        if (weight < mina):
            mina = weight
            minx = x
        features.extend([insert, replace, delete])
    return features
Example #19
def error_statistics(correct_sentences: List[str],
                     written_sentences: List[str],
                     keystroke_stats: schemas.keystroke_stats):
    """
        Use Levenshtein.editops to get types of mistakes and calculate number of specific mistakes
    """
    #Levenshtein leditops loop

    for sentence in range(len(correct_sentences)):
        for word in range(len(correct_sentences[sentence])):
            differences = Levenshtein.editops(
                correct_sentences[sentence][word],
                written_sentences[sentence][word])
            #Uncomment to show differences that has been found
            #print(differences)

            if len(differences):
                for difference in differences:
                    diff_type = difference[0]

                    if diff_type == 'delete':
                        keystroke_stats.so += 1
                    elif diff_type == 'insert':
                        keystroke_stats.sa += 1
                    else:
                        keystroke_stats.sch += 1
                        correct_index = difference[1]
                        written_index = difference[2]
                        correct_word = correct_sentences[sentence][word]
                        written_word = written_sentences[sentence][word]
                        sch(correct_index, written_index, correct_word,
                            written_word, keystroke_stats)

    keystroke_stats.enc = keystroke_stats.sa + keystroke_stats.so + keystroke_stats.sch
def get_editops(src_name, tar_name):
    ops = Levenshtein.editops(src_name, tar_name)
    ops_pos, n1_pos, n2_pos = 0, 0, 0
    editops = []
    while True:
        if n1_pos == len(src_name) and n2_pos == len(tar_name):
            break
        c1 = src_name[n1_pos] if n1_pos < len(src_name) else ''
        c2 = tar_name[n2_pos] if n2_pos < len(tar_name) else ''
        op = ops[ops_pos] if ops_pos < len(ops) else None
        if op and op[1] == n1_pos and op[2] == n2_pos:
            if op[0] == 'replace':
                editops.append(c1 + c2)
                n1_pos += 1
                n2_pos += 1
            elif op[0] == 'insert':
                editops.append('_' + c2)
                n2_pos += 1
            elif op[0] == 'delete':
                editops.append(c1 + '_')
                n1_pos += 1
            else:
                raise Exception(f'Unexpected op {op}')
            ops_pos += 1
        else:
            editops.append(c1 + c2)
            n1_pos += 1
            n2_pos += 1
    return editops
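
A worked call for the pairwise encoding above; for this pair the three minimal operations (two replacements and a trailing insertion) are fully determined, so the per-position pairs are predictable.

print(get_editops("kitten", "sitting"))
# -> ['ks', 'ii', 'tt', 'tt', 'ei', 'nn', '_g']
# 'ks' and 'ei' are replacements, '_g' is the appended insertion, matching pairs are kept as-is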
Example #21
def getCloseWords(wordIn,
                  word_dicts,
                  rules,
                  max_weight,
                  threshold=3,
                  fast=True,
                  debug=False):
    import Levenshtein
    # out = difflib.get_close_matches('ἐστιν',words)
    (dict_words, words_clean, words_freq) = word_dicts
    # print "word in:"
    # print dump(wordIn)
    # wordIn = preprocess_word(wordIn)
    # print "word in pp:"
    # print dump(wordIn)
    wordInTrans = leven.transIn(wordIn)
    if (debug):
        print
        print "getCloseWords for", wordInTrans.encode(
            'utf-8'), "(", wordIn.encode('utf-8'), ")"
        dump(wordIn)
    output_words = []
    #dict_words_set = set(dict_words)
    n = 0
    # print "Now comparing to..."
    if wordInTrans in dict_words:
        pass
    #    print "short-circuting dictionary word"
    #    output_words.append((wordInTrans,0,0,0,'xxx','yyy'))
    else:
        for word in dict_words:
            # print u"*****" + words_clean[n]
            # print "word into comparison:"
            # print dump(word)
            lev_distance = Levenshtein.distance(
                wordInTrans, word
            )  # difflib.SequenceMatcher(None, word, wordInTrans).ratio()
            # print "distance: ",
            # print ratio
            if lev_distance <= threshold:
                edits = Levenshtein.editops(wordInTrans, word)
                w = weight_for_leven_edits(wordInTrans,
                                           word,
                                           edits,
                                           rules,
                                           max_weight,
                                           debug=False)
                output_words.append(
                    (word, lev_distance, len(edits), w, 'xxx', 'yyy'))
                if (lev_distance == 0) and (fast == True):
                    # In the case of an exact match, cut the search short
                    # We might have got some close matches ahead of time, so this
                    # will not create a complete list
                    output_words = sorted(
                        output_words,
                        key=lambda word: int(words_freq[word[0]]))
                    return sorted(output_words, key=lambda word: int(word[3]))
            n = n + 1
    return sorted(output_words, key=lambda word: word[3])
def distance_simliarity(entity_list):
    for name1, name2 in combinations(entity_list, 2):
        dist_lvst = lvst.distance(name1, name2)
        dist_jaro = lvst.jaro_winkler(name1, name2)
        edit_ops = lvst.editops(name1, name2)
        match_blocks = lvst.matching_blocks(edit_ops, name1, name2)

        yield ((name1, name2), dist_lvst, dist_jaro, edit_ops, match_blocks)
Example #23
def get_editops(hyp_phns, ref_phns):
    phn_super_set = set(hyp_phns + ref_phns)
    p2c = {ph:chr(65+i) for i, ph in enumerate(sorted(phn_super_set))}
    c2p = {chr(65+i):ph for i, ph in enumerate(sorted(phn_super_set))}
    hyp_chars = "".join([p2c[ph] for ph in hyp_phns])
    ref_chars = "".join([p2c[ph] for ph in ref_phns])

    return lev.editops(hyp_chars, ref_chars)
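
A minimal call for the phone-to-character trick above (assuming the surrounding `import Levenshtein as lev`); the single differing phone can only be a replace.

hyp = ['AA', 'B', 'IY']
ref = ['AA', 'K', 'IY']
for op, i, j in get_editops(hyp, ref):
    print(op, i, j)   # -> replace 1 1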
Example #24
def mapJuliusPronunciationToCabocha(juliusPhonesTxt, cabochaPhonesByWord):
    '''
    Aligns the phones in a julius pronunciation list into phones chunked into words by cabocha

    The phonetisation is a little different, so the mapping tries to handle those differences gracefully.

    Basically, modify cabochaPhonesByWord to have the same number of phones as juliusPhonesTxt.
    This is done by finding the edits necessary to make the two strings the same and then applying
    those edits.
    Once the two strings contain the same number of phones, dump the phones from julius into the slots for
    the phones inside of the modified cabocha words.
    '''
    def _buildWordIndicies(cabochaPhonesByWord):
        startI = 0
        wordIndicies = []
        for word in cabochaPhonesByWord:
            wordIndicies.append([startI, startI + len(word)])
            startI += len(word)

        return wordIndicies

    def _getWordForCharIndex(indiciesForWords, targetI):
        returnI = None
        for i, indicies in enumerate(indiciesForWords):
            start, stop = indicies
            if targetI >= start and targetI < stop:
                returnI = i
                break
        return returnI

    cabochaPhonesByWord = [
        phones.replace(":", "") for phones in cabochaPhonesByWord
    ]
    cabochaPhonesTxt = " ".join(cabochaPhonesByWord)

    # Mutate cabochaPhonesByWord to contain the same number
    # of phones as juliusPhonesTxt
    edits = Levenshtein.editops(cabochaPhonesTxt, juliusPhonesTxt)
    wordIndicies = _buildWordIndicies(cabochaPhonesByWord)
    for operation, startIndex, _ in edits:
        wordI = _getWordForCharIndex(wordIndicies, startIndex)
        if operation == 'delete':
            cabochaPhonesByWord[wordI] = cabochaPhonesByWord[wordI][:-1]
        elif operation == 'insert':
            cabochaPhonesByWord[wordI] += '-'

    # Chunk juliusPhonesByWord according to the number
    # of phones in the now aligned cabochaPhonesByWord
    juliusPhonesByWord = []
    startI = 0
    for wordNum, phones in enumerate(cabochaPhonesByWord):
        endI = startI + len(phones)
        juliusWordPhones = juliusPhonesTxt[startI:endI]

        juliusPhonesByWord.append(juliusWordPhones)
        startI = endI + 1  # Add 1 space for the space between words

    return juliusPhonesByWord
def choose_best_match(word_meta, word_alternatives):
    #word_alternatives = sort_word_alternatives(word_alternatives)
    read_word = co.word_from_meta_array(word_meta)
    #    print "Checking -- ", read_word
    chosen_word = read_word

    # Traverse through alternatives, received from elasticsearch
    for word_alt in word_alternatives:
        #        print 'Word alternative', word_alt['word'], word_alt['score']

        word_alt_is_wrong = False
        modifying_word_meta = copy.deepcopy(word_meta)

        # Take edit operations from read word to alternative.
        # Check if alternative is better than original.
        editops = lev.editops(read_word, word_alt['word'])

        for editop in editops:
            (op, source_index, dest_index) = editop

            #            print op, source_index, dest_index, read_word, word_alt['word']
            if op == 'replace':
                if len(modifying_word_meta) <= source_index:
                    print 'Asking to replace unknown index in word. Skipping alternative word'
                    word_alt_is_wrong = True
                    break

                if replacing_letter_is_wrong(modifying_word_meta[source_index],
                                             word_alt['word'][dest_index]):
                    word_alt_is_wrong = True
                    break

                modifying_word_meta[source_index]['char'] = word_alt['word'][
                    dest_index]
            elif op == 'delete':
                if len(modifying_word_meta) <= source_index:
                    print 'Asking to delete unknown index in word. Skipping alternative word'
                    word_alt_is_wrong = True
                    break

                if deleting_letter_is_wrong(modifying_word_meta[source_index]):
                    word_alt_is_wrong = True
                    break

                del modifying_word_meta[source_index]
            elif op == 'insert':
                modifying_word_meta.insert(
                    source_index, {'char': word_alt['word'][dest_index]})

        if not word_alt_is_wrong:
            # Word alternative passed all the checks, so we replace the original
            chosen_word = word_alt['word']
            if read_word != chosen_word:
                print 'Word was corrected ' + read_word + ' with ' + chosen_word
            break

    return chosen_word
Example #26
def edit_str(a,b):
    arr = []
    op_dict = {'replace': 'R', 'delete': 'D', 'insert': 'I'}
    for op, i, j in Levenshtein.editops(a, b):
        arr += ['%s%d:%s->%s' % (op_dict[op], i, a[i], b[j])]
        # need to track actual insertions/deletions from here on...
        if op in ('insert', 'delete'):
            arr += ['***']
            break
    return '\n'.join(arr)
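
Two quick calls to the formatter above; it stops after the first insert or delete because the remaining indices would drift.

print(edit_str("flaw", "flak"))   # -> R3:w->k  (a single replace keeps indices aligned)
print(edit_str("spam", "park"))   # -> D0:s->p followed by ***  (stops at the first delete)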
Example #27
def wordDistance(sentence1, sentence2):
    '''Get the Damerau-Levenshtein distance between sentences, in terms of words.'''
    symbolset = list("abcdefghijklmnopqrstuvwxyz")
    symbolset = symbolset + [x.upper() for x in symbolset]
    symbolset = symbolset + list("1234567890")
    s1 = sentence1.lower().split(' ')
    s2 = sentence2.lower().split(' ')
    vocab = list(set(s1).union(set(s2)))
    symbol_to_word = dict(zip([symbolset[x] for x in range(len(vocab))],
                              vocab))
    word_to_symbol = dict(zip(symbol_to_word.values(), symbol_to_word.keys()))
    s1_translated = ''.join([word_to_symbol[x] for x in s1])
    s2_translated = ''.join([word_to_symbol[x] for x in s2])
    ls1 = list(s1_translated)
    ls2 = list(s2_translated)

    #dist = pyxdameraulevenshtein.damerau_levenshtein_distance(s1_translated, s2_translated)
    editops = Levenshtein.editops(s1_translated, s2_translated)
    translated_editops = []
    for editop in editops:
        if editop[0] == 'replace':
            translated_editops.append({
                'operation':
                editop[0],
                'in_input':
                symbol_to_word[ls1[editop[1]]],
                'in_output':
                symbol_to_word[ls2[editop[2]]]
            })
        elif editop[0] == 'insert':
            translated_editops.append({
                'operation':
                editop[0],
                'in_input':
                '',
                'in_output':
                symbol_to_word[ls2[editop[2]]]
            })
        elif editop[0] == 'delete':
            translated_editops.append({
                'operation':
                editop[0],
                'in_input':
                symbol_to_word[ls1[editop[1]]],
                'in_output':
                ''
            })
        else:
            raise NotImplementedError

    return ({
        'num_edits': len(translated_editops),
        'edit_ops': translated_editops,
        'normalized_dist': len(translated_editops) / float(len(list(s1)))
    })
Example #28
 def error_distribution(self, src, tgt, errors):
     edits = Levenshtein.editops(src, tgt)
     for edit in edits:
         if edit[0] == "replace":
             errors[("replace", src[edit[1]], tgt[edit[2]])] += 1
         elif edit[0] == "delete":
             errors[("delete", src[edit[1]])] += 1
         elif edit[0] == "insert":
             errors[("insert", tgt[edit[2]])] += 1
         else:
             print(edit)
Example #29
    def _get_word_errors(s1, s2):
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        ops = Lev.editops(''.join(w1), ''.join(w2))
        errors = {"delete": 0, "insert": 0, "replace": 0}
        for x in ops:
            errors[x[0]] += 1
        return errors
Example #30
def edit_str(a, b):
    arr = []
    op_dict = {'replace': 'R', 'delete': 'D', 'insert': 'I'}
    for op, i, j in Levenshtein.editops(a, b):
        if op in ('insert', 'delete'):
            arr += ['***']
            break
        arr += ['%s%d:%s->%s' % (op_dict[op], i, a[i], b[j])]
        # need to track actual insertions/deletions from here on...

    return '\n'.join(arr)
Example #31
File: dbotu.py Project: swo/dbotu2
    def distance_to(self, other):
        '''
        Length-adjusted Levenshtein "distance" to other OTU

        other: OTU
          distance to this OTU

        returns: float
        '''
        ops = Levenshtein.editops(self.sequence, other.sequence)
        return len(ops) / (len(self.sequence) + len([o for o in ops if o[0] == 'delete']))
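
The same length-adjusted quantity written out for two bare strings (Python 3 division); a single trailing deletion gives one edit over 4 + 1 positions.

import Levenshtein

seq_a, seq_b = "ACGT", "ACG"
ops = Levenshtein.editops(seq_a, seq_b)
print(len(ops) / (len(seq_a) + len([o for o in ops if o[0] == 'delete'])))   # -> 0.2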
Example #32
def nbeditops(s1, s2):
    d = 0
    i = 0
    s = 0
    for op in L.editops(s1, s2):
        if op[0] == 'delete':
            d += 1
        elif op[0] == 'insert':
            i += 1
        elif op[0] == 'replace':
            s += 1
    return d, i, s
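
A direct check for the counter above (assuming the surrounding `import Levenshtein as L`); turning "flaws" into "claw" forces exactly one replace and one delete.

print(nbeditops("flaws", "claw"))   # -> (1, 0, 1): one delete, no insert, one replace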
Example #33
 def encode(cls, fullname, fullname_true):
     target = [''] * len(fullname)
     edit_opts = Levenshtein.editops(fullname, fullname_true)
     edit_opts = sorted(edit_opts, key=lambda x: (x[0], -x[1]), reverse=True)
     for op, src, dst in edit_opts:
         if op == 'delete':
             target[src] = '--'
         if op == 'replace':
             target[src] = fullname_true[dst]
         if op == 'insert':
             target[src] = '+' + fullname_true[dst]
     return target
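
A stripped-down sketch of the same encoding for a pair that needs only one edit; the missing character can only be an insert, so the target vector is determined.

import Levenshtein

fullname, fullname_true = "Jon", "John"
target = [''] * len(fullname)
for op, src, dst in Levenshtein.editops(fullname, fullname_true):
    if op == 'insert':                          # the only op here: ('insert', 2, 2)
        target[src] = '+' + fullname_true[dst]
print(target)   # -> ['', '', '+h']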
Example #35
def _condProbName(name1, name2, edit_count, total_edits, smoothing, cp_memoize):
    # computes the conditional probability of arriving at name1
    # by performing a series of operation on name2.
    temp_count = defaultdict(float)
    holder = 0.0
    for k, v in edit_count.iteritems():
        temp_count[k] = v / total_edits
    edits = edist.editops(name1, name2)
    for e in edits:
        holder += np.log(temp_count[e] + smoothing)
    log_cnd_prob = np.sum(holder)
    cp_memoize[(name1, name2)] = np.exp(log_cnd_prob)
    return cp_memoize
def make_improved_old(old, new):
    """ 3. Modify the old version of the hunk by these typo edits, so
    that it looks more like the new version."""
    # To avoid MemoryErroring out, we calculate

    # Calculate the edit moves necessary
    eo = lev.editops(old, new)

    # Now, filter those through something that looks for only "typo edits"
    do_these = only_typo_editops(eo)

    # Now, do them to old
    return lev.apply_edit(do_these, old, new)
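
The key property used above is that apply_edit accepts any subset of the editops; the example from the python-Levenshtein documentation, using the same `lev` alias.

e = lev.editops('man', 'scotsman')
print(lev.apply_edit(e[:3], 'man', 'scotsman'))   # -> 'scoman': only the first three inserts applied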
Example #38
def _get_editops(
    source_string: str, destination_string: str
):
    
    editops = Levenshtein.editops(source_string, destination_string)
    # type(editops)

    # substitutions = sum(1 if op[0] == "replace" else 0 for op in editops)
    # deletions = sum(1 if op[0] == "delete" else 0 for op in editops)
    # insertions = sum(1 if op[0] == "insert" else 0 for op in editops)
    # hits = len(source_string) - (substitutions + deletions)

    return editops
Example #39
def correct(match):
    word = match.group(0)
    normed_word = word.lower()
    if normed_word in correction_list:
        new_word = correction_list[normed_word]
        if word.isupper():
            return new_word.upper()
        else:
            edits = Levenshtein.editops(normed_word, new_word)
            return Levenshtein.apply_edit(edits, word, new_word)
    elif normed_word in dictionary:
        return word
    else:
        return word
Example #40
def _editCounts(name_samp):
    # to compute probability of edit operations use a subsample of names
    edit_count = defaultdict(int)
    n = len(name_samp)
    total_edits = 0
    for i in range(n):
        for j in range(i + 1, n):
            edits = edist.editops(name_samp[i], name_samp[j])
            total_edits += len(edits)
            for e in edits:
                edit_count[e] += 1
    return edit_count, total_edits
Example #41
def pitch_sequence_edits(true_pitches, pred_pitches):
    """Given two lists of <pitch> elements, computes their edit
    distance.

    :param true_pitches: First (ground-truth) sequence of pitches.
    :param pred_pitches: Second (predicted) sequence of pitches.

    :return: The Levenshtein edits.
    """
    coder = PitchCoder()

    true_code = coder.pitches2string(true_pitches)
    pred_code = coder.pitches2string(pred_pitches)

    edits = Levenshtein.editops(true_code, pred_code)
    return edits
def test_edit_dist(x):
    s1 = '12012014321231200112211'
    s2 = '1300201231200112211'
    seq1 = [1,2,0,1,2,0,1,4,3,2,1,2,3,1,2,0,0,1,1,2,2,1,1]
    seq2 = [1,3,0,0,2,0,1,2,3,1,2,0,0,1,1,2,2,1,1]
    pos = np.asarray([[0,0],[0,1],   #0 and 1 are nn
                      [2,0],[2,1],   #2 and 3 are nn
                      [4,0],[4,1],   #4 and 5 are nn
                      [6,0],[6,1],   #6 and 7 are nn
                      [8,0],[8,1],   #8 and 9 are nn
                      [9,0],[9,1],   #10 and 11 are nn
                      [10,0],[10,1]],#12 and 13 are nn
                      dtype=float)
                      
    #modify this to ensure it is a non-connected k-nn
    nn = distance.ann(pos,1)[1][:,1:]
    k = 0
    rp = 1
    w = {'M':lambda x:0,'I':lambda x:1,'D':lambda x:1,
         'S':lambda x:2, 'P':lambda x:0.5 }
    a = align.Align(w,rp,nn,k)
    
    u,v = 0,0
    t0 = time.time()
    for i in range(0,int(x)):
        u = jellyfish.levenshtein_distance(s1,s2)
    t1 = time.time()
    t2 = time.time()
    for i in range(0,int(x)):
        v = Levenshtein.editops(s1,s2)
    v = Levenshtein.distance(s1,s2)
    t3 = time.time()
    t4 = time.time()
    for i in range(0,int(x)):
        #v = a.edit_dist(seq1,seq2)
        #w = a.edit_graph(seq1,seq2)
        #w = a.levenshtein(seq1,seq2)
        w = 1
    w = a.edit_dist(seq1,seq2)
    t5 = time.time()
    #w = a.edit_dist(seq1,seq2)
    print('editdist  dist = %s'%v)
    print('seq edit  dist = %s'%w)
    print('editdist  runtime is %s seconds'%(t3-t2))
    print('seq edit  runtime is %s seconds'%(t5-t4))
Example #44
 def compile_channels(self):
     """
     Compiles the list of channels found.
     This will attempt to group channels by edit distance.
     """
     group_name_omits = ["train_", "valid_", "test_"]
     edit_thresh = 6
     for channel in self.channels:
         edit_distances = dict((c, Levenshtein.distance(channel, c))
                           for c in self.channel_groups.keys())
         if len(edit_distances) == 0:
             group_name = channel
             for omit in group_name_omits:
                 group_name = group_name.replace(omit, "")
             self.channel_groups[group_name] = [channel]
         else:
             group = None
             min_ed = len(channel)
             for c, d in edit_distances.iteritems():
                 if d <= min_ed:
                     min_ed = d
                     group = c
             if min_ed > edit_thresh or group is None:
                 group_name = channel
                 for omit in group_name_omits:
                     group_name = group_name.replace(omit, "")
                 self.channel_groups[group_name] = [channel]
             else:
                 # Now we reduce the group to the minimum shared string
                 # mb = matching blocks (see Levenshtein docs).
                 mb =\
                     Levenshtein.matching_blocks(
                     Levenshtein.editops(channel, group), channel, group)
                 new_group = "".join([group[x[1]:x[1]+x[2]] for x in mb])
                 if new_group != group:
                     self.channel_groups[new_group] =\
                         copy.deepcopy(self.channel_groups[group])
                     self.channel_groups.pop(group)
                 self.channel_groups[new_group].append(channel)
     for group, channels in self.channel_groups.iteritems():
         self.d["logs"][group] = {}
         for channel in channels:
             self.d["logs"][group][channel] = []
     self.logger.info("Channels: %r" % self.d["logs"].keys())
def print_error_analysis():
    options = config.options(read=True)
    output = get_output(options.run_dir, 'eval')
    errors = [(inst['input'], pred, inst['output'])
              for inst, pred in zip(output.data, output.predictions)
              if inst['output'] != pred]
    if 0 < options.max_examples < len(errors):
        indices = np.random.choice(np.arange(len(errors)), size=options.max_examples, replace=False)
    else:
        indices = range(len(errors))

    if options.html:
        print('<!DOCTYPE html>')
        print('<html><head><title>Error analysis</title><meta charset="utf-8" /></head><body>')
    for i in indices:
        inp, pred, gold = [unicode(s).strip() for s in errors[i]]
        editops = lev.editops(gold, pred)
        print_visualization(inp, pred, gold, editops, html=options.html)
    if options.html:
        print('</body></html>')
def editops(w1, w2):

  # print >>sys.stderr, w1, w2, ':\t',

  if (w1,w2) in editops_dict:
    return editops_dict[(w1,w2)]

  ops_str = ''
  uw1 = w1.decode('utf-8')
  uw2 = w2.decode('utf-8')
  # >>>> Levenshtein.editops('Iwentu', 'I-want')
  # [('insert', 1, 1), ('replace', 2, 3), ('delete', 5, 6)]

  # apply_edit(edit_operations, source_string, destination_string)
  # In the case of editops, the sequence can be arbitrary ordered subset
  # of the edit sequence transforming source_string to destination_string.

  # Examples:
  # >>> e = editops('man', 'scotsman')
  # >>> apply_edit(e, 'man', 'scotsman')
  # 'scotsman'
  # >>> apply_edit(e[:3], 'man', 'scotsman')
  # 'scoman'
  ops = Levenshtein.editops(uw1, uw2)
  for opnum in range(len(ops)):
    (opname, sub1, sub2) = ops[opnum]
    if opname == 'delete':
      ops_str += opname[0] + uw1[sub1].encode('utf-8') + '&'
    elif opname == 'insert':
      ops_str += opname[0] + uw2[sub2].encode('utf-8') + '&'
    else:
      ops_str += opname[0] + uw1[sub1].encode('utf-8') + uw2[sub2].encode('utf-8') + '&'

    # except:
    #   print >>sys.stderr, uw1, uw2, sub1, sub2, ops

  editops_dict[(w1,w2)] = ops_str
  # print >>sys.stderr, ops_str
  return ops_str
Example #47
def get_parts(string1, string2):
    length1 = len(string1)
    length2 = len(string2)
    editops = lev.editops(string1, string2)

    # only include strings which are different?

    equal_blocks = lev.matching_blocks(editops, length1, length2)
    get_distance1 = functools.partial(get_index_distance, length=length1)
    get_distance2 = functools.partial(get_index_distance, length=length2)

    # there is always one zero-length 'matching block' at the end
    if len(equal_blocks) > 1:
        # for each matching block, get the corresponding substring
        # and store the indexes from both strings
        # this will allow us to keep track of where the blocks come from in the strings
        equal_parts = [(string1[index1:index1 + block_length],
                        get_distance1(index1), get_distance2(index2))
                       for index1, index2, block_length in equal_blocks if block_length]
        return equal_parts
    else:
        return []
    def differential_encode (self, form_non_tonal, form_tonal, seperator = True) :

        self.p_src = -1
        self.p_dst = -1

        self.src = reshaping(form_non_tonal, False)

        if not self.src :
            if seperator:
                return [u"", [token_seperator]]
            else :
                return [u"", []]

        self.chunks = chunking(self.src)
        self.ret = [u"" for i in range(len(self.chunks))]

        self.dst = reshaping(form_tonal, False)
        ops = Levenshtein.editops(self.src, self.dst)
        self.stat.form_non_tonal[self.src] += 1
        self.stat.form_tonal    [self.dst] += 1
        self.stat.dict_form_tonal.setdefault(self.src, []).append(self.dst)

        for op in ops :

            mode, self.p_src, self.p_dst = op
            if mode == "delete" :
                self.delete()

            elif mode == "insert" :
                self.insert()

            else : # mode == "replace"
                self.insert()
                self.delete()

        # strip the separator from the code at the end of the chunk
        tmp = []
                for ret2 in self.ret :
 def get_str_simis(self, str1, str2):
     return [Levenshtein.jaro(str1, str2), Levenshtein.ratio(str1,str2), len(Levenshtein.editops(str1, str2))]
for line in target_f.xreadlines():
    units = line.split()
    gene=units[0]
    seq=units[1]
    target_dic[gene] = seq

for each in target_dic.keys():
    each_file = './' + each + '.txt'
    each_f = open (each_file, 'r')
    each_out = open ('./result_'+each+'.txt','w')
    header = each_f.readline()
    each_out.write(header.strip('\n')+'\t'+'bulge:1'+'\t'+'3'+'\t'+'5'+'\n')
    for line in each_f.xreadlines():
        units = line.split()
        target_seq = units[4]
        value ={}
        for nuc in "ATGC":
            probe = target_seq.replace('N',nuc)  # str.replace returns a new string
            value[nuc] = (l.distance(probe,target_dic[each]),l.editops(probe,target_dic[each]))
        max_dist, max_editops = value[max(value, key=value.get)]
        step = []
        for each_editop in max_editops:
            a,b,c = each_editop
            step.append(a)
        deletion = step.count('delete')

        each_out.write(line.strip()+'\t'+str(max_dist)+'\t'+str(max_dist+deletion*2)+'\t'+str(max_dist+deletion*4)+'\n')
    each_f.close()
    each_out.close()
        
def alignChars( source, target, ErrStats = None, ErrStats_lock = None ):
    """
    alignChars takes a pair of words from parallel corpora that have been word aligned.
    Errors introduced by the noisy channel (OCR) are revealed by finding the sequence of edit operations that
    map source to target using the Levenshtein edit-distance module.  The edit sequence can
    be used to generate character alignments.

    @param source: original word from corrected corpora
    @type source: str
    @param target: OCR output word from uncorrected corpora
    @type target: str
    @return: source and target words represented character aligned in a
            list of tuples e.g. [(s_1,t_1),...,(s_n,t_n)]
    @rtype: list
    """

    editops = Levenshtein.editops( source, target )

    SPOS = 1
    TPOS = 2
    OP = 0

    sourceArray = [ char for char in source ]
    targetArray = [ char for char in target ]

    substituteCount = 0
    insertCount = 0
    deleteCount = 0

    for element in editops:
        if element[ OP ] == 'insert':
            sourceArray.insert( element[ TPOS ], '' )
            insertCount += 1
        if element[ OP ] == 'delete':
            targetArray.insert( element[ SPOS ], '' )
            deleteCount += 1
        if element[ OP ] == 'replace':
            substituteCount += 1

    if ErrStats:
        with ErrStats_lock:


            ErrStats.updateDistribution( 'editDist_correctLen', len( editops ), len( source ) )

            ErrStats.updateDistribution( 'editDist_errorLen', len( editops ), len( target ) )

            ErrStats.updateDistribution( 'errorLen_correctLen', len( target ), len( source ) )

            ErrStats.updateDistribution( 'errorLen_editDist', len( target ), len( editops ) )

            ErrStats.updateDistribution( 'errorLen_editOps', len( target ),
                                         (insertCount, deleteCount, substituteCount) )

            ErrStats.updateDistribution( 'errorLen_insertOp', len( target ), insertCount )

            ErrStats.updateDistribution( 'errorLen_deleteOp', len( target ), deleteCount )

            ErrStats.updateDistribution( 'errorLen_substituteOp', len( target ), substituteCount )

            ErrStats.updateDistribution( 'correctLen_editDist', len( source ), len( editops ) )

            ErrStats.updateDistribution( 'errorLens', len( target ) )

            ErrStats.updateDistribution( 'correctLens', len( source ) )

            ErrStats.updateDistribution( 'editDists', len( editops ) )

            ErrStats.updateDistribution( 'insertEdits', insertCount )

            ErrStats.updateDistribution( 'deleteEdits', deleteCount )

            ErrStats.updateDistribution( 'substituteEdits', substituteCount )

    output = [ w for w in zip( targetArray, sourceArray ) ]

    return output
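
A small alignment sketch for the function above; a single inserted character yields one empty slot on the source side.

print(alignChars("cat", "cart"))
# -> [('c', 'c'), ('a', 'a'), ('r', ''), ('t', 't')]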