Python Levenshtein.editopsの例、Levenshtein.editops, spark-data-repair-plugin Pythonの例

コード例 #1

0

ファイルを表示

ファイル: edit.step.bulge.weight.py プロジェクト: ermanz/digenome-toolkit2

def get_edit_dist(input_file, output_file, target_seq):
    """Write edit-distance and bulge statistics for every row of a table.

    Each non-blank line of ``input_file`` is split on whitespace; its first
    four fields are read as chromosome, location, forward sequence and
    reverse sequence.  Both sequences are compared against ``target_seq``
    and one tab-separated row per input line is written to ``output_file``.

    Args:
        input_file: Path of the whitespace-delimited input table.
        output_file: Path of the report to (over)write.
        target_seq: Target sequence.  Every ``N`` in it is substituted by
            each of A, T, G and C in turn and the substitution with the
            smallest distance is kept.
    """

    def _best_alignment(query):
        # str.replace returns a new string; the result has to be kept for
        # the substitution to have any effect.
        best = None
        for nuc in "ATGC":
            candidate = target_seq.replace('N', nuc)
            dist = l.distance(candidate, query)
            if best is None or dist < best[0]:
                best = (dist, l.editops(candidate, query))
        dist, editops = best
        deletions = sum(1 for op in editops if op[0] == 'delete')
        return dist, editops, deletions

    # 'with' guarantees both handles are closed even if a row fails.
    with open(input_file, 'r') as fi, open(output_file, 'w') as fo:
        fo.write(
            "Chr\tLocation\tForward29\tReverse29\tEdit dist for\tEdit dist rev\tStep for\tStep rev\tDeletion# for\tDeletion# rev\tBulge dist for\tBulge dist rev\n"
        )
        # Iterating the file object works on Python 2 and 3 (xreadlines is
        # Python 2 only).
        for line in fi:
            units = line.split()
            if not units:
                continue  # skip blank lines instead of raising IndexError
            chrm = units[0]
            loc = units[1]
            forseq = units[2]
            revseq = units[3]

            for_dist, for_editops, for_deletion = _best_alignment(forseq)
            rev_dist, rev_editops, rev_deletion = _best_alignment(revseq)

            # Bulge distance: every deletion is weighted by two extra units.
            bulge_l = [for_dist + for_deletion * 2, rev_dist + rev_deletion * 2]
            del_l = [for_deletion, rev_deletion]
            # NOTE: the last two values (smallest bulge distance and its
            # deletion count) have no matching column in the header row.
            # The format string is kept on one line so no indentation
            # whitespace leaks into the fields.
            fo.write(
                "%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\n" %
                (chrm, loc, forseq, revseq, for_dist, rev_dist, for_editops,
                 rev_editops, for_deletion, rev_deletion, bulge_l[0],
                 bulge_l[1], min(bulge_l),
                 del_l[bulge_l.index(min(bulge_l))]))

コード例 #2

0

ファイルを表示

ファイル: test_align.py プロジェクト: simwiki/circleseq

def main():
    """Print distance, edit operations and matching blocks for a fixed pair.

    The two hard-coded sequences are compared with the ``l`` (Levenshtein)
    module and the three results are printed in that order.
    """
    # Alternative pair kept for manual experiments:
    #   reference = 'GCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAAC'
    #   query = 'GAGTCGAGCAGAAGAAGAANGG'
    reference = 'AATGTGTGTCTGCTGGAAGCTCCTATTCTTCCGCCATTTTCCAGTCCTCCAGAAGTTTCCTGATGGTCCATGTCTGAATTAGACACCCCTCTTCTTTGTTCCAGTTGCACCTGTAATTCTTCAGCATAGTACTTCTTAAACTGTTTTTAA'
    query = 'TTTNCTGATGGTCCATGTCTGTTACTC'

    print(l.distance(reference, query))
    ops = l.editops(reference, query)
    print(ops)
    print(l.matching_blocks(ops, reference, query))

コード例 #3

0

ファイルを表示

ファイル: edit.step.bulge.weight.py プロジェクト: ibscge/digenome-toolkit2

def get_edit_dist(input_file, output_file, target_seq):
    """Write edit-distance and bulge statistics for every row of a table.

    Each non-blank line of ``input_file`` is split on whitespace; its first
    four fields are read as chromosome, location, forward sequence and
    reverse sequence.  Both sequences are compared against ``target_seq``
    and one tab-separated row per input line is written to ``output_file``.

    Args:
        input_file: Path of the whitespace-delimited input table.
        output_file: Path of the report to (over)write.
        target_seq: Target sequence.  Every ``N`` in it is substituted by
            each of A, T, G and C in turn and the substitution with the
            smallest distance is kept.
    """

    def _best_alignment(query):
        # str.replace returns a new string; the result has to be kept for
        # the substitution to have any effect.
        best = None
        for nuc in "ATGC":
            candidate = target_seq.replace('N', nuc)
            dist = l.distance(candidate, query)
            if best is None or dist < best[0]:
                best = (dist, l.editops(candidate, query))
        dist, editops = best
        deletions = sum(1 for op in editops if op[0] == 'delete')
        return dist, editops, deletions

    # 'with' guarantees both handles are closed even if a row fails.
    with open(input_file, 'r') as fi, open(output_file, 'w') as fo:
        fo.write("Chr\tLocation\tForward29\tReverse29\tEdit dist for\tEdit dist rev\tStep for\tStep rev\tDeletion# for\tDeletion# rev\tBulge dist for\tBulge dist rev\n")
        # Iterating the file object works on Python 2 and 3 (xreadlines is
        # Python 2 only).
        for line in fi:
            units = line.split()
            if not units:
                continue  # skip blank lines instead of raising IndexError
            chrm = units[0]
            loc = units[1]
            forseq = units[2]
            revseq = units[3]

            for_dist, for_editops, for_deletion = _best_alignment(forseq)
            rev_dist, rev_editops, rev_deletion = _best_alignment(revseq)

            # Bulge distance: every deletion is weighted by two extra units.
            bulge_l = [for_dist + for_deletion * 2, rev_dist + rev_deletion * 2]
            del_l = [for_deletion, rev_deletion]
            # NOTE: the last two values (smallest bulge distance and its
            # deletion count) have no matching column in the header row.
            # The format string is kept on one line so no indentation
            # whitespace leaks into the fields.
            fo.write(
                "%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\n" %
                (chrm, loc,
                 forseq, revseq,
                 for_dist, rev_dist,
                 for_editops, rev_editops,
                 for_deletion, rev_deletion,
                 bulge_l[0], bulge_l[1],
                 min(bulge_l), del_l[bulge_l.index(min(bulge_l))]))

コード例 #4

0

ファイルを表示

ファイル: emalgo.py プロジェクト: kakitone/MetaFinisherSC

def debugging(folderName):
    """Print Levenshtein diagnostics for fixed windows of three contig files.

    Loads ``reference.fasta``, ``interiors.fasta`` and ``GTDic.fasta`` from
    ``folderName`` through ``IORobot.loadContigsFromFile`` and prints
    distances, edit operations and 20-character excerpts for the contigs
    ``Segkk0`` and ``Segkk1``.

    Every ``print`` call takes a single argument, so the output is the same
    under Python 2 (statement form) and Python 3 (function form).
    """
    referenceDic = IORobot.loadContigsFromFile(folderName, "reference.fasta")
    interiorsDic = IORobot.loadContigsFromFile(folderName, "interiors.fasta")
    GTDic = IORobot.loadContigsFromFile(folderName, "GTDic.fasta")

    window = 12000
    ref_start = 2500000
    other_start = 7000

    str1 = referenceDic["Segkk0"][ref_start:ref_start + window]
    str2 = referenceDic["Segkk1"][ref_start:ref_start + window]
    print(Levenshtein.distance(str1, str2))
    print(Levenshtein.editops(str1, str2))

    str3 = interiorsDic["Segkk0"][other_start:other_start + window]
    str4 = interiorsDic["Segkk1"][other_start:other_start + window]
    print(Levenshtein.distance(str1, str4))
    print(Levenshtein.distance(str2, str3))
    print(Levenshtein.editops(str1, str4))

    print("")
    offset = 4000
    print(Levenshtein.editops(str2, str3))
    # Show the 20 characters centred on the offset for each window.
    for sequence in (str1, str2, str3, str4):
        print(sequence[offset - 10:offset + 10])

    str5 = GTDic["Segkk0"][other_start:other_start + window]
    str6 = GTDic["Segkk1"][other_start:other_start + window]
    for sequence in (str5, str6):
        print(sequence[offset - 10:offset + 10])

    print(Levenshtein.editops(str2, str4))

コード例 #5

0

ファイルを表示

    def mappingtxtans(self, speech_results):
        """Return the positions in ``speech_results`` that differ from the answer.

        The expected answers are read, one per line, from the file named by
        ``config['Answer']['ans_txt']``; the line at ``self.read_num`` is
        compared with ``speech_results`` using ``Levenshtein.editops``.

        Args:
            speech_results: Recognised text to compare against the answer.

        Returns:
            A list of destination indices of every non-delete edit operation,
            excluding the index of the first full-width comma in
            ``speech_results``; or the string ``"no different"`` when that
            list is empty.
        """
        ans_txt = config['Answer'].get('ans_txt')
        # The context manager closes the answer file once it has been read.
        with open(ans_txt) as f:
            text = [line.rstrip() for line in f]
        print(text)

        # Wrap around once the counter runs past the last answer.  The valid
        # indices are 0..len(text)-1, hence ">=", and the reset must be an
        # assignment ("==" only compared and discarded the result).
        if self.read_num >= len(text):
            self.read_num = 0

        txt_ans = text[self.read_num]
        com_index = speech_results.find('，')
        index = [
            op[2]
            for op in Levenshtein.editops(txt_ans, speech_results)
            if op[0] != 'delete' and op[2] != com_index
        ]
        if not index:
            return "no different"
        return index

コード例 #6

0

ファイルを表示

ファイル: DataInformation.py プロジェクト: NgocTanLE/MORPH_SEGMENT

 def operations(self):
     """
     Metrics to determine the functionality of the surface segmentation generator.

     Every line of every file in ``self.all_files`` is split on " | "; field 0
     is the word and field 3 is converted to its orthographic form with
     ``de_segment(removeLabels(...))``. When the two differ, the edit
     operations turning the orthographic form into the word are tallied.
     Only 'delete' and 'replace' operations are counted.

     :return: number of edited words, total words in dataset, number of operations performed, number of deletions,
     number of replacements
     """
     edited_words, total_words, operations, delete, replace = 0, 0, 0, 0, 0
     for file_name in self.all_files:
         # Context manager so each input file is closed after it is read.
         with open(os.path.join(sys.path[0], file_name), 'r') as input_file:
             for line in input_file:
                 content = line.rstrip('\n').split(" | ")
                 orthographic = de_segment(removeLabels(content[3]))
                 word = content[0]
                 total_words += 1
                 if word == orthographic:
                     continue
                 edited_words += 1
                 for ed in LevenshteinDistance.editops(orthographic, word):
                     if ed[0] == 'delete':
                         operations += 1
                         delete += 1
                     elif ed[0] == 'replace':
                         operations += 1
                         replace += 1
     return edited_words, total_words, operations, delete, replace

コード例 #7

0

ファイルを表示

ファイル: lilypond_eval.py プロジェクト: ufal/omreval

def main(args):
    """Print the number of edit operations between two converted scores.

    Both input files are converted with the ``musicxml2ly`` script found in
    ``args.m2ly_path``, post-processed with ``postprocess_ly`` and compared
    with ``Levenshtein.editops``; the number of operations is printed.

    Args:
        args: Object with the attributes ``true`` and ``prediction`` (input
            file paths), ``m2ly_path`` (directory containing
            ``musicxml2ly``) and ``export_pred`` (when truthy, the converted
            prediction is written to ``<prediction>.ly``).

    Raises:
        OSError: If either input file or the conversion script is missing.
    """
    logging.info('Starting main...')
    # time.clock() was removed in Python 3.8; prefer perf_counter() and fall
    # back to clock() only on interpreters that lack it.
    _timer = getattr(time, 'perf_counter', None) or time.clock
    _start_time = _timer()

    if not os.path.isfile(args.true):
        raise OSError('True file not found: {0}'.format(args.true))
    if not os.path.isfile(args.prediction):
        raise OSError('Prediction file not found: {0}'.format(args.prediction))

    conversion_script = os.path.join(args.m2ly_path, 'musicxml2ly')
    if not os.path.isfile(conversion_script):
        raise OSError('Cannot find musicxml2ly conversion script: {0}'.format(conversion_script))

    # NOTE(review): the file paths are appended to the command string
    # unquoted; confirm how execute() runs it if paths may contain spaces or
    # shell metacharacters.
    conversion_cmd = '{0} --lxml -a -o - '.format(conversion_script)

    true_ly = postprocess_ly(execute(conversion_cmd + args.true))
    pred_ly = postprocess_ly(execute(conversion_cmd + args.prediction))

    edits = Levenshtein.editops(true_ly, pred_ly)
    print('{0}'.format(len(edits)))

    if args.export_pred:
        with codecs.open(args.prediction + '.ly', 'w', 'utf-8') as export_h:
            export_h.write(pred_ly + u'\n')

    _end_time = _timer()
    logging.info('lilypond_eval.py done in {0:.3f} s'.format(_end_time - _start_time))

コード例 #8

0

ファイルを表示

ファイル: genome_registration_2020.py プロジェクト: hangsu01/SARS-COV-2-Pangenome-Construction

 def makeCigar(seq, ref):
     """Return a run-length encoded edit string describing ``seq`` versus ``ref``.

     Each position is labelled '=' (unchanged), 'X' (replace operation),
     'D' (insert operation) or 'I' (delete operation), and consecutive equal
     labels are collapsed to ``<count><label>``.

     Inputs longer than 16384 characters are split at their midpoints and
     the two halves are encoded independently; the results are concatenated
     without merging runs across the split.

     Returns an empty string when no label is produced (both inputs empty).
     """
     from itertools import groupby  # local import keeps the block self-contained

     if (len(seq) > 16384) or (len(ref) > 16384):
         # Floor division: "/" yields floats on Python 3, which are not
         # valid slice indices.
         rmid = len(ref) // 2
         smid = len(seq) // 2
         prox = makeCigar(seq[:smid], ref[:rmid])
         dist = makeCigar(seq[smid:], ref[rmid:])
         return prox + dist

     code = ['='] * len(seq)
     offset = 0  # number of labels inserted so far; shifts later source indices
     for op, si, _ in Levenshtein.editops(seq, ref):
         if op == "replace":
             code[si + offset] = 'X'
         elif op == "insert":
             code.insert(si + offset, 'D')
             offset += 1
         elif op == "delete":
             code[si + offset] = 'I'  # LM: fixed bug here 2019-04-15

     # groupby collapses consecutive equal labels; an empty label list
     # yields '' instead of failing on code[0].
     return ''.join("%d%s" % (len(list(run)), label) for label, run in groupby(code))

コード例 #9

0

ファイルを表示

def get_cost(source_word, target_word, action_count=None):
    """Return the weighted cost of editing ``source_word`` into ``target_word``.

    For every edit operation the affected character is looked up, the run of
    that same character immediately before and after the edit position in
    ``source_word`` is measured with ``get_consecutive_count``, and the cost
    ``COST_WEIGHTS[action] / (run_length / len(source_word))`` is added to
    the total.  Debug output is printed when the module-level ``flag`` is
    truthy.

    Args:
        source_word: Word the edit operations start from.
        target_word: Word the edit operations lead to.
        action_count: Optional mapping from action name to count.  It is
            only updated, and only returned, while it is truthy.

    Returns:
        ``total_cost`` alone when ``action_count`` is falsy, otherwise the
        tuple ``(total_cost, action_count)``.
    """
    edit_ops = lev.editops(source_word, target_word)
    if flag:
        print(edit_ops)

    source_len = len(source_word)
    total_cost = 0
    for op in edit_ops:
        action, source_pos, target_pos = op[0], op[1], op[2]
        if action_count:
            action_count[action] += 1

        # A deleted character only exists in the source; inserted and
        # replacing characters are taken from the target.
        if action == 'delete':
            char = source_word[source_pos]
        else:
            char = target_word[target_pos]
        if flag:
            print(char)

        before = source_word[:source_pos]
        # An insertion consumes no source character, so the tail starts at
        # the edit position itself.
        tail_start = source_pos if action == 'insert' else source_pos + 1
        after = source_word[tail_start:]
        if flag:
            print(action, before, after)

        run_before = get_consecutive_count(before[::-1], char)
        run_after = get_consecutive_count(after, char)
        run_length = run_before + run_after + 1  # at least 1: the edited char

        if flag:
            print(op, "total", run_length, "weight", COST_WEIGHTS[action],
                  "source len", source_len)
        cost = COST_WEIGHTS[action] / (run_length / source_len)
        total_cost += cost
        if flag:
            print("cost", cost)

    if flag:
        print(total_cost)
    if action_count:
        return total_cost, action_count
    return total_cost

コード例 #10

0

ファイルを表示

def editOpts(correct, raw, threshold=2):
    """Return the edit operations from ``correct`` to ``raw``, merging swaps.

    Two consecutive 'replace' operations on neighbouring positions whose
    characters are exchanged between ``correct`` and ``raw`` are merged into
    a single ``('transposition', idx1, idx2)`` entry.

    Args:
        correct: Reference string.
        raw: String to compare against the reference.
        threshold: Maximum number of resulting operations that is accepted.

    Returns:
        ``None`` when there are no operations or more than ``threshold``
        operations after merging; otherwise the operations (the unmodified
        ``Levenshtein.editops`` result when it holds exactly one operation).
    """
    opts = Levenshtein.editops(correct, raw)
    opsLength = len(opts)

    if opsLength == 0:
        return None

    if opsLength == 1:
        transOpts = opts
    else:
        transOpts = []
        idx = 0
        while idx < opsLength - 1:
            opt1 = opts[idx]
            opt2 = opts[idx + 1]
            idx1 = opt1[2]
            idx2 = opt2[2]
            adjacent_replaces = (opt1[0] == opt2[0] == 'replace'
                                 and abs(idx1 - idx2) == 1)
            try:
                swapped = (adjacent_replaces
                           and correct[idx1] == raw[idx2]
                           and correct[idx2] == raw[idx1])
            except IndexError:
                # A position outside one of the strings cannot be a swap.
                # Only IndexError is handled so that unrelated errors (and
                # KeyboardInterrupt) are no longer silently swallowed.
                swapped = False

            if swapped:
                transOpts.append(('transposition', idx1, idx2))
                idx += 2
            else:
                transOpts.append(opt1)
                idx += 1

        # The last operation is still pending unless it was consumed as the
        # second half of a transposition.
        if idx == opsLength - 1:
            transOpts.append(opts[idx])

    if len(transOpts) > threshold:
        return None
    return transOpts

コード例 #11

0

ファイルを表示

def add_bi_query(count, ss, sd):
    """Accumulate aligned character-bigram statistics for one string pair.

    Both strings are prefixed with '^' and aligned to equal length by
    inserting '_' placeholders according to the edit operations between
    them.  For each aligned position ``i`` where the characters at ``i + 1``
    differ and none of the four characters involved is '=', the counter
    ``count[sd[i:i+2]][ss[i]][ss[i+1]]`` and the module-level ``N`` are
    incremented.

    Args:
        count: Nested mapping ``{bigram: {char: Counter}}`` updated in place.
        ss: First string of the pair.
        sd: Second string of the pair.
    """
    global N
    ss = '^' + ss
    sd = '^' + sd
    i_s = 0  # placeholders inserted into ss so far
    i_d = 0  # placeholders inserted into sd so far
    eds = Levenshtein.editops(ss, sd)
    for ed in eds:
        # The operation kinds are mutually exclusive.
        if ed[0] == 'insert':
            ss = string_insert(ss, ed[1] + i_s, '_')
            i_s += 1
        elif ed[0] == 'delete':
            sd = string_insert(sd, ed[2] + i_d, '_')
            i_d += 1
    assert (len(ss) == len(sd))
    # u'=' is the same unicode value '='.decode('utf8') produced on
    # Python 2, and it also works on Python 3 where str has no .decode().
    sym = u'='
    for i in range(len(ss) - 2):
        if (sd[i + 1] == ss[i + 1] or sym == sd[i] or sym == sd[i + 1]
                or sym == ss[i] or sym == ss[i + 1]):
            continue
        bigram = sd[i:i + 2]
        if bigram not in count:
            count[bigram] = {}
        if ss[i] not in count[bigram]:
            count[bigram][ss[i]] = Counter()
        count[bigram][ss[i]][ss[i + 1]] += 1
        N += 1

コード例 #12

0

ファイルを表示

ファイル: patient_graph.py プロジェクト: cbib/MICADo

def identify_anchor_kmer_in_reference_graph(reference_graph, kmer_to_anchor, leftmost=None, rightmost=None, path_length=None):
	"""
	Return the reference-graph node that best anchors ``kmer_to_anchor``.

	Candidates are the nodes whose topological position lies at or before
	``rightmost`` and at or after ``leftmost`` (when given). Among the
	candidates with the minimal Levenshtein distance to ``kmer_to_anchor``,
	the one whose topological position is closest to ``path_length`` steps
	before ``rightmost`` (or after ``leftmost``) is returned. When neither
	bound is given, the first minimal-distance candidate is returned.

	:type reference_graph: nx.DiGraph
	"""
	toposort = {v: k for k, v in enumerate(nx.topological_sort(reference_graph))}
	nodes_to_consider = list(reference_graph.nodes())
	# The filters are evaluated eagerly: the lazy ifilter lambdas shared one
	# `idx` variable, so with both bounds given the first filter silently
	# used the second bound.
	if rightmost:
		max_idx = toposort[rightmost]
		nodes_to_consider = [x for x in nodes_to_consider if toposort[x] <= max_idx]
	if leftmost:
		min_idx = toposort[leftmost]
		nodes_to_consider = [x for x in nodes_to_consider if toposort[x] >= min_idx]

	node_dists = [
		(node, Levenshtein.distance(node, kmer_to_anchor), Levenshtein.editops(node, kmer_to_anchor))
		for node in nodes_to_consider
	]
	min_dist = min(node_dists, key=itemgetter(1))[1]
	node_dists = [x for x in node_dists if x[1] == min_dist]
	# Single-argument print call: same output on Python 2 and Python 3.
	print("Min possible dist is %s" % (min_dist,))
	if rightmost:
		score_func = lambda x: (x[1] - min_dist) + abs(toposort[x[0]] - (toposort[rightmost] - path_length))
	elif leftmost:
		score_func = lambda x: (x[1] - min_dist) + abs(toposort[x[0]] - (toposort[leftmost] + path_length))
	else:
		# Without a bound there is no positional preference; previously
		# score_func was left undefined and the sort raised a NameError.
		score_func = lambda x: x[1] - min_dist
	dist_sorted = sorted(node_dists, key=score_func)
	# identify the best-scoring node with minimal distance
	return dist_sorted[0][0]

コード例 #13

0

ファイルを表示

ファイル: patient_graph.py プロジェクト: cbib/MICADo

def decompose_multiple_alterations(reference_path, alternative_path, kmer_length):
	"""
	Split the difference between two k-mer paths into atomic alterations.

	Both paths are converted to sequences and their edit operations are
	computed. Each 'replace' operation is applied on its own; consecutive
	operations of another kind that share a source or destination position
	with the first one of their run are applied together. For every such
	group the resulting sequence and its k-mer path are yielded.

	:return: generator of ``(atomic_sequence, atomic_path)`` tuples
	"""
	reference_sequence = ALT.kmerpathToSeq(reference_path, kmer_length)
	multi_alternative_sequence = ALT.kmerpathToSeq(alternative_path, kmer_length)

	edit_ops = Levenshtein.editops(reference_sequence, multi_alternative_sequence)
	if len(edit_ops) > 2:
		logger.info("Multiple alt when considering ref %s vs alt %s", reference_sequence, multi_alternative_sequence)
		logger.info("Globally apply %s", edit_ops)
	start = 0
	while start < len(edit_ops):
		start_e = edit_ops[start]
		# Each operation is a (kind, src_pos, dest_pos) tuple, so the kind
		# must be read from element 0; comparing the whole tuple with
		# 'replace' was always False.
		if start_e[0] == 'replace':
			atomic_sequence = Levenshtein.apply_edit([start_e], reference_sequence, multi_alternative_sequence)
			atomic_path = ALT.kmerize(atomic_sequence, kmer_length)
			start += 1
		else:
			end = start + 1
			while (end < len(edit_ops)
				   and edit_ops[end][0] == start_e[0]
				   and (start_e[1] == edit_ops[end][1] or start_e[2] == edit_ops[end][2])):
				end += 1
			edit_op_to_apply = edit_ops[start:end]
			start = end
			logger.info("Will apply %s", edit_op_to_apply)
			atomic_sequence = Levenshtein.apply_edit(edit_op_to_apply, reference_sequence, multi_alternative_sequence)
			atomic_path = ALT.kmerize(atomic_sequence, kmer_length)
		# record each atomic alteration
		logger.info("Adding atomic alteration for ref %s vs alt %s", reference_sequence, atomic_sequence)
		yield atomic_sequence, atomic_path

コード例 #14

0

ファイルを表示

ファイル: transcription_error.py プロジェクト: waltandrews/patter

 def _get_char_errors(s1, s2):
     """Count the edit operations between two strings, ignoring spaces.

     Returns a dict with the keys "delete", "insert" and "replace" holding
     the number of operations of each kind needed to turn ``s1`` into ``s2``
     once all space characters have been removed from both.
     """
     compact_first = s1.replace(' ', '')
     compact_second = s2.replace(' ', '')
     tally = {"delete": 0, "insert": 0, "replace": 0}
     for operation in Lev.editops(compact_first, compact_second):
         kind = operation[0]
         tally[kind] = tally[kind] + 1
     return tally

コード例 #15

0

ファイルを表示

def compute_uer_confusion_matrix(predictions_dict, labels_dict, unit_dict):
    """Accumulate unit-level edit statistics and plot their positions.

    For every entry of ``labels_dict`` the prediction with the same key is
    aligned to the label with ``Levenshtein.editops``.  Deletions,
    insertions, replacements and untouched prediction characters are
    accumulated in a confusion matrix with one row per unit and two extra
    columns (deletions, insertions).  The relative positions of the edit
    operations in predictions of at least 40 characters are passed to
    ``plot_edit_ops_histogram``.

    The confusion matrix is built but neither returned nor plotted here.

    Args:
        predictions_dict: Mapping from sample key to predicted units.
        labels_dict: Mapping from sample key to reference units.
        unit_dict: Mapping whose values are the units; 'GO', 'EOS', 'MASK'
            and 'END' are excluded.  Its keys are used (minus one) as
            matrix indices.
    """
    slim_dict = {key: val for key, val in unit_dict.items() if val not in ['GO', 'EOS', 'MASK', 'END']}
    vocab_size = len(slim_dict)
    invdict = {v: k for k, v in slim_dict.items()}

    conf_matrix = np.zeros(shape=(vocab_size, vocab_size + 2))  # plus deletions, insertions
    edit_ops_indices = []
    edit_ops_at_word_boundaries = []
    edit_ops_not_at_word_boundaries = []

    # `sample_id` rather than `id`, which would shadow the builtin.
    for sample_id, label in labels_dict.items():
        label_str = ''.join(_strip_extra_chars(label))
        prediction_str = ''.join(_strip_extra_chars(predictions_dict[sample_id]))
        edit_ops = Levenshtein.editops(prediction_str, label_str)

        # A set gives O(1) membership tests in the pass over the prediction
        # below (a list made that pass quadratic).
        seen_positions = set()
        for op in edit_ops:
            opname = op[0]
            if len(prediction_str) >= 40:
                edit_ops_indices.append(op[1] / len(prediction_str))  # store all errors in the source (prediction) string

            if opname == 'delete':
                source_unit = prediction_str[op[1]]
                mat_col = vocab_size
                seen_positions.add(op[1])

                if source_unit == ' ':
                    edit_ops_at_word_boundaries.append(source_unit)
                else:
                    edit_ops_not_at_word_boundaries.append(source_unit)

            elif opname == 'insert':
                source_unit = label_str[op[2]]  # the inserted unit does not exist in the source string
                mat_col = vocab_size + 1
            elif opname == 'replace':
                source_unit = prediction_str[op[1]]
                dest_unit = label_str[op[2]]
                mat_col = invdict[dest_unit] - 1
                seen_positions.add(op[1])

                if source_unit == ' ':
                    edit_ops_at_word_boundaries.append(source_unit)
                else:
                    edit_ops_not_at_word_boundaries.append(source_unit)

            else:
                raise Exception('unknown opname {}'.format(opname))

            mat_row = invdict[source_unit] - 1
            conf_matrix[mat_row, mat_col] += 1

        # Prediction characters untouched by a delete/replace are correct
        # matches and go on the diagonal.
        for idx, symbol in enumerate(prediction_str):
            if idx not in seen_positions:
                mat_pos = invdict[symbol] - 1
                conf_matrix[mat_pos, mat_pos] += 1

    # plot_confusion_matrix(conf_matrix, invdict)
    plot_edit_ops_histogram(edit_ops_indices)

コード例 #16

0

ファイルを表示

ファイル: utils.py プロジェクト: vallant/universalmutator

def _elide_kept(text, keep):
    """Return ``text`` with each run of characters matched against ``keep`` shown as "..."."""
    pieces = []
    pos = 0
    wasDot = False
    for ch in text:
        # The bounds check matters: once every kept character has been
        # matched, the rest of the text is emitted verbatim.
        if pos < len(keep) and ch == keep[pos]:
            pos += 1
            if not wasDot:
                pieces.append("...")
                wasDot = True
        else:
            pieces.append(ch)
            wasDot = False
    return "".join(pieces)


def change(m):
    """Return a compact "old==>new" description of a mutant.

    Args:
        m: 5-tuple ``(mfile, sourcefile, pos, orig, mutant)``; only ``orig``
            and ``mutant`` are used.

    Returns:
        ``mutant`` without its last character when the two strings share
        more than four matching blocks; otherwise the differing parts of
        ``orig`` and of ``mutant`` joined by "==>", with the shared parts
        abbreviated to "...".
    """
    (_mfile, _sourcefile, _pos, orig, mutant) = m
    eops = Levenshtein.editops(orig, mutant)
    blocks = Levenshtein.matching_blocks(eops, orig, mutant)
    if len(blocks) > 4:
        return mutant[:-1]
    # Text common to both strings, in order.
    keep = ''.join([orig[x[0]:x[0] + x[2]] for x in blocks])
    # Both sides use the same guarded scan; the original side used to index
    # keep[pos] unchecked and raised IndexError once keep was exhausted.
    return _elide_kept(orig, keep) + "==>" + _elide_kept(mutant, keep)

コード例 #17

0

ファイルを表示

ファイル: lilypond_eval.py プロジェクト: ufal/omreval

def main(args):
    """Print the number of edit operations between two converted scores.

    Both input files are converted with the ``musicxml2ly`` script found in
    ``args.m2ly_path``, post-processed with ``postprocess_ly`` and compared
    with ``Levenshtein.editops``; the number of operations is printed.

    Args:
        args: Object with the attributes ``true`` and ``prediction`` (input
            file paths), ``m2ly_path`` (directory containing
            ``musicxml2ly``) and ``export_pred`` (when truthy, the converted
            prediction is written to ``<prediction>.ly``).

    Raises:
        OSError: If either input file or the conversion script is missing.
    """
    logging.info('Starting main...')
    # time.clock() was removed in Python 3.8; prefer perf_counter() and fall
    # back to clock() only on interpreters that lack it.
    _timer = getattr(time, 'perf_counter', None) or time.clock
    _start_time = _timer()

    if not os.path.isfile(args.true):
        raise OSError('True file not found: {0}'.format(args.true))
    if not os.path.isfile(args.prediction):
        raise OSError('Prediction file not found: {0}'.format(args.prediction))

    conversion_script = os.path.join(args.m2ly_path, 'musicxml2ly')
    if not os.path.isfile(conversion_script):
        raise OSError('Cannot find musicxml2ly conversion script: {0}'.format(
            conversion_script))

    # NOTE(review): the file paths are appended to the command string
    # unquoted; confirm how execute() runs it if paths may contain spaces or
    # shell metacharacters.
    conversion_cmd = '{0} --lxml -a -o - '.format(conversion_script)

    true_ly = postprocess_ly(execute(conversion_cmd + args.true))
    pred_ly = postprocess_ly(execute(conversion_cmd + args.prediction))

    edits = Levenshtein.editops(true_ly, pred_ly)
    print('{0}'.format(len(edits)))

    if args.export_pred:
        with codecs.open(args.prediction + '.ly', 'w', 'utf-8') as export_h:
            export_h.write(pred_ly + u'\n')

    _end_time = _timer()
    logging.info('lilypond_eval.py done in {0:.3f} s'.format(_end_time -
                                                             _start_time))

コード例 #18

0

ファイルを表示

ファイル: codeLinux.py プロジェクト: Computer-CGuy/ScamWebsiteDetection

def edit_distance_list(url, topN=300):
    """Return per-site edit-operation counts for ``url`` against known sites.

    The subdomain and domain of ``url`` are concatenated and compared with
    the domain of each of the first ``topN`` entries of the module-level
    ``website`` list.  For every entry the numbers of insert, replace and
    delete operations are appended (in that order) to the result.

    As a side effect the module-level ``mina`` and ``minx`` are reset and
    then set to the smallest total operation count and the first entry that
    reached it.

    Returns:
        Flat list ``[insert, replace, delete, insert, replace, delete, ...]``.
    """
    global minx
    global mina

    parsed_url = urlparse(url)
    url_parts = tldextract.extract(parsed_url.netloc)
    url_ext = url_parts.subdomain + url_parts.domain

    mina = 99999999
    minx = 0
    features = []
    for site in website[:topN]:
        simplified = urlsimpler(site[0])
        site_netloc = urlparse(simplified).netloc
        comp_domain = tldextract.extract(site_netloc).domain

        tally = {"insert": 0, "replace": 0, "delete": 0}
        for edit in Levenshtein.editops(url_ext, comp_domain):
            kind = edit[0]
            if kind in tally:
                tally[kind] += 1

        weight = tally["insert"] + tally["replace"] + tally["delete"]
        # Strict comparison keeps the first entry among equally close ones.
        if weight < mina:
            mina = weight
            minx = site
        features.extend([tally["insert"], tally["replace"], tally["delete"]])
    return features

コード例 #19

0

ファイルを表示

def error_statistics(correct_sentences: List[str],
                     written_sentences: List[str],
                     keystroke_stats: schemas.keystroke_stats):
    """
        Use Levenshtein.editops to get types of mistakes and calculate number of specific mistakes.

        Every item of every correct sentence is compared with the item at the
        same position in ``written_sentences``. Each 'delete' operation
        increments ``keystroke_stats.so``, each 'insert' increments
        ``keystroke_stats.sa`` and every other operation increments
        ``keystroke_stats.sch`` and is passed on to ``sch``. Finally
        ``keystroke_stats.enc`` is set to the sum of the three counters.
    """
    for sentence_idx, correct_sentence in enumerate(correct_sentences):
        for word_idx, correct_word in enumerate(correct_sentence):
            written_word = written_sentences[sentence_idx][word_idx]
            differences = Levenshtein.editops(correct_word, written_word)
            #Uncomment to show differences that has been found
            #print(differences)

            for difference in differences:
                diff_type = difference[0]

                # Compare by value: "is" tests object identity, which is
                # not guaranteed for equal strings.
                if diff_type == 'delete':
                    keystroke_stats.so += 1
                elif diff_type == 'insert':
                    keystroke_stats.sa += 1
                else:
                    keystroke_stats.sch += 1
                    correct_index = difference[1]
                    written_index = difference[2]
                    sch(correct_index, written_index, correct_word,
                        written_word, keystroke_stats)

    keystroke_stats.enc = keystroke_stats.sa + keystroke_stats.so + keystroke_stats.sch

コード例 #20

0

ファイルを表示

ファイル: randomforest.py プロジェクト: colonialjelly/name-matching

def get_editops(src_name, tar_name):
    """Return the character-level alignment of two names as 2-char tokens.

    The names are walked in parallel, guided by ``Levenshtein.editops``.
    Each step appends one token: source char + target char for a match or a
    replacement, ``'_'`` + target char for an insertion and source char +
    ``'_'`` for a deletion.

    Raises:
        Exception: If an edit operation of an unknown kind is encountered.
    """
    ops = Levenshtein.editops(src_name, tar_name)
    src_len = len(src_name)
    tar_len = len(tar_name)

    tokens = []
    next_op = 0
    src_idx = 0
    tar_idx = 0
    while not (src_idx == src_len and tar_idx == tar_len):
        src_char = src_name[src_idx] if src_idx < src_len else ''
        tar_char = tar_name[tar_idx] if tar_idx < tar_len else ''
        op = ops[next_op] if next_op < len(ops) else None

        # No operation is anchored at this position pair: plain match.
        if not (op and op[1] == src_idx and op[2] == tar_idx):
            tokens.append(src_char + tar_char)
            src_idx += 1
            tar_idx += 1
            continue

        kind = op[0]
        if kind == 'replace':
            tokens.append(src_char + tar_char)
            src_idx += 1
            tar_idx += 1
        elif kind == 'insert':
            tokens.append('_' + tar_char)
            tar_idx += 1
        elif kind == 'delete':
            tokens.append(src_char + '_')
            src_idx += 1
        else:
            raise Exception(f'Unexpected op {op}')
        next_op += 1
    return tokens

コード例 #21

0

ファイルを表示

def getCloseWords(wordIn,
                  word_dicts,
                  rules,
                  max_weight,
                  threshold=3,
                  fast=True,
                  debug=False):
    """Return dictionary words within ``threshold`` edits of ``wordIn``.

    ``wordIn`` is converted with ``leven.transIn``.  If the converted word
    is itself a dictionary word, an empty list is returned.  Otherwise each
    dictionary word whose Levenshtein distance is at most ``threshold`` is
    scored with ``weight_for_leven_edits``.

    Args:
        wordIn: Word to look up.
        word_dicts: Tuple ``(dict_words, words_clean, words_freq)``.
        rules: Passed through to ``weight_for_leven_edits``.
        max_weight: Passed through to ``weight_for_leven_edits``.
        threshold: Largest Levenshtein distance that is accepted.
        fast: When ``True``, stop at the first exact match and return the
            candidates collected up to that point.
        debug: When truthy, print the word being looked up.

    Returns:
        List of tuples ``(word, distance, number_of_edits, weight, 'xxx',
        'yyy')`` sorted by weight.
    """
    import Levenshtein
    (dict_words, words_clean, words_freq) = word_dicts
    wordInTrans = leven.transIn(wordIn)
    if debug:
        # Single-argument print calls behave the same as the Python 2
        # print statements they replace and are also valid on Python 3.
        print("")
        print("getCloseWords for %s ( %s )" %
              (wordInTrans.encode('utf-8'), wordIn.encode('utf-8')))
        dump(wordIn)

    output_words = []
    if wordInTrans in dict_words:
        # The word is already a dictionary word: nothing to suggest.
        return output_words

    for word in dict_words:
        lev_distance = Levenshtein.distance(wordInTrans, word)
        if lev_distance > threshold:
            continue
        edits = Levenshtein.editops(wordInTrans, word)
        w = weight_for_leven_edits(wordInTrans,
                                   word,
                                   edits,
                                   rules,
                                   max_weight,
                                   debug=False)
        output_words.append(
            (word, lev_distance, len(edits), w, 'xxx', 'yyy'))
        if (lev_distance == 0) and (fast == True):
            # In the case of an exact match, cut the search short.
            # We might have got some close matches ahead of time, so this
            # will not create a complete list.  The stable second sort
            # orders by weight and keeps the frequency order among ties.
            output_words = sorted(
                output_words,
                key=lambda word: int(words_freq[word[0]]))
            return sorted(output_words, key=lambda word: int(word[3]))
    return sorted(output_words, key=lambda word: word[3])

コード例 #22

0

ファイルを表示

ファイル: Normalization_Helper.py プロジェクト: praty170122032/Knowledge-Panel

def distance_simliarity(entity_list):
    """Yield similarity measurements for every unordered pair of names.

    For each 2-combination of ``entity_list`` the generator yields
    ``((name1, name2), levenshtein_distance, jaro_winkler, edit_ops,
    matching_blocks)``.
    """
    for pair in combinations(entity_list, 2):
        first, second = pair
        levenshtein = lvst.distance(first, second)
        jaro_winkler = lvst.jaro_winkler(first, second)
        operations = lvst.editops(first, second)
        blocks = lvst.matching_blocks(operations, first, second)

        yield (pair, levenshtein, jaro_winkler, operations, blocks)

コード例 #23

0

ファイルを表示

ファイル: mispro.py プロジェクト: dzubke/speech-lite

def get_editops(hyp_phns, ref_phns):
    """Return the edit operations between two phoneme sequences.

    Every distinct phoneme is mapped to a single character (``chr(65 + i)``
    in sorted phoneme order) so that each sequence becomes a string that
    ``lev.editops`` can compare; the positions in the returned operations
    therefore index phonemes.

    Args:
        hyp_phns: Hypothesis phoneme sequence.
        ref_phns: Reference phoneme sequence.

    Returns:
        The result of ``lev.editops`` on the two encoded strings.
    """
    # A set union accepts any pair of iterables, not only two lists that
    # can be concatenated.
    phn_super_set = set(hyp_phns) | set(ref_phns)
    p2c = {ph: chr(65 + i) for i, ph in enumerate(sorted(phn_super_set))}
    hyp_chars = "".join(p2c[ph] for ph in hyp_phns)
    ref_chars = "".join(p2c[ph] for ph in ref_phns)

    return lev.editops(hyp_chars, ref_chars)

コード例 #24

0

ファイルを表示

ファイル: juliusAlignment.py プロジェクト: sujoyu/pyJuliusAlign

def mapJuliusPronunciationToCabocha(juliusPhonesTxt, cabochaPhonesByWord):
    '''
    Chunks a julius phone string into the word boundaries given by cabocha

    The cabocha words (with ":" removed) are joined with spaces and compared
    to juliusPhonesTxt.  Every 'delete' edit drops the last character of the
    word owning the edit position and every 'insert' edit appends a "-" to
    it, so that the words end up with adjusted lengths.  juliusPhonesTxt is
    then cut into consecutive slices of those lengths, skipping one
    character between slices.

    Returns the list of julius phone chunks, one per cabocha word.
    '''
    words = [phones.replace(":", "") for phones in cabochaPhonesByWord]
    joinedTxt = " ".join(words)

    edits = Levenshtein.editops(joinedTxt, juliusPhonesTxt)

    # Half-open [start, stop) character span of each word, computed once
    # from the unedited word lengths.
    # NOTE(review): the spans are cumulative word lengths and do not count
    # the joining spaces, while edit positions index the space-joined text;
    # confirm this is intended.
    spans = []
    cursor = 0
    for word in words:
        spans.append((cursor, cursor + len(word)))
        cursor += len(word)

    def _wordAt(charI):
        # Index of the first span containing charI, or None when no span does
        return next(
            (i for i, (start, stop) in enumerate(spans)
             if start <= charI < stop),
            None)

    for operation, startIndex, _ in edits:
        wordI = _wordAt(startIndex)
        if operation == 'delete':
            words[wordI] = words[wordI][:-1]
        elif operation == 'insert':
            words[wordI] += '-'

    # Slice the julius text following the adjusted word lengths
    juliusPhonesByWord = []
    sliceStart = 0
    for phones in words:
        sliceEnd = sliceStart + len(phones)
        juliusPhonesByWord.append(juliusPhonesTxt[sliceStart:sliceEnd])
        sliceStart = sliceEnd + 1  # Skip the space between words

    return juliusPhonesByWord

コード例 #25

0

ファイルを表示

ファイル: word_corrector.py プロジェクト: IButskhrikidze/georgian-ocr-v2

def choose_best_match(word_meta, word_alternatives):
    """Pick the first alternative word whose edits all pass the letter checks.

    The word that was read is rebuilt from ``word_meta`` via
    ``co.word_from_meta_array``.  For each candidate in ``word_alternatives``
    (dicts with at least a ``'word'`` key) the Levenshtein edit operations
    from the read word to the candidate are replayed on a deep copy of
    ``word_meta``.  A candidate is rejected as soon as a replace/delete
    targets an index outside the copy, or ``replacing_letter_is_wrong`` /
    ``deleting_letter_is_wrong`` vetoes the operation.

    Returns the first candidate's ``'word'`` that survives every check, or
    the originally read word when none does.
    """
    #word_alternatives = sort_word_alternatives(word_alternatives)
    read_word = co.word_from_meta_array(word_meta)
    #    print "Checking -- ", read_word
    chosen_word = read_word

    # Traverse through alternatives, received from elasticsearch
    for word_alt in word_alternatives:
        #        print 'Word alternative', word_alt['word'], word_alt['score']

        word_alt_is_wrong = False
        # Work on a copy so a rejected alternative leaves word_meta untouched.
        # The copy is only used for the checks below; it is never returned.
        modifying_word_meta = copy.deepcopy(word_meta)

        # Take edit operations from read word to alternative.
        # Check if alternative is better than original.
        editops = lev.editops(read_word, word_alt['word'])

        # NOTE(review): editops indices refer to the original strings, while
        # modifying_word_meta shrinks/grows on delete/insert below, so later
        # source_index values may point at shifted entries -- confirm intent.
        for editop in editops:
            (op, source_index, dest_index) = editop

            #            print op, source_index, dest_index, read_word, word_alt['word']
            if op == 'replace':
                if len(modifying_word_meta) <= source_index:
                    print 'Asking to replace unknown index in word. Skipping alternative word'
                    word_alt_is_wrong = True
                    break

                if replacing_letter_is_wrong(modifying_word_meta[source_index],
                                             word_alt['word'][dest_index]):
                    word_alt_is_wrong = True
                    break

                modifying_word_meta[source_index]['char'] = word_alt['word'][
                    dest_index]
            elif op == 'delete':
                if len(modifying_word_meta) <= source_index:
                    print 'Asking to delete unknown index in word. Skipping alternative word'
                    word_alt_is_wrong = True
                    break

                if deleting_letter_is_wrong(modifying_word_meta[source_index]):
                    word_alt_is_wrong = True
                    break

                del modifying_word_meta[source_index]
            elif op == 'insert':
                # Inserted letters are not checked; they only get a 'char' entry.
                modifying_word_meta.insert(
                    source_index, {'char': word_alt['word'][dest_index]})

        if not word_alt_is_wrong:
            # Word alternative passed all the checks, so we replace the original
            chosen_word = word_alt['word']
            if read_word != chosen_word:
                print 'Word was corrected ' + read_word + ' with ' + chosen_word
            # First acceptable alternative wins; later ones are not examined.
            break

    return chosen_word

コード例 #26

0

ファイルを表示

ファイル: sequence.py プロジェクト: feldman4/lasagna

def edit_str(a,b):
    """Describe the leading edit operations that turn ``a`` into ``b``.

    Each operation becomes one line of the form ``<R|D|I><src index>:<x>-><y>``
    where ``x`` is the character of ``a`` at the source index and ``y`` the
    character of ``b`` at the destination index.  Reporting stops after the
    first insert or delete (marked by a ``***`` line), because the indices of
    later operations are no longer directly comparable.

    Returns the lines joined with newlines ('' when the strings are equal).
    """
    op_dict = {'replace': 'R', 'delete': 'D', 'insert': 'I'}
    arr = []
    for op, i, j in Levenshtein.editops(a, b):
        # Slice rather than index: an insert at the end of ``a`` has
        # i == len(a) and a delete at the end of ``b`` has j == len(b), so
        # a[i] / b[j] would raise IndexError.  A slice yields '' there and
        # the same single character everywhere else.
        arr.append('%s%d:%s->%s' % (op_dict[op], i, a[i:i + 1], b[j:j + 1]))
        # need to track actual insertions/deletions from here on...
        if op in ('insert', 'delete'):
            arr.append('***')
            break
    return '\n'.join(arr)

コード例 #27

0

ファイルを表示

def wordDistance(sentence1, sentence2):
    '''Get the Levenshtein edit operations between two sentences, in words.

    Both sentences are lower-cased and split on single spaces.  Every
    distinct word is mapped to one character so that the character-level
    ``Levenshtein.editops`` can be used to align whole words (plain
    Levenshtein: no transpositions).

    Returns a dict with:
      'num_edits'       -- number of word-level edit operations
      'edit_ops'        -- list of {'operation', 'in_input', 'in_output'}
                           dicts; the missing side of an insert/delete is ''
      'normalized_dist' -- num_edits divided by the word count of sentence1
    '''
    s1 = sentence1.lower().split(' ')
    s2 = sentence2.lower().split(' ')

    # One character per distinct word.  chr() of a running index replaces
    # the former fixed 62-symbol alphabet, which raised IndexError as soon
    # as the two sentences contained more than 62 distinct words.
    vocab = set(s1).union(s2)
    word_to_symbol = dict((word, chr(code)) for code, word in enumerate(vocab))
    s1_translated = ''.join([word_to_symbol[x] for x in s1])
    s2_translated = ''.join([word_to_symbol[x] for x in s2])

    translated_editops = []
    for operation, src_pos, dst_pos in Levenshtein.editops(s1_translated,
                                                           s2_translated):
        # Position k of the translated string corresponds to word k, so the
        # words can be read straight from s1 / s2.
        if operation == 'replace':
            in_input, in_output = s1[src_pos], s2[dst_pos]
        elif operation == 'insert':
            in_input, in_output = '', s2[dst_pos]
        elif operation == 'delete':
            in_input, in_output = s1[src_pos], ''
        else:
            raise NotImplementedError
        translated_editops.append({
            'operation': operation,
            'in_input': in_input,
            'in_output': in_output
        })

    return ({
        'num_edits': len(translated_editops),
        'edit_ops': translated_editops,
        # str.split(' ') never returns an empty list, so len(s1) >= 1.
        'normalized_dist': len(translated_editops) / float(len(s1))
    })

コード例 #28

0

ファイルを表示

 def error_distribution(self, src, tgt, errors):
     """Count the character-level edits that turn ``src`` into ``tgt``.

     ``errors`` is updated in place with ``+= 1`` on keys of the form
     ("replace", src_char, tgt_char), ("delete", src_char) or
     ("insert", tgt_char).  Any other operation is printed and not counted.
     """
     for edit in Levenshtein.editops(src, tgt):
         kind, src_pos, tgt_pos = edit
         if kind == "replace":
             key = ("replace", src[src_pos], tgt[tgt_pos])
         elif kind == "delete":
             key = ("delete", src[src_pos])
         elif kind == "insert":
             key = ("insert", tgt[tgt_pos])
         else:
             print(edit)
             continue
         errors[key] += 1

コード例 #29

0

ファイルを表示

ファイル: transcription_error.py プロジェクト: waltandrews/patter

    def _get_word_errors(s1, s2):
        """Count word-level delete/insert/replace operations from s1 to s2.

        Every distinct whitespace-separated word is encoded as one character
        so the character-level edit operations align whole words.  Returns a
        dict with the keys "delete", "insert" and "replace".
        """
        words1 = s1.split()
        words2 = s2.split()

        # Assign each distinct word a small integer code.
        word2char = {}
        for code, word in enumerate(set(words1 + words2)):
            word2char[word] = code

        encoded1 = ''.join(chr(word2char[w]) for w in words1)
        encoded2 = ''.join(chr(word2char[w]) for w in words2)

        errors = {"delete": 0, "insert": 0, "replace": 0}
        for op_name, _src, _dst in Lev.editops(encoded1, encoded2):
            errors[op_name] += 1
        return errors

コード例 #30

0

ファイルを表示

def edit_str(a, b):
    """List the leading replacements that turn ``a`` into ``b``.

    Each replace becomes a line ``R<src index>:<a char>-><b char>``.  The
    first insert or delete ends the listing with a ``***`` line, since
    positions stop lining up after it.  Lines are joined with newlines.
    """
    codes = {'replace': 'R', 'delete': 'D', 'insert': 'I'}
    lines = []
    for kind, src, dst in Levenshtein.editops(a, b):
        if kind in ('insert', 'delete'):
            # Positions are shifted from here on; stop reporting.
            lines.append('***')
            break
        lines.append('%s%d:%s->%s' % (codes[kind], src, a[src], b[dst]))

    return '\n'.join(lines)

コード例 #31

0

ファイルを表示

ファイル: dbotu.py プロジェクト: swo/dbotu2

    def distance_to(self, other):
        '''
        Length-adjusted Levenshtein "distance" to another OTU.

        The number of edit operations from this sequence to the other one
        is divided by this sequence's length plus the number of deletes.

        other: OTU
          the OTU to measure against

        returns: float
        '''
        ops = Levenshtein.editops(self.sequence, other.sequence)
        n_deletes = sum(1 for op in ops if op[0] == 'delete')
        return len(ops) / (len(self.sequence) + n_deletes)

コード例 #32

0

ファイルを表示

ファイル: xer.py プロジェクト: gchrupala/platalea

def nbeditops(s1, s2):
    """Return (deletes, inserts, replaces) needed to turn ``s1`` into ``s2``."""
    tally = {'delete': 0, 'insert': 0, 'replace': 0}
    for op in L.editops(s1, s2):
        kind = op[0]
        if kind in tally:
            tally[kind] += 1
    return tally['delete'], tally['insert'], tally['replace']

コード例 #33

0

ファイルを表示

 def encode(cls, fullname, fullname_true):
     """Label each character of ``fullname`` with the edit that fixes it.

     Returns a list with one entry per character of ``fullname``: '' for an
     untouched character, '--' for a delete, the replacement character for
     a replace, and '+<char>' for an insert at that position.  Operations
     are applied replace first, then insert, then delete, so a later one
     overwrites an earlier one on the same slot.

     NOTE(review): an insert positioned after the last character has
     src == len(fullname) and would index past the list -- confirm whether
     callers can hit that case.
     """
     labels = [''] * len(fullname)
     ordered_ops = sorted(
         Levenshtein.editops(fullname, fullname_true),
         key=lambda item: (item[0], -item[1]),
         reverse=True)
     for kind, src_pos, dst_pos in ordered_ops:
         if kind == 'delete':
             labels[src_pos] = '--'
         elif kind == 'replace':
             labels[src_pos] = fullname_true[dst_pos]
         elif kind == 'insert':
             labels[src_pos] = '+' + fullname_true[dst_pos]
     return labels

コード例 #34

0

ファイルを表示

ファイル: dbotu.py プロジェクト: TankMermaid/dbotu3

    def distance_to(self, other):
        '''
        Length-adjusted Levenshtein "distance" to another OTU.

        Computed as (number of edit operations) divided by
        (length of this sequence + number of delete operations).

        other: OTU
          the OTU to measure against

        returns: float
        '''
        ops = Levenshtein.editops(self.sequence, other.sequence)
        delete_count = [op[0] for op in ops].count('delete')
        adjusted_length = len(self.sequence) + delete_count
        return len(ops) / adjusted_length

コード例 #35

0

ファイルを表示

def _condProbName(name1, name2, edit_count, total_edits, smoothing, cp_memoize):
    """Store the smoothed probability of the edits between two names.

    Each edit operation returned by ``edist.editops(name1, name2)`` is
    assigned the probability ``edit_count[op] / total_edits`` (0 for edits
    that were never counted) plus ``smoothing``; the product of those terms
    is accumulated in log space.

    The result is written to ``cp_memoize[(name1, name2)]`` and the updated
    ``cp_memoize`` mapping is returned.
    """
    # float() forces true division: with integer counts, Python 2's "/"
    # would truncate every probability to 0.  .items() works on both
    # Python 2 and 3, unlike .iteritems().
    denom = float(total_edits)
    edit_prob = dict((edit, count / denom)
                     for edit, count in edit_count.items())

    log_cnd_prob = 0.0
    for e in edist.editops(name1, name2):
        log_cnd_prob += np.log(edit_prob.get(e, 0.0) + smoothing)

    cp_memoize[(name1, name2)] = np.exp(log_cnd_prob)
    return cp_memoize

コード例 #36

0

ファイルを表示

ファイル: counter.py プロジェクト: Libardo1/name-probability

def _condProbName(name1, name2, edit_count, total_edits, smoothing, cp_memoize):
    """Store the smoothed probability of the edits between two names.

    Each edit operation returned by ``edist.editops(name1, name2)`` is
    assigned the probability ``edit_count[op] / total_edits`` (0 for edits
    that were never counted) plus ``smoothing``; the product of those terms
    is accumulated in log space.

    The result is written to ``cp_memoize[(name1, name2)]`` and the updated
    ``cp_memoize`` mapping is returned.
    """
    # float() forces true division: with integer counts, Python 2's "/"
    # would truncate every probability to 0.  .items() works on both
    # Python 2 and 3, unlike .iteritems().
    denom = float(total_edits)
    edit_prob = dict((edit, count / denom)
                     for edit, count in edit_count.items())

    log_cnd_prob = 0.0
    for e in edist.editops(name1, name2):
        log_cnd_prob += np.log(edit_prob.get(e, 0.0) + smoothing)

    cp_memoize[(name1, name2)] = np.exp(log_cnd_prob)
    return cp_memoize

コード例 #37

0

ファイルを表示

ファイル: sentence_matching.py プロジェクト: sethwoodworth/wikipedia-style-edits

def make_improved_old(old, new):
    """Apply only the "typo" edits between ``old`` and ``new`` to ``old``.

    The full edit sequence from ``old`` to ``new`` is computed, narrowed by
    ``only_typo_editops``, and the surviving operations are applied to
    ``old`` so that it looks more like ``new``.
    """
    typo_ops = only_typo_editops(lev.editops(old, new))
    return lev.apply_edit(typo_ops, old, new)

コード例 #38

0

ファイルを表示

def _get_editops(
    source_string: str, destination_string: str
):
    """Return the Levenshtein edit operations from source to destination."""
    return Levenshtein.editops(source_string, destination_string)

コード例 #39

0

ファイルを表示

ファイル: spellcheck.py プロジェクト: SamEisenstat/amgno

def correct(match):
    """Return the corrected spelling for a regex match, keeping its casing.

    ``match.group(0)`` is looked up, lower-cased, in ``correction_list``.
    Words without a correction are returned unchanged.  Fully upper-case
    words get the upper-cased correction.  Otherwise the edit operations
    are computed on the lower-cased word but applied to the original word,
    so characters the correction does not touch are copied from the
    original (presumably to preserve its capitalisation).
    """
    word = match.group(0)
    normed_word = word.lower()

    # Dictionary words and unknown words were handled by two branches that
    # both returned the word untouched; one guard covers both.
    if normed_word not in correction_list:
        return word

    new_word = correction_list[normed_word]
    if word.isupper():
        return new_word.upper()

    edits = Levenshtein.editops(normed_word, new_word)
    return Levenshtein.apply_edit(edits, word, new_word)

コード例 #40

0

ファイルを表示

ファイル: counter.py プロジェクト: Libardo1/name-probability

def _editCounts(name_samp):
    """Count edit operations over every unordered pair of sample names.

    To estimate the probability of edit operations, ``edist.editops`` is
    run on each pair ``(name_samp[i], name_samp[j])`` with ``i < j``.

    Returns ``(edit_count, total_edits)``: a defaultdict mapping each edit
    operation tuple to how often it occurred, and the total number of
    operations seen.
    """
    edit_count = defaultdict(int)
    total_edits = 0
    # Keep the sample size in its own variable.  It used to be overwritten
    # with len(edits) inside the loop, which changed the bound of the inner
    # range() on later iterations and skipped pairs or indexed past the end.
    n_names = len(name_samp)
    for i in range(n_names):
        for j in range(i + 1, n_names):
            edits = edist.editops(name_samp[i], name_samp[j])
            total_edits += len(edits)
            for edit in edits:
                edit_count[edit] += 1
    return edit_count, total_edits

コード例 #41

0

ファイルを表示

ファイル: pitch_counter.py プロジェクト: ufal/omreval

def pitch_sequence_edits(true_pitches, pred_pitches):
    """Compute the Levenshtein edit operations between two pitch sequences.

    Both sequences are first converted to strings with
    ``PitchCoder.pitches2string``.

    :param true_pitches: Reference sequence of <pitch> elements.
    :param pred_pitches: Predicted sequence of <pitch> elements.

    :return: The Levenshtein edit operations from the reference code
        string to the predicted code string.
    """
    encode = PitchCoder().pitches2string
    return Levenshtein.editops(encode(true_pitches), encode(pred_pitches))

コード例 #42

0

ファイルを表示

ファイル: read_dict5.py プロジェクト: brobertson/rigaudon

def getCloseWords(wordIn, word_dicts, rules, max_weight, threshold=3, fast=True, debug=False):
    """Collect dictionary words within ``threshold`` edits of ``wordIn``.

    ``word_dicts`` is a ``(dict_words, words_clean, words_freq)`` triple.
    ``wordIn`` is first converted with ``leven.transIn``.  If the converted
    word is itself in ``dict_words`` nothing is searched and an empty list
    is returned.  Otherwise every dictionary word whose Levenshtein
    distance is at most ``threshold`` yields a tuple
    ``(word, distance, number_of_edits, weight, 'xxx', 'yyy')`` where the
    weight comes from ``weight_for_leven_edits``.

    Returns the tuples sorted by weight.  With ``fast`` set, an exact match
    (distance 0) ends the search early and returns what has been collected
    so far, sorted by frequency and then by integer weight.
    """
    import Levenshtein
    # out = difflib.get_close_matches('ἐστιν',words)
    (dict_words, words_clean, words_freq) = word_dicts
    # print "word in:"
    # print dump(wordIn)
    # wordIn = preprocess_word(wordIn)
    # print "word in pp:"
    # print dump(wordIn)
    wordInTrans = leven.transIn(wordIn)
    if (debug):
      print
      print "getCloseWords for", wordInTrans.encode('utf-8'), "(", wordIn.encode('utf-8'),")"
      dump(wordIn)
    output_words = []
    #dict_words_set = set(dict_words)
    # n counts the dictionary words visited; it is only referenced by the
    # commented-out debugging line inside the loop.
    n = 0
    # print "Now comparing to..."
    if wordInTrans in dict_words:
        # The word is already a dictionary word: no candidates are produced.
        pass
    #    print "short-circuting dictionary word"
    #    output_words.append((wordInTrans,0,0,0,'xxx','yyy'))
    else:
      for word in dict_words:
          # print u"*****" + words_clean[n]
          # print "word into comparison:"
          # print dump(word)
          lev_distance = Levenshtein.distance(
              wordInTrans, word)  # difflib.SequenceMatcher(None, word, wordInTrans).ratio()
          # print "distance: ",
          # print ratio
          if lev_distance <= threshold:
              # Only compute the (more expensive) edit list and its weight
              # for words that are already close enough.
              edits = Levenshtein.editops(wordInTrans, word)
              w = weight_for_leven_edits(wordInTrans, word, edits, rules, max_weight, debug=False)
              output_words.append(
                  (word, lev_distance, len(edits), w, 'xxx', 'yyy'))
              if (lev_distance == 0) and (fast == True):
                  # In the case of an exact match, cut the search short
                  # We might have got some close matches ahead of time, so this
                  # will not create a complete list
                  # Two stable sorts: frequency first, then integer weight,
                  # so equal-weight entries stay ordered by frequency.
                  output_words = sorted(
                      output_words, key=lambda word: int(words_freq[word[0]]))
                  return sorted(output_words, key=lambda word: int(word[3]))
          n = n + 1
    # NOTE(review): this path sorts by the raw weight, while the early
    # return above sorts by int(weight) -- confirm the difference is intended.
    return sorted(output_words, key=lambda word: word[3])

コード例 #43

0

ファイルを表示

ファイル: align_test.py プロジェクト: timothyjamesbecker/tHUB_Tools

def test_edit_dist(x):
    """Rough timing comparison of edit-distance implementations.

    Runs ``jellyfish.levenshtein_distance`` and ``Levenshtein.editops``
    ``int(x)`` times each on a fixed pair of digit strings, then computes
    ``align.Align.edit_dist`` once on the same data given as integer
    lists, and prints the distances and elapsed times.

    x: number of repetitions for the timed loops (converted with int()).
    """
    s1 = '12012014321231200112211'
    s2 = '1300201231200112211'
    seq1 = [1,2,0,1,2,0,1,4,3,2,1,2,3,1,2,0,0,1,1,2,2,1,1]
    seq2 = [1,3,0,0,2,0,1,2,3,1,2,0,0,1,1,2,2,1,1]
    pos = np.asarray([[0,0],[0,1],   #0 and 1 are nn
                      [2,0],[2,1],   #2 and 3 are nn
                      [4,0],[4,1],   #4 and 5 are nn
                      [6,0],[6,1],   #6 and 7 are nn
                      [8,0],[8,1],   #8 and 9 are nn
                      [9,0],[9,1],   #10 and 11 are nn
                      [10,0],[10,1]],#12 and 13 are nn
                      dtype=float)

    #modify this to ensure it is a non-connected k-nn
    nn = distance.ann(pos,1)[1][:,1:]
    k = 0
    rp = 1
    # Per-operation cost functions handed to the aligner.
    w = {'M':lambda x:0,'I':lambda x:1,'D':lambda x:1,
         'S':lambda x:2, 'P':lambda x:0.5 }
    a = align.Align(w,rp,nn,k)

    u,v = 0,0
    # jellyfish timing; t0/t1 and u are measured but not reported below.
    t0 = time.time()
    for i in range(0,int(x)):
        u = jellyfish.levenshtein_distance(s1,s2)
    t1 = time.time()
    t2 = time.time()
    for i in range(0,int(x)):
        v = Levenshtein.editops(s1,s2)
    v = Levenshtein.distance(s1,s2)
    t3 = time.time()
    t4 = time.time()
    for i in range(0,int(x)):
        #v = a.edit_dist(seq1,seq2)
        #w = a.edit_graph(seq1,seq2)
        #w = a.levenshtein(seq1,seq2)
        w = 1
    w = a.edit_dist(seq1,seq2)
    t5 = time.time()
    #w = a.edit_dist(seq1,seq2)
    print('editdist  dist = %s'%v)
    print('seq edit  dist = %s'%w)
    print('editdist  runtime is %s seconds'%(t3-t2))
    # This line reports elapsed time, not a distance; the label used to
    # repeat 'seq edit  dist'.
    print('seq edit  runtime is %s seconds'%(t5-t4))

コード例 #44

0

ファイルを表示

ファイル: __init__.py プロジェクト: ecastrow/pl2mind

 def compile_channels(self):
     """
     Compiles the list of channels found.
     This will attempt to group channels by edit distance.

     Each channel in ``self.channels`` is compared (Levenshtein distance)
     with the names of the groups already in ``self.channel_groups``.  It
     joins the closest group when that distance is at most ``edit_thresh``;
     otherwise it starts a new group named after the channel with the
     "train_", "valid_" and "test_" substrings removed.  When a channel
     joins a group, the group is renamed to the text the two names share.

     Finally ``self.d["logs"][group][channel]`` is initialised to an empty
     list for every grouped channel and the group names are logged.
     """
     group_name_omits = ["train_", "valid_", "test_"]
     edit_thresh = 6
     for channel in self.channels:
         # Distance from this channel to every existing group name.
         edit_distances = dict((c, Levenshtein.distance(channel, c))
                           for c in self.channel_groups.keys())
         if len(edit_distances) == 0:
             # No groups yet: the first channel always starts one.
             group_name = channel
             for omit in group_name_omits:
                 group_name = group_name.replace(omit, "")
             self.channel_groups[group_name] = [channel]
         else:
             # Find the closest group; only groups within len(channel)
             # edits are considered, and ties go to the later-iterated one.
             group = None
             min_ed = len(channel)
             for c, d in edit_distances.iteritems():
                 if d <= min_ed:
                     min_ed = d
                     group = c
             if min_ed > edit_thresh or group is None:
                 # Nothing close enough: start a new group.
                 group_name = channel
                 for omit in group_name_omits:
                     group_name = group_name.replace(omit, "")
                 self.channel_groups[group_name] = [channel]
             else:
                 # Now we reduce the group to the minimum shared string
                 # mb = matching blocks (see Levenshtein docs).
                 mb =\
                     Levenshtein.matching_blocks(
                     Levenshtein.editops(channel, group), channel, group)
                 # x[1] and x[2] are used as start and length within group.
                 new_group = "".join([group[x[1]:x[1]+x[2]] for x in mb])
                 if new_group != group:
                     # Re-key the group under the shortened name.
                     self.channel_groups[new_group] =\
                         copy.deepcopy(self.channel_groups[group])
                     self.channel_groups.pop(group)
                 self.channel_groups[new_group].append(channel)
     for group, channels in self.channel_groups.iteritems():
         self.d["logs"][group] = {}
         for channel in channels:
             self.d["logs"][group][channel] = []
     self.logger.info("Channels: %r" % self.d["logs"].keys())
コード例 #45

0

ファイルを表示

ファイル: error_analysis.py プロジェクト: arunchaganty/django-corenlp

def print_error_analysis():
    """Print a visualization of every mispredicted evaluation example.

    Reads the 'eval' output of the configured run directory, keeps the
    instances whose prediction differs from the expected output, and
    prints each one (or a random sample of ``options.max_examples`` of
    them) together with the edit operations from gold to prediction.
    With ``options.html`` the output is wrapped in a minimal HTML page.
    """
    options = config.options(read=True)
    output = get_output(options.run_dir, 'eval')

    errors = []
    for inst, pred in zip(output.data, output.predictions):
        if inst['output'] != pred:
            errors.append((inst['input'], pred, inst['output']))

    n_errors = len(errors)
    if 0 < options.max_examples < n_errors:
        # Sample without replacement so no error is shown twice.
        indices = np.random.choice(np.arange(n_errors), size=options.max_examples, replace=False)
    else:
        indices = range(n_errors)

    if options.html:
        print('<!DOCTYPE html>')
        print('<html><head><title>Error analysis</title><meta charset="utf-8" /></head><body>')
    for i in indices:
        inp, pred, gold = [unicode(s).strip() for s in errors[i]]
        print_visualization(inp, pred, gold, lev.editops(gold, pred),
                            html=options.html)
    if options.html:
        print('</body></html>')

コード例 #46

0

ファイルを表示

ファイル: ext_cand_gen_kbe.py プロジェクト: fmacias64/deepdive_ocr_app

def editops(w1, w2):
    """Return a compact, cached string describing the edits from w1 to w2.

    Both arguments are UTF-8 encoded byte strings.  Every edit operation
    contributes one '&'-terminated item: the first letter of the operation
    name followed by the deleted character ('d'), the inserted character
    ('i'), or the old and new characters (any other operation).  Results
    are memoised in the module-level ``editops_dict``.

    Example of the underlying call:
        Levenshtein.editops('Iwentu', 'I-want')
        -> [('insert', 1, 1), ('replace', 2, 3), ('delete', 5, 6)]
    """
    cache_key = (w1, w2)
    if cache_key in editops_dict:
        return editops_dict[cache_key]

    # Work on decoded text so indices address characters, not bytes.
    uw1 = w1.decode('utf-8')
    uw2 = w2.decode('utf-8')

    pieces = []
    for opname, sub1, sub2 in Levenshtein.editops(uw1, uw2):
        tag = opname[0]
        if opname == 'delete':
            pieces.append(tag + uw1[sub1].encode('utf-8') + '&')
        elif opname == 'insert':
            pieces.append(tag + uw2[sub2].encode('utf-8') + '&')
        else:
            pieces.append(tag + uw1[sub1].encode('utf-8') + uw2[sub2].encode('utf-8') + '&')
    ops_str = ''.join(pieces)

    editops_dict[cache_key] = ops_str
    return ops_str

コード例 #47

0

ファイルを表示

ファイル: substring_parser.py プロジェクト: notnami/signify

def get_parts(string1, string2):
    """Return the substrings shared by both strings, with their positions.

    The result is a list of ``(substring, pos1, pos2)`` tuples, one per
    non-empty matching block, where ``pos1`` / ``pos2`` are the block's
    start indices passed through ``get_index_distance`` with the length of
    the respective string.  An empty list is returned when the only
    matching block is the single terminating one.
    """
    length1, length2 = len(string1), len(string2)
    equal_blocks = lev.matching_blocks(
        lev.editops(string1, string2), length1, length2)
    get_distance1 = functools.partial(get_index_distance, length=length1)
    get_distance2 = functools.partial(get_index_distance, length=length2)

    # there is always one zero-length 'matching block' at the end
    if len(equal_blocks) <= 1:
        return []

    # Keep the text of each block plus where it came from in both strings.
    equal_parts = []
    for index1, index2, block_length in equal_blocks:
        if not block_length:
            continue
        equal_parts.append((string1[index1:index1 + block_length],
                            get_distance1(index1), get_distance2(index2)))
    return equal_parts

コード例 #48

0

ファイルを表示

ファイル: differential_tone_coding.py プロジェクト: maslinych/daba

    def differential_encode (self, form_non_tonal, form_tonal, seperator = True) :
        """Encode the edits that turn the non-tonal form into the tonal form.

        Both forms are passed through ``reshaping(..., False)``.  An empty
        source returns ``[u"", [token_seperator]]`` (or ``[u"", []]`` when
        ``seperator`` is false).  Otherwise the source is split with
        ``chunking`` and the Levenshtein edit operations from source to
        destination are replayed through ``self.insert`` / ``self.delete``,
        which read the current positions from ``self.p_src`` / ``self.p_dst``.

        NOTE(review): this snippet ends before the function does; the
        return value of the non-empty path is not visible here.
        """

        # Reset the cursor positions used by self.insert() / self.delete().
        self.p_src = -1
        self.p_dst = -1

        self.src = reshaping(form_non_tonal, False)

        if not self.src :
            if seperator:
                return [u"", [token_seperator]]
            else :
                return [u"", []]

        self.chunks = chunking(self.src)
        # One (initially empty) code string per chunk of the source.
        self.ret = [u"" for i in range(len(self.chunks))]

        self.dst = reshaping(form_tonal, False)
        ops = Levenshtein.editops(self.src, self.dst)
        # Bookkeeping: form frequencies and the tonal forms seen per source.
        self.stat.form_non_tonal[self.src] += 1
        self.stat.form_tonal    [self.dst] += 1
        self.stat.dict_form_tonal.setdefault(self.src, []).append(self.dst)

        for op in ops :

            mode, self.p_src, self.p_dst = op
            if mode == "delete" :
                self.delete()

            elif mode == "insert" :
                self.insert()

            else : # mode == "replace"
                # A replacement is encoded as an insertion plus a deletion.
                self.insert()
                self.delete()

        # remove the code separator at the end of the chunk
        tmp = []
                for ret2 in self.ret :

コード例 #49

0

ファイルを表示

ファイル: FeatureGenerator0517.py プロジェクト: LiJiefei/kddcup2013_track1

 def get_str_simis(self, str1, str2):
     """Return [Jaro similarity, Levenshtein ratio, number of edit ops]."""
     jaro_score = Levenshtein.jaro(str1, str2)
     ratio_score = Levenshtein.ratio(str1, str2)
     edit_count = len(Levenshtein.editops(str1, str2))
     return [jaro_score, ratio_score, edit_count]

コード例 #50

0

ファイルを表示

ファイル: casoffinder.target_site.edit_distance.bulge.weight.py プロジェクト: ibscge/digenome-toolkit2

# Build the gene -> target sequence lookup from the already opened target
# file (first column: gene name, second column: sequence).
for line in target_f:
    units = line.split()
    gene = units[0]
    seq = units[1]
    target_dic[gene] = seq

# For every gene, read ./<gene>.txt and write ./result_<gene>.txt with three
# extra columns: the edit distance, and the distance plus 2x / 4x the number
# of delete operations (the 'bulge:1', '3' and '5' header columns).
for each in target_dic:
    each_file = './' + each + '.txt'
    # Context managers close both files even if a line fails to parse.
    with open(each_file, 'r') as each_f, \
            open('./result_' + each + '.txt', 'w') as each_out:
        header = each_f.readline()
        each_out.write(header.strip('\n') + '\t' + 'bulge:1' + '\t' + '3' + '\t' + '5' + '\n')
        for line in each_f:
            units = line.split()
            target_seq = units[4]
            value = {}
            for nuc in "ATGC":
                # str.replace returns a new string.  The result used to be
                # discarded, so all four candidates were the same sequence
                # with its 'N' characters still in place.
                candidate = target_seq.replace('N', nuc)
                value[nuc] = (l.distance(candidate, target_dic[each]),
                              l.editops(candidate, target_dic[each]))
            # NOTE(review): this keeps the substitution with the LARGEST
            # distance; confirm that max (not min) is the intended choice.
            max_dist, max_editops = value[max(value, key=value.get)]
            step = [op_name for op_name, _src, _dst in max_editops]
            deletion = step.count('delete')

            each_out.write(line.strip() + '\t' + str(max_dist) + '\t' + str(max_dist + deletion * 2) + '\t' + str(max_dist + deletion * 4) + '\n')

コード例 #51

0

ファイルを表示

ファイル: CharacterAligner.py プロジェクト: jcavalieri8619/OCRerror_correct

def alignChars( source, target, ErrStats = None, ErrStats_lock = None ):
    """
    alignChars takes a pair of words from parallel corpora that have been word aligned.
    Errors introduced by the noisy channel (OCR) are revealed by finding the sequence of edit operations that
    map source to target using Levenshtein Edit Distance module.  The edit sequence is
    used to pad both words with '' gaps so that they line up character by character.

    @param source: original word from corrected corpora
    @type source: str
    @param target: OCR output word from uncorrected corpora
    @type target: str
    @param ErrStats: optional statistics collector; when truthy, its
            updateDistribution method is called with length and edit counts
    @param ErrStats_lock: optional context manager held while ErrStats is
            updated; may be None, in which case no lock is taken
    @return: character aligned pairs as a list of (target_char, source_char)
            tuples, e.g. [(t_1,s_1),...,(t_n,s_n)]; a gap is ''
    @rtype: list
    """

    editops = Levenshtein.editops( source, target )

    sourceArray = list( source )
    targetArray = list( target )

    substituteCount = 0
    insertCount = 0
    deleteCount = 0

    for op, spos, tpos in editops:
        # Both arrays grow as gaps are added, so the raw editops positions
        # must be shifted into the padded coordinate system.  The aligned
        # column of an operation is tpos plus the gaps already put into
        # targetArray, which equals spos plus the gaps already put into
        # sourceArray.  Using the unshifted index misplaced gaps whenever
        # inserts and deletes were mixed.
        if op == 'insert':
            sourceArray.insert( tpos + deleteCount, '' )
            insertCount += 1
        elif op == 'delete':
            targetArray.insert( spos + insertCount, '' )
            deleteCount += 1
        elif op == 'replace':
            substituteCount += 1

    if ErrStats:
        counts = ( insertCount, deleteCount, substituteCount )
        if ErrStats_lock is None:
            # No lock supplied (the default): update without locking
            # instead of failing on "with None".
            _recordAlignStats( ErrStats, source, target, len( editops ), counts )
        else:
            with ErrStats_lock:
                _recordAlignStats( ErrStats, source, target, len( editops ), counts )

    return list( zip( targetArray, sourceArray ) )


def _recordAlignStats( ErrStats, source, target, editDist, counts ):
    """Feed the length and edit-count distributions of one word pair to ErrStats."""
    insertCount, deleteCount, substituteCount = counts
    correctLen = len( source )
    errorLen = len( target )

    ErrStats.updateDistribution( 'editDist_correctLen', editDist, correctLen )
    ErrStats.updateDistribution( 'editDist_errorLen', editDist, errorLen )
    ErrStats.updateDistribution( 'errorLen_correctLen', errorLen, correctLen )
    ErrStats.updateDistribution( 'errorLen_editDist', errorLen, editDist )
    ErrStats.updateDistribution( 'errorLen_editOps', errorLen,
                                 (insertCount, deleteCount, substituteCount) )
    ErrStats.updateDistribution( 'errorLen_insertOp', errorLen, insertCount )
    ErrStats.updateDistribution( 'errorLen_deleteOp', errorLen, deleteCount )
    ErrStats.updateDistribution( 'errorLen_substituteOp', errorLen, substituteCount )
    ErrStats.updateDistribution( 'correctLen_editDist', correctLen, editDist )
    ErrStats.updateDistribution( 'errorLens', errorLen )
    ErrStats.updateDistribution( 'correctLens', correctLen )
    ErrStats.updateDistribution( 'editDists', editDist )
    ErrStats.updateDistribution( 'insertEdits', insertCount )
    ErrStats.updateDistribution( 'deleteEdits', deleteCount )
    ErrStats.updateDistribution( 'substituteEdits', substituteCount )