def train(instances, subreddits, algorithm, cluster_lambda, clustering_training_iterations):
    """Construct and train the clustering model selected by `algorithm`.

    Args:
        instances: Training instances handed to the model constructor
            (and, for LambdaMeans, to its `train` call as well).
        subreddits: Subreddit labels; used only by the "lambda_means" path.
        algorithm: Model selector — "lev" or "lambda_means".
        cluster_lambda: Lambda hyperparameter passed to either model.
        clustering_training_iterations: Iteration budget for LambdaMeans.

    Returns:
        The trained model object.

    Raises:
        ValueError: If `algorithm` is not one of the recognized names.
            (Previously an unknown algorithm silently returned None;
            failing fast surfaces the misconfiguration at training time.)
    """
    if algorithm == "lev":
        alg = Levenshtein(instances, cluster_lambda)
        alg.train()
        return alg
    elif algorithm == "lambda_means":
        alg = LambdaMeans(instances, subreddits, cluster_lambda, clustering_training_iterations)
        # NOTE: unlike the Levenshtein path, LambdaMeans.train takes the
        # instances again — presumably its constructor only stores config;
        # confirm against the LambdaMeans implementation.
        alg.train(instances)
        return alg
    raise ValueError("unknown algorithm: " + str(algorithm))
def test_wer(self):
    """Word-error-rate output printed by Levenshtein for known pairs."""
    cases = {
        ("foo", "bar"): 1.0,
        ("foo bar", "foo baz"): 1 / 2,
        ("foo foo", "bar baz"): 1.0,
        ("", ""): 0.0,
    }
    for (first, second), expected_wer in cases.items():
        sink = StringIO()
        # distance/alignment printing off, WER printing on
        Levenshtein(first, second, " ", False, False, True, out=sink)
        self.assertEqual("WER: " + str(expected_wer) + '\n', sink.getvalue())
def test_example(self):
    """Minimum-edit-distance output printed by Levenshtein for known pairs."""
    cases = {
        ("", ""): 0,
        ("a a a", "a a a"): 0,
        ("a b", "a a a"): 1,
        ("a b c a", "a a a"): 1,
        ("foo", "bar"): 6,
        ("foo", "fooo"): 1,
    }
    for (first, second), expected_distance in cases.items():
        sink = StringIO()
        # distance printing on, alignment/WER printing off
        Levenshtein(first, second, " ", True, False, False, out=sink)
        self.assertEqual("Minimum edit distance: " + str(expected_distance) + '\n', sink.getvalue())
def align_hyp(self, ref, hyp):
    """Align `hyp` against `ref` via Levenshtein edit operations.

    Returns a list with one entry per aligned output position: the
    hypothesis token where the sequences agree (KEEP), and None where
    an edit (SUB or INS) occurred. DEL positions produce no entry.

    NOTE(review): the DEL branch advances hyp_idx and the INS branch
    advances ref_idx — the opposite of what the op names suggest. The
    guard asserts are consistent with that (INS may run past hyp, DEL
    past ref), which implies Levenshtein.editops() describes edits
    transforming hyp into ref — confirm against the Levenshtein class.
    """
    match = []
    hyp_idx = 0
    ref_idx = 0
    lev = Levenshtein(ref, hyp)
    for i, op in enumerate(lev.editops()):
        # An index may only be exhausted when the current op does not
        # consume from that sequence.
        assert hyp_idx < len(hyp) or op == Levenshtein.INS
        assert ref_idx < len(ref) or op == Levenshtein.DEL
        if op == Levenshtein.KEEP:
            # Matching position: tokens must be equal; emit the token.
            assert hyp[hyp_idx] == ref[ref_idx]
            match.append(hyp[hyp_idx])
            hyp_idx += 1
            ref_idx += 1
        elif op == Levenshtein.SUB:
            # Substitution: both sides consumed, no usable token.
            match.append(None)
            hyp_idx += 1
            ref_idx += 1
        elif op == Levenshtein.DEL:
            # Consumes from hyp only; contributes nothing to the output.
            hyp_idx += 1
        else:
            # Only remaining legal op; consumes from ref only.
            assert op == Levenshtein.INS
            match.append(None)
            ref_idx += 1
    return match
#!/usr/bin/python3
"""Command-line front end: compute Levenshtein distance metrics for two strings."""
import argparse

from levenshtein import Levenshtein

parser = argparse.ArgumentParser(description="Find the Levenshtein distance between two strings.")
parser.add_argument("string1", help="First string.")
parser.add_argument("string2", help="Second string.")
# Bug fix: `const=' '` only applies when `-d` is given with no value; without
# `default=' '` an omitted flag produced None, contradicting the help text.
parser.add_argument("-d", "--delimiter", help="Word delimiter. Default value: space",
                    nargs='?', const=' ', default=' ')
parser.add_argument("-D", "--distance", help="Print edit distance.", action="store_true")
parser.add_argument("-A", "--alignment", help="Print alignment.", action="store_true")
parser.add_argument("-E", "--error", help="Print WER.", action="store_true")
args = parser.parse_args()

# The Levenshtein constructor performs the requested printing as a side effect.
Levenshtein(args.string1, args.string2, args.delimiter, args.distance, args.alignment, args.error)
            # NOTE(review): this is the tail of a generator whose `def` line
            # (and enclosing loop) lies above this view; the indentation here
            # is reconstructed and must be checked against the full file.
            continue
        if len(word) < 5:
            # Skip short words — presumably too short to be a meaningful
            # heading token; confirm the threshold's intent.
            continue
        yield word


def anonymize(words, token='<NAME>'):
    """Return a copy of `words` with any word in `wilhelm` or `jakob` replaced by `token`."""
    return [token if w in wilhelm or w in jakob else w for w in words]


def anonymize_letter(letter, token='<NAME>'):
    """Anonymize a letter's word list in place via `anonymize`."""
    letter.words = anonymize(letter.words, token=token)


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('path')
    args = parser.parse_args()
    # Load the corpus, collect heading words, and compute pairwise distances.
    letters = load_letters(bpath=args.path)
    words = set(headings(letters))
    dists = Levenshtein(*words)
    # Print words ordered by their distance to each of the two names.
    print("Wilhelm:\n")
    for w, _ in sorted(dists.dists_to('Wilhelm'), key=lambda x: x[1]):
        print("\t%s" % w)
    print()
    print("Jakob:\n")
    for w, _ in sorted(dists.dists_to('Jakob'), key=lambda x: x[1]):
        print("\t%s" % w)