def get_probability_map_w_given_c(mistakes_file):
    """Build the empirical distribution of edit distances found in a file.

    Every non-blank line of *mistakes_file* is split on ``;``.  The first two
    fields are decoded as UTF-8 and handed to ``levenshtein``; the function
    then counts how often each distance occurred and normalises the counts
    by the number of pairs read.

    Args:
        mistakes_file: Path to a text file holding one ``first;second`` pair
            per line (presumably a misspelling and its correction -- confirm
            against the data file).

    Returns:
        A dict mapping each observed distance to its relative frequency
        (float).  The dict is empty when the file contains no pairs.

    Raises:
        IndexError: If a non-blank line does not contain a ``;`` separator.
    """
    # Read everything up front inside a context manager so the handle is
    # closed deterministically.  Blank lines (e.g. a trailing newline at the
    # end of the file) carry no pair and are skipped.
    with open(mistakes_file) as handle:
        pairs = [row.strip().split(';') for row in handle if row.strip()]

    counters = {}
    for pair in pairs:
        dist = levenshtein(unicode(pair[0], "utf-8"), unicode(pair[1], "utf-8"))
        # Float increments keep the later division a true division on
        # Python 2.
        counters[dist] = counters.get(dist, 0.) + 1.

    total = len(pairs)
    return {dist: count / total for dist, count in counters.items()}
def get_probability_w_given_c(word, dictionary):
    """Return the distribution of distances from *word* to dictionary entries.

    The dictionary file is read line by line, each line is stripped, and
    duplicate entries are collapsed.  For every distinct entry the distance
    ``levenshtein(entry, word)`` is computed; the result maps each distance
    to the fraction of distinct entries lying at that distance.

    Args:
        word: The word to compare against; passed unchanged as the second
            argument of ``levenshtein`` (the entries themselves are decoded
            from UTF-8 first).
        dictionary: Path to a text file with one entry per line.

    Returns:
        A dict mapping distance to relative frequency (float).  Empty when
        the dictionary file has no lines.
    """
    with open(dictionary) as handle:
        # A set gives the same de-duplication as keying a dict by entry.
        entries = set(row.strip() for row in handle)

    counters = {}
    for entry in entries:
        dist = levenshtein(unicode(entry, "utf-8"), word)
        # Float increments keep the later division a true division on
        # Python 2.
        counters[dist] = counters.get(dist, 0.) + 1.

    total = len(entries)
    return {dist: count / total for dist, count in counters.items()}
def bayes(word, corpus, dictionary, verbose=False): probability_w_given_c = get_probability_map_w_given_c('test/bledy.txt') probability_function_c = get_probability_function_c(corpus, dictionary) lines = [line.strip() for line in open(dictionary)] fixes = {} for fix_word in lines: levenshtein_distance = levenshtein(unicode(fix_word, "utf-8"), word) probability_w = probability_w_given_c[ levenshtein_distance] if levenshtein_distance in probability_w_given_c else 0 probability_c = probability_function_c(fix_word) fixes[fix_word] = probability_w**5 * probability_c if verbose: for line, score in nlargest(20, fixes.items(), key=itemgetter(1)): print line, " : ", score return max(fixes.iteritems(), key=itemgetter(1))[0]
# -*- coding: utf-8 -*- #!/usr/bin/env python # from __future__ import print_function import numpy as np import os import sys from stats import levenshtein, cost_function from operator import itemgetter from heapq import nsmallest if __name__ == '__main__': if len(sys.argv) == 3: dictionary = sys.argv[1] word = unicode(sys.argv[2], "utf-8") lines = [line.strip() for line in open(dictionary)] lines_map = {} for line in lines: lines_map[line] = levenshtein(unicode(line, "utf-8"), word) for line, score in nsmallest(5, lines_map.items(), key=itemgetter(1)): print line, " : ", score else: print("python stats.py [dictionary] [word2]")
def test_special_cases(self):
    """Check the fractional values ``levenshtein`` returns for special pairs.

    The pairs differ by dropped diacritics, ``ch``/``h``, ``rz``/``ż`` and a
    swapped ``sz``/``zs``; the swap case is checked in both argument orders.
    """
    expected_by_pair = (
        ((u'żołnież', u'zolniez'), 0.75),
        ((u'chociaż', u'hociaż'), 0.25),
        ((u'chociarz', u'hociaż'), 0.75),
        ((u'piszę', u'pizsę'), 0.5),
        ((u'pizsę', u'piszę'), 0.5),
    )
    for (first, second), expected in expected_by_pair:
        self.assertEqual(levenshtein(first, second), expected)
def test_levenshtein(self):
    """Check ``levenshtein`` on a plain ASCII pair and on ``ż``/``rz`` pairs.

    The ``ż``/``rz`` pair is checked in both argument orders and must give
    the same value each way.
    """
    expected_by_pair = (
        ((u'abc', u'afcde'), 3),
        ((u'żołnież', u'rzołnież'), 0.5),
        ((u'rzołnież', u'żołnież'), 0.5),
    )
    for (first, second), expected in expected_by_pair:
        self.assertEqual(levenshtein(first, second), expected)