Esempio n. 1
0
def get_probability_map_w_given_c(mistakes_file):
    """Build the relative frequency of each edit distance found in a file.

    Every non-blank line of ``mistakes_file`` is split on ``';'``. The first
    two fields are decoded from UTF-8 and compared with ``levenshtein``.

    Args:
        mistakes_file: Path to a text file whose lines hold at least two
            ``;``-separated fields.

    Returns:
        A dict mapping each observed distance to the fraction (a float) of
        processed lines that produced it. The dict is empty when the file
        holds no non-blank lines.

    Raises:
        IndexError: If a non-blank line contains no ``';'`` separator.
    """
    # The context manager closes the file deterministically; blank lines
    # (e.g. a trailing newline) are skipped because they carry no pair.
    with open(mistakes_file) as handle:
        pairs = [row.strip().split(';') for row in handle if row.strip()]

    levenshtein_counters = {}
    for pair in pairs:
        dist = levenshtein(unicode(pair[0], "utf-8"),
                           unicode(pair[1], "utf-8"))
        # Float increments keep the division below a true division on
        # Python 2, where int / int would truncate.
        levenshtein_counters[dist] = levenshtein_counters.get(dist, 0.) + 1.

    total = len(pairs)
    for dist in levenshtein_counters:
        levenshtein_counters[dist] /= total

    return levenshtein_counters
Esempio n. 2
0
def get_probability_w_given_c(word, dictionary):
    """Return the distribution of edit distances between a word and a word list.

    Args:
        word: Value passed as the second argument to ``levenshtein`` for
            every entry.
        dictionary: Path to a text file with one entry per line; each entry
            is decoded from UTF-8 before comparison.

    Returns:
        A dict mapping each distance to the fraction (a float) of distinct
        stripped lines that lie at that distance from ``word``. The dict is
        empty when the file has no lines.
    """
    # Read inside a context manager so the file handle is always closed.
    # Duplicate lines count once, hence the set.
    with open(dictionary) as handle:
        entries = set(row.strip() for row in handle)

    levenshtein_counters = {}
    for entry in entries:
        dist = levenshtein(unicode(entry, "utf-8"), word)
        # Float increments keep the division below a true division on
        # Python 2.
        levenshtein_counters[dist] = levenshtein_counters.get(dist, 0.) + 1.

    total = len(entries)
    for dist in levenshtein_counters:
        levenshtein_counters[dist] /= total

    return levenshtein_counters
Esempio n. 3
0
def bayes(word, corpus, dictionary, verbose=False):
    probability_w_given_c = get_probability_map_w_given_c('test/bledy.txt')
    probability_function_c = get_probability_function_c(corpus, dictionary)

    lines = [line.strip() for line in open(dictionary)]
    fixes = {}
    for fix_word in lines:
        levenshtein_distance = levenshtein(unicode(fix_word, "utf-8"), word)
        probability_w = probability_w_given_c[
            levenshtein_distance] if levenshtein_distance in probability_w_given_c else 0
        probability_c = probability_function_c(fix_word)
        fixes[fix_word] = probability_w**5 * probability_c

    if verbose:
        for line, score in nlargest(20, fixes.items(), key=itemgetter(1)):
            print line, " : ", score

    return max(fixes.iteritems(), key=itemgetter(1))[0]
Esempio n. 4
0
# -*- coding: utf-8 -*-
#!/usr/bin/env python

# from __future__ import print_function
import numpy as np
import os
import sys
from stats import levenshtein, cost_function
from operator import itemgetter
from heapq import nsmallest

if __name__ == '__main__':
    if len(sys.argv) == 3:
        dictionary = sys.argv[1]
        word = unicode(sys.argv[2], "utf-8")

        lines = [line.strip() for line in open(dictionary)]
        lines_map = {}

        for line in lines:
            lines_map[line] = levenshtein(unicode(line, "utf-8"), word)
        for line, score in nsmallest(5, lines_map.items(), key=itemgetter(1)):
            print line, " : ", score

    else:
        print("python stats.py [dictionary] [word2]")
Esempio n. 5
0
 def test_special_cases(self):
     """Pin the fractional levenshtein values expected for fixed word pairs."""
     # (first argument, second argument, expected result), checked in order.
     expectations = (
         (u'żołnież', u'zolniez', 0.75),
         (u'chociaż', u'hociaż', 0.25),
         (u'chociarz', u'hociaż', 0.75),
         (u'piszę', u'pizsę', 0.5),
         (u'pizsę', u'piszę', 0.5),
     )
     for first, second, expected in expectations:
         self.assertEqual(levenshtein(first, second), expected)
Esempio n. 6
0
 def test_levenshtein(self):
     """Pin the levenshtein values expected for three fixed word pairs."""
     # (first argument, second argument, expected result), checked in order.
     expectations = (
         (u'abc', u'afcde', 3),
         (u'żołnież', u'rzołnież', 0.5),
         (u'rzołnież', u'żołnież', 0.5),
     )
     for first, second, expected in expectations:
         self.assertEqual(levenshtein(first, second), expected)