def edit_distance(self):
    """Average the token-level edit distance between references and outputs.

    For every key of ``self.gt`` the first entry of ``self.gt[key]`` and of
    ``self.gen[key]`` is split on whitespace and the two token lists are
    compared with ``EditDistance.compute``.  The operations reported by
    ``EditDistance.operations`` are tallied per label ('m', 'i', 'd', 'r' --
    presumably match / insert / delete / replace; confirm against
    ``EditDistance``).

    Returns:
        A 4-tuple ``(mean_dist, mean_norm_dist, op_count, op_count_norm)``:
        the mean raw distance, the mean distance divided by the longer
        sequence length, and two dicts holding the per-example mean count of
        each operation (raw and length-normalised).
    """
    scorer = EditDistance()
    labels = ('m', 'i', 'd', 'r')
    raw_ops = {label: 0 for label in labels}
    norm_ops = {label: 0 for label in labels}
    dist_sum = 0
    norm_dist_sum = 0

    for key in self.gt.keys():
        ref_tokens = self.gt[key][0].split()
        hyp_tokens = self.gen[key][0].split()
        # Clamp to 1 so two empty sequences do not divide by zero.
        longest = float(max(len(ref_tokens), len(hyp_tokens), 1))

        distance = scorer.compute(ref_tokens, hyp_tokens)
        dist_sum += distance
        norm_dist_sum += distance / longest

        for label in scorer.operations():
            raw_ops[label] += 1
            norm_ops[label] += 1.0 / longest

    # Clamp to 1 so an empty reference set yields zeros instead of an error.
    denominator = float(max(len(self.gt), 1))
    op_count = {label: raw_ops[label] / denominator for label in raw_ops}
    op_count_norm = {label: norm_ops[label] / denominator for label in raw_ops}
    return (dist_sum / denominator, norm_dist_sum / denominator,
            op_count, op_count_norm)
def exec_second(self, parole):
    """Time edit-distance and n-gram lookups for each word in ``parole``.

    For every word the Italian word list is scanned with the edit-distance
    matcher (keeping entries whose cost is below ``self.sogliaCosto``) and
    the pre-computed ``<numberOfGrams>_grams.txt`` file is scanned with the
    Jaccard matcher (keeping entries whose coefficient is above
    ``self.sogliaJaccard``).

    Args:
        parole: iterable of words to look up.

    Returns:
        ``[tempi, n_vicine_trovate]`` where ``tempi[k]`` is
        ``[time_edit, time_gram]`` and ``n_vicine_trovate[k]`` is
        ``[n_edit, n_gram]`` for the k-th word.
    """
    e = EditDistance()
    a = Ngram()
    tempi = []
    n_vicine_trovate = []

    for parola in parole:
        # --- edit distance -------------------------------------------------
        e_results = []
        with open('60000_parole_italiane.txt', 'r') as words:
            start = timer()
            for line in words:
                p = line.rstrip()
                _, op = e.edit_distance(parola, p)
                costo = e.op_sequence(op, len(parola) - 1, len(p) - 1, [])
                if costo < self.sogliaCosto:
                    e_results.append((p, costo))
            end = timer()
        time_edit = end - start
        n_edit = len(e_results)

        # --- n-grams -------------------------------------------------------
        # The word list is closed before this phase starts, and the Jaccard
        # score gets its own name instead of reusing the file-handle name.
        g_results = []
        b = a.ngram(parola, self.numberOfGrams)
        with open("%s_grams.txt" % self.numberOfGrams, 'r') as grams:
            start = timer()
            for line in grams:
                s = line.split(' -> ')
                p, g = s[0], s[1]
                score = a.jaccard(b, g)
                if score > self.sogliaJaccard:
                    g_results.append((p, score))
            end = timer()
        time_gram = end - start
        n_gram = len(g_results)

        tempi.append([time_edit, time_gram])
        n_vicine_trovate.append([n_edit, n_gram])

    return [tempi, n_vicine_trovate]
def main():
    """Print two strings and the edit distance between them.

    Two random strings are generated first and then immediately replaced by
    the fixed pair "bread" / "really", so the printed result is always for
    that pair.
    """
    first, second = gen_random_string(), gen_random_string()
    # Fixed override of the random inputs (kept exactly as in the original).
    first, second = "bread", "really"

    print("String 1: {}\tString 2:{}".format(first, second))
    distance = EditDistance(first, second).get_edit_dist()
    print("Edit Dist: {}".format(distance))
def search_candidates(self, input_string):
    """Collect substrings of ``input_string`` accepted by a leaf's ancestors.

    For every index stored on every leaf of ``self.leaves`` the method walks
    up the ``parent`` chain.  At each ancestor it cuts a window out of
    ``input_string`` (derived from the leaf/ancestor ``start``/``end``
    offsets) and accepts the ancestor when the edit distance between that
    window and ``parent.pattern`` is at most ``parent.error``.  If the
    initial window is rejected, the window is first widened on the left and
    shrunk back one character at a time, then widened on the right and
    shrunk back, before the candidate is given up.

    Accepted windows are stored in ``self.indicesDict`` keyed by their start
    offset; nothing is returned.

    NOTE(review): the statement nesting below was reconstructed from a
    single-line source -- confirm it against the original file.
    """
    editDistance = EditDistance()
    for leaf in self.leaves:
        for index in leaf.indices:
            i = leaf.start
            parent = leaf.parent
            cand = True  # stays True while every ancestor so far has matched
            p1 = -1
            # NOTE(review): if leaf.parent is None the loop body never runs,
            # so the final slice uses p1 == -1 and a p2 left over from an
            # earlier iteration (or unbound on the very first one) -- confirm
            # leaves always have a parent.
            while cand and parent != None:
                # Window of the ancestor pattern around this leaf occurrence.
                p1 = index - (i - parent.start)
                p2 = index + (parent.end - i) + 1
                # Clamp the window to the bounds of the input.
                if p1 < 0:
                    p1 = 0
                if p2 > len(input_string):
                    p2 = len(input_string)
                distance = editDistance.compute(input_string[p1:p2], parent.pattern)
                if distance <= parent.error:
                    # Ancestor accepted: continue one level up.
                    parent = parent.parent
                else:
                    # Retry with the left edge moved out by the allowed error,
                    # pulling it back in by one per failed attempt.
                    # NOTE(review): p1 can become negative here, in which
                    # case the slice counts from the end of the string.
                    p1 -= parent.error
                    counter = parent.error
                    withp1 = False
                    while counter != 0 and not withp1:
                        distance = editDistance.compute(
                            input_string[p1:p2], parent.pattern)
                        if distance <= parent.error:
                            parent = parent.parent
                            withp1 = True
                        else:
                            counter -= 1
                            p1 += 1
                    if not withp1:
                        # Left-side retries failed: do the same on the right.
                        p2 += parent.error
                        counter = parent.error
                        withp2 = False
                        while counter != 0 and not withp2:
                            distance = editDistance.compute(
                                input_string[p1:p2], parent.pattern)
                            if distance <= parent.error:
                                parent = parent.parent
                                withp2 = True
                            else:
                                counter -= 1
                                p2 -= 1
                        if not withp2:
                            # No window matched this ancestor: drop candidate.
                            cand = False
            if cand:
                # Record the last window examined, keyed by its start offset.
                self.indicesDict[p1] = input_string[p1:p2]
def exec_fifth(self):
    """Check whether each matcher recovers a word after it has been altered.

    Prompts for a word, passes it through ``self.storpia`` (presumably a
    garbling step, judging by the printed label) and then reports, for the
    edit-distance cost thresholds 1..5 and the Jaccard thresholds 0.5..0.9,
    which dictionary entries pass the threshold and whether the original
    word is among them.

    Every dictionary entry is scored once per matcher and the scored list is
    filtered for each threshold, because the score does not depend on the
    threshold.  All output goes to stdout; nothing is returned.

    Uses ``raw_input``, so it runs on Python 2 as written.  ``get`` is
    presumably ``operator.itemgetter`` -- verify against the file's imports.
    """
    e = EditDistance()
    a = Ngram()
    originale = raw_input("**** Inserisci parola --> ")
    parola = self.storpia(originale)
    # Single-argument print(...) emits the same text as the former print
    # statements on Python 2 and also parses on Python 3.
    print('%s %s' % ('**** Parola storpiata -->', parola))

    # --- edit distance -----------------------------------------------------
    print('----- EDIT DISTANCE')
    scored_words = []
    with open('60000_parole_italiane.txt', 'r') as words:
        for line in words:
            p = line.rstrip()
            _, op = e.edit_distance(parola, p)
            costo = e.op_sequence(op, len(parola) - 1, len(p) - 1, [])
            scored_words.append((p, costo))

    # Cost thresholds: 1, 2, 3, 4, 5.
    for c in range(1, 6):
        e_results = [hit for hit in scored_words if hit[1] < c]
        if any(originale in hit for hit in e_results):
            w = 'parola originale trovata!'
        else:
            w = 'parola originale non trovata!'
        print('%s %s %s %s' % (
            w,
            '(soglia costo %s, %s risultati)' % (c, len(e_results)),
            '-->',
            sorted(e_results, key=get(1))))

    # --- n-grams -----------------------------------------------------------
    print('----- NGRAM')
    b = a.ngram(parola, self.numberOfGrams)
    scored_grams = []
    with open("%s_grams.txt" % self.numberOfGrams, 'r') as grams:
        for line in grams:
            s = line.split(' -> ')
            p, g = s[0], s[1]
            scored_grams.append((p, a.jaccard(b, g)))

    # Jaccard thresholds: 0.5, 0.6, 0.7, 0.8, 0.9.
    for j in np.arange(0.5, 1.0, 0.1):
        g_results = [hit for hit in scored_grams if hit[1] > j]
        if any(originale in hit for hit in g_results):
            w = 'parola originale trovata!'
        else:
            w = 'parola originale non trovata!'
        print('%s %s %s %s' % (
            w,
            '(jaccard %s, %s risultati)' % (j, len(g_results)),
            '-->',
            sorted(g_results, key=get(1), reverse=True)))
def exec_third(self):
    """Count matches per threshold for a user-supplied word.

    Prompts for a word, then counts how many dictionary entries have an
    edit-distance cost below each threshold 1..5 and how many entries of the
    ``<numberOfGrams>_grams.txt`` file have a Jaccard coefficient above each
    threshold 0.5..0.9.

    Every entry is scored once per matcher and the scores are filtered for
    each threshold, because the score does not depend on the threshold.

    Returns:
        ``[costi, coefficienti, risultati_edit, risultati_gram]``: the cost
        thresholds, the Jaccard thresholds, and the match count for each.

    Uses ``raw_input``, so it runs on Python 2 as written.
    """
    e = EditDistance()
    a = Ngram()
    parola = raw_input("**** Inserisci parola --> ")

    # --- edit distance -----------------------------------------------------
    edit_costs = []
    with open('60000_parole_italiane.txt', 'r') as words:
        for line in words:
            p = line.rstrip()
            _, op = e.edit_distance(parola, p)
            edit_costs.append(
                e.op_sequence(op, len(parola) - 1, len(p) - 1, []))

    # Cost thresholds: 1, 2, 3, 4, 5.
    costi = list(range(1, 6))
    risultati_edit = [
        sum(1 for costo in edit_costs if costo < c) for c in costi
    ]

    # --- n-grams -----------------------------------------------------------
    b = a.ngram(parola, self.numberOfGrams)
    jaccard_scores = []
    with open("%s_grams.txt" % self.numberOfGrams, 'r') as grams:
        for line in grams:
            s = line.split(' -> ')
            g = s[1]
            jaccard_scores.append(a.jaccard(b, g))

    # Jaccard thresholds: 0.5, 0.6, 0.7, 0.8, 0.9.
    coefficienti = list(np.arange(0.5, 1.0, 0.1))
    risultati_gram = [
        sum(1 for score in jaccard_scores if score > j) for j in coefficienti
    ]

    return [costi, coefficienti, risultati_edit, risultati_gram]
def exec_first(self):
    """Time how long each matcher takes to reach a randomly chosen word.

    A random entry is drawn from the Italian word list.  The list is then
    scanned with the edit-distance matcher, and the
    ``<numberOfGrams>_grams.txt`` file with the Jaccard matcher, each scan
    stopping as soon as the chosen word itself is encountered.

    Returns:
        ``[word, time_edit, time_ngram]``.

    Raises:
        IndexError: if the word list is empty.
    """
    e = EditDistance()
    a = Ngram()
    with open('60000_parole_italiane.txt', 'r') as f:
        lines = f.readlines()

    # random.randint(0, len(lines)) includes len(lines) itself and could
    # index one past the end; random.choice always picks a valid entry.
    word = random.choice(lines).rstrip()
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('%s %s' % ('random word -->', word))

    # --- edit distance -----------------------------------------------------
    start = timer()
    for line in lines:
        p = line.rstrip()
        if p == word:
            break
        _, op = e.edit_distance(word, p)
        _ = e.op_sequence(op, len(word) - 1, len(p) - 1, [])
    end = timer()
    time_edit = end - start

    # --- n-grams -----------------------------------------------------------
    b = a.ngram(word, self.numberOfGrams)
    with open("%s_grams.txt" % self.numberOfGrams, 'r') as r:
        start = timer()
        for line in r:
            s = line.split(' -> ')
            p, g = s[0], s[1]
            if p == word:
                break
            _ = a.jaccard(b, g)
        end = timer()
    time_ngram = end - start

    return [word, time_edit, time_ngram]
class Candidate():
    """Gather correction candidates for a word from three generators."""

    def __init__(self, dictionary):
        """Build the generators; ``dictionary`` is handed to ``EditDistance``
        and ``Teencode``."""
        self.edit_distance = EditDistance(dictionary)
        self.telex = Telex()
        self.teencode = Teencode(dictionary)

    def generate_candidate(self, word):
        """Return the candidate list for ``word``.

        Order of the result: the ``uni2telex`` form of ``word`` (only when it
        differs from ``word``), then the ``candidates_e1`` results, then the
        acronym candidates, then the teen-code candidates.
        """
        edits = list(self.edit_distance.candidates_e1(word))
        telex_form = self.telex.uni2telex(word)
        # The Telex form goes first, but only if it actually changed the word.
        head = [] if telex_form == word else [telex_form]
        acronyms = self.teencode.candidate_acronym(word)
        teen_forms = self.teencode.candidate_teen(word)
        return head + edits + list(acronyms) + list(teen_forms)
def __init__(self, dictionary):
    """Create the three candidate generators.

    ``dictionary`` is passed to ``EditDistance`` and ``Teencode``; ``Telex``
    takes no arguments.
    """
    edit_model = EditDistance(dictionary)
    telex_codec = Telex()
    teencode_model = Teencode(dictionary)
    self.edit_distance, self.telex, self.teencode = (
        edit_model, telex_codec, teencode_model)
def test_case_1(self):
    # Equal-length words that differ in one position need one change.
    changes = EditDistance().calculateNumberOfChanges('abcd', 'abdd')
    self.assertEqual(changes, 1)
def test_case_6(self):
    # Long words of different length sharing the "phenylhydraz" stretch.
    source, target = 'dinitrophenylhydrazine', 'benzalphenylhydrazone'
    changes = EditDistance().calculateNumberOfChanges(source, target)
    self.assertEqual(changes, 7)
def test_case_5(self):
    # The "intention" / "execution" pair is expected to need five changes.
    source, target = 'intention', 'execution'
    changes = EditDistance().calculateNumberOfChanges(source, target)
    self.assertEqual(changes, 5)