def bigram_score(w1, w2, unigrams, bigrams):
    """Log2 score of the bigram (w1, w2) with unigram backoff.

    If the bigram was observed, score (1 - ALPHA) * c(w1, w2) / c(w1);
    otherwise back off to ALPHA * c(w2) / total, and fall through to a
    large negative sentinel when w2 is unseen as well.
    `unigrams`/`bigrams` are (table, total) pairs as used elsewhere in
    this file -- presumably produced by grams.load_grams; confirm.
    """
    first_count = grams.find_ngram([w1], *unigrams[0])
    second_count = grams.find_ngram([w2], *unigrams[0])
    pair_count = grams.find_ngram([w1, w2], *bigrams[0])
    if pair_count:
        return math.log2((1 - ALPHA) * pair_count / first_count)
    if second_count:
        return math.log2(ALPHA * second_count / unigrams[1])
    return -1000000000
def possible_typos(word):
    """Collect dictionary words that could be typos of *word*.

    Returns (word, candidates) where candidates is a tuple of unique
    (typo, count) pairs sorted by descending unigram count, restricted
    to typos whose count does not exceed the count of *word* itself.
    Returns ('', []) when *word* is not in DICTIONARY or when fewer
    than two candidates remain.
    """
    if word not in DICTIONARY:
        return ('', [])
    # Hoisted out of the loop: the unigram count of the original word
    # is loop-invariant (the original recomputed it per candidate).
    word_count = grams.find_ngram([word], *grams1)
    candidates = set()
    for typo, _ in generate_typos(word, 1):
        if typo not in DICTIONARY:
            continue
        typo_count = grams.find_ngram([typo], *grams1)
        # NOTE(review): presumably a candidate more frequent than the
        # word itself is the intended spelling, not a typo -- confirm.
        if typo_count <= word_count:
            candidates.add((typo, typo_count))
    ordered = tuple(sorted(candidates, key=lambda item: -item[1]))
    if len(ordered) < 2:
        return ('', [])
    return (word, ordered)
# NOTE(review): this chunk begins mid-loop -- the `for` that binds a, b, i,
# word, typo and count is not visible here, so the leading branch chain is
# reproduced verbatim at an assumed indentation.  It appears to classify the
# first mismatching character pair between `word` and `typo` -- confirm.
        if a in ALT and ALT[a] == b:
            break  # presumably an accepted alternate-character pair -- confirm
        elif a in REVALT and REVALT[a] == b:
            break
        elif i + 1 < len(word) and (a, word[i + 1]) == (typo[i + 1], b):
            # Adjacent characters swapped: tally this transposition pair.
            trans_ab[(a, b)] += count
            break
        else:
            break

# --- top-level script: estimate per-pair transposition rates ---------------

grams1 = grams.load_grams('../1grams_cleaned', 1)

# paired[(x, y)] accumulates the corpus count of every adjacent character
# pair (x, y) over dictionary words.
paired = defaultdict(lambda: 0)
for word in grams1[0]:
    if word not in DICTIONARY:
        continue
    count = grams.find_ngram([word], *grams1)
    for i in range(len(word)):
        if i + 1 < len(word):
            paired[(word[i], word[i + 1])] += count

# Normalise each transposition count into a rate: transposed occurrences
# over (transposed + correctly ordered) occurrences of the pair.
# (Rewriting existing keys while iterating .items() is safe: no resize.)
for (key, value) in trans_ab.items():
    trans_ab[key] = value / (value + paired[key])

# Report pairs, most transposition-prone first.
for (key, value) in sorted(trans_ab.items(), key=lambda x: -x[1]):
    print(''.join(key) + ':', value)
def unigram_score(word, unigrams):
    """Log2 relative frequency of *word*; large negative sentinel if unseen.

    `unigrams` is a (table, total) pair as used elsewhere in this file.
    """
    count = grams.find_ngram([word], *unigrams[0])
    if count:
        return math.log2(count / unigrams[1])
    return -1000000000
# NOTE(review): this chunk begins mid-function -- `match`, `cache`, `words`,
# `sentence` and `done` are bound by an enclosing definition (apparently the
# generator process_sentence) that is not visible here; indentation assumed.
    # Lazily populate the cache of words ending with the suffix `match`.
    if not match in cache:
        cache[match] = []
        for word in words:
            if word.endswith(match):
                cache[match].append(word)
    # Recurse with each candidate word appended to the chosen prefix.
    for word in cache[match]:
        yield from process_sentence(sentence, done + [word])


if __name__ == '__main__':
    # Score sentences read from stdin, one per line, forever
    # (terminates only via EOFError / interrupt).
    while True:
        sentence = input('').lower().split()
        result = 0
        # Slide a WINDOW-sized window across the sentence and average the
        # negative log scores of the permutations each window produces.
        for pos in range(len(sentence) - WINDOW + 1):
            res = 0
            count = 0
            # NOTE(review): if process_sentence yields nothing, count stays 0
            # and `res / count` below raises ZeroDivisionError -- confirm
            # the inputs guarantee at least one permutation.
            for perm in process_sentence(sentence[pos:pos + WINDOW]):
                count += 1
                # Add-one-smoothed conditional estimate:
                # -log((c(perm) + 1) / (c(perm[:1]) + |words|)).
                res -= math.log(
                    grams.find_ngram(perm, words_index, words, words_position,
                                     connections_index, connections) + 1) - math.log(
                    grams.find_ngram(perm[:1], words_index, words, words_position,
                                     connections_index, connections) + len(words))
            result += res / count
        print(int(result * 1000000), ' '.join(sentence))
# NOTE(review): near-duplicate of an earlier chunk in this file (only the
# spacing of `i+1` differs) -- likely two revisions of the same script were
# concatenated; consider deleting one.  This chunk also begins mid-loop: the
# `for` that binds a, b, i, word, typo and count is not visible here, so the
# leading branch chain is reproduced verbatim at an assumed indentation.
        if a in ALT and ALT[a] == b:
            break  # presumably an accepted alternate-character pair -- confirm
        elif a in REVALT and REVALT[a] == b:
            break
        elif i + 1 < len(word) and (a, word[i+1]) == (typo[i+1], b):
            # Adjacent characters swapped: tally this transposition pair.
            trans_ab[(a, b)] += count
            break
        else:
            break

# --- top-level script: estimate per-pair transposition rates ---------------

grams1 = grams.load_grams('../1grams_cleaned', 1)

# paired[(x, y)] accumulates the corpus count of every adjacent character
# pair (x, y) over dictionary words.
paired = defaultdict(lambda: 0)
for word in grams1[0]:
    if word not in DICTIONARY:
        continue
    count = grams.find_ngram([word], *grams1)
    for i in range(len(word)):
        if i + 1 < len(word):
            paired[(word[i], word[i+1])] += count

# Normalise each transposition count into a rate: transposed occurrences
# over (transposed + correctly ordered) occurrences of the pair.
# (Rewriting existing keys while iterating .items() is safe: no resize.)
for (key, value) in trans_ab.items():
    trans_ab[key] = value / (value + paired[key])

# Report pairs, most transposition-prone first.
for (key, value) in sorted(trans_ab.items(), key=lambda x: -x[1]):
    print(''.join(key) + ':', value)