Example #1
0
def bigram_score(w1, w2, unigrams, bigrams):
    """Return a log2 score for ``w2`` following ``w1``, with unigram back-off.

    Args:
        w1: The preceding word.
        w2: The word being scored.
        unigrams: Indexable pair; ``unigrams[0]`` is unpacked as the extra
            positional arguments of ``grams.find_ngram`` and ``unigrams[1]``
            is the divisor for the back-off term (presumably the total
            unigram count -- confirm against the loader).
        bigrams: Indexable; ``bigrams[0]`` is unpacked as the extra
            positional arguments of ``grams.find_ngram`` for the pair lookup.

    Returns:
        ``log2((1 - ALPHA) * c12 / c1)`` when both the bigram count ``c12``
        and the count ``c1`` of ``w1`` are non-zero; otherwise
        ``log2(ALPHA * c2 / unigrams[1])`` where ``c2`` is the count of
        ``w2``; or ``-1000000000`` when ``w2`` has no count either.
    """
    c12 = grams.find_ngram([w1, w2], *bigrams[0])
    if c12:
        # Only needed on this branch, so look it up lazily.
        c1 = grams.find_ngram([w1], *unigrams[0])
        # A bigram can be present while its first word is missing from the
        # unigram table (the two tables are loaded independently); dividing
        # by that empty count would raise, so back off instead.
        if c1:
            return math.log2((1 - ALPHA) * c12 / c1)

    c2 = grams.find_ngram([w2], *unigrams[0])
    if not c2:
        # Sentinel "practically impossible" score; log2(0) is undefined.
        return -1000000000

    return math.log2(ALPHA * c2 / unigrams[1])
Example #2
0
def possible_typos(word):
    """Collect dictionary words that ``generate_typos`` derives from ``word``.

    A candidate is kept when it is in ``DICTIONARY`` and its count from
    ``grams.find_ngram`` does not exceed the count of ``word`` itself.

    Args:
        word: The word to generate candidates for.

    Returns:
        ``(word, candidates)`` where ``candidates`` is a tuple of distinct
        ``(typo, count)`` pairs ordered by descending count, ties broken
        alphabetically by ``typo``.  ``('', [])`` is returned when ``word``
        is not in ``DICTIONARY`` or fewer than two distinct candidates
        qualify.
    """
    if word not in DICTIONARY:
        return ('', [])

    # The reference count depends only on ``word``: look it up once instead
    # of once per candidate.
    word_count = grams.find_ngram([word], *grams1)

    candidates = set()
    for typo, _ in generate_typos(word, 1):
        if typo not in DICTIONARY:
            continue

        typo_count = grams.find_ngram([typo], *grams1)
        if typo_count <= word_count:
            candidates.add((typo, typo_count))

    if len(candidates) < 2:
        return ('', [])

    # Set iteration order varies between runs (string hash randomisation),
    # so sorting on the count alone would order equal-count candidates
    # arbitrarily.  The secondary key makes the output reproducible.
    ranked = sorted(candidates, key=lambda pair: (-pair[1], pair[0]))
    return (word, tuple(ranked))
Example #3
0
def possible_typos(word):
    """Collect dictionary words that ``generate_typos`` derives from ``word``.

    A candidate is kept when it is in ``DICTIONARY`` and its count from
    ``grams.find_ngram`` does not exceed the count of ``word`` itself.

    Args:
        word: The word to generate candidates for.

    Returns:
        ``(word, candidates)`` where ``candidates`` is a tuple of distinct
        ``(typo, count)`` pairs ordered by descending count, ties broken
        alphabetically by ``typo``.  ``('', [])`` is returned when ``word``
        is not in ``DICTIONARY`` or fewer than two distinct candidates
        qualify.
    """
    if word not in DICTIONARY:
        return ('', [])

    # The reference count depends only on ``word``: look it up once instead
    # of once per candidate.
    word_count = grams.find_ngram([word], *grams1)

    candidates = set()
    for typo, _ in generate_typos(word, 1):
        if typo not in DICTIONARY:
            continue

        typo_count = grams.find_ngram([typo], *grams1)
        if typo_count <= word_count:
            candidates.add((typo, typo_count))

    if len(candidates) < 2:
        return ('', [])

    # Set iteration order varies between runs (string hash randomisation),
    # so sorting on the count alone would order equal-count candidates
    # arbitrarily.  The secondary key makes the output reproducible.
    ranked = sorted(candidates, key=lambda pair: (-pair[1], pair[0]))
    return (word, tuple(ranked))
Example #4
0
                if a in ALT and ALT[a] == b:
                    break

                elif a in REVALT and REVALT[a] == b:
                    break

                elif i + 1 < len(word) and (a, word[i + 1]) == (typo[i + 1],
                                                                b):
                    trans_ab[(a, b)] += count
                    break

                else:
                    break

    grams1 = grams.load_grams('../1grams_cleaned', 1)
    paired = defaultdict(lambda: 0)
    for word in grams1[0]:
        if word not in DICTIONARY:
            continue

        count = grams.find_ngram([word], *grams1)
        for i in range(len(word)):
            if i + 1 < len(word):
                paired[(word[i], word[i + 1])] += count

    for (key, value) in trans_ab.items():
        trans_ab[key] = value / (value + paired[key])

    for (key, value) in sorted(trans_ab.items(), key=lambda x: -x[1]):
        print(''.join(key) + ':', value)
Example #5
0
def unigram_score(word, unigrams):
    """Return the log2 relative frequency of ``word``.

    Args:
        word: The word to score.
        unigrams: Indexable pair; ``unigrams[0]`` is unpacked as the extra
            positional arguments of ``grams.find_ngram`` and ``unigrams[1]``
            is the divisor applied to the word's count (presumably the
            total unigram count -- confirm against the loader).

    Returns:
        ``log2(count / unigrams[1])``, or ``-1000000000`` when the lookup
        yields a falsy count.
    """
    occurrences = grams.find_ngram([word], *unigrams[0])
    if occurrences:
        return math.log2(occurrences / unigrams[1])

    # Sentinel for words with no count; log2(0) is undefined.
    return -1000000000
Example #6
0
    if not match in cache:
        cache[match] = []
        for word in words:
            if word.endswith(match):
                cache[match].append(word)

    for word in cache[match]:
        yield from process_sentence(sentence, done + [word])


if __name__ == '__main__':
    # Interactive scorer: read one sentence per line from stdin and print an
    # integer score followed by the lower-cased, whitespace-normalised
    # sentence.
    while True:
        try:
            sentence = input('').lower().split()
        except EOFError:
            # stdin is exhausted (closed pipe / Ctrl-D): finish cleanly
            # instead of terminating with a traceback.
            break

        result = 0
        # Slide a WINDOW-token window over the sentence; a sentence shorter
        # than WINDOW produces no windows and scores 0.
        for pos in range(len(sentence) - WINDOW + 1):
            res = 0
            count = 0
            for perm in process_sentence(sentence[pos:pos + WINDOW]):
                count += 1
                sequence_count = grams.find_ngram(
                    perm, words_index, words, words_position,
                    connections_index, connections)
                head_count = grams.find_ngram(
                    perm[:1], words_index, words, words_position,
                    connections_index, connections)
                # Negative log of (sequence count + 1) over
                # (first-word count + len(words)); the additive terms keep
                # both logarithms defined when a count is zero.
                res -= (math.log(sequence_count + 1) -
                        math.log(head_count + len(words)))

            # Average over the candidates of this window.  A window for
            # which process_sentence yields nothing contributes nothing,
            # rather than dividing by zero.
            if count:
                result += res / count

        print(int(result * 1000000), ' '.join(sentence))
Example #7
0
                if a in ALT and ALT[a] == b:
                    break

                elif a in REVALT and REVALT[a] == b:
                    break

                elif i + 1 < len(word) and (a, word[i+1]) == (typo[i+1], b):
                    trans_ab[(a, b)] += count
                    break

                else:
                    break

    grams1 = grams.load_grams('../1grams_cleaned', 1)
    paired = defaultdict(lambda: 0)
    for word in grams1[0]:
        if word not in DICTIONARY:
            continue

        count = grams.find_ngram([word], *grams1)
        for i in range(len(word)):
            if i + 1 < len(word):
                paired[(word[i], word[i+1])] += count

    for (key, value) in trans_ab.items():
        trans_ab[key] = value / (value + paired[key])

    for (key, value) in sorted(trans_ab.items(), key=lambda x: -x[1]):
        print(''.join(key) + ':', value)