Example #1
import multiprocessing
import pickle

import grams

# ALPHABET, generate_letter_typos and possible_typos are defined elsewhere in
# the original module; this excerpt shows only the typo generator itself.
def generate_typos(word, distance, d=0):
    # Yield every candidate together with its edit distance from the input.
    yield (word, d)
    if d == distance:
        return

    # add letter (range includes len(word) so a letter can also be appended)
    for pos in range(len(word) + 1):
        for letter in ALPHABET:
            yield from generate_typos(word[:pos] + letter + word[pos:], distance, d + 1)

    # remove letter
    for pos in range(len(word)):
        yield from generate_typos(word[:pos] + word[pos + 1:], distance, d + 1)

    # change letter
    for pos in range(len(word)):
        for letter in generate_letter_typos(word[pos]):
            yield from generate_typos(word[:pos] + letter + word[pos + 1:], distance, d + 1)

    # swap letters
    for pos in range(len(word) - 1):
        yield from generate_typos(word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:], distance, d + 1)

if __name__ == '__main__':
    grams1 = grams.load_grams('../1grams_cleaned', 1)
    pool = multiprocessing.Pool()
    data = pool.map(possible_typos, grams1[0])
    with open('typos.dat', 'wb') as f:
        pickle.dump(data, f)
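
For intuition, the generator can be exercised with a toy alphabet; a minimal sketch, assuming stand-ins for the project-specific ALPHABET and generate_letter_typos:

ALPHABET = 'ab'                        # assumption: the real alphabet is larger

def generate_letter_typos(letter):
    # Assumption: stands in for the real per-letter confusion model.
    return [c for c in ALPHABET if c != letter]

# Each yielded pair is (candidate, edit_distance). Different edit paths can
# produce the same candidate, so map each one to its smallest distance:
unique = {}
for candidate, dist in generate_typos('ab', 1):
    unique[candidate] = min(dist, unique.get(candidate, dist))
# unique now contains e.g. 'ab' -> 0, 'ba' -> 1, 'a' -> 1, 'aab' -> 1, ...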
Example #2
    # Tail of generate_typos (see Example #1): the remaining edit operations.
    # remove letter
    for pos in range(len(word)):
        yield from generate_typos(word[:pos] + word[pos + 1:], distance, d + 1)

    # change letter
    for pos in range(len(word)):
        for letter in ALPHABET:
            if letter != word[pos]:
                yield from generate_typos(word[:pos] + letter + word[pos + 1:], distance, d + 1)

    # swap letters
    for pos in range(len(word) - 1):
        yield from generate_typos(word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:], distance, d + 1)

import multiprocessing
import sys

import grams

def fix_line(line):
    # fix_typos and load_dictionary come from the original project and are
    # not shown in this excerpt.
    return fix_typos(line.strip(), dictionary, unigrams, bigrams)

if __name__ == '__main__':
    #generate_dictionary('../slownik_do_literowek.txt')
    dictionary = load_dictionary('../slownik_do_literowek.dat')
    unigrams = [grams.load_grams('../1grams_min_cleaned', 1)]
    bigrams = [grams.load_grams('../2grams_min_cleaned', 2)]
    # Grand totals of the n-gram counts, kept alongside the tables.
    unigrams.append(sum(unigrams[0][3][0]))
    bigrams.append(sum(bigrams[0][3][0]))
    print('Loaded!')

    pool = multiprocessing.Pool()

    print('\n'.join(pool.map(fix_line, sys.stdin)))
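
One caveat worth noting: Pool.map reads all of sys.stdin into memory before any worker starts. A minimal sketch of a streaming alternative using the standard Pool.imap, which preserves input order:

if __name__ == '__main__':
    # ... load the dictionary and n-grams as above ...
    with multiprocessing.Pool() as pool:
        # imap yields results in input order without buffering every line first.
        for fixed in pool.imap(fix_line, sys.stdin, chunksize=64):
            print(fixed)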
Example #3
                # Excerpt begins inside a loop over aligned character pairs
                # (a, b) of a dictionary word and an observed typo; ALT and
                # REVALT are project-specific alternative-character maps.
                if a in ALT and ALT[a] == b:
                    break

                elif a in REVALT and REVALT[a] == b:
                    break

                # Count (a, b) as a transposition when the adjacent characters
                # are swapped between word and typo.
                elif i + 1 < len(word) and (a, word[i + 1]) == (typo[i + 1], b):
                    trans_ab[(a, b)] += count
                    break

                else:
                    break

    grams1 = grams.load_grams('../1grams_cleaned', 1)
    # Count how often each adjacent character pair occurs in correctly spelled
    # dictionary words, weighted by the word's unigram frequency.
    paired = defaultdict(int)
    for word in grams1[0]:
        if word not in DICTIONARY:
            continue

        count = grams.find_ngram([word], *grams1)
        for i in range(len(word) - 1):
            paired[(word[i], word[i + 1])] += count

    # Normalise: the fraction of the pair's total occurrences that were
    # counted as transpositions.
    for (key, value) in trans_ab.items():
        trans_ab[key] = value / (value + paired[key])

    for (key, value) in sorted(trans_ab.items(), key=lambda x: -x[1]):
        print(''.join(key) + ':', value)
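
The normalisation reads as a relative frequency; a sketch with invented numbers:

trans_count, dict_count = 5, 95   # invented counts, for illustration only
rate = trans_count / (trans_count + dict_count)
print(rate)                       # 0.05: the pair shows up swapped 5% of the time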
Example #4
from collections import defaultdict
import grams
import random
import re
import sys

WORD    = 1
TAG     = 2

WORDS = {}
WORDS2 = defaultdict(lambda: dict())
TAGS = defaultdict(lambda: list())

grams1 = grams.load_grams('../1grams_min_cleaned', 1)
grams2 = grams.load_grams('../2grams_min_cleaned', 2)
# WORDS maps each word to its unigram count.
for idx in range(len(grams1[3][0])):
    WORDS[grams1[0][grams1[3][1][idx]]] = grams1[3][0][idx]

# WORDS2 maps a first word to a dict of {second word: bigram count}.
for idx in range(len(grams2[3][0])):
    WORDS2[grams2[0][grams2[3][1][idx]]][grams2[0][grams2[3][2][idx]]] = grams2[3][0][idx]

# Keep only word characters and spaces in the surface form.
regex = re.compile(r'[^\w ]', re.UNICODE | re.IGNORECASE)
with open('../morfeuszTagsAndBasesForNKJP.txt', 'r') as f:
    for line in f:
        word, base, *tags = line.strip().split()
        word = regex.sub('', word).strip()
        if word not in WORDS:
            continue

        # Index words by each of their morphosyntactic tags.
        for tag in tags:
            TAGS[tag].append(word)
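
A minimal sketch of what the index supports, assuming only the structures built above (the tag strings themselves come from the Morfeusz file and are not enumerated here):

some_tag = next(iter(TAGS))   # assumption: at least one tag was read
print(some_tag, random.choice(TAGS[some_tag]))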
Example #5
#!/usr/bin/env python3
import grams
import random
import sys

words, words_index, words_position, connections, connections_index = grams.load_grams(
    '../2grams', 2)


def upper_bound(val, arr, s=0, e=None, key=lambda x: x):
    # Binary search: first index in arr[s:e] whose key is strictly greater
    # than val (the same contract as bisect.bisect_right).
    if e is None:
        e = len(arr)

    while s < e:
        mid = (s + e) // 2
        if val >= key(arr[mid]):
            s = mid + 1

        else:
            e = mid

    return s


def choose_simple(i, s, e):
    # Uniformly pick one connection in [s, e) and return the id of the word
    # it leads to; None when the range is empty.
    if s == e:
        return None

    cid = connections_index[random.randint(s, e - 1)]
    wid = connections[i + 1][cid]
    return wid
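
Since upper_bound mirrors the standard right-bound bisection, it can be checked against Python's bisect module; a minimal sketch:

import bisect

arr = [1, 3, 3, 7, 9]
assert upper_bound(3, arr) == bisect.bisect_right(arr, 3) == 3
assert upper_bound(0, arr) == 0
assert upper_bound(9, arr) == len(arr)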
Example #6
#!/usr/bin/env python3
import grams
import itertools
import math
import random

WINDOW = 2
words, words_index, words_position, connections, connections_index = grams.load_grams(
    '../{}grams'.format(WINDOW), WINDOW)

cache = {}


def process_sentence(sentence, done=None):
    if done is None:
        done = []

    # Base case: an empty sentence is already complete.
    if not sentence:
        yield done
        return

    # Copy tokens up to the first one starting with '-'.
    for i in range(len(sentence)):
        if sentence[i].startswith('-'):
            break

        done.append(sentence[i])

    # for/else: the loop finished without break, so no such token remains.
    else:
        yield done
        return
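
The for/else above is easy to misread; a standalone sketch of the idiom:

# The else branch of a for loop runs only when the loop was never broken out of.
for token in ['ala', 'ma', 'kota']:
    if token.startswith('-'):
        break
else:
    print('no token starts with "-"')   # reached: the loop completed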
Example #7
#!/usr/bin/env python3
import grams
import random
import sys

words, words_index, words_position, connections, connections_index = grams.load_grams('../2grams', 2)

def upper_bound(val, arr, s=0, e=None, key=lambda x: x):
    if e is None:
        e = len(arr)

    while s < e:
        mid = (s + e) // 2
        if val >= key(arr[mid]):
            s = mid + 1

        else:
            e = mid

    return s

def choose_simple(i, s, e):
    if s == e:
        return None

    cid = connections_index[random.randint(s, e - 1)]
    wid = connections[i+1][cid]
    return wid

def choose_ranked(i, s, e):
    if s == e:
Example #8
#!/usr/bin/env python3
import grams
import itertools
import random

WINDOW = 2
words, words_index, words_position, connections, connections_index = grams.load_grams('../{}grams'.format(WINDOW), WINDOW)

def upper_bound(val, arr, s=0, e=None, key=lambda x: x):
    if e is None:
        e = len(arr)

    while s < e:
        mid = (s + e) // 2
        if val >= key(arr[mid]):
            s = mid + 1

        else:
            e = mid

    return s

def find_ngram(ngram):
    s = 0
    e = len(connections_index)
    for i, word in enumerate(ngram):
        # Binary-search the sorted word index for this word's id; a miss means
        # the n-gram cannot occur in the corpus.
        word_id = words_index[upper_bound(word, words_index, key=lambda idx: words[idx]) - 1]
        if words[word_id] != word:
            return 0

        word_position = words_position[word_id]