Example 1
def process_text(text, models):
    results = [0] * len(models)
    tokens = tokenizer.tokenize(text)
    lastwords = []
    lastnormalized = []
    for (kind, data) in tokens:
        if kind == tokenizer.TOKEN_WORD:
            lastwords.append(data)
            lastnormalized.append(tokenizer.normalize(data))
            # Keep only the most recent n-gram windows.
            lastwords = lastwords[-WORDGRAMS_SIZE:]
            lastnormalized = lastnormalized[-NORMALIZEDGRAMS_SIZE:]
            for index, mod in enumerate(models):
                probs = count_probabilities(lastwords, lastnormalized, mod[0])
                results[index] += (WORD_SHARE * probs[0]
                                   + NORMALIZED_SHARE * probs[1])

        #if kind == tokenizer.TOKEN_END_OF_SENTENCE:
        #    print('Results: {}'.format(results))

    # Pair each accumulated score with its model's name, best score first.
    return tuple(reversed(sorted(
        (score, mod[1]) for score, mod in zip(results, models))))
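count_probabilities, tokenizer and the *_SHARE/*_SIZE constants live elsewhere in the module, so the scoring pattern itself is easy to miss. The self-contained sketch below shows only that pattern: a sliding window of the last few tokens feeding a weighted score. Every name and value in it (fake_probabilities, the 0.7/0.3 weights, the window size) is a stand-in, not the module's own.

# Sketch of the sliding-window scoring used above; all constants are hypothetical.
WORDGRAMS_SIZE = 3
WORD_SHARE, NORMALIZED_SHARE = 0.7, 0.3

def fake_probabilities(wordgram, normalizedgram):
    # Stand-in for count_probabilities(): rewards having a full window of context.
    return len(wordgram) / WORDGRAMS_SIZE, len(normalizedgram) / WORDGRAMS_SIZE

def score_stream(words):
    window, score = [], 0.0
    for word in words:
        window = (window + [word])[-WORDGRAMS_SIZE:]   # sliding n-gram window
        word_p, norm_p = fake_probabilities(window, [w.lower() for w in window])
        score += WORD_SHARE * word_p + NORMALIZED_SHARE * norm_p
    return score

print(score_stream(['This', 'is', 'a', 'test']))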
Example 2
def _phrase_search(user, query):
    n = normalize(query)
    keywords = tokenize(n)
    logging.info('phrase_search: query: ' + query)
    logging.info('n: ' + n)
    logging.info('keywords: ' + str(keywords))
    if not keywords:
        return []

    logging.info('%d - %s' % (0, keywords[0]))
    results = _lookup(user, keywords[0])
    if not results:
        return []
    logging.info('%s' % str(results))
    for i in range(1, len(keywords)):
        logging.info('%d - %s' % (i, keywords[i]))
        id_pos_dict = _lookup(user, keywords[i])
        logging.info('%s' % str(id_pos_dict))
        if id_pos_dict:
            # Iterate over a copy of the keys: entries are deleted from
            # results while looping.
            for id in list(results.keys()):
                if id not in id_pos_dict:
                    del results[id]
                else:
                    # Keep only positions that directly follow a match of the
                    # previous keyword in the same document.
                    poses = []
                    for pos in id_pos_dict[id]:
                        if pos - 1 in results[id]:
                            poses.append(pos)
                    if not poses:
                        del results[id]
                    else:
                        results[id] = poses
        else:
            return []
    return results.keys()
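_lookup is defined elsewhere; from the way results[id] and id_pos_dict[id] are used above, it presumably maps each document id to the positions where the keyword occurs. The positional step of the phrase search can be demonstrated on its own with hand-built postings (the data below is made up for illustration):

# Hypothetical postings: document id -> positions of one keyword.
first = {1: [0, 7], 2: [3]}     # e.g. positions of "quick"
second = {1: [1, 8], 2: [9]}    # e.g. positions of "fox"

def phrase_step(results, id_pos_dict):
    # Keep documents where the next keyword appears right after the previous one.
    out = {}
    for doc_id, positions in results.items():
        poses = [pos for pos in id_pos_dict.get(doc_id, []) if pos - 1 in positions]
        if poses:
            out[doc_id] = poses
    return out

print(phrase_step(first, second))   # {1: [1, 8]} -- document 2 has no adjacent pair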
Example 3
def calculate_unknown(tokens, model):
    word2id         = model['word2id']
    known_words     = set()
    unknown_words   = []
    unknown_count   = 0
    lastwords       = []
    for (kind, word) in tokens:
        if kind == tokenizer.TOKEN_WORD:
            normalized = tokenizer.normalize(word)
            lastwords.append(word2id[normalized])

        elif kind == tokenizer.TOKEN_END_OF_SENTENCE:
            # Hold out roughly one sentence in BETA as "unseen" text ...
            if random.randint(1, BETA) == 1:
                unknown_words.extend(lastwords)

            # ... and treat every other sentence as known vocabulary.
            else:
                for word in lastwords:
                    known_words.add(word)

            lastwords = []

    # Count held-out words that never appear in the known part.
    for word in unknown_words:
        if word not in known_words:
            unknown_count += 1

    # Assumes at least one sentence was held out; an empty sample would divide by zero.
    model['unknown'] = unknown_count / len(unknown_words) / DELTA
    print('# Unknown: {:.9f}'.format(model['unknown']))
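Stated on its own, the estimate is the share of held-out word ids that never occur in the rest of the corpus, scaled by 1/DELTA. A self-contained sketch of the same idea over plain token lists, with made-up BETA and DELTA values and a guard for the empty-sample case, could look like this:

import random

BETA, DELTA = 10, 2.0   # hypothetical values, not the project's constants

def unknown_share(sentences):
    known, sampled = set(), []
    for sentence in sentences:
        if random.randint(1, BETA) == 1:   # hold out ~1 sentence in BETA
            sampled.extend(sentence)
        else:
            known.update(sentence)
    if not sampled:                        # calculate_unknown() above would divide by zero here
        return 0.0
    return sum(1 for w in sampled if w not in known) / len(sampled) / DELTA

print(unknown_share([['a', 'b'], ['b', 'c'], ['a', 'd']] * 50))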
Example 4
def possible_replacements(word, dictionary):
    replacements = set([word])
    normalized = tokenizer.normalize(word)
    distance = 0
    if 5 <= len(word):
        distance = 3

    elif 4 <= len(word):
        distance = 2

    elif 2 <= len(word):
        distance = 1

    for (change, dist) in generate_typos(normalized, distance):
        if len(change) <= 2 and dist > 0:
            continue

        elif 3 <= len(change) <= 5 and dist > 1:
            continue

        elif 6 <= len(change) <= 8 and dist > 2:
            continue

        elif 8 < len(change) and dist > 3:
            continue

        if change in dictionary:
            for replacement in dictionary[change]:
                replacements.add(replacement)

    return list(replacements)
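generate_typos presumably yields (candidate, edit_distance) pairs; the chain of continue statements then discards candidates whose distance exceeds a budget that grows with the candidate's length. Restated as a small helper (illustration only, not part of the original module):

def max_distance(length):
    # Same cutoffs as the filter above: longer strings tolerate more edits.
    if length <= 2:
        return 0
    if length <= 5:
        return 1
    if length <= 8:
        return 2
    return 3

for sample in ('ab', 'word', 'normalized'):
    print(sample, max_distance(len(sample)))   # ab 0, word 1, normalized 3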
Example 5
def possible_replacements(word, dictionary):
    replacements = set([word])
    normalized = tokenizer.normalize(word)
    for (change, dist) in generate_typos(normalized, 1):
        if len(change) <= 2 and dist > 0:
            continue

        if change in dictionary:
            for replacement in dictionary[change]:
                replacements.add(replacement)

    return list(replacements)
Example 6
def gather_normalizedgrams(tokens, model):
    word2id         = model['word2id']
    normalizedgrams = model['normalizedgrams']
    lastnormalized  = []
    for (kind, word) in tokens:
        if kind == tokenizer.TOKEN_WORD:
            normalized = tokenizer.normalize(word)
            lastnormalized.append(word2id[normalized])
            lastnormalized = lastnormalized[-NORMALIZEDGRAMS_SIZE:]
            gather_grams(lastnormalized, normalizedgrams)

    model['normalized_count'] = len(normalizedgrams[0])
    print('# Normalizedgrams: {}'.format(
        tuple(map(lambda dct: len(dct), normalizedgrams))))
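gather_grams is not shown in this listing. Judging from how normalizedgrams is used (indexable, one dict-like counter per n-gram order, queried with len()), a plausible sketch is a counter update over every suffix of the current window. This is an assumption about its behaviour, not the project's actual implementation:

def gather_grams(window, grams):
    # grams[n - 1] is assumed to count n-grams of the normalized word ids.
    for n in range(1, len(window) + 1):
        gram = tuple(window[-n:])
        grams[n - 1][gram] = grams[n - 1].get(gram, 0) + 1

normalizedgrams = [{}, {}, {}]                      # unigrams, bigrams, trigrams
for window in ([1], [1, 2], [1, 2, 3], [2, 3, 4]):  # growing, then sliding window
    gather_grams(window, normalizedgrams)
print([len(grams) for grams in normalizedgrams])    # [4, 3, 2]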
Example 7
def generate_dictionary(input_filename, output_filename=None):
    if output_filename is None:
        input_file, input_ext = os.path.splitext(input_filename)
        output_filename = input_file + '.dat'

    dictionary = {}
    with open(input_filename, 'r') as _file:
        for word in _file:
            word = word.strip()
            normalized = tokenizer.normalize(word)
            if normalized not in dictionary:
                dictionary[normalized] = []

            dictionary[normalized].append(word)

    #print('Dictionary size: {}'.format(len(dictionary)))
    save_dictionary(dictionary, output_filename)
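The saved dictionary maps each normalized form to every surface spelling that produces it, which matches how possible_replacements() above consults dictionary[change]. A toy run with a stand-in normalize() (plain lowercasing, purely for illustration) shows the shape of the data:

words = ['Hello', 'hello', 'World']
dictionary = {}
for word in words:
    normalized = word.lower()   # stand-in for tokenizer.normalize()
    dictionary.setdefault(normalized, []).append(word)
print(dictionary)   # {'hello': ['Hello', 'hello'], 'world': ['World']}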
Example 8
def make_dictionary(tokens, model):
    word2id = model['word2id']
    id2word = model['id2word']
    for (kind, word) in tokens:
        if kind == tokenizer.TOKEN_WORD:
            normalized = tokenizer.normalize(word)
            if word not in word2id:
                word2id[word] = len(id2word)
                id2word.append(word)

            if normalized not in word2id:
                word2id[normalized] = len(id2word)
                id2word.append(normalized)

            model['words_sum'] += 1

    print('# Words: {:d}'.format(len(id2word)))
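make_dictionary keeps word2id and id2word as inverses of each other: each unseen surface form and each unseen normalized form gets the next free id. A minimal illustration of that invariant, using lowercasing as a stand-in normalization:

word2id, id2word = {}, []
for token in ('Dog', 'dog', 'Dog'):
    for form in (token, token.lower()):   # surface form, then stand-in normalization
        if form not in word2id:
            word2id[form] = len(id2word)
            id2word.append(form)
print(word2id)                   # {'Dog': 0, 'dog': 1}
print(id2word[word2id['dog']])   # dog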
Example 9
def _and_search(user, query):
    n = normalize(query)
    keywords = tokenize(n)
    logging.info('and_search: query: '+query)
    logging.info('n: '+n)
    logging.info('keywords:' + str(keywords))
    if not len(keywords):
        return []

    results = _lookup(user, keywords[0])
    if not results:
        return []

    for i in range(1, len(keywords)):
        id_pos_dict = _lookup(user, keywords[i])
        if id_pos_dict:
            # Iterate over a copy of the keys: entries are deleted from
            # results while looping.
            for id in list(results.keys()):
                if id not in id_pos_dict:
                    del results[id]
        else:
            return []
    return results.keys()
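Unlike the phrase search, _and_search only checks that every keyword occurs somewhere in the document and ignores positions, so each step boils down to intersecting sets of document ids (toy data for illustration):

# Hypothetical postings for two keywords: document id -> positions.
a = {1: [0, 7], 2: [3], 3: [5]}
b = {1: [4], 3: [2]}

# AND search keeps documents containing every keyword; positions are ignored.
print(set(a) & set(b))   # {1, 3}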
Example 10
def add_page_to_index(index, url, content):
    keywords = tokenize(normalize(content))
    # enumerate() yields the zero-based position of each keyword in the page.
    for pos, keyword in enumerate(keywords):
        add_to_index(index, keyword, url, pos)
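add_to_index, normalize and tokenize are defined elsewhere in the module. Assuming the index uses a keyword -> document -> positions layout compatible with the postings the search functions above consume, a self-contained sketch might be:

def add_to_index(index, keyword, url, pos):
    # keyword -> url -> list of positions (assumed layout).
    index.setdefault(keyword, {}).setdefault(url, []).append(pos)

index = {}
for pos, keyword in enumerate('the quick brown fox'.split()):
    add_to_index(index, keyword, 'http://example.com', pos)
print(index['quick'])   # {'http://example.com': [1]}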