Esempio n. 1
0
def search_next(word, words_index):
    """Aggregate the next-word counts for ``word`` over all of its indexed chunks.

    The lookup key is the MD5 digest of ``word`` reduced modulo 2**32 and
    rendered as a decimal string.

    Args:
        word: The word to look up, as text or bytes. Text is UTF-8 encoded
            for hashing only; the original value is what gets passed on to
            ``extract_next_words_fast``.
        words_index: Mapping from the stringified 32-bit word id to an
            iterable of ``(index, starting_pos, chunk_size)`` tuples.

    Returns:
        A ``(counter, next_words_count)`` tuple: a ``Counter`` summing the
        per-chunk word counts, and the sum of the per-chunk totals. If the
        word id is not present in ``words_index`` the result is
        ``(Counter(), 0)``.
    """
    counter = Counter()
    next_words_count = 0

    # hashlib only accepts bytes-like input, so encode text before hashing.
    word_bytes = word if isinstance(word, bytes) else word.encode("utf-8")
    word_hash = hashlib.md5(word_bytes).hexdigest()
    word_id = int(word_hash, 16) % 2**32

    # A list of tuples: (index, starting_pos, chunk_size), or None when the
    # word id is unknown to the index.
    word_locs = words_index.get(str(word_id))
    if word_locs is None:
        return counter, next_words_count

    for index, starting_pos, chunk_size in word_locs:
        words, count = extract_next_words_fast(word, index, starting_pos, chunk_size)
        for w in words:
            counter[w] += words[w]
        next_words_count += count

    return counter, next_words_count
Esempio n. 2
0
def load_hash32(hash32, words_index):
    """Collect next-word data for every parent word stored under ``hash32``.

    Args:
        hash32: Index key; converted with ``str()`` when it is not already
            a ``str``.
        words_index: Mapping from key to an iterable of
            ``(index, starting_pos, chunk_size)`` tuples.

    Returns:
        A dict keyed by parent word. Each value is a dict with ``"counts"``
        (the per-location counts added together) and ``"children"`` (a list
        holding the child-words result of each location). An empty dict is
        returned when the key is absent from ``words_index``.
    """
    key = hash32 if isinstance(hash32, str) else str(hash32)
    locations = words_index.get(key)

    # Unknown key: nothing to read.
    if locations is None:
        return dict()

    print("reading %s data files" % str(len(locations)))

    grouped = dict()
    for index, starting_pos, chunk_size in locations:
        parent = extract_parent_word(index, starting_pos, chunk_size)
        # Locations without a recoverable parent word are skipped.
        if parent is None:
            continue

        children, total = extract_next_words_fast(
            parent, index, starting_pos, chunk_size
        )

        entry = grouped.get(parent)
        if entry is None:
            # First location seen for this parent word.
            grouped[parent] = {"counts": total, "children": [children]}
        else:
            entry["counts"] += total
            entry["children"].append(children)

    return grouped