def search_next(word, words_index):
    """Aggregate next-word counts for *word* across every index chunk.

    The word is hashed (md5, reduced mod 2**32) to the string bucket id
    used as a key in ``words_index``; each location tuple for that bucket
    is read via ``extract_next_words_fast`` and the per-chunk counts are
    merged.

    Args:
        word: the word to look up. ``str`` is encoded as UTF-8 before
            hashing; ``bytes`` is hashed as-is.
        words_index: mapping of str(bucket_id) -> list of
            (index, starting_pos, chunk_size) tuples.

    Returns:
        (counter, next_words_count): a ``Counter`` of follower words and
        the total follower count. Both are empty/zero when the word's
        bucket is absent from the index.
    """
    counter = Counter()
    next_words_count = 0
    md5 = hashlib.md5()
    # hashlib requires bytes; encode str input (fixes TypeError on Python 3)
    md5.update(word.encode("utf-8") if isinstance(word, str) else word)
    word_hash = md5.hexdigest()
    word_id = int(word_hash, 16) % 4294967296  # 2**32
    # value is a list of tuples: (index, starting_pos, chunk_size)
    word_locs = words_index.get(str(word_id))
    if word_locs is None:
        # bucket not in the index: nothing to aggregate (avoids iterating None)
        return counter, next_words_count
    for index, starting_pos, chunk_size in word_locs:
        words, count = extract_next_words_fast(word, index, starting_pos, chunk_size)
        # Counter.update adds counts from the mapping (same as the manual loop)
        counter.update(words)
        next_words_count += count
    return counter, next_words_count
def load_hash32(hash32, words_index):
    """Load all data recorded under one 32-bit hash bucket.

    Args:
        hash32: bucket id; coerced to ``str`` to match the index keys.
        words_index: mapping of str(bucket_id) -> list of
            (index, starting_pos, chunk_size) tuples.

    Returns:
        dict mapping each parent word found in the bucket's chunks to
        ``{"counts": <total>, "children": [<child-word dict>, ...]}``.
        Empty when the bucket is not present. Chunks whose parent word
        cannot be extracted are skipped.
    """
    key = hash32 if isinstance(hash32, str) else str(hash32)
    data = {}
    locs = words_index.get(key)
    if locs is None:
        return data
    print("reading %s data files" % str(len(locs)))
    for idx, start, size in locs:
        parent = extract_parent_word(idx, start, size)
        if parent is None:
            # unreadable chunk: skip it
            continue
        children, count = extract_next_words_fast(parent, idx, start, size)
        entry = data.get(parent)
        if entry is None:
            data[parent] = {"counts": count, "children": [children]}
        else:
            entry["counts"] += count
            entry["children"].append(children)
    return data