def __double_hash(self, i, x):
    """Double hashing: given two independent hash functions hasha and hashb,
    a new hash function can be derived as
        hash_i(x, m) = (hasha(x) + i * hashb(x)) mod m
    so that varying i yields the i-th derived hash function.
    :param i: index of the derived hash function
    :param x: the value to hash
    :return: bucket index in [0, m)
    """
    if not isinstance(x, str):
        x = str(x)
    return (murmurhash.hash(x) + i * fnvhash.fnv0_32(bytes(x, encoding="utf8"))) % self.m
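A minimal sketch of how such derived hash functions are typically used, e.g. to set k bits in a Bloom-filter-style bit array. The module-level `double_hash` helper and the values of `m` and `k` are assumptions for illustration, not part of the class above.

import murmurhash
import fnvhash

def double_hash(i, x, m):
    # hash_i(x, m) = (hasha(x) + i * hashb(x)) mod m, with murmur3 as hasha
    # and FNV-0 as hashb (the same pair used by the method above)
    return (murmurhash.hash(x) + i * fnvhash.fnv0_32(x.encode("utf8"))) % m

m, k = 64, 3                 # bit-array size and number of derived hash functions (assumed)
bits = [0] * m
for i in range(k):           # each derived hash function sets one bit
    bits[double_hash(i, "example", m)] = 1
print([j for j, b in enumerate(bits) if b])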
def get_stream():
    keys_a = [key for key, _ in s2v_a.frequencies[:n_freq] if key not in seen]
    keys_b = [key for key, _ in s2v_b.frequencies[:n_freq] if key not in seen]
    while len(keys_a):
        key = random.choice(keys_a)
        keys_a.remove(key)
        word, sense = s2v_a.split_key(key)
        if sense in exclude_senses or (senses is not None and sense not in senses):
            continue
        if key not in keys_b:
            continue
        similar_a = set([k for k, _ in s2v_a.most_similar(key, n=n_similar)])
        similar_b = set([k for k, _ in s2v_b.most_similar(key, n=n_similar)])
        overlap = similar_a.intersection(similar_b)
        options = [
            {"id": "A", "html": get_option_html(similar_a, overlap)},
            {"id": "B", "html": get_option_html(similar_b, overlap)},
        ]
        random.shuffle(options)
        task_hash = murmurhash.hash(key)
        task = {
            "html": get_term_html(key),
            "text": key,
            "options": options,
            TASK_HASH_ATTR: task_hash,
            INPUT_HASH_ATTR: task_hash,
        }
        if show_mapping:
            opt_map = [f"{opt['id']} ({mapping[opt['id']]})" for opt in options]
            task["meta"] = {i + 1: opt for i, opt in enumerate(opt_map)}
        yield task

def get_stream():
    keys = [key for key, _ in s2v.frequencies[:n_freq] if key not in seen]
    while len(keys):
        key = random.choice(keys)
        keys.remove(key)
        word, sense = s2v.split_key(key)
        if sense in exclude_senses or (senses is not None and sense not in senses):
            continue
        most_similar = s2v.most_similar(key, n=n_similar)
        options = [{"id": k, "html": get_html(k, s)} for k, s in most_similar]
        task_hash = murmurhash.hash(key)
        task = {
            "html": get_html(key, large=True),
            "text": key,
            "options": options,
            "accept": [key for key, _ in most_similar],  # pre-select all
            TASK_HASH_ATTR: task_hash,
            INPUT_HASH_ATTR: task_hash,
        }
        yield task
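Both stream generators above set the TASK_HASH_ATTR / INPUT_HASH_ATTR fields from murmurhash.hash(key). Unlike Python's built-in hash(), which is salted per process, a murmur hash of the same key is identical across runs, which is what lets already-annotated entries be filtered out. A minimal sketch with hypothetical keys:

import murmurhash

seen_hashes = {murmurhash.hash("duck|NOUN")}          # hashes stored with earlier annotations (hypothetical)
new_keys = ["duck|NOUN", "goose|NOUN"]
todo = [k for k in new_keys if murmurhash.hash(k) not in seen_hashes]
print(todo)                                            # ['goose|NOUN']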
def get_stream():
    strategy_func = eval_strategies.get(strategy)
    log(f"RECIPE: Using strategy {strategy}")
    # Limit to most frequent entries
    keys = [key for key, _ in s2v.frequencies[:n_freq]]
    keys_by_sense = defaultdict(set)
    for key in keys:
        try:
            sense = s2v.split_key(key)[1]
        except ValueError:
            continue
        if (senses is None or sense in senses) and sense not in exclude_senses:
            keys_by_sense[sense].add(key)
    keys_by_sense = {s: keys for s, keys in keys_by_sense.items() if len(keys) >= 3}
    all_senses = list(keys_by_sense.keys())
    total_keys = sum(len(keys) for keys in keys_by_sense.values())
    log(f"RECIPE: Using {total_keys} entries for {len(all_senses)} senses")
    n_passes = 1
    while True:
        log(f"RECIPE: Iterating over the data ({n_passes})")
        current_keys = copy.deepcopy(keys_by_sense)
        while any(len(values) >= 3 for values in current_keys.values()):
            sense = random.choice(all_senses)
            all_keys = list(current_keys[sense])
            key_a, key_b, key_c, sim_ab, sim_ac = strategy_func(s2v, all_keys)
            if len(set([key_a.lower(), key_b.lower(), key_c.lower()])) != 3:
                continue
            if sim_ab < threshold or sim_ac < threshold:
                continue
            for key in (key_a, key_b, key_c):
                current_keys[sense].remove(key)
            confidence = 1.0 - (min(sim_ab, sim_ac) / max(sim_ab, sim_ac))
            input_hash = murmurhash.hash(key_a)
            task_hash = murmurhash.hash(" ".join([key_a] + sorted([key_b, key_c])))
            task = {
                "label": "Which one is more similar?",
                "html": get_html(key_a, large=True),
                "text": f"{key_a}: {key_b}, {key_c}",
                "key": key_a,
                "options": [
                    {"id": key_b, "html": get_html(key_b, sim_ab), "score": sim_ab},
                    {"id": key_c, "html": get_html(key_c, sim_ac), "score": sim_ac},
                ],
                "confidence": confidence,
                TASK_HASH_ATTR: task_hash,
                INPUT_HASH_ATTR: input_hash,
            }
            if show_scores:
                task["meta"] = {
                    "confidence": f"{confidence:.4}",
                    "strategy": strategy,
                }
            yield task
        n_passes += 1
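A small illustration, with hypothetical similarity scores and sense2vec-style keys, of the scoring and hashing above: the confidence is 0 when both candidates are equally similar to key_a and approaches 1 as one of them dominates, and sorting key_b/key_c before hashing makes the task hash independent of option order.

import murmurhash

sim_ab, sim_ac = 0.72, 0.48
confidence = 1.0 - (min(sim_ab, sim_ac) / max(sim_ab, sim_ac))   # 1 - 0.48/0.72 ≈ 0.33

h1 = murmurhash.hash(" ".join(["duck|NOUN"] + sorted(["goose|NOUN", "swan|NOUN"])))
h2 = murmurhash.hash(" ".join(["duck|NOUN"] + sorted(["swan|NOUN", "goose|NOUN"])))
assert h1 == h2   # same task hash regardless of which option came first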
def minhash(self, text, num_shingles, window=25):
    # assume len(text) > 50: hash every window-sized shingle and keep the
    # num_shingles smallest hashes as the signature
    hashes = [murmurhash.hash(text[i:i + window])
              for i in range(len(text) - window + 1)]
    return set(sorted(hashes)[0:num_shingles])
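A sketch of how two such bottom-k signatures could be compared, assuming a standalone version of the method above (without self). The estimator takes the k smallest hashes of the union of the two signatures and counts how many appear in both, which approximates the Jaccard similarity of the underlying shingle sets; the example texts and k are hypothetical.

import murmurhash

def minhash(text, num_shingles, window=25):
    hashes = [murmurhash.hash(text[i:i + window])
              for i in range(len(text) - window + 1)]
    return set(sorted(hashes)[0:num_shingles])

def jaccard_estimate(sig_a, sig_b, num_shingles):
    # bottom-k of the union of two bottom-k signatures equals the bottom-k
    # of the union of the original shingle sets
    union_bottom_k = set(sorted(sig_a | sig_b)[0:num_shingles])
    return len(union_bottom_k & sig_a & sig_b) / len(union_bottom_k)

sig_a = minhash("the quick brown fox jumps over the lazy dog near the river bank", 8)
sig_b = minhash("the quick brown fox jumps over the lazy dog by the river bank", 8)
print(jaccard_estimate(sig_a, sig_b, 8))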
def hash(self, i, seed, ntweak, data):
    # derive the seed for the i-th hash function from the base seed and the
    # tweak, truncated to 32 bits, then map the murmur hash onto the bit array
    hseed = (i * seed + ntweak) & 0xffffffff
    hs = mh.hash(data, hseed)
    return hs % len(self.bits)
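A minimal usage sketch of the seeded hash above. Assumptions: `bf` is an instance of the surrounding filter class with a `bits` list, the number of hash functions and the tweak are chosen by the caller, and the multiplier mirrors the constant used by BIP-37-style Bloom filters but is not taken from the code above.

NHASHES = 5                 # assumed number of derived hash functions
SEED = 0xFBA4C795           # assumed multiplier (the BIP-37 constant)
NTWEAK = 0                  # assumed tweak

def bloom_add(bf, data):
    for i in range(NHASHES):
        bf.bits[bf.hash(i, SEED, NTWEAK, data)] = 1

def bloom_contains(bf, data):
    # may return false positives, never false negatives
    return all(bf.bits[bf.hash(i, SEED, NTWEAK, data)] for i in range(NHASHES))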
def similar_k(input_sentences, sentence_encoder, corpus_index, db_session,
              limit=10, method='union', group_by='cosine'):
    """Find similar sentences.

    Args:
        input_sentences (str/list[str]): one or more input sentences.
        sentence_encoder: encoder used to embed the sentences.
        corpus_index: corpus index to fetch the suggestions from.
        db_session: database session to get neighbors from.
        limit (int): limit result set size to ``limit``.
        method (str): aggregation method ('union', 'mean', 'pc1', 'pc2').
        group_by (str): distance metric used to group the result set.
            Defaults to 'cosine'.

    Returns:
        dict with the sorted ``results`` and the input ``sentences``.
    """
    res = []
    nearest = dict()
    if method == 'textrank':
        from nlp.textrank import calc_textrank  # pylint: disable=import-outside-toplevel
        _, _, _, phrase_list = calc_textrank(input_sentences, num_phrases=5)
        input_sentences = [' '.join(phrase[0] for phrase in phrase_list)]
        method = Aggregation.UNION
    embeddings = sentence_encoder.encode(input_sentences)
    indices = [murmurhash.hash(sent) for sent in input_sentences]
    for idx, dist in corpus_index.knn_query_batch(embeddings, ids=indices,
                                                  limit=limit, method=method):
        if idx not in nearest:
            nearest[idx] = dist
        else:
            nearest[idx] = min(nearest[idx], dist)
    for sentence in db_session.query(Sentence).filter(
            Sentence.id.in_(nearest.keys())).all():
        sentence_dict = sentence.to_dict()
        encoding = sentence_encoder.encode(sentence.sentence)
        distances = scipy.spatial.distance.cdist(encoding, embeddings, group_by)
        nearest_idx = int(np.argmax(distances))
        sentence_dict['nearest'] = indices[nearest_idx]
        sentence_dict['dist'] = nearest[sentence.id]
        res.append(sentence_dict)
    return {
        'results': sorted(res, key=lambda x: x['dist']),
        'sentences': [{'id': sent_id, 'text': sent}
                      for sent_id, sent in zip(indices, input_sentences)]
    }
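The `indices` above are murmur hashes of the input sentences, used both as IDs for the kNN query and as the `nearest` back-reference in each result. A small self-contained illustration of that ID scheme, with hypothetical sentences:

import murmurhash

input_sentences = ["How do I reset my password?", "Where can I change my email?"]
indices = [murmurhash.hash(sent) for sent in input_sentences]
by_id = dict(zip(indices, input_sentences))

# murmur hashes are deterministic across processes (no per-run salting), so a
# stored 'nearest' ID can always be mapped back to the query sentence
assert indices == [murmurhash.hash(s) for s in input_sentences]
print(by_id[indices[0]])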