def smart_procrustes_align_gensim(base_embed: gensim.models.KeyedVectors,
                                  other_embed: gensim.models.KeyedVectors):
    """
    Rotate ``other_embed`` onto ``base_embed`` with an orthogonal Procrustes fit.

    Adapted from
    https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf.
    The complete normalised matrices of both embeddings are multiplied with
    each other, so row ``k`` of one matrix is paired with row ``k`` of the
    other.

    :param base_embed: reference embedding; only ``init_sims()`` is called on it
    :param other_embed: embedding to rotate; its ``syn0norm`` and ``syn0`` are
        both replaced by the rotated, normalised matrix
    :return other_embed: the same object that was passed in, after rotation
    """
    for embedding in (base_embed, other_embed):
        embedding.init_sims()

    reference = base_embed.syn0norm
    movable = other_embed.syn0norm

    # The SVD of the cross-product matrix yields the rotation u @ v.
    cross_product = movable.T @ reference
    left, _, right = np.linalg.svd(cross_product)
    rotation = left @ right

    # Overwrite both the normalised and the raw matrix with the rotated one.
    rotated = other_embed.syn0norm @ rotation
    other_embed.syn0norm = rotated
    other_embed.syn0 = rotated
    return other_embed
def update_action_representations(
        i: int, action_model: gensim.models.KeyedVectors, seq: BeatmapSequence,
        word_id_dict: Dict[str, int], pred: Dict[str, np.ndarray],
        reverse_word_id_dict: Dict[int, np.ndarray], config: Config):
    """
    Synchronise the action representations stored for step ``i + 1``.

    All representations are updated from the one that was predicted, to make
    interesting models possible without data leaking.  The source
    representation is chosen by the keys of ``pred``: ``word_id`` first, then
    ``word_vec``, otherwise the per-attribute columns.

    :param i: index of the current step; column ``i + 1`` of ``seq.data`` is written
    :param action_model: maps a word string to its vector and finds the closest word
    :param seq: sequence whose ``data`` arrays are modified in place
    :param word_id_dict: word string -> word id
    :param pred: predictions of the current step; only its keys are inspected
    :param reverse_word_id_dict: word id -> value used as the word string key
        of ``action_model``
    :param config: provides ``generation.restrict_vocab``
    """

    def closest_word(vector):
        # Best match only; the search is limited by the configured vocabulary cut.
        matches = action_model.similar_by_vector(
            vector, topn=1, restrict_vocab=config.generation.restrict_vocab)
        return matches[0][0]

    if 'word_id' in pred:
        # `word_id` is the preferred action representation
        word_str = reverse_word_id_dict[int(seq.data['prev_word_id'][:, i + 1])]
        seq.data['prev_word_vec'][:, i + 1] = action_model[word_str]
        word_str2per_attribute(i, word_str, seq)
        return

    if 'word_vec' in pred:
        closest_word_str = closest_word(seq.data['prev_word_vec'][:, i + 1])
        seq.data['prev_word_id'][:, i + 1] = word_id_dict[closest_word_str]
        word_str2per_attribute(i, closest_word_str, seq)
        return

    # Neither word representation was predicted: start from the attributes.
    prev_word = per_attribute2word_str(i, seq)
    seq.data['prev_word_vec'][:, i + 1] = action_model[prev_word]
    closest_word_str = closest_word(seq.data['prev_word_vec'][:, i + 1])
    seq.data['prev_word_id'][:, i + 1] = word_id_dict[closest_word_str]
def smart_procrustes_align_gensim(base_embed: gensim.models.KeyedVectors,
                                  other_embed: gensim.models.KeyedVectors):
    """
    Rotate ``other_embed`` onto ``base_embed`` using their shared vocabulary.

    Adapted from
    https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf.
    The rotation is fitted only on words present in both vocabularies, and is
    then applied to every row of ``other_embed``.

    :param base_embed: reference embedding; only ``init_sims()`` is called on it
    :param other_embed: embedding to rotate; ``wv.syn0norm`` and ``wv.syn0``
        are both replaced by the rotated, normalised matrix
    :return other_embed: the same object that was passed in, after rotation
    """
    base_embed.init_sims()
    other_embed.init_sims()

    base_kv = base_embed.wv
    other_kv = other_embed.wv

    shared_vocab = list(set(base_kv.vocab.keys()).intersection(other_kv.vocab.keys()))

    # Row position of each word in the respective embedding matrix.
    base_row_of = {word: row for row, word in enumerate(base_kv.index2word)}
    other_row_of = {word: row for row, word in enumerate(other_kv.index2word)}

    base_rows = [base_row_of[word] for word in shared_vocab]
    other_rows = [other_row_of[word] for word in shared_vocab]

    base_shared_vecs = base_kv.syn0norm[base_rows]
    other_shared_vecs = other_kv.syn0norm[other_rows]

    # The SVD of the cross-product matrix yields the rotation u @ v.
    left, _, right = np.linalg.svd(other_shared_vecs.T @ base_shared_vecs)
    rotation = left @ right

    # Overwrite both the normalised and the raw matrix with the rotated one.
    rotated = other_kv.syn0norm.dot(rotation)
    other_kv.syn0norm = rotated
    other_kv.syn0 = rotated
    return other_embed
def df2beatmap(df: pd.DataFrame,
               action_model: gensim.models.KeyedVectors,
               word_id_dict: Dict[str, int],
               config: Config,
               bpm: int = 60,
               events: Tuple = ()) -> JSON:
    """
    Convert a DataFrame of actions into a beatmap dictionary.

    The notes are taken from the first representation found among the
    columns: ``word_id``, then ``word_vec``, otherwise the per-attribute
    columns handled by ``double_beat_element2json``.

    The caller's DataFrame is left untouched; all re-indexing and column
    rewriting happens on a private copy.

    :param df: actions; its index must expose a ``time`` level
    :param action_model: used to find the closest word of each ``word_vec``
    :param word_id_dict: word string -> word id
    :param config: provides ``generation.restrict_vocab``
    :param bpm: currently not used by this function
    :param events: stored unchanged under ``_events``
    :return: dict with the keys ``_version``, ``_BPMChanges``, ``_notes``
        and ``_events``
    """
    beatmap = {
        '_version': '2.0.0',
        '_BPMChanges': [],
        '_notes': [],
        '_events': events,
    }

    # Copy first: the index and (possibly) the `word_id` column are rewritten
    # below, and those writes must not leak into the caller's frame.
    df = df.copy()
    df.index = df.index.to_frame()['time']  # only time from the multiindex is needed

    if 'word_id' in df.columns:
        inverse_word_id_dict = {val: key for key, val in word_id_dict.items()}
        # Cells may hold nested values; flatten them to one id per row.
        df['word_id'] = np.array(df['word_id'].to_list()).flatten()
        # Ids 0 and 1 are dropped (presumably reserved tokens - confirm with the vocabulary).
        df = df.loc[df['word_id'] > 1]
        word = df['word_id'].map(lambda word_id: inverse_word_id_dict[word_id])
        beatmap['_notes'] += word_ser2json(word)
    elif 'word_vec' in df.columns:
        word = df['word_vec'].map(lambda vec: action_model.similar_by_vector(
            vec, topn=1, restrict_vocab=config.generation.restrict_vocab)[0][0])
        beatmap['_notes'] += word_ser2json(word)
    else:
        beatmap['_notes'] += double_beat_element2json(df, config)

    return beatmap
def build_word_pairs(words, keyed_vectors: gensim.models.KeyedVectors):
    """
    Pair every word with its three most similar words.

    :param words: iterable of query words
    :param keyed_vectors: model queried through ``most_similar``
    :return: flat list of ``(word, similar_word)`` tuples, in query order
    """
    return [
        (word, match[0])
        for word in words
        for match in keyed_vectors.most_similar(positive=word, topn=3)
    ]
def restrict_vectors(wordvectors: gensim.models.KeyedVectors,
                     restricted_word_set):
    """
    Shrink ``wordvectors`` in place to the words in ``restricted_word_set``.

    ``vocab``, ``index2entity``, ``index2word``, ``vectors`` and
    ``vectors_norm`` are all rebuilt so that they only contain the kept words,
    in their original relative order.  The ``index`` of every kept vocabulary
    entry is renumbered to its new row.

    Both ``vectors`` and ``vectors_norm`` end up as numpy arrays that keep
    their second dimension, even when no word is kept.

    :param wordvectors: embedding to restrict; modified in place
    :param restricted_word_set: container of the words to keep
    :return: None
    """
    # Build the lookup once so the membership test is O(1) for any container.
    if isinstance(restricted_word_set, (set, frozenset)):
        allowed = restricted_word_set
    else:
        allowed = set(restricted_word_set)

    # Make sure `vectors_norm` exists before it is sliced below.
    wordvectors.init_sims()

    kept_rows = []
    new_vocab = {}
    new_index2entity = []
    vocab_size = len(wordvectors.vocab)
    for row in tqdm(range(vocab_size),
                    desc="Vector restriction",
                    total=vocab_size):
        word = wordvectors.index2entity[row]
        if word not in allowed:
            continue
        entry = wordvectors.vocab[word]
        entry.index = len(new_index2entity)  # new row of this word
        new_index2entity.append(word)
        new_vocab[word] = entry
        kept_rows.append(row)

    # Integer-array indexing copies the selected rows and keeps dtype and
    # width, so the result is a proper 2-D array for any number of rows.
    selection = np.asarray(kept_rows, dtype=np.intp)
    wordvectors.vocab = new_vocab
    wordvectors.vectors = wordvectors.vectors[selection]
    wordvectors.vectors_norm = wordvectors.vectors_norm[selection]
    wordvectors.index2entity = new_index2entity
    wordvectors.index2word = new_index2entity
def get_umls_vectors_only(self, vectors: gensim.models.KeyedVectors):
    """
    Collect the vectors of all vocabulary words that are UMLS concepts.

    A word counts as a concept when it occurs among the values of
    ``self.umls_dict``.

    :param vectors: model providing ``index2word`` and ``get_vector``
    :return: dict mapping each matching word to its vector
    """
    concepts = self.umls_dict.values()
    try:
        # One set instead of a linear scan of the values per vocabulary word.
        known_concepts = set(concepts)
    except TypeError:
        # Some value is unhashable: keep the plain (slower) membership test.
        known_concepts = list(concepts)

    return {
        word: vectors.get_vector(word)
        for word in vectors.index2word
        if word in known_concepts
    }
def get_global_anchors(self, word: str, w2v: gensim.models.KeyedVectors):
    """
    Describe ``word`` by its cosine similarity to every vector of ``w2v``.

    The similarities between the vector of ``word`` and all rows of
    ``w2v.vectors`` are computed and the resulting vector is passed through
    ``unitvec`` before it is returned.

    :param word: word to look up in ``w2v``
    :param w2v: model providing ``get_vector`` and ``vectors``
    :return: result of ``unitvec`` applied to the similarity vector, which
        has one entry per row of ``w2v.vectors``
    """
    query = w2v.get_vector(word)
    return unitvec(
        gensim.models.KeyedVectors.cosine_similarities(query, w2v.vectors))
def assign_concepts_to_vecs(cls, vectors: gensim.models.KeyedVectors):
    """
    Add one vector per UMLS concept to ``vectors``.

    A term is represented by the sum of its token vectors, and only when it
    has at least one token and every token is in the vocabulary.  A concept
    is represented by the mean of its usable term vectors; concepts without
    any usable term are skipped.

    :param vectors: model to read token vectors from and to extend in place
    :return: the same ``vectors`` object
    """
    if cls.umls_mapper is None:
        print('No UMLS defined yet. Build UMLSMapper...')
        cls.umls_mapper = UMLSMapper(from_dir=cls.config["PATH"]["UMLS"])

    addable_concepts = []
    addable_vectors = []
    for concept, terms in cls.umls_mapper.umls_reverse_dict.items():
        term_vectors = []
        for term in terms:
            tokens = term.split()
            # An empty term would otherwise contribute sum([]) == 0, a bare
            # scalar, and drag the concept mean towards zero.
            if not tokens:
                continue
            if any(token not in vectors.vocab for token in tokens):
                continue
            term_vectors.append(
                sum(vectors.get_vector(token) for token in tokens))
        if term_vectors:
            addable_concepts.append(concept)
            addable_vectors.append(sum(term_vectors) / len(term_vectors))

    # Nothing to add: skip the call instead of handing over empty lists.
    if addable_concepts:
        vectors.add(addable_concepts, addable_vectors)
    return vectors
def informative_output(words_and_scores, w2v1: gensim.models.KeyedVectors,
                       w2v2: gensim.models.KeyedVectors, top_n_neighbors: int,
                       model_name: str):
    """
    Print every scored word together with its neighbours in both models.

    :param words_and_scores: iterable of ``(word, score)`` pairs
    :param w2v1: first model, reported as "model1"
    :param w2v2: second model, reported as "model2"
    :param top_n_neighbors: number of neighbours requested from each model
    :param model_name: title printed centred in a 40-character ``=`` banner
    """

    def neighbour_names(model, query):
        # Keep only the words; the similarity values are not printed.
        return [
            name for name, _ in model.most_similar(query, topn=top_n_neighbors)
        ]

    print(model_name.center(40, '='))
    for word, score in words_and_scores:
        # Both lookups happen before anything is printed for this word.
        top_n_1 = neighbour_names(w2v1, word)
        top_n_2 = neighbour_names(w2v2, word)

        print("word {word} has score {score}".format(word=word, score=score))
        print("word {word} has the following neighbors in model1:".format(
            word=word))
        print(*top_n_1, sep=',')
        print('_' * 40)
        print("word {word} has the following neighbors in model2:".format(
            word=word))
        print(*top_n_2, sep=',')
        print("")
def clip_next_to_closest_existing(i: int,
                                  action_model: gensim.models.KeyedVectors,
                                  seq: BeatmapSequence,
                                  word_id_dict: Dict[str, int], config: Config):
    """
    Replace the action at step ``i + 1`` by the closest word known to the model.

    The word assembled from the per-attribute columns is looked up in
    ``action_model``, and the closest word found is written back to all
    representations: ``prev_word_id``, ``prev_word_vec`` and the per-attribute
    columns.

    :param i: index of the current step; column ``i + 1`` of ``seq.data`` is written
    :param action_model: maps a word string to its vector and finds the closest word
    :param seq: sequence whose ``data`` arrays are modified in place
    :param word_id_dict: word string -> word id
    :param config: provides ``generation.restrict_vocab``
    """
    prev_word = per_attribute2word_str(i, seq)
    closest_word_str = action_model.similar_by_vector(
        action_model[prev_word],
        topn=1,
        restrict_vocab=config.generation.restrict_vocab)[0][0]

    seq.data['prev_word_id'][:, i + 1] = word_id_dict[closest_word_str]
    seq.data['prev_word_vec'][:, i + 1] = action_model[closest_word_str]
    # Write back the word that was just found.  It used to be overwritten
    # with `seq.data['prev_word']` right before this call, which threw the
    # lookup result away.
    word_str2per_attribute(i, closest_word_str, seq)
def generate_one_document(
    input_obj, word_emb_obj: gensim.models.KeyedVectors
) -> Optional[Tuple[str, str, numpy.ndarray, List[str], List[str]]]:
    """
    Build the information record for a single document.

    The title is represented by the mean vector of those title morphs whose
    first element is in the embedding vocabulary.  When none is, a warning is
    logged and a zero vector is used instead.

    :param input_obj: mapping with the keys ``title_morphs``, ``morphs``,
        ``file_name`` and ``category``
    :param word_emb_obj: embedding used to look up the title tokens
    :return: ``(file_name, category, title vector, document morphs,
        title_morphs)``, or ``None`` when building the title vector raised
    """
    try:
        vectors = []
        for morph in input_obj['title_morphs']:
            surface = morph[0]
            if surface in word_emb_obj.wv.vocab:
                vectors.append(word_emb_obj.get_vector(surface))

        if vectors:
            average_vector = numpy.mean(vectors, axis=0)
        else:
            logging.warning(
                'No token is in word2vec model. title-morphs = {}'.format(
                    input_obj['title_morphs']))
            average_vector = numpy.zeros(word_emb_obj.wv.vector_size)
    except Exception as e:
        # Best effort: a document whose title cannot be embedded is dropped.
        logging.error(e)
        return None

    document_morphs = [token[0] for token in input_obj['morphs']]
    return (input_obj['file_name'], input_obj['category'], average_vector,
            document_morphs, input_obj['title_morphs'])
def generate_beatmap(beatmap_df: pd.DataFrame, seq: BeatmapSequence,
                     stateful_model: Model,
                     action_model: gensim.models.KeyedVectors,
                     word_id_dict: Dict[str, int], config: Config):
    """
    Generate actions step by step, feeding every prediction back as next input.

    For each of the ``len(beatmap_df) - 1`` steps the model predicts from the
    most recent slice of ``seq``, the prediction is written into ``seq.data``
    by ``update_next`` and the remaining action representations are brought
    in line with it.  Progress and a time estimate are printed on one line.

    :param beatmap_df: frame that receives the predictions via ``predictions2df``
    :param seq: sequence whose ``data`` arrays are overwritten in place
    :param stateful_model: model providing ``input_names``, ``output_names``
        and ``predict``; the prediction is indexed by output name
    :param action_model: used to find the words closest to a predicted vector
    :param word_id_dict: word string -> word id
    :param config: provides ``dataset.beat_elements``,
        ``dataset.num_classes`` and ``generation.temperature``
    :return: ``beatmap_df`` restricted to the columns named in
        ``stateful_model.output_names``
    """
    most_recent = {
        col: seq[0][0][col][:, 0:1]
        for col in stateful_model.input_names
    }  # initial beat
    # NOTE(review): `output_names` is not referenced again in this function.
    output_names = [f'prev_{name}' for name in stateful_model.output_names
                    ]  # For TF 2.1 compatibility
    reverse_word_id_dict = {val: key for key, val in word_id_dict.items()}

    # Reset the whole seq.data columns except for the first action to prevent information leaking
    for col in product(['', 'prev_'],
                       ['word_id', 'word_vec'] + config.dataset.beat_elements):
        seq.data[''.join(col)][:, 1:, :] = 0.0

    start = time()
    total_len = len(beatmap_df) - 1
    temperature = config.generation.temperature  # TODO: change to config.generation.temperature(0)
    for i in range(len(beatmap_df) - 1):
        # Progress line: step, elapsed seconds, and a linear estimate of the total.
        elapsed = time() - start
        print(
            f'\r{i:4}: {int(elapsed):3} / ~{int(elapsed * total_len / (i + 1)):3} s',
            end='',
            flush=True)

        pred = stateful_model.predict(most_recent)

        # word_vec to word_id prob
        if 'word_vec' in stateful_model.output_names:
            closest_words = action_model.similar_by_vector(
                pred['word_vec'].flatten(), topn=30, restrict_vocab=None)
            # Every class starts at zero; only the 30 closest words get a value.
            pred['word_id'] = np.zeros(
                (1, 1, config.dataset.num_classes['word_id']))
            # NOTE(review): the second element is named `distance` here, but
            # gensim documents it as a similarity - confirm.
            for word, distance in closest_words:
                pred['word_id'][:, :, word_id_dict[word]] = distance

        update_next(i, pred, seq, temperature, config)
        update_action_representations(i, action_model, seq, word_id_dict, pred,
                                      reverse_word_id_dict, config)
        # Only when the model outputs every beat element.
        if set(stateful_model.output_names) >= set(
                config.dataset.beat_elements):
            clip_next_to_closest_existing(i, action_model, seq, word_id_dict,
                                          config)

        # get last action in the correct format
        most_recent = {
            col: seq[0][0][col][:, i + 1:i + 2]
            for col in stateful_model.input_names
        }

        # Experiment with moving temperature based on AVD distance.
        # Needs further research
        # temperature = responsive_temperature(seq, temperature, i)

    save_velocity_hist(seq, config)
    beatmap_df = predictions2df(beatmap_df, seq)
    # beatmap_df = append_last_prediction(beatmap_df, most_recent)  # TODO: Remove if unnecessary

    # Copy each `prev_<name>` column to the plain output column name.
    for col in stateful_model.output_names:
        beatmap_df[col] = beatmap_df[f'prev_{col}']

    return beatmap_df[
        stateful_model.output_names]  # output only generated columns
def save(word_vectors: gensim.models.KeyedVectors, path: str):
    """
    Save ``word_vectors``, preferring the location given by ``get_tmpfile``.

    :param word_vectors: model to store
    :param path: name handed to ``get_tmpfile``; used directly as the target
        when the first attempt raises ``FileNotFoundError``
    """
    try:
        target = get_tmpfile(path)
        word_vectors.save(target)
    except FileNotFoundError:
        # Fall back to the path exactly as given.
        word_vectors.save(path)
def wmd(row, model: gensim.models.KeyedVectors):
    """
    Compute ``model.wmdistance`` between the two questions of ``row``.

    Both questions are converted to ``str``, split on whitespace and stripped
    of the words returned by ``get_stop_words`` before they are compared.

    :param row: mapping with the keys ``question1`` and ``question2``
    :param model: model providing ``wmdistance``
    :return: the value returned by ``model.wmdistance``
    """
    swords = get_stop_words()

    def content_words(text):
        return [token for token in str(text).split() if token not in swords]

    first = content_words(row['question1'])
    second = content_words(row['question2'])
    return model.wmdistance(first, second)