import gensim
import numpy as np


def smart_procrustes_align_gensim(base_embed: gensim.models.KeyedVectors,
                                  other_embed: gensim.models.KeyedVectors):
    """
    This code, taken from
    https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf and modified,
    uses procrustes analysis to make two word embeddings compatible.
    :param base_embed: first embedding
    :param other_embed: second embedding to be changed
    :return other_embed: changed embedding
    """
    base_embed.init_sims()
    other_embed.init_sims()

    base_vecs = base_embed.syn0norm
    other_vecs = other_embed.syn0norm

    # cross-covariance matrix between the two embedding spaces
    m = other_vecs.T.dot(base_vecs)
    # SVD of the cross-covariance matrix (`v` is already V^T)
    u, _, v = np.linalg.svd(m)
    # orthogonal Procrustes solution: the orthogonal map closest to `m`
    ortho = u.dot(v)
    # Replace the original array with the rotated one,
    # i.e. multiply the embedding matrix (syn0norm) by `ortho`
    other_embed.syn0norm = other_embed.syn0 = other_embed.syn0norm.dot(ortho)

    return other_embed
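A minimal usage sketch, assuming two saved KeyedVectors files (the paths are placeholders). Note that this variant skips vocabulary intersection, so both models must already share the same vocabulary in the same order:

base = gensim.models.KeyedVectors.load('embeddings_2000s.kv')   # hypothetical path
other = gensim.models.KeyedVectors.load('embeddings_2010s.kv')  # hypothetical path
other = smart_procrustes_align_gensim(base, other)
# After alignment, vectors from both models live in a comparable space.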
Example n. 2
def update_action_representations(
        i: int, action_model: gensim.models.KeyedVectors, seq: BeatmapSequence,
        word_id_dict: Dict[str, int], pred: Dict[str, np.ndarray],
        reverse_word_id_dict: Dict[int, np.ndarray], config: Config):
    # Update all action representations to make interesting models possible without data leaking.
    if 'word_id' in pred:  # `word_id` is the preferred action representation
        word_str = reverse_word_id_dict[int(seq.data['prev_word_id'][:, i + 1])]
        seq.data['prev_word_vec'][:, i + 1] = action_model[word_str]
        word_str2per_attribute(i, word_str, seq)
    elif 'word_vec' in pred:
        closest_word_str = action_model.similar_by_vector(
            seq.data['prev_word_vec'][:, i + 1],
            topn=1,
            restrict_vocab=config.generation.restrict_vocab)[0][0]
        seq.data['prev_word_id'][:, i + 1] = word_id_dict[closest_word_str]
        word_str2per_attribute(i, closest_word_str, seq)
    else:
        prev_word = per_attribute2word_str(i, seq)
        seq.data['prev_word_vec'][:, i + 1] = action_model[prev_word]
        closest_word_str = action_model.similar_by_vector(
            seq.data['prev_word_vec'][:, i + 1],
            topn=1,
            restrict_vocab=config.generation.restrict_vocab)[0][0]
        seq.data['prev_word_id'][:, i + 1] = word_id_dict[closest_word_str]
Example n. 3
def smart_procrustes_align_gensim(base_embed: gensim.models.KeyedVectors,
                                  other_embed: gensim.models.KeyedVectors):
    """
    This code, taken from
    https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf and modified,
    uses procrustes analysis to make two word embeddings compatible.
    :param base_embed: first embedding
    :param other_embed: second embedding to be changed
    :return other_embed: changed embedding
    """
    base_embed.init_sims()
    other_embed.init_sims()

    shared_vocab = list(
        set(base_embed.wv.vocab.keys()).intersection(
            other_embed.wv.vocab.keys()))

    base_word2idx = {word: num for num, word in enumerate(base_embed.wv.index2word)}
    other_word2idx = {word: num for num, word in enumerate(other_embed.wv.index2word)}

    base_shared_indices = [base_word2idx[word] for word in shared_vocab]
    other_shared_indices = [other_word2idx[word] for word in shared_vocab]

    base_vecs = base_embed.wv.syn0norm
    other_vecs = other_embed.wv.syn0norm

    base_shared_vecs = base_vecs[base_shared_indices]
    other_shared_vecs = other_vecs[other_shared_indices]

    m = other_shared_vecs.T @ base_shared_vecs
    u, _, v = np.linalg.svd(m)
    ortho = u @ v

    # Replace the original array with the rotated one,
    # i.e. multiply the embedding matrix (syn0norm) by `ortho`
    other_embed.wv.syn0norm = other_embed.wv.syn0 = other_embed.wv.syn0norm.dot(ortho)

    return other_embed
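Because `u` and `v` come from an SVD, `ortho = u @ v` is orthogonal, so the alignment only rotates (and possibly reflects) the second embedding, preserving all norms and pairwise distances. A self-contained check of that core step on random stand-in data:

import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(size=(1000, 50))  # stand-in for the shared base vectors
b = rng.normal(size=(1000, 50))  # stand-in for the shared other vectors
u, _, v = np.linalg.svd(b.T @ a)
ortho = u @ v
# orthogonal up to floating-point error, so distances are preserved
assert np.allclose(ortho.T @ ortho, np.eye(50), atol=1e-6)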
Example n. 4
def df2beatmap(df: pd.DataFrame,
               action_model: gensim.models.KeyedVectors,
               word_id_dict: Dict[str, int],
               config: Config,
               bpm: int = 60,
               events: Tuple = ()) -> JSON:
    beatmap = {
        '_version': '2.0.0',
        '_BPMChanges': [],
        '_notes': [],
        '_events': events,
    }
    df.index = df.index.to_frame()['time']  # only time from the multiindex is needed
    inverse_word_id_dict = {val: key for key, val in word_id_dict.items()}
    if 'word_id' in df.columns:
        df['word_id'] = np.array(df['word_id'].to_list()).flatten()
        df = df.loc[df['word_id'] > 1]
        word = df['word_id'].map(lambda word_id: inverse_word_id_dict[word_id])
        beatmap['_notes'] += word_ser2json(word)
    elif 'word_vec' in df.columns:
        word = df['word_vec'].map(
            lambda vec: action_model.similar_by_vector(
                vec, topn=1, restrict_vocab=config.generation.restrict_vocab)[0][0])
        beatmap['_notes'] += word_ser2json(word)
    else:
        beatmap['_notes'] += double_beat_element2json(df, config)

    return beatmap
Example n. 5
def build_word_pairs(words, keyed_vectors: gensim.models.KeyedVectors):
    word_pairs = []
    for word in words:
        sim_words = keyed_vectors.most_similar(positive=word, topn=3)
        pairs = [(word, sim_word[0]) for sim_word in sim_words]
        word_pairs.extend(pairs)
    return word_pairs
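A usage sketch on a toy corpus; the sentences and query words are illustrative (gensim 3.x takes `size`, gensim 4+ renames it to `vector_size`):

from gensim.models import Word2Vec

sentences = [['cat', 'meows'], ['dog', 'barks'], ['cat', 'purrs'], ['dog', 'fetches']]
kv = Word2Vec(sentences, min_count=1, size=10).wv
pairs = build_word_pairs(['cat', 'dog'], kv)  # three (word, neighbour) pairs per query word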
Example n. 6
    def restrict_vectors(wordvectors: gensim.models.KeyedVectors,
                         restricted_word_set):
        new_vectors = []
        new_vocab = {}
        new_index2entity = []
        new_vectors_norm = []
        wordvectors.init_sims()
        for i in tqdm(range(len(wordvectors.vocab)),
                      desc="Vector restriction",
                      total=len(wordvectors.vocab)):
            word = wordvectors.index2entity[i]
            vec = wordvectors.vectors[i]
            vocab = wordvectors.vocab[word]
            vec_norm = wordvectors.vectors_norm[i]
            if word in restricted_word_set:
                vocab.index = len(new_index2entity)
                new_index2entity.append(word)
                new_vocab[word] = vocab
                new_vectors.append(vec)
                new_vectors_norm.append(vec_norm)

        wordvectors.vocab = new_vocab
        wordvectors.vectors = np.array(new_vectors)
        wordvectors.index2entity = new_index2entity
        wordvectors.index2word = new_index2entity
        wordvectors.vectors_norm = np.array(new_vectors_norm)
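For example, pruning a large general-purpose model to a domain vocabulary before similarity queries; the path and word set are placeholders, and the function is assumed to be exposed as a static method or module-level helper. It mutates the model in place:

kv = gensim.models.KeyedVectors.load('full_model.kv')  # hypothetical path
restrict_vectors(kv, {'heart', 'lung', 'kidney'})
print(kv.most_similar('heart', topn=2))  # searches only the three remaining words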
Example n. 7
 def get_umls_vectors_only(self, vectors: gensim.models.KeyedVectors):
     medical_concepts = [
         word for word in vectors.index2word
         if word in self.umls_dict.values()
     ]
     concept_vecs = {
         concept: vectors.get_vector(concept)
         for concept in medical_concepts
     }
     return concept_vecs
Example n. 8
 def get_global_anchors(self, word: str, w2v: gensim.models.KeyedVectors):
     """
     This takes in a word and a KeyedVectors model and returns a vector of cosine distances
     between this word and each word in the vocab.
     :param word:
     :param w2v:
     :return: np.array of distances shaped (len(w2v.vocab),)
     """
     word_vector = w2v.get_vector(word)
     similarities = gensim.models.KeyedVectors.cosine_similarities(word_vector, w2v.vectors)
     return unitvec(similarities)
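These "global anchor" vectors are coordinate-free: a word's anchor depends only on its similarities to the rest of the vocabulary, so anchors from two independently trained models can be compared directly, provided both models cover the same vocabulary in the same order. A hedged sketch, where the paths and query word are illustrative and `analyzer` stands for an instance of the surrounding class:

w2v_old = gensim.models.KeyedVectors.load('decade_1990.kv')  # hypothetical path
w2v_new = gensim.models.KeyedVectors.load('decade_2010.kv')  # hypothetical path
anchor_old = analyzer.get_global_anchors('mouse', w2v_old)
anchor_new = analyzer.get_global_anchors('mouse', w2v_new)
drift = 1 - anchor_old @ anchor_new  # higher value suggests more semantic change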
Example n. 9
    def assign_concepts_to_vecs(cls, vectors: gensim.models.KeyedVectors):
        if cls.umls_mapper is None:
            print('No UMLS mapper defined yet. Building UMLSMapper...')
            cls.umls_mapper = UMLSMapper(from_dir=cls.config["PATH"]["UMLS"])
        addable_concepts = []
        addable_vectors = []
        for concept, terms in cls.umls_mapper.umls_reverse_dict.items():
            concept_vec = []
            for term in terms:
                term_tokens = term.split()
                token_vecs = []
                for token in term_tokens:
                    if token in vectors.vocab:
                        token_vecs.append(vectors.get_vector(token))
                if len(term_tokens) == len(token_vecs):
                    term_vector = sum(token_vecs)
                    concept_vec.append(term_vector)
            if len(concept_vec) > 0:
                addable_concepts.append(concept)
                addable_vectors.append(sum(concept_vec) / len(concept_vec))
        vectors.add(addable_concepts, addable_vectors)

        return vectors
Example n. 10
def informative_output(words_and_scores, w2v1: gensim.models.KeyedVectors,
                       w2v2: gensim.models.KeyedVectors, top_n_neighbors: int,
                       model_name: str):
    print(model_name.center(40, '='))

    for word, score in words_and_scores:
        top_n_1 = [neighbor for neighbor, _
                   in w2v1.most_similar(word, topn=top_n_neighbors)]
        top_n_2 = [neighbor for neighbor, _
                   in w2v2.most_similar(word, topn=top_n_neighbors)]
        print("word {word} has score {score}".format(word=word, score=score))
        print("word {word} has the following neighbors in model1:".format(
            word=word))
        print(*top_n_1, sep=',')
        print('_' * 40)
        print("word {word} has the following neighbors in model2:".format(
            word=word))
        print(*top_n_2, sep=',')
        print("")
Example n. 11
def clip_next_to_closest_existing(i: int,
                                  action_model: gensim.models.KeyedVectors,
                                  seq: BeatmapSequence,
                                  word_id_dict: Dict[str, int], config: Config):
    prev_word = per_attribute2word_str(i, seq)
    closest_word_str = action_model.similar_by_vector(
        action_model[prev_word],
        topn=1,
        restrict_vocab=config.generation.restrict_vocab)[0][0]
    seq.data['prev_word_id'][:, i + 1] = word_id_dict[closest_word_str]
    seq.data['prev_word_vec'][:, i + 1] = action_model[closest_word_str]

    word_str2per_attribute(i, closest_word_str, seq)
Example n. 12
def generate_one_document(
    input_obj, word_emb_obj: gensim.models.KeyedVectors
) -> Optional[Tuple[str, str, numpy.ndarray, List[str], List[str]]]:
    """1ドキュメント分の情報を生成する"""
    try:
        vectors = [
            word_emb_obj.get_vector(w[0]) for w in input_obj['title_morphs']
            if w[0] in word_emb_obj.wv.vocab
        ]
        if len(vectors) == 0:
            logging.warning(
                'No token is in word2vec model. title-morphs = {}'.format(
                    input_obj['title_morphs']))
            average_vector = numpy.zeros(word_emb_obj.wv.vector_size)
        else:
            average_vector = numpy.mean(vectors, axis=0)
    except Exception as e:
        logging.error(e)
        return None
    else:
        __document_morphs = [t[0] for t in input_obj['morphs']]
        return (input_obj['file_name'], input_obj['category'], average_vector,
                __document_morphs, input_obj['title_morphs'])
Example n. 13
def generate_beatmap(beatmap_df: pd.DataFrame, seq: BeatmapSequence,
                     stateful_model: Model,
                     action_model: gensim.models.KeyedVectors,
                     word_id_dict: Dict[str, int], config: Config):
    most_recent = {
        col: seq[0][0][col][:, 0:1]
        for col in stateful_model.input_names
    }  # initial beat
    output_names = [f'prev_{name}'
                    for name in stateful_model.output_names]  # for TF 2.1 compatibility
    reverse_word_id_dict = {val: key for key, val in word_id_dict.items()}

    # Reset the whole seq.data columns except for the first action to prevent information leaking
    for col in product(['', 'prev_'],
                       ['word_id', 'word_vec'] + config.dataset.beat_elements):
        seq.data[''.join(col)][:, 1:, :] = 0.0

    start = time()
    total_len = len(beatmap_df) - 1
    temperature = config.generation.temperature  # TODO: change to config.generation.temperature(0)
    for i in range(len(beatmap_df) - 1):
        elapsed = time() - start
        print(
            f'\r{i:4}: {int(elapsed):3} / ~{int(elapsed * total_len / (i + 1)):3} s',
            end='',
            flush=True)
        pred = stateful_model.predict(most_recent)

        # word_vec to word_id prob
        if 'word_vec' in stateful_model.output_names:
            closest_words = action_model.similar_by_vector(
                pred['word_vec'].flatten(), topn=30, restrict_vocab=None)

            pred['word_id'] = np.zeros(
                (1, 1, config.dataset.num_classes['word_id']))
            for word, distance in closest_words:
                pred['word_id'][:, :, word_id_dict[word]] = distance

        update_next(i, pred, seq, temperature, config)

        update_action_representations(i, action_model, seq, word_id_dict, pred,
                                      reverse_word_id_dict, config)

        if set(stateful_model.output_names) >= set(
                config.dataset.beat_elements):
            clip_next_to_closest_existing(i, action_model, seq, word_id_dict,
                                          config)

        # get last action in the correct format
        most_recent = {
            col: seq[0][0][col][:, i + 1:i + 2]
            for col in stateful_model.input_names
        }

        # Experiment with moving temperature based on AVD distance. Needs further research
        # temperature = responsive_temperature(seq, temperature, i)

    save_velocity_hist(seq, config)
    beatmap_df = predictions2df(beatmap_df, seq)
    # beatmap_df = append_last_prediction(beatmap_df, most_recent)    # TODO: Remove if unnecessary

    for col in stateful_model.output_names:
        beatmap_df[col] = beatmap_df[f'prev_{col}']

    return beatmap_df[stateful_model.output_names]  # output only the generated columns
Example n. 14
 def save(word_vectors: gensim.models.KeyedVectors, path: str):
     try:
         word_vectors.save(get_tmpfile(path))
     except FileNotFoundError:
         word_vectors.save(path)
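The try/except first resolves the name inside gensim's temporary directory via `get_tmpfile` (imported from `gensim.test.utils`) and falls back to treating the argument as a literal path when that fails. A usage sketch, assuming the helper is exposed as a static method or module-level function; the filename is a placeholder:

kv = gensim.models.KeyedVectors.load('some_model.kv')  # hypothetical path
save(kv, 'aligned_vectors.kv')  # lands in gensim's temp dir, or at the literal path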
Example n. 15
def wmd(row, model: gensim.models.KeyedVectors):
    """Word Mover's Distance between a row's two questions, with stop words removed."""
    swords = get_stop_words()
    q1 = [word for word in str(row['question1']).split() if word not in swords]
    q2 = [word for word in str(row['question2']).split() if word not in swords]
    return model.wmdistance(q1, q2)
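Applied row-wise to a question-pair DataFrame. The column names match the Quora duplicate-questions layout implied above; `get_stop_words` is the helper this snippet already relies on, `kv` stands for a loaded KeyedVectors, and older gensim needs the `pyemd` package for `wmdistance`:

import pandas as pd

df = pd.DataFrame({'question1': ['how do planes fly'],
                   'question2': ['what keeps airplanes in the air']})
df['wmd'] = df.apply(wmd, axis=1, model=kv)  # one Word Mover's Distance per row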