Code Example #1
File: generate_candidates.py  Project: lfurrer/tzlink
    def _create_pv(self):
        try:
            vectors = KeyedVectors()
        except TypeError:
            # Newer versions of gensim require a constructor argument.
            vectors = KeyedVectors(self._wv.shape[1])
        phrases = []
        for name in self.terminology.iter_names():
            # This iterates over unique names.
            vectors.vocab[name] = Vocab(index=len(vectors.vocab), count=None)
            vectors.index2word.append(name)
            phrases.append(self._phrase_vector(name))
        vectors.syn0 = vectors.syn0norm = np.array(phrases)
        return vectors
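
Note: the vocab, index2word, and syn0 attributes written above are the gensim 3.x API. As a minimal sketch (not the project's code), the same construction on gensim 4.x would go through add_vectors(); iter_names() and _phrase_vector() stand in for the project's own helpers:

import numpy as np
from gensim.models import KeyedVectors

def create_pv(names, phrase_vector, dim):
    # Collect one precomputed phrase vector per unique name, then add them in bulk.
    names = list(names)  # the name source may be a generator
    vectors = KeyedVectors(vector_size=dim)
    vectors.add_vectors(names, np.array([phrase_vector(name) for name in names]))
    return vectors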
Code Example #2
File: test_mlp_embdeddings.py  Project: gorosgobe/nlp
def test_average_embeddings_custom():
    corpus = [
        ["the", "man", "ran"],
        ["the", "boy"],
        ["the", "man", "boy"],
    ]

    max_sent_len = 3

    model = KeyedVectors(vector_size=1)

    model.add(["the"], [np.array([1])])
    model.add(["man"], [np.array([2])])
    model.add(["ran"], [np.array([3])])
    model.add(["boy"], [np.array([6])])
    model.add([PAD_TOK], [np.array([0])])

    embedding_input = lib.embeddings.get_embedding_input(corpus, model, max_sent_len)

    avg_embedding_model = lib.mlp.get_average_embedding_model(
        input_shape=embedding_input.shape[1:],
        w2v_model=model,
    )

    avg = avg_embedding_model.predict(embedding_input)

    print(avg)

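    # The averages cover real tokens only: sentence 2 has two words, so (1 + 6) / 2, not / 3.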
    assert avg[0][0] == (1 + 2 + 3) / 3
    assert avg[1][0] == (1 + 6) / 2
    assert avg[2][0] == (1 + 2 + 6) / 3
Code Example #3
def main():
    config = load_config()
    with open(os.path.join(config.cooccurrence_dir, "vocab.pkl"), "rb") as f:
        vocab = pickle.load(f)

    model = GloVe(
        vocab_size=config.vocab_size,
        embedding_size=config.embedding_size,
        x_max=config.x_max,
        alpha=config.alpha
    )
    model.load_state_dict(torch.load(config.output_filepath))
    
    keyed_vectors = KeyedVectors(vector_size=config.embedding_size)
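    # GloVe's final word vectors are conventionally the sum of the main and context ("tilde") embeddings.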
    keyed_vectors.add_vectors(
        keys=[vocab.get_token(index) for index in range(config.vocab_size)],
        weights=(model.weight.weight.detach()
            + model.weight_tilde.weight.detach()).numpy()
    )
    
    print("How similar are man and woman:")
    print(keyed_vectors.similarity("woman", "man"))
    print("How similar are man and apple:")
    print(keyed_vectors.similarity("apple", "man"))
    print("How similar are woman and apple:")
    print(keyed_vectors.similarity("apple", "woman"))
    for word in ["computer", "united", "early"]:
        print(f"Most similar words to {word}:")
        most_similar_words = [w for w, _ in keyed_vectors.similar_by_word(word)]
        print(most_similar_words)
Code Example #4
def test_hierarchical_pool():
    tokens: List[str] = ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']
    kv: KeyedVectors = KeyedVectors(vector_size=200)

    word_embeds: np.ndarray = models._word_embeds(tokens, kv, (-1, 1))
    ret: np.ndarray = models._hierarchical_pool(word_embeds, num_windows=3)
    assert ret.shape == (200, )
Code Example #5
def test_embeddings_input():
    corpus = [
        ["the", "man", "ran", "to", "boy"],
        ["the", "boy", "pig"],
        ["the", "pig", "man", "boy"],
    ]

    # "pig" and "to" are not in the vocabulary
    model = KeyedVectors(vector_size=2)
    model.add(["the"], [np.array([1, 3])])  # idx 0
    model.add(["man"], [np.array([2, 5])])  # idx 1
    model.add(["ran"], [np.array([6, 9])])  # idx 2
    model.add(["boy"], [np.array([4, 20])])  # idx 3
    model.add([PAD_TOK], [np.array([0, 0])])  # idx 4

    embedding_input = lib.embeddings.get_embedding_input(corpus, model)

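    # OOV tokens are dropped and shorter sentences are padded with PAD_TOK (index 4).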
    assert (embedding_input == np.array([
        [0, 1, 2, 3],
        [0, 3, 4, 4],
        [0, 1, 3, 4],
    ])).all()

    embedding_layer = lib.embeddings.get_keras_embedding(model)

    embedded = embedding_layer(embedding_input)

    assert (embedded.numpy() == np.array([[[1., 3.], [2., 5.], [6., 9.],
                                           [4., 20.]],
                                          [[1., 3.], [4., 20.], [0., 0.],
                                           [0., 0.]],
                                          [[1., 3.], [2., 5.], [4., 20.],
                                           [0., 0.]]])).all()
Code Example #6
    def create_keyedvector_from_matrix(self, embedding_matrix, word2id):
        """
        Imports the necessary attributes for the Embedding object from an
        embedding matrix and a word2id vocabulary. Can be used for custom
        pre-trained embeddings.

        Parameters
        ----------
        embedding_matrix : numpy.ndarray
            Embedding matrix as a numpy object
        word2id : dict
            Word vocabulary (key: word, value: word_index)
        """

        vocab = {
            word: word2id[word]
            for word in sorted(word2id, key=word2id.__getitem__, reverse=False)
        }
        vector_size = embedding_matrix.shape[1]

        kv = KeyedVectors(vector_size)
        kv.vector_size = vector_size
        kv.vectors = embedding_matrix

        kv.index2word = list(vocab.keys())

        kv.vocab = {
            word: Vocab(index=word_id, count=0)
            for word, word_id in vocab.items()
        }

        self.embedding = kv
Code Example #7
def test_hierarchical_pool_raise():
    """Raises ValueError when an invalid num_windows is passed."""
    doc: str = '桃'
    kv: KeyedVectors = KeyedVectors(vector_size=200)
    word_embeds = models._word_embeds(doc, kv, (-1, 1))
    with pytest.raises(ValueError):
        # text_length: 1, num_windows: 3
        models._hierarchical_pool(word_embeds, num_windows=3)
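
Note: _hierarchical_pool is internal to the project and not shown here. A textbook sketch of SWEM-style hierarchical pooling (mean-pool each n-gram window, then max-pool across windows), consistent with the two tests above, could look like:

import numpy as np

def hierarchical_pool(word_embeds: np.ndarray, num_windows: int) -> np.ndarray:
    # word_embeds has shape (text_length, dim).
    text_len = word_embeds.shape[0]
    if num_windows > text_len:
        raise ValueError(f"num_windows ({num_windows}) exceeds text length ({text_len})")
    # Mean-pool every contiguous window of num_windows tokens ...
    window_means = np.array([
        word_embeds[i:i + num_windows].mean(axis=0)
        for i in range(text_len - num_windows + 1)
    ])
    # ... then max-pool across the windows, yielding shape (dim,).
    return window_means.max(axis=0)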
Code Example #8
File: test_keyedvectors.py  Project: EricM2/venv
    def test_add_type(self):
        kv = KeyedVectors(2)
        assert kv.vectors.dtype == REAL

        words, vectors = ["a"], np.array([1., 1.], dtype=np.float64).reshape(1, -1)
        kv.add_vectors(words, vectors)

        assert kv.vectors.dtype == REAL
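
Note: REAL is gensim's alias for numpy.float32, so this test checks that the float64 input vectors are down-cast on add.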
Code Example #9
File: word2vec.py  Project: watereals/ShallowLearn
    def __init__(self, max_vocab_size, min_count, sample,
                 estimate_memory):
        self.max_vocab_size = max_vocab_size
        self.corpus_count = 0
        self.raw_vocab = None
        self.wv = KeyedVectors()
        self.min_count = min_count
        self.sample = sample
        self.estimate_memory = estimate_memory
Code Example #10
def truncate_w2v(w2v, new_dims):
    """Limit w2v to the specified number of dimensions, selected at random"""
    old_dims = w2v.vectors.shape[1]
    new_w2v = KeyedVectors(new_dims)
    vocab = list(w2v.vocab.keys())
    cols_idx = np.random.choice(old_dims, size=new_dims, replace=False)
    weights = w2v.vectors[:, cols_idx]
    new_w2v.add(vocab, weights)
    return new_w2v
Code Example #11
File: test_keyedvectors.py  Project: EricM2/venv
    def test_no_header(self):
        randkv = KeyedVectors(vector_size=100)
        count = 20
        keys = [str(i) for i in range(count)]
        weights = [pseudorandom_weak_vector(randkv.vector_size) for _ in range(count)]
        randkv.add_vectors(keys, weights)
        tmpfiletxt = gensim.test.utils.get_tmpfile("tmp_kv.txt")
        randkv.save_word2vec_format(tmpfiletxt, binary=False, write_header=False)
        reloadtxtkv = KeyedVectors.load_word2vec_format(tmpfiletxt, binary=False, no_header=True)
        self.assertEqual(randkv.index_to_key, reloadtxtkv.index_to_key)
        self.assertTrue((randkv.vectors == reloadtxtkv.vectors).all())
Code Example #12
def test_infer_vector_functional():
    tokens: List[str] = ['私', 'は', '私', 'は']
    kv: KeyedVectors = KeyedVectors(vector_size=200)
    methods: Dict[str, Tuple[int]] = {
        'avg': (200, ),
        'max': (200, ),
        'concat': (400, ),
        'hierarchical': (200, )
    }
    for method, shape in methods.items():
        embed: np.ndarray = swem.infer_vector(tokens, kv=kv, method=method)
        assert embed.shape == shape
Code Example #13
File: embed_base.py  Project: kafku/mmeigenwords
    def _set_keyedvector(self, attrname, keys, dim, vec=None):
        keyed_vec = KeyedVectors(dim)
        dummy_max_count = len(keys) + 1
        for i, key in enumerate(keys):
            key = str(key)
            keyed_vec.vocab[key] = Vocab(index=i, count=dummy_max_count - i) # dummy count
            keyed_vec.index2word.append(key)

        if vec is not None:
            keyed_vec.vectors = vec
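            # init_sims() caches the unit-normalised vectors; gensim 4 deprecates it in favour of fill_norms().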
            keyed_vec.init_sims()

        setattr(self, attrname, keyed_vec)
Code Example #14
def create_small_w2v_model(num_most_common_words=500000, cache_dir=W2VDIR):
    orig_model = load_word_vector_model(small=False, cache_dir=cache_dir)
    words = orig_model.index2entity[:num_most_common_words]

    kv = KeyedVectors(vector_size=orig_model.wv.vector_size)

    vectors = []
    for word in words:
        vectors.append(orig_model.get_vector(word))

    # adds keys (words) & vectors as batch
    kv.add(words, vectors)

    w2v_small_filename = fewshot_filename(cache_dir, W2V_SMALL)
    kv.save_word2vec_format(w2v_small_filename, binary=True)
Code Example #15
    def update(self):
        wv = self.word_vectors_file.get_word_vectors()
        voc = self.vocabs_file.get_vocabs()['word']
        words_in_vocab = [
            k for k, _ in sorted(voc.items(), key=lambda i: i[1][0])
        ]
        word_embs = wv[words_in_vocab[1:]]
        unk_emb = np.mean(word_embs, 0, keepdims=True)
        embs = np.concatenate((unk_emb, word_embs), 0)
        kv = KeyedVectors(embs.shape[1])
        kv.syn0 = embs
        kv.vocab = dict(
            (k, Vocab(index=v[0], count=v[1])) for k, v in voc.items())
        kv.index2word = words_in_vocab
        kv.save(self.path)
Code Example #16
def minimize(_log, vectors_path="wiki.en.vec", output_path="wiki.min.en.vec"):
    """Minimize the given vectors file to contain only words in the given corpus."""
    samples = {wh: list(read_samples(which=wh)) for wh in ["train", "test"]}
    try:
        samples["dev"] = list(read_samples(which="dev"))
    except FileNotFoundError:
        pass  # skip if not exist

    vocab = Vocab.from_samples(chain(*samples.values()))
    kv = KeyedVectors.load_word2vec_format(vectors_path)

    _log.info("Creating new, minimized word vectors")
    min_kv = KeyedVectors(kv.vector_size)
    for w in kv.vocab:
        if w in vocab["words"]:
            min_kv[w] = kv[w]

    _log.info("Saving the new word vectors to %s", output_path)
    min_kv.save_word2vec_format(output_path)
Code Example #17
def _load_small_word_vector_model(cache_dir, num_most_common_words=500000):
    filename = fewshot_filename(cache_dir, W2V_SMALL)
    if not os.path.exists(filename):
        orig_model = _load_large_word_vector_model(cache_dir)
        words = orig_model.index2entity[:num_most_common_words]

        kv = KeyedVectors(vector_size=orig_model.wv.vector_size)

        vectors = []
        for word in words:
            vectors.append(orig_model.get_vector(word))

        # adds keys (words) & vectors as batch
        kv.add(words, vectors)

        w2v_small_filename = fewshot_filename(cache_dir, W2V_SMALL)
        kv.save_word2vec_format(w2v_small_filename, binary=True)

    return KeyedVectors.load_word2vec_format(filename, binary=True)
Code Example #18
def get_kv(model_path, embed_size=None):
    if embed_size is None:
        embed_size = int(''.join(c for c in model_path if c.isdigit()))

    w2v = KeyedVectors(embed_size)
    model = tf.keras.models.load_model(model_path)

    all_weights = model.get_weights()
    # input_weights = all_weights[0]
    # embed_bias = all_weights[1]
    output_weights = all_weights[2].T
    # output_bias = all_weights[3]

    index = list(id2lib)
    weights = output_weights

    w2v.add(index, weights)
    return w2v
Code Example #19
File: test_keyedvectors.py  Project: EricM2/venv
    def test_add_single(self):
        """Test that adding entities one at a time works correctly."""
        entities = [f'___some_entity{i}_not_present_in_keyed_vectors___' for i in range(5)]
        vectors = [np.random.randn(self.vectors.vector_size) for _ in range(5)]

        # Test `add` on already filled kv.
        for ent, vector in zip(entities, vectors):
            self.vectors.add_vectors(ent, vector)

        for ent, vector in zip(entities, vectors):
            self.assertTrue(np.allclose(self.vectors[ent], vector))

        # Test `add` on empty kv.
        kv = KeyedVectors(self.vectors.vector_size)
        for ent, vector in zip(entities, vectors):
            kv.add_vectors(ent, vector)

        for ent, vector in zip(entities, vectors):
            self.assertTrue(np.allclose(kv[ent], vector))
Code Example #20
File: test_keyedvectors.py  Project: EricM2/venv
    def test_add_multiple(self):
        """Test that adding entities in bulk works correctly."""
        entities = ['___some_entity{}_not_present_in_keyed_vectors___'.format(i) for i in range(5)]
        vectors = [np.random.randn(self.vectors.vector_size) for _ in range(5)]

        # Test `add` on already filled kv.
        vocab_size = len(self.vectors)
        self.vectors.add_vectors(entities, vectors, replace=False)
        self.assertEqual(vocab_size + len(entities), len(self.vectors))

        for ent, vector in zip(entities, vectors):
            self.assertTrue(np.allclose(self.vectors[ent], vector))

        # Test `add` on empty kv.
        kv = KeyedVectors(self.vectors.vector_size)
        kv[entities] = vectors
        self.assertEqual(len(kv), len(entities))

        for ent, vector in zip(entities, vectors):
            self.assertTrue(np.allclose(kv[ent], vector))
Code Example #21
def load_keyedvectors(**kwargs):
    embeddings = load_embeddings(**kwargs)
    vectors = KeyedVectors(int(kwargs["size"]))
    vectors.add(embeddings.index, embeddings.values)
    return vectors
Code Example #22
def test_word_embeds():
    tokens: List[str] = ['私', 'は']
    kv: KeyedVectors = KeyedVectors(vector_size=200)
    embed = models._word_embeds(tokens, kv=kv, uniform_range=(-0.01, 0.01))
    assert embed.shape == (2, 200)
Code Example #23
def test_word_embed():
    token: str = '私'
    kv: KeyedVectors = KeyedVectors(vector_size=200)
    embed = models._word_embed(token, kv=kv)
    assert embed.shape == (200, )
Code Example #24
File: word2vec.py  Project: watereals/ShallowLearn
    def __init__(self, vocab):
        self.wv = KeyedVectors()
        self.wv.vocab = vocab
Code Example #25
File: generate.py  Project: xingjian-zhang/WordEmbed
indices = np.array(list(freq_vocab.values()))
M_ = (M[indices].T)[indices].T
M_.shape

# Element-wise square root of M_ before the Arnoldi iteration
sqrt_M = M_.sqrt()

# Construct word embeddings by Arnoldi iteration
Q_file = f"./tmp/{tag}_Q_{win_size}.npy"
if os.path.exists(Q_file):
    Q = np.load(Q_file)
    logging.info("Successfully loaded embeddings matrix from %s.", Q_file)
else:
    logging.info("Start arnoldi iterations.")
    b = np.random.random(size=max_vocab)  # initial vector
    Q, h = arnoldi_iteration(sqrt_M, b, embed_dim)
    logging.info(
        "Successfully extracted word embeddings from arnoldi iteration.")
    np.save(Q_file, Q)  # save Word embeddings
    logging.info("Successfully saved word embedding matrix Q to %s.", Q_file)

dim = embed_dim
word2vec_file = f"./tmp/arnodi_{dim}_{win_size}.kv"
Q_ = Q[:, :dim]
we_ = normalize(Q_, axis=1, norm="l2")
kv = KeyedVectors(vector_size=dim)
kv.add(list(vocab_.keys()), we_)
kv.save_word2vec_format(word2vec_file)
logging.info("Successfully saved word embeddings of dimension %d to %s.", dim,
             word2vec_file)
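
Note: arnoldi_iteration is defined elsewhere in this project. A standard textbook implementation, assuming it returns the orthonormal Krylov basis Q and the Hessenberg matrix h, is sketched below; it works for dense arrays and scipy sparse matrices alike:

import numpy as np

def arnoldi_iteration(A, b, n):
    # Build an orthonormal basis Q of the Krylov subspace span{b, Ab, ..., A^n b}.
    m = b.shape[0]
    Q = np.zeros((m, n + 1))
    h = np.zeros((n + 1, n))
    Q[:, 0] = b / np.linalg.norm(b)
    for k in range(n):
        v = A @ Q[:, k]
        # Orthogonalise v against the current basis (modified Gram-Schmidt).
        for j in range(k + 1):
            h[j, k] = Q[:, j] @ v
            v = v - h[j, k] * Q[:, j]
        h[k + 1, k] = np.linalg.norm(v)
        if h[k + 1, k] < 1e-12:  # breakdown: the Krylov subspace is exhausted
            return Q, h
        Q[:, k + 1] = v / h[k + 1, k]
    return Q, h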
Code Example #26
    def setUp(self):
        kv: KeyedVectors = KeyedVectors(vector_size=200)
        self.swem = models.SWEM(kv)
Code Example #27
    def initialize_word_vectors(self):
        # word
        self.wv = KeyedVectors()
        # category
        self.cv = KeyedVectors()
Code Example #28
def get_random_kv(embed_size):
    w2v = KeyedVectors(embed_size)
    w2v.add(list(id2lib), np.random.random((len(id2lib), embed_size)))
    return w2v