Code example #1
File: test_keyedvectors.py  Project: EricM2/venv
    def test_no_header(self):
        # Build a KeyedVectors instance holding 20 pseudo-random 100-dim vectors.
        randkv = KeyedVectors(vector_size=100)
        count = 20
        keys = [str(i) for i in range(count)]
        weights = [pseudorandom_weak_vector(randkv.vector_size) for _ in range(count)]
        randkv.add_vectors(keys, weights)
        # Save without the leading "count vector_size" header line, then reload,
        # letting gensim infer both values from the file itself.
        tmpfiletxt = gensim.test.utils.get_tmpfile("tmp_kv.txt")
        randkv.save_word2vec_format(tmpfiletxt, binary=False, write_header=False)
        reloadtxtkv = KeyedVectors.load_word2vec_format(tmpfiletxt, binary=False, no_header=True)
        self.assertEqual(randkv.index_to_key, reloadtxtkv.index_to_key)
        self.assertTrue((randkv.vectors == reloadtxtkv.vectors).all())
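This test round-trips vectors through a headerless text file: write_header=False omits the usual "count vector_size" first line on save, and no_header=True (gensim 4.x) tells the loader to infer the vocabulary size and dimensionality from the file itself.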
Code example #2
File: doc2vec.py  Project: chitang1990/Gensim-3.1.0
    def save_word2vec_format(self,
                             fname,
                             doctag_vec=False,
                             word_vec=True,
                             prefix='*dt_',
                             fvocab=None,
                             binary=False):
        """
        Store the input-hidden weight matrix.

         `fname` is the file used to save the vectors in
         `doctag_vec` is an optional boolean indicating whether to store document vectors
         `word_vec` is an optional boolean indicating whether to store word vectors
         (if both doctag_vec and word_vec are True, then both vectors are stored in the same file)
         `prefix` to uniquely identify doctags from word vocab, and avoid collision
         in case of repeated string in doctag and word vocab
         `fvocab` is an optional file used to save the vocabulary
         `binary` is an optional boolean indicating whether the data is to be saved
         in binary word2vec format (default: False)

        """
        total_vec = len(self.wv.vocab) + len(self.docvecs)
        # save word vectors
        if word_vec:
            if not doctag_vec:
                total_vec = len(self.wv.vocab)
            KeyedVectors.save_word2vec_format(self.wv, fname, fvocab, binary,
                                              total_vec)
        # save document vectors
        if doctag_vec:
            with utils.smart_open(fname, 'ab') as fout:
                if not word_vec:
                    total_vec = len(self.docvecs)
                    logger.info("storing %sx%s projection weights into %s",
                                total_vec, self.vector_size, fname)
                    fout.write(
                        utils.to_utf8("%s %s\n" %
                                      (total_vec, self.vector_size)))
                # store in input order
                for i in range(len(self.docvecs)):
                    doctag = u"%s%s" % (prefix,
                                        self.docvecs.index_to_doctag(i))
                    row = self.docvecs.doctag_syn0[i]
                    if binary:
                        fout.write(
                            utils.to_utf8(doctag) + b" " + row.tostring())
                    else:
                        fout.write(
                            utils.to_utf8("%s %s\n" %
                                          (doctag, ' '.join("%f" % val
                                                            for val in row))))
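A minimal usage sketch for the method above, assuming the gensim 3.x-era Doc2Vec API; the toy corpus, file name, and training parameters are illustrative:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Train a tiny model: two tagged documents, 50-dimensional vectors.
docs = [TaggedDocument(words=["human", "interface", "computer"], tags=["doc0"]),
        TaggedDocument(words=["graph", "trees", "minors"], tags=["doc1"])]
model = Doc2Vec(docs, vector_size=50, min_count=1, epochs=10)

# Word vectors and doctag vectors end up in the same file; each doctag is
# written with the '*dt_' prefix so it cannot collide with a word.
model.save_word2vec_format("model_vectors.txt", doctag_vec=True, word_vec=True)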
Code example #3
def create_small_w2v_model(num_most_common_words=500000, cache_dir=W2VDIR):
    orig_model = load_word_vector_model(small=False, cache_dir=cache_dir)
    # index2entity lists keys in descending frequency order (gensim 3.x),
    # so slicing keeps the most common words.
    words = orig_model.index2entity[:num_most_common_words]

    kv = KeyedVectors(vector_size=orig_model.wv.vector_size)

    vectors = []
    for word in words:
        vectors.append(orig_model.get_vector(word))

    # add keys (words) & vectors as a single batch
    kv.add(words, vectors)

    w2v_small_filename = fewshot_filename(cache_dir, W2V_SMALL)
    kv.save_word2vec_format(w2v_small_filename, binary=True)
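This example targets the gensim 3.x API; in gensim 4.0 and later, index2entity was renamed index_to_key and KeyedVectors.add() became add_vectors() (compare code example #1 above).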
Code example #4
def _load_small_word_vector_model(cache_dir, num_most_common_words=500000):
    filename = fewshot_filename(cache_dir, W2V_SMALL)
    if not os.path.exists(filename):
        # Cache miss: build the small model once, then reuse it from disk.
        orig_model = _load_large_word_vector_model(cache_dir)
        words = orig_model.index2entity[:num_most_common_words]

        kv = KeyedVectors(vector_size=orig_model.wv.vector_size)

        vectors = []
        for word in words:
            vectors.append(orig_model.get_vector(word))

        # add keys (words) & vectors as a single batch
        kv.add(words, vectors)

        w2v_small_filename = fewshot_filename(cache_dir, W2V_SMALL)
        kv.save_word2vec_format(w2v_small_filename, binary=True)

    return KeyedVectors.load_word2vec_format(filename, binary=True)
Code example #5
def minimize(_log, vectors_path="wiki.en.vec", output_path="wiki.min.en.vec"):
    """Minimize the given vectors file to contain only words in the given corpus."""
    samples = {wh: list(read_samples(which=wh)) for wh in ["train", "test"]}
    try:
        samples["dev"] = list(read_samples(which="dev"))
    except FileNotFoundError:
        pass  # skip if the dev split does not exist

    vocab = Vocab.from_samples(chain(*samples.values()))
    kv = KeyedVectors.load_word2vec_format(vectors_path)

    _log.info("Creating new, minimized word vectors")
    min_kv = KeyedVectors(kv.vector_size)
    for w in kv.vocab:  # kv.vocab is a dict of words in gensim 3.x (key_to_index in 4.x)
        if w in vocab["words"]:
            min_kv[w] = kv[w]  # __setitem__ adds the key and its vector

    _log.info("Saving the new word vectors to %s", output_path)
    min_kv.save_word2vec_format(output_path)
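Each min_kv[w] = kv[w] assignment adds one key and vector at a time, re-allocating the backing array on every call; for large overlaps it is usually faster to collect the matching keys first and add them in a single batch, e.g. keep = [w for w in kv.vocab if w in vocab["words"]] followed by min_kv.add(keep, [kv[w] for w in keep]).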
Code example #6
File: generate.py  Project: xingjian-zhang/WordEmbed
# Restrict the co-occurrence matrix to the rows and columns of the frequent vocab.
indices = np.array(list(freq_vocab.values()))
M_ = (M[indices].T)[indices].T
M_.shape  # notebook-style inspection of the reduced matrix

# Element-wise square root of the reduced matrix before the Arnoldi iteration
# (the .sqrt() method suggests M_ is a scipy sparse matrix).
sqrt_M = M_.sqrt()

# Construct word embeddings by Arnoldi iteration
Q_file = f"./tmp/{tag}_Q_{win_size}.npy"
if os.path.exists(Q_file):
    Q = np.load(Q_file)
    logging.info("Successfully loaded embeddings matrix from %s.", Q_file)
else:
    logging.info("Starting Arnoldi iterations.")
    b = np.random.random(size=max_vocab)  # random initial vector for the Krylov basis
    Q, h = arnoldi_iteration(sqrt_M, b, embed_dim)
    logging.info("Successfully extracted word embeddings from Arnoldi iteration.")
    np.save(Q_file, Q)  # save the word embedding matrix
    logging.info("Successfully saved word embedding matrix Q to %s.", Q_file)

dim = embed_dim
word2vec_file = f"./tmp/arnodi_{dim}_{win_size}.kv"
# Keep the first `dim` basis vectors and L2-normalize each row before storing.
Q_ = Q[:, :dim]
we_ = normalize(Q_, axis=1, norm="l2")
kv = KeyedVectors(vector_size=dim)
kv.add(list(vocab_.keys()), we_)
kv.save_word2vec_format(word2vec_file)
logging.info("Successfully saved word embeddings of dimension %d to %s.", dim,
             word2vec_file)
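Once saved, the embeddings can be reloaded and queried like any word2vec model, e.g. kv = KeyedVectors.load_word2vec_format(word2vec_file) followed by kv.most_similar(...) for nearest-neighbor lookups.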
Code example #7
File: convert2bin.py  Project: Lundez/Summarizer
from gensim.models.keyedvectors import KeyedVectors

if __name__ == '__main__':
    # Convert vectors stored in word2vec text format into the binary format.
    w2v = KeyedVectors.load_word2vec_format('glove500_w2v.txt',
                                            binary=False,
                                            unicode_errors='ignore')
    w2v.save_word2vec_format('glove500_w2v.bin', binary=True)
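With binary=True, each vector is stored as raw float32 bytes instead of formatted text, which makes the resulting file substantially smaller and faster for gensim to load.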
Code example #8
import logging

from gensim.models import KeyedVectors


def save_w2v_model(file_name: str, w2v_model: KeyedVectors):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    w2v_model.save_word2vec_format(file_name, binary=True)
    print("word2vec model '%s' saved" % file_name)
    return w2v_model
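The saved model can be loaded back with KeyedVectors.load_word2vec_format(file_name, binary=True), matching the binary=True flag used at save time.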