def test_word_too_long(self, folder_path):
        """ Ensure we do not get an exception if
        attempting to write a word longer than
        LMDB's maximum key size.

        :param folder_path: directory the database is written to
            (presumably provided by a fixture defined elsewhere).
        :return void
        """
        # 1000 characters is the over-long key this test exercises.
        oversized_word = 'a' * 1000

        # np.random.rand gives an initialised vector; np.ndarray(10) would
        # hand back whatever happened to be in the allocated memory.
        LmdbEmbeddingsWriter([
            (oversized_word, np.random.rand(10)),
        ]).write(folder_path)
    def test_write_embeddings(self, folder_path):
        """ Check that a small list of embeddings can be
        written to disk and that the target folder is
        not empty afterwards.

        :return void
        """
        embeddings = [
            ('the', np.random.rand(10)),
            ('is', np.random.rand(10)),
        ]
        writer = LmdbEmbeddingsWriter(embeddings)
        writer.write(folder_path)

        written_entries = os.listdir(folder_path)
        assert written_entries
Beispiel #3
0
    def test_word_too_long(self, tmp_path):
        """ Check that writing a word longer than LMDB's maximum key size does not raise.

        :param pathlib.PosixPath tmp_path:
        :return void:
        """
        target_directory = str(tmp_path)
        oversized_word = 'a' * 1000
        embeddings = [(oversized_word, np.random.rand(10))]

        writer = LmdbEmbeddingsWriter(embeddings)
        writer.write(target_directory)
Beispiel #4
0
    def test_write_embeddings_generator(self, folder_path):
        """ Ensure we can write a generator of embeddings
        to disk without error.

        :param folder_path: directory the database is written to
            (presumably provided by a fixture defined elsewhere).
        :return void
        """
        # np.random.rand yields initialised vectors; np.ndarray(10) would
        # return uninitialised memory with arbitrary contents.
        embeddings_generator = (
            (str(i), np.random.rand(10)) for i in range(10)
        )

        LmdbEmbeddingsWriter(embeddings_generator).write(folder_path)

        # The write must have produced at least one entry in the folder.
        assert os.listdir(folder_path)
    def test_reading_embeddings(self, folder_path):
        """ Check that a vector written to the database
        is read back with the same values.

        :return void
        """
        expected_vector = np.random.rand(10)
        embeddings = [
            ('the', expected_vector),
            ('is', np.random.rand(10)),
        ]
        LmdbEmbeddingsWriter(embeddings).write(folder_path)

        reader = LmdbEmbeddingsReader(folder_path)
        stored_vector = reader.get_word_vector('the')

        assert stored_vector.tolist() == expected_vector.tolist()
Beispiel #6
0
    def test_write_embeddings_generator(self, tmp_path):
        """ Check that embeddings supplied lazily by a generator are written to disk.

        :param pathlib.PosixPath tmp_path:
        :return void:
        """
        def generate_embeddings():
            # Ten (word, vector) pairs keyed by the stringified index.
            for index in range(10):
                yield str(index), np.random.rand(10)

        target_directory = str(tmp_path)
        writer = LmdbEmbeddingsWriter(generate_embeddings())
        writer.write(target_directory)

        written_entries = os.listdir(target_directory)
        assert written_entries
Beispiel #7
0
    def test_write_embeddings(self, tmp_path):
        """ Check that a list of embeddings is written to disk and leaves files behind.

        :param pathlib.PosixPath tmp_path:
        :return void:
        """
        target_directory = str(tmp_path)
        embeddings = [
            ('the', np.random.rand(10)),
            ('is', np.random.rand(10)),
        ]

        writer = LmdbEmbeddingsWriter(embeddings)
        writer.write(target_directory)

        written_entries = os.listdir(target_directory)
        assert written_entries
Beispiel #8
0
    def test_reading_embeddings(self, tmp_path, reader_class):
        """ Check that a stored vector is returned unchanged by the reader.

        :param pathlib.PosixPath tmp_path:
        :param reader_class: callable that builds a reader from the directory path.
        :return void:
        """
        target_directory = str(tmp_path)
        expected_vector = np.random.rand(10)
        embeddings = [
            ('the', expected_vector),
            ('is', np.random.rand(10)),
        ]

        LmdbEmbeddingsWriter(embeddings).write(target_directory)

        reader = reader_class(target_directory)
        stored_vector = reader.get_word_vector('the')

        assert stored_vector.tolist() == expected_vector.tolist()
    def test_missing_word_error(self, folder_path):
        """ Check that looking up a word that was never
        written raises MissingWordError.

        :return void
        """
        embeddings = [
            ('the', np.random.rand(10)),
            ('is', np.random.rand(10)),
        ]
        writer = LmdbEmbeddingsWriter(embeddings)
        writer.write(folder_path)

        reader = LmdbEmbeddingsReader(folder_path)
        expected_error = exceptions.MissingWordError

        with pytest.raises(expected_error):
            reader.get_word_vector('unknown')
Beispiel #10
0
def write_gensim_to_lmdb():
    """ Load a saved gensim KeyedVectors model and write every
    word vector in its vocabulary to an LMDB database.

    Both the model path and the output path are hard-coded relative
    to the current working directory.

    :return void
    """
    embed_path = "./data/enwiki_20180420_300d.bin"
    lmdb_write_path = './data/tecent_lmdb'

    print("loading gensim model...")
    gensim_model = gensim.models.KeyedVectors.load(embed_path, mmap='r')

    def iter_embeddings():
        # Yield (word, vector) pairs lazily so the writer can stream
        # them instead of receiving one fully built list.
        for word in gensim_model.vocab:
            yield word, gensim_model[word]

    print("Writing vectors to a LMDB database...")
    # The result of write() was previously bound to an unused local
    # named `writer`; nothing reads it, so it is no longer kept.
    LmdbEmbeddingsWriter(iter_embeddings()).write(lmdb_write_path)
Beispiel #11
0
    def test_missing_word_error(self, folder_path):
        """ Ensure a MissingWordError exception is
        raised if the word does not exist in the
        database.

        :param folder_path: directory the database is written to
            (presumably provided by a fixture defined elsewhere).
        :return void
        """
        # np.random.rand yields initialised vectors; np.ndarray(10) would
        # return uninitialised memory with arbitrary contents.
        LmdbEmbeddingsWriter([
            ('the', np.random.rand(10)),
            ('is', np.random.rand(10)),
        ]).write(folder_path)

        reader = LmdbEmbeddingsReader(folder_path)

        # 'unknown' was never written, so the lookup must fail.
        with pytest.raises(exceptions.MissingWordError):
            reader.get_word_vector('unknown')
Beispiel #12
0
    def test_msgpack_serialization(self, tmp_path, reader_class):
        """ Check that embeddings round-trip when msgpack is used to serialize them.

        :param pathlib.PosixPath tmp_path:
        :param reader_class: callable that builds a reader from the directory path.
        :return void:
        """
        target_directory = str(tmp_path)
        expected_vector = np.random.rand(10)
        embeddings = [('the', expected_vector), ('is', np.random.rand(10))]

        writer = LmdbEmbeddingsWriter(
            embeddings,
            serializer=MsgpackSerializer().serialize,
        )
        writer.write(target_directory)

        reader = reader_class(
            target_directory,
            unserializer=MsgpackSerializer().unserialize,
        )
        stored_vector = reader.get_word_vector('the')

        assert stored_vector.tolist() == expected_vector.tolist()
    def test_msgpack_serialization(self, folder_path):
        """ Check that embeddings round-trip when
        msgpack is used to serialize them.

        :return void
        """
        expected_vector = np.random.rand(10)
        embeddings = [('the', expected_vector), ('is', np.random.rand(10))]

        writer = LmdbEmbeddingsWriter(
            embeddings,
            serializer=MsgpackSerializer.serialize,
        )
        writer.write(folder_path)

        reader = LmdbEmbeddingsReader(
            folder_path,
            unserializer=MsgpackSerializer.unserialize,
        )
        stored_vector = reader.get_word_vector('the')

        assert stored_vector.tolist() == expected_vector.tolist()
Beispiel #14
0
    def test_missing_word_error(self, tmp_path, reader_class):
        """ Check that looking up a word that was never written raises MissingWordError.

        :param pathlib.PosixPath tmp_path:
        :param reader_class: callable that builds a reader from the directory path.
        :return void:
        """
        target_directory = str(tmp_path)
        embeddings = [
            ('the', np.random.rand(10)),
            ('is', np.random.rand(10)),
        ]

        writer = LmdbEmbeddingsWriter(embeddings)
        writer.write(target_directory)

        reader = reader_class(target_directory)
        expected_error = exceptions.MissingWordError

        with pytest.raises(expected_error):
            reader.get_word_vector('unknown')
def _write_word_vec_lmdb(path, output):
    """ Stream word vectors from a text file into an LMDB database.

    Each line of ``path`` is stripped and split on single spaces; the
    first field is used as the word and the remaining fields as its
    float32 vector. Every word is also written, one per line, to
    ``MAIN_PATH + '/out/tencent_word.txt'``, and a running line count is
    printed every 10000 lines.

    :param path: text file holding one embedding per line.
    :param output: destination passed to ``LmdbEmbeddingsWriter.write``.
    :return void
    """
    def iter_embeddings(word_file):
        # NOTE(review): a word2vec-style header line ("<count> <dim>")
        # would be treated as an ordinary word here - confirm the input
        # file has none.
        with open(path, 'r', errors='ignore') as f:
            for count, line in enumerate(f, start=1):
                word, *vector = line.strip().split(' ')
                word_file.write(word + '\n')
                if count % 10000 == 0:
                    print(count)
                yield word, np.asarray(vector, dtype='float32')

    # The word list must stay open while the writer consumes the lazy
    # generator; the context manager also closes it if reading or
    # writing raises, which the previous explicit close() did not.
    with open(MAIN_PATH + '/out/tencent_word.txt', 'w') as word_file:
        print('Writing vectors to a LMDB database...')
        LmdbEmbeddingsWriter(iter_embeddings(word_file)).write(output)
Beispiel #16
0
import os
import tqdm
from gensim.models.keyedvectors import KeyedVectors
from lmdb_embeddings.writer import LmdbEmbeddingsWriter


# Location of the saved gensim model to convert (machine-specific path).
GOOGLE_NEWS_PATH = '/home/dom/.thoughtriver/GoogleNews-vectors-negative300.w2v'
# Output directory for the LMDB database, resolved to an absolute path
# against the current working directory.
OUTPUT_DIR = os.path.abspath('GoogleNews-vectors-negative300')


print('Loading gensim model...')
# NOTE(review): mmap='r' presumably memory-maps the stored arrays
# read-only instead of loading them fully - confirm against gensim docs.
gensim_model = KeyedVectors.load(GOOGLE_NEWS_PATH, mmap = 'r')


def iter_embeddings():
    """ Lazily yield a (word, vector) pair for every key in the
    gensim model's vocabulary, wrapped in a tqdm progress bar.

    :return generator of (word, vector) tuples
    """
    progress = tqdm.tqdm(gensim_model.vocab.keys())
    for token in progress:
        vector = gensim_model[token]
        yield token, vector

print('Writing vectors to a LMDB database...')

# Stream the generator straight into the writer. Note that `writer` is
# bound to whatever `.write()` returns, not to the LmdbEmbeddingsWriter
# instance itself.
writer = LmdbEmbeddingsWriter(
    iter_embeddings()
).write(OUTPUT_DIR)
from gensim.models.keyedvectors import KeyedVectors
from lmdb_embeddings.writer import LmdbEmbeddingsWriter

# Source embeddings in text form and the directory the LMDB database
# is written to (both machine-specific absolute paths).
tencent_ai_word_embeddings = "/mnt/dl/public/word_embedding/Tencent_AILab_ChineseEmbedding.txt"
output_dir = "/mnt/dl/public/lmdb_embeddings/tencent_ai"

print("Loading word2vec from text file...")
# binary=False: the input is read as a text-format word2vec file.
w2v = KeyedVectors.load_word2vec_format(tencent_ai_word_embeddings,
                                        binary=False)


def iter_embeddings():
    """ Lazily yield a (word, vector) pair for every key in the
    loaded model's vocabulary.

    :return generator of (word, vector) tuples
    """
    vocabulary_words = w2v.vocab.keys()
    for token in vocabulary_words:
        vector = w2v[token]
        yield token, vector


print('Writing vectors to a LMDB database...')
# Stream the generator straight into the writer. `writer` is bound to
# whatever `.write()` returns, not to the LmdbEmbeddingsWriter instance.
writer = LmdbEmbeddingsWriter(iter_embeddings()).write(output_dir)
Beispiel #18
0
    if i > NUMBER_OF_WORDS:
        break

    words.append(word)

    embeddings.append(
        (word, gensim_model[word])
    )


print('Writing vectors to a LMDB database...')
# The database is written into a temporary directory, which is removed
# when the `with` block exits.
with tempfile.TemporaryDirectory() as directory_path:

    # `writer` is bound to whatever `.write()` returns, not to the
    # LmdbEmbeddingsWriter instance.
    writer = LmdbEmbeddingsWriter(
        embeddings
    ).write(directory_path)

    print('Clearing gensim model from memory...')
    # Drop the module-level reference to the gensim model before timing.
    del gensim_model


    print('Timing LMDB approach...')
    # The word list is interpolated into the timed statement as a list
    # literal, and the directory path into the setup code. The setup
    # string runs once per timeit call, so building the reader is not
    # part of the measured time.
    # NOTE(review): the path is embedded inside double quotes without
    # escaping - a path containing '"' or backslashes would break the
    # setup code.
    lmdb_time = timeit.timeit(
        '[reader.get_word_vector(word) for word in %s]' % words,
        number = NUMBER_OF_TIMING_ITERATIONS,
        setup = 'from lmdb_embeddings.reader import LmdbEmbeddingsReader; reader = LmdbEmbeddingsReader("%s")' % directory_path
    )
    print('LMDB approach took %s seconds.' % lmdb_time)