def test_word_too_long(self, folder_path):
    """ Ensure we do not get an exception if attempting to write a word
    longer than LMDB's max key size.

    :param str folder_path:
    :return void:
    """
    # np.random.rand matches the vector construction used by the sibling
    # tests; the previous np.ndarray(10) produced an uninitialized array.
    LmdbEmbeddingsWriter([
        ('a' * 1000, np.random.rand(10)),
    ]).write(folder_path)
def test_write_embeddings(self, folder_path):
    """ Ensure we can write embeddings to disk without error.

    :param str folder_path:
    :return void:
    """
    embeddings = [('the', np.random.rand(10)), ('is', np.random.rand(10))]
    LmdbEmbeddingsWriter(embeddings).write(folder_path)

    # The database directory must contain the files LMDB created.
    assert os.listdir(folder_path)
def test_word_too_long(self, tmp_path):
    """ Ensure we do not get an exception if attempting to write a word
    longer than LMDB's maximum key size.

    :param pathlib.PosixPath tmp_path:
    :return void:
    """
    directory_path = str(tmp_path)
    LmdbEmbeddingsWriter([
        ('a' * 1000, np.random.rand(10))
    ]).write(directory_path)
def test_write_embeddings_generator(self, folder_path):
    """ Ensure we can write a generator of embeddings to disk without error.

    :param str folder_path:
    :return void:
    """
    # np.random.rand gives initialized vectors, consistent with the sibling
    # generator test; the previous np.ndarray(10) was uninitialized.
    embeddings_generator = ((str(i), np.random.rand(10)) for i in range(10))
    LmdbEmbeddingsWriter(embeddings_generator).write(folder_path)

    assert os.listdir(folder_path)
def test_reading_embeddings(self, folder_path):
    """ Ensure we can retrieve embeddings from the database.

    :param str folder_path:
    :return void:
    """
    the_vector = np.random.rand(10)
    embeddings = [('the', the_vector), ('is', np.random.rand(10))]
    LmdbEmbeddingsWriter(embeddings).write(folder_path)

    retrieved = LmdbEmbeddingsReader(folder_path).get_word_vector('the')
    assert retrieved.tolist() == the_vector.tolist()
def test_write_embeddings_generator(self, tmp_path):
    """ Ensure we can write a generator of embeddings to disk without error.

    :param pathlib.PosixPath tmp_path:
    :return void:
    """
    directory_path = str(tmp_path)
    embeddings_generator = ((str(i), np.random.rand(10)) for i in range(10))
    LmdbEmbeddingsWriter(embeddings_generator).write(directory_path)

    assert os.listdir(directory_path)
def test_write_embeddings(self, tmp_path):
    """ Ensure we can write embeddings to disk without error.

    :param pathlib.PosixPath tmp_path:
    :return void:
    """
    target_directory = str(tmp_path)
    vectors = [('the', np.random.rand(10)), ('is', np.random.rand(10))]
    LmdbEmbeddingsWriter(vectors).write(target_directory)

    # Writing must have created LMDB's database files.
    assert os.listdir(target_directory)
def test_reading_embeddings(self, tmp_path, reader_class):
    """ Ensure we can retrieve embeddings from the database.

    :param pathlib.PosixPath tmp_path:
    :return void:
    """
    target_directory = str(tmp_path)
    expected_vector = np.random.rand(10)

    LmdbEmbeddingsWriter(
        [('the', expected_vector), ('is', np.random.rand(10))]
    ).write(target_directory)

    reader = reader_class(target_directory)
    assert reader.get_word_vector('the').tolist() == expected_vector.tolist()
def test_missing_word_error(self, folder_path):
    """ Ensure a MissingWordError exception is raised if the word does not
    exist in the database.

    :param str folder_path:
    :return void:
    """
    vectors = [('the', np.random.rand(10)), ('is', np.random.rand(10))]
    LmdbEmbeddingsWriter(vectors).write(folder_path)

    reader = LmdbEmbeddingsReader(folder_path)

    # 'unknown' was never written, so the lookup must raise.
    with pytest.raises(exceptions.MissingWordError):
        reader.get_word_vector('unknown')
def write_gensim_to_lmdb():
    """ Convert a saved gensim KeyedVectors model into an LMDB embeddings
    database on disk.

    :return void:
    """
    # NOTE(review): the variable names say "tencent" but the path points at an
    # enwiki model file — confirm which embedding file is actually intended.
    tencent_embed_path = "./data/enwiki_20180420_300d.bin"
    lmdb_write_path = './data/tecent_lmdb'

    print("loading gensim model...")
    # Alternative loader for word2vec-format binaries:
    # gensim_model = KeyedVectors.load_word2vec_format(tencent_embed_path, binary=True)
    gensim_model = gensim.models.KeyedVectors.load(tencent_embed_path, mmap='r')

    def iter_embeddings():
        # Yield (word, vector) pairs for every word in the model.
        # NOTE(review): .vocab exists in gensim < 4.0; gensim 4 renamed it to
        # key_to_index — confirm the installed gensim version.
        for word in gensim_model.vocab.keys():
            yield word, gensim_model[word]

    print("Writing vectors to a LMDB database...")
    # write() is called for its side effect; its result was previously bound
    # to an unused `writer` variable.
    LmdbEmbeddingsWriter(iter_embeddings()).write(lmdb_write_path)
def test_missing_word_error(self, folder_path):
    """ Ensure a MissingWordError exception is raised if the word does not
    exist in the database.

    :param str folder_path:
    :return void:
    """
    # np.random.rand gives initialized vectors, consistent with the sibling
    # tests; the previous np.ndarray(10) produced uninitialized arrays.
    LmdbEmbeddingsWriter([
        ('the', np.random.rand(10)),
        ('is', np.random.rand(10))
    ]).write(folder_path)

    reader = LmdbEmbeddingsReader(folder_path)

    with pytest.raises(exceptions.MissingWordError):
        reader.get_word_vector('unknown')
def test_msgpack_serialization(self, tmp_path, reader_class):
    """ Ensure we can save and retrieve embeddings serialized with msgpack.

    :param pathlib.PosixPath tmp_path:
    :return void:
    """
    target_directory = str(tmp_path)
    expected_vector = np.random.rand(10)

    LmdbEmbeddingsWriter(
        [('the', expected_vector), ('is', np.random.rand(10))],
        serializer=MsgpackSerializer().serialize
    ).write(target_directory)

    reader = reader_class(
        target_directory, unserializer=MsgpackSerializer().unserialize
    )
    assert reader.get_word_vector('the').tolist() == expected_vector.tolist()
def test_msgpack_serialization(self, folder_path):
    """ Ensure we can save and retrieve embeddings serialized with msgpack.

    :param str folder_path:
    :return void:
    """
    the_vector = np.random.rand(10)

    writer = LmdbEmbeddingsWriter(
        [('the', the_vector), ('is', np.random.rand(10))],
        serializer=MsgpackSerializer.serialize
    )
    writer.write(folder_path)

    reader = LmdbEmbeddingsReader(
        folder_path, unserializer=MsgpackSerializer.unserialize
    )
    assert reader.get_word_vector('the').tolist() == the_vector.tolist()
def test_missing_word_error(self, tmp_path, reader_class):
    """ Ensure a MissingWordError exception is raised if the word does not
    exist in the database.

    :param pathlib.PosixPath tmp_path:
    :return void:
    """
    target_directory = str(tmp_path)
    LmdbEmbeddingsWriter(
        [('the', np.random.rand(10)), ('is', np.random.rand(10))]
    ).write(target_directory)

    # 'unknown' was never written, so the lookup must raise.
    with pytest.raises(exceptions.MissingWordError):
        reader_class(target_directory).get_word_vector('unknown')
def _write_word_vec_lmdb(path, output):
    """ Stream word vectors from a text embedding file into an LMDB database,
    recording every written word to a side file for later inspection.

    :param str path: text embedding file with one "word v1 v2 ..." per line.
    :param str output: directory path the LMDB database is written to.
    :return void:
    """
    # The word list file is managed by a context manager so it is closed even
    # if the LMDB write fails (the original closed it manually at the end).
    with open(MAIN_PATH + '/out/tencent_word.txt', 'w') as word_file:

        def iter_embeddings():
            # Parse the text file lazily, yielding (word, float32 vector)
            # pairs and logging progress every 10000 entries.
            count = 0
            with open(path, 'r', errors='ignore') as f:
                for line in f:
                    word, *vector = line.strip().split(' ')
                    word_file.write(word + '\n')
                    count += 1
                    if count % 10000 == 0:
                        print(count)
                    yield word, np.asarray(vector, dtype='float32')

        print('Writing vectors to a LMDB database...')
        # write() is called for its side effect; its result was previously
        # bound to an unused `writer` variable.
        LmdbEmbeddingsWriter(iter_embeddings()).write(output)
import os

import tqdm
from gensim.models.keyedvectors import KeyedVectors

from lmdb_embeddings.writer import LmdbEmbeddingsWriter

GOOGLE_NEWS_PATH = '/home/dom/.thoughtriver/GoogleNews-vectors-negative300.w2v'
OUTPUT_DIR = os.path.abspath('GoogleNews-vectors-negative300')

print('Loading gensim model...')
# mmap='r' memory-maps the stored vectors instead of loading them into RAM.
gensim_model = KeyedVectors.load(GOOGLE_NEWS_PATH, mmap='r')


def iter_embeddings():
    """ Yield (word, vector) pairs for every word in the gensim model. """
    # NOTE(review): .vocab exists in gensim < 4.0; gensim 4 renamed it to
    # key_to_index — confirm the installed gensim version.
    for word in tqdm.tqdm(gensim_model.vocab.keys()):
        yield word, gensim_model[word]


print('Writing vectors to a LMDB database...')
# write() is invoked for its side effect; the previous unused `writer =`
# binding has been dropped.
LmdbEmbeddingsWriter(iter_embeddings()).write(OUTPUT_DIR)
from gensim.models.keyedvectors import KeyedVectors

from lmdb_embeddings.writer import LmdbEmbeddingsWriter

tencent_ai_word_embeddings = "/mnt/dl/public/word_embedding/Tencent_AILab_ChineseEmbedding.txt"
output_dir = "/mnt/dl/public/lmdb_embeddings/tencent_ai"

print("Loading word2vec from text file...")
w2v = KeyedVectors.load_word2vec_format(tencent_ai_word_embeddings, binary=False)


def iter_embeddings():
    """ Yield (word, vector) pairs for every word in the loaded model. """
    # NOTE(review): .vocab exists in gensim < 4.0; gensim 4 renamed it to
    # key_to_index — confirm the installed gensim version.
    for word in w2v.vocab.keys():
        yield word, w2v[word]


print('Writing vectors to a LMDB database...')
# write() is invoked for its side effect; the previous unused `writer =`
# binding has been dropped.
LmdbEmbeddingsWriter(iter_embeddings()).write(output_dir)
# NOTE(review): this chunk starts mid-loop — the enclosing "for" header that
# binds `i` and `word` (and the definitions of `words`, `embeddings`,
# NUMBER_OF_WORDS, NUMBER_OF_TIMING_ITERATIONS and `gensim_model`) is outside
# the visible source; the first three statements below are that loop's body.
if i > NUMBER_OF_WORDS: break
words.append(word)
embeddings.append(
    (word, gensim_model[word])
)

# Benchmark: write the sampled embeddings to a temporary LMDB database and
# time repeated lookups against it.
print('Writing vectors to a LMDB database...')

with tempfile.TemporaryDirectory() as directory_path:
    writer = LmdbEmbeddingsWriter(
        embeddings
    ).write(directory_path)

    print('Clearing gensim model from memory...')
    # Free the in-memory gensim model so the timing below measures LMDB only.
    del gensim_model

    print('Timing LMDB approach...')
    # timeit's setup string creates the reader once (not timed); the timed
    # statement looks up every sampled word, repeated
    # NUMBER_OF_TIMING_ITERATIONS times.
    lmdb_time = timeit.timeit(
        '[reader.get_word_vector(word) for word in %s]' % words,
        number = NUMBER_OF_TIMING_ITERATIONS,
        setup = 'from lmdb_embeddings.reader import LmdbEmbeddingsReader; reader = LmdbEmbeddingsReader("%s")' % directory_path
    )

    print('LMDB approach took %s seconds.' % lmdb_time)