def test_expand_model(self, n=10):
    """Check that ``_expand_from`` adds ``n`` rows to ``syn0``.

    Builds a corpus with ``_generate_corpus(model, n=n)``, expands the model
    from it, and verifies that the first dimension of ``syn0`` grew by
    exactly ``n`` and that the label 'SENT_0' is now in ``index2word``.
    """
    model = Document2Vec(w2v_file)
    corpus = _generate_corpus(model, n=n)
    rows_before = model.syn0.shape[0]
    model._expand_from(corpus)
    rows_after = model.syn0.shape[0]
    self.assertEqual(rows_before + n, rows_after)
    self.assertIn('SENT_0', model.index2word)
def test_checkpoint(self):
    """Check that ``_reset_to_checkpoint`` restores ``syn0`` after a change.

    The sum of ``syn0`` is used as a cheap fingerprint of the weights: it
    must differ after the weights are doubled and match the original value
    again once the checkpoint is restored.
    """
    model = Document2Vec(w2v_file)
    original_sum = model.syn0.sum()
    model._build_checkpoint()
    # Perturb the weights so the fingerprint no longer matches.
    model.syn0 *= 2.0
    self.assertNotEqual(model.syn0.sum(), original_sum)
    model._reset_to_checkpoint()
    self.assertEqual(model.syn0.sum(), original_sum)
def test_transform(self):
    """Check the first document vector against the first word of the corpus.

    Transforms a generated corpus, then compares the first row of the
    result with the model's vector for the first word of the first
    sentence; the ``cosine`` score must exceed 0.15.
    """
    model = Document2Vec(w2v_file)
    model.workers = 1
    corpus = _generate_corpus(model)
    doc_vectors = model.transform(corpus)
    # Take the first word of the first sentence in the corpus.
    first_sentence = next(iter(corpus))
    first_word = first_sentence.words[0]
    # NOTE(review): if `cosine` is scipy.spatial.distance.cosine it returns a
    # distance (1 - similarity), not a similarity -- confirm the import.
    score = cosine(doc_vectors[0, :], model[first_word])
    self.assertGreater(score, 0.15)
def test_labeledlinesentence(self):
    """Check training on a corpus read from disk via ``LabeledLineSentence``.

    Writes a generated corpus to a temporary file, one space-joined
    sentence per line, reloads it with ``LabeledLineSentence`` and runs
    ``fit_transform``. The similarity between the label 'SENT_0' and the
    first word of the reloaded corpus must then exceed 0.15.
    """
    # Imported locally because the file's import block is not visible here.
    import os
    import tempfile

    model = Document2Vec(w2v_file)
    model.workers = 1
    corpus = _generate_corpus(model)
    # A unique temporary file avoids collisions between concurrent runs and
    # does not assume a POSIX-style /tmp directory exists.
    fd, fn = tempfile.mkstemp(prefix='tmp_corpus')
    try:
        with os.fdopen(fd, 'w') as fh:
            for line in corpus:
                text = ' '.join(line.words)
                try:
                    fh.write(text + '\n')
                except UnicodeError:
                    # Best effort: skip sentences that cannot be encoded,
                    # but let any other failure surface as a test error.
                    continue
        corpus = LabeledLineSentence(fn)
        model.fit_transform(corpus)
        # Take the first word of the first sentence in the reloaded corpus.
        word = next(iter(corpus)).words[0]
        sim = model.similarity('SENT_0', word)
        self.assertGreater(sim, 0.15)
    finally:
        # Do not leave the scratch corpus behind, even if the test fails.
        os.remove(fn)
def test_word_similarity(self):
    """Check that 'blue' and 'gold' score above 0.3 in the loaded model."""
    model = Document2Vec(w2v_file)
    self.assertGreater(model.similarity('blue', 'gold'), 0.3)
def test_get_vector(self):
    """Check that ``get_vector`` returns exactly a ``numpy.ndarray``."""
    model = Document2Vec(w2v_file)
    vector = model.get_vector('the')
    # Exact type check: ndarray subclasses are deliberately not accepted.
    self.assertIs(type(vector), np.ndarray)
def test_load_from_w2v(self):
    """Check that a model can be constructed from the word2vec file.

    The result must be a ``Document2Vec`` instance whose ``index2word``
    contains the word 'jacket'.
    """
    model = Document2Vec(w2v_file)
    # The original compared type(model) against None, which can never
    # fail; check the model object itself instead.
    self.assertIsNotNone(model)
    self.assertIs(type(model), Document2Vec)
    self.assertIn('jacket', model.index2word)
def test_init(self):
    """Check that a default-constructed model exposes ``train_lbls``."""
    m = Document2Vec()
    # Use the TestCase assertion rather than a bare ``assert`` so the check
    # still runs under ``python -O`` and reports a useful failure message.
    self.assertIn('train_lbls', dir(m))