def test__str(self):
    """Index and words are stored exactly as passed to the constructor."""
    cases = [(["Hello", "there"], 0), (["Hello", "again"], 1)]
    for words, idx in cases:
        sent = IndexedSentence(words, idx)
        self.assertEqual(idx, sent.index)
        self.assertEqual(words, sent.words)
def test_yield(self):
    """Iterating the document yields the expected sentences at positions 0 and 99."""
    expected = {
        0: IndexedSentence(
            "Good stuff i just wish it lasted longer".split(), 0),
        99: IndexedSentence(
            "I am not sure if it is a tracfone problem or the battery".split(), 99),
    }
    for pos, obj in enumerate(self.doc):
        if pos in expected:
            self.assertEqual(expected[pos], obj)
def test_scan_w_wrong_input(self):
    """scan_sentences rejects malformed input and inconsistent indices."""
    model = BaseSentence2VecModel(W2V)
    raw = ["the dog hit the car", "he was very fast"]

    # Untokenized strings (bare or wrapped in IndexedSentence) and
    # non-list sentence contents must all raise TypeError.
    with self.assertRaises(TypeError):
        model.scan_sentences(raw)
    with self.assertRaises(TypeError):
        model.scan_sentences(
            [IndexedSentence(s, i) for i, s in enumerate(raw)])
    with self.assertRaises(TypeError):
        model.scan_sentences([list(range(10) for _ in range(2))])

    # Indices shifted up or down relative to the enumeration order.
    with self.assertRaises(RuntimeError):
        model.scan_sentences(
            [IndexedSentence(s, i + 1) for i, s in enumerate(SENTENCES)])
    with self.assertRaises(ValueError):
        model.scan_sentences(
            [IndexedSentence(s, i - 1) for i, s in enumerate(SENTENCES)])
def similar_by_sentence(self, sentence: List[str], model,
                        indexable: "Union[IndexedList, IndexedLineDocument]" = None,
                        topn: int = 10,
                        restrict_size: "Union[int, Tuple[int, int]]" = None
                        ) -> List[Tuple[int, float]]:
    """Find the top-N most similar sentences to a given sentence.

    Parameters
    ----------
    sentence : list of str
        Sentence as list of strings.
    model : :class:`~fse.models.base_s2v.BaseSentence2VecModel`
        This object essentially provides the infer method used to transform
        the query sentence into a vector.
    indexable : list, IndexedList, IndexedLineDocument, optional
        Provides an indexable object from where the most similar sentences
        are read.
    topn : int or None, optional
        Number of top-N similar sentences to return, when `topn` is int.
        When `topn` is None, then similarities for all sentences are
        returned.
    restrict_size : int or Tuple(int,int), optional
        Optional integer which limits the range of vectors which are
        searched for most-similar values. For example, restrict_vocab=10000
        would only check the first 10000 sentence vectors.
        restrict_vocab=(500, 1000) would search the sentence vectors with
        indices between 500 and 1000.

    Returns
    -------
    list of (int, float) or list of (str, int, float)
        A sequence of (index, similarity) is returned.
        When an indexable is provided, returns (str, index, similarity).
        When `topn` is None, then similarities for all words are returned
        as a one-dimensional numpy array with the size of the vocabulary.

    """
    # The query is wrapped with a dummy index of 0 so it can be passed
    # through the model's infer method.
    vector = model.infer([IndexedSentence(sentence, 0)])
    return self.most_similar(positive=vector, indexable=indexable,
                             topn=topn, restrict_size=restrict_size)
def test_train_manager(self):
    """_train_manager aggregates per-job outputs across two workers."""
    model = BaseSentence2VecModel(W2V, workers=2)

    def fake_job(data_iterable, target, memory):
        # Count the sentences handed to this job and report the count
        # doubled and tripled, so aggregation is easy to verify.
        count = sum(1 for _ in data_iterable)
        return count * 2, count * 3

    model._do_train_job = fake_job
    indexed = [IndexedSentence(s, i) for i, s in enumerate(SENTENCES)]
    result = model._train_manager(
        data_iterable=indexed,
        total_sentences=len(SENTENCES),
        report_delay=0.01,
    )
    self.assertEqual((100, 200, 300), result)
def test_scan_w_IndexedSentence(self):
    """scan_sentences computes the expected corpus statistics."""
    model = BaseSentence2VecModel(W2V)
    indexed = [IndexedSentence(s, i) for i, s in enumerate(SENTENCES)]
    stats = model.scan_sentences(indexed, progress_per=0)
    expected = {
        "total_sentences": 100,
        "total_words": 1450,
        "average_length": 14,
        "empty_sentences": 0,
        "max_index": 100,
    }
    for key, value in expected.items():
        self.assertEqual(value, stats[key])
def test_train_single_from_disk(self):
    """Training with memory-mapped vectors on disk matches in-memory training.

    Fix: the map files were previously unlinked only after all assertions
    passed, so a failing assertion left stale ``.vectors`` files on disk
    for later runs. Cleanup now happens in a ``finally`` block.
    """
    p = Path("fse/test/test_data/test_vecs")
    p_res = Path("fse/test/test_data/test_vecs.vectors")
    p_target = Path("fse/test/test_data/test_vecs_wv.vectors")

    se1 = Average(W2V)
    se2 = Average(
        W2V,
        sv_mapfile_path=str(p.absolute()),
        wv_mapfile_path=str(p.absolute()),
    )
    try:
        se1.train([IndexedSentence(s, i) for i, s in enumerate(SENTENCES)])
        se2.train([IndexedSentence(s, i) for i, s in enumerate(SENTENCES)])

        self.assertTrue(p_target.exists())
        self.assertTrue((se1.wv.vectors == se2.wv.vectors).all())
        # Memory-mapped word vectors must be read-only.
        self.assertFalse(se2.wv.vectors.flags.writeable)
        self.assertTrue((se1.sv.vectors == se2.sv.vectors).all())
    finally:
        # Remove map files even on assertion failure; guard with exists()
        # in case training itself failed before the files were written.
        for f in (p_res, p_target):
            if f.exists():
                f.unlink()
def setUp(self):
    """Build a small indexed corpus and a prepared Average model."""
    raw = [
        ["They", "admit"],
        ["So", "Apple", "bought", "buds"],
        ["go", "12345"],
        ["pull", "12345678910111213"],
    ]
    self.sentences = [
        IndexedSentence(words, pos) for pos, words in enumerate(raw)
    ]
    self.model = Average(W2V)
    self.model.prep.prepare_vectors(
        sv=self.model.sv,
        total_sentences=len(self.sentences),
        update=False,
    )
    self.model._pre_train_calls()
def test_infer_use_norm(self):
    """With use_norm=True the inferred vector is scaled to unit length."""
    model = BaseSentence2VecModel(W2V)

    def fake_job(data_iterable, target, memory):
        # Increment the target once per sentence so output is non-zero.
        for _ in data_iterable:
            target += 1
        return target

    def noop(**kwargs):
        pass

    model._post_inference_calls = noop
    model._do_train_job = fake_job
    output = model.infer(
        [IndexedSentence(s, i) for i, s in enumerate(SENTENCES)],
        use_norm=True,
    )
    # The L2 norm of the first row must be 1.
    self.assertTrue(np.allclose(1., np.sqrt(np.sum(output[0] ** 2))))
def test_infer_many_to_one(self):
    """All sentences sharing index 0 collapse into a single output row."""
    model = BaseSentence2VecModel(W2V)

    def fake_job(data_iterable, target, memory):
        # One increment per sentence; 100 sentences -> every cell is 100.
        for _ in data_iterable:
            target += 1
        return target

    def noop(**kwargs):
        pass

    model._post_inference_calls = noop
    model._do_train_job = fake_job
    output = model.infer(
        [IndexedSentence(s, 0) for i, s in enumerate(SENTENCES)])
    self.assertTrue((100 == output).all())
    self.assertEqual((1, 5), output.shape)
def test_do_train_job(self):
    """_do_train_job reports (sentences, words) and grows the vector matrix."""
    self.model.prep.prepare_vectors(
        sv=self.model.sv, total_sentences=len(SENTENCES), update=True)
    memory = self.model._get_thread_working_mem()
    indexed = [IndexedSentence(s, i) for i, s in enumerate(SENTENCES)]
    stats = self.model._do_train_job(
        indexed, target=self.model.sv.vectors, memory=memory)
    self.assertEqual((100, 1450), stats)
    # 104 rows: presumably the setUp sentences plus the 100 added by the
    # update=True preparation — confirm against setUp fixture size.
    self.assertEqual((104, DIM), self.model.sv.vectors.shape)
def test_train(self):
    """The base class leaves train unimplemented."""
    model = BaseSentence2VecModel(W2V)
    indexed = [IndexedSentence(s, i) for i, s in enumerate(SENTENCES)]
    with self.assertRaises(NotImplementedError):
        model.train(indexed)
def test_scan_w_many_to_one_input(self):
    """All sentences mapped to index 0 yield a max_index of 1."""
    model = BaseSentence2VecModel(W2V)
    shared_index = [IndexedSentence(s, 0) for s in SENTENCES]
    stats = model.scan_sentences(shared_index)
    self.assertEqual(1, stats["max_index"])
def test_scan_w_empty(self):
    """scan_sentences counts empty sentences.

    Fix: the original wrote ``[]`` into the module-level ``SENTENCES``
    list in place, polluting every other test that relies on the shared
    fixture (e.g. total-word counts). Work on a shallow copy instead.
    """
    model = BaseSentence2VecModel(W2V)
    sentences = list(SENTENCES)
    for idx in [5, 10, 15]:
        sentences[idx] = []
    stats = model.scan_sentences(
        [IndexedSentence(s, i) for i, s in enumerate(sentences)])
    self.assertEqual(3, stats["empty_sentences"])
def test_scan_w_wrong_IndexedSentence(self):
    """String indices instead of ints must be rejected with TypeError."""
    model = BaseSentence2VecModel(W2V)
    bad_indices = [
        IndexedSentence(s, str(i)) for i, s in enumerate(SENTENCES)
    ]
    with self.assertRaises(TypeError):
        model.scan_sentences(bad_indices)
def test_train(self):
    """Training the full corpus reports 100 sentences and 1450 words."""
    indexed = [IndexedSentence(s, i) for i, s in enumerate(SENTENCES)]
    result = self.model.train(indexed)
    self.assertEqual((100, 1450), result)