def test__str(self):
    """IndexedSentence must expose the words and index it was built from."""
    cases = [(["Hello", "there"], 0), (["Hello", "again"], 1)]
    for words, position in cases:
        sent = IndexedSentence(words, position)
        self.assertEqual(position, sent.index)
        self.assertEqual(words, sent.words)
 def test_yield(self):
     """Iterating self.doc must yield the expected IndexedSentence objects
     at the first (0) and last (99) positions."""
     expected = {
         0: IndexedSentence(
             "Good stuff i just wish it lasted longer".split(), 0),
         99: IndexedSentence(
             "I am not sure if it is a tracfone problem or the battery".split(),
             99),
     }
     for position, obj in enumerate(self.doc):
         if position in expected:
             self.assertEqual(expected[position], obj)
Example #3
0
    def test_scan_w_wrong_input(self):
        """scan_sentences must reject malformed input with the right exception."""
        model = BaseSentence2VecModel(W2V)
        raw = ["the dog hit the car", "he was very fast"]

        # Plain strings, IndexedSentence wrapping a raw string, and nested
        # non-string iterables are all the wrong element type.
        type_errors = [
            raw,
            [IndexedSentence(text, pos) for pos, text in enumerate(raw)],
            [list(range(10) for _ in range(2))],
        ]
        for bad_input in type_errors:
            with self.assertRaises(TypeError):
                model.scan_sentences(bad_input)

        # Indices starting above zero -> RuntimeError.
        with self.assertRaises(RuntimeError):
            model.scan_sentences(
                [IndexedSentence(s, pos + 1) for pos, s in enumerate(SENTENCES)])
        # Negative indices -> ValueError.
        with self.assertRaises(ValueError):
            model.scan_sentences(
                [IndexedSentence(s, pos - 1) for pos, s in enumerate(SENTENCES)])
    def similar_by_sentence(self, sentence: List[str], model,
                            indexable: "Optional[Union[IndexedList, IndexedLineDocument]]" = None,
                            topn: "Optional[int]" = 10,
                            restrict_size: "Optional[Union[int, Tuple[int, int]]]" = None,
                            ) -> List[Tuple[int, float]]:
        """Find the top-N most similar sentences to a given sentence.

        Note: the original annotations used list literals such as
        ``[IndexedList, IndexedLineDocument]``, which are not valid type
        expressions; they are rewritten as (string-form) ``Union``/``Optional``
        types, and ``topn`` is marked optional to match the documented
        ``topn is None`` behavior. Defaults and behavior are unchanged.

        Parameters
        ----------
        sentence : list of str
            Sentence as list of strings.
        model : :class:`~fse.models.base_s2v.BaseSentence2VecModel`
            This object essentially provides the infer method used to transform
            the query sentence into a vector.
        indexable : IndexedList or IndexedLineDocument, optional
            Provides an indexable object from where the most similar sentences
            are read.
        topn : int or None, optional
            Number of top-N similar sentences to return, when `topn` is int.
            When `topn` is None, then similarities for all sentences are
            returned.
        restrict_size : int or Tuple(int, int), optional
            Optional integer which limits the range of vectors which
            are searched for most-similar values. For example,
            restrict_size=10000 would only check the first 10000 sentence
            vectors. restrict_size=(500, 1000) would search the sentence
            vectors with indices between 500 and 1000.

        Returns
        -------
        list of (int, float) or list of (str, int, float)
            A sequence of (index, similarity) is returned.
            When an indexable is provided, returns (str, index, similarity).
            When `topn` is None, then similarities for all sentences are
            returned as a one-dimensional numpy array.

        """
        # Index 0 is a placeholder: infer only needs the words of the query.
        vector = model.infer([IndexedSentence(sentence, 0)])
        return self.most_similar(positive=vector, indexable=indexable,
                                 topn=topn, restrict_size=restrict_size)
Example #5
0
 def test_train_manager(self):
     """_train_manager must aggregate the per-job outputs of its workers."""
     model = BaseSentence2VecModel(W2V, workers=2)

     def fake_job(data_iterable, target, memory):
         # Report (2 * batch size, 3 * batch size) for each dispatched job.
         batch_size = sum(1 for _ in data_iterable)
         return batch_size * 2, batch_size * 3

     model._do_train_job = fake_job
     result = model._train_manager(
         data_iterable=[IndexedSentence(s, pos)
                        for pos, s in enumerate(SENTENCES)],
         total_sentences=len(SENTENCES),
         report_delay=0.01)
     self.assertEqual((100, 200, 300), result)
Example #6
0
    def test_scan_w_IndexedSentence(self):
        """Scanning well-formed IndexedSentence input yields the known stats."""
        model = BaseSentence2VecModel(W2V)
        indexed = [IndexedSentence(s, pos) for pos, s in enumerate(SENTENCES)]
        stats = model.scan_sentences(indexed, progress_per=0)

        expected = {
            "total_sentences": 100,
            "total_words": 1450,
            "average_length": 14,
            "empty_sentences": 0,
            "max_index": 100,
        }
        for key, value in expected.items():
            self.assertEqual(value, stats[key])
    def test_train_single_from_disk(self):
        """Training with memory-mapped vectors must match in-memory training."""
        base = Path("fse/test/test_data/test_vecs")
        sv_file = Path("fse/test/test_data/test_vecs.vectors")
        wv_file = Path("fse/test/test_data/test_vecs_wv.vectors")

        in_memory = Average(W2V)
        mapped = Average(W2V,
                         sv_mapfile_path=str(base.absolute()),
                         wv_mapfile_path=str(base.absolute()))
        in_memory.train(
            [IndexedSentence(s, pos) for pos, s in enumerate(SENTENCES)])
        mapped.train(
            [IndexedSentence(s, pos) for pos, s in enumerate(SENTENCES)])

        # The word-vector mapfile must exist and its vectors be read-only.
        self.assertTrue(wv_file.exists())
        self.assertTrue((in_memory.wv.vectors == mapped.wv.vectors).all())
        self.assertFalse(mapped.wv.vectors.flags.writeable)

        self.assertTrue((in_memory.sv.vectors == mapped.sv.vectors).all())
        sv_file.unlink()
        wv_file.unlink()
 def setUp(self):
     """Build a tiny indexed corpus and a prepared Average model."""
     raw = [["They", "admit"], ["So", "Apple", "bought", "buds"],
            ["go", "12345"], ["pull", "12345678910111213"]]
     self.sentences = [IndexedSentence(words, pos)
                       for pos, words in enumerate(raw)]
     self.model = Average(W2V)
     # Allocate sentence vectors sized to the corpus before training.
     self.model.prep.prepare_vectors(sv=self.model.sv,
                                     total_sentences=len(self.sentences),
                                     update=False)
     self.model._pre_train_calls()
Example #9
0
    def test_infer_use_norm(self):
        """infer(use_norm=True) must return unit-length sentence vectors."""
        model = BaseSentence2VecModel(W2V)

        def fake_job(data_iterable, target, memory):
            # Bump target once per sentence (in place, so ndarrays mutate).
            for _ in data_iterable:
                target += 1
            return target

        def no_op(**kwargs):
            pass

        model._post_inference_calls = no_op
        model._do_train_job = fake_job
        output = model.infer(
            [IndexedSentence(s, pos) for pos, s in enumerate(SENTENCES)],
            use_norm=True)

        # The first inferred vector must have an L2 norm of 1.
        self.assertTrue(np.allclose(1., np.sqrt(np.sum(output[0] ** 2))))
Example #10
0
 def test_infer_many_to_one(self):
     """Many sentences sharing index 0 must infer into one output row."""
     model = BaseSentence2VecModel(W2V)

     def fake_job(data_iterable, target, memory):
         # Bump target once per sentence (in place, so ndarrays mutate).
         for _ in data_iterable:
             target += 1
         return target

     def no_op(**kwargs):
         pass

     model._post_inference_calls = no_op
     model._do_train_job = fake_job
     output = model.infer([IndexedSentence(s, 0) for s in SENTENCES])

     # 100 sentences all map to row 0, so every cell counts to 100.
     self.assertTrue((output == 100).all())
     self.assertEqual((1, 5), output.shape)
 def test_do_train_job(self):
     """_do_train_job must report sentence/word counts and keep sv.vectors
     at the grown (104, DIM) shape after an update-mode preparation."""
     self.model.prep.prepare_vectors(sv=self.model.sv,
                                     total_sentences=len(SENTENCES),
                                     update=True)
     working_mem = self.model._get_thread_working_mem()
     stats = self.model._do_train_job(
         [IndexedSentence(s, pos) for pos, s in enumerate(SENTENCES)],
         target=self.model.sv.vectors,
         memory=working_mem)
     self.assertEqual((100, 1450), stats)
     self.assertEqual((104, DIM), self.model.sv.vectors.shape)
Example #12
0
 def test_train(self):
     """The base class leaves train() abstract: it must raise
     NotImplementedError."""
     model = BaseSentence2VecModel(W2V)
     data = [IndexedSentence(s, pos) for pos, s in enumerate(SENTENCES)]
     with self.assertRaises(NotImplementedError):
         model.train(data)
Example #13
0
 def test_scan_w_many_to_one_input(self):
     """When every sentence maps to index 0, max_index must be 1."""
     model = BaseSentence2VecModel(W2V)
     stats = model.scan_sentences([IndexedSentence(s, 0) for s in SENTENCES])
     self.assertEqual(1, stats["max_index"])
Example #14
0
 def test_scan_w_empty(self):
     """scan_sentences must count empty sentences.

     Fix: the original blanked entries of the module-level SENTENCES
     fixture in place (``SENTENCES[i] = []``), leaking empty sentences
     into every test that ran afterwards. Work on a shallow copy instead;
     the assertion is unchanged.
     """
     model = BaseSentence2VecModel(W2V)
     corpus = list(SENTENCES)
     for pos in (5, 10, 15):
         corpus[pos] = []
     stats = model.scan_sentences(
         [IndexedSentence(s, pos) for pos, s in enumerate(corpus)])
     self.assertEqual(3, stats["empty_sentences"])
Example #15
0
 def test_scan_w_wrong_IndexedSentence(self):
     """A non-int index (here: str) must be rejected with TypeError."""
     model = BaseSentence2VecModel(W2V)
     bad = [IndexedSentence(s, str(pos)) for pos, s in enumerate(SENTENCES)]
     with self.assertRaises(TypeError):
         model.scan_sentences(bad)
 def test_train(self):
     """train() must report (effective sentences, effective words)."""
     data = [IndexedSentence(s, pos) for pos, s in enumerate(SENTENCES)]
     self.assertEqual((100, 1450), self.model.train(data))