Example #1
0
    def test_deterministic_transform(self):
        """Check that `deterministic=True` gives a reproducible Doc2Vec result.

        By default, running inference with doc2vec is not deterministic in
        gensim. The same expected tensor is checked twice: once with
        `pretrained` left out, and once with `pretrained=None` passed
        explicitly.
        """
        # Keyword arguments common to both Doc2Vec configurations below
        shared_options = dict(deterministic=True,
                              seed=1234,
                              vector_size=2,
                              min_count=1,
                              workers=1,
                              sample=0,
                              negative=0,
                              hashfxn=det_hash)
        expected = torch.tensor(
            [[0.0471987687, 0.0309393797],
             [-0.0278387405, -0.2347375602],
             [0.1042766869, -0.0033877781]],
            dtype=torch.float32)

        # Variant 1: `pretrained` is not specified at all
        implicit = Embedding.Doc2Vec(**shared_options)
        self._test_fit_before_transform(
            TextWiser(implicit, dtype=torch.float32), expected)

        # Variant 2: `pretrained` is explicitly set to None
        explicit = Embedding.Doc2Vec(pretrained=None, **shared_options)
        self._test_fit_before_transform(
            TextWiser(explicit, dtype=torch.float32), expected)
Example #2
0
    def test_pretrained_error(self):
        """Invalid `pretrained` values are expected to raise ValueError."""
        # First a value that is not a string, then a string that is not a path
        for bad_pretrained in (3, '|||||||'):
            with self.assertRaises(ValueError):
                TextWiser(Embedding.Doc2Vec(pretrained=bad_pretrained),
                          dtype=torch.float32)

        # Same non-path string, handed straight to the embedding object
        bogus_model = '|||||||'
        with self.assertRaises(ValueError):
            _Doc2VecEmbeddings(pretrained=bogus_model).fit([])
Example #3
0
    def test_tokenizer_validation(self):
        """Check which custom tokenizers are accepted for Doc2Vec."""
        # A tokenizer that lowercases and splits the document must be accepted
        try:
            valid_options = Embedding.Doc2Vec(
                tokenizer=lambda doc: doc.lower().split())
            TextWiser(valid_options)
        except TypeError:
            self.fail("This tokenizer should pass the validation.")

        # Each of these should trip the validation, in order:
        #   1. lowercases the document without splitting it (first error)
        #   2. returns the list `[1]` (second error)
        rejected_tokenizers = (lambda doc: doc.lower(), lambda doc: [1])
        for bad_tokenizer in rejected_tokenizers:
            with self.assertRaises(TypeError):
                TextWiser(Embedding.Doc2Vec(tokenizer=bad_tokenizer))
Example #4
0
 def test_set_params(self):
     """Exercise `set_params` with double-underscore separated keys.

     Covers option objects, the fitted implementation and entries nested
     inside a compound schema.
     """
     # Parameters living on the embedding / transformation option objects
     tfidf_nmf = TextWiser(Embedding.TfIdf(min_df=5),
                           Transformation.NMF(n_components=30),
                           lazy_load=True)
     tfidf_nmf.set_params(**{'embedding__min_df': 10,
                             'transformations__0__n_components': 10})
     self.assertEqual(tfidf_nmf.embedding.min_df, 10)
     self.assertEqual(tfidf_nmf.transformations[0].n_components, 10)

     # Parameters living on the implementation, reachable after fitting
     doc2vec_options = Embedding.Doc2Vec(vector_size=2, min_count=1,
                                         workers=1)
     fitted = TextWiser(doc2vec_options)
     fitted.fit(docs)
     fitted.set_params(_imp__0__seed=10)
     self.assertEqual(fitted._imp[0].seed, 10)

     # Parameters nested inside a compound schema
     compound = TextWiser(Embedding.Compound(
         schema={'transform': ['tfidf', ['nmf', {'n_components': 30}]]}))
     compound.set_params(embedding__schema__transform__0__min_df=10,
                         embedding__schema__transform__1__n_components=10)
     transform = compound.embedding.schema['transform']
     self.assertEqual(transform[0][1]['min_df'], 10)
     self.assertEqual(transform[1][1]['n_components'], 10)

     # Swap out a single entry of the schema's transform list
     compound.set_params(embedding__schema__transform__0='bow')
     transform = compound.embedding.schema['transform']
     self.assertEqual(transform[0], 'bow')

     # Swap out the whole transform list of the schema
     compound.set_params(embedding__schema__transform=['bow'])
     transform = compound.embedding.schema['transform']
     self.assertEqual(transform[0], 'bow')
Example #5
0
 def test_pretrained(self):
     """Check that a pickled Doc2Vec model can be passed as `pretrained`.

     A deterministic Doc2Vec TextWiser is first checked with
     `_test_fit_before_transform`. Its model is then pickled and loaded
     back twice, once through an open file object and once through a file
     path, and each reloaded model must reproduce the expected tensor
     within `atol=1e-6`.
     """
     tw = TextWiser(Embedding.Doc2Vec(deterministic=True,
                                      seed=1234,
                                      vector_size=2,
                                      min_count=1,
                                      workers=1,
                                      sample=0,
                                      negative=0,
                                      hashfxn=det_hash),
                    dtype=torch.float32)
     expected = torch.tensor(
         [[0.0471987687, 0.0309393797], [-0.0278387405, -0.2347375602],
          [0.1042766869, -0.0033877781]],
         dtype=torch.float32)
     self._test_fit_before_transform(tw, expected)
     # Test loading from bytes
     with NamedTemporaryFile() as file:
         pickle.dump(tw._imp[0].model, file)
         # Rewind so the pickled model is read from the start of the file
         file.seek(0)
         tw = TextWiser(Embedding.Doc2Vec(pretrained=file,
                                          deterministic=True,
                                          seed=1234),
                        dtype=torch.float32)
         predicted = tw.fit_transform(docs)
         self.assertTrue(
             torch.allclose(predicted, expected.to(device), atol=1e-6))
     # Test loading from file
     file_path = self._get_test_path('data', 'doc2vec.pkl')
     try:
         with open(file_path, 'wb') as fp:
             pickle.dump(tw._imp[0].model, fp)
         tw = TextWiser(Embedding.Doc2Vec(pretrained=file_path,
                                          deterministic=True,
                                          seed=1234),
                        dtype=torch.float32)
         predicted = tw.fit_transform(docs)
         self.assertTrue(
             torch.allclose(predicted, expected.to(device), atol=1e-6))
     finally:
         # Clean up even when an assertion or exception occurs above, so a
         # failing run does not leave the pickle behind in the data folder.
         # The existence check avoids masking the original error when the
         # file could not be created in the first place.
         if os.path.exists(file_path):
             os.remove(file_path)
Example #6
0
 def test_fit_transform(self):
     """Run the shared fit-transform check on a seeded Doc2Vec embedding."""
     doc2vec_kwargs = {
         'seed': 1234,
         'vector_size': 2,
         'min_count': 1,
         'workers': 1,
         'sample': 0,
         'negative': 0,
         'hashfxn': det_hash,
     }
     options = Embedding.Doc2Vec(**doc2vec_kwargs)
     tw = TextWiser(options, dtype=torch.float32)

     # One expected 2-dimensional vector per row (vector_size=2)
     expected_rows = [
         [0.0471987687, 0.0309393797],
         [-0.0278387405, -0.2347375602],
         [0.1042766869, -0.0033877781],
     ]
     expected = torch.tensor(expected_rows, dtype=torch.float32)
     self._test_fit_transform(tw, expected)
Example #7
0
 def test_options_immutable(self):
     """Option objects should reject attribute assignment after creation."""
     options = Embedding.Doc2Vec(deterministic=False)
     # Trying to overwrite an option is expected to raise ValueError...
     self.assertRaises(ValueError, setattr, options, 'deterministic', True)
     # ...and the value given at construction must still be in place
     self.assertFalse(options.deterministic)