Beispiel #1
0
 def test_pretrained(self):
     """Fit a TfIdf embedding, then reload its vectorizer from a pickle.

     The freshly fitted model is checked through ``self._test_fit_transform``.
     Its vectorizer is then pickled twice -- once into an open temporary file
     object and once to a path on disk -- and each pickle is passed back in
     via ``pretrained=``; both reloaded models must reproduce ``expected``.
     """
     tw = TextWiser(Embedding.TfIdf(pretrained=None, min_df=2),
                    dtype=torch.float32)
     expected = torch.tensor(
         [[0.4813341796, 0.6198053956, 0.0000000000, 0.6198053956],
          [0.4091228545, 0.5268201828, 0.5268201828, 0.5268201828],
          [0.6133555174, 0.0000000000, 0.7898069024, 0.0000000000]],
         dtype=torch.float32)
     self._test_fit_transform(tw, expected)
     # Test loading from bytes
     with NamedTemporaryFile() as file:
         pickle.dump(tw._imp[0].vectorizer, file)
         # Rewind so the pickle is read from the start of the file object.
         file.seek(0)
         tw = TextWiser(Embedding.TfIdf(pretrained=file),
                        dtype=torch.float32)
         predicted = tw.fit_transform(docs)
         self.assertTrue(
             torch.allclose(predicted, expected.to(device), atol=1e-6))
     # Test loading from file
     file_path = self._get_test_path('data', 'tfidf.pkl')
     try:
         with open(file_path, 'wb') as fp:
             pickle.dump(tw._imp[0].vectorizer, fp)
         tw = TextWiser(Embedding.TfIdf(pretrained=file_path),
                        dtype=torch.float32)
         predicted = tw.fit_transform(docs)
         self.assertTrue(
             torch.allclose(predicted, expected.to(device), atol=1e-6))
     finally:
         # Always remove the scratch pickle, even when an assertion or the
         # reload fails, so a red run does not leave files in the data dir.
         if os.path.exists(file_path):
             os.remove(file_path)
Beispiel #2
0
 def test_dtype(self):
     """Check that the output dtype follows the ``dtype`` given to TextWiser.

     Both ``torch.float32`` and ``np.float32`` are requested, first for a
     word-embedding pipeline followed by max pooling (where ``.dtype`` is read
     from the result directly) and then for the same embedding without
     pooling (where ``.dtype`` is read from the first element of the result).
     """
     requested_dtypes = (torch.float32, np.float32)

     # With pooling: the dtype is read straight off the returned object.
     for wanted in requested_dtypes:
         model = TextWiser(
             Embedding.Word(word_option=WordOptions.word2vec,
                            pretrained='en-turian'),
             Transformation.Pool(pool_option=PoolOptions.max),
             dtype=wanted)
         output = model.fit_transform(docs)
         self.assertEqual(output.dtype, wanted)

     # Without pooling: the result is indexed per document, so the dtype is
     # checked on its first element. Warnings are silenced for this part
     # (presumably the un-pooled word pipeline emits one -- not shown here).
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         for wanted in requested_dtypes:
             model = TextWiser(
                 Embedding.Word(word_option=WordOptions.word2vec,
                                pretrained='en-turian'),
                 dtype=wanted)
             first_item = model.fit_transform(docs)[0]
             self.assertEqual(first_item.dtype, wanted)
Beispiel #3
0
 def _test_index(self, pool_option):
     """Check that an index-style pool option selects the expected row.

     The first document is embedded without pooling and the row at position
     0 (for ``PoolOptions.first``) or -1 (for any other option) is taken as
     the reference. The same document is then embedded with
     ``Transformation.Pool(pool_option=pool_option)`` and must match it.

     :param pool_option: The pool option under test.
     """
     if pool_option == PoolOptions.first:
         position = 0
     else:
         position = -1
     sample = docs[0]

     # Reference: un-pooled embedding of the sample, reshaped to one row.
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         unpooled = TextWiser(
             Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
             dtype=torch.float32)
         rows = unpooled.fit_transform(sample)[0]
         expected = rows[position].view(1, -1)

     with_pool = TextWiser(
         Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
         Transformation.Pool(pool_option=pool_option),
         dtype=torch.float32)
     pooled = with_pool.fit_transform(sample)
     matches = torch.allclose(expected.to(device), pooled.to(device))
     self.assertTrue(matches)
Beispiel #4
0
 def test_num_components(self):
     """Check that SVD output width equals the requested ``n_components``.

     Two sizes are tried: one below and one above the natural number of
     components (3, per the original author's note). ``self._reset_seed()``
     is called between the two runs.
     """
     # 2 restricts the # of components, 200 expands it.
     sizes = (2, 200)
     for run, n_components in enumerate(sizes):
         if run > 0:
             self._reset_seed()
         svd = Transformation.SVD(n_components=n_components)
         tw = TextWiser(Embedding.TfIdf(min_df=2), svd, dtype=torch.float32)
         width = tw.fit_transform(docs).shape[1]
         self.assertEqual(width, n_components)
Beispiel #5
0
 def test_list_handling(self):
     """Check SVD applied after an un-pooled word embedding.

     The pipeline output is iterated item by item and each item is compared
     against a hard-coded reference tensor. The number of items is asserted
     first, because ``zip`` stops at the shorter input and would otherwise
     let a truncated (or empty) result pass without any comparison.
     """
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec,
                                       pretrained='en-turian'),
                        Transformation.SVD(n_components=2),
                        dtype=torch.float32)
         predicted = tw.fit_transform(docs)
         expected = [
             torch.tensor([[-0.9719871283, 0.0947150663],
                           [-0.3805825114, -1.0427029133],
                           [-0.6929296255, 0.1793890595],
                           [0.0000000000, 0.0000000000]],
                          dtype=torch.float32),
             torch.tensor([[-0.9719871283, 0.0947150663],
                           [-0.3805825114, -1.0427029133],
                           [-0.7170552015, 0.0105144158],
                           [-0.9385635853, 0.6596723199],
                           [0.0000000000, 0.0000000000]],
                          dtype=torch.float32),
             torch.tensor([[-0.8687936068, -0.9333068132],
                           [-0.6859120131, 0.0732812732],
                           [-0.9385635853, 0.6596723199],
                           [0.0000000000, 0.0000000000]],
                          dtype=torch.float32)
         ]
         # Guard against zip() silently dropping unmatched items.
         self.assertEqual(len(predicted), len(expected))
         for p, e in zip(predicted, expected):
             self.assertTrue(torch.allclose(p, e.to(device), atol=1e-6))
Beispiel #6
0
 def test_lazy_load(self):
     """Check that ``lazy_load=True`` defers creation of ``_imp``.

     For two configurations, ``_imp`` must be ``None`` right after
     construction and non-``None`` after fitting: once via ``fit`` with
     default options, and once via ``fit_transform`` with an explicit dtype
     and ``is_finetuneable=True``.
     """
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")

         # Default options, triggered through fit().
         word_embedding = Embedding.Word(word_option=WordOptions.word2vec,
                                         pretrained='en-turian')
         lazy_model = TextWiser(word_embedding, lazy_load=True)
         self.assertIsNone(lazy_model._imp)
         lazy_model.fit(docs)
         self.assertIsNotNone(lazy_model._imp)

         # Fine-tuneable with explicit dtype, triggered through fit_transform().
         word_embedding = Embedding.Word(word_option=WordOptions.word2vec,
                                         pretrained='en-turian')
         lazy_model = TextWiser(word_embedding,
                                lazy_load=True,
                                dtype=torch.float32,
                                is_finetuneable=True)
         self.assertIsNone(lazy_model._imp)
         lazy_model.fit_transform(docs)
         self.assertIsNotNone(lazy_model._imp)
 def test_pretrained(self):
     """Fit a Doc2Vec embedding, then reload its model from a pickle.

     The freshly configured model is checked through
     ``self._test_fit_before_transform``. Its underlying model is then
     pickled twice -- once into an open temporary file object and once to a
     path on disk -- and each pickle is passed back in via ``pretrained=``;
     both reloaded models must reproduce ``expected``.
     """
     tw = TextWiser(Embedding.Doc2Vec(deterministic=True,
                                      seed=1234,
                                      vector_size=2,
                                      min_count=1,
                                      workers=1,
                                      sample=0,
                                      negative=0,
                                      hashfxn=det_hash),
                    dtype=torch.float32)
     expected = torch.tensor(
         [[0.0471987687, 0.0309393797], [-0.0278387405, -0.2347375602],
          [0.1042766869, -0.0033877781]],
         dtype=torch.float32)
     self._test_fit_before_transform(tw, expected)
     # Test loading from bytes
     with NamedTemporaryFile() as file:
         pickle.dump(tw._imp[0].model, file)
         # Rewind so the pickle is read from the start of the file object.
         file.seek(0)
         tw = TextWiser(Embedding.Doc2Vec(pretrained=file,
                                          deterministic=True,
                                          seed=1234),
                        dtype=torch.float32)
         predicted = tw.fit_transform(docs)
         self.assertTrue(
             torch.allclose(predicted, expected.to(device), atol=1e-6))
     # Test loading from file
     file_path = self._get_test_path('data', 'doc2vec.pkl')
     try:
         with open(file_path, 'wb') as fp:
             pickle.dump(tw._imp[0].model, fp)
         tw = TextWiser(Embedding.Doc2Vec(pretrained=file_path,
                                          deterministic=True,
                                          seed=1234),
                        dtype=torch.float32)
         predicted = tw.fit_transform(docs)
         self.assertTrue(
             torch.allclose(predicted, expected.to(device), atol=1e-6))
     finally:
         # Always remove the scratch pickle, even when an assertion or the
         # reload fails, so a red run does not leave files in the data dir.
         if os.path.exists(file_path):
             os.remove(file_path)