Ejemplo n.º 1
0
 def test_save_load(self):
     """Check that model outputs survive a ``state_dict`` save/load cycle.

     A TextWiser pipeline (word2vec -> SVD -> mean pool) is chained with a
     linear layer and run on ``docs``. The ``state_dict`` is written to a
     temporary file, an identically configured model is rebuilt and loaded
     from that file, and its output on ``docs`` must match the original
     output within ``atol=1e-6``.
     """

     def make_textwiser():
         # Both the saved and the restored model use this exact configuration
         word_embedding = Embedding.Word(word_option=WordOptions.word2vec,
                                         pretrained='en-turian')
         steps = [
             Transformation.SVD(n_components=2),
             Transformation.Pool(pool_option=PoolOptions.mean),
         ]
         return TextWiser(word_embedding, steps, dtype=torch.float32)

     # Original model: fit on the documents and record its output
     tw = make_textwiser()
     tw.fit(docs)
     model = nn.Sequential(tw, nn.Linear(2, 1)).to(device)
     expected = model(docs)

     with NamedTemporaryFile() as file:
         # The open file object itself (not its name) is handed to torch.save
         torch.save(model.state_dict(), file)
         # Drop the originals so nothing can leak into the restored model
         del tw, model
         # Rebuild a model with the same architecture
         tw = make_textwiser()
         # NOTE(review): fit() receives no documents here -- presumably it
         # only initializes the model so the saved weights can be loaded;
         # confirm against the TextWiser documentation.
         tw.fit()
         model = nn.Sequential(tw, nn.Linear(2, 1)).to(device)
         # Rewind to the start of the file before reading the weights back
         file.seek(0)
         restored_state = torch.load(file, map_location=device)
         model.load_state_dict(restored_state)
         # The restored model must reproduce the original predictions
         predicted = model(docs)
         self.assertTrue(torch.allclose(predicted, expected, atol=1e-6))
Ejemplo n.º 2
0
 def test_dtype(self):
     """Check that the output dtype matches the ``dtype`` passed to TextWiser.

     Both ``torch.float32`` and ``np.float32`` are tried, first with a max
     pooling transformation (the result's ``dtype`` is checked directly) and
     then without any transformation (the ``dtype`` of the first element of
     the result is checked).
     """
     requested_dtypes = (torch.float32, np.float32)

     # With pooling: the dtype is read from the result itself
     for dtype in requested_dtypes:
         word_embedding = Embedding.Word(word_option=WordOptions.word2vec,
                                         pretrained='en-turian')
         max_pool = Transformation.Pool(pool_option=PoolOptions.max)
         tw = TextWiser(word_embedding, max_pool, dtype=dtype)
         predicted = tw.fit_transform(docs)
         self.assertEqual(predicted.dtype, dtype)

     # Without pooling: the dtype is read from the first element of the
     # result. All warnings are silenced for this part of the test.
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         for dtype in requested_dtypes:
             word_embedding = Embedding.Word(word_option=WordOptions.word2vec,
                                             pretrained='en-turian')
             tw = TextWiser(word_embedding, dtype=dtype)
             predicted = tw.fit_transform(docs)
             self.assertEqual(predicted[0].dtype, dtype)
Ejemplo n.º 3
0
 def _test_index(self, pool_option):
     """Compare a pooled embedding with direct indexing of the raw output.

     The first document is embedded without a transformation and one row
     of its first result is selected; that row must be close to the output
     of the same embedding followed by ``Pool(pool_option)``.

     :param pool_option: Pool option under test. ``PoolOptions.first``
         selects row 0; any other value selects row -1.
     """
     if pool_option == PoolOptions.first:
         index = 0
     else:
         index = -1
     first_doc = docs[0]

     # Reference value: unpooled output, indexed manually (warnings silenced)
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         raw_tw = TextWiser(
             Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
             dtype=torch.float32,
         )
         raw_output = raw_tw.fit_transform(first_doc)
         expected = raw_output[0][index].view(1, -1)

     # Value under test: same embedding with the pooling transformation
     pooled_tw = TextWiser(
         Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
         Transformation.Pool(pool_option=pool_option),
         dtype=torch.float32,
     )
     pooled = pooled_tw.fit_transform(first_doc)
     self.assertTrue(torch.allclose(expected.to(device), pooled.to(device)))
Ejemplo n.º 4
0
 def test_list_handling(self):
     """Check SVD output when no pooling transformation is applied.

     The word2vec embedding followed by a 2-component SVD is run on
     ``docs``; each item of the result is compared against a hard-coded
     tensor within ``atol=1e-6``. All warnings are silenced for the
     duration of the test.
     """
     # One entry per expected result item, each a list of 2-value rows
     expected_rows = [
         [[-0.9719871283, 0.0947150663],
          [-0.3805825114, -1.0427029133],
          [-0.6929296255, 0.1793890595],
          [0.0000000000, 0.0000000000]],
         [[-0.9719871283, 0.0947150663],
          [-0.3805825114, -1.0427029133],
          [-0.7170552015, 0.0105144158],
          [-0.9385635853, 0.6596723199],
          [0.0000000000, 0.0000000000]],
         [[-0.8687936068, -0.9333068132],
          [-0.6859120131, 0.0732812732],
          [-0.9385635853, 0.6596723199],
          [0.0000000000, 0.0000000000]],
     ]
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         word_embedding = Embedding.Word(word_option=WordOptions.word2vec,
                                         pretrained='en-turian')
         svd = Transformation.SVD(n_components=2)
         tw = TextWiser(word_embedding, svd, dtype=torch.float32)
         predicted = tw.fit_transform(docs)
         expected = [torch.tensor(rows, dtype=torch.float32)
                     for rows in expected_rows]
         # zip() stops at the shorter sequence, so only paired items are compared
         for actual, reference in zip(predicted, expected):
             matches = torch.allclose(actual, reference.to(device), atol=1e-6)
             self.assertTrue(matches)
Ejemplo n.º 5
0
 def test_fit_transform(self):
     """Check max-pooled word2vec embeddings against stored reference values.

     The reference tensor is read from ``data/pooled_embeddings.csv`` and
     passed, together with the TextWiser instance, to the
     ``_test_fit_transform`` and ``_test_fit_before_transform`` helpers.
     """
     word_embedding = Embedding.Word(word_option=WordOptions.word2vec,
                                     pretrained='en-turian')
     max_pool = Transformation.Pool(pool_option=PoolOptions.max)
     tw = TextWiser(word_embedding, max_pool, dtype=torch.float32)

     # Load the reference values as float32 and wrap them in a tensor
     reference_path = self._get_test_path('data', 'pooled_embeddings.csv')
     reference_values = np.genfromtxt(reference_path, dtype=np.float32)
     expected = torch.from_numpy(reference_values)

     self._test_fit_transform(tw, expected)
     self._test_fit_before_transform(tw, expected)
Ejemplo n.º 6
0
 def test_lazy_load(self):
     """Check that ``lazy_load=True`` leaves ``_imp`` unset until fitting.

     Two configurations are tried: one with default options fitted through
     ``fit``, and one with ``dtype=torch.float32`` and
     ``is_finetuneable=True`` fitted through ``fit_transform``. In both,
     ``_imp`` must be ``None`` after construction and not ``None`` after
     fitting. All warnings are silenced for the duration of the test.
     """
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")

         # Default options, fitted via fit()
         lazy_tw = TextWiser(
             Embedding.Word(word_option=WordOptions.word2vec,
                            pretrained='en-turian'),
             lazy_load=True,
         )
         self.assertIsNone(lazy_tw._imp)
         lazy_tw.fit(docs)
         self.assertIsNotNone(lazy_tw._imp)

         # Fine-tuneable torch model, fitted via fit_transform()
         lazy_tw = TextWiser(
             Embedding.Word(word_option=WordOptions.word2vec,
                            pretrained='en-turian'),
             lazy_load=True,
             dtype=torch.float32,
             is_finetuneable=True,
         )
         self.assertIsNone(lazy_tw._imp)
         lazy_tw.fit_transform(docs)
         self.assertIsNotNone(lazy_tw._imp)
Ejemplo n.º 7
0
    def test_finetune_validation(self):
        """Check which configurations accept ``is_finetuneable=True``.

        Each case constructs a TextWiser and asserts either that a specific
        exception is raised (``TypeError`` for a numpy dtype, ``ValueError``
        for the rejected torch configurations) or that no ``ValueError``
        is raised.
        """
        # Keyword arguments shared by every torch-based case below
        finetune_kwargs = dict(dtype=torch.float32,
                               is_finetuneable=True,
                               lazy_load=True)

        # A numpy dtype can never be fine-tuned
        # NOTE(review): 'en_turian' uses an underscore while every other case
        # uses 'en-turian' -- confirm whether that spelling is intentional.
        with self.assertRaises(TypeError), warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.word2vec,
                                     pretrained='en_turian'),
                      dtype=np.float32,
                      is_finetuneable=True)

        # Word2vec on its own is accepted
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                TextWiser(Embedding.Word(word_option=WordOptions.word2vec,
                                         pretrained='en-turian'),
                          **finetune_kwargs)
        except ValueError:
            self.fail("Word2vec is fine tuneable")

        # ELMo is rejected
        with self.assertRaises(ValueError), warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.elmo),
                      **finetune_kwargs)

        # TfIdf on its own is rejected
        with self.assertRaises(ValueError):
            TextWiser(Embedding.TfIdf(), **finetune_kwargs)

        # TfIdf followed by SVD is accepted
        try:
            TextWiser(Embedding.TfIdf(), Transformation.SVD(),
                      **finetune_kwargs)
        except ValueError:
            self.fail("SVD is fine tuneable")

        # LDA cannot propagate gradients, so word2vec followed by LDA is rejected
        with self.assertRaises(ValueError), warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.word2vec,
                                     pretrained='en'),
                      Transformation.LDA(),
                      **finetune_kwargs)

        # Compound schema concatenating a word2vec branch and a tfidf+nmf branch
        word2vec_branch = {
            'transform': [
                ('word2vec', {'pretrained': 'en-turian'}),
                ('pool', {'pool_option': 'max'}),
            ]
        }
        tfidf_branch = {
            'transform': ['tfidf', ('nmf', {'n_components': 30})]
        }
        schema = {'concat': [word2vec_branch, tfidf_branch]}

        # The concatenated schema is accepted
        try:
            TextWiser(Embedding.Compound(schema=schema), **finetune_kwargs)
        except ValueError:
            self.fail(
                "Any fine-tuneable weights is enough for the model to be fine-tuneable"
            )

        # The same tfidf -> svd chain, expressed as a compound schema, is accepted
        schema = {'transform': ['tfidf', 'svd']}
        try:
            TextWiser(Embedding.Compound(schema=schema), **finetune_kwargs)
        except ValueError:
            self.fail("SVD is fine tuneable")