# Module preamble: a sketch of the imports these tests need. `docs` (a small
# sample corpus) and `device` are fixtures defined by the surrounding test
# suite, and each test method below belongs to a unittest.TestCase subclass.
import os
import warnings
from tempfile import NamedTemporaryFile

import torch
import torch.nn as nn
import torch.optim as optim

from textwiser import TextWiser, Embedding, Transformation, WordOptions, PoolOptions


def test_fine_tuneable(self):
    tw = TextWiser(Embedding.TfIdf(min_df=2), Transformation.SVD(n_components=2),
                   dtype=torch.float32, is_finetuneable=True)
    tw.fit(docs)
    embeddings1 = tw._imp[1].V.data.clone().detach()
    # Give a fake task to train embeddings on:
    # a linear layer with a single output after pooling
    linear = nn.Linear(2, 1, bias=False)
    model = nn.Sequential(tw, linear).to(device).train()
    y_pred = model(docs)
    # Use ones as the target
    y_act = torch.ones_like(y_pred)
    # Optimize MSE using SGD
    criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=1e-3)
    # Calculate the loss & gradients
    optimizer.zero_grad()
    loss = criterion(y_pred, y_act)
    loss.backward()
    # The embedding layer should have gradients now
    self.assertIsNotNone([p for p in tw._imp[1].named_parameters()][0][1].grad)
    # Update weights
    optimizer.step()
    # Since is_finetuneable=True, the weights should have changed;
    # with is_finetuneable=False they would have stayed the same.
    self.assertFalse(torch.allclose(embeddings1, tw._imp[1].V.data))
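
# A complementary sketch (not part of the original suite): the final comment
# above implies this check also comes in a frozen variant. Under the stated
# assumption that is_finetuneable=False keeps the SVD weights fixed, the same
# training step should leave V untouched. The helper name is hypothetical.
def _sketch_frozen_weights(self):
    tw = TextWiser(Embedding.TfIdf(min_df=2), Transformation.SVD(n_components=2),
                   dtype=torch.float32, is_finetuneable=False)
    tw.fit(docs)
    before = tw._imp[1].V.data.clone().detach()
    model = nn.Sequential(tw, nn.Linear(2, 1, bias=False)).to(device).train()
    y_pred = model(docs)
    loss = nn.MSELoss()(y_pred, torch.ones_like(y_pred))
    optimizer = optim.SGD(model.parameters(), lr=1e-3)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # With fine-tuning disabled, the embedding weights are unchanged
    self.assertTrue(torch.allclose(before, tw._imp[1].V.data))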
def test_v_in_parameters(self):
    n_components = 2  # Restrict the number of components
    tw = TextWiser(Embedding.TfIdf(min_df=2),
                   Transformation.SVD(n_components=n_components),
                   dtype=torch.float32)
    tw.fit(docs)
    self.assertIn('_imp.1.V', [p[0] for p in tw.named_parameters()])
def test_save_load(self):
    try:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # shut TensorFlow up during testing
        # Create a model with a downstream task
        tw = TextWiser(Embedding.USE(), dtype=torch.float32).fit(docs)
        model = nn.Sequential(tw, nn.Linear(512, 1)).to(device)
        # Get the results of the model
        expected = model(docs)
        # Save the model to a temporary file
        with NamedTemporaryFile() as file:
            state_dict = model.state_dict()
            self.assertNotIn('0._imp.0.use', state_dict)
            torch.save(state_dict, file)  # Could also pass the file's string name
            # Get rid of the original model
            del tw
            del model
            # Create the same model
            tw = TextWiser(Embedding.USE(), dtype=torch.float32)
            tw.fit()
            model = nn.Sequential(tw, nn.Linear(512, 1)).to(device)
            # Load the model from the file
            file.seek(0)
            model.load_state_dict(torch.load(file, map_location=device))
            # Do predictions with the loaded model
            predicted = model(docs)
            self.assertTrue(torch.allclose(predicted, expected, atol=1e-6))
    except ModuleNotFoundError:
        print('No TensorFlow found. Skipping the test. ...', end=" ", flush=True)
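
# A small aside (not part of the original suite): torch.save and torch.load
# also accept a filesystem path, so the temporary file's string name can
# replace the open handle, as the comment above hints. The helper name is
# hypothetical.
def _sketch_save_load_by_name(model):
    with NamedTemporaryFile() as file:
        torch.save(model.state_dict(), file.name)
        model.load_state_dict(torch.load(file.name, map_location=device))
    return model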
def test_save_load(self):
    # Create a model with a downstream task
    tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                   [Transformation.SVD(n_components=2),
                    Transformation.Pool(pool_option=PoolOptions.mean)],
                   dtype=torch.float32)
    tw.fit(docs)
    model = nn.Sequential(tw, nn.Linear(2, 1)).to(device)
    # Get the results of the model
    expected = model(docs)
    # Save the model to a temporary file
    with NamedTemporaryFile() as file:
        torch.save(model.state_dict(), file)  # Could also pass the file's string name
        # Get rid of the original model
        del tw
        del model
        # Create the same model
        tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                       [Transformation.SVD(n_components=2),
                        Transformation.Pool(pool_option=PoolOptions.mean)],
                       dtype=torch.float32)
        tw.fit()
        model = nn.Sequential(tw, nn.Linear(2, 1)).to(device)
        # Load the model from the file
        file.seek(0)
        model.load_state_dict(torch.load(file, map_location=device))
        # Do predictions with the loaded model
        predicted = model(docs)
        self.assertTrue(torch.allclose(predicted, expected, atol=1e-6))
def test_set_params(self):
    # Set the arguments in container classes
    tw = TextWiser(Embedding.TfIdf(min_df=5), Transformation.NMF(n_components=30),
                   lazy_load=True)
    tw.set_params(embedding__min_df=10, transformations__0__n_components=10)
    self.assertEqual(tw.embedding.min_df, 10)
    self.assertEqual(tw.transformations[0].n_components, 10)
    # Set the arguments in the implementation
    tw = TextWiser(Embedding.Doc2Vec(vector_size=2, min_count=1, workers=1))
    tw.fit(docs)
    tw.set_params(_imp__0__seed=10)
    self.assertEqual(tw._imp[0].seed, 10)
    # Set the arguments in a schema
    schema = {'transform': ['tfidf', ['nmf', {'n_components': 30}]]}
    tw = TextWiser(Embedding.Compound(schema=schema))
    tw.set_params(embedding__schema__transform__0__min_df=10,
                  embedding__schema__transform__1__n_components=10)
    self.assertEqual(tw.embedding.schema['transform'][0][1]['min_df'], 10)
    self.assertEqual(tw.embedding.schema['transform'][1][1]['n_components'], 10)
    # Replace a part of the schema in a list
    tw.set_params(embedding__schema__transform__0='bow')
    self.assertEqual(tw.embedding.schema['transform'][0], 'bow')
    # Replace a part of the schema
    tw.set_params(embedding__schema__transform=['bow'])
    self.assertEqual(tw.embedding.schema['transform'][0], 'bow')
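
# A usage sketch (not in the original suite): set_params follows scikit-learn's
# double-underscore routing, so the same parameter names can drive
# hyperparameter search, assuming TextWiser also supports scikit-learn cloning
# and pipeline use. `train_docs` and `train_labels` are hypothetical stand-ins
# for a labeled dataset, and the helper name is likewise hypothetical.
def _sketch_grid_search(train_docs, train_labels):
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    pipe = Pipeline([
        ('featurize', TextWiser(Embedding.TfIdf(min_df=5),
                                Transformation.NMF(n_components=30))),
        ('clf', LogisticRegression()),
    ])
    # Pipeline prepends 'featurize__'; TextWiser.set_params handles the rest,
    # exactly as exercised in test_set_params above.
    search = GridSearchCV(pipe, {'featurize__embedding__min_df': [2, 5, 10]}, cv=3)
    search.fit(train_docs, train_labels)
    return search.best_params_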
def test_lazy_load(self):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                       lazy_load=True)
        self.assertIsNone(tw._imp)
        tw.fit(docs)
        self.assertIsNotNone(tw._imp)
        tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                       lazy_load=True, dtype=torch.float32, is_finetuneable=True)
        self.assertIsNone(tw._imp)
        tw.fit_transform(docs)
        self.assertIsNotNone(tw._imp)
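
# A closing sketch (not in the original suite): as the test above shows, with
# lazy_load=True the underlying model (tw._imp) stays None until fit or
# fit_transform, so constructing TextWiser objects is cheap. The helper name
# and the `candidate_embeddings` argument are hypothetical.
def _sketch_lazy_load(candidate_embeddings, docs):
    # Cheap to build many configurations: no pretrained weights load here
    candidates = [TextWiser(emb, lazy_load=True) for emb in candidate_embeddings]
    # Only the candidate that is actually fitted materializes its weights
    return candidates[0].fit_transform(docs)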