def test_fit_transform(self):
    """LDA stacked on TfIdf yields the expected per-document topic mixtures.

    Verifies both fit_transform and the fit-then-transform code paths
    against the same hard-coded expectation (seed is reset in between so
    both runs start from identical randomness).
    """
    featurizer = TextWiser(Embedding.TfIdf(min_df=2),
                           Transformation.LDA(n_components=2),
                           dtype=torch.float32)
    # Topic distributions per document; rows sum to 1.
    expected = torch.tensor([[0.7724367976, 0.2275632024],
                             [0.5895692706, 0.4104307294],
                             [0.2381444573, 0.7618555427]],
                            dtype=torch.float32)
    self._test_fit_transform(featurizer, expected)
    self._reset_seed()
    self._test_fit_before_transform(featurizer, expected)
def test_finetune_validation(self):
    """Validate which embedding/transformation stacks accept is_finetuneable.

    Covers: numpy dtype (never fine-tuneable), word2vec (fine-tuneable),
    ELMo (not), TfIdf alone (not), TfIdf+SVD (SVD is), LDA (blocks
    gradients), and compound schemas where one fine-tuneable component
    suffices.
    """
    # A numpy dtype means no gradients at all, so requesting fine-tuning
    # must raise TypeError.
    # NOTE(review): 'en_turian' (underscore) differs from the 'en-turian'
    # used below; presumably irrelevant since the dtype check fires first
    # — confirm.
    with self.assertRaises(TypeError):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en_turian'),
                      dtype=np.float32, is_finetuneable=True)
    # Word2Vec embeddings expose trainable weights: must NOT raise.
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                      dtype=torch.float32, is_finetuneable=True, lazy_load=True)
    except ValueError:
        self.fail("Word2vec is fine tuneable")
    # ELMo weights are frozen: requesting fine-tuning must raise.
    with self.assertRaises(ValueError):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.elmo),
                      dtype=torch.float32, is_finetuneable=True, lazy_load=True)
    # TfIdf by itself has nothing to fine-tune: must raise.
    with self.assertRaises(ValueError):
        TextWiser(Embedding.TfIdf(), dtype=torch.float32,
                  is_finetuneable=True, lazy_load=True)
    # ...but adding SVD on top supplies trainable weights: must NOT raise.
    try:
        TextWiser(Embedding.TfIdf(), Transformation.SVD(),
                  dtype=torch.float32, is_finetuneable=True, lazy_load=True)
    except ValueError:
        self.fail("SVD is fine tuneable")
    # LDA cannot propagate gradients, poisoning the whole stack: must raise.
    with self.assertRaises(ValueError):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en'),
                      Transformation.LDA(),
                      dtype=torch.float32, is_finetuneable=True, lazy_load=True)
    # Compound schema: one fine-tuneable branch (word2vec) is enough.
    schema = {
        'concat': [{
            'transform': [('word2vec', {
                'pretrained': 'en-turian'
            }), ('pool', {
                'pool_option': 'max'
            })]
        }, {
            'transform': ['tfidf', ('nmf', {
                'n_components': 30
            })]
        }]
    }
    try:
        TextWiser(Embedding.Compound(schema=schema), dtype=torch.float32,
                  is_finetuneable=True, lazy_load=True)
    except ValueError:
        self.fail("Any fine-tuneable weights is enough for the model to be fine-tuneable")
    # Compound schema: SVD after tfidf is fine-tuneable: must NOT raise.
    schema = {'transform': ['tfidf', 'svd']}
    try:
        TextWiser(Embedding.Compound(schema=schema), dtype=torch.float32,
                  is_finetuneable=True, lazy_load=True)
    except ValueError:
        self.fail("SVD is fine tuneable")
def test_min_components(self):
    """LDA must reject a single-component configuration with ValueError."""
    embedding = Embedding.TfIdf(min_df=2)
    # n_components=1 is below the minimum LDA supports.
    too_few = Transformation.LDA(n_components=1)
    with self.assertRaises(ValueError):
        TextWiser(embedding, too_few, dtype=torch.float32)