def test_set_params(self):
    """Check that ``set_params`` reaches nested components via the double-underscore path syntax."""
    # Parameters inside container classes (embedding / transformations list).
    tw = TextWiser(Embedding.TfIdf(min_df=5), Transformation.NMF(n_components=30), lazy_load=True)
    tw.set_params(embedding__min_df=10, transformations__0__n_components=10)
    self.assertEqual(tw.embedding.min_df, 10)
    self.assertEqual(tw.transformations[0].n_components, 10)

    # Parameters inside the fitted implementation objects (``_imp``).
    tw = TextWiser(Embedding.Doc2Vec(vector_size=2, min_count=1, workers=1))
    tw.fit(docs)
    tw.set_params(_imp__0__seed=10)
    self.assertEqual(tw._imp[0].seed, 10)

    # Parameters nested inside a Compound schema dict.
    schema = {'transform': ['tfidf', ['nmf', {'n_components': 30}]]}
    tw = TextWiser(Embedding.Compound(schema=schema))
    tw.set_params(embedding__schema__transform__0__min_df=10,
                  embedding__schema__transform__1__n_components=10)
    self.assertEqual(tw.embedding.schema['transform'][0][1]['min_df'], 10)
    self.assertEqual(tw.embedding.schema['transform'][1][1]['n_components'], 10)

    # Replace a single element of a schema list...
    tw.set_params(embedding__schema__transform__0='bow')
    self.assertEqual(tw.embedding.schema['transform'][0], 'bow')

    # ...and replace an entire schema sub-list.
    tw.set_params(embedding__schema__transform=['bow'])
    self.assertEqual(tw.embedding.schema['transform'][0], 'bow')
def _test_schema(self, schema):
    """Fit-transform a Compound model built from ``schema`` and compare against pinned values.

    Uses a loose tolerance (atol=1e-4) when matching the reference tensor.
    """
    tw = TextWiser(Embedding.Compound(schema=schema), dtype=torch.float32)
    # Reference embedding for the three-document fixture; values pinned from a seeded run.
    expected = torch.tensor(
        [[-1.5983865261, 1.8820908070, 0.1802073568],
         [-1.8616025448, -0.4420224428, -0.9159017205],
         [-2.0401582718, -1.0712100267, 0.6945561171]],
        dtype=torch.float32)
    self._test_fit_transform(tw, expected, atol=1e-4)
    self._reset_seed()
    self._test_fit_before_transform(tw, expected, atol=1e-4)
def _test_schema(self, schema):
    """Fit-transform a Compound model built from ``schema`` and compare against pinned values.

    Uses the helpers' default tolerance when matching the reference tensor.
    """
    tw = TextWiser(Embedding.Compound(schema=schema), dtype=torch.float32)
    # Reference embedding for the three-document fixture; values pinned from a seeded run.
    expected = torch.tensor(
        [[-1.5983779430, 1.8820992708, 0.1802130789],
         [-1.8616007566, -0.4420076311, -0.9159148335],
         [-2.0401744843, -1.0712141991, 0.6945576668]],
        dtype=torch.float32)
    self._test_fit_transform(tw, expected)
    self._reset_seed()
    self._test_fit_before_transform(tw, expected)
def test_immutable_schema(self):
    """A Compound embedding must keep its own copy of the schema, isolated from the caller."""
    schema = {
        "transform": [
            ["word", {"word_option": "word2vec", "pretrained": "en-turian"}],
            ["pool", {"pool_option": "max"}],
        ]
    }
    emb = Embedding.Compound(schema=schema)
    # Mutate the caller-side dict after construction...
    schema['transform'][1][1]['pool_option'] = 'min'
    # ...and verify the embedding's stored schema is unaffected.
    self.assertEqual(emb.schema['transform'][1][1]['pool_option'], 'max')
def test_finetune_validation(self):
    """Validate which embedding/transformation stacks are allowed to be fine-tuned.

    The rule exercised here: a model is fine-tuneable iff its dtype is a torch
    dtype AND at least one component can propagate gradients.
    """
    # Nothing is fine-tuneable if dtype is numpy.
    with self.assertRaises(TypeError):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # NOTE(review): was pretrained='en_turian' (underscore); every other call in this
            # method spells it 'en-turian'. Normalized for consistency — the assertion
            # targets the numpy-dtype TypeError, not the pretrained name.
            TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                      dtype=np.float32, is_finetuneable=True)

    # Word2Vec is fine-tuneable.
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                      dtype=torch.float32, is_finetuneable=True, lazy_load=True)
    except ValueError:
        self.fail("Word2vec is fine tuneable")

    # ELMo is not fine-tuneable, and should raise an error.
    with self.assertRaises(ValueError):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.elmo),
                      dtype=torch.float32, is_finetuneable=True, lazy_load=True)

    # TfIdf is not fine-tuneable, and should raise an error.
    with self.assertRaises(ValueError):
        TextWiser(Embedding.TfIdf(), dtype=torch.float32, is_finetuneable=True, lazy_load=True)

    # TfIdf is not fine-tuneable, but SVD is.
    try:
        TextWiser(Embedding.TfIdf(), Transformation.SVD(),
                  dtype=torch.float32, is_finetuneable=True, lazy_load=True)
    except ValueError:
        self.fail("SVD is fine tuneable")

    # LDA cannot propagate gradients, so the whole thing is not fine-tuneable.
    with self.assertRaises(ValueError):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en'),
                      Transformation.LDA(),
                      dtype=torch.float32, is_finetuneable=True, lazy_load=True)

    schema = {
        'concat': [
            {'transform': [('word2vec', {'pretrained': 'en-turian'}),
                           ('pool', {'pool_option': 'max'})]},
            {'transform': ['tfidf', ('nmf', {'n_components': 30})]},
        ]
    }
    # Word2Vec is fine-tuneable, therefore the whole schema is fine-tuneable.
    try:
        TextWiser(Embedding.Compound(schema=schema),
                  dtype=torch.float32, is_finetuneable=True, lazy_load=True)
    except ValueError:
        self.fail("Any fine-tuneable weights is enough for the model to be fine-tuneable")

    # TfIdf is not fine-tuneable, but SVD is.
    schema = {'transform': ['tfidf', 'svd']}
    try:
        TextWiser(Embedding.Compound(schema=schema),
                  dtype=torch.float32, is_finetuneable=True, lazy_load=True)
    except ValueError:
        self.fail("SVD is fine tuneable")
def test_no_pretrained(self):
    """A Compound embedding should reject a top-level ``pretrained`` argument."""
    with self.assertRaises(ValueError):
        TextWiser(Embedding.Compound(schema='tfidf', pretrained='path'),
                  dtype=torch.float32)