def setUpClass(cls):
    """
    Prepare the fixtures shared by the tests: the feature set, the records
    built from DATA, an in-memory source and the model under test.
    """
    cls.features = Features()
    cls.features.append(Feature("A", str, 1))
    # DATA is unpacked into exactly two columns: the "A" values and the
    # "X" values.
    column_a, column_x = zip(*DATA)
    cls.records = [
        Record(str(index), data={"features": {"A": a_value, "X": x_value}})
        for index, (a_value, x_value) in enumerate(zip(column_a, column_x))
    ]
    memory_source = MemorySource(MemorySourceConfig(records=cls.records))
    cls.sources = Sources(memory_source)
    # The temporary directory is kept on the class so it outlives this call.
    cls.model_dir = tempfile.TemporaryDirectory()
    classifier_config = TextClassifierConfig(
        directory=cls.model_dir.name,
        classifications=[0, 1],
        features=cls.features,
        predict=Feature("X", int, 1),
        add_layers=True,
        # NOTE(review): the second layer passes relu unquoted, unlike the
        # other two layers -- confirm this is intended.
        layers=[
            "Dense(units = 120, activation='relu')",
            "Dense(units = 64, activation=relu)",
            "Dense(units = 2, activation='softmax')",
        ],
        model_path="https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim-with-oov/1",
        epochs=30,
    )
    cls.model = TextClassificationModel(classifier_config)
async def test_model(self):
    """
    Configure a fake model called "salary" through the HTTP API, check the
    configuration that was stored for it, then create the "salaryctx"
    model context.
    """
    with tempfile.TemporaryDirectory() as tempdir, patch.object(
        Model, "load", new=model_load
    ):
        cli_arguments = (
            "--model-directory",
            tempdir,
            "--model-features",
            "Years:int:1",
            "Experiance:int:1",
            "--model-predict",
            "Salary:float:1",
        )
        config = parse_unknown(*cli_arguments)
        # What the service is expected to have stored after configuration.
        expected_config = FakeModelConfig(
            directory=pathlib.Path(tempdir),
            features=Features(
                Feature("Years", int, 1),
                Feature("Experiance", int, 1),
            ),
            predict=Feature("Salary", float, 1),
        )
        async with self.post(
            "/configure/model/fake/salary", json=config
        ) as r:
            self.assertEqual(await r.json(), OK)
            registered_models = self.cli.app["models"]
            self.assertIn("salary", registered_models)
            self.assertEqual(
                registered_models["salary"].config, expected_config
            )
            with self.subTest(context="salaryctx"):
                # Create the context
                async with self.get(
                    "/context/model/salary/salaryctx"
                ) as r:
                    self.assertEqual(await r.json(), OK)
                    self.assertIn(
                        "salaryctx", self.cli.app["model_contexts"]
                    )
def setUpClass(cls):
    """
    Create 1000 records per class ("a" / "not a"), wrap them in an in-memory
    source and configure the DNN classifier under test.
    """
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.feature = Feature("starts_with_a", int, 1)
    cls.features = Features(cls.feature)
    # (key prefix, feature value, label) per class, in generation order.
    class_specs = (("a", 1, "a"), ("b", 0, "not a"))
    generated = []
    for key_prefix, flag, label in class_specs:
        for _ in range(1000):
            generated.append(
                Record(
                    key_prefix + str(random.random()),
                    data={
                        "features": {cls.feature.name: flag, "string": label}
                    },
                )
            )
    cls.records = generated
    memory_source = MemorySource(MemorySourceConfig(records=cls.records))
    cls.sources = Sources(memory_source)
    classifier_config = DNNClassifierModelConfig(
        directory=cls.model_dir.name,
        steps=1000,
        epochs=40,
        hidden=[50, 20, 10],
        predict=Feature("string", str, 1),
        classifications=["a", "not a"],
        clstype=str,
        features=cls.features,
    )
    cls.model = DNNClassifierModel(classifier_config)
class TestFeature(AsyncTestCase):
    """Checks Feature defaults, definition loading and dtype conversion."""

    def setUp(self):
        # Every test starts from a Feature constructed without arguments.
        self.feature = Feature()

    def test_default_dtype(self):
        """A default Feature reports int as its dtype."""
        reported_dtype = self.feature.dtype()
        self.assertEqual(reported_dtype, int)

    def test_default_length(self):
        """A default Feature reports a length of 1."""
        reported_length = self.feature.length()
        self.assertEqual(reported_length, 1)

    async def test_default_applicable(self):
        """A default Feature reports True from applicable()."""
        is_applicable = await self.feature.applicable(Data("test"))
        self.assertEqual(is_applicable, True)

    def test_load_def(self):
        """load_def() yields a Feature carrying the given definition."""
        loaded = Feature.load_def("test", "float", 10)
        self.assertEqual(loaded.NAME, "test")
        self.assertEqual(loaded.dtype(), float)
        self.assertEqual(loaded.length(), 10)

    def test_convert_dtype(self):
        """The string "float" converts to the float type."""
        converted = Feature.convert_dtype("float")
        self.assertEqual(converted, float)

    def test_convert_dtype_invalid(self):
        """An unknown type name is rejected with a TypeError."""
        unknown_type_name = "not a python data type"
        with self.assertRaisesRegex(TypeError, "Failed to convert"):
            Feature.convert_dtype(unknown_type_name)
def setUpClass(cls):
    """
    Configure the DNN regression model and generate 2000 training records
    whose TARGET is 2 * feature_1 + 3 * feature_2.
    """
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.feature1 = Feature("feature_1", float, 1)
    cls.feature2 = Feature("feature_2", float, 1)
    cls.features = Features(cls.feature1, cls.feature2)
    regression_config = DNNRegressionModelConfig(
        directory=cls.model_dir.name,
        steps=1000,
        epochs=40,
        hidden=[50, 20, 10],
        predict=Feature("TARGET", float, 1),
        features=cls.features,
    )
    cls.model = DNNRegressionModel(regression_config)
    # Generating data f(x1,x2) = 2*x1 + 3*x2
    sample_count = 2000
    first_inputs, second_inputs = np.random.rand(2, sample_count)
    samples = []
    for x1, x2 in zip(first_inputs, second_inputs):
        samples.append(
            Record(
                "x" + str(random.random()),
                data={
                    "features": {
                        cls.feature1.name: float(x1),
                        cls.feature2.name: float(x2),
                        # Left as the numpy scalar, unlike the two inputs.
                        "TARGET": 2 * x1 + 3 * x2,
                    }
                },
            )
        )
    cls.records = samples
    memory_source = MemorySource(MemorySourceConfig(records=cls.records))
    cls.sources = Sources(memory_source)
class TestFeature(AsyncTestCase):
    """Checks the defaults reported by a Feature built without arguments."""

    def setUp(self):
        # Fresh default Feature for every test.
        self.feature = Feature()

    def test_default_dtype(self):
        """The default dtype is int."""
        observed = self.feature.dtype()
        self.assertEqual(observed, int)

    def test_default_length(self):
        """The default length is 1."""
        observed = self.feature.length()
        self.assertEqual(observed, 1)

    async def test_default_applicable(self):
        """applicable() reports True for the default Feature."""
        observed = await self.feature.applicable(Data('test'))
        self.assertEqual(observed, True)
def feature_feature_column(self, feature: "Feature"):
    '''
    Creates a feature column for a feature

    Returns a numeric column named after ``feature.NAME`` with shape
    ``feature.length()`` when ``feature.dtype()`` is ``int``, ``float`` or a
    subclass of either. For anything else a warning is logged and ``None``
    is returned.
    '''
    dtype = feature.dtype()
    if inspect.isclass(dtype) and issubclass(dtype, (int, float)):
        return self._tf.feature_column.numeric_column(
            feature.NAME, shape=feature.length())
    # dtype is handed to the logger as an argument rather than formatted
    # with "%": "... %r ..." % (dtype) raises TypeError when dtype is itself
    # a tuple, because (dtype) is not a one-element tuple.
    LOGGER.warning('Unknown dtype %r. Cound not create column', dtype)
    return None
def setUpClass(cls):
    """
    Build the training and prediction records from TRAIN_DATA and
    PREDICT_DATA, wrap each set in an in-memory source and configure the
    NER model under test.
    """
    train_ids, train_words, train_tags = zip(*TRAIN_DATA)
    predict_ids, predict_words = zip(*PREDICT_DATA)

    training = []
    for index, (sentence_id, word, tag) in enumerate(
        zip(train_ids, train_words, train_tags)
    ):
        training.append(
            Record(
                str(index),
                data={
                    "features": {
                        "sentence_id": sentence_id,
                        "words": word,
                        "ner_tag": tag,
                    }
                },
            )
        )
    cls.train_records = training
    cls.train_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.train_records))
    )

    # Prediction records carry no "ner_tag" entry.
    to_predict = []
    for index, (sentence_id, word) in enumerate(
        zip(predict_ids, predict_words)
    ):
        to_predict.append(
            Record(
                str(index),
                data={
                    "features": {"sentence_id": sentence_id, "words": word}
                },
            )
        )
    cls.predict_records = to_predict
    cls.predict_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.predict_records))
    )

    cls.model_dir = tempfile.TemporaryDirectory()
    ner_config = NERModelConfig(
        sid=Feature("sentence_id", int, 1),
        words=Feature("words", str, 1),
        predict=Feature("ner_tag", str, 1),
        output_dir=cls.model_dir.name,
        model_architecture_type="bert",
        model_name_or_path="bert-base-cased",
        no_cuda=True,
    )
    cls.model = NERModel(ner_config)
def _feature_feature_column(self, feature: Feature):
    """
    Creates a feature column for a feature

    Returns a numeric column named after ``feature.NAME`` with shape
    ``feature.length()`` when ``feature.dtype()`` is ``int``, ``float`` or a
    subclass of either. For anything else a warning is logged and ``None``
    is returned.
    """
    dtype = feature.dtype()
    if inspect.isclass(dtype) and issubclass(dtype, (int, float)):
        return tensorflow.feature_column.numeric_column(
            feature.NAME, shape=feature.length())
    # dtype is handed to the logger as an argument rather than formatted
    # with "%": "... %r ..." % (dtype) raises TypeError when dtype is itself
    # a tuple, because (dtype) is not a one-element tuple.
    self.logger.warning("Unknown dtype %r. Cound not create column", dtype)
    return None
def setUpClass(cls):
    """
    Prepare the feature set, the records built from DATA, an in-memory
    source and the HuggingFace classification model under test.
    """
    cls.features = Features()
    cls.features.append(Feature("A", str, 1))
    # DATA is unpacked into exactly two columns: "A" values and "X" values.
    a_values, x_values = zip(*DATA)
    collected = []
    for position, pair in enumerate(zip(a_values, x_values)):
        a_item, x_item = pair
        collected.append(
            Record(
                str(position),
                data={"features": {"A": a_item, "X": x_item}},
            )
        )
    cls.records = collected
    cls.sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.records))
    )
    cls.model_dir = tempfile.TemporaryDirectory()
    # Cache, logging and output all share the one temporary directory.
    scratch_dir = cls.model_dir.name
    hf_config = HFClassificationModelConfig(
        model_name_or_path="bert-base-cased",
        cache_dir=scratch_dir,
        logging_dir=scratch_dir,
        output_dir=scratch_dir,
        features=cls.features,
        predict=Feature("X", int, 1),
        label_list=["0", "1"],
    )
    cls.model = HFClassificationModel(hf_config)
async def test_model(self):
    """
    Train and score the regression model over several rounds, then predict
    a single record and check the relative error of the prediction.
    """
    # Inserting zero so that it is 1-indexable
    test_feature_val = [0, 1.5, 2]
    # Should be the same function used to generate the data in setUpClass
    test_target = 2 * test_feature_val[1] + 3 * test_feature_val[2]
    a = Record(
        "a",
        data={
            "features": {
                self.feature1.name: test_feature_val[1],
                self.feature2.name: test_feature_val[2],
            }
        },
    )
    target_name = self.model.config.predict.name
    scorer = MeanSquaredErrorAccuracy()
    # NOTE(review): there is no break after a passing round, so all seven
    # rounds of training and scoring run -- confirm this is intended.
    for i in range(0, 7):
        await train(self.model, self.sources)
        res = await score(
            self.model, scorer, Feature("TARGET", float, 1), self.sources
        )
        # Retry because of tensorflow intermittent low accuracy
        if res <= 0.8 and i < 5:
            print("Retry i:", i, "accuracy:", res)
            # Start the next round from an empty model directory.
            self.model_dir.cleanup()
            self.model_dir = tempfile.TemporaryDirectory()
            self.model.config = self.model.config._replace(
                location=pathlib.Path(self.model_dir.name)
            )
            continue
        self.assertGreater(res, 0.0)
    res = [
        record
        async for record in predict(self.model, a, keep_record=True)
    ]
    self.assertEqual(len(res), 1)
    self.assertEqual(res[0].key, a.key)
    predicted = res[0].prediction(target_name).value
    # The epsilon guards the denominator; previously it was added to the
    # relative error itself, which skewed the value being compared.
    test_error_norm = abs((test_target - predicted) / (test_target + 1e-6))
    error_threshold = 0.3
    self.assertLess(test_error_norm, error_threshold)
async def test_model(self):
    """
    Train and score the classifier over several rounds, then predict a
    single record and check that a prediction value comes back.
    """
    scorer = ClassificationAccuracy()
    for attempt in range(7):
        await train(self.model, self.sources)
        res = await score(
            self.model, scorer, Feature("string", str, 1), self.sources
        )
        low_accuracy = res <= 0.9
        if not (low_accuracy and attempt < 5):
            self.assertGreater(res, 0.9)
            continue
        # Retry because of tensorflow intermitant low accuracy
        print("Retry i:", attempt, "accuracy:", res)
        # Point the model at a fresh, empty directory for the next round.
        self.model_dir.cleanup()
        self.model_dir = tempfile.TemporaryDirectory()
        self.model.config = self.model.config._replace(
            location=self.model_dir.name
        )
    sample = Record("a", data={"features": {self.feature.name: 1}})
    target_name = self.model.config.predict.name
    predictions = []
    async for record in predict(self.model, sample, keep_record=True):
        predictions.append(record)
    self.assertEqual(len(predictions), 1)
    first = predictions[0]
    self.assertEqual(first.key, sample.key)
    self.assertTrue(first.prediction(target_name).value)
def test_load_def(self):
    """A Feature exposes the name, dtype and length it was built with."""
    # TODO This test should be removed or its name should be modified.
    built = Feature("test", float, 10)
    expectations = (
        (built.name, "test"),
        (built.dtype, float),
        (built.length, 10),
    )
    for actual, expected in expectations:
        self.assertEqual(actual, expected)
def setUp(self):
    """Give each test a fresh single-element int Feature called "name"."""
    definition = ("name", int, 1)
    self.feature = Feature(*definition)
async def setUp(self):
    """
    Run the parent setUp, then create three Features and a Features
    collection that holds them in order.
    """
    await super().setUp()
    self.one, self.two, self.three = (
        Feature("one", int, 1),
        Feature("two", float, 2),
        Feature("three", int, 1),
    )
    members = (self.one, self.two, self.three)
    self.features = Features(*members)
async def setUp(self):
    """Run the parent setUp, then create the Feature used by the tests."""
    await super().setUp()
    feature_under_test = Feature("name", int, 1)
    self.feature = feature_under_test
def setUp(self):
    """
    Create the Features "one", "two" and "three" as attributes of the same
    name, plus a Features collection that holds them in order.
    """
    definitions = (("one", int, 1), ("two", float, 2), ("three", int, 1))
    for attribute, dtype, length in definitions:
        # The attribute name doubles as the Feature's name.
        setattr(self, attribute, Feature(attribute, dtype, length))
    self.features = Features(self.one, self.two, self.three)
def setUp(self):
    """Give each test a Feature constructed without any arguments."""
    default_feature = Feature()
    self.feature = default_feature
def setUpClass(cls):
    """
    Build the features, records, source, model and scorer for the model
    type named by cls.MODEL_TYPE (classification, regression or clustering).
    """
    cls.is_multi = "MULTI_" in cls.MODEL_TYPE
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.features = Features()

    def add_float_features(names):
        # Register one single-element float feature per name.
        for feature_name in names:
            cls.features.append(Feature(feature_name, float, 1))

    def build_records(names, columns):
        # One record per row; the key is the row index rendered as a string.
        return [
            Record(str(index), data={"features": dict(zip(names, row))})
            for index, row in enumerate(zip(*columns))
        ]

    if cls.MODEL_TYPE in classifier_types:
        A, B, C, D, E, F, G, H, X, Y = zip(*FEATURE_DATA_CLASSIFICATION)
        add_float_features(("A", "B", "C", "D", "E", "F", "G", "H"))
        if cls.MODEL_TYPE == "CLASSIFICATION":
            add_float_features(("X",))
        cls.records = build_records(
            ("A", "B", "C", "D", "E", "F", "G", "H", "X", "Y"),
            (A, B, C, D, E, F, G, H, X, Y),
        )
    elif cls.MODEL_TYPE in regressor_types:
        add_float_features(("A", "B", "C", "D"))
        if cls.MODEL_TYPE == "REGRESSION":
            add_float_features(("X",))
        A, B, C, D, X, Y = zip(*FEATURE_DATA_REGRESSION)
        cls.records = build_records(
            ("A", "B", "C", "D", "X", "Y"), (A, B, C, D, X, Y)
        )
    elif cls.MODEL_TYPE == "CLUSTERING":
        add_float_features(("A", "B", "C", "D"))
        A, B, C, D, X = zip(*FEATURE_DATA_CLUSTERING)
        cls.records = build_records(
            ("A", "B", "C", "D", "X"), (A, B, C, D, X)
        )

    memory_source = MemorySource(MemorySourceConfig(records=cls.records))
    cls.sources = Sources(memory_source)

    config_kwargs = {
        "location": cls.model_dir.name,
        "features": cls.features,
    }
    estimator_type = cls.MODEL.SCIKIT_MODEL._estimator_type
    if estimator_type in supervised_estimators:
        if cls.is_multi:
            config_kwargs["predict"] = Features(
                Feature("X", float, 1), Feature("Y", float, 1)
            )
        else:
            config_kwargs["predict"] = Feature("X", float, 1)
    elif estimator_type in unsupervised_estimators:
        # TODO If cls.TRUE_CLSTR_PRESENT then we want to use the
        # mutual_info_score scikit accuracy scorer. In this case we might
        # want to change tcluster to a boolean config property.
        # For more info see commit e4f523976bf37d3457cda140ceab7899420ae2c7
        config_kwargs["predict"] = Feature("X", float, 1)
    cls.model = cls.MODEL(cls.MODEL_CONFIG(**config_kwargs))
    cls.scorer = cls.SCORER()
def test_load_builtin_features(self):
    """Every entry of FEATURES must be among what Feature.load() returns."""
    loaded = Feature.load()
    for expected in FEATURES:
        # One sub-test per entry so a single failure does not hide the rest.
        with self.subTest(mustLoad=expected):
            self.assertIn(expected, loaded)
async def test_01_accuracy(self):
    """Scoring the model against the sources must produce a float."""
    res = await score(
        self.model, self.scorer, Feature("X", float, 1), self.sources
    )
    # assertIsInstance reports the offending value and its type on failure,
    # where assertTrue(isinstance(...)) only said "False is not true".
    self.assertIsInstance(res, float)
def setUpClass(cls):
    """
    Build the features, the records drawn from DATA, an in-memory source,
    the VW regression model and its scorer.
    """
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.features = Features()
    # A-F are float features, G and H are int features.
    typed_names = ((float, ("A", "B", "C", "D", "E", "F")), (int, ("G", "H")))
    for dtype, names in typed_names:
        for feature_name in names:
            cls.features.append(Feature(feature_name, dtype, 1))
    A, B, C, D, E, F, G, H, X = zip(*DATA)
    column_names = ("A", "B", "C", "D", "E", "F", "G", "H", "X")
    cls.records = [
        Record(str(index), data={"features": dict(zip(column_names, row))})
        for index, row in enumerate(zip(A, B, C, D, E, F, G, H, X))
    ]
    memory_source = MemorySource(MemorySourceConfig(records=cls.records))
    cls.sources = Sources(memory_source)
    vw_config = VWConfig(
        location=cls.model_dir.name,
        features=cls.features,
        predict=Feature("X", float, 1),
        # A and B will be namespace n1
        # A and C will be in namespace n2
        namespace=["n1_A_B", "n2_A_C"],
        importance=Feature("H", int, 1),
        tag=Feature("G", int, 1),
        task="regression",
        vwcmd=[
            "l2",
            "0.1",
            "loss_function",
            "squared",
            "passes",
            "10",
        ],
    )
    cls.model = VWModel(vw_config)
    cls.scorer = MeanSquaredErrorAccuracy()
def test_convert_dtype(self):
    """The string "float" converts to the float type."""
    converted = Feature.convert_dtype("float")
    self.assertEqual(converted, float)
def test_convert_dtype_invalid(self):
    """An unknown type name makes convert_dtype raise a TypeError."""
    unknown_type_name = "not a python data type"
    expected_message = "Failed to convert"
    with self.assertRaisesRegex(TypeError, expected_message):
        Feature.convert_dtype(unknown_type_name)
def test_load_def(self):
    """load_def() yields a Feature carrying the given name, dtype, length."""
    loaded = Feature.load_def("test", "float", 10)
    observed_name = loaded.NAME
    self.assertEqual(observed_name, "test")
    observed_dtype = loaded.dtype()
    self.assertEqual(observed_dtype, float)
    observed_length = loaded.length()
    self.assertEqual(observed_length, 10)
async def test_01_accuracy(self):
    """Scoring the model against the sources must give a positive value."""
    target = Feature("X", int, 1)
    accuracy = await score(self.model, self.scorer, target, self.sources)
    self.assertGreater(accuracy, 0)
Record( str(i), data={ "features": { "Years": A[i] * 10, "Expertise": B[i] * 10, "Trust": C[i] * 10, "Salary": D[i] * 10, } }, ) for i in range(len(A)) ] TEST_FEATURE = Features( Feature("Years", int, 1), Feature("Expertise", int, 1), Feature("Trust", float, 1), Feature("Salary", int, 1), ) TEST_DATAFLOW1 = DataFlow( operations={ "edit_feature": edit_feature, "associate_definition": AssociateDefinition, }, flow={ "edit_feature": InputFlow( inputs={ "features": [ {"seed": ["Years", "Expertise", "Trust", "Salary"]}
def test_feature(self):
    """The JSON produced for Feature("face") must contain the text "face"."""
    encoded = json.dumps(Feature("face"), cls=JSONEncoder)
    self.assertIn("face", encoded)
def setUpClass(cls):
    """
    Build the features, records, source and model for the model type named
    by cls.MODEL_TYPE ("CLASSIFICATION", "REGRESSION" or "CLUSTERING").
    """
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.features = Features()

    def add_float_features(names):
        # Register one single-element float feature per name.
        for feature_name in names:
            cls.features.append(Feature(feature_name, float, 1))

    def build_records(names, columns):
        # One record per row; the key is the row index rendered as a string.
        return [
            Record(str(index), data={"features": dict(zip(names, row))})
            for index, row in enumerate(zip(*columns))
        ]

    # MODEL_TYPE is compared by value. Identity comparison against a string
    # literal depends on interning and raises SyntaxWarning on Python 3.8+.
    if cls.MODEL_TYPE == "CLASSIFICATION":
        add_float_features(("A", "B", "C", "D", "E", "F", "G", "H", "I"))
        A, B, C, D, E, F, G, H, I, X = zip(*FEATURE_DATA_CLASSIFICATION)
        cls.records = build_records(
            ("A", "B", "C", "D", "E", "F", "G", "H", "I", "X"),
            (A, B, C, D, E, F, G, H, I, X),
        )
    elif cls.MODEL_TYPE == "REGRESSION":
        add_float_features(("A", "B", "C"))
        A, B, C, X = zip(*FEATURE_DATA_REGRESSION)
        cls.records = build_records(("A", "B", "C", "X"), (A, B, C, X))
    elif cls.MODEL_TYPE == "CLUSTERING":
        add_float_features(("A", "B", "C", "D"))
        A, B, C, D, X = zip(*FEATURE_DATA_CLUSTERING)
        cls.records = build_records(
            ("A", "B", "C", "D", "X"), (A, B, C, D, X)
        )

    cls.sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.records))
    )
    properties = {
        "directory": cls.model_dir.name,
        "features": cls.features,
    }
    config_fields = dict()
    estimator_type = cls.MODEL.SCIKIT_MODEL._estimator_type
    if estimator_type in supervised_estimators:
        config_fields["predict"] = Feature("X", float, 1)
    elif estimator_type in unsupervised_estimators:
        # "tcluster" is only set when the class declares TRUE_CLSTR_PRESENT.
        if cls.TRUE_CLSTR_PRESENT:
            config_fields["tcluster"] = Feature("X", float, 1)
    cls.model = cls.MODEL(
        cls.MODEL_CONFIG(**{**properties, **config_fields})
    )