def test_boston(self):
    """Fit an L2 Normalizer on the Boston data and check the converted spec
    reproduces scikit-learn's transform output."""
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    scikit_model = Normalizer(norm="l2").fit(scikit_data.data)
    spec = converter.convert(scikit_model, scikit_data.feature_names, "out")

    # One input dict per row, keyed by feature name; expected output is the
    # scikit-learn transform of the same rows.
    input_data = [
        dict(zip(scikit_data.feature_names, row)) for row in scikit_data.data
    ]
    output_data = [
        {"out": transformed} for transformed in scikit_model.transform(scikit_data.data)
    ]
    evaluate_transformer(spec, input_data, output_data)
def test_boston_OHE_pipeline(self):
    """Convert an OHE + Normalizer pipeline over several categorical-column
    choices and verify the converted model matches scikit-learn's output."""
    data = load_boston()

    for categorical_features in [[3], [8], [3, 8], [8, 3]]:
        # Put it in a pipeline so that we can test whether the output dimension
        # handling is correct.
        model = Pipeline([
            ("OHE", OneHotEncoder(categorical_features=categorical_features)),
            ("Normalizer", Normalizer()),
        ])
        model.fit(data.data.copy(), data.target)

        # Convert the model
        spec = sklearn.convert(model, data.feature_names, "out").get_spec()

        # Prediction requires the Core ML runtime, only present on >= 10.13.
        if macos_version() >= (10, 13):
            input_data = [dict(zip(data.feature_names, row)) for row in data.data]
            output_data = [
                {"out": transformed}
                for transformed in model.transform(data.data.copy())
            ]
            result = evaluate_transformer(spec, input_data, output_data)
            assert result["num_errors"] == 0
def test_random(self):
    """Exercise every Normalizer norm option on random data."""
    # Generate some random data_imputeValue.multiArrayValue[i]
    X = _np.random.random(size=(50, 3))

    feature_names = ["a", "b", "c"]
    for norm_type in ("l1", "l2", "max"):
        cur_model = Normalizer(norm=norm_type)
        expected = cur_model.fit_transform(X)

        spec = converter.convert(cur_model, feature_names, "out")
        input_rows = [dict(zip(feature_names, row)) for row in X]
        output_rows = [{"out": row} for row in expected]
        evaluate_transformer(spec, input_rows, output_rows)
def test_random():
    """Exercise every Normalizer norm option on random data."""
    # Generate some random data_imputeValue.multiArrayValue[i]
    X = _np.random.random(size=(50, 3))

    names = ["a", "b", "c"]
    for norm_type in ("l1", "l2", "max"):
        cur_model = Normalizer(norm=norm_type)
        expected = cur_model.fit_transform(X)

        spec = converter.convert(cur_model, names, "out")
        evaluate_transformer(
            spec,
            [dict(zip(names, row)) for row in X],
            [{"out": row} for row in expected],
        )
def test_conversion_one_column_of_several(self):
    """Convert an OHE restricted to the first of two columns and validate.

    Fixes: `assertEquals` is a deprecated alias (removed in Python 3.12);
    use `assertEqual`. The metrics assertion is kept inside the macOS guard
    because `metrics` is only bound when evaluation actually runs.
    """
    scikit_model = OneHotEncoder(categorical_features=[0])
    scikit_model.fit(copy(self.scikit_data_multiple_cols))
    spec = sklearn.convert(
        scikit_model, ["feature_1", "feature_2"], "out"
    ).get_spec()

    # Prediction requires the Core ML runtime, only present on >= 10.13.
    if macos_version() >= (10, 13):
        test_data = [
            {"feature_1": row[0], "feature_2": row[1]}
            for row in self.scikit_data_multiple_cols
        ]
        scikit_output = [
            {"out": row}
            for row in scikit_model.transform(
                self.scikit_data_multiple_cols
            ).toarray()
        ]
        metrics = evaluate_transformer(spec, test_data, scikit_output)
        self.assertEqual(metrics["num_errors"], 0)

    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)
def _test_conversion(self, data, trained_dict_vectorizer):
    """Convert a fitted DictVectorizer and check the converted model agrees
    with scikit-learn's transform on *data*."""
    X = trained_dict_vectorizer.transform(data)

    m = sklearn.convert(
        trained_dict_vectorizer,
        input_features="features",
        output_feature_names="output",
    )

    expected = [{"output": x_r} for x_r in X]
    observed_inputs = [{"features": row} for row in data]
    ret = evaluate_transformer(m, observed_inputs, expected, True)
    assert ret["num_errors"] == 0
def test_conversion_one_column(self):
    """Convert a single-column OHE and validate against scikit-learn.

    Fixes: `assertEquals` is a deprecated alias (removed in Python 3.12);
    use `assertEqual`. The metrics assertion is kept inside the macOS guard
    because `metrics` is only bound when evaluation actually runs.
    """
    # Fit a single OHE
    scikit_model = OneHotEncoder()
    scikit_model.fit(self.scikit_data)
    spec = sklearn.convert(scikit_model, "single_feature", "out").get_spec()

    # Prediction requires the Core ML runtime, only present on >= 10.13.
    if macos_version() >= (10, 13):
        test_data = [{"single_feature": row} for row in self.scikit_data]
        scikit_output = [
            {"out": row}
            for row in scikit_model.transform(self.scikit_data).toarray()
        ]
        metrics = evaluate_transformer(spec, test_data, scikit_output)
        self.assertEqual(metrics["num_errors"], 0)

    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)
def test_random(self):
    """Fit a StandardScaler on random data and verify the converted spec."""
    # Generate some random data
    X = _np.random.random(size=(50, 3))
    cur_model = StandardScaler()
    expected = cur_model.fit_transform(X)

    names = ["a", "b", "c"]
    spec = converter.convert(cur_model, names, "out").get_spec()

    # Prediction requires the Core ML runtime, only present on >= 10.13.
    if macos_version() >= (10, 13):
        metrics = evaluate_transformer(
            spec,
            [dict(zip(names, row)) for row in X],
            [{"out": row} for row in expected],
        )
        assert metrics["num_errors"] == 0
def test_conversion_one_column(self):
    """Convert a single-column OHE and validate against scikit-learn output."""
    # Fit a single OHE
    scikit_model = OneHotEncoder()
    scikit_model.fit(self.scikit_data)
    spec = sklearn.convert(scikit_model, "single_feature", "out").get_spec()

    test_data = [{"single_feature": value} for value in self.scikit_data]
    scikit_output = [
        {"out": encoded}
        for encoded in scikit_model.transform(self.scikit_data).toarray()
    ]
    metrics = evaluate_transformer(spec, test_data, scikit_output)

    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)
    self.assertEqual(metrics["num_errors"], 0)
def test_boston(self):
    """Fit a StandardScaler on the Boston data and verify the converted spec."""
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    scikit_model = StandardScaler().fit(scikit_data.data)
    spec = converter.convert(
        scikit_model, scikit_data.feature_names, "out"
    ).get_spec()

    # Prediction requires the Core ML runtime, only present on >= 10.13.
    if macos_version() >= (10, 13):
        input_data = [
            dict(zip(scikit_data.feature_names, row)) for row in scikit_data.data
        ]
        output_data = [
            {"out": scaled} for scaled in scikit_model.transform(scikit_data.data)
        ]
        metrics = evaluate_transformer(spec, input_data, output_data)
        assert metrics["num_errors"] == 0
def test_boston_OHE(self):
    """Convert a dense OneHotEncoder over several categorical-column choices
    and verify the converted model matches scikit-learn's output."""
    data = load_boston()

    for categorical_features in [[3], [8], [3, 8], [8, 3]]:
        model = OneHotEncoder(
            categorical_features=categorical_features, sparse=False
        )
        model.fit(data.data, data.target)

        # Convert the model
        spec = sklearn.convert(model, data.feature_names, "out").get_spec()

        input_data = [dict(zip(data.feature_names, row)) for row in data.data]
        output_data = [{"out": encoded} for encoded in model.transform(data.data)]

        # Prediction requires the Core ML runtime, only present on >= 10.13.
        if macos_version() >= (10, 13):
            result = evaluate_transformer(spec, input_data, output_data)
            assert result["num_errors"] == 0
def test_boston_OHE_plus_normalizer(self):
    """Convert an OHE + StandardScaler pipeline and verify its predictions."""
    data = load_boston()

    pl = Pipeline([
        ("OHE", OneHotEncoder(categorical_features=[8], sparse=False)),
        ("Scaler", StandardScaler()),
    ])
    pl.fit(data.data, data.target)

    # Convert the model
    spec = convert(pl, data.feature_names, "out")

    # Prediction requires the Core ML runtime, only present on >= 10.13.
    if macos_version() >= (10, 13):
        input_data = [dict(zip(data.feature_names, row)) for row in data.data]
        output_data = [{"out": transformed} for transformed in pl.transform(data.data)]
        result = evaluate_transformer(spec, input_data, output_data)
        assert result["num_errors"] == 0
def test_conversion_many_columns(self):
    """Convert an OHE over all columns and validate against scikit-learn.

    Fixes: `assertEquals` is a deprecated alias (removed in Python 3.12);
    use `assertEqual`.
    """
    scikit_model = OneHotEncoder()
    scikit_model.fit(self.scikit_data_multiple_cols)
    spec = sklearn.convert(
        scikit_model, ["feature_1", "feature_2"], "out"
    ).get_spec()

    test_data = [
        {"feature_1": row[0], "feature_2": row[1]}
        for row in self.scikit_data_multiple_cols
    ]
    scikit_output = [
        {"out": row}
        for row in scikit_model.transform(
            self.scikit_data_multiple_cols
        ).toarray()
    ]
    metrics = evaluate_transformer(spec, test_data, scikit_output)

    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)
    self.assertEqual(metrics["num_errors"], 0)
def test_conversion_one_column_of_several(self):
    """Convert an OHE restricted to the first of two columns and validate
    against scikit-learn output."""
    scikit_model = OneHotEncoder(categorical_features=[0])
    scikit_model.fit(copy(self.scikit_data_multiple_cols))
    spec = sklearn.convert(
        scikit_model, ["feature_1", "feature_2"], "out"
    ).get_spec()

    test_data = [
        {"feature_1": row[0], "feature_2": row[1]}
        for row in self.scikit_data_multiple_cols
    ]
    scikit_output = [
        {"out": encoded}
        for encoded in scikit_model.transform(
            self.scikit_data_multiple_cols
        ).toarray()
    ]
    metrics = evaluate_transformer(spec, test_data, scikit_output)

    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)
    self.assertEqual(metrics["num_errors"], 0)
def test_random():
    """Fit a StandardScaler on random data and verify the converted spec."""
    # Generate some random data
    X = _np.random.random(size=(50, 3))
    cur_model = StandardScaler()
    expected = cur_model.fit_transform(X)

    names = ["a", "b", "c"]
    spec = converter.convert(cur_model, names, "out").get_spec()

    metrics = evaluate_transformer(
        spec,
        [dict(zip(names, row)) for row in X],
        [{"out": row} for row in expected],
    )
    if metrics["num_errors"] != 0:
        raise AssertionError
def test_conversion_boston(self):
    """Convert Imputers over all strategy / missing-value combinations on
    Boston data with randomly planted missing entries, and validate."""
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    sh = scikit_data.data.shape

    # Plant one missing value per row at reproducible random positions.
    rn.seed(0)
    missing_value_indices = [
        (rn.randint(sh[0]), rn.randint(sh[1])) for k in range(sh[0])
    ]

    for strategy in ["mean", "median", "most_frequent"]:
        for missing_value in [0, "NaN", -999]:
            X = np.array(scikit_data.data).copy()
            for i, j in missing_value_indices:
                X[i, j] = missing_value

            model = Imputer(missing_values=missing_value, strategy=strategy)
            model = model.fit(X)
            tr_X = model.transform(X.copy())

            spec = converter.convert(model, scikit_data.feature_names, "out")

            # Prediction requires the Core ML runtime (macOS >= 10.13).
            if macos_version() >= (10, 13):
                input_data = [
                    dict(zip(scikit_data.feature_names, row)) for row in X
                ]
                output_data = [{"out": row} for row in tr_X]
                result = evaluate_transformer(spec, input_data, output_data)
                assert result["num_errors"] == 0
def test_boston():
    """Fit a StandardScaler on the Boston data and verify the converted spec."""
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    scikit_model = StandardScaler().fit(scikit_data.data)
    spec = converter.convert(
        scikit_model, scikit_data.feature_names, "out"
    ).get_spec()

    input_data = [
        dict(zip(scikit_data.feature_names, row)) for row in scikit_data.data
    ]
    output_data = [
        {"out": scaled} for scaled in scikit_model.transform(scikit_data.data)
    ]
    metrics = evaluate_transformer(spec, input_data, output_data)
    if metrics["num_errors"] != 0:
        raise AssertionError
def test_random_sparse_data(self):
    """Stress the OHE converter (alone and in a pipeline) across dtypes,
    sparse/dense output, and several categorical-column selections."""
    n_columns = 8
    n_categories = 20

    import numpy.random as rn

    rn.seed(0)
    categories = rn.randint(50000, size=(n_columns, n_categories))

    for dt in ["int32", "float32", "float64"]:
        _X = np.array(
            [
                [categories[j, rn.randint(n_categories)] for j in range(n_columns)]
                for i in range(100)
            ],
            dtype=dt,
        )

        # Test this data on a bunch of possible inputs.
        for sparse in (True, False):
            for categorical_features in [
                "all",
                [3],
                [4],
                range(2, 8),
                range(0, 4),
                range(0, 8),
            ]:
                X = _X.copy()

                # This appears to be the only type now working.
                if X.dtype != np.dtype(dt):
                    raise AssertionError

                model = OneHotEncoder(
                    categorical_features=categorical_features, sparse=sparse
                )
                model.fit(X)

                # Convert the model
                spec = sklearn.convert(model, [("data", Array(n_columns))], "out")

                X_out = model.transform(X)
                if sparse:
                    X_out = X_out.todense()

                input_data = [{"data": row} for row in X]
                output_data = [{"out": row} for row in X_out]
                result = evaluate_transformer(spec, input_data, output_data)
                if result["num_errors"] != 0:
                    raise AssertionError

        # Test normal data inside a pipeline
        for sparse in (True, False):
            for categorical_features in [
                "all",
                [3],
                [4],
                range(2, 8),
                range(0, 4),
                range(0, 8),
            ]:
                X = _X.copy()

                model = Pipeline([
                    (
                        "OHE",
                        OneHotEncoder(
                            categorical_features=categorical_features,
                            sparse=sparse,
                        ),
                    ),
                    ("Normalizer", Normalizer()),
                ])
                model.fit(X)

                # Convert the model
                spec = sklearn.convert(
                    model, [("data", Array(n_columns))], "out"
                ).get_spec()

                X_out = model.transform(X)
                if sparse:
                    X_out = X_out.todense()

                input_data = [{"data": row} for row in X]
                output_data = [{"out": row} for row in X_out]
                result = evaluate_transformer(spec, input_data, output_data)
                if result["num_errors"] != 0:
                    raise AssertionError
def test_random_sparse_data(self):
    """Stress the OHE converter (alone and in a pipeline) across dtypes,
    sparse/dense output, and several categorical-column selections;
    prediction runs only on macOS >= 10.13."""
    n_columns = 8
    n_categories = 20

    import numpy.random as rn

    rn.seed(0)
    categories = rn.randint(50000, size=(n_columns, n_categories))

    for dt in ["int32", "float32", "float64"]:
        _X = np.array(
            [
                [categories[j, rn.randint(n_categories)] for j in range(n_columns)]
                for i in range(100)
            ],
            dtype=dt,
        )

        # Test this data on a bunch of possible inputs.
        for sparse in (True, False):
            for categorical_features in [
                "all",
                [3],
                [4],
                range(2, 8),
                range(0, 4),
                range(0, 8),
            ]:
                X = _X.copy()

                # This appears to be the only type now working.
                assert X.dtype == np.dtype(dt)

                model = OneHotEncoder(
                    categorical_features=categorical_features, sparse=sparse
                )
                model.fit(X)

                # Convert the model
                spec = sklearn.convert(model, [("data", Array(n_columns))], "out")

                if macos_version() >= (10, 13):
                    X_out = model.transform(X)
                    if sparse:
                        X_out = X_out.todense()

                    input_data = [{"data": row} for row in X]
                    output_data = [{"out": row} for row in X_out]
                    result = evaluate_transformer(spec, input_data, output_data)
                    assert result["num_errors"] == 0

        # Test normal data inside a pipeline
        for sparse in (True, False):
            for categorical_features in [
                "all",
                [3],
                [4],
                range(2, 8),
                range(0, 4),
                range(0, 8),
            ]:
                X = _X.copy()

                model = Pipeline([
                    (
                        "OHE",
                        OneHotEncoder(
                            categorical_features=categorical_features,
                            sparse=sparse,
                        ),
                    ),
                    ("Normalizer", Normalizer()),
                ])
                model.fit(X)

                # Convert the model
                spec = sklearn.convert(
                    model, [("data", Array(n_columns))], "out"
                ).get_spec()

                if macos_version() >= (10, 13):
                    X_out = model.transform(X)
                    if sparse:
                        X_out = X_out.todense()

                    input_data = [{"data": row} for row in X]
                    output_data = [{"out": row} for row in X_out]
                    result = evaluate_transformer(spec, input_data, output_data)
                    assert result["num_errors"] == 0