def test_handles_individual_pipeline_components():
    # Don't throw a ValueError if a config with just a pre_process is passed.
    pre_process_only_config = {
        "pre_process": [{"name": "fillna", "field": iris_predictive_columns}],
    }
    pre_process_only_pipeline = pipeline_from_config(pre_process_only_config)

    # Don't throw a ValueError if a config with just transforms is passed.
    transform_only_config = {
        "transforms": [
            {
                "type": "featurizer",
                "transforms": [{"name": "quantile_numeric"}],
                "field": iris_predictive_columns,
            }
        ]
    }
    transform_only_pipeline = pipeline_from_config(transform_only_config)

    # Don't throw a ValueError if a config with just a post_process is passed.
    post_process_only_config = {
        "post_process": [{"name": "svd", "config": {"rank": 20}}]
    }
    post_process_only_pipeline = pipeline_from_config(post_process_only_config)


def test_throws_exception_on_invalid_inputs():
    # A featurizer with field=None should be rejected.
    with pytest.raises(ValueError):
        config = {
            "transforms": [
                {
                    "type": "featurizer",
                    "transforms": [{"name": "quantile_numeric"}],
                    "field": None,
                }
            ]
        }
        pipeline = pipeline_from_config(config)

    # A featurizer with an empty field list should be rejected.
    with pytest.raises(ValueError):
        config = {
            "transforms": [
                {
                    "type": "featurizer",
                    "transforms": [{"name": "quantile_numeric"}],
                    "field": [],
                }
            ]
        }
        pipeline = pipeline_from_config(config)

    # A pre_process step with field=None should be rejected.
    with pytest.raises(ValueError):
        config = {
            "pre_process": [{"name": "fillna", "field": None}],
            "transforms": [
                {
                    "type": "featurizer",
                    "transforms": [{"name": "quantile_numeric"}],
                    "field": iris_predictive_columns,
                }
            ],
        }
        pipeline = pipeline_from_config(config)

    # A pre_process step with an empty field list should be rejected.
    with pytest.raises(ValueError):
        config = {
            "pre_process": [{"name": "fillna", "field": []}],
            "transforms": [
                {
                    "type": "featurizer",
                    "transforms": [{"name": "quantile_numeric"}],
                    "field": iris_predictive_columns,
                }
            ],
        }
        pipeline = pipeline_from_config(config)

    # An unrecognized top-level key should be rejected.
    with pytest.raises(ValueError):
        config = {"foo": {"bar": "baz"}}
        pipeline = pipeline_from_config(config)


def test_handles_process_steps_individually():
    df, target = get_iris_dataframe()

    # Don't throw a ValueError if a config with just a pre_process is passed.
    pre_process_only_config = {
        "pre_process": [{"name": "fillna", "field": iris_predictive_columns}],
    }
    pre_process_only_pipeline = pipeline_from_config(pre_process_only_config)
    pre_process_X = pre_process_only_pipeline.fit_transform(df)
    assert pre_process_X.shape == df.shape
    assert np.isnan(pre_process_X).sum() == 0

    # Don't throw a ValueError if a config with just transforms is passed.
    transform_only_config = {
        "transforms": [
            {
                "type": "featurizer",
                "transforms": [{"name": "quantile_numeric"}],
                "field": iris_predictive_columns,
            }
        ]
    }
    transform_only_pipeline = pipeline_from_config(transform_only_config)
    transform_X = transform_only_pipeline.fit_transform(df)
    assert transform_X.shape[1] == 101 * df.shape[1]

    # Don't throw a ValueError if a config with just a post_process is passed.
    post_process_only_config = {
        "post_process": [{"name": "svd", "config": {"rank": 2}}]
    }
    post_process_only_pipeline = pipeline_from_config(post_process_only_config)
    post_process_X = post_process_only_pipeline.fit_transform(df)
    assert post_process_X.shape[1] == 2


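# Illustrative sketch only: the stage-by-stage tests above suggest that a single
# config can carry "pre_process", "transforms", and "post_process" together.
# This assumes pipeline_from_config chains the three stages in that order; the
# stage names and options are copied from the tests above, the final-shape
# expectation is an assumption, and the function is deliberately not collected
# by pytest.
def example_combined_config_sketch():
    df, target = get_iris_dataframe()
    combined_config = {
        "pre_process": [{"name": "fillna", "field": iris_predictive_columns}],
        "transforms": [
            {
                "type": "featurizer",
                "transforms": [{"name": "quantile_numeric"}],
                "field": iris_predictive_columns,
            }
        ],
        "post_process": [{"name": "svd", "config": {"rank": 2}}],
    }
    pipeline = pipeline_from_config(combined_config)
    X = pipeline.fit_transform(df)
    # If the stages compose as assumed, the SVD rank sets the final width.
    assert X.shape == (df.shape[0], 2)

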
def test_gaussian_mixture_transformer():
    df, y = get_iris_dataframe()
    config = {
        "post_process": [{"name": "gmm", "config": {"clusters": 50}}]
    }
    pipeline = pipeline_from_config(config)
    X = pipeline.fit_transform(df)
    assert X.shape[1] == 50
    assert X.shape[0] == df.shape[0]


def test_kernel_pca():
    df, y = get_iris_dataframe()
    for kernel in ["linear", "poly", "rbf", "sigmoid", "cosine"]:
        config = {
            "post_process": [
                {"name": "kpca", "config": {"n_components": 10, "kernel": kernel}}
            ]
        }
        pipeline = pipeline_from_config(config)
        X = pipeline.fit_transform(df)
        assert X.shape[1] == 10
        assert X.shape[0] == df.shape[0]


def test_lda_convenience_transform():
    records = 5000
    df = artificial_text(records)
    lda_config = {
        "transforms": [
            {
                "type": "featurizer",
                "field": ["text"],
                "transforms": [{"name": "lda", "config": {"rank": 50}}],
            }
        ],
    }
    lda_pipeline = pipeline_from_config(lda_config)
    X = lda_pipeline.fit_transform(df)
    assert X.shape[0] == df.shape[0]
    assert X.shape[1] == 50


def test_text_handling():
    records = 5000
    df = artificial_text(records)
    tfidf_config = {
        "transforms": [
            {
                "type": "featurizer",
                "field": ["text"],
                "transforms": [{"name": "tfidf", "config": {"max_features": 5000}}],
            }
        ]
    }
    tokenizer_config = {
        "transforms": [
            {
                "type": "featurizer",
                "field": ["text"],
                "transforms": [{"name": "tokenizer", "config": {"max_features": 5000}}],
            }
        ]
    }
    hashing_config = {
        "transforms": [
            {
                "type": "featurizer",
                "field": ["text"],
                "transforms": [{"name": "hashing", "config": {"n_features": 5000}}],
            }
        ]
    }
    for config in [tfidf_config, tokenizer_config, hashing_config]:
        pipeline = pipeline_from_config(config)
        X = pipeline.fit_transform(df)
        assert X.shape[0] == df.shape[0]
        assert X.shape[0] == records
        assert X.shape[1] == 5000
        assert X.sum() > 0


def test_standard_numeric_transformer():
    df, y = get_iris_dataframe()
    config = {
        "transforms": [
            {
                "type": "featurizer",
                "transforms": [{"name": "standard_numeric"}],
                "field": iris_predictive_columns,
            }
        ]
    }
    pipeline = pipeline_from_config(config)
    X = pipeline.fit_transform(df)
    assert X.shape[1] == df.shape[1]
    assert X.shape[0] == df.shape[0]
    # Each standardized column should have mean close to 0 and std close to 1.
    for i in range(X.shape[1]):
        m = X[:, i].mean()
        s = X[:, i].std()
        assert abs(m) < 0.1      # close to 0
        assert abs(s - 1) < 0.1  # close to 1