def test_feature_init_invalid_transformer_api(inputs):
    input, transformer = inputs

    # an object without a fit/transform API is rejected
    with pytest.raises(ValueError):
        Feature(input, object())

    # a transformer class (rather than an instance) is also rejected
    with pytest.raises(ValueError):
        Feature(input, IdentityTransformer)

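# The `inputs` fixture is not shown in this section; a minimal sketch of what
# tests like the one above appear to assume, using this module's existing
# imports (the column name 'size' is illustrative, not the original value):
@pytest.fixture
def inputs():
    return 'size', IdentityTransformer()
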
def test_discover(sample_data, expensive_stats):
    features = [
        Feature(
            'size', NullFiller(0),
            source='foo.features.contrib.user_a.feature_1'),
        Feature(
            'strength', NullFiller(100),
            source='foo.features.contrib.user_b.feature_1'),
    ]
    X_df, y_df = sample_data.X, sample_data.y
    y = np.asfarray(y_df)
    df = discover(features, X_df, y_df, y, expensive_stats=expensive_stats)

    expected_cols = {
        'name', 'description', 'input', 'transformer', 'primitives',
        'output', 'author', 'source', 'mutual_information',
        'conditional_mutual_information', 'ninputs', 'nvalues',
        'ncontinuous', 'ndiscrete', 'mean', 'std', 'variance', 'min',
        'median', 'max', 'nunique',
    }
    actual_cols = df.columns
    assert not expected_cols.symmetric_difference(actual_cols)
    assert df.shape[0] == len(features)

    # test filtering by input
    input = 'size'
    discovery_df = discover(features, X_df, y_df, y, input=input)
    assert discovery_df.shape[0] == len([
        feature
        for feature in features
        if feature.input == input or input in feature.input
    ])

    # test behavior when no data is available; have to clear the cache first,
    # as the values on this data are already known
    ballet.discovery._summarize_feature.memory.clear()
    discovery_df = discover(features, None, None, None)
    assert discovery_df.shape[0] == len(features)
    actual_cols = discovery_df.columns
    assert not expected_cols.symmetric_difference(actual_cols)
    assert np.isnan(discovery_df['mean'].at[0])

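# In this module, `sample_data` exposes `.X` and `.y` attributes. A rough
# sketch of the fixture these discovery/feature-API tests assume: the column
# names 'size' and 'strength' come from the tests above, while the values --
# including the NaNs that test_producing_missing_values_fails relies on --
# are illustrative:
from types import SimpleNamespace

@pytest.fixture
def sample_data():
    X = pd.DataFrame({
        'size': [1.0, np.nan, 3.0, 4.0],
        'strength': [10.0, 20.0, np.nan, 40.0],
    })
    y = pd.Series([0.0, 1.0, 0.0, 1.0], name='target')
    return SimpleNamespace(X=X, y=y)
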
def test_init(self):
    feature_1 = Feature(
        input='A_0', transformer=IdentityTransformer(),
        source='1st Feature')
    feature_2 = Feature(
        input='Z_0', transformer=IdentityTransformer(),
        source='2nd Feature')
    features = [feature_1]
    candidate_feature = feature_2
    accepter = GFSSFAccepter(self.X, self.y, features, candidate_feature)
    self.assertIsNotNone(accepter)

def test_can_deepcopy():
    # see GH 90
    feature = Feature('size', IdentityTransformer())
    pipeline = FeatureEngineeringPipeline(feature)
    assert hasattr(pipeline, '_ballet_features')

    pipeline2 = deepcopy(pipeline)
    assert hasattr(pipeline2, '_ballet_features')

def test_producing_missing_values_fails(self):
    assert has_nans(self.X)
    feature = Feature(input='size', transformer=IdentityTransformer())
    valid, failures = check_from_class(
        FeatureApiCheck, feature, self.X, self.y)
    self.assertFalse(valid)
    self.assertIn(NoMissingValuesCheck.__name__, failures)

def test_feature_pipeline(inputs):
    input, transformer = inputs
    feature = Feature(input, transformer)
    pipeline = feature.pipeline
    assert isinstance(pipeline, FeatureEngineeringPipeline)

    # the pipeline property is cached: repeated access returns the same object
    pipeline2 = feature.pipeline
    assert pipeline is pipeline2

def test_transform(input, transformer):
    feature = Feature(input, transformer)
    mapper = FeatureEngineeringPipeline(feature)
    df = pd.util.testing.makeCustomDataframe(5, 2)
    df.columns = ['foo', 'bar']
    mapper.fit(df)
    X = mapper.transform(df)
    assert np.shape(X) == (5, 2)

def test_transform(self):
    feature = Feature(self.input, self.transformer)
    mapper = FeatureEngineeringPipeline(feature)
    df = pd.util.testing.makeCustomDataframe(5, 2)
    df.columns = ['foo', 'bar']
    mapper.fit(df)
    X = mapper.transform(df)
    self.assertEqual(np.shape(X), (5, 1))

def test_gfssf_pruner_keep_relevant(sample_data):
    X_df, y_df, y = sample_data
    feature_1 = Feature(
        input='A_0', transformer=IdentityTransformer(),
        source='1st Feature')
    feature_2 = Feature(
        input='Z_0', transformer=IdentityTransformer(),
        source='2nd Feature')

    gfssf_pruner = GFSSFPruner(
        X_df, y_df, X_df, y, [feature_1], feature_2)

    redundant_features = gfssf_pruner.prune()
    assert feature_1 not in redundant_features, \
        'Still-relevant features should not be pruned'

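# In the pruner/accepter tests, `sample_data` instead unpacks to
# (X_df, y_df, y). A hedged sketch of that fixture: the column names 'A_0'
# and 'Z_0' come from the tests, while making A_0 informative and Z_0 pure
# noise is an assumption that matches how the tests use them:
@pytest.fixture
def sample_data():
    rng = np.random.default_rng(8)
    a = rng.normal(size=50)
    X_df = pd.DataFrame({'A_0': a, 'Z_0': rng.normal(size=50)})
    y_df = pd.DataFrame({'target': a + rng.normal(scale=0.1, size=50)})
    return X_df, y_df, asarray2d(y_df)
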
def test_gfssf_pruner_prune_exact_replicas(sample_data):
    X_df, y_df, y = sample_data
    feature_1 = Feature(
        input='A_0', transformer=IdentityTransformer(),
        source='1st Feature')
    feature_2 = Feature(
        input='A_0', transformer=IdentityTransformer(),
        source='2nd Feature')

    gfssf_pruner = GFSSFPruner(
        X_df, y_df, X_df, y, [feature_1], feature_2)

    redundant_features = gfssf_pruner.prune()
    assert feature_1 in redundant_features, \
        'Exact replica features should be pruned'

def test_bad_feature_transform_errors(self):
    # transformer throws errors
    feature = Feature(
        input='size',
        transformer=FragileTransformer(
            (lambda x: True, ), (RuntimeError, )))
    valid, failures = check_from_class(
        FeatureApiCheck, feature, self.X, self.y)
    self.assertFalse(valid)
    self.assertIn(CanTransformCheck.__name__, failures)

def test_mutual_information_accepter_nans(handle_nan_targets, expected):
    X_df = pd.DataFrame({'A': [1, 2, 3]})
    y = np.array([np.nan, 2, 3]).reshape(-1, 1)
    feature = Feature(
        input='A', transformer=IdentityTransformer())
    accepter = MutualInformationAccepter(
        X_df, y, X_df, y, [], feature,
        handle_nan_targets=handle_nan_targets)

    actual = accepter.judge()

    assert expected == actual

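# The test above is presumably parametrized over NaN-handling strategies; a
# hedged sketch of the decorator it assumes. The option names ('fail',
# 'ignore') and the expected outcomes are assumptions, not confirmed ballet
# API:
nan_target_cases = pytest.mark.parametrize(
    'handle_nan_targets,expected',
    [
        ('fail', False),   # assumed: refuse to accept when the target has NaNs
        ('ignore', True),  # assumed: drop NaN rows, then judge on the rest
    ],
)
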
def test_gfssf_accepter_init(sample_data):
    X_df, y_df, y = sample_data
    feature_1 = Feature(
        input='A_0', transformer=IdentityTransformer(),
        source='1st Feature')
    feature_2 = Feature(
        input='Z_0', transformer=IdentityTransformer(),
        source='2nd Feature')
    features = [feature_1]
    candidate_feature = feature_2

    accepter = GFSSFAccepter(
        X_df, y_df, X_df, y, features, candidate_feature)

    assert accepter is not None

def test_bad_feature_input(sample_data):
    # bad input
    feature = Feature(
        input=3,
        transformer=SimpleImputer(),
    )
    valid, failures, advice = check_from_class(
        FeatureApiCheck, feature, sample_data.X, sample_data.y)
    assert not valid
    assert HasCorrectInputTypeCheck.__name__ in failures

def test_good_feature(sample_data):
    feature = Feature(
        input='size',
        transformer=SimpleImputer(),
    )
    valid, failures, advice = check_from_class(
        FeatureApiCheck, feature, sample_data.X, sample_data.y)
    assert valid
    assert len(failures) == 0

def test_good_feature(self):
    feature = Feature(
        input='size',
        transformer=SimpleImputer(),
    )
    valid, failures = check_from_class(
        FeatureApiCheck, feature, self.X, self.y)
    self.assertTrue(valid)
    self.assertEqual(len(failures), 0)

def test_producing_missing_values_fails(sample_data):
    assert has_nans(sample_data.X)
    feature = Feature(
        input='size', transformer=IdentityTransformer()
    )
    valid, failures, advice = check_from_class(
        FeatureApiCheck, feature, sample_data.X, sample_data.y)
    assert not valid
    assert NoMissingValuesCheck.__name__ in failures

def test_bad_feature_input(self):
    # bad input
    feature = Feature(
        input=3,
        transformer=SimpleImputer(),
    )
    valid, failures = check_from_class(
        FeatureApiCheck, feature, self.X, self.y)
    self.assertFalse(valid)
    self.assertIn(HasCorrectInputTypeCheck.__name__, failures)

def test_bad_feature_transform_errors(sample_data):
    # transformer throws errors
    feature = Feature(
        input='size',
        transformer=FragileTransformer(
            (lambda x: True, ), (RuntimeError, ))
    )
    valid, failures, advice = check_from_class(
        FeatureApiCheck, feature, sample_data.X, sample_data.y)
    assert not valid
    assert CanTransformCheck.__name__ in failures

def test_discover_feature_error(sample_data):
    features = [
        Feature('size', FragileTransformer()),
    ]
    X_df, y_df = sample_data.X, sample_data.y
    y = np.asfarray(y_df)
    discovery_df = discover(features, X_df, y_df, y)

    assert discovery_df.shape[0] == len(features)
    assert np.isnan(discovery_df['mean'].at[0])

def test_robust_transformer_desugar():
    """Should be able to "desugar" multiple things into a valid transformer pipeline"""  # noqa
    transformer = [
        None,
        IdentityTransformer(),
        lambda x: x,
        Feature('A', IdentityTransformer()),
        ('A', IdentityTransformer()),
        ('A', [None, IdentityTransformer()]),
    ]
    robust_transformer = make_robust_transformer(transformer)
    assert isinstance(robust_transformer, TransformerPipeline)

def test_df_colnames(input, transformer, output):
    feature = Feature(input, transformer, output=output)
    mapper = FeatureEngineeringPipeline(feature)
    entities_df = pd.util.testing.makeCustomDataframe(5, 2)
    entities_df.columns = ['foo', 'bar']
    feature_matrix = mapper.fit_transform(entities_df)
    feature_frame = pd.DataFrame(
        feature_matrix,
        columns=mapper.transformed_names_,
        index=entities_df.index,
    )
    assert fy.all(fy.isa(str), feature_frame.columns)

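# test_df_colnames presumably runs under a parametrization along these lines;
# the concrete cases below are illustrative sketches, not the originals:
colnames_cases = pytest.mark.parametrize(
    'input,transformer,output',
    [
        ('foo', IdentityTransformer(), 'foo_out'),
        (['foo', 'bar'], IdentityTransformer(), ['foo_out', 'bar_out']),
    ],
)
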
def test_variance_threshold_accepter(mock_var, sample_data):
    expected = False
    X_df, y_df, y = sample_data
    feature = Feature(
        input='A_0', transformer=IdentityTransformer(),
        source='1st Feature')
    accepter = VarianceThresholdAccepter(
        X_df, y_df, X_df, y, [], feature)

    actual = accepter.judge()

    assert expected == actual

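# `mock_var` above is presumably injected by a mock.patch decorator that this
# section does not capture, forcing the measured variance below the threshold
# so the accepter rejects. A hedged sketch of what it might look like (the
# patch target is a placeholder, not ballet's actual module path):
#
# @unittest.mock.patch('<variance computation inside the accepter>',
#                      return_value=0.0)
# def test_variance_threshold_accepter(mock_var, sample_data): ...
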
def test_bad_feature_deepcopy_fails(self):
    class _CopyFailsTransformer(IdentityTransformer):
        def __deepcopy__(self, memo):
            raise RuntimeError

    feature = Feature(
        input='size',
        transformer=_CopyFailsTransformer(),
    )
    valid, failures = check_from_class(
        FeatureApiCheck, feature, self.X, self.y)
    self.assertFalse(valid)
    self.assertIn(CanDeepcopyCheck.__name__, failures)

def test_bad_feature_deepcopy_fails(sample_data):
    class _CopyFailsTransformer(IdentityTransformer):
        def __deepcopy__(self, memo):
            raise RuntimeError

    feature = Feature(
        input='size',
        transformer=_CopyFailsTransformer(),
    )
    valid, failures, advice = check_from_class(
        FeatureApiCheck, feature, sample_data.X, sample_data.y)
    assert not valid
    assert CanDeepcopyCheck.__name__ in failures

def test_mutual_information_accepter(_, sample_data):
    expected = True
    X_df, y_df, y = sample_data
    feature = Feature(
        input='A_0', transformer=IdentityTransformer(),
        source='1st Feature')
    accepter = MutualInformationAccepter(
        X_df, y_df, X_df, y, [], feature)

    actual = accepter.judge()

    assert expected == actual

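# As with test_variance_threshold_accepter, the unused `_` argument above
# suggests a patch decorator not captured here, presumably forcing the mutual
# information estimate high enough to accept; a hedged sketch (the target is
# a placeholder):
#
# @unittest.mock.patch('<mutual information estimator used by the accepter>',
#                      return_value=1.0)
# def test_mutual_information_accepter(_, sample_data): ...
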
def test_gfssf_pruner_prune_weak_replicas(sample_data):
    X_df, y_df, y = sample_data

    def add_noise(X):
        X = asarray2d(X)
        return X + np.random.normal(0, 0.5, X.shape)

    noise_transformer = SimpleFunctionTransformer(add_noise)
    feature_weak = Feature(
        input='A_0', transformer=noise_transformer,
        source='1st Feature')
    feature_strong = Feature(
        input='A_0', transformer=IdentityTransformer(),
        source='2nd Feature')

    gfssf_pruner = GFSSFPruner(
        X_df, y_df, X_df, y, [feature_weak], feature_strong)

    redundant_features = gfssf_pruner.prune()
    assert feature_weak in redundant_features, \
        'Noisy features should be pruned'

def test_discover_target_nans(sample_data):
    features = [
        Feature('size', NullFiller(0)),
    ]
    X_df, y_df = sample_data.X, sample_data.y
    y = np.asfarray(y_df)

    # introduce a nan into the target
    y[0] = np.nan

    discovery_df = discover(features, X_df, y_df, y)

    # stats that use the target should still be computed despite the nan
    assert not np.isnan(discovery_df['mutual_information']).any()

def test_variance_threshold_accepter_feature_group():
    expected = True  # variance is 0.25 per column, > 0.05 threshold
    X = pd.DataFrame(np.eye(2))
    y = None
    feature = Feature(
        input=[0, 1], transformer=IdentityTransformer(),
        source='1st Feature')
    accepter = VarianceThresholdAccepter(
        X, y, X, y, [], feature)

    actual = accepter.judge()

    assert expected == actual

def test_bad_feature_wrong_transform_length(sample_data):
    class _WrongLengthTransformer(BaseTransformer):
        # doesn't return the correct length
        def transform(self, X, **transform_kwargs):
            new_shape = list(X.shape)
            new_shape[0] += 1
            output = np.arange(np.prod(new_shape)).reshape(new_shape)
            return output

    feature = Feature(
        input='size',
        transformer=_WrongLengthTransformer(),
    )
    valid, failures, advice = check_from_class(
        FeatureApiCheck, feature, sample_data.X, sample_data.y)
    assert not valid
    assert HasCorrectOutputDimensionsCheck.__name__ in failures