def test_robust_transformer_desugar():
    """Should be able to "desugar" multiple things into a valid transformer pipeline"""  # noqa
    # one example of every supported sugar form: None, a transformer
    # instance, a bare callable, a nested Feature, an (input, transformer)
    # tuple, and an (input, list-of-transformers) tuple
    sugar = [
        None,
        IdentityTransformer(),
        lambda x: x,
        Feature('A', IdentityTransformer()),
        ('A', IdentityTransformer()),
        ('A', [None, IdentityTransformer()]),
    ]
    result = make_robust_transformer(sugar)
    assert isinstance(result, TransformerPipeline)
def test_init(self):
    """Constructing a GFSSFAccepter with valid arguments should succeed."""
    existing = Feature(
        input='A_0', transformer=IdentityTransformer(), source='1st Feature')
    candidate = Feature(
        input='Z_0', transformer=IdentityTransformer(), source='2nd Feature')
    accepter = GFSSFAccepter(self.X, self.y, [existing], candidate)
    self.assertIsNotNone(accepter)
def test_producing_missing_values_fails(self):
    """A feature that passes NaNs through should fail NoMissingValuesCheck."""
    # sanity check: the fixture data really does contain NaNs
    assert has_nans(self.X)
    feature = Feature(input='size', transformer=IdentityTransformer())
    valid, failures = check_from_class(
        FeatureApiCheck, feature, self.X, self.y)
    self.assertFalse(valid)
    self.assertIn(NoMissingValuesCheck.__name__, failures)
def test_can_deepcopy():
    """Deep-copying a pipeline must preserve the _ballet_features attribute.

    Regression test — see GH 90.
    """
    pipeline = FeatureEngineeringPipeline(
        Feature('size', IdentityTransformer()))
    assert hasattr(pipeline, '_ballet_features')
    clone = deepcopy(pipeline)
    assert hasattr(clone, '_ballet_features')
def get_target_encoder():
    """Get encoder for the prediction target

    The target needs no encoding here, so a pass-through transformer is
    returned.

    Returns:
        transformer-like
    """
    return IdentityTransformer()
def test_gfssf_pruner_keep_relevant(sample_data):
    """GFSSF pruning must not discard a feature that is still relevant."""
    X_df, y_df, y = sample_data
    relevant = Feature(
        input='A_0', transformer=IdentityTransformer(), source='1st Feature')
    candidate = Feature(
        input='Z_0', transformer=IdentityTransformer(), source='2nd Feature')
    pruner = GFSSFPruner(X_df, y_df, X_df, y, [relevant], candidate)
    pruned = pruner.prune()
    # the candidate uses a different column, so the existing feature stays
    assert relevant not in pruned, \
        'Still relevant features should be pruned'
def test_gfssf_pruner_prune_exact_replicas(sample_data):
    """An existing feature identical to the candidate must be pruned."""
    X_df, y_df, y = sample_data
    original = Feature(
        input='A_0', transformer=IdentityTransformer(), source='1st Feature')
    replica = Feature(
        input='A_0', transformer=IdentityTransformer(), source='2nd Feature')
    pruner = GFSSFPruner(X_df, y_df, X_df, y, [original], replica)
    pruned = pruner.prune()
    assert original in pruned, \
        'Exact replica features should be pruned'
def test_mutual_information_accepter_nans(handle_nan_targets, expected):
    """The verdict on NaN targets follows the handle_nan_targets policy."""
    X_df = pd.DataFrame({'A': [1, 2, 3]})
    # a target column whose first entry is missing
    y = np.array([np.nan, 2, 3]).reshape(-1, 1)
    candidate = Feature(input='A', transformer=IdentityTransformer())
    accepter = MutualInformationAccepter(
        X_df, y, X_df, y, [], candidate,
        handle_nan_targets=handle_nan_targets)
    assert accepter.judge() == expected
def test_gfssf_accepter_init(sample_data):
    """Constructing a GFSSFAccepter with valid arguments should succeed."""
    X_df, y_df, y = sample_data
    existing = Feature(
        input='A_0', transformer=IdentityTransformer(), source='1st Feature')
    candidate = Feature(
        input='Z_0', transformer=IdentityTransformer(), source='2nd Feature')
    accepter = GFSSFAccepter(X_df, y_df, X_df, y, [existing], candidate)
    assert accepter is not None
def test_producing_missing_values_fails(sample_data):
    """A feature that passes NaNs through should fail NoMissingValuesCheck."""
    # sanity check: the fixture data really does contain NaNs
    assert has_nans(sample_data.X)
    feature = Feature(input='size', transformer=IdentityTransformer())
    valid, failures, advice = check_from_class(
        FeatureApiCheck, feature, sample_data.X, sample_data.y)
    assert not valid
    assert NoMissingValuesCheck.__name__ in failures
def __init__(self, nsteps, bad_input_checks, errors, shuffle=True, seed=1):
    """Build a pipeline of identity steps plus one fragile transformer.

    Args:
        nsteps: total number of steps in the pipeline
        bad_input_checks: predicates passed through to FragileTransformer
        errors: errors passed through to FragileTransformer
        shuffle: whether to shuffle the step order
        seed: seed for the (deterministic) shuffle
    """
    fragile = FragileTransformer(bad_input_checks, errors)
    steps = [
        (f'IdentityTransformer{i:02d}', IdentityTransformer())
        for i in range(nsteps - 1)
    ]
    steps.append((repr(fragile), fragile))
    if shuffle:
        # seeded RNG so the fragile step lands at a reproducible position
        rng = random.Random(seed)
        rng.shuffle(steps)
    super().__init__(steps)
def test_variance_threshold_accepter(mock_var, sample_data):
    """Accepter rejects a feature whose (mocked) variance is too low."""
    expected = False
    X_df, y_df, y = sample_data
    candidate = Feature(
        input='A_0', transformer=IdentityTransformer(), source='1st Feature')
    accepter = VarianceThresholdAccepter(X_df, y_df, X_df, y, [], candidate)
    verdict = accepter.judge()
    assert verdict == expected
def test_mutual_information_accepter(_, sample_data):
    """Accepter accepts a feature with (mocked) sufficient mutual information."""
    expected = True
    X_df, y_df, y = sample_data
    candidate = Feature(
        input='A_0', transformer=IdentityTransformer(), source='1st Feature')
    accepter = MutualInformationAccepter(X_df, y_df, X_df, y, [], candidate)
    verdict = accepter.judge()
    assert verdict == expected
def test_variance_threshold_accepter_feature_group():
    """A multi-column feature group is judged on per-column variance."""
    expected = True  # variance is 0.25 per column, > 0.05 threshold
    X = pd.DataFrame(np.eye(2))
    y = None
    group = Feature(
        input=[0, 1], transformer=IdentityTransformer(), source='1st Feature')
    accepter = VarianceThresholdAccepter(X, y, X, y, [], group)
    verdict = accepter.judge()
    assert verdict == expected
def test_gfssf_pruner_prune_weak_replicas(sample_data):
    """A noisy copy of a column is pruned once a clean copy is admitted."""
    X_df, y_df, y = sample_data

    def add_noise(X):
        X = asarray2d(X)
        return X + np.random.normal(0, 0.5, X.shape)

    weak = Feature(
        input='A_0',
        transformer=SimpleFunctionTransformer(add_noise),
        source='1st Feature')
    strong = Feature(
        input='A_0', transformer=IdentityTransformer(), source='2nd Feature')
    pruner = GFSSFPruner(X_df, y_df, X_df, y, [weak], strong)
    pruned = pruner.prune()
    assert weak in pruned, \
        'Noisy features should be pruned'
def test_compound_accepter(sample_data):
    """An 'all' aggregation rejects when one sub-accepter always rejects."""
    expected = False
    X_df, y_df, y = sample_data
    agg = 'all'
    # one accepter that always accepts and one that accepts with p=0,
    # i.e. never — under 'all' the compound verdict must be False
    specs = [
        'ballet.validation.feature_acceptance.validator.AlwaysAccepter',
        {
            'name': 'ballet.validation.feature_acceptance.validator.RandomAccepter',  # noqa
            'params': {'p': 0.00},
        },
    ]
    candidate = Feature(
        input='A_0', transformer=IdentityTransformer(), source='1st Feature')
    accepter = CompoundAccepter(
        X_df, y_df, X_df, y, [], candidate, agg=agg, specs=specs)
    verdict = accepter.judge()
    assert verdict == expected
def test_validation_end_to_end(quickstart):
    """Exercise the full CI validation flow on a quickstarted project.

    Builds features, swaps in a mock regression dataset, then simulates
    Travis CI runs for feature branches: submit, validate, merge, and
    check that redundant features are rejected.
    """
    project = quickstart.project
    slug = quickstart.package_slug
    base = project.path
    repo = quickstart.repo
    pkg = project.package
    assert isinstance(pkg, ModuleType)
    api = project.api
    assert isinstance(api, FeatureEngineeringProject)

    # no features at first
    features = api.features
    assert len(features) == 0

    # first providing a mock feature, call build
    mock_features = [Feature(input='A_1', transformer=IdentityTransformer())]
    with patch.object(api, 'collect', return_value=mock_features):
        X_df = pd.util.testing.makeCustomDataframe(5, 2)
        X_df.columns = ['A_0', 'A_1']
        result = api.engineer_features(X_df=X_df, y_df=[])
        # one mock feature on one column -> one output column
        assert np.shape(result.X) == (5, 1)
        assert isinstance(result.pipeline, FeatureEngineeringPipeline)

    # splice in a new version of foo.load_data.load_data
    # 1. 'src' needs to be hardcoded
    # 2. really bad - set load_data = load_regression_data which does not
    #    have the same args
    new_load_data_str = get_source(load_regression_data)
    p = base.joinpath('src', slug, 'load_data.py')
    with p.open('w') as f:
        f.write(new_load_data_str)
        f.write('\n')
        f.write('load_data=load_regression_data\n')

    # commit changes
    repo.index.add([str(p)])
    repo.index.commit('Load mock regression dataset')

    # call different validation routines
    def call_validate_all(ref=None):
        """Validate branch as if we were running on CI"""
        # fake the Travis environment variables that ballet's CI detection
        # reads; ref=None simulates a push build, otherwise a PR build
        envvars = {
            'TRAVIS_BUILD_DIR': repo.working_tree_dir,
        }
        if ref is None:
            envvars['TRAVIS_PULL_REQUEST'] = 'false'
            envvars['TRAVIS_COMMIT_RANGE'] = make_commit_range(
                repo.commit('HEAD@{-1}').hexsha, repo.commit('HEAD').hexsha)
            envvars['TRAVIS_PULL_REQUEST_BRANCH'] = ''
            envvars['TRAVIS_BRANCH'] = repo.heads.master.name
        else:
            # TODO is this okay for testing?
            envvars['TRAVIS_PULL_REQUEST'] = str(1)
            envvars['TRAVIS_COMMIT_RANGE'] = make_commit_range(
                repo.heads.master.name, repo.commit(ref).hexsha)
        with patch.dict(os.environ, envvars):
            check_call(
                shlex.split('ballet validate -A'), cwd=base, env=os.environ)

    call_validate_all()

    # branch and write a new feature
    contrib_dir = base.joinpath('src', slug, 'features', 'contrib')
    ref = 'bob/feature-a'
    logger.info(f'Switching to branch {ref}, User Bob, Feature A')
    switch_to_new_branch(repo, ref)
    new_feature_str = make_feature_str('A_0')
    username = '******'
    featurename = 'A_0'
    submit_feature(repo, contrib_dir, username, featurename, new_feature_str)

    # call different validation routines
    logger.info('Validating User Bob, Feature A')
    call_validate_all(ref=ref)

    # merge branch with master
    logger.info('Merging into master')
    repo.git.checkout('master')
    repo.git.merge(ref, no_ff=True)

    # call different validation routines
    logger.info('Validating after merge')
    call_validate_all()

    # write another new feature
    ref = 'charlie/feature-z1'
    logger.info('Switching to branch ref, User Charlie, Feature Z_1')
    switch_to_new_branch(repo, ref)
    new_feature_str = make_feature_str('Z_1')
    username = '******'
    featurename = 'Z_1'
    submit_feature(repo, contrib_dir, username, featurename, new_feature_str)

    # TODO we expect this feature to fail but it passes
    # the `if False` keeps the expected-failure context disabled for now
    cm = pytest.raises(CalledProcessError) if False else nullcontext()
    with cm:
        logger.info('Validating User Charlie, Feature Z_1')
        call_validate_all(ref=ref)

    # write another new feature - redundancy: resubmitting A_0 must be
    # rejected because an identical feature is already merged
    ref = 'charlie/feature-a0'
    repo.git.checkout('master')
    switch_to_new_branch(repo, ref)
    new_feature_str = make_feature_str('A_0')
    username = '******'
    featurename = 'A_0'
    submit_feature(repo, contrib_dir, username, featurename, new_feature_str)
    with pytest.raises(CalledProcessError):
        call_validate_all(ref=ref)
# Reusable parametrize decorator covering the supported ways to declare a
# feature's input: a concrete list of column names, or a callable that
# produces such a list from the dataframe.
with_input = pytest.mark.parametrize(
    'input',
    [
        ['foo', 'bar'],
        lambda df: ['foo', 'bar'],
    ],
    ids=[
        'list of string',
        'callable to list of string'
    ]
)

# Reusable parametrize decorator covering the supported transformer
# "sugar" forms: a single transformer, lists (possibly mixed with None and
# callables), and nested Feature objects.
with_transformer = pytest.mark.parametrize(
    'transformer',
    [
        IdentityTransformer(),
        [IdentityTransformer()],
        [None, IdentityTransformer(), lambda x: x],
        Feature(['foo', 'bar'], IdentityTransformer()),
        [None, IdentityTransformer(), Feature(
            ['foo', 'bar'], IdentityTransformer())],
    ],
    ids=[
        'scalar',
        'list of transformer',
        'list of mixed',
        'nested feature',
        'list of mixed and nested features',
    ]
)
def inputs(request):
    """Fixture pairing each parametrized input with an identity transformer."""
    return request.param, IdentityTransformer()
# str()/repr() should be non-empty whether the transformer is wrapped
# directly or desugared via make_robust_transformer
@pytest.mark.parametrize('robust_maker', [
    DelegatingRobustTransformer,
    lambda x: make_robust_transformer([x]),
])
def test_robust_str_repr(robust_maker):
    """A robust transformer has non-empty str() and repr() forms."""
    robust_transformer = robust_maker(IdentityTransformer())
    for func in [str, repr]:
        s = func(robust_transformer)
        assert len(s) > 0


@pytest.mark.parametrize(
    'transformer,expected',
    [(
        IdentityTransformer(),
        ['IdentityTransformer'],
    ), (
        [IdentityTransformer(), IdentityTransformer()],
        ['IdentityTransformer', 'IdentityTransformer'],
    )])
def test_get_transformer_primitives(transformer, expected):
    """The class names of the underlying transformer primitives are reported."""
    robust_transformer = make_robust_transformer(transformer)
    primitives = get_transformer_primitives(robust_transformer)
    assert primitives == expected


def test_robust_transformer_desugar():
    """Should be able to "desugar" multiple things into a valid transformer pipeline"""  # noqa
    # NOTE(review): definition continues beyond this chunk boundary
    transformer = [
def test_robust_str_repr(robust_maker):
    """A robust transformer has non-empty str() and repr() forms."""
    robust = robust_maker(IdentityTransformer())
    for render in (str, repr):
        assert len(render(robust)) > 0
def test_validation_end_to_end(quickstart):
    """Exercise the full CI validation flow on a quickstarted project.

    Builds features, swaps in a mock regression dataset, then simulates
    Travis CI runs for fake pull requests: submit, validate, merge, and
    check that failing/redundant features are rejected.
    """
    project = quickstart.project
    modname = 'foo'
    base = project.path
    repo = project.repo

    def _import(modname):
        # import a project module by resolving its path inside the
        # quickstarted project tree
        relpath = modname_to_relpath(
            modname, project_root=base, add_init=False)
        abspath = base.joinpath(relpath)
        return import_module_at_path(modname, abspath)

    foo = _import('foo')
    assert isinstance(foo, ModuleType)
    foo_features = _import('foo.features')
    assert isinstance(foo_features, ModuleType)
    collect_contrib_features = foo_features.collect_contrib_features
    features = collect_contrib_features()
    assert len(features) == 0

    # first providing a mock feature, call build
    with patch.object(
        foo_features, 'collect_contrib_features',
        return_value=[Feature(input='A_1', transformer=IdentityTransformer())]
    ):
        X_df = pd.util.testing.makeCustomDataframe(5, 2)
        X_df.columns = ['A_0', 'A_1']
        out = foo_features.build(X_df=X_df, y_df=[])
        # one mock feature on one column -> one output column
        assert np.shape(out.X) == (5, 1)
        assert isinstance(out.mapper_X, FeatureEngineeringPipeline)

    # write a new version of foo.load_data.load_data
    new_load_data_str = get_source(load_regression_data)
    p = base.joinpath(modname, 'load_data.py')
    with p.open('w') as f:
        f.write(new_load_data_str)

    # commit changes
    repo.index.add([str(p)])
    repo.index.commit('Load mock regression dataset')

    # call different validation routines
    def call_validate_all(pr=None):
        # fake the Travis environment variables that ballet's CI detection
        # reads; pr=None simulates a push build, otherwise a PR build
        envvars = {
            'TRAVIS_BUILD_DIR': repo.working_tree_dir,
        }
        if pr is None:
            envvars['TRAVIS_PULL_REQUEST'] = 'false'
            envvars['TRAVIS_COMMIT_RANGE'] = make_commit_range(
                repo.commit('HEAD@{-1}').hexsha, repo.commit('HEAD').hexsha)
            envvars['TRAVIS_PULL_REQUEST_BRANCH'] = ''
            envvars['TRAVIS_BRANCH'] = repo.heads.master.name
        else:
            envvars['TRAVIS_PULL_REQUEST'] = str(pr)
            envvars['TRAVIS_COMMIT_RANGE'] = make_commit_range(
                repo.heads.master.name,
                repo.commit('pull/{pr}'.format(pr=pr)).hexsha)
        with patch.dict(os.environ, envvars):
            cmd = 'ballet validate -A'
            check_call(cmd, cwd=safepath(base), env=os.environ)

    call_validate_all()

    # branch to a fake PR and write a new feature
    contrib_dir = base.joinpath(modname, 'features', 'contrib')
    logger.info('Switching to pull request 1, User Bob, Feature A')
    switch_to_new_branch(repo, 'pull/1')
    new_feature_str = make_feature_str('A_0')
    username = '******'
    featurename = 'A_0'
    submit_feature(repo, contrib_dir, username, featurename, new_feature_str)

    # call different validation routines
    logger.info('Validating pull request 1, User Bob, Feature A')
    call_validate_all(pr=1)

    # merge PR with master
    logger.info('Merging into master')
    repo.git.checkout('master')
    repo.git.merge('pull/1', no_ff=True)

    # call different validation routines
    logger.info('Validating after merge')
    call_validate_all()

    # write another new feature
    logger.info('Switching to pull request 2, User Charlie, Feature Z_1')
    switch_to_new_branch(repo, 'pull/2')
    new_feature_str = make_feature_str('Z_1')
    username = '******'
    featurename = 'Z_1'
    submit_feature(repo, contrib_dir, username, featurename, new_feature_str)

    # if we expect this feature to fail
    with pytest.raises(CalledProcessError):
        logger.info('Validating pull request 2, User Charlie, Feature Z_1')
        call_validate_all(pr=2)

    # write another new feature - redundancy: resubmitting A_0 must be
    # rejected because an identical feature is already merged
    repo.git.checkout('master')
    switch_to_new_branch(repo, 'pull/3')
    new_feature_str = make_feature_str('A_0')
    username = '******'
    featurename = 'A_0'
    submit_feature(repo, contrib_dir, username, featurename, new_feature_str)
    with pytest.raises(CalledProcessError):
        call_validate_all(pr=3)
def setUp(self):
    """Provide a default column name and no-op transformer for each test."""
    self.input, self.transformer = 'foo', IdentityTransformer()