def validate(self):
    """Collect and validate all new features"""
    changes = self.change_collector.collect_changes()

    features = []
    imported_okay = True
    for importer, modname, modpath in changes.new_feature_info:
        try:
            mod = importer()
            features.extend(_collect_contrib_features(mod))
        except (ImportError, SyntaxError):
            logger.info(f'Failed to import module at {modpath}')
            logger.exception('Exception details: ')
            imported_okay = False

    if not imported_okay:
        return False

    # if no features were added at all, reject
    if not features:
        logger.info('Failed to collect any new features.')
        return False

    return all(
        validate_feature_api(feature, self.X_df, self.y, False)
        for feature in features
    )
def submit(self):
    user, feature = self.feature_queue.pop(0)
    logger.info(
        "Submitting: User {user:02d}, Feature {feature:02d}".format(
            user=user, feature=feature))
    submit_to_github(user, feature, str(self.feature_path), False, None,
                     True, True)
def _prune_existing_features(project: Project,
                             force: bool = False) -> List[Feature]:
    """Prune existing features"""
    if not force and not project.on_master:
        raise SkippedValidationTest('Not on master')

    try:
        # if on master but not after merge, then we diff master with itself
        # and collect no features.
        proposed_feature = get_proposed_feature(project)
    except NoFeaturesCollectedError:
        raise SkippedValidationTest('No features collected')

    X_df, y_df = project.api.load_data()
    X_df_val, y_df_val = _load_validation_data(project)
    encoder = project.api.encoder
    y_val = encoder.fit(y_df).transform(y_df_val)
    features = project.api.features
    accepted_features = get_accepted_features(features, proposed_feature)
    pruner_class = _load_validator_class_params(
        project, 'validation.feature_pruner')
    pruner = pruner_class(X_df, y_df, X_df_val, y_val, accepted_features,
                          proposed_feature)
    redundant_features = pruner.prune()

    # "propose removal"
    for feature in redundant_features:
        logger.info(PRUNER_MESSAGE + feature.source)

    return redundant_features
def _log_recommended_reinstall():
    logger.info(
        'After a successful project template update, try re-installing the\n'
        'project in case the project template requires any different\n'
        'dependencies than what you have installed:\n'
        '\n'
        '    $ invoke install')
def judge(self):
    logger.info(f'Judging feature using {self}')
    outcomes = {
        accepter.__class__.__name__: accepter.judge()
        for accepter in self.accepters
    }
    logger.debug(f'Got outcomes {outcomes!r} from underlying accepters')
    return self.agg(outcomes.values())
def judge(self):
    logger.info(f'Judging feature using {self}')
    z = (self.candidate_feature.as_feature_engineering_pipeline()
         .fit(self.X_df, y=self.y_df)
         .transform(self.X_df_val))
    var = np.var(z, axis=0)
    delta = var - self.threshold
    outcome = np.all(delta > 0)
    logger.info(
        f'Feature variance is {var} vs. threshold {self.threshold} '
        f'({delta} above threshold)')
    return outcome
def validate_feature_api(feature, X, y, subsample=False):
    logger.debug('Validating feature {feature!r}'.format(feature=feature))
    if subsample:
        X, y = subsample_data_for_validation(X, y)
    valid, failures = check_from_class(FeatureApiCheck, feature, X, y)
    if valid:
        logger.info('Feature is valid')
    else:
        logger.info(
            'Feature is NOT valid; failures were {failures}'
            .format(failures=failures))
    return valid
def judge(self):
    logger.info(f'Judging feature using {self}')
    z = (self.candidate_feature.as_feature_engineering_pipeline()
         .fit(self.X_df, y=self.y_df)
         .transform(self.X_df_val))
    y = self.y_val
    z, y = asarray2d(z), asarray2d(y)
    z, y = self._handle_nans(z, y)
    if z is None and y is None:
        # nans were found and handle_nan_targets == 'fail'
        return False
    mi = estimate_mutual_information(z, y)
    delta = mi - self.threshold
    outcome = delta > 0
    logger.info(f'Mutual information with target I(Z;Y) is {mi} vs. '
                f'threshold {self.threshold} ({delta} above threshold)')
    return outcome
def fit(self, X, y, tune=True, **fit_kwargs):
    if tune:
        # do some tuning
        if btb is not None and self.tunables is not None:
            scorer = None

            def score(estimator):
                scores = cross_val_score(
                    estimator, X, y, scoring=scorer, cv=self.tuning_cv,
                    fit_params=fit_kwargs)
                return np.mean(scores)

            logger.info('Tuning model using BTB GP tuner...')
            tuner = btb.tuning.gp.GP(self.tunables)
            estimator = self._get_parent_instance()
            original_score = score(estimator)

            # TODO: this leads to an error because default value of
            # max_depth for RF is `None`
            # params = funcy.project(
            #     estimator.get_params(), [t[0] for t in self.tunables])
            # tuner.add(params, original_score)

            for i in range(self.tuning_iter):
                params = tuner.propose()
                estimator.set_params(**params)
                score_ = score(estimator)
                logger.debug('Iteration {}, params {}, score {}'.format(
                    i, params, score_))
                tuner.add(params, score_)

            best_params = tuner._best_hyperparams
            best_score = tuner._best_score
            self.set_params(**best_params)
            logger.info(
                'Tuning complete. '
                'Cross val score changed from {:0.3f} to {:0.3f}.'.format(
                    original_score, best_score))
        else:
            logger.warning('Tuning requested, but either btb not '
                           'installed or tunable HyperParameters not '
                           'specified.')

    return super().fit(X, y, **fit_kwargs)
def configure_logging(output_dir):
    logger.setLevel(logging.DEBUG)

    handler = logging.FileHandler(output_dir.joinpath("info.log"))
    formatter = logging.Formatter(SIMPLE_LOG_FORMAT)
    handler.setFormatter(formatter)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    handler = logging.FileHandler(output_dir.joinpath("debug.log"))
    handler.setFormatter(formatter)
    handler.setLevel(logging.DEBUG)
    handler.addFilter(LevelFilter(logging.DEBUG))
    logger.addHandler(handler)

    logger.info("***BEGIN NEW SIMULATION SESSION***.")
    logger.debug("***BEGIN NEW SIMULATION SESSION***.")
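# `LevelFilter` is referenced above but not defined in this listing. A
# minimal sketch of what it presumably looks like (an assumption, not the
# project's actual implementation): a logging.Filter that passes only
# records at exactly the configured level, so debug.log captures DEBUG
# records without also duplicating everything at INFO and above.
import logging


class LevelFilter(logging.Filter):

    def __init__(self, level):
        super().__init__()
        self.level = level

    def filter(self, record):
        # keep only records emitted at exactly `self.level`
        return record.levelno == self.level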
def judge(self):
    logger.info('Judging Feature using {}'.format(self))

    feature_dfs_by_src = {}
    for feature in [self.candidate_feature] + self.features:
        feature_df = (
            feature.as_feature_engineering_pipeline().fit_transform(
                self.X_df, self.y))
        feature_dfs_by_src[feature.source] = feature_df

    candidate_source = self.candidate_feature.source
    candidate_df = feature_dfs_by_src[candidate_source]
    # use the candidate's frame, not the stale loop variable from the
    # last iteration above
    n_samples, n_candidate_cols = candidate_df.shape

    lmbda_1, lmbda_2 = _compute_lmbdas(self.lmbda_1, self.lmbda_2,
                                       feature_dfs_by_src)

    logger.info('Candidate Feature Shape: {}'.format(candidate_df.shape))
    omit_in_test = [''] + [f.source for f in self.features]
    for omit in omit_in_test:
        logger.debug(
            'Testing with omitted feature: {}'.format(omit or 'None'))
        z = _concat_datasets(feature_dfs_by_src, n_samples,
                             [candidate_source, omit])
        logger.debug('Calculating CMI of candidate feature:')
        cmi = estimate_conditional_information(candidate_df, self.y, z)
        logger.debug(
            'Conditional Mutual Information Score: {}'.format(cmi))

        cmi_omit = 0
        n_omit_cols = 0
        if omit:
            omit_df = feature_dfs_by_src[omit]
            _, n_omit_cols = omit_df.shape
            logger.debug('Calculating CMI of omitted feature:')
            cmi_omit = estimate_conditional_information(omit_df, self.y, z)
            logger.debug('Omitted CMI Score: {}'.format(cmi_omit))
            logger.debug('Omitted Feature Shape: {}'.format(omit_df.shape))

        statistic = cmi - cmi_omit
        threshold = _compute_threshold(lmbda_1, lmbda_2, n_candidate_cols,
                                       n_omit_cols)
        logger.debug('Calculated Threshold: {}'.format(threshold))

        if statistic >= threshold:
            logger.debug('Succeeded while omitting feature: {}'.format(
                omit or 'None'))
            return True

    return False
def prune(self):
    """Prune using GFSSF

    Uses lines 12-13 of agGFSSF
    """
    if np.isnan(self.y_val).any():
        raise ValueError(
            f'{self.__class__.__name__} does not support missing targets,'
            ' please use a different evaluator.')

    logger.info(f'Pruning features using {self}')

    feature_df_map = self._get_feature_df_map()
    lmbda_1, lmbda_2 = _compute_lmbdas(self.lmbda_1, self.lmbda_2,
                                       feature_df_map)
    logger.info(f'Recomputed lambda_1={lmbda_1}, lambda_2={lmbda_2}')

    redundant_features = []
    for candidate_feature in self.features:
        candidate_src = candidate_feature.source
        logger.debug(
            f'Trying to prune feature with source {candidate_src}')
        candidate_df = feature_df_map[candidate_feature]
        _, n_candidate_cols = candidate_df.shape
        z = _concat_datasets(feature_df_map, omit=[candidate_feature])
        logger.debug(CMI_MESSAGE)
        cmi = estimate_conditional_information(candidate_df, self.y_val, z)
        logger.debug(f'Conditional Mutual Information Score: {cmi}')
        statistic = cmi
        threshold = _compute_threshold(lmbda_1, lmbda_2, n_candidate_cols)
        logger.debug(f'Calculated Threshold: {threshold}')
        if statistic >= threshold:
            logger.debug(f'Passed, keeping feature {candidate_src}')
        else:
            # ballet.validation.main._prune_existing_features will log
            # this at level INFO
            logger.debug(
                f'Failed, found redundant feature: {candidate_src}')
            del feature_df_map[candidate_feature]
            redundant_features.append(candidate_feature)

    return redundant_features
def _categorize_file_diffs(self, file_diffs):
    """Partition file changes into admissible and inadmissible changes"""
    # TODO move this into a new validator
    candidate_feature_diffs = []
    valid_init_diffs = []
    inadmissible_files = []

    for diff in file_diffs:
        valid, failures = check_from_class(ProjectStructureCheck, diff,
                                           self.project)
        if valid:
            if pathlib.Path(diff.b_path).parts[-1] != '__init__.py':
                candidate_feature_diffs.append(diff)
                logger.debug(
                    'Categorized {file} as CANDIDATE FEATURE MODULE'
                    .format(file=diff.b_path))
            else:
                valid_init_diffs.append(diff)
                logger.debug(
                    'Categorized {file} as VALID INIT MODULE'.format(
                        file=diff.b_path))
        else:
            inadmissible_files.append(diff)
            logger.debug('Categorized {file} as INADMISSIBLE; '
                         'failures were {failures}'.format(
                             file=diff.b_path, failures=failures))

    logger.info('Admitted {} candidate feature{} '
                'and {} __init__ module{} '
                'and rejected {} file{}'.format(
                    len(candidate_feature_diffs),
                    make_plural_suffix(candidate_feature_diffs),
                    len(valid_init_diffs),
                    make_plural_suffix(valid_init_diffs),
                    len(inadmissible_files),
                    make_plural_suffix(inadmissible_files)))

    return candidate_feature_diffs, valid_init_diffs, inadmissible_files
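# `make_plural_suffix`, used above and in several functions below, is not
# shown in this listing. A plausible sketch, assuming it simply pluralizes
# log messages (returns 's' unless the collection has exactly one element):
def make_plural_suffix(items, suffix='s'):
    return suffix if len(items) != 1 else ''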
def prune(self):
    feature_dfs_by_src = {}
    for accepted_feature in [self.candidate_feature] + self.features:
        accepted_df = (accepted_feature.as_feature_engineering_pipeline()
                       .fit_transform(self.X_df, self.y))
        feature_dfs_by_src[accepted_feature.source] = accepted_df

    lmbda_1, lmbda_2 = _compute_lmbdas(self.lmbda_1, self.lmbda_2,
                                       feature_dfs_by_src)
    logger.info(
        "Pruning features using GFSSF: lambda_1={l1}, lambda_2={l2}".format(
            l1=lmbda_1, l2=lmbda_2))

    redundant_features = []
    for candidate_feature in self.features:
        candidate_src = candidate_feature.source
        logger.debug("Pruning feature: {}".format(candidate_src))
        candidate_df = feature_dfs_by_src[candidate_src]
        _, n_candidate_cols = candidate_df.shape
        z = _concat_datasets(feature_dfs_by_src, omit=candidate_src)
        logger.debug(CMI_MESSAGE)
        cmi = estimate_conditional_information(candidate_df, self.y, z)
        logger.debug(
            "Conditional Mutual Information Score: {}".format(cmi))
        statistic = cmi
        threshold = _compute_threshold(lmbda_1, lmbda_2, n_candidate_cols)
        logger.debug("Calculated Threshold: {}".format(threshold))
        if statistic >= threshold:
            logger.debug(
                "Passed, keeping feature: {}".format(candidate_src))
        else:
            logger.debug("Failed, found redundant feature: {}".format(
                candidate_src))
            del feature_dfs_by_src[candidate_src]
            redundant_features.append(candidate_feature)

    return redundant_features
def _prune_existing_features(project, force=False):
    """Prune existing features"""
    if not force and not project.on_master_after_merge:
        raise SkippedValidationTest('Not on master')

    try:
        proposed_feature = get_proposed_feature(project)
    except NoFeaturesCollectedError:
        raise SkippedValidationTest('No features collected')

    out = project.build()
    X_df, y, features = out['X_df'], out['y'], out['features']
    accepted_features = get_accepted_features(features, proposed_feature)
    Pruner = load_class(project, 'validation.feature_pruner')
    pruner = Pruner(X_df, y, accepted_features, proposed_feature)
    redundant_features = pruner.prune()

    # "propose removal"
    for feature in redundant_features:
        logger.info(PRUNER_MESSAGE + feature.source)

    return redundant_features
def _categorize_file_diffs(
    self, file_diffs: git.DiffIndex
) -> Tuple[List[git.Diff], List[git.Diff], List[git.Diff]]:
    """Partition file changes into admissible and inadmissible changes"""
    # TODO move this into a new validator
    candidate_feature_diffs = []
    valid_init_diffs = []
    inadmissible_files = []

    for diff in file_diffs:
        valid, failures, _ = check_from_class(ProjectStructureCheck, diff,
                                              self.project)
        if valid:
            if pathlib.Path(diff.b_path).parts[-1] != '__init__.py':
                candidate_feature_diffs.append(diff)
                logger.debug(f'Categorized {diff.b_path} as '
                             'CANDIDATE FEATURE MODULE')
            else:
                valid_init_diffs.append(diff)
                logger.debug(
                    f'Categorized {diff.b_path} as VALID INIT MODULE')
        else:
            inadmissible_files.append(diff)
            logger.debug(f'Categorized {diff.b_path} as INADMISSIBLE; '
                         f'failures were {failures}')

    logger.info('Admitted {n1} candidate feature{s1} '
                'and {n2} __init__ module{s2} '
                'and rejected {n3} file{s3}'.format(
                    n1=len(candidate_feature_diffs),
                    s1=make_plural_suffix(candidate_feature_diffs),
                    n2=len(valid_init_diffs),
                    s2=make_plural_suffix(valid_init_diffs),
                    n3=len(inadmissible_files),
                    s3=make_plural_suffix(inadmissible_files)))

    return candidate_feature_diffs, valid_init_diffs, inadmissible_files
def test_validation_end_to_end(quickstart):
    project = quickstart.project
    modname = 'foo'
    base = project.path
    repo = project.repo

    def _import(modname):
        relpath = modname_to_relpath(modname, project_root=base,
                                     add_init=False)
        abspath = base.joinpath(relpath)
        return import_module_at_path(modname, abspath)

    foo = _import('foo')
    assert isinstance(foo, ModuleType)

    foo_features = _import('foo.features')
    assert isinstance(foo_features, ModuleType)

    collect_contrib_features = foo_features.collect_contrib_features
    features = collect_contrib_features()
    assert len(features) == 0

    # first providing a mock feature, call build
    with patch.object(
        foo_features, 'collect_contrib_features',
        return_value=[Feature(input='A_1',
                              transformer=IdentityTransformer())]
    ):
        X_df = pd.util.testing.makeCustomDataframe(5, 2)
        X_df.columns = ['A_0', 'A_1']
        out = foo_features.build(X_df=X_df, y_df=[])
        assert np.shape(out.X) == (5, 1)
        assert isinstance(out.mapper_X, FeatureEngineeringPipeline)

    # write a new version of foo.load_data.load_data
    new_load_data_str = get_source(load_regression_data)
    p = base.joinpath(modname, 'load_data.py')
    with p.open('w') as f:
        f.write(new_load_data_str)

    # commit changes
    repo.index.add([str(p)])
    repo.index.commit('Load mock regression dataset')

    # call different validation routines
    def call_validate_all(pr=None):
        envvars = {
            'TRAVIS_BUILD_DIR': repo.working_tree_dir,
        }
        if pr is None:
            envvars['TRAVIS_PULL_REQUEST'] = 'false'
            envvars['TRAVIS_COMMIT_RANGE'] = make_commit_range(
                repo.commit('HEAD@{-1}').hexsha,
                repo.commit('HEAD').hexsha)
            envvars['TRAVIS_PULL_REQUEST_BRANCH'] = ''
            envvars['TRAVIS_BRANCH'] = repo.heads.master.name
        else:
            envvars['TRAVIS_PULL_REQUEST'] = str(pr)
            envvars['TRAVIS_COMMIT_RANGE'] = make_commit_range(
                repo.heads.master.name,
                repo.commit('pull/{pr}'.format(pr=pr)).hexsha)

        with patch.dict(os.environ, envvars):
            cmd = 'ballet validate -A'
            check_call(cmd, cwd=safepath(base), env=os.environ)

    call_validate_all()

    # branch to a fake PR and write a new feature
    contrib_dir = base.joinpath(modname, 'features', 'contrib')
    logger.info('Switching to pull request 1, User Bob, Feature A')
    switch_to_new_branch(repo, 'pull/1')
    new_feature_str = make_feature_str('A_0')
    username = '******'
    featurename = 'A_0'
    submit_feature(repo, contrib_dir, username, featurename,
                   new_feature_str)

    # call different validation routines
    logger.info('Validating pull request 1, User Bob, Feature A')
    call_validate_all(pr=1)

    # merge PR with master
    logger.info('Merging into master')
    repo.git.checkout('master')
    repo.git.merge('pull/1', no_ff=True)

    # call different validation routines
    logger.info('Validating after merge')
    call_validate_all()

    # write another new feature
    logger.info('Switching to pull request 2, User Charlie, Feature Z_1')
    switch_to_new_branch(repo, 'pull/2')
    new_feature_str = make_feature_str('Z_1')
    username = '******'
    featurename = 'Z_1'
    submit_feature(repo, contrib_dir, username, featurename,
                   new_feature_str)

    # we expect this feature to fail
    with pytest.raises(CalledProcessError):
        logger.info('Validating pull request 2, User Charlie, Feature Z_1')
        call_validate_all(pr=2)

    # write another new feature - redundancy
    repo.git.checkout('master')
    switch_to_new_branch(repo, 'pull/3')
    new_feature_str = make_feature_str('A_0')
    username = '******'
    featurename = 'A_0'
    submit_feature(repo, contrib_dir, username, featurename,
                   new_feature_str)

    with pytest.raises(CalledProcessError):
        call_validate_all(pr=3)
def render_project_template(project_template_path: Optional[Pathy] = None,
                            create_github_repo: bool = False,
                            github_token: Optional[str] = None,
                            **cc_kwargs) -> str:
    """Generate a ballet project according to the project template

    If creating the GitHub repo is requested and the process fails for any
    reason, quickstart will complete successfully, and users are instructed
    to read the corresponding section of the Maintainer's Guide to continue
    manually.

    Args:
        project_template_path: path to specific project template
        create_github_repo: whether to act to create the desired repo on
            GitHub after rendering the project. The repo will be owned by
            either the user or an org that the user has relevant
            permissions for, depending on what is entered during the
            quickstart prompts. If True, then a valid github token must
            also be provided.
        github_token: valid github token with appropriate permissions
        **cc_kwargs: options for the cookiecutter template
    """
    if project_template_path is None:
        project_template_path = PROJECT_TEMPLATE_PATH

    project_path = cookiecutter(project_template_path, **cc_kwargs)

    if create_github_repo:
        if github_token is None:
            raise ValueError('Need to provide github token')
        g = Github(github_token)

        # need to get params from new project config
        project = Project.from_path(project_path)
        owner = project.config.get('github.github_owner')
        name = project.config.get('project.project_slug')

        # create repo on github
        try:
            github_repo = ballet.util.git.create_github_repo(g, owner, name)
            logger.info(f'Created repo on GitHub at {github_repo.html_url}')
        except GithubException:
            logger.exception('Failed to create GitHub repo for this project')
            logger.warning(
                'Failed to create GitHub repo for this project...\n'
                'did you specify the intended repo owner, and do you have'
                ' permissions to create a repo under that owner?\n'
                'Try manually creating the repo: https://ballet.github.io/ballet/maintainer_guide.html#manual-repository-creation'  # noqa E501
            )
            return project_path

        # now push to remote
        # we don't need to set up the remote, as it has already been set up
        # in post_gen_hook.py
        local_repo = project.repo
        remote_name = project.config.get('github.remote')
        branches = [DEFAULT_BRANCH]
        try:
            push_branches_to_remote(local_repo, remote_name, branches)
        except BalletError:
            logger.exception('Failed to push branches to GitHub repo')
            logger.warning(
                'Failed to push branches to GitHub repo...\n'
                'Try manually pushing the branches: https://ballet.github.io/ballet/maintainer_guide.html#manual-repository-creation'  # noqa E501
            )
            return project_path

    return project_path
def echo():
    fn = pathlib.Path.cwd().resolve()
    logger.info(f'New project created in {fn!s}')
def _log_start_new_feature_success(result: List[Tuple[pathlib.Path, str]]):
    logger.info('Start new feature successful')
    for (name, kind) in result:
        if kind == 'file' and '__init__' not in str(name):
            relname = pathlib.Path(name).relative_to(pathlib.Path.cwd())
            logger.info(f'Created {relname}')
def _log_switch_to_new_branch(branch: Optional[str]):
    if branch is not None:
        logger.info(f'Switched to branch {branch}')
def echo():
    fn = pathlib.Path.cwd().absolute()
    logger.info('New project created in {!s}'.format(fn))
def dump_travis_env_vars():
    logger.info(repr(get_travis_env_vars()))
def judge(self):
    logger.info(f'Judging feature using {self}')
    return True
def judge(self):
    """Judge feature acceptance using GFSSF

    Uses lines 1-8 of agGFSSF where we do not remove accepted but
    redundant features on line 8.
    """
    if np.isnan(self.y_val).any():
        raise ValueError(
            f'{self.__class__.__name__} does not support missing targets,'
            ' please use a different evaluator.')

    logger.info(f'Judging feature using {self}')

    feature_df_map = self._get_feature_df_map()

    candidate_df = feature_df_map[self.candidate_feature]
    n_samples, n_candidate_cols = candidate_df.shape

    lmbda_1, lmbda_2 = _compute_lmbdas(self.lmbda_1, self.lmbda_2,
                                       feature_df_map)
    logger.debug(
        f'Recomputed lambda_1={lmbda_1:0.3e}, lambda_2={lmbda_2:0.3e}')

    info = []
    omit_in_test = [None, *self.features]
    n_omit = len(omit_in_test)
    for i, omitted_feature in enumerate(omit_in_test):

        z = _concat_datasets(
            feature_df_map, n_samples,
            omit=[self.candidate_feature, omitted_feature])

        # Calculate CMI of candidate feature
        cmi = estimate_conditional_information(candidate_df, self.y_val, z)

        if omitted_feature is not None:
            omit_df = feature_df_map[omitted_feature]
            _, n_omit_cols = omit_df.shape

            # Calculate CMI of omitted feature
            cmi_omit = estimate_conditional_information(
                omit_df, self.y_val, z)
        else:
            cmi_omit = 0
            n_omit_cols = 0

            # want to log to INFO only the case of I(Z;Y|X) where X is the
            # entire feature matrix, i.e. no omitted features.
            logger.info(f'I(feature ; target | existing_features) = {cmi}')

        statistic = cmi - cmi_omit
        threshold = _compute_threshold(lmbda_1, lmbda_2, n_candidate_cols,
                                       n_omit_cols)
        delta = statistic - threshold

        if delta >= 0:
            omitted_source = getattr(omitted_feature, 'source', 'None')
            logger.debug(
                f'Succeeded while omitting feature: {omitted_source}')
            return True
        else:
            iteration_info = GFSSFIterationInfo(
                i=i,
                n_samples=n_samples,
                candidate_feature=self.candidate_feature,
                candidate_cols=n_candidate_cols,
                candidate_cmi=cmi,
                omitted_feature=omitted_feature,
                omitted_cols=n_omit_cols,
                omitted_cmi=cmi_omit,
                statistic=statistic,
                threshold=threshold,
                delta=delta,
            )
            info.append(iteration_info)
            logger.debug(
                f'Completed iteration {i}/{n_omit}: {iteration_info}')

    info_closest = max(info, key=lambda x: x.delta)
    cmi_closest = info_closest.candidate_cmi
    omitted_cmi_closest = info_closest.omitted_cmi
    statistic_closest = info_closest.statistic
    threshold_closest = info_closest.threshold
    logger.info(
        f'Rejected feature: best marginal conditional mutual information '
        f'was not greater than threshold '
        f'({cmi_closest:0.3e} - {omitted_cmi_closest:0.3e} '
        f'= {statistic_closest:0.3e}, vs needed {threshold_closest:0.3e}).')
    return False
def judge(self):
    """Accept feature with probability p"""
    logger.info(f'Judging feature using {self}')
    with seeded(self.seed):
        return random.uniform(0, 1) < self.p
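# `seeded`, used here and in the random pruner below, is not defined in this
# listing. A hypothetical sketch of such a context manager: temporarily seed
# the global random module for reproducibility and restore the previous RNG
# state on exit, treating a None seed as a no-op.
import contextlib
import random


@contextlib.contextmanager
def seeded(seed):
    if seed is not None:
        state = random.getstate()
        random.seed(seed)
        try:
            yield
        finally:
            random.setstate(state)
    else:
        yield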
def _log_collect_items(name, items):
    n = len(items)
    s = make_plural_suffix(items)
    logger.info('Collected {n} {name}{s}'.format(n=n, name=name, s=s))
    return items
def prune(self):
    logger.info(f'Pruning features using {self}')
    return []
def prune(self):
    """With probability p, select a random feature to prune"""
    logger.info(f'Pruning features using {self}')
    with seeded(self.seed):
        if random.uniform(0, 1) < self.p:
            return [random.choice(self.features)]
    # otherwise prune nothing; return an empty list rather than an
    # implicit None so callers can iterate over the result
    return []
def _log_failure_no_more_approaches(self):
    logger.info('Conversion failed, and we\'re not sure why...')