def _collect_contrib_features_from_package(
    package: ModuleType
) -> Iterator[Optional[Feature]]:
    logger.debug(
        f'Walking package path {package.__path__} to detect modules...')  # type: ignore  # mypy issue #1422  # noqa E501
    for importer, modname, _ in pkgutil.walk_packages(
            path=package.__path__,  # type: ignore  # mypy issue #1422
            prefix=package.__name__ + '.',
            onerror=logger.error):
        # mistakenly typed as MetaPathFinder
        importer = cast(PathEntryFinder, importer)
        try:
            if importer is None:
                raise ImportError
            # TODO use find_spec
            # https://docs.python.org/3/library/importlib.html#importlib.abc.PathEntryFinder.find_spec
            loader = importer.find_module(modname)
            if loader is None:
                raise ImportError
            mod = loader.load_module(modname)
        except ImportError:
            logger.exception(f'Failed to import module {modname}')
            continue
        yield _collect_contrib_feature_from_module(mod)
def _create_queue(self):
    features = []
    for user_path in self.feature_path.iterdir():
        if not (user_path.is_dir()
                and re.search(USER_REGEX, str(user_path))):
            continue
        user_num = int(user_path.parts[-1].split("_")[1])
        logger.debug("COLLECTING FEATURES FROM USER {}".format(user_num))
        user_features = []
        for feature_path in user_path.iterdir():
            if not re.search(FEATURE_REGEX, str(feature_path)):
                logger.debug("INVALID FEATURE {}".format(
                    safepath(feature_path.parts[-1])))
                continue
            feature_num = int(
                feature_path.parts[-1].split("_")[1].split(".")[0])
            if self.start and self.start > feature_num:
                continue
            elif self.end and self.end < feature_num:
                continue
            else:
                user_features.append((user_num, feature_num))
        logger.debug("FOUND {} FEATURES".format(len(user_features)))
        features.append(sorted(user_features, key=lambda f: f[1]))
    self.feature_queue = self.shuffle_feature_queue(features)
    logger.debug("USING QUEUE:")
    logger.debug("\n".join(map(str, self.feature_queue)))
def _log_failure_using_stored_approach(self, approach, e):
    pretty_tb = self._get_pretty_tb()
    exc_name = type(e).__name__
    logger.debug(f'{self._tname}: '
                 f'Conversion unexpectedly failed using stored, '
                 f'previously-successful approach {approach.name!r} '
                 f'because of error {exc_name!r}\n\n{pretty_tb}')
def name(self):
    if self._name is None:
        if self._scorer is not None:
            # try from scorer
            if isinstance(self._scorer,
                          sklearn.metrics.scorer._BaseScorer):
                scorers = sklearn.metrics.scorer.SCORERS
                matches = select_values(
                    lambda x: x == self._scorer, scorers)
                matches = list(matches.keys())
                if len(matches) == 1:
                    self._name = matches[0]
                elif len(matches) > 1:
                    # unexpected
                    logger.debug(
                        'Unexpectedly found multiple matches for scorer '
                        '{scorer!r}: {matches!r}'
                        .format(scorer=self._scorer, matches=matches))
            else:
                # must be a custom scorer, try to get name
                if hasattr(self._scorer, '__name__'):
                    self._name = self._scorer.__name__
        elif self._description is not None:
            # try from description
            mapper = flip(SCORING_NAME_MAPPER)
            if self._description in mapper:
                self._name = mapper[self._description]
            else:
                # default formatting
                self._name = '_'.join(self._description.lower().split(' '))

    if self._name is not None:
        return self._name
    else:
        raise BalletError('Could not get name from scorer')
def _collect_feature_info(self, candidate_feature_diffs):
    """Collect feature info

    Args:
        candidate_feature_diffs (List[git.diff.Diff]): list of Diffs
            corresponding to admissible file changes compared to
            comparison ref

    Yields:
        Tuple[Callable, str, str]: tuple of importer, module name, and
            module path, where the "importer" is a callable that returns
            a module
    """
    # the directory containing ballet.yml
    project_root = self.project.path

    # the directory containing the package
    try:
        package_path = self.project.package.__path__[0]
        package_root = pathlib.Path(package_path).parent
    except (AttributeError, IndexError):
        logger.debug("Couldn't get package root, will try to recover",
                     exc_info=True)
        package_root = project_root

    for diff in candidate_feature_diffs:
        path = diff.b_path
        relpath = project_root.joinpath(path).relative_to(package_root)
        modname = relpath_to_modname(relpath)
        modpath = project_root.joinpath(path)
        importer = partial(import_module_at_path, modname, modpath)
        yield importer, modname, modpath
def load_class(project, config_key):
    path = project.config.get(config_key)
    modname, clsname = path.rsplit('.', maxsplit=1)
    mod = import_module_from_modname(modname)
    cls = getattr(mod, clsname)
    # use __file__, not __path__: only packages have __path__, so logging
    # __path__ raises AttributeError for classes loaded from plain modules
    logger.debug('Loaded class {} from {}'
                 .format(cls.__name__, mod.__file__))
    return cls
def _collect_file_diffs(self) -> git.DiffIndex:
    file_diffs = self.differ.diff()

    # log results
    for i, file in enumerate(file_diffs):
        logger.debug(f'File {i}: {file}')

    return file_diffs
def _collect_file_diffs(self):
    file_diffs = self.differ.diff()

    # log results
    for i, file in enumerate(file_diffs):
        logger.debug('File {i}: {file}'.format(i=i, file=file))

    return file_diffs
def submit_with_delay(self):
    self.submit()
    if not self.is_completed():
        time_to_sleep = self.rng.randint(MIN_SLEEP_TIME, MAX_SLEEP_TIME)
        logger.debug(
            "Sleeping for {} minutes".format(time_to_sleep / 60.0))
        time.sleep(time_to_sleep)
        logger.debug("Waking up...")
def _log_catch(self, approach, e):
    pretty_tb = self._get_pretty_tb()
    exc_name = type(e).__name__
    logger.debug(
        f'{self._tname}: '
        f'Conversion approach {approach.name!r} didn\'t work so we\'ll '
        f'try another approach, '
        f'caught exception {exc_name!r}\n\n{pretty_tb}')
def judge(self):
    logger.info(f'Judging feature using {self}')
    outcomes = {
        accepter.__class__.__name__: accepter.judge()
        for accepter in self.accepters
    }
    logger.debug(f'Got outcomes {outcomes!r} from underlying accepters')
    return self.agg(outcomes.values())
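# Hedged usage sketch for the compound judge above (illustrative, not from
# the source): with agg=all, the compound accepter approves a feature only
# if every underlying accepter approves; with agg=any, one approval is
# enough. The enclosing class name and accepter instances are assumptions.
#
#   accepter = CompoundAccepter(
#       accepters=[api_accepter, gfssf_accepter], agg=all)
#   accepted = accepter.judge()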
def _log_catch(self, approach, e):
    pretty_tb = self._get_pretty_tb()
    exc_name = type(e).__name__
    logger.debug('{tname}: '
                 'Conversion approach {approach.name!r} didn\'t work, '
                 'caught exception {exc_name!r}\n\n{tb}'.format(
                     tname=self._tname, approach=approach,
                     exc_name=exc_name, tb=pretty_tb))
def _log_error(self, approach, e):
    pretty_tb = self._get_pretty_tb()
    exc_name = type(e).__name__
    logger.debug('{tname}: '
                 'Conversion failed during {approach.name!r} because of '
                 'an unrecoverable error {exc_name!r}\n\n{tb}'.format(
                     tname=self._tname, approach=approach,
                     exc_name=exc_name, tb=pretty_tb))
def _load_class(project, config_key):
    path = project.config.get(config_key)
    modname, clsname = path.rsplit('.', maxsplit=1)
    mod = import_module_from_modname(modname)
    cls = getattr(mod, clsname)
    clsname = getattr(cls, '__name__', '<unknown>')
    modfile = getattr(mod, '__file__', '<unknown>')
    logger.debug('Loaded class {} from {}'.format(clsname, modfile))
    return cls
def _log_failure_using_stored_approach(self, approach, e):
    pretty_tb = self._get_pretty_tb()
    exc_name = type(e).__name__
    logger.debug('{tname}: '
                 'Conversion unexpectedly failed using stored, '
                 'previously-successful approach {approach.name!r} '
                 'because of error {exc_name!r}\n\n{tb}'.format(
                     tname=self._tname, approach=approach,
                     exc_name=exc_name, tb=pretty_tb))
def validate_feature_api(feature, X, y, subsample=False):
    logger.debug('Validating feature {feature!r}'.format(feature=feature))
    if subsample:
        X, y = subsample_data_for_validation(X, y)
    valid, failures = check_from_class(FeatureApiCheck, feature, X, y)
    if valid:
        logger.info('Feature is valid')
    else:
        logger.info(
            'Feature is NOT valid; failures were {failures}'
            .format(failures=failures))
    return valid
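# Hedged usage sketch for validate_feature_api (illustrative, not from the
# source): the Feature constructor arguments below are assumptions about the
# project's Feature API, not confirmed by this listing.
def _demo_validate_feature_api(X_df, y_df):
    from sklearn.preprocessing import StandardScaler
    feature = Feature(input='amount', transformer=StandardScaler())
    # subsample=True trades validation fidelity for speed on large data
    return validate_feature_api(feature, X_df, y_df, subsample=True)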
def _collect_contrib_feature_from_module(
    mod: ModuleType
) -> Optional[Feature]:
    logger.debug(
        f'Trying to import contributed feature from module {mod.__name__}...')

    candidates = []
    for attr in dir(mod):
        obj = getattr(mod, attr)
        if isinstance(obj, Feature):
            candidates.append(obj)

    if len(candidates) == 1:
        feature = candidates[0]
        feature.source = mod.__name__
        logger.debug(
            f'Imported 1 feature from {mod.__name__} from {Feature.__name__}'
            ' object')
        return feature
    elif len(candidates) > 1:
        # note: the continuation strings need the f prefix too, otherwise the
        # placeholders are logged literally
        logger.debug(
            f'Found too many {Feature.__name__} objects in module '
            f'{mod.__name__}, skipping; candidates were {candidates!r}')
        return None
    else:
        logger.debug(
            f'Failed to import anything useful from module {mod.__name__}')
        return None
def load_spec(spec: Union[str, dict]) -> Tuple[type, dict]:
    if isinstance(spec, str):
        path = spec
        params = {}
    else:
        path = spec['name']
        params = spec.get('params', {})
    modname, clsname = path.rsplit('.', maxsplit=1)
    mod = import_module_from_modname(modname)
    cls = getattr(mod, clsname)
    modfile = getattr(mod, '__file__', '<unknown>')
    logger.debug(f'Loaded class {clsname} from module at {modfile} '
                 f'with params {params}')
    return cls, params
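# Hedged usage sketch for load_spec (illustrative, not from the source):
# both spec forms resolve to the same class; the sklearn path is just an
# example of a valid dotted class path.
def _demo_load_spec():
    cls, params = load_spec('sklearn.impute.SimpleImputer')
    assert params == {}
    cls, params = load_spec({
        'name': 'sklearn.impute.SimpleImputer',
        'params': {'strategy': 'median'},
    })
    transformer = cls(**params)  # SimpleImputer(strategy='median')
    return transformer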
def _collect_contrib_feature_from_module(mod):
    logger.debug(
        'Trying to import contributed feature from module {modname}...'
        .format(modname=mod.__name__))

    candidates = []
    for attr in dir(mod):
        obj = getattr(mod, attr)
        if isinstance(obj, Feature):
            candidates.append(obj)

    if len(candidates) == 1:
        feature = candidates[0]
        feature.source = mod.__name__
        logger.debug(
            'Imported 1 feature from {modname} from {Feature.__name__} object'
            .format(modname=mod.__name__, Feature=Feature))
        return feature
    elif len(candidates) > 1:
        logger.debug(
            'Found too many {Feature.__name__} objects in module {modname}, '
            'skipping; candidates were {candidates!r}'
            .format(Feature=Feature, modname=mod.__name__,
                    candidates=candidates))
        return None
    else:
        logger.debug(
            'Failed to import anything useful from module {modname}'
            .format(modname=mod.__name__))
        return None
def estimate_conditional_information(
    x: np.ndarray, y: np.ndarray, z: np.ndarray
) -> float:
    r"""Estimate the conditional mutual information of x and y given z

    Conditional mutual information is the mutual information of two
    datasets, given a third:

    .. math::
       I(x;y|z) = H(x,z) + H(y,z) - H(x,y,z) - H(z)

    where :math:`H(X)` is the Shannon entropy of dataset :math:`X`. For
    continuous datasets, this adapts the KSG estimator [1] for mutual
    information. Eq. 8 from [1] holds because the epsilon terms cancel out.
    Let :math:`d_x` represent the dimensionality of the continuous portion
    of x. Then, we see that:

    .. math::
       :nowrap:

       \begin{align}
       d_{xz} + d_{yz} - d_{xyz} - d_z
           &= (d_x + d_z) + (d_y + d_z) - (d_x + d_y + d_z) - d_z \\
           &= 0
       \end{align}

    Args:
        x: An array with shape (n_samples, n_features_x)
        y: An array with shape (n_samples, n_features_y)
        z: An array with shape (n_samples, n_features_z). This is the
            dataset being conditioned on.

    Returns:
        conditional mutual information of x and y given z

    References:

    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating
       mutual information". Phys. Rev. E 69, 2004.
    """
    xz = np.concatenate((x, z), axis=1)
    yz = np.concatenate((y, z), axis=1)
    xyz = np.concatenate((x, y, z), axis=1)

    epsilon = _compute_epsilon(xyz)
    H_xz = _estimate_entropy(xz, epsilon)
    H_yz = _estimate_entropy(yz, epsilon)
    H_xyz = _estimate_entropy(xyz, epsilon)
    H_z = _estimate_entropy(z, epsilon)

    logger.debug('H(X,Z): %s', H_xz)
    logger.debug('H(Y,Z): %s', H_yz)
    logger.debug('H(X,Y,Z): %s', H_xyz)
    logger.debug('H(Z): %s', H_z)

    return H_xz + H_yz - H_xyz - H_z
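# Hedged usage sketch for estimate_conditional_information (illustrative,
# not from the source): x and y below are dependent only through z, so the
# conditional estimate should be near zero even though the unconditional
# I(x;y) is large. Assumes numpy is imported as np and the private entropy
# helpers above are available.
def _demo_estimate_conditional_information():
    rng = np.random.RandomState(0)
    z = rng.normal(size=(1000, 1))
    x = z + 0.1 * rng.normal(size=(1000, 1))
    y = z + 0.1 * rng.normal(size=(1000, 1))
    cmi = estimate_conditional_information(x, y, z)  # expected: near 0
    logger.debug('Estimated I(x;y|z): %s', cmi)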
def fit(self, X, y, tune=True, **fit_kwargs):
    if tune:
        # do some tuning
        if btb is not None and self.tunables is not None:
            scorer = None

            def score(estimator):
                scores = cross_val_score(
                    estimator, X, y, scoring=scorer,
                    cv=self.tuning_cv, fit_params=fit_kwargs)
                return np.mean(scores)

            logger.info('Tuning model using BTB GP tuner...')
            tuner = btb.tuning.gp.GP(self.tunables)
            estimator = self._get_parent_instance()
            original_score = score(estimator)

            # TODO: this leads to an error because default value of
            # max_depth for RF is `None`
            # params = funcy.project(
            #     estimator.get_params(), [t[0] for t in self.tunables])
            # tuner.add(params, original_score)

            for i in range(self.tuning_iter):
                params = tuner.propose()
                estimator.set_params(**params)
                score_ = score(estimator)
                logger.debug('Iteration {}, params {}, score {}'.format(
                    i, params, score_))
                tuner.add(params, score_)

            best_params = tuner._best_hyperparams
            best_score = tuner._best_score
            self.set_params(**best_params)
            logger.info(
                'Tuning complete. '
                'Cross val score changed from {:0.3f} to {:0.3f}.'.format(
                    original_score, best_score))
        else:
            logger.warning('Tuning requested, but either btb is not '
                           'installed or tunable HyperParameters were '
                           'not specified.')

    return super().fit(X, y, **fit_kwargs)
def configure_logging(output_dir):
    logger.setLevel(logging.DEBUG)

    handler = logging.FileHandler(output_dir.joinpath("info.log"))
    formatter = logging.Formatter(SIMPLE_LOG_FORMAT)
    handler.setFormatter(formatter)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    handler = logging.FileHandler(output_dir.joinpath("debug.log"))
    handler.setFormatter(formatter)
    handler.setLevel(logging.DEBUG)
    handler.addFilter(LevelFilter(logging.DEBUG))
    logger.addHandler(handler)

    # emit the session banner at both levels so it lands in both files
    logger.info("***BEGIN NEW SIMULATION SESSION***")
    logger.debug("***BEGIN NEW SIMULATION SESSION***")
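# Hedged usage sketch for configure_logging (illustrative, not from the
# source), assuming a pathlib.Path output directory and that LevelFilter
# passes only records at exactly the configured level, so debug.log receives
# DEBUG-only records while info.log receives INFO and above.
def _demo_configure_logging():
    output_dir = pathlib.Path('logs')
    output_dir.mkdir(exist_ok=True)
    configure_logging(output_dir)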
def _collect_contrib_features_from_package(package):
    logger.debug(
        'Walking package path {path} to detect modules...'
        .format(path=package.__path__))
    for importer, modname, _ in pkgutil.walk_packages(
            path=package.__path__,
            prefix=package.__name__ + '.',
            onerror=logger.error):
        try:
            mod = importer.find_module(modname).load_module(modname)
        except ImportError:
            logger.exception(
                'Failed to import module {modname}'
                .format(modname=modname))
            continue
        yield _collect_contrib_feature_from_module(mod)
def _synctree(
    src: pathlib.Path,
    dst: pathlib.Path,
    onexist: Callable[[pathlib.Path], None]
) -> List[Tuple[pathlib.Path, str]]:
    result = []
    cleanup = []

    try:
        for _root, dirnames, filenames in os.walk(src):
            root = pathlib.Path(_root)
            relative_dir = root.relative_to(src)

            for dirname in dirnames:
                dstdir = dst.joinpath(relative_dir, dirname)
                if dstdir.exists():
                    if not dstdir.is_dir():
                        raise BalletError
                else:
                    logger.debug(f'Making directory: {dstdir!s}')
                    dstdir.mkdir()
                    result.append((dstdir, 'dir'))
                    cleanup.append(partial(os.rmdir, dstdir))

            for filename in filenames:
                srcfile = root.joinpath(filename)
                dstfile = dst.joinpath(relative_dir, filename)
                if dstfile.exists():
                    onexist(dstfile)
                else:
                    logger.debug(f'Copying file to destination: {dstfile!s}')
                    copyfile(srcfile, dstfile)
                    result.append((dstfile, 'file'))
                    cleanup.append(partial(os.unlink, dstfile))
    except Exception:
        with suppress(Exception):
            for f in reversed(cleanup):
                f()
        raise

    return result
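# Hedged usage sketch for _synctree (illustrative, not from the source):
# mirror a template tree into a project directory, logging rather than
# overwriting any file that already exists at the destination. On any error,
# _synctree rolls back everything it created before re-raising.
def _demo_synctree():
    src = pathlib.Path('template')
    dst = pathlib.Path('project')
    created = _synctree(
        src, dst,
        onexist=lambda p: logger.debug(f'Skipping existing file: {p!s}'))
    # `created` holds (path, 'dir' | 'file') tuples for new entries only
    return created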
def _categorize_file_diffs(self, file_diffs):
    """Partition file changes into admissible and inadmissible changes"""
    # TODO move this into a new validator
    candidate_feature_diffs = []
    valid_init_diffs = []
    inadmissible_files = []

    for diff in file_diffs:
        valid, failures = check_from_class(ProjectStructureCheck,
                                           diff, self.project)
        if valid:
            if pathlib.Path(diff.b_path).parts[-1] != '__init__.py':
                candidate_feature_diffs.append(diff)
                logger.debug(
                    'Categorized {file} as CANDIDATE FEATURE MODULE'
                    .format(file=diff.b_path))
            else:
                valid_init_diffs.append(diff)
                logger.debug(
                    'Categorized {file} as VALID INIT MODULE'
                    .format(file=diff.b_path))
        else:
            inadmissible_files.append(diff)
            logger.debug('Categorized {file} as INADMISSIBLE; '
                         'failures were {failures}'.format(
                             file=diff.b_path, failures=failures))

    logger.info('Admitted {} candidate feature{} '
                'and {} __init__ module{} '
                'and rejected {} file{}'.format(
                    len(candidate_feature_diffs),
                    make_plural_suffix(candidate_feature_diffs),
                    len(valid_init_diffs),
                    make_plural_suffix(valid_init_diffs),
                    len(inadmissible_files),
                    make_plural_suffix(inadmissible_files)))

    return candidate_feature_diffs, valid_init_diffs, inadmissible_files
def estimate_mutual_information(x: np.ndarray, y: np.ndarray) -> float:
    r"""Estimate the mutual information of two datasets.

    Mutual information is a measure of dependence between two datasets
    and is calculated as:

    .. math::
       I(x;y) = H(x) + H(y) - H(x,y)

    where :math:`H(x)` is the Shannon entropy of x. For continuous
    datasets, adapts the KSG estimator [1] for mutual information.

    Args:
        x: An array with shape (n_samples, n_features_x)
        y: An array with shape (n_samples, n_features_y)

    Returns:
        mutual information of x and y

    References:

    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating
       mutual information". Phys. Rev. E 69, 2004.
    """
    xy = np.concatenate((x, y), axis=1)
    epsilon = _compute_epsilon(xy)
    H_x = _estimate_entropy(x, epsilon)
    H_y = _estimate_entropy(y, epsilon)
    H_xy = _estimate_entropy(xy, epsilon)
    logger.debug('H(X): %s', H_x)
    logger.debug('H(Y): %s', H_y)
    logger.debug('H(X,Y): %s', H_xy)
    return H_x + H_y - H_xy
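# Hedged usage sketch for estimate_mutual_information (illustrative, not
# from the source): y below is a noisy copy of x, so the estimate should be
# clearly positive, while an independent pair would score near zero. Assumes
# numpy is imported as np and the private entropy helpers are available.
def _demo_estimate_mutual_information():
    rng = np.random.RandomState(0)
    x = rng.normal(size=(1000, 1))
    y = x + 0.1 * rng.normal(size=(1000, 1))
    mi = estimate_mutual_information(x, y)  # expected: noticeably > 0
    logger.debug('Estimated I(x;y): %s', mi)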
def _categorize_file_diffs(
    self, file_diffs: git.DiffIndex
) -> Tuple[List[git.Diff], List[git.Diff], List[git.Diff]]:
    """Partition file changes into admissible and inadmissible changes"""
    # TODO move this into a new validator
    candidate_feature_diffs = []
    valid_init_diffs = []
    inadmissible_files = []

    for diff in file_diffs:
        valid, failures, _ = check_from_class(ProjectStructureCheck,
                                              diff, self.project)
        if valid:
            if pathlib.Path(diff.b_path).parts[-1] != '__init__.py':
                candidate_feature_diffs.append(diff)
                logger.debug(f'Categorized {diff.b_path} as '
                             'CANDIDATE FEATURE MODULE')
            else:
                valid_init_diffs.append(diff)
                logger.debug(
                    f'Categorized {diff.b_path} as VALID INIT MODULE')
        else:
            inadmissible_files.append(diff)
            logger.debug(f'Categorized {diff.b_path} as INADMISSIBLE; '
                         f'failures were {failures}')

    logger.info('Admitted {n1} candidate feature{s1} '
                'and {n2} __init__ module{s2} '
                'and rejected {n3} file{s3}'.format(
                    n1=len(candidate_feature_diffs),
                    s1=make_plural_suffix(candidate_feature_diffs),
                    n2=len(valid_init_diffs),
                    s2=make_plural_suffix(valid_init_diffs),
                    n3=len(inadmissible_files),
                    s3=make_plural_suffix(inadmissible_files)))

    return candidate_feature_diffs, valid_init_diffs, inadmissible_files
def judge(self):
    """Judge feature acceptance using GFSSF

    Uses lines 1-8 of agGFSSF, where we do not remove accepted but
    redundant features on line 8.
    """
    if np.isnan(self.y_val).any():
        raise ValueError(
            f'{self.__class__.__name__} does not support missing targets;'
            ' please use a different evaluator.')

    logger.info(f'Judging feature using {self}')

    feature_df_map = self._get_feature_df_map()
    candidate_df = feature_df_map[self.candidate_feature]
    n_samples, n_candidate_cols = candidate_df.shape

    lmbda_1, lmbda_2 = _compute_lmbdas(self.lmbda_1, self.lmbda_2,
                                       feature_df_map)
    logger.debug(
        f'Recomputed lambda_1={lmbda_1:0.3e}, lambda_2={lmbda_2:0.3e}')

    info = []
    omit_in_test = [None, *self.features]
    n_omit = len(omit_in_test)
    for i, omitted_feature in enumerate(omit_in_test):
        z = _concat_datasets(
            feature_df_map, n_samples,
            omit=[self.candidate_feature, omitted_feature])

        # Calculate CMI of candidate feature
        cmi = estimate_conditional_information(
            candidate_df, self.y_val, z)

        if omitted_feature is not None:
            omit_df = feature_df_map[omitted_feature]
            _, n_omit_cols = omit_df.shape

            # Calculate CMI of omitted feature
            cmi_omit = estimate_conditional_information(
                omit_df, self.y_val, z)
        else:
            cmi_omit = 0
            n_omit_cols = 0

            # log to INFO only the case of I(candidate ; target | z) where
            # z is the entire accepted feature matrix, i.e. no omitted
            # features
            logger.info(f'I(feature ; target | existing_features) = {cmi}')

        statistic = cmi - cmi_omit
        threshold = _compute_threshold(lmbda_1, lmbda_2,
                                       n_candidate_cols, n_omit_cols)
        delta = statistic - threshold

        if delta >= 0:
            omitted_source = getattr(omitted_feature, 'source', 'None')
            logger.debug(
                f'Succeeded while omitting feature: {omitted_source}')
            return True
        else:
            iteration_info = GFSSFIterationInfo(
                i=i,
                n_samples=n_samples,
                candidate_feature=self.candidate_feature,
                candidate_cols=n_candidate_cols,
                candidate_cmi=cmi,
                omitted_feature=omitted_feature,
                omitted_cols=n_omit_cols,
                omitted_cmi=cmi_omit,
                statistic=statistic,
                threshold=threshold,
                delta=delta,
            )
            info.append(iteration_info)
            logger.debug(
                f'Completed iteration {i}/{n_omit}: {iteration_info}')

    info_closest = max(info, key=lambda x: x.delta)
    cmi_closest = info_closest.candidate_cmi
    omitted_cmi_closest = info_closest.omitted_cmi
    statistic_closest = info_closest.statistic
    threshold_closest = info_closest.threshold
    logger.info(
        f'Rejected feature: best marginal conditional mutual information '
        f'was not greater than threshold '
        f'({cmi_closest:0.3e} - {omitted_cmi_closest:0.3e} = '
        f'{statistic_closest:0.3e}, vs needed {threshold_closest:0.3e}).')

    return False
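# Hedged restatement of the acceptance rule above (the functional form of the
# threshold lives in _compute_threshold and is assumed, not confirmed here):
# for the candidate feature x_c and each already-accepted feature x_o
# (including the "no omitted feature" case), accept x_c if
#
#     I(x_c ; y | z) - I(x_o ; y | z) >= threshold(lambda_1, lambda_2,
#                                                  n_cols(x_c), n_cols(x_o))
#
# for at least one choice of x_o, where z is the concatenation of all
# accepted features except x_c and x_o. Intuitively, the candidate is
# accepted if it carries enough information about the target beyond what the
# existing features provide, or enough to replace some existing feature.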
def _log_failure_no_more_approaches(self):
    logger.debug('Conversion failed, and we\'re not sure why...')
def _log_error(self, approach, e):
    pretty_tb = self._get_pretty_tb()
    exc_name = type(e).__name__
    logger.debug(f'{self._tname}: '
                 f'Conversion failed during {approach.name!r} because of '
                 f'an unrecoverable error {exc_name!r}\n\n{pretty_tb}')