def fitSingleTask(self, clf, X, y, test_X, info=None, nr_class=2, opt_submit=True):
    """Fit one stacking classifier and extract its meta features / predictions.

    Parameters
    ----------
    clf : stacking classifier exposing ``train_meta_features_`` and
        ``predict_meta_features`` (e.g. mlxtend ``StackingCVClassifier``
        with ``store_train_meta_features=True``).
    X, y : pandas training features / target.
    test_X : pandas test features.
    info : dict, optional
        Submission metadata (must contain 'level' when ``opt_submit``).
        BUG FIX: was a mutable default ``info={}`` that the method mutated
        via ``info.update`` — state leaked across calls.
    nr_class : int
        Columns per base learner in the meta-feature matrix; used to pick
        each base's positive-class probability column.
    opt_submit : bool
        When True, save one submission per base learner.

    Returns
    -------
    (X_new, test_X_new, p, test_p, bases_auc) — train/test meta-feature
    matrices, train/test probability DataFrames, per-base AUC list.
    """
    info = {} if info is None else info
    clf.fit(X.values, y.values)

    X_new = clf.train_meta_features_
    p = pd.DataFrame({'TARGET': clf.predict_proba(X.values)[:, -1]}, index=X.index)
    test_X_new = clf.predict_meta_features(test_X.values)
    test_p = pd.DataFrame({'TARGET': clf.predict_proba(test_X.values)[:, -1]},
                          index=test_X.index)
    logger.info('X_meta={}, test_X_meta={}'.format(X_new.shape, test_X_new.shape))

    # positive-class probability column of each base learner
    counter = [i for i in range(1, X_new.shape[1], nr_class)]
    bases_auc = [roc_auc_score(y, X_new[:, i]) for i in counter]

    if opt_submit:
        l = info['level'] - 1
        info.update({'feature_num': X.shape[1]})
        for i, s in zip(counter, bases_auc):
            # BUG FIX: use a dedicated frame so the returned train
            # predictions ``p`` are not clobbered by per-base submissions.
            base_p = pd.DataFrame({'TARGET': test_X_new[:, i]}, index=test_X.index)
            info.update({'level': l, 'score': s})
            self.saveSubmit(info, base_p, template=filename_submit_mlxtend_base)

    return X_new, test_X_new, p, test_p, bases_auc
def saveSubmit(self, file_stem, preds, template):
    """Write the prediction DataFrame *preds* to a CSV submission file.

    The destination path is built from *template* using the output
    location, the run prefix, and a stem composed from *file_stem*.
    """
    stem = ComposeResultName(file_stem)
    destination = template.format(loc=self.output_loc,
                                  prefix=self.prefix,
                                  stem=stem)
    logger.info('Save predictions to {}'.format(destination))
    preds.to_csv(destination)
def main(argc, argv):
    """Entry point: parse command-line arguments, then run the pipeline."""
    logger.info('reading arguments')
    parsed = parse_command_line()
    logger.info('starting to compute')
    compute(parsed)
def set_model(self, m, params):
    """Instantiate model class *m* configured with *params*.

    *params* is first passed through ``SwitchDevice`` to toggle GPU
    settings.  If every key is a known constructor parameter, a default
    instance is configured via ``set_params`` (scikit-learn convention);
    otherwise the kwargs are handed straight to the constructor.

    Returns the configured model instance.
    """
    params = SwitchDevice(params, enable_gpu=self.enable_gpu)
    available_params = m().get_params()  # typo fix: was 'availabe_params'
    # generator instead of a throwaway list; iterating the dict yields keys
    if any(k not in available_params for k in params):
        ret = m(**params)
    else:
        # all keys appear in get_params(), so set_params() is safe
        ret = m().set_params(**params)
    logger.info('set {}'.format(ret))
    return ret
def loadExternalMeta(self, configs):
    """Load externally computed meta features (OOF predictions) and rank them.

    Each HDF store is expected to follow::

        preds = {
            'train_oof' : oof_preds_df,
            'test_oof' : sub_preds_df,
            'test_full' : test_preds_full,
            'feature_importance': feature_importance_df
        }

    For every config key the listed files' OOF predictions are averaged per
    sample and converted to percentile ranks; an additional fixed 'external'
    store is rank-transformed column-wise when present.

    Returns
    -------
    (self.X_meta, self.test_X_meta) : concatenated train/test meta DataFrames.
    """
    self.X_meta = list()
    self.test_X_meta = list()

    def func(x, by, name):
        # mean probability per sample, expressed as a percentile rank
        return x.groupby(by)['PROBA'].mean().rank(pct=True).rename(name)

    for k, v in configs.items():
        Xs, test_Xs = list(), list()
        for f in v:
            # BUG FIX: template had no {filename} placeholder, so the
            # ``filename=f`` argument was silently ignored and every
            # iteration resolved to the same path.
            ret = self.data_io_manager.loadHDF(
                '{loc}/{filename}'.format(loc=self.input_loc, filename=f))
            if not ret:
                continue
            Xs.append(ret.get('train_oof', pd.DataFrame()))
            test_Xs.append(ret.get('test_oof', pd.DataFrame()))
        X = pd.concat(Xs, axis=0)
        test_X = pd.concat(test_Xs, axis=0)
        X = func(X.reset_index(), X.index.name, k)
        test_X = func(test_X.reset_index(), test_X.index.name, k)
        self.X_meta.append(X)
        self.test_X_meta.append(test_X)

    filename = filename_mlxtend_meta_features_external.format(
        loc=self.input_loc, prefix=self.prefix)
    ret = self.data_io_manager.loadHDF(filename)
    if ret:
        df = ret.get('train_meta', pd.DataFrame()).apply(lambda x: x.rank(pct=True))
        self.X_meta.append(df)
        df = ret.get('test_meta', pd.DataFrame()).apply(lambda x: x.rank(pct=True))
        self.test_X_meta.append(df)

    self.X_meta = pd.concat(self.X_meta, axis=1)
    self.test_X_meta = pd.concat(self.test_X_meta, axis=1)
    logger.info('Load Meta {}, {}'.format(self.X_meta.shape, self.test_X_meta.shape))
    return self.X_meta, self.test_X_meta
def fit_predict(self, X, y, test_X, seed=42):
    """Fit every configured meta-level stacker on the processed meta
    features, saving its submission and a meta-feature snapshot.

    ``self.X_meta`` / ``self.test_X_meta`` are re-processed for each
    stacker and re-aligned to the sample indices of *X* / *test_X*.
    """
    for clf, info in zip(self.meta_clfs, self.meta_clfs_info):
        name = info['model']
        logger.info('fitting meta stackers {}'.format(name))
        np.random.seed(info.get('seed', seed))
        X = self._process_meta_features(self.X_meta, gamma=None).reindex(X.index)
        test_X = self._process_meta_features(self.test_X_meta, gamma=None).reindex(test_X.index)
        logger.info('processed for X_meta: {}, {}'.format(X.shape, test_X.shape))
        X_new, test_X_new, p, test_p, scores = self.fitSingleTask(
            clf, X, y, test_X, info=info.copy())
        info.update({'feature_num': X_new.shape[1], 'score': max(scores)})
        self.saveSubmit(info, test_p, template=filename_submit_mlxtend_meta)
        self.saveMetaFeatures(info,
                              {'train_meta': X, 'test_meta': test_X},
                              stacker_level=True)
def _create_model_object(self, model, parameters, task, model_zoo):
    """Return a configured model instance.

    Resolution order for *parameters*: model-zoo defaults for *task*,
    overridden by exported HPO results when the zoo entry names one.
    A string-valued ``base_estimator`` is looked up in the zoo and
    instantiated into a real sub-model before the final ``set_model`` call.
    """
    # TODO: enable GPU assist
    if task in model_zoo.keys():
        parameters = model_zoo[task].get('params', {})
        logger.info('load parameters {} from model zoo: {}'.format(task, parameters))
        exported = model_zoo[task].get('task', None)
        if exported:
            hpo_file = filename_hpo_external.format(loc=self.params_loc,
                                                    prefix=self.prefix,
                                                    task=exported)
            if CheckFileExist(hpo_file):
                parameters = LoadPickle(hpo_file)
                logger.info('Update {} from {}'.format(exported, hpo_file))
    base_name = parameters.get('base_estimator', None)
    if isinstance(base_name, str) and base_name in model_zoo.keys():
        base_params = model_zoo[base_name].get('params', {})
        base_cls = model_zoo[base_name].get('model', None)
        logger.info('override parameters {} from model zoo: {}'.format(base_name, base_params))
        parameters['base_estimator'] = self.set_model(base_cls, base_params)
    return self.set_model(model, parameters)
def saveMetaFeatures(self, file_stem, data, stacker_level=False):
    """Persist meta features twice: once under a stem-specific filename and
    once at the fixed 'external' location consumed by later stages.

    *stacker_level* selects the stacker-level export template instead of
    the meta-feature one.
    """
    stem = ComposeResultName(file_stem)
    filename = filename_mlxtend_meta_features.format(loc=self.input_loc,
                                                     prefix=self.prefix,
                                                     stem=stem)
    logger.info('Save meta features to {}'.format(filename))
    self.data_io_manager.saveHDF(filename, data, opt_overwrite=True, opt_fast=False)

    export_template = (filename_mlxtend_stacker_external
                       if stacker_level
                       else filename_mlxtend_meta_features_external)
    filename = export_template.format(loc=self.input_loc, prefix=self.prefix)
    logger.info('export meta features to {}'.format(filename))
    self.data_io_manager.saveHDF(filename, data, opt_overwrite=True, opt_fast=False)
def fit_transform(self, X, y, test_X, seed=42):
    """Fit every first-level stacker and accumulate its OOF meta features.

    NaNs are neutralized up front; each stacker's train/test meta features
    are appended to ``self.X_meta`` / ``self.test_X_meta`` under columns
    named ``<model>_<j>``.  Submissions are saved per stacker and the final
    concatenated matrices are persisted via ``saveMetaFeatures``.
    """
    X = X.apply(lambda col: np.nan_to_num(col))
    test_X = test_X.apply(lambda col: np.nan_to_num(col))
    for clf, info in zip(self.clfs, self.clfs_info):
        name = info['model']
        logger.info('fit meta feature source: {}'.format(name))
        np.random.seed(info.get('seed', seed))
        train_meta, test_meta, p, test_p, scores = self.fitSingleTask(
            clf, X, y, test_X, info=info.copy())
        info.update({'feature_num': train_meta.shape[1], 'score': max(scores)})
        self.saveSubmit(info, test_p, template=filename_submit_mlxtend_meta)
        columns = ['{}_{}'.format(name, j) for j in range(train_meta.shape[1])]
        self.X_meta.append(pd.DataFrame(train_meta, index=X.index, columns=columns))
        self.test_X_meta.append(pd.DataFrame(test_meta, index=test_X.index, columns=columns))
    X = pd.concat(self.X_meta, axis=1)
    test_X = pd.concat(self.test_X_meta, axis=1)
    logger.info('transform meta feature for X={}, test_X={}'.format(X.shape, test_X.shape))
    # NOTE: ``info`` here is the last stacker's metadata (original behavior)
    self.saveMetaFeatures(info, {'train_meta': X, 'test_meta': test_X})
def buildMetaFeatures(self, model_zoo):
    """Create one ``StackingCVClassifier`` per meta-feature config entry.

    Each entry names its base learners (``sources``) and a meta classifier;
    both are instantiated through ``_create_model_object`` so zoo defaults
    and exported HPO parameters are applied.  The classifiers land in
    ``self.clfs`` with matching submit metadata in ``self.clfs_info``.
    """
    for cfg in self.meta_feature_configs:
        name = cfg.get('name', 'foobar')
        use_features_in_secondary = cfg.get('use_features', True)
        stratify = cfg.get('stratify', True)
        nr_folds = cfg.get('cv', 3)
        seed = cfg.get('seed', 42)
        bases = [model_zoo.get(c) for c in cfg['sources']]
        # BUG FIX (readability/hazard): the comprehension variable used to
        # shadow the outer loop variable ``clf``; renamed to ``base``.
        base_classifiers = [self._create_model_object(base['model'],
                                                      base.get('params', dict()),
                                                      base.get('task', None),
                                                      model_zoo)
                            for base in bases]
        logger.info('create meta feature extractor')
        meta_classifier = self._create_model_object(cfg['meta_classifier'],
                                                    cfg.get('params', dict()),
                                                    cfg.get('task', None),
                                                    model_zoo)
        self.clfs.append(
            StackingCVClassifier(base_classifiers,
                                 meta_classifier,
                                 use_probas=True,
                                 cv=nr_folds,
                                 use_features_in_secondary=use_features_in_secondary,
                                 stratify=stratify,
                                 store_train_meta_features=True,
                                 use_clones=True))
        self.clfs_info.append(
            self._set_submit_filename(level=1, name=name,
                                      feature_num=None, score=None,
                                      nr_fold=nr_folds, seed=seed))
        logger.info('Read in on {} base learners for {}'.format(len(bases), name))
    logger.info('Read in {} meta feature extractors'.format(len(self.clfs)))
def parse_command_line():
    """Build and parse the CLI arguments for the modeling pipeline.

    Returns the parsed ``argparse.Namespace``.  In ``--debug`` mode the
    model and stacker config paths are swapped for lightweight debug
    versions, and ``--select-hpo`` is split into a list of task names.
    """
    default_cache_prefix = 'sample'
    configs_loc = file_dir_path.get('configs', './configs')
    default_data_configs_path = '{}/SampleDataConfigs.py'.format(configs_loc)
    default_model_configs_path = '{}/SampleModelConfigs.py'.format(configs_loc)
    default_stacker_configs_path = '{}/SampleStackerConfigs.py'.format(configs_loc)
    default_select_to_hpo = None
    default_feature_score_cutoff = 10.

    parser = argparse.ArgumentParser(
        description='Home Credit Default Risk Modeler',
        add_help=True,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-a', '--cache-prefix', type=str,
                        default=default_cache_prefix,
                        help='specify cache file prefix')  # typo fix: 'specifiy'
    parser.add_argument('-d', '--configs-data', type=str,
                        default=default_data_configs_path,
                        help='path to data configs')
    parser.add_argument('-m', '--configs-model', type=str,
                        default=default_model_configs_path,
                        help='path to model configs')
    parser.add_argument('-s', '--configs-stacker', type=str,
                        default=default_stacker_configs_path,
                        help='path to stacker configs')
    parser.add_argument('-t', '--select-hpo', type=str,
                        default=default_select_to_hpo,
                        help='hpo on selected models')
    parser.add_argument('--cutoff-score', type=float,
                        default=default_feature_score_cutoff,
                        help='cutoff to remove unimportant features')
    parser.add_argument('-c', '--cull_features', action='store_true',
                        default=False, help='cull features')
    parser.add_argument('--enable-gpu', action='store_true',
                        default=False, help='compute using gpu')
    parser.add_argument('--refresh-cache', action='store_true',
                        default=False, help='refresh cache by data configs')
    parser.add_argument('--refresh-meta', action='store_true',
                        default=False, help='refresh constructed meta features')
    parser.add_argument('--compute-hpo', action='store_true',
                        default=False, help='hpo')
    parser.add_argument('--compute-stack', action='store_true',
                        default=False, help='stacking')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='debug mode using 20000 samples')  # typo fix: 'moode'

    args = parser.parse_args()
    logger.info('running task with prefix={}'.format(args.cache_prefix))
    if args.enable_gpu:
        logger.info('enable GPU computing in hyperparameters')
    if args.cull_features:
        # message fix: was 'cull feature features scores under {}'
        logger.info('cull features with scores under {}'.format(args.cutoff_score))
    if args.select_hpo:
        args.select_hpo = args.select_hpo.split(',')
    if args.debug:
        logger.warning('**Debug Mode**')
        args.configs_model = '{}/DebugModelConfigs.py'.format(configs_loc)
        args.configs_stacker = '{}/DebugStackerConfigs.py'.format(configs_loc)
    return args
def compute(args):
    """Run the modeling pipeline as selected by *args*.

    Steps: load data configs and train/test data (optionally refreshing the
    cache), optionally cull low-importance features, optionally run HPO over
    the configured models, and optionally run the stacking stage.
    """
    # loading configs
    DataConfigs = InitializeConfigs(args.configs_data).DataConfigs
    if args.compute_hpo:
        ModelConfigs = InitializeConfigs(args.configs_model).ModelConfigs
    if args.compute_stack:
        StackerConfigs = InitializeConfigs(args.configs_stacker).StackerConfigs
        BaseModelZoo = InitializeConfigs(args.configs_stacker).BaseModelConfigs
        ExtMetaConfigs = InitializeConfigs(args.configs_stacker).ExternalMetaConfigs

    dp = DataProvider(IOConfigs=file_dir_path)
    if args.refresh_cache:
        data = dp.LoadData(DataConfigs, source='from_processed', prefix=args.cache_prefix)
    else:
        data = dp.LoadData(DataConfigs, source='from_train_test', prefix=args.cache_prefix)
    train_x, train_y, test_x, test_y = data

    if args.cull_features:
        # a bit feature selection
        f_path = InitializeConfigs(args.configs_model).fileFeatureImportance
        featSel = FeatureImportance()
        featSel.LoadResult(f_path)
        blacklist = featSel.GetBlacklist(args.cutoff_score)
        train_x = featSel.CullFeatures(train_x, blacklist)
        test_x = featSel.CullFeatures(test_x, blacklist)

    if args.debug:
        train_x = train_x.iloc[:20000]
        train_y = train_y.iloc[:20000]
        logger.warning('debug mode: x={}'.format(train_x.shape))
        args.cache_prefix = 'debug'

    logger.info('P/N ratio:\n{}'.format(train_y.value_counts(normalize=True)))

    if args.compute_hpo:
        logger.info('load hpo configs of {} models'.format(len(ModelConfigs)))
        if args.select_hpo:
            ModelConfigs = {
                k: v
                for k, v in ModelConfigs.items() if k in args.select_hpo
            }
            logger.info('compute hpo for selected {} models'.format(len(ModelConfigs)))
        for k, v in ModelConfigs.items():
            try:
                model = v.get("model")
                hpo_range = v.get("hyperparameter_optimization")
                init = hpo_range.get('initialize', {})
                # NOTE(review): was spelled ``SwithDevice`` here while
                # set_model() calls ``SwitchDevice`` — assuming the latter
                # is the defined helper; confirm against the utils module.
                hpo_range.update({
                    'initialize': SwitchDevice(init, enable_gpu=args.enable_gpu)
                })
                hpo_search = ScikitOptimize(model, hpo_range,
                                            task_name='{}'.format(k),
                                            data_prefix=args.cache_prefix)
                hpo_search.search(train_x, train_y)
                hpo_search.save_hyperparameters(export=True)
                # TODO: fine tune model
            except Exception:
                # BUG FIX: the bare ``except:`` hid real failures, and the
                # old handler itself crashed — ``'{}'.format(task_name=...)``
                # raises IndexError (positional field, keyword-only arg).
                logger.exception('Errors in optimizing {}'.format(k))

    if args.compute_stack:
        stackers = AutoStacker(StackerConfigs, args.enable_gpu,
                               data_prefix=args.cache_prefix)
        if args.refresh_meta:
            stackers.buildMetaFeatures(BaseModelZoo)
            stackers.fit_transform(train_x, train_y, test_x, seed=42)
        else:
            stackers.loadExternalMeta(ExtMetaConfigs)
        # NOTE(review): original indentation was lost; assuming the meta
        # classifiers are built and fitted after either branch (both paths
        # populate the meta features fit_predict consumes) — confirm.
        stackers.buildMetaClassifiers(BaseModelZoo)
        stackers.fit_predict(train_x, train_y, test_x, seed=538)
    return