    def fitSingleTask(self, clf, X, y, test_X, info=None, nr_class=2, opt_submit=True):
        # avoid a mutable default argument
        info = info if info is not None else {}
        clf.fit(X.values, y.values)
        X_new = clf.train_meta_features_
        p = pd.DataFrame({'TARGET': clf.predict_proba(X.values)[:, -1]},
                          index=X.index)

        test_X_new = clf.predict_meta_features(test_X.values)
        test_p = pd.DataFrame({'TARGET': clf.predict_proba(test_X.values)[:, -1]},
                               index=test_X.index)

        logger.info('X_meta={}, test_X_meta={}'.format(X_new.shape, test_X_new.shape))
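        # NOTE: with StackingCVClassifier(use_probas=True), the meta feature
        # matrix holds nr_class probability columns per base learner, grouped
        # learner by learner; for a binary target the layout is
        #   [clf0 P(y=0), clf0 P(y=1), clf1 P(y=0), clf1 P(y=1), ...],
        # so range(1, X_new.shape[1], nr_class) below picks each learner's
        # positive-class column for per-base AUC scores and submissions.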
        counter   = list(range(1, X_new.shape[1], nr_class))
        bases_auc = [roc_auc_score(y, X_new[:, i]) for i in counter]
        if opt_submit:
            base_level = info['level'] - 1
            info.update({'feature_num': X.shape[1]})
            for i, s in zip(counter, bases_auc):
                # use a distinct name so the train predictions `p` returned
                # below are not overwritten by each base learner's submission
                base_p = pd.DataFrame({'TARGET': test_X_new[:, i]}, index=test_X.index)
                info.update({'level': base_level, 'score': s})
                self.saveSubmit(info,
                                base_p,
                                template=filename_submit_mlxtend_base)

        return X_new, test_X_new, p, test_p, bases_auc
    def saveSubmit(self, file_stem, preds, template):
        stem = ComposeResultName(file_stem)
        filename = template.format(loc=self.output_loc,
                                   prefix=self.prefix,
                                   stem=stem)
        logger.info('Save predictions to {}'.format(filename))
        preds.to_csv(filename)
def main(argc, argv):
    logger.info('reading arguments')
    args = parse_command_line()

    logger.info('starting to compute')
    compute(args)

    return
    def set_model(self, m, params):
        params = SwitchDevice(params, enable_gpu=self.enable_gpu)

        available_params = m().get_params()
        if any(k not in available_params for k in params.keys()):
            ret = m(**params)
        else:  # every parameter exists in get_params(), so set_params() is safe
            ret = m().set_params(**params)

        logger.info('set {}'.format(ret))
        return ret
    def loadExternalMeta(self, configs):
        """
        preds = {
            'train_oof' : oof_preds_df,
            'test_oof'  : sub_preds_df,
            'test_full' : test_preds_full,
            'feature_importance': feature_importance_df
        }
        """

        self.X_meta = list()
        self.test_X_meta = list()

        def func(x, by, name):
            return x.groupby(by)['PROBA'].mean().rank(pct=True).rename(name)

        for k, v in configs.items():
            Xs, test_Xs = list(), list()
            for f in v:
                ret = self.data_io_manager.loadHDF('{loc}/{filename}'.format(loc=self.input_loc, filename=f))

                if not ret:
                    continue

                Xs.append(ret.get('train_oof', pd.DataFrame()))
                test_Xs.append(ret.get('test_oof', pd.DataFrame()))

            X      = pd.concat(Xs, axis=0)
            test_X = pd.concat(test_Xs, axis=0)

            X      = func(X.reset_index(), X.index.name, k)
            test_X = func(test_X.reset_index(), test_X.index.name, k)

            self.X_meta.append(X)
            self.test_X_meta.append(test_X)

        filename = filename_mlxtend_meta_features_external.format(loc=self.input_loc, prefix=self.prefix)
        ret = self.data_io_manager.loadHDF(filename)
        if ret:
            df = ret.get('train_meta', pd.DataFrame()).apply(lambda x: x.rank(pct=True))
            self.X_meta.append(df)
            df = ret.get('test_meta', pd.DataFrame()).apply(lambda x: x.rank(pct=True))
            self.test_X_meta.append(df)

        self.X_meta = pd.concat(self.X_meta, axis=1)
        self.test_X_meta = pd.concat(self.test_X_meta, axis=1)
        logger.info('Load Meta {}, {}'.format(self.X_meta.shape, self.test_X_meta.shape))
        return self.X_meta, self.test_X_meta
    def fit_predict(self, X, y, test_X, seed=42):
        for i, (clf, info) in enumerate(zip(self.meta_clfs, self.meta_clfs_info), 1):
            name = info['model']
            logger.info('fitting meta stackers {}'.format(name))
            np.random.seed(info.get('seed', seed))

            X      = self._process_meta_features(self.X_meta, gamma=None).reindex(X.index)
            test_X = self._process_meta_features(self.test_X_meta, gamma=None).reindex(test_X.index)
            logger.info('processed for X_meta: {}, {}'.format(X.shape, test_X.shape))
            X_new, test_X_new, p, test_p, scores = self.fitSingleTask(clf, X, y, test_X, info=info.copy())
            info.update({'feature_num': X_new.shape[1], 'score': max(scores)})
            self.saveSubmit(info,
                            test_p,
                            template=filename_submit_mlxtend_meta)

            self.saveMetaFeatures(info, {'train_meta': X, 'test_meta': test_X}, stacker_level=True)
    def _create_model_object(self, model, parameters, task, model_zoo):
        # TODO: enable GPU assist
        if task in model_zoo.keys():
            parameters = model_zoo[task].get('params', {})
            logger.info('load parameters {} from model zoo: {}'.format(task, parameters))

            hpo_export = model_zoo[task].get('task', None)
            if hpo_export:
                filename = filename_hpo_external.format(loc=self.params_loc,
                                                        prefix=self.prefix,
                                                        task=hpo_export)
                if CheckFileExist(filename):
                    parameters = LoadPickle(filename)
                    logger.info('Update {} from {}'.format(hpo_export, filename))


        base_name = parameters.get('base_estimator', None)
        if isinstance(base_name, str) and base_name in model_zoo:
            params = model_zoo[base_name].get('params', {})
            sub_model = model_zoo[base_name].get('model', None)
            logger.info('override parameters {} from model zoo: {}'.format(base_name, params))
            parameters['base_estimator'] = self.set_model(sub_model, params)

        return self.set_model(model, parameters)
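    # Illustrative (assumed) shape of a model zoo (BaseModelConfigs) entry,
    # inferred from the keys _create_model_object() reads; the estimator class
    # and values are hypothetical:
    # BaseModelConfigs = {
    #     'lgbm_base': {
    #         'model' : LGBMClassifier,         # estimator class handed to set_model()
    #         'params': {'n_estimators': 100},  # may name a 'base_estimator' zoo key
    #         'task'  : 'lgbm_hpo',             # optional exported HPO task to load
    #     },
    # }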
    def saveMetaFeatures(self, file_stem, data, stacker_level=False):
        stem = ComposeResultName(file_stem)
        filename = filename_mlxtend_meta_features.format(loc=self.input_loc,
                                                         prefix=self.prefix,
                                                         stem=stem)

        logger.info('Save meta features to {}'.format(filename))
        self.data_io_manager.saveHDF(filename,
                                     data,
                                     opt_overwrite=True,
                                     opt_fast=False)

        if stacker_level:
            filename = filename_mlxtend_stacker_external.format(loc=self.input_loc,
                                                                prefix=self.prefix)
        else:
            filename = filename_mlxtend_meta_features_external.format(loc=self.input_loc,
                                                                      prefix=self.prefix)

        logger.info('export meta features to {}'.format(filename))
        self.data_io_manager.saveHDF(filename,
                                     data,
                                     opt_overwrite=True,
                                     opt_fast=False)
    def fit_transform(self, X, y, test_X, seed=42):

        # replace NaN with 0 and +/-inf with large finite values
        X = X.apply(np.nan_to_num)
        test_X = test_X.apply(np.nan_to_num)

        for i, (clf, info) in enumerate(zip(self.clfs, self.clfs_info), 1):
            name = info['model']
            logger.info('fit meta feature source: {}'.format(name))
            np.random.seed(info.get('seed', seed))

            X_new, test_X_new, p, test_p, scores = self.fitSingleTask(clf, X, y, test_X, info=info.copy())
            info.update({'feature_num': X_new.shape[1], 'score': max(scores)})
            self.saveSubmit(info,
                            test_p,
                            template=filename_submit_mlxtend_meta)

            columns = ['{}_{}'.format(name, j) for j in range(X_new.shape[1])]
            self.X_meta.append(pd.DataFrame(X_new, index=X.index, columns=columns))
            self.test_X_meta.append(pd.DataFrame(test_X_new, index=test_X.index, columns=columns))

        X      = pd.concat(self.X_meta, axis=1)
        test_X = pd.concat(self.test_X_meta, axis=1)
        logger.info('transform meta feature for X={}, test_X={}'.format(X.shape, test_X.shape))
        self.saveMetaFeatures(info, {'train_meta': X, 'test_meta': test_X})
    def buildMetaFeatures(self, model_zoo):
        for clf in self.meta_feature_configs:
            name                      = clf.get('name', 'foobar')
            use_features_in_secondary = clf.get('use_features', True)
            stratify                  = clf.get('stratify', True)
            nr_folds                  = clf.get('cv', 3)
            seed                      = clf.get('seed', 42)

            bases = [model_zoo.get(c) for c in clf['sources']]
            # use a distinct loop name so the outer config dict `clf` is not shadowed
            base_classifiers = [self._create_model_object(base['model'],
                                                          base.get('params', dict()),
                                                          base.get('task', None),
                                                          model_zoo) for base in bases]

            logger.info('create meta feature extractor')
            self.clfs.append(StackingCVClassifier(
                base_classifiers,
                self._create_model_object(clf['meta_classifier'],
                                          clf.get('params', dict()),
                                          clf.get('task', None),
                                          model_zoo),
                use_probas=True,
                cv=nr_folds,
                use_features_in_secondary=use_features_in_secondary,
                stratify=stratify,
                store_train_meta_features=True,
                use_clones=True))
            self.clfs_info.append(self._set_submit_filename(level=1,
                                                            name=name,
                                                            feature_num=None,
                                                            score=None,
                                                            nr_fold=nr_folds,
                                                            seed=seed)
            )
            logger.info('Read in {} base learners for {}'.format(len(bases), name))

        logger.info('Read in {} meta feature extractors'.format(len(self.clfs)))
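
    # Illustrative (assumed) shape of one meta_feature_configs entry, inferred
    # from the keys buildMetaFeatures() reads; the classifier is hypothetical:
    # {
    #     'name'           : 'stack_l1',
    #     'sources'        : ['lgbm_base', 'xgb_base'],  # keys into model_zoo
    #     'meta_classifier': LogisticRegression,
    #     'params'         : {},
    #     'task'           : None,
    #     'use_features'   : True,
    #     'stratify'       : True,
    #     'cv'             : 3,
    #     'seed'           : 42,
    # }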
def parse_command_line():

    default_cache_prefix = 'sample'

    params_loc = file_dir_path.get('params', './params')
    configs_loc = file_dir_path.get('configs', './configs')
    default_data_configs_path = '{}/SampleDataConfigs.py'.format(configs_loc)
    default_model_configs_path = '{}/SampleModelConfigs.py'.format(configs_loc)
    default_stacker_configs_path = '{}/SampleStackerConfigs.py'.format(
        configs_loc)
    default_select_to_hpo = None
    default_feature_score_cutoff = 10.

    parser = argparse.ArgumentParser(
        description='Home Credit Default Risk Modeler',
        add_help=True,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-a',
                        '--cache-prefix',
                        type=str,
                        default=default_cache_prefix,
                        help='specify cache file prefix')
    parser.add_argument('-d',
                        '--configs-data',
                        type=str,
                        default=default_data_configs_path,
                        help='path to data configs')
    parser.add_argument('-m',
                        '--configs-model',
                        type=str,
                        default=default_model_configs_path,
                        help='path to model configs')
    parser.add_argument('-s',
                        '--configs-stacker',
                        type=str,
                        default=default_stacker_configs_path,
                        help='path to stacker configs')
    parser.add_argument('-t',
                        '--select-hpo',
                        type=str,
                        default=default_select_to_hpo,
                        help='hpo on selected models')
    parser.add_argument('--cutoff-score',
                        type=float,
                        default=default_feature_score_cutoff,
                        help='cutoff to remove unimportant features')
    parser.add_argument('-c',
                        '--cull-features',
                        action='store_true',
                        default=False,
                        help='remove features scoring below --cutoff-score')
    parser.add_argument('--enable-gpu',
                        action='store_true',
                        default=False,
                        help='compute using gpu')
    parser.add_argument('--refresh-cache',
                        action='store_true',
                        default=False,
                        help='refresh cache by data configs')
    parser.add_argument('--refresh-meta',
                        action='store_true',
                        default=False,
                        help='refresh constructed meta features')
    parser.add_argument('--compute-hpo',
                        action='store_true',
                        default=False,
                        help='run hyperparameter optimization')
    parser.add_argument('--compute-stack',
                        action='store_true',
                        default=False,
                        help='run stacking')
    parser.add_argument('--debug',
                        action='store_true',
                        default=False,
                        help='debug mode using 20000 samples')

    args = parser.parse_args()

    logger.info('running task with prefix={}'.format(args.cache_prefix))

    if args.enable_gpu:
        logger.info('enable GPU computing in hyperparameters')

    if args.cull_features:
        logger.info('cull features with scores under {}'.format(
            args.cutoff_score))

    if args.select_hpo:
        args.select_hpo = args.select_hpo.split(',')

    if args.debug:
        logger.warning('**Debug Mode**')
        args.configs_model = '{}/DebugModelConfigs.py'.format(configs_loc)
        args.configs_stacker = '{}/DebugStackerConfigs.py'.format(configs_loc)

    return args
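
# Example invocation (illustrative; the script name and config paths are
# hypothetical and depend on the project layout):
#   python run_stacker.py --cache-prefix sample \
#       --configs-stacker configs/SampleStackerConfigs.py \
#       --compute-stack --refresh-meta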
def compute(args):

    # loading configs
    DataConfigs = InitializeConfigs(args.configs_data).DataConfigs
    if args.compute_hpo:
        ModelConfigs = InitializeConfigs(args.configs_model).ModelConfigs
    if args.compute_stack:
        StackerConfigs = InitializeConfigs(args.configs_stacker).StackerConfigs
        BaseModelZoo = InitializeConfigs(args.configs_stacker).BaseModelConfigs
        ExtMetaConfigs = InitializeConfigs(
            args.configs_stacker).ExternalMetaConfigs

    dp = DataProvider(IOConfigs=file_dir_path)
    if args.refresh_cache:
        data = dp.LoadData(DataConfigs,
                           source='from_processed',
                           prefix=args.cache_prefix)
    else:
        data = dp.LoadData(DataConfigs,
                           source='from_train_test',
                           prefix=args.cache_prefix)

    train_x, train_y, test_x, test_y = data

    if args.cull_features:  # light feature selection
        f_path = InitializeConfigs(args.configs_model).fileFeatureImportance
        featSel = FeatureImportance()
        featSel.LoadResult(f_path)
        blacklist = featSel.GetBlacklist(args.cutoff_score)
        train_x = featSel.CullFeatures(train_x, blacklist)
        test_x = featSel.CullFeatures(test_x, blacklist)

    if args.debug:
        train_x = train_x.iloc[:20000]
        train_y = train_y.iloc[:20000]
        logger.warning('debug mode: x={}'.format(train_x.shape))
        args.cache_prefix = 'debug'
    logger.info('P/N ratio:\n{}'.format(train_y.value_counts(normalize=True)))

    if args.compute_hpo:
        logger.info('load hpo configs of {} models'.format(len(ModelConfigs)))
        if args.select_hpo:
            ModelConfigs = {
                k: v
                for k, v in ModelConfigs.items() if k in args.select_hpo
            }
            logger.info('compute hpo for selected {} models'.format(
                len(ModelConfigs)))

        for k, v in ModelConfigs.items():
            try:
                model = v.get("model")
                hpo_range = v.get("hyperparameter_optimization")
                init = hpo_range.get('initialize', {})
                hpo_range.update({
                    'initialize':
                    SwitchDevice(init, enable_gpu=args.enable_gpu)
                })
                hpo_search = ScikitOptimize(model,
                                            hpo_range,
                                            task_name='{}'.format(k),
                                            data_prefix=args.cache_prefix)
                hpo_search.search(train_x, train_y)
                hpo_search.save_hyperparameters(export=True)
                # TODO: fine tune model
            except Exception:
                logger.exception('Errors in optimizing {}'.format(k))

    if args.compute_stack:
        stackers = AutoStacker(StackerConfigs,
                               args.enable_gpu,
                               data_prefix=args.cache_prefix)

        if args.refresh_meta:
            stackers.buildMetaFeatures(BaseModelZoo)
            stackers.fit_transform(train_x, train_y, test_x, seed=42)

        else:
            stackers.loadExternalMeta(ExtMetaConfigs)
            stackers.buildMetaClassifiers(BaseModelZoo)
            stackers.fit_predict(train_x, train_y, test_x, seed=538)

    return
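

# Assumed module entry point (not part of the original snippet); note that
# main() currently ignores its argc/argv arguments and re-parses sys.argv
# through parse_command_line().
if __name__ == '__main__':
    import sys
    main(len(sys.argv), sys.argv)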