Example #1
def CheckFileExist(filename, silent=True):
    if not os.path.exists(filename):
        if not silent:
            logger.warning('{} does not exist'.format(filename))

        return False

    return True
Example #2
    def load_hyperparameters(self, filename):
        if not CheckFileExist(filename, silent=False):
            logger.warning('no hpo parameters loaded from {}'.format(filename))
            return {}

        with open(filename, 'rb') as f:
            params = pickle.load(f)
            logger.info('load from {} with params:{}'.format(filename, params))
            return params
Example #3
    def _evaluate(self, eval_params):
        eval_params = dict(zip(self.eval_params_name, eval_params))
        tuning_params = self.init_params.copy()
        tuning_params.update(eval_params)

        # reinitialize cv
        cv_obj = self.nr_fold
        if self.valid_type == 'TimeSeriesSplit':
            cv_obj = model_selection_object[self.valid_type](
                n_splits=self.nr_fold)
        elif 'KFold' in self.valid_type:
            cv_obj = model_selection_object[self.valid_type](
                n_splits=self.nr_fold,
                shuffle=True,
                random_state=self.split_seed)

        if self.set_params_safe:
            try:
                m = self.model().set_params(**tuning_params)
            except Exception:
                logger.warning('failed to use set_params')
                m = self.model(**tuning_params)
                logger.warning('model params={}'.format(m.get_params()))
        else:  # some parameters cannot be passed through set_params()
            m = self.model(**tuning_params)

        score = np.mean(
            cross_val_score(m,
                            self.X,
                            self.y,
                            cv=cv_obj,
                            n_jobs=1,
                            scoring=self.metric))

        self.nr_iteration += 1
        self.best_score = max(self.best_score, score)

        # save the current best parameters here
        if self.best_score == score:
            # update new result
            self.filestem_meta.update({'score': score})
            self.optimized_params = tuning_params.copy()
            if self.nr_iteration >= self.n_init_points:
                self.save_hyperparameters(show_iter=True)
            else:
                self.save_hyperparameters(show_iter=False)

        if self.nr_iteration == self.n_init_points:  # save after initializing
            self.save_hyperparameters(show_iter=False)

        logger.info(
            'iteration {:04d}/{:04d}, current score: {:.4f}, best: {:.4f}, current params: {}, best params: {}'
            .format(self.nr_iteration, self.n_calls, score, self.best_score,
                    tuning_params, self.optimized_params))

        return -score  # negate for the minimizer; most scikit-learn metrics are larger-is-better
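
A minimal sketch (an assumption, not the original driver code) of how an objective like _evaluate can be wired into scikit-optimize: gp_minimize minimizes the returned value, which is why the score is negated. The dimension names below are placeholders and must line up with self.eval_params_name.

from skopt import gp_minimize
from skopt.space import Integer, Real

# hypothetical search dimensions; order must match self.eval_params_name
search_dimensions = [
    Real(0.01, 0.3, name='learning_rate'),
    Integer(3, 12, name='max_depth'),
]

def run_search(optimizer):
    # optimizer is an instance of the class that defines _evaluate()
    result = gp_minimize(
        func=optimizer._evaluate,            # returns -score, so minimizing works
        dimensions=search_dimensions,
        n_calls=optimizer.n_calls,
        n_initial_points=optimizer.n_init_points,
        random_state=optimizer.random_state,
    )
    return -result.fun                       # best (positive) cross-validation score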
Example #4
    def CullFeatures(self, x, blacklist=None):

        if not blacklist:
            logger.warning('empty blacklist')
            return x

        before = x.shape
        x = x[[f for f in x.columns if f not in blacklist]]
        logger.info('shrink from {} to {} by dropping {} blacklisted features'.format(before, x.shape, len(blacklist)))
        return x
Example #5
    def GetBlacklist(self, threshold=10.):

        if self.importance_series.empty:
            logger.warning('no feature')
            return list()

        logger.info('create blacklist on score <= {}'.format(threshold))
        ret = self.importance_series.loc[self.importance_series <= threshold].index.tolist()
        logger.info('return blacklist of {} from {} features'.format(len(ret), len(self.importance_series)))
        return ret
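
GetBlacklist and CullFeatures are meant to be chained: collect low-importance features, then drop them from every split. A hypothetical usage, assuming ranker is an instance of the feature-ranking class these methods belong to.

# drop features whose importance score is <= 10 (illustrative threshold)
blacklist = ranker.GetBlacklist(threshold=10.)
train_x = ranker.CullFeatures(train_x, blacklist=blacklist)
test_x = ranker.CullFeatures(test_x, blacklist=blacklist)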
Example #6
def ReturnTrainTest(configs):
    df_names = ['train_x', 'train_y', 'test_x', 'test_y']
    # fill in any missing frame with an empty DataFrame so the lookups below cannot fail
    configs.update({
        k: pd.DataFrame()
        for k in df_names if k not in configs
    })
    for df_name in [k for k in df_names if configs[k].empty]:
        logger.warning("no data loaded for {}".format(df_name))
    # return train_x, train_y, test_x, test_y
    return configs['train_x'], configs['train_y'], configs[
        'test_x'], configs['test_y']
Example #7
def AnyEmptyDataframe(data):
    if not data:
        logger.warning('passing no dataframes')
        return True

    if isinstance(data, dict):
        return any(v.empty for v in data.values())

    elif isinstance(data, list):
        return any(d.empty for d in data)

    return False
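
AnyEmptyDataframe accepts either a dict or a list of DataFrames and treats an empty container as "empty". A quick illustration (not from the source):

import pandas as pd

AnyEmptyDataframe({'train_x': pd.DataFrame({'a': [1]}), 'train_y': pd.DataFrame()})  # True
AnyEmptyDataframe([pd.DataFrame({'a': [1]})])                                        # False
AnyEmptyDataframe({})                                                                # True, nothing passed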
Example #8
def process_interaction(df, process_configs):
    """
    process configs is a dictionary as
    a dictionary with {'new_feature_name': {'mode': 'add', 'a': 'col_name', 'b':'col_name',}, }
    ------

    """
    logger.info("Process Interactions")

    possible_arithmetics = ['add', 'sum_squared',
                            'subtract', 'subtract_positive',
                            'multiply',
                            'divide', 'divide_nonzero']

    new_columns = []
    for v in process_configs:
        k = v['name']
        logger.info("process {}".format(k))

        # check arithmetic
        arithmetic = v.get('mode', None)
        if arithmetic not in possible_arithmetics:
            logger.warning("no arithmetic on {}".format(k))
            continue

        # check feature columns
        check_cols = [vv for kk, vv in v.items() if kk not in ['name', 'mode']]
        cols_exist, cols_not_exist = CheckColumnsExist(df, check_cols)
        if cols_not_exist:
            logger.warning("missing {} columns: {}".format(len(cols_not_exist), cols_not_exist))
            continue

        # process
        if 'add' == arithmetic:
            df[k] = df[v['a']] + df[v['b']]
        elif 'subtract' == arithmetic:
            df[k] = df[v['a']] - df[v['b']]
        elif 'subtract_positive' == arithmetic:
            df[k] = (df[v['a']] - df[v['b']]).apply(lambda x: x if x > 0 else 0)
        elif 'multiply' == arithmetic:
            df[k] = df[v['a']] * df[v['b']]
        elif 'divide' == arithmetic:
            df[k] = df[v['a']] / df[v['b']]
        elif 'divide_nonzero' == arithmetic:
            df[k] = df[v['a']] / (df[v['b']] + 1.)
        elif 'sum_squared' == arithmetic:
            df[k] = df[[v['a'], v['b']]].pow(2).sum(axis=1)  # equivalent to np.square(a) + np.square(b)

        new_columns.append(k)

    return df, new_columns
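
Because process_configs is a list of per-feature dicts, a concrete call helps; the columns and feature names below are illustrative, and the module-level CheckColumnsExist and logger used above are assumed to be importable alongside process_interaction.

import pandas as pd

df = pd.DataFrame({'AMT_CREDIT': [100., 200.], 'AMT_INCOME_TOTAL': [50., 0.]})
configs = [
    {'name': 'CREDIT_TO_INCOME', 'mode': 'divide_nonzero',
     'a': 'AMT_CREDIT', 'b': 'AMT_INCOME_TOTAL'},
    {'name': 'CREDIT_PLUS_INCOME', 'mode': 'add',
     'a': 'AMT_CREDIT', 'b': 'AMT_INCOME_TOTAL'},
]
df, new_columns = process_interaction(df, configs)
# new_columns == ['CREDIT_TO_INCOME', 'CREDIT_PLUS_INCOME']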
Example #9
def process_factorize(df, process_configs):
    """
    input a list of features to factorize (label encoding)
    """
    logger.info("Process Factorize")
    cols_exist, cols_not_exist = CheckColumnsExist(df, sorted(process_configs))

    for bin_feature in cols_exist:
        df[bin_feature], uniques = pd.factorize(df[bin_feature], sort=False)
        logger.info("factorize {} in {}: {}".format(len(uniques), bin_feature, uniques))

    for k in cols_not_exist:
        logger.warning("missing {}".format(k))

    return df
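
A hypothetical call, assuming process_factorize and its CheckColumnsExist helper are importable; missing columns are only logged, existing ones are label encoded in place.

import pandas as pd

df = pd.DataFrame({'CODE_GENDER': ['M', 'F', 'F'], 'FLAG_OWN_CAR': ['Y', 'N', 'Y']})
df = process_factorize(df, ['CODE_GENDER', 'FLAG_OWN_CAR', 'NOT_A_COLUMN'])
# CODE_GENDER -> [0, 1, 1], FLAG_OWN_CAR -> [0, 1, 0]; NOT_A_COLUMN triggers a warning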
Example #10
    def saveHDF(self, filename, data, opt_overwrite=True, opt_fast=False):
        if self.checkFile(filename):
            if not opt_overwrite:
                logger.warning("overwrite is not allowed")
                return False

        compress_option = hdf5_compress_option
        if opt_fast:
            logger.info("use faster compression option")
            compress_option = fast_hdf5_compress_option
        with pd.HDFStore(filename, 'w', **compress_option) as store:
            logger.info("Save to {}".format(filename))
            for k, d in data.items():
                store.put(k, d, format='table')
                #store.put(k, d, format='fixed')
                logger.info("Save {}: {}".format(k, d.shape))
Example #11
def process_replace(df, process_configs):
    """
    {'DAYS_EMPLOYED': {365243: np.nan, }, }
    """
    logger.info("Process Fill NA")
    columns = sorted(list(process_configs.keys()))
    cols_exist, cols_not_exist = CheckColumnsExist(df, columns)

    configs = {k: v for k, v in process_configs.items() if k in cols_exist}
    df.replace(configs, inplace=True)

    for k, v in configs.items():
        logger.info("impute {} using {}".format(k, v))
    if cols_not_exist:
        logger.warning("missing {} columns: {}".format(len(cols_not_exist), cols_not_exist))

    return df
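
process_replace expects a nested mapping of column -> {old_value: new_value}; a hypothetical call with the sentinel from the docstring:

import numpy as np
import pandas as pd

df = pd.DataFrame({'DAYS_EMPLOYED': [365243, -1200, -300]})
df = process_replace(df, {'DAYS_EMPLOYED': {365243: np.nan}})
# the 365243 sentinel becomes NaN; configured columns missing from df are only warned about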
Example #12
def process_aggregate(df, process_configs, groupby_cols, cat_cols=[]):
    """
    pass each groupby_cols one by one: aggregate and condictional aggregate, general aggregate
    """
    logger.info("Process Aggregate")
    groupby_cols = [f for f in groupby_cols if f in df.columns]
#    if groupby_cols not in df.columns:
    if not groupby_cols:
        logger.warning("aggregate column {} not exist".format(groupby_cols))
        return pd.DataFrame({groupby_cols:[]}).set_index(groupby_cols)

    logger.info("aggregate on {}".format(groupby_cols))
    header = process_configs.get('header', 'foobar')

    aggregations = {}
    # aggregate and conditional aggregate
    num_cols = process_configs.get('num', {})
    cat_agg = process_configs.get('cat', [])
    if num_cols or cat_agg:
        aggregations = {k:list(v) for k, v in num_cols.items() if k in df.columns and v}
        aggregations.update({k:list(cat_agg) for k in cat_cols if k in df.columns and cat_agg})
        for k, v in aggregations.items():  # dict
            logger.info("aggregate {} ({}) with {}".format(k, df[k].dtype, v))

        # assigned in configs but not in dataframe
        missing = sorted(list(set(num_cols.keys()).union(set(cat_cols)).difference(set(aggregations.keys()))))
        for k in missing:  # dict
            if k in num_cols.keys():
                logger.info("missing {} in num".format(k))
            elif k in cat_cols:
                logger.info("missing {} in cat".format(k))

    # processing
    if aggregations:
        df_agg = df.groupby(groupby_cols).agg(aggregations)
        df_agg.columns = pd.Index(['{}_{}_{}'.format(header, e[0], e[1].upper()) for e in df_agg.columns.tolist()])
    else:
        logger.info("no aggregation on {} and {}".format(header, groupby_cols))
        df_agg = pd.DataFrame({c: [] for c in groupby_cols}).set_index(groupby_cols)

    if process_configs.get('count', False):
        logger.info("aggregate count on {} at {}".format(groupby_cols, header))
        df_agg['{}_COUNT_{}'.format(header, '_'.join(groupby_cols))] = df.groupby(groupby_cols).size()

    return df_agg
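
An illustrative configuration for process_aggregate (column names are made up in the spirit of a bureau-style table; the key names follow the .get() lookups above):

import pandas as pd

bureau = pd.DataFrame({
    'SK_ID_CURR':     [1, 1, 2],
    'AMT_CREDIT_SUM': [100., 200., 50.],
    'CREDIT_ACTIVE':  [0, 1, 1],          # already factorized categorical
})
agg_configs = {
    'header': 'BURO',                               # prefix for generated column names
    'num':    {'AMT_CREDIT_SUM': ['mean', 'sum']},  # per-column numeric aggregations
    'cat':    ['mean'],                             # applied to every column in cat_cols
    'count':  True,                                 # also emit a group-size column
}
df_agg = process_aggregate(bureau, agg_configs, ['SK_ID_CURR'], cat_cols=['CREDIT_ACTIVE'])
# columns: BURO_AMT_CREDIT_SUM_MEAN, BURO_AMT_CREDIT_SUM_SUM,
#          BURO_CREDIT_ACTIVE_MEAN, BURO_COUNT_SK_ID_CURR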
Example #13
    def __init__(self, model, configs={}, task_name=None, data_prefix=None):
        self.model = model
        self.task_name = task_name
        self.data_prefix = data_prefix
        self.params_dir = file_dir_path.get('params', '../params')

        #skopt
        search_settings = configs.get("search_settings", {})
        self.n_calls = search_settings.get("n_calls", 15)
        self.random_state = search_settings.get("random_state", 42)
        self.n_init_points = search_settings.get("n_inits", 10)

        if self.n_init_points >= self.n_calls:
            logger.warning(
                'initial points {} should be smaller than n_calls {}'.format(
                    self.n_init_points, self.n_calls))

        #validation
        evaluate_settings = configs.get("evaluation_settings", {})

        self.valid_type = evaluate_settings.get("validation", "KFold")
        self.nr_fold = evaluate_settings.get("nr_fold", 3)
        self.split_seed = evaluate_settings.get("split_seed", 42)
        self.metric = evaluate_settings.get("eval_metric", "neg_log_loss")

        #model
        self.init_params = configs.get("initialize", {})
        self.search_space = configs.get("search_space", {})
        self.set_params_safe = self._check_parameters()

        self.optimized_params = {}
        self.filename_hpo_iter = ''
        self.filename_hpo_best = ''

        #initializing
        self._search_space_initialize()

        #
        self.filestem_meta = {
            'level': 0,
            'model': self.task_name,
            'feature_num': 0,
            'score': 0,
            'fold': self.nr_fold,
        }
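
A hypothetical configuration for this constructor; the key names mirror the .get() lookups above, while the values, the search_space format, and the class name HyperparameterOptimizer are placeholders.

from sklearn.ensemble import RandomForestClassifier

hpo_configs = {
    'search_settings':     {'n_calls': 30, 'n_inits': 10, 'random_state': 42},
    'evaluation_settings': {'validation': 'KFold', 'nr_fold': 5,
                            'split_seed': 42, 'eval_metric': 'roc_auc'},
    'initialize':          {'n_jobs': -1, 'random_state': 42},
    'search_space':        {'max_depth': (3, 12), 'n_estimators': (50, 300)},  # assumed format
}
optimizer = HyperparameterOptimizer(RandomForestClassifier, configs=hpo_configs,
                                    task_name='rf_level0', data_prefix='sample')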
Example #14
    def LoadResult(self, result_files):
        if not result_files:
            logger.warning('no result file to rank features')
            return False

        elif len(result_files) == 1:
            ret = DataFileIO().loadHDF('{loc}/{filename}'.format(loc=self.result_dir,
                                    filename=result_files[0]))
            df = ret.get('feature_importance', pd.DataFrame())

        else:
            logger.info('concatenate {} results to rank features'.format(len(result_files)))
            rets = list()
            for f in result_files:
                rets.append(DataFileIO().loadHDF('{loc}/{filename}'.format(loc=self.result_dir, filename=f)))

            rets = [ret.get('feature_importance', pd.DataFrame()) for ret in rets]
            df = pd.concat(rets, axis=1)

        self._analyzeFeatures(df)
Example #15
def process_drop_rows(df, process_configs):
    """
    {'CODE_GENDER': ['XNA'], }
    """
    logger.info("Process Drop Rows")
    columns = sorted(list(process_configs.keys()))
    cols_exist, cols_not_exist = CheckColumnsExist(df, columns)

    configs = {k: v for k, v in process_configs.items() if k in cols_exist}
    inds = df[cols_exist].isin(configs)
    inds_sel = inds.any(axis=1)

    for f, series in inds.items():
        logger.info("remove {} rows in {} if any of {}".format(series.sum(), f, process_configs[f]))

    logger.info("overall remove {} from {} rows".format(inds_sel.astype(int).sum(), inds_sel.shape[0]))
    if cols_not_exist:
        logger.warning("missing {} columns: {}".format(len(cols_not_exist), cols_not_exist))

    return df.loc[~inds_sel]
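
A hypothetical call using the sentinel from the docstring; rows matching any configured value are dropped, assuming CheckColumnsExist is available as above.

import pandas as pd

df = pd.DataFrame({'CODE_GENDER': ['M', 'F', 'XNA'], 'TARGET': [0, 1, 0]})
df = process_drop_rows(df, {'CODE_GENDER': ['XNA']})
# the XNA row is removed; the surviving index is left untouched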
Example #16
def process_deep_interactions(df, process_configs):
    """
    {'header'   : 'EXT_SOURCES_SYNTHESIZE',
     'transform': ['product', 'mean', 'sum', 'sum_squared', 'std'],
     'columns'  : ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'],
     }
    """
    applicable_methods = ['kurtosis', 'sum', 'sum_squared', 'product', 'mean', 'std']

    header  = process_configs.get('header', 'NEW')
    cols    = process_configs.get('columns', [])
    cols_na = [f for f in cols if f not in df.columns]
    cols    = [f for f in cols if f in df.columns]
    methods = process_configs.get('transform', [])
    methods = [m for m in methods if m in applicable_methods]

    for m in methods:
        logger.info('transform deep interactions ({}): {}'.format(m, cols))
        if cols_na:
            logger.warning('transform deep interactions ({}), features not found: {}'.format(
                    m, cols_na))

        name = '{}_{}'.format(header, m.upper())
        if m == 'kurtosis':
            df[name] = df[cols].kurtosis(axis=1)
        elif m == 'mean':
            df[name] = df[cols].mean(axis=1)
        elif m == 'sum':
            df[name] = df[cols].sum(axis=1)
        elif m == 'sum_squared':
            df[name] = df[cols].pow(2).sum(axis=1)
        elif m == 'product':
            df[name] = df[cols].fillna(df[cols].mean()).product(axis=1)
        elif m == 'std':
            df[name] = df[cols].std(axis=1)
            df[name] = df[name].fillna(df[name].mean())

    return df
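
A hypothetical call mirroring the docstring configuration; only transforms listed in applicable_methods are applied, anything else is silently skipped.

import pandas as pd

df = pd.DataFrame({'EXT_SOURCE_1': [0.1, 0.9],
                   'EXT_SOURCE_2': [0.5, 0.4],
                   'EXT_SOURCE_3': [0.3, None]})
configs = {'header':    'EXT_SOURCES_SYNTHESIZE',
           'transform': ['mean', 'sum_squared'],
           'columns':   ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']}
df = process_deep_interactions(df, configs)
# adds EXT_SOURCES_SYNTHESIZE_MEAN and EXT_SOURCES_SYNTHESIZE_SUM_SQUARED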
Example #17
    def save_hyperparameters(self,
                             export=False,
                             show_iter=True,
                             remove_old=True):
        if not self.optimized_params:
            logger.warning('need to run optimize first')
            return False

        params = SwitchDevice(self.optimized_params, enable_gpu=False)

        if export:
            filename = filename_hpo_external.format(loc=self.params_dir,
                                                    prefix=self.data_prefix,
                                                    task=self.task_name)
            logger.warning('export for external module: {}'.format(filename))
            self._save_pickle(filename, obj=params)
            return filename

        if remove_old and CheckFileExist(self.filename_hpo_best, silent=True):
            os.remove(self.filename_hpo_best)

        stem = self._current_file_stem()
        if show_iter:
            self.filename_hpo_iter = filename_hpo_intermediate.format(
                loc=self.params_dir,
                prefix=self.data_prefix,
                iter_num=self.nr_iteration,
                stem=stem)
            self._save_pickle(self.filename_hpo_iter, obj=params)

        #write current best anyway
        self.filename_hpo_best = filename_hpo_result.format(
            loc=self.params_dir, prefix=self.data_prefix, stem=stem)
        self._save_pickle(self.filename_hpo_best, obj=params)
        #self.load_hyperparameters(filename)  # attempt to reload
        return True
Example #18
    def LoadData(self, data_configs, source='from_csv', prefix='sample'):
        """
        """
        #initialize, reading in configs for data provider itself
        configs_table = pd.DataFrame(self.provider_configs).T
        configs_table['level'] = configs_table['level'].astype(int)
        configs_table.set_index('level', inplace=True)
        configs_table['filename'] = configs_table['filename'].apply(
            lambda x: x.format(header=prefix) if isinstance(x, str) else None)

        provider_configs = self.provider_configs.get(
            source, self.provider_configs['from_csv']).copy()  # fall back to the CSV configs
        refresh_level = provider_configs.get('level')

        # load data at its refresh level
        filename = '{}/{}'.format(self.cache_path,
                                  configs_table.loc[refresh_level, 'filename'])
        if refresh_level == 3:
            logger.info("Load Train and Test from Cache")
            self.ReadTrainTestHDF(data_configs['input'], filename)
            if not AnyEmptyDataframe(self.xy_train_test):
                return self.ReturnTrainTest(self.xy_train_test)
            else:
                refresh_level = 2
                logger.warning(
                    'No train_test cache to load. Try to refresh at level {}'.
                    format(refresh_level))
                filename = '{}/{}'.format(
                    self.cache_path, configs_table.loc[refresh_level,
                                                       'filename'])

        if refresh_level == 2:
            logger.info("Recreate Train and Test")
            self.ReadProcessedHDF(data_configs['input'], filename)
            if AnyEmptyDataframe(self.data_processed):
                refresh_level = 1
                logger.warning(
                    'no processed cache to load from disk. Attempt to refresh at level {}'
                    .format(refresh_level))
                filename = '{}/{}'.format(
                    self.cache_path, configs_table.loc[refresh_level,
                                                       'filename'])

        if refresh_level == 1:
            logger.info("Process DataFrames from HDF Cashe")
            self.ReadRawHDF(data_configs['input'],
                            filename,
                            limited_by_configs=True)
            if AnyEmptyDataframe(self.data_raw):
                refresh_level = 0
                logger.warning(
                    'No raw cache to load. Try to refresh at level {}'.format(
                        refresh_level))

        if refresh_level == 0:
            logger.info("Process DataFrames from CSV")
            self.ReadDataCSV(data_configs['input'])
            filename = '{}/{}'.format(self.cache_path,
                                      configs_table.loc[1, 'filename'])
            self.SaveFileHDF(filename, self.data_raw, opt_overwrite=True)

        # process data
        if refresh_level <= 1:
            logger.info("Process DataFrames")
            train_test = self._application_train_test(
                data_configs['application'])
            self.data_processed = {
                'application_train': train_test[0],
                'application_test': train_test[1],
            }

            self.data_processed.update({
                'bureau':
                self._bureau_and_balance(data_configs['bureau']),
                'previous_application':
                self._previous_application(
                    data_configs['previous_application']),
                'pos_cash':
                self._pos_cash_balance(data_configs['pos_cash']),
                'credit_card_balance':
                self._credit_card_balance(data_configs['credit_card_balance']),
                'installments_payments':
                self._installments_payments(
                    data_configs['installments_payments']),
            })

            # save processed
            filename = '{}/{}'.format(self.cache_path,
                                      configs_table.loc[2, 'filename'])
            self.SaveFileHDF(filename, self.data_processed, opt_overwrite=True)

        # create train and test
        if refresh_level <= 2:
            self.CreateTrainTestData(self.data_processed)
            filename = '{}/{}'.format(self.cache_path,
                                      configs_table.loc[3, 'filename'])
            self.SaveFileHDF(filename, self.xy_train_test, opt_overwrite=True)

        return self.ReturnTrainTest(self.xy_train_test)
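
A hypothetical call of the cascading loader, assuming provider is an instance of the data-provider class that owns LoadData, with its provider_configs and cache_path already set up:

train_x, train_y, test_x, test_y = provider.LoadData(data_configs,
                                                     source='from_csv',
                                                     prefix='sample')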
Example #19
    def get_optimal_parameters(self):
        if not self.optimized_params:
            logger.warning('need to run optimize first')

        return self.optimized_params.copy()
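
A hypothetical wrap-up after a completed search, reusing the optimizer from the sketch above; only get_optimal_parameters, save_hyperparameters, and the stored model class come from these examples.

best_params = optimizer.get_optimal_parameters()              # {} if optimize never ran
model = optimizer.model(**best_params)                        # rebuild the estimator with the best parameters
exported_path = optimizer.save_hyperparameters(export=True)   # pickle the parameters for external modules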