def _split_configs(c, name):
    ret = dict()
    for k, v in c.items():
        if 'aggregations' in k:
            ret[k] = [f for f in v if f.get('data', None) == name]
    logger.info('split configs: {}'.format(c))
    return ret
def LoadPickle(filename):
    if not CheckFileExist(filename):
        return None
    with open(filename, 'rb') as f:
        logger.info('load model {}'.format(filename))
        return pickle.load(f)
def _bureau_and_balance(self, configs):
    current_index = self.data_index['bureau']
    major_index = self.data_index['application_train']
    nan_as_category = configs.get('nan_as_category', False)

    # Read data and merge
    df = self.data_raw['bureau']
    bb = self.data_raw['bureau_balance']
    logger.info("Bureau: {}, Bureau Balance: {}".format(df.shape, bb.shape))

    if configs.get('onehot_encoding', False):
        df, cat_cols, new_cols = process_one_hot_encode(
            df, configs['onehot_columns'], nan_as_category)
        bb, cat_cols_bb, new_cols_bb = process_one_hot_encode(
            bb, configs['onehot_columns'], nan_as_category)
        self.cols_one_hot.update({'bureau': new_cols + new_cols_bb})

    agg_configs = self._split_configs(configs.copy(), 'bureau_balance')
    bb_agg = self._aggregate_pipeline(bb, cat_cols_bb, agg_configs)[current_index]
    df = df.set_index(current_index).join(bb_agg, how='left')
    bureau_cat_cols = cat_cols + [
        c for c in bb_agg if any(cc in c for cc in cat_cols_bb)
    ]

    # conditional aggregation
    # Bureau: Active credits - using only numerical aggregations
    # Bureau: Closed credits - using only numerical aggregations
    agg_configs = self._split_configs(configs.copy(), 'bureau')
    bureau_agg = self._aggregate_pipeline(df, bureau_cat_cols, agg_configs)[major_index]
    return Cast64To32(bureau_agg)
def search(self, X, y):
    self.X = X.apply(lambda x: np.nan_to_num(x))
    self.y = y
    self.filestem_meta.update({
        'feature_num': X.shape[1],
    })
    self.nr_iteration = 0
    self.best_score = 0
    logger.info(
        'evaluate {} at {} iteration, {}-fold cv, metric={}'.format(
            self.task_name, self.n_calls, self.nr_fold, self.metric))

    gp_optimizer = gp_minimize(self._evaluate,
                               self.search_params_list,
                               n_calls=self.n_calls,
                               n_random_starts=self.n_init_points,
                               random_state=self.random_state,
                               verbose=False)

    # not used as the return value; the best parameters are tracked in
    # self.optimized_params inside _evaluate()
    optimized_params = {
        k: v for k, v in zip(self.eval_params_name, gp_optimizer.x)
    }
    logger.info('best cv score: {}, hyperparameters={}'.format(
        self.best_score, optimized_params))
    return self.optimized_params.copy()
def _search_space_initialize(self):
    self.eval_params_name = sorted([k for k in self.search_space.keys()])
    self.search_params_list = [
        self.search_space[k] for k in self.eval_params_name
    ]
    logger.info('search range of skopt:')
    for k, v in self.search_space.items():
        logger.info('search {} in {}'.format(k, v))
def func(params, opt, silent=False):
    for k, v in params.items():
        a = k in opt.keys()
        b = v in opt.get(k, {}).keys()
        if all([a, b]):
            params.update({k: opt[k].get(v)})
            logger.info('switch {} to {}'.format(k, params[k]))
    return params
def load_hyperparameters(self, filename):
    if not CheckFileExist(filename, silent=False):
        logger.warning('no hpo parameters loaded from {}'.format(filename))
        return {}
    with open(filename, 'rb') as f:
        params = pickle.load(f)
        logger.info('load from {} with params: {}'.format(filename, params))
        return params
def _evaluate(self, eval_params):
    eval_params = dict(zip(self.eval_params_name, eval_params))
    tuning_params = self.init_params.copy()
    tuning_params.update(eval_params)

    # reinitialize cv splitter
    cv_obj = self.nr_fold
    if self.valid_type == 'TimeSeriesSplit':
        cv_obj = model_selection_object[self.valid_type](
            n_splits=self.nr_fold)
    elif 'KFold' in self.valid_type:
        cv_obj = model_selection_object[self.valid_type](
            n_splits=self.nr_fold, shuffle=True,
            random_state=self.split_seed)

    if self.set_params_safe:
        try:
            m = self.model().set_params(**tuning_params)
        except Exception:
            logger.warning('fail to use set_params')
            m = self.model(**tuning_params)
            logger.warning('model params={}'.format(m.get_params()))
    else:
        # some parameters cannot be passed through set_params()
        m = self.model(**tuning_params)

    score = np.mean(
        cross_val_score(m, self.X, self.y, cv=cv_obj, n_jobs=1,
                        scoring=self.metric))

    self.nr_iteration += 1
    self.best_score = max(self.best_score, score)

    # save the current best parameters here
    if self.best_score == score:  # update new result
        self.filestem_meta.update({'score': score})
        self.optimized_params = tuning_params.copy()
        if self.nr_iteration >= self.n_init_points:
            self.save_hyperparameters(show_iter=True)
        else:
            self.save_hyperparameters(show_iter=False)

    if self.nr_iteration == self.n_init_points:  # save after initializing
        self.save_hyperparameters(show_iter=False)

    logger.info(
        'iteration {:04d}/{:04d}, current score: {:.4f}, best: {:.4f}, '
        'current params: {}, best params: {}'.format(
            self.nr_iteration, self.n_calls, score, self.best_score,
            tuning_params, self.optimized_params))

    # negate for gp_minimize: most scikit-learn metrics are larger-is-better
    return -score
def CullFeatures(self, x, blacklist=list()):
    if not blacklist:
        logger.warning('empty blacklist')
        return x
    before = x.shape
    x = x[[f for f in x.columns if f not in blacklist]]
    logger.info('shrink from {} to {} by {}'.format(before, x.shape,
                                                    len(blacklist)))
    return x
def GetBlacklist(self, threshold=10.):
    if self.importance_series.empty:
        logger.warning('no feature')
        return list()
    logger.info('create blacklist on score <= {}'.format(threshold))
    ret = self.importance_series.loc[
        self.importance_series <= threshold].index.tolist()
    logger.info('return blacklist of {} from {} features'.format(
        len(ret), len(self.importance_series)))
    return ret
def process_interaction(df, process_configs):
    """
    process_configs is a list of dictionaries, one per new feature:
    [{'name': 'new_feature_name', 'mode': 'add',
      'a': 'col_name', 'b': 'col_name'}, ]
    ------
    """
    logger.info("Process Interactions")
    possible_arithmetics = ['add', 'sum_squared',
                            'subtract', 'subtract_positive',
                            'multiply', 'divide', 'divide_nonzero']
    new_columns = []
    for v in process_configs:
        k = v['name']
        logger.info("process {}".format(k))

        # check arithmetic
        arithmetic = v.get('mode', None)
        if arithmetic not in possible_arithmetics:
            logger.warning("no arithmetic on {}".format(k))
            continue

        # check feature columns
        check_cols = [vv for kk, vv in v.items() if kk not in ['name', 'mode']]
        cols_exist, cols_not_exist = CheckColumnsExist(df, check_cols)
        if cols_not_exist:
            logger.warning("missing {} columns: {}".format(
                len(cols_not_exist), cols_not_exist))
            continue

        # process
        if 'add' == arithmetic:
            df[k] = df[v['a']] + df[v['b']]
        elif 'subtract' == arithmetic:
            df[k] = df[v['a']] - df[v['b']]
        elif 'subtract_positive' == arithmetic:
            df[k] = (df[v['a']] - df[v['b']]).apply(lambda x: x if x > 0 else 0)
        elif 'multiply' == arithmetic:
            df[k] = df[v['a']] * df[v['b']]
        elif 'divide' == arithmetic:
            df[k] = df[v['a']] / df[v['b']]
        elif 'divide_nonzero' == arithmetic:
            df[k] = df[v['a']] / (df[v['b']] + 1.)
        elif 'sum_squared' == arithmetic:
            df[k] = df[[v['a'], v['b']]].pow(2).sum(axis=1)  # np.square(df[v['a']]) + np.square(df[v['b']])

        new_columns.append(k)
    return df, new_columns
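# A minimal usage sketch for process_interaction. The column names and config
# values are illustrative assumptions, not taken from the original configs;
# the config shape (a list of dicts keyed by 'name', 'mode', 'a', 'b')
# follows the loop above.
def _example_process_interaction():
    example_df = pd.DataFrame({'AMT_CREDIT': [100., 200., 300.],
                               'AMT_INCOME_TOTAL': [50., 0., 150.]})
    example_configs = [
        {'name': 'CREDIT_TO_INCOME', 'mode': 'divide_nonzero',
         'a': 'AMT_CREDIT', 'b': 'AMT_INCOME_TOTAL'},
        {'name': 'CREDIT_INCOME_SUM_SQUARED', 'mode': 'sum_squared',
         'a': 'AMT_CREDIT', 'b': 'AMT_INCOME_TOTAL'},
    ]
    example_df, new_cols = process_interaction(example_df, example_configs)
    # new_cols == ['CREDIT_TO_INCOME', 'CREDIT_INCOME_SUM_SQUARED']
    return example_df, new_cols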
def readHDF(filename, configs={}, opt_load=True):
    with pd.HDFStore(filename, 'r', **hdf5_compress_option) as store:
        logger.info("{} contained {} items".format(filename, len(store.keys())))
        for k in store.keys():
            logger.info("{}: {}".format(k, store[k].shape))

        if opt_load and configs:
            # load, limited by configs
            ret = {k: pd.DataFrame() for k in configs.keys()}
            ret.update({k.strip('/'): store[k] for k in store.keys()
                        if k.strip('/') in configs.keys()})
            return ret
        if opt_load:
            # load all saved dataframes
            return {k.strip('/'): store[k] for k in store.keys()}
        return {}
def process_factorize(df, process_configs):
    """
    input a list of features to factorize (label encoding)
    """
    logger.info("Process Factorize")
    cols_exist, cols_not_exist = CheckColumnsExist(df, sorted(process_configs))
    for bin_feature in cols_exist:
        df[bin_feature], uniques = pd.factorize(df[bin_feature], sort=False)
        logger.info("factorize {} levels in {}: {}".format(
            len(uniques), bin_feature, uniques))
    for k in cols_not_exist:
        logger.warning("missing {}".format(k))
    return df
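# A minimal usage sketch for process_factorize. The column names are
# illustrative assumptions; the function simply label-encodes each listed
# column in place with pd.factorize.
def _example_process_factorize():
    example_df = pd.DataFrame({'CODE_GENDER': ['M', 'F', 'M'],
                               'FLAG_OWN_CAR': ['Y', 'N', 'Y']})
    example_df = process_factorize(
        example_df, process_configs=['CODE_GENDER', 'FLAG_OWN_CAR'])
    # both columns are now integer codes, e.g. CODE_GENDER -> [0, 1, 0]
    return example_df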
def loadCSV(self, configs={}):
    """
    configs = {'name': 'file_path'}
    return load_data = {'name': dataframe}
    """
    logger.info("Read Data from CSV")
    load_data = {}
    for k, f_path in configs.items():
        if not self.checkFile(f_path):
            continue
        load_data[k] = pd.read_csv(f_path)
        logger.info("Read in {}: from {}, shape={}".format(
            k, f_path, load_data[k].shape))
    self.data_lastet_load = load_data.copy()
    return load_data
def process_replace(df, process_configs):
    """
    process_configs maps a column name to a replacement mapping, e.g.
    {'DAYS_EMPLOYED': {365243: np.nan, }, }
    """
    logger.info("Process Replace")
    columns = sorted(list(process_configs.keys()))
    cols_exist, cols_not_exist = CheckColumnsExist(df, columns)

    configs = {k: v for k, v in process_configs.items() if k in cols_exist}
    df.replace(configs, inplace=True)
    for k, v in configs.items():
        logger.info("replace values in {} using {}".format(k, v))

    if cols_not_exist:
        logger.warning("missing {} columns: {}".format(
            len(cols_not_exist), cols_not_exist))
    return df
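# A minimal usage sketch for process_replace. The DAYS_EMPLOYED sentinel
# 365243 comes from the docstring above; the data values are illustrative.
def _example_process_replace():
    example_df = pd.DataFrame({'DAYS_EMPLOYED': [-1000, 365243, -500]})
    example_df = process_replace(
        example_df, process_configs={'DAYS_EMPLOYED': {365243: np.nan}})
    # the sentinel 365243 is now NaN
    return example_df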
def _pos_cash_balance(self, configs):
    current_index = self.data_index['pos_cash_balance']
    major_index = self.data_index['application_train']
    nan_as_category = configs.get('nan_as_category', False)

    df = self.data_raw['pos_cash_balance']
    logger.info("pos_cash: {}".format(df.shape))

    if configs.get('onehot_encoding', False):
        df, cat_cols, new_cols = process_one_hot_encode(
            df, configs['onehot_columns'], nan_as_category)
        self.cols_one_hot.update({'pos_cash': new_cols})
    else:
        cat_cols = IdentifyCategoricalColumn(df)

    pos_cash_agg = self._aggregate_pipeline(df, cat_cols, configs)[major_index]
    return Cast64To32(pos_cash_agg)
def LoadResult(self, result_files):
    if not result_files:
        logger.warning('no result file to rank features')
        return False
    elif len(result_files) == 1:
        ret = DataFileIO().loadHDF('{loc}/{filename}'.format(
            loc=self.result_dir, filename=result_files[0]))
        df = ret.get('feature_importance', pd.DataFrame())
    else:
        logger.info('concatenate {} results to rank features'.format(
            len(result_files)))
        rets = list()
        for f in result_files:
            rets.append(DataFileIO().loadHDF('{loc}/{filename}'.format(
                loc=self.result_dir, filename=f)))
        rets = [ret.get('feature_importance', pd.DataFrame()) for ret in rets]
        df = pd.concat(rets, axis=1)
    self._analyzeFeatures(df)
def _application_train_test(self, configs):
    nan_as_category = configs.get('nan_as_category', False)

    # Read data and merge
    major_index = self.data_index['application_train']
    df = self.data_raw['application_train']
    test_df = self.data_raw['application_test']
    logger.info("Train samples: {}, test samples: {}".format(
        df.shape, test_df.shape))
    df = df.append(test_df, sort=False, ignore_index=True)

    df = process_drop_rows(df, process_configs=configs['filter_rows'])
    df = process_factorize(df, process_configs=configs['factorize_columns'])

    if configs.get('onehot_encoding', False):
        df, cat_cols, new_cols = process_one_hot_encode(
            df, configs['onehot_columns'], nan_as_category)
        self.cols_one_hot.update({'application': new_cols})
    else:
        cat_cols = IdentifyCategoricalColumn(df)

    df = process_replace(df, process_configs=configs['replace_rows'])
    df, interact_cols = process_interaction(
        df, process_configs=configs['interaction_columns'])

    if configs.get('deep_interactions', []):
        deep_interactions = configs.get('deep_interactions', [])
        for c in deep_interactions:
            df = process_deep_interactions(df, c)

    logger.info('prepare decomposition, application={}'.format(df.shape))
    df_ext = [
        process_decomposition(df, c) for c in configs['decomposition']
    ]
    df = pd.concat([df] + df_ext, axis=1, join='inner')
    logger.info('finished decompositions, application={}'.format(df.shape))
    df = Cast64To32(df)

    # separate train and test
    # Divide into training/validation and test data
    train_df = df.loc[df[self.target_column].notnull()].reset_index().set_index(major_index)
    test_df = df.loc[df[self.target_column].isnull()].reset_index().set_index(major_index)
    logger.info("Split into train samples: {}, test samples: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()
    return train_df, test_df
def _installments_payments(self, configs):
    current_index = self.data_index['installments_payments']
    major_index = self.data_index['application_train']
    nan_as_category = configs.get('nan_as_category', False)

    df = self.data_raw['installments_payments']
    logger.info("installments_payments: {}".format(df.shape))

    cat_cols = []
    if configs.get('onehot_encoding', False):
        df, cat_cols, new_cols = process_one_hot_encode(
            df, cat_cols, nan_as_category)
        self.cols_one_hot.update({'installments_payments': new_cols})
    else:
        cat_cols = IdentifyCategoricalColumn(df)

    df, interact_cols = process_interaction(
        df, process_configs=configs['interaction_columns'])

    installments_agg = self._aggregate_pipeline(df, cat_cols, configs)[major_index]
    return Cast64To32(installments_agg)
def CreateTrainTestData(self, configs):
    """
    concat all dataframes to create the train and test dataframes
    configs = {'application_train': df, }
    """
    train = configs.get('application_train', pd.DataFrame())
    test = configs.get('application_test', pd.DataFrame())
    if train.empty or test.empty:
        logger.error('no train and test dataframe')

    excluded = ['application_train', 'application_test']
    for k, v in configs.items():
        if k not in excluded:
            train = train.join(v, how='left')
            test = test.join(v, how='left')
            logger.info("to_join={}, {}: train={}, test={}".format(
                k, v.shape, train.shape, test.shape))
            gc.collect()

    # sorted for further processing
    cols = sorted(train.columns.tolist())
    train = train[cols]
    test = test[cols]

    # all processing complete
    cols = sorted([
        f for f in train.columns
        if f != self.target_column and f in test.columns
    ])
    self.xy_train_test = {
        'train_x': train[cols],
        'train_y': train[self.target_column],
        'test_x': test[cols],
        'test_y': test[self.target_column]
    }
    del train, test
    gc.collect()
    return self.ReturnTrainTest(self.xy_train_test)
def process_decomposition(df, process_configs):
    """
    {'columns': ['FLAG_CONT_MOBILE', 'FLAG_PHONE'],
     'stems': ['CODE_GENDER_'],
     'methods': {'APPLICANT_SVD': {'object': TruncatedSVD,
                                   'params': {'n_components': 8,
                                              'algorithm': 'randomized',
                                              'n_iter': 10,
                                              'random_state': 42}, },
                 },
    }
    """
    use_cols, cols_not_exist = CheckColumnsExist(
        df, process_configs.get('columns', []))

    stems = process_configs.get('stems', [])
    if stems:
        dict_stem = {s: [f for f in df.columns if s in f] for s in stems}
        cols_stem = list(itertools.chain.from_iterable(dict_stem.values()))
        if cols_stem:
            use_cols.extend(cols_stem)
            for k, v in dict_stem.items():
                logger.info('find {} stem "{}": {}'.format(len(v), k, v))

    use_cols = sorted(use_cols)
    logger.info('decompose on {} features: {}'.format(len(use_cols), use_cols))
    df_sub = df[use_cols].apply(lambda x: np.nan_to_num(x))

    def func(k, v, sub):
        tf = v.get('object', None)
        params = v.get('params', {})
        if not tf:
            return pd.DataFrame()
        logger.info('decompose {} on {} features'.format(k, sub.shape[1]))
        d = tf().set_params(**params).fit_transform(sub)
        return pd.DataFrame(d, columns=[
            '{}_{}'.format(k, i) for i in range(1, d.shape[1] + 1)
        ])

    ret = [func(k, v, df_sub)
           for k, v in process_configs.get('methods', {}).items()]
    ret = pd.concat(ret, axis=1, join='inner')
    ret.index = df.index
    return ret
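# A minimal usage sketch for process_decomposition. The feature names and the
# 2-component SVD are illustrative assumptions; the config keys ('columns',
# 'stems', 'methods' with 'object'/'params') mirror the docstring above.
def _example_process_decomposition():
    from sklearn.decomposition import TruncatedSVD
    example_df = pd.DataFrame(np.random.rand(10, 4),
                              columns=['EXT_SOURCE_1', 'EXT_SOURCE_2',
                                       'FLAG_PHONE', 'FLAG_CONT_MOBILE'])
    example_configs = {
        'columns': ['FLAG_PHONE', 'FLAG_CONT_MOBILE'],
        'stems': ['EXT_SOURCE_'],
        'methods': {
            'APPLICANT_SVD': {'object': TruncatedSVD,
                              'params': {'n_components': 2,
                                         'random_state': 42}},
        },
    }
    ret = process_decomposition(example_df, example_configs)
    # ret has columns APPLICANT_SVD_1, APPLICANT_SVD_2, same index as example_df
    return ret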
def main(argc, argv):
    DataConfigs = InitializeConfigs('../configs/SampleDataConfigs.py').DataConfigs

    dp = DataProvider()
    #dp.ReadRawHDF(DataConfigs, filename='../data/cache_sample_raw.hdf5', limited_by_configs=False)
    #import pdb; pdb.set_trace()
    #dp.LoadData(DataConfigs, source='from_csv', prefix='sample')
    d = dp.LoadData(DataConfigs, source='from_raw_cache', prefix='sample')
    #d = dp.LoadData(DataConfigs, source='from_processed', prefix='sample')
    #d = dp.LoadData(DataConfigs, source='from_train_test', prefix='sample')

    import pdb
    pdb.set_trace()

    train_x, train_y = d[0], d[1]
    logger.info('P/N ratio:\n{}'.format(
        train_y.value_counts(normalize=True).sort_index()))
def process_deep_interactions(df, process_configs):
    """
    {'header': 'EXT_SOURCES_SYNTHESIZE',
     'transform': ['product', 'mean', 'sum', 'sum_squared', 'std'],
     'columns': ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'],
    }
    """
    applicable_methods = ['kurtosis', 'sum', 'sum_squared',
                          'product', 'mean', 'std']
    header = process_configs.get('header', 'NEW')
    cols = process_configs.get('columns', [])
    cols_na = [f for f in cols if f not in df.columns]
    cols = [f for f in cols if f in df.columns]
    methods = process_configs.get('transform', [])
    methods = [m for m in methods if m in applicable_methods]

    for m in methods:
        logger.info('transform deep interactions ({}): {}'.format(m, cols))
        if cols_na:
            logger.warning(
                'transform deep interactions ({}), features not found: {}'.format(
                    m, cols_na))

        name = '{}_{}'.format(header, m.upper())
        if m == 'kurtosis':
            df[name] = df[cols].kurtosis(axis=1)
        elif m == 'mean':
            df[name] = df[cols].mean(axis=1)
        elif m == 'sum':
            df[name] = df[cols].sum(axis=1)
        elif m == 'sum_squared':
            df[name] = df[cols].pow(2).sum(axis=1)
        elif m == 'product':
            df[name] = df[cols].fillna(df[cols].mean()).product(axis=1)
        elif m == 'std':
            df[name] = df[cols].std(axis=1)
        df[name] = df[name].fillna(df[name].mean())
    return df
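# A minimal usage sketch for process_deep_interactions, using the config
# shape from the docstring above; the data values are illustrative.
def _example_process_deep_interactions():
    example_df = pd.DataFrame({'EXT_SOURCE_1': [0.1, 0.5, np.nan],
                               'EXT_SOURCE_2': [0.2, 0.4, 0.6],
                               'EXT_SOURCE_3': [0.3, np.nan, 0.9]})
    example_df = process_deep_interactions(example_df, {
        'header': 'EXT_SOURCES_SYNTHESIZE',
        'transform': ['mean', 'sum', 'sum_squared', 'product', 'std'],
        'columns': ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'],
    })
    # adds EXT_SOURCES_SYNTHESIZE_MEAN, _SUM, _SUM_SQUARED, _PRODUCT, _STD
    return example_df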
def _aggregate_pipeline(self, df, cat_cols, configs):
    ret = list()
    for c in configs.get('aggregations', []):
        groupby_cols = c.get('groupby', [])
        if not groupby_cols:
            logger.info("No columns to Aggregate on {}".format(groupby_cols))
            continue

        configs_subset = c.get('subset', {})
        if configs_subset:
            cond_k = configs_subset.get('column_name', 'foobar')
            cond_i = configs_subset.get('conditions', [])
            if cond_k in df.columns and cond_i:
                sub_df = df.loc[df[cond_k].isin(cond_i)]
                logger.info("Conditional Aggregate on {}, {}, shape={}".format(
                    cond_k, groupby_cols, sub_df.shape))
                ret.append(
                    process_aggregate(sub_df,
                                      process_configs=c,
                                      groupby_cols=groupby_cols,
                                      cat_cols=[]))
        else:
            logger.info("Specific Aggregate on {}".format(groupby_cols))
            ret.append(
                process_aggregate(df,
                                  process_configs=c,
                                  groupby_cols=groupby_cols,
                                  cat_cols=cat_cols))

    ret = [r for r in ret if not r.empty]
    inds = sorted(list(set([r.index.name for r in ret])))
    ret = {
        ind: pd.concat([r for r in ret if r.index.name == ind],
                       axis=1, join='inner')
        for ind in inds
    }
    for k, v in ret.items():
        logger.info("Result Aggregate on {}: {}".format(k, v.shape))
    return ret
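# A hedged sketch of the 'aggregations' config consumed by _aggregate_pipeline.
# Only the keys read in this method ('groupby', 'subset' with 'column_name'
# and 'conditions', and 'data' used by _split_configs above) are shown; any
# keys required by process_aggregate itself (e.g. which statistics to compute)
# are defined elsewhere in the configs and omitted here. The column names and
# condition values are illustrative assumptions.
#
# configs = {
#     'aggregations': [
#         {'data': 'bureau',
#          'groupby': ['SK_ID_CURR'],
#          'subset': {'column_name': 'CREDIT_ACTIVE',
#                     'conditions': ['Active']}},
#         {'data': 'bureau_balance',
#          'groupby': ['SK_ID_BUREAU']},
#     ],
# }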
def _previous_application(self, configs):
    current_index = self.data_index['previous_application']
    major_index = self.data_index['application_train']
    nan_as_category = configs.get('nan_as_category', False)

    df = self.data_raw['previous_application']
    logger.info("Previous application: {}".format(df.shape))

    if configs.get('onehot_encoding', False):
        df, cat_cols, new_cols = process_one_hot_encode(
            df, configs['onehot_columns'], nan_as_category)
        self.cols_one_hot.update({'previous_application': new_cols})
    else:
        cat_cols = IdentifyCategoricalColumn(df)

    df = process_replace(df, process_configs=configs['replace_rows'])
    df, interact_cols = process_interaction(
        df, process_configs=configs['interaction_columns'])

    # Previous applications categorical features
    # Previous Applications: Approved Applications - only numerical features
    # Previous Applications: Refused Applications - only numerical features
    prev_agg = self._aggregate_pipeline(df, cat_cols, configs)[major_index]
    return Cast64To32(prev_agg)
def saveHDF(self, filename, data, opt_overwrite=True, opt_fast=False):
    if self.checkFile(filename):
        if not opt_overwrite:
            logger.warning("overwrite is not allowed")
            return False

    compress_option = hdf5_compress_option
    if opt_fast:
        logger.info("use faster compression option")
        compress_option = fast_hdf5_compress_option

    with pd.HDFStore(filename, 'w', **compress_option) as store:
        logger.info("Save to {}".format(filename))
        for k, d in data.items():
            store.put(k, d, format='table')
            #store.put(k, d, format='fixed')
            logger.info("Save {}: {}".format(k, d.shape))
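# A hedged round-trip sketch for saveHDF/loadHDF. It assumes both are methods
# of the DataFileIO class referenced in LoadResult above, and the file path
# and frame contents are illustrative.
def _example_hdf_round_trip():
    io = DataFileIO()
    frames = {'application_train': pd.DataFrame({'SK_ID_CURR': [1, 2],
                                                 'TARGET': [0, 1]})}
    io.saveHDF('../data/example_cache.hdf5', frames, opt_overwrite=True)
    loaded = io.loadHDF('../data/example_cache.hdf5', limited_by_configs=False)
    # loaded['application_train'] matches the saved frame
    return loaded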
def process_one_hot_encode(df, categorical_columns=[], nan_as_category=True):
    """
    ------
    return df, new_columns, columns_to_convert
    """
    logger.info("Process OneHot Encoding")
    original_columns = df.columns.tolist()
    if not categorical_columns:
        categorical_columns = IdentifyCategoricalColumn(df)
    categorical_columns, _ = CheckColumnsExist(df, categorical_columns)
    logger.info("identify {} categorical columns: {}".format(
        len(categorical_columns), categorical_columns))

    df = pd.get_dummies(df, columns=categorical_columns,
                        dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    logger.info("one-hot encoded to {} columns".format(len(new_columns)))
    df[new_columns] = df[new_columns].astype(np.int8)

    ret = {cat: sorted([col for col in new_columns if cat in col])
           for cat in categorical_columns}
    for k, v in ret.items():
        logger.info("onehot {} to {} columns: {}".format(k, len(v), v))
    return df, new_columns, categorical_columns
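# A minimal usage sketch for process_one_hot_encode, mainly to make the
# return order explicit: (encoded df, new dummy column names, the original
# categorical columns that were converted). Column names are illustrative.
def _example_process_one_hot_encode():
    example_df = pd.DataFrame({'CODE_GENDER': ['M', 'F', None],
                               'AMT_CREDIT': [100., 200., 300.]})
    example_df, new_cols, converted = process_one_hot_encode(
        example_df, categorical_columns=['CODE_GENDER'], nan_as_category=True)
    # new_cols, e.g. ['CODE_GENDER_F', 'CODE_GENDER_M', 'CODE_GENDER_nan']
    # converted == ['CODE_GENDER']
    return example_df, new_cols, converted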
def Cast64To32(df, blacklist=['SK_ID_CURR', 'SK_ID_PREV', 'SK_ID_BUREAU']):
    series_dtypes = df.dtypes
    series_dtypes = series_dtypes.loc[~series_dtypes.index.isin(blacklist)]
    if not series_dtypes.empty:
        logger.info('cast dataframe from 64 to 32')

        to_float32 = series_dtypes.loc[series_dtypes.apply(
            lambda x: x == np.float64)].index.tolist()
        df[to_float32] = df[to_float32].astype(np.float32)
        logger.info('cast {} columns to float32: {}'.format(
            len(to_float32), to_float32))

        to_int32 = series_dtypes.loc[series_dtypes.apply(
            lambda x: x == np.int64)].index.tolist()
        df[to_int32] = df[to_int32].astype(np.int32)
        logger.info('cast {} columns to int32: {}'.format(
            len(to_int32), to_int32))
    return df
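# A minimal usage sketch for Cast64To32: ID columns in the blacklist keep
# their 64-bit dtypes, everything else is downcast. Values are illustrative.
def _example_cast_64_to_32():
    example_df = pd.DataFrame({'SK_ID_CURR': [1, 2, 3],
                               'AMT_CREDIT': [100., 200., 300.],
                               'CNT_CHILDREN': [0, 1, 2]})
    example_df = Cast64To32(example_df)
    # AMT_CREDIT -> float32, CNT_CHILDREN -> int32, SK_ID_CURR stays int64
    return example_df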
def loadHDF(self, filename, configs={}, limited_by_configs=True):
    logger.info("Read Data from HDF5")
    if not self.checkFile(filename):
        return self.loadEmpty(configs)

    if limited_by_configs:
        logger.info("Load selected DataFrame Only")
        load_data = self.readHDF(filename, configs, opt_load=True)
    else:  # full loaded
        load_data = self.readHDF(filename, opt_load=True)

    for k, v in load_data.items():
        if isinstance(v, pd.DataFrame):
            logger.info('memory usage on {} is {:.3f} MB'.format(
                k, v.memory_usage().sum() / 1024. ** 2))

    self.data_lastet_load = load_data  #.copy()
    return load_data