def CheckFileExist(filename, silent=True):
    if not os.path.exists(filename):
        if not silent:
            logger.warning('{} does not exist'.format(filename))
        return False
    return True

def load_hyperparameters(self, filename):
    if not CheckFileExist(filename, silent=False):
        logger.warning('no hpo parameters loaded from {}'.format(filename))
        return {}
    with open(filename, 'rb') as f:
        params = pickle.load(f)
    logger.info('loaded from {} with params: {}'.format(filename, params))
    return params

def _evaluate(self, eval_params):
    eval_params = dict(zip(self.eval_params_name, eval_params))
    tuning_params = self.init_params.copy()
    tuning_params.update(eval_params)

    # reinitialize the cross-validation splitter
    cv_obj = self.nr_fold
    if self.valid_type == 'TimeSeriesSplit':
        cv_obj = model_selection_object[self.valid_type](n_splits=self.nr_fold)
    elif 'KFold' in self.valid_type:
        cv_obj = model_selection_object[self.valid_type](
            n_splits=self.nr_fold, shuffle=True, random_state=self.split_seed)

    if self.set_params_safe:
        try:
            m = self.model().set_params(**tuning_params)
        except Exception:
            logger.warning('failed to use set_params')
            m = self.model(**tuning_params)
            logger.warning('model params={}'.format(m.get_params()))
    else:
        # some parameters cannot pass through set_params()
        m = self.model(**tuning_params)

    score = np.mean(
        cross_val_score(m, self.X, self.y, cv=cv_obj, n_jobs=1,
                        scoring=self.metric))

    self.nr_iteration += 1
    self.best_score = max(self.best_score, score)

    # save the current best parameters here
    if self.best_score == score:  # update new result
        self.filestem_meta.update({'score': score})
        self.optimized_params = tuning_params.copy()
        if self.nr_iteration >= self.n_init_points:
            self.save_hyperparameters(show_iter=True)
        else:
            self.save_hyperparameters(show_iter=False)

    if self.nr_iteration == self.n_init_points:  # save after initializing
        self.save_hyperparameters(show_iter=False)

    logger.info(
        'iteration {:04d}/{:04d}, current score: {:.4f}, best: {:.4f}, '
        'current params: {}, best params: {}'.format(
            self.nr_iteration, self.n_calls, score, self.best_score,
            tuning_params, self.optimized_params))
    # negate for the minimizer; most scikit-learn metrics are higher-is-better
    return -score

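# A minimal sketch of how _evaluate is typically driven by skopt (assumption:
# this class wraps skopt.gp_minimize in an optimize() method defined elsewhere;
# 'dimensions' is hypothetical and would come from _search_space_initialize()):
#
#   from skopt import gp_minimize
#   result = gp_minimize(
#       self._evaluate,                  # returns -score, so minimizing
#       dimensions,                      # maximizes the CV score
#       n_calls=self.n_calls,
#       n_random_starts=self.n_init_points,
#       random_state=self.random_state)
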
def CullFeatures(self, x, blacklist=None):
    blacklist = blacklist or []
    if not blacklist:
        logger.warning('empty blacklist')
        return x
    before = x.shape
    x = x[[f for f in x.columns if f not in blacklist]]
    logger.info('shrink from {} to {} by dropping {} features'.format(
        before, x.shape, len(blacklist)))
    return x

def GetBlacklist(self, threshold=10.):
    if self.importance_series.empty:
        logger.warning('no feature importance available')
        return list()
    logger.info('create blacklist on score <= {}'.format(threshold))
    ret = self.importance_series.loc[
        self.importance_series <= threshold].index.tolist()
    logger.info('return blacklist of {} from {} features'.format(
        len(ret), len(self.importance_series)))
    return ret

def ReturnTrainTest(configs):
    df_names = ['train_x', 'train_y', 'test_x', 'test_y']
    # fill any missing expected frame with an empty placeholder
    configs.update({k: pd.DataFrame() for k in df_names if k not in configs})
    for df_name in [k for k in df_names if configs[k].empty]:
        logger.warning("no key as {}".format(df_name))
    return (configs['train_x'], configs['train_y'],
            configs['test_x'], configs['test_y'])

def AnyEmptyDataframe(data):
    if not data:
        logger.warning('passing no dataframes')
        return True
    if isinstance(data, dict):
        return any(v.empty for v in data.values())
    elif isinstance(data, list):
        return any(d.empty for d in data)
    return False

def process_interaction(df, process_configs):
    """
    process_configs is a list of dictionaries, one per new feature:
        [{'name': 'new_feature_name', 'mode': 'add',
          'a': 'col_name', 'b': 'col_name'}, ]
    """
    logger.info("Process Interactions")
    possible_arithmetics = [
        'add', 'sum_squared', 'subtract', 'subtract_positive',
        'multiply', 'divide', 'divide_nonzero']
    new_columns = []
    for v in process_configs:
        k = v['name']
        logger.info("process {}".format(k))

        # check arithmetic
        arithmetic = v.get('mode', None)
        if arithmetic not in possible_arithmetics:
            logger.warning("no arithmetic on {}".format(k))
            continue

        # check feature columns
        check_cols = [vv for kk, vv in v.items() if kk not in ['name', 'mode']]
        cols_exist, cols_not_exist = CheckColumnsExist(df, check_cols)
        if cols_not_exist:
            logger.warning("missing {} columns: {}".format(
                len(cols_not_exist), cols_not_exist))
            continue

        # process
        if 'add' == arithmetic:
            df[k] = df[v['a']] + df[v['b']]
        elif 'subtract' == arithmetic:
            df[k] = df[v['a']] - df[v['b']]
        elif 'subtract_positive' == arithmetic:
            df[k] = (df[v['a']] - df[v['b']]).apply(lambda x: x if x > 0 else 0)
        elif 'multiply' == arithmetic:
            df[k] = df[v['a']] * df[v['b']]
        elif 'divide' == arithmetic:
            df[k] = df[v['a']] / df[v['b']]
        elif 'divide_nonzero' == arithmetic:
            df[k] = df[v['a']] / (df[v['b']] + 1.)
        elif 'sum_squared' == arithmetic:
            df[k] = df[[v['a'], v['b']]].pow(2).sum(axis=1)
        new_columns.append(k)
    return df, new_columns

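# Usage sketch for process_interaction (hypothetical config; 'AMT_CREDIT' and
# 'AMT_INCOME_TOTAL' are illustrative column names, not taken from this module):
#
#   interaction_configs = [
#       {'name': 'CREDIT_TO_INCOME_RATIO', 'mode': 'divide',
#        'a': 'AMT_CREDIT', 'b': 'AMT_INCOME_TOTAL'},
#       {'name': 'EXT_SOURCES_SUM_SQUARED', 'mode': 'sum_squared',
#        'a': 'EXT_SOURCE_1', 'b': 'EXT_SOURCE_2'},
#   ]
#   df, new_columns = process_interaction(df, interaction_configs)
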
def process_factorize(df, process_configs):
    """
    input a list of features to factorize (label encoding)
    """
    logger.info("Process Factorize")
    cols_exist, cols_not_exist = CheckColumnsExist(df, sorted(process_configs))
    for bin_feature in cols_exist:
        df[bin_feature], uniques = pd.factorize(df[bin_feature], sort=False)
        logger.info("factorize {} in {}: {}".format(
            len(uniques), bin_feature, uniques))
    for k in cols_not_exist:
        logger.warning("missing {}".format(k))
    return df

def saveHDF(self, filename, data, opt_overwrite=True, opt_fast=False):
    if self.checkFile(filename):
        if not opt_overwrite:
            logger.warning("overwrite is not allowed")
            return False
    compress_option = hdf5_compress_option
    if opt_fast:
        logger.info("use faster compression option")
        compress_option = fast_hdf5_compress_option
    with pd.HDFStore(filename, 'w', **compress_option) as store:
        logger.info("Save to {}".format(filename))
        for k, d in data.items():
            store.put(k, d, format='table')  # 'fixed' is faster but not queryable
            logger.info("Save {}: {}".format(k, d.shape))
    return True

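# Usage sketch for saveHDF (hypothetical frames; assumes this method lives on
# DataFileIO, as loadHDF does, and that hdf5_compress_option is a module-level
# dict such as {'complib': 'blosc', 'complevel': 9}):
#
#   io = DataFileIO()
#   io.saveHDF('../data/cache_sample.hdf5',
#              {'train_x': train_x, 'train_y': train_y},
#              opt_overwrite=True, opt_fast=False)
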
def process_replace(df, process_configs):
    """
    process_configs maps a column name to a replacement mapping:
        {'DAYS_EMPLOYED': {365243: np.nan, }, }
    """
    logger.info("Process Replace")
    columns = sorted(list(process_configs.keys()))
    cols_exist, cols_not_exist = CheckColumnsExist(df, columns)
    configs = {k: v for k, v in process_configs.items() if k in cols_exist}
    df.replace(configs, inplace=True)
    for k, v in configs.items():
        logger.info("replace in {} using {}".format(k, v))
    if cols_not_exist:
        logger.warning("missing {} columns: {}".format(
            len(cols_not_exist), cols_not_exist))
    return df

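# Usage sketch for process_replace, mirroring the docstring (365243 is the
# Home Credit sentinel value for missing DAYS_EMPLOYED):
#
#   df = process_replace(df, {'DAYS_EMPLOYED': {365243: np.nan}})
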
def process_aggregate(df, process_configs, groupby_cols, cat_cols=None):
    """
    pass each groupby_cols one by one:
    aggregate and conditional aggregate, then general aggregate
    """
    logger.info("Process Aggregate")
    cat_cols = cat_cols or []
    requested_cols = groupby_cols
    groupby_cols = [f for f in groupby_cols if f in df.columns]
    if not groupby_cols:
        logger.warning("aggregate columns {} do not exist".format(requested_cols))
        return pd.DataFrame()

    logger.info("aggregate on {}".format(groupby_cols))
    header = process_configs.get('header', 'foobar')
    aggregations = {}

    # aggregate and conditional aggregate
    num_cols = process_configs.get('num', {})
    cat_agg = process_configs.get('cat', [])
    if num_cols or cat_agg:
        aggregations = {
            k: list(v) for k, v in num_cols.items() if k in df.columns and v}
        aggregations.update({
            k: list(cat_agg) for k in cat_cols if k in df.columns and cat_agg})
        for k, v in aggregations.items():
            logger.info("aggregate {} ({}) with {}".format(k, df[k].dtype, v))

    # assigned in configs but not present in the dataframe
    missing = sorted(
        set(num_cols.keys()).union(cat_cols).difference(aggregations.keys()))
    for k in missing:
        if k in num_cols.keys():
            logger.info("missing {} in num".format(k))
        elif k in cat_cols:
            logger.info("missing {} in cat".format(k))

    # processing
    if aggregations:
        df_agg = df.groupby(groupby_cols).agg(aggregations)
        df_agg.columns = pd.Index([
            '{}_{}_{}'.format(header, e[0], e[1].upper())
            for e in df_agg.columns.tolist()])
    else:
        logger.info("no aggregation on {} and {}".format(header, groupby_cols))
        df_agg = pd.DataFrame(columns=groupby_cols).set_index(groupby_cols)

    if process_configs.get('count', False):
        logger.info("aggregate count on {} at {}".format(groupby_cols, header))
        df_agg['{}_COUNT_{}'.format(header, '_'.join(groupby_cols))] = \
            df.groupby(groupby_cols).size()
    return df_agg

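# Usage sketch for process_aggregate (hypothetical config; the column names
# are illustrative Home Credit bureau fields, not taken from this module):
#
#   agg_configs = {
#       'header': 'BURO',
#       'num': {'DAYS_CREDIT': ['min', 'max', 'mean'],
#               'AMT_CREDIT_SUM': ['max', 'mean', 'sum']},
#       'cat': ['mean'],
#       'count': True,
#   }
#   df_agg = process_aggregate(df, agg_configs,
#                              groupby_cols=['SK_ID_CURR'],
#                              cat_cols=one_hot_encoded_cols)
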
def __init__(self, model, configs={}, task_name=None, data_prefix=None):
    self.model = model
    self.task_name = task_name
    self.data_prefix = data_prefix
    self.params_dir = file_dir_path.get('params', '../params')

    # skopt
    search_settings = configs.get("search_settings", {})
    self.n_calls = search_settings.get("n_calls", 15)
    self.random_state = search_settings.get("random_state", 42)
    self.n_init_points = search_settings.get("n_inits", 10)
    if self.n_init_points >= self.n_calls:
        logger.warning(
            'initial points {} is not smaller than n_calls {}'.format(
                self.n_init_points, self.n_calls))

    # validation
    evaluation_settings = configs.get("evaluation_settings", {})
    self.valid_type = evaluation_settings.get("validation", "KFold")
    self.nr_fold = evaluation_settings.get("nr_fold", 3)
    self.split_seed = evaluation_settings.get("split_seed", 42)
    self.metric = evaluation_settings.get("eval_metric", "neg_log_loss")

    # model
    self.init_params = configs.get("initialize", {})
    self.search_space = configs.get("search_space", {})
    self.set_params_safe = self._check_parameters()
    self.optimized_params = {}
    self.filename_hpo_iter = ''
    self.filename_hpo_best = ''

    # initializing
    self._search_space_initialize()
    self.filestem_meta = {
        'level': 0,
        'model': self.task_name,
        'feature_num': 0,
        'score': 0,
        'fold': self.nr_fold,
    }

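# A hypothetical configs dict for this optimizer (keys follow the .get() calls
# above; the values, the model, and the class name are illustrative):
#
#   configs = {
#       'search_settings': {'n_calls': 50, 'n_inits': 10, 'random_state': 42},
#       'evaluation_settings': {'validation': 'StratifiedKFold', 'nr_fold': 5,
#                               'split_seed': 42, 'eval_metric': 'neg_log_loss'},
#       'initialize': {'n_estimators': 200},
#       'search_space': {'max_depth': (3, 8), 'learning_rate': (1e-3, 1e-1)},
#   }
#   hpo = TheOptimizerClass(model=LGBMClassifier, configs=configs,
#                           task_name='lgbm', data_prefix='sample')
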
def LoadResult(self, result_files):
    if not result_files:
        logger.warning('no result file to rank features')
        return False
    elif len(result_files) == 1:
        ret = DataFileIO().loadHDF('{loc}/{filename}'.format(
            loc=self.result_dir, filename=result_files[0]))
        df = ret.get('feature_importance', pd.DataFrame())
    else:
        logger.info('concatenate {} results to rank features'.format(
            len(result_files)))
        rets = list()
        for f in result_files:
            rets.append(DataFileIO().loadHDF('{loc}/{filename}'.format(
                loc=self.result_dir, filename=f)))
        rets = [ret.get('feature_importance', pd.DataFrame()) for ret in rets]
        df = pd.concat(rets, axis=1)
    self._analyzeFeatures(df)

def process_drop_rows(df, process_configs):
    """
    process_configs maps a column name to values whose rows are dropped:
        {'CODE_GENDER': ['XNA'], }
    """
    logger.info("Process Drop Rows")
    columns = sorted(list(process_configs.keys()))
    cols_exist, cols_not_exist = CheckColumnsExist(df, columns)
    configs = {k: v for k, v in process_configs.items() if k in cols_exist}
    inds = df[cols_exist].isin(configs)
    inds_sel = inds.any(axis=1)
    for f, series in inds.items():  # iteritems() is deprecated in newer pandas
        logger.info("remove {} rows in {} matching any of {}".format(
            series.sum(), f, process_configs[f]))
    logger.info("overall remove {} from {} rows".format(
        inds_sel.astype(int).sum(), inds_sel.shape[0]))
    if cols_not_exist:
        logger.warning("missing {} columns: {}".format(
            len(cols_not_exist), cols_not_exist))
    return df.loc[~inds_sel]

def process_deep_interactions(df, process_configs):
    """
    {'header'   : 'EXT_SOURCES_SYNTHESIZE',
     'transform': ['product', 'mean', 'sum', 'sum_squared', 'std'],
     'columns'  : ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'], }
    """
    applicable_methods = [
        'kurtosis', 'sum', 'sum_squared', 'product', 'mean', 'std']
    header = process_configs.get('header', 'NEW')
    cols = process_configs.get('columns', [])
    cols_na = [f for f in cols if f not in df.columns]
    cols = [f for f in cols if f in df.columns]
    methods = process_configs.get('transform', [])
    methods = [m for m in methods if m in applicable_methods]
    for m in methods:
        logger.info('transform deep interactions ({}): {}'.format(m, cols))
        if cols_na:
            logger.warning(
                'transform deep interactions ({}), features not found: {}'.format(
                    m, cols_na))
        name = '{}_{}'.format(header, m.upper())
        if m == 'kurtosis':
            df[name] = df[cols].kurtosis(axis=1)
        elif m == 'mean':
            df[name] = df[cols].mean(axis=1)
        elif m == 'sum':
            df[name] = df[cols].sum(axis=1)
        elif m == 'sum_squared':
            df[name] = df[cols].pow(2).sum(axis=1)
        elif m == 'product':
            df[name] = df[cols].fillna(df[cols].mean()).product(axis=1)
        elif m == 'std':
            df[name] = df[cols].std(axis=1)
            df[name] = df[name].fillna(df[name].mean())
    return df

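# Usage sketch, reusing the config from the docstring above (the EXT_SOURCE_*
# columns come from the Home Credit application table):
#
#   df = process_deep_interactions(df, {
#       'header': 'EXT_SOURCES_SYNTHESIZE',
#       'transform': ['product', 'mean', 'sum', 'sum_squared', 'std'],
#       'columns': ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'],
#   })
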
def save_hyperparameters(self, export=False, show_iter=True, remove_old=True):
    if not self.optimized_params:
        logger.warning('need to run optimize first')
        return False

    params = SwitchDevice(self.optimized_params, enable_gpu=False)
    if export:
        filename = filename_hpo_external.format(
            loc=self.params_dir, prefix=self.data_prefix, task=self.task_name)
        logger.warning('export for external module: {}'.format(filename))
        self._save_pickle(filename, obj=params)
        return filename

    if remove_old and CheckFileExist(self.filename_hpo_best, silent=True):
        os.remove(self.filename_hpo_best)

    stem = self._current_file_stem()
    if show_iter:
        self.filename_hpo_iter = filename_hpo_intermediate.format(
            loc=self.params_dir, prefix=self.data_prefix,
            iter_num=self.nr_iteration, stem=stem)
        self._save_pickle(self.filename_hpo_iter, obj=params)

    # write the current best anyway
    self.filename_hpo_best = filename_hpo_result.format(
        loc=self.params_dir, prefix=self.data_prefix, stem=stem)
    self._save_pickle(self.filename_hpo_best, obj=params)
    # self.load_hyperparameters(self.filename_hpo_best)  # attempt to reload
    return True

def LoadData(self, data_configs, source='from_csv', prefix='sample'):
    """
    Load train/test data, refreshing from the deepest available cache level:
    3 = train/test cache, 2 = processed cache, 1 = raw HDF cache, 0 = CSV.
    """
    # initialize, reading in configs for the data provider itself
    configs_table = pd.DataFrame(self.provider_configs).T
    configs_table['level'] = configs_table['level'].astype(int)
    configs_table.set_index('level', inplace=True)
    configs_table['filename'] = configs_table['filename'].apply(
        lambda x: x.format(header=prefix) if isinstance(x, str) else None)

    provider_configs = self.provider_configs.get(
        source, self.provider_configs['from_csv']).copy()
    refresh_level = provider_configs.get('level')

    # load data at its refresh level
    filename = '{}/{}'.format(
        self.cache_path, configs_table.loc[refresh_level, 'filename'])
    if refresh_level == 3:
        logger.info("Load Train and Test from Cache")
        self.ReadTrainTestHDF(data_configs['input'], filename)
        if not AnyEmptyDataframe(self.xy_train_test):
            return self.ReturnTrainTest(self.xy_train_test)
        else:
            refresh_level = 2
            logger.warning(
                'No train_test cache to load. Try to refresh at level {}'.format(
                    refresh_level))
            filename = '{}/{}'.format(
                self.cache_path, configs_table.loc[refresh_level, 'filename'])

    if refresh_level == 2:
        logger.info("Recreate Train and Test")
        self.ReadProcessedHDF(data_configs['input'], filename)
        if AnyEmptyDataframe(self.data_processed):
            refresh_level = 1
            logger.warning(
                'No processed cache to load. Try to refresh at level {}'.format(
                    refresh_level))
            filename = '{}/{}'.format(
                self.cache_path, configs_table.loc[refresh_level, 'filename'])

    if refresh_level == 1:
        logger.info("Process DataFrames from HDF Cache")
        self.ReadRawHDF(data_configs['input'], filename, limited_by_configs=True)
        if AnyEmptyDataframe(self.data_raw):
            refresh_level = 0
            logger.warning(
                'No raw cache to load. Try to refresh at level {}'.format(
                    refresh_level))

    if refresh_level == 0:
        logger.info("Process DataFrames from CSV")
        self.ReadDataCSV(data_configs['input'])
        filename = '{}/{}'.format(
            self.cache_path, configs_table.loc[1, 'filename'])
        self.SaveFileHDF(filename, self.data_raw, opt_overwrite=True)

    # process data
    if refresh_level <= 1:
        logger.info("Process DataFrames")
        train_test = self._application_train_test(data_configs['application'])
        self.data_processed = {
            'application_train': train_test[0],
            'application_test': train_test[1],
        }
        self.data_processed.update({
            'bureau': self._bureau_and_balance(data_configs['bureau']),
            'previous_application': self._previous_application(
                data_configs['previous_application']),
            'pos_cash': self._pos_cash_balance(data_configs['pos_cash']),
            'credit_card_balance': self._credit_card_balance(
                data_configs['credit_card_balance']),
            'installments_payments': self._installments_payments(
                data_configs['installments_payments']),
        })
        # save processed
        filename = '{}/{}'.format(
            self.cache_path, configs_table.loc[2, 'filename'])
        self.SaveFileHDF(filename, self.data_processed, opt_overwrite=True)

    # create train and test
    if refresh_level <= 2:
        self.CreateTrainTestData(self.data_processed)
        filename = '{}/{}'.format(
            self.cache_path, configs_table.loc[3, 'filename'])
        self.SaveFileHDF(filename, self.xy_train_test, opt_overwrite=True)

    return self.ReturnTrainTest(self.xy_train_test)

def get_optimal_parameters(self):
    if not self.optimized_params:
        logger.warning('need to run optimize first')
    return self.optimized_params.copy()