def parse_command_line():
    default_cache_prefix = 'default'
    default_data_configs = 'default_data_configs.py'
    default_model_configs = 'default_model_configs.py'
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('-p', '--cache_prefix', type=str,
                            default=default_cache_prefix,
                            help='cache file prefix')
    arg_parser.add_argument('-d', '--data_configs', type=str,
                            default=default_data_configs,
                            help='data provider configs')
    arg_parser.add_argument('-m', '--model_configs', type=str,
                            default=default_model_configs,
                            help='model training configs')
    arg_parser.add_argument('--refresh_cache', action='store_true', default=False,
                            help='refresh cache from original data')
    arg_parser.add_argument('--debug', action='store_true', default=False,
                            help='debug mode using {} samples'.format(debug_nrows))
    arg_parser.add_argument('--training', action='store_true', default=False,
                            help='train the model')
    arg_parser.add_argument('--predict', action='store_true', default=False,
                            help='generate predictions')
    args = arg_parser.parse_args()

    configs_loc = file_dir_path.get('configs', './configs')
    args.data_configs = '{}/{}'.format(configs_loc, args.data_configs)
    args.model_configs = '{}/{}'.format(configs_loc, args.model_configs)

    logger.info('-' * 30)
    logger.info('running task with prefix={}'.format(args.cache_prefix))
    logger.info('running task with data configs={}'.format(args.data_configs))
    logger.info('running task with model configs={}'.format(args.model_configs))
    logger.info('Refreshing cache from original data?: {}'.format(args.refresh_cache))
    if args.debug:
        logger.warning('**Debug Mode**')
    return args

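# Illustrative invocations only -- the script name 'run.py' and the argument
# values below are assumptions, not taken from this module:
#   python run.py -p elo -d default_data_configs.py -m default_model_configs.py --training
#   python run.py -p elo --predict --debug
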
def load_and_transform_data(args, datatype):
    data_configs = initialize_configs(args.data_configs).data_configs
    dp = DataProvider(data_configs=data_configs,
                      cache_prefix=args.cache_prefix,
                      datatype=datatype,
                      debug=args.debug)
    if args.refresh_cache:
        x, y = dp.load_and_transform_data(source='from_csv')
    else:
        x, y = dp.load_and_transform_data(source='from_train_test')
    if args.debug:
        logger.warning('debug mode, x={}'.format(x.shape))
    else:
        logger.info('normal mode, x={}'.format(x.shape))
    return (x, y)

def predict(args):
    # load test data
    test, _ = load_and_transform_data(args, 'test')
    # load the best model saved by train()
    model = load_pickle('./model/best_model.pkl')
    submission_filename = '{}/{}'.format(file_dir_path.get('output', './output'),
                                         'elo.csv')
    # predict and build the submission file
    preds = model.predict(test.drop(labels=main_table_pk, axis=1))
    test['target'] = preds
    submission = test[['card_id', 'target']]
    submission.to_csv(submission_filename, index=False)
    logger.info(submission.head())

def load_data_from_csv(self):
    '''Load data from csv files.

    There must be a 'fact_train' entry for the main table of the training data
    and a 'fact_test' entry for the main table of the test data. When the
    datatype is 'train', 'fact_train' is loaded; otherwise 'fact_test' is
    loaded. csv files are loaded in chunks. The chunk size is determined by
    the variable chunksize in the config module.
    '''
    process_configs = self.data_configs['input']
    data_dict = {
        k: '{}/{}'.format(self.input_path, data.get('name', None))
        for k, data in process_configs.items()
    }
    if self.datatype == 'test':
        data_dict.pop('fact_train', None)
    else:
        data_dict.pop('fact_test', None)
    for k, f_path in data_dict.items():
        if not check_file_exist(f_path):
            logger.error("file {} doesn't exist!".format(f_path))
            continue
        df = pd.DataFrame()
        chunk_no = 0
        for df_chunk in pd.read_csv(f_path, chunksize=config.chunksize):
            df_chunk.reset_index(drop=True, inplace=True)
            chunk_no = chunk_no + 1
            logger.info('loading {} chunk(s) from {}, df_chunk shape={}'.format(
                chunk_no, f_path, df_chunk.shape))
            df_chunk = FeatureTransformer.reduce_mem_usage(df_chunk)
            df = pd.concat([df, df_chunk], ignore_index=True, sort=False)
            logger.info("{} records have been loaded in total".format(df.shape[0]))
            logger.info('memory usage on {} is {:.3f} MB'.format(
                k, df.memory_usage(index=True, deep=True).sum() / 1024.**2))
            if self.debug and df.shape[0] >= config.debug_nrows:
                break
        df = FeatureTransformer.reduce_mem_usage(df)
        if k == 'fact_train' or k == 'fact_test':
            k = 'fact'
        self.data_raw[k] = df
    return

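# A minimal sketch (assumed, for illustration only) of the 'input' section of
# data_configs that load_data_from_csv() expects: each key names a table and
# each 'name' is a csv file under input_path. The auxiliary table key and the
# file names below are hypothetical.
# data_configs['input'] = {
#     'fact_train': {'name': 'train.csv'},
#     'fact_test': {'name': 'test.csv'},
#     'transactions': {'name': 'transactions.csv'},
# }
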
def train(args):
    # load training data
    X, y = load_and_transform_data(args, 'train')
    train_x, test_x, train_y, test_y = train_test_split(
        X, y, test_size=0.2, random_state=40)
    # train the models
    model_configs = initialize_configs(args.model_configs).model_configs
    logger.info('load hpo configs of {} models'.format(len(model_configs)))
    rmse_best = 100
    for model_name, model_config in model_configs.items():
        logger.info('training model {}'.format(model_name))
        regression = model_config.get('model', None)(model_config.get('initialize'))
        param_grid = model_config.get('search_space')
        search = GridSearchCV(estimator=regression,
                              param_grid=param_grid,
                              cv=5,
                              scoring='neg_root_mean_squared_error',
                              refit=True,
                              n_jobs=-1,
                              verbose=True)
        search.fit(train_x, train_y)
        logger.info('best hyperparameters for {}: {}'.format(
            model_name, search.best_params_))
        score_train = search.score(train_x, train_y)
        score_test = search.score(test_x, test_y)
        logger.info('score: {:.2f}/{:.2f}'.format(score_train, score_test))
        pred_train = search.predict(train_x)
        pred_test = search.predict(test_x)
        rmse_train = np.sqrt(metrics.mean_squared_error(train_y, pred_train))
        rmse_test = np.sqrt(metrics.mean_squared_error(test_y, pred_test))
        logger.info('RMSE: {:.2f}/{:.2f}'.format(rmse_train, rmse_test))
        if rmse_test < rmse_best:
            logger.info('New test rmse benchmark! rmse_test:{}, rmse_best:{}'.format(
                rmse_test, rmse_best))
            rmse_best = rmse_test
            model_filename = './model/best_model.pkl'
            with open(model_filename, 'wb') as f:
                pickle.dump(search, f)
            logger.info('Model has been written to ' + model_filename)
    return

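# A minimal sketch (assumed shape, not the project's actual configs) of one
# model_configs entry consumed by train(): 'model' must be a callable that
# accepts the 'initialize' value and returns an estimator, and 'search_space'
# is the GridSearchCV param_grid. The Ridge example below is hypothetical.
# model_configs = {
#     'ridge': {
#         'model': lambda init: Ridge(**(init or {})),
#         'initialize': {'random_state': 40},
#         'search_space': {'alpha': [0.1, 1.0, 10.0]},
#     },
# }
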
def preprocess(self):
    '''Perform the preprocessing actions given by the data configs.

    It sequentially processes the value list whose dict key is
    'process_sequence'. For every dict keyed by an item of that list, it then
    processes the value list whose dict key is 'action_sequence'.
    Configurations of actions are documented in the module default_data_configs.
    Supported actions: aggregations, change_dtype, clip_outliers, drop_columns,
    drop_rows, factorize_columns, get_data, interaction_columns, kbins,
    onehot_encoding, pca, remove_duplicate, replace_values, result,
    select_columns, simple_impute, standardization.
    Note that this method matches those action names as base names, so you can
    use 'replace_values1' and 'replace_values2' in the same action_sequence.
    '''
    process_sequence = self.data_configs.get('process_sequence', [])
    for process_key in process_sequence:
        logger.info("processing {}......".format(process_key))
        process_configs = self.data_configs[process_key]
        action_sequence = process_configs.get('action_sequence', [])
        for action_key in action_sequence:
            if 'aggregations' in action_key:
                action_configs = process_configs.get(action_key, [])
                if action_configs:
                    df = FeatureTransformer.process_aggregation(df, action_configs)
            elif 'change_dtype' in action_key:
                action_configs = process_configs.get(action_key, [])
                if action_configs:
                    df = FeatureTransformer.process_change_dtype(
                        df, action_configs=action_configs)
            elif 'clip_outliers' in action_key:
                action_configs = process_configs.get(action_key, [])
                if action_configs:
                    df = self.feature_transformer.process_clip_outliers(
                        df, process_key, action_key, action_configs, self.datatype)
            elif 'drop_columns' in action_key:
                cols_to_drop = process_configs.get(action_key, [])
                if cols_to_drop:
                    df = FeatureTransformer.process_drop_columns(
                        df, drop_columns=cols_to_drop)
            elif 'drop_rows' in action_key:
                action_configs = process_configs.get(action_key, {})
                if action_configs:
                    df = FeatureTransformer.process_drop_rows(
                        df, action_configs=action_configs)
            elif 'factorize_columns' in action_key:
                action_configs = process_configs.get(action_key, [])
                if action_configs:
                    df = self.feature_transformer.process_factorize(
                        df, process_key, action_key, action_configs, self.datatype)
            elif 'get_data' in action_key:
                action_configs = process_configs.get(action_key, [])
                for v in action_configs:
                    if v.get('dict', '') == 'raw':
                        df_tmp = self.data_raw[v.get('key')].copy()
                    else:
                        df_tmp = self.data_processed[v.get('key')].copy()
                    how_to = v.get('how_to', '')
                    if how_to == 'first_table':
                        df = df_tmp.copy()
                    elif how_to == 'merge':
                        df = df.merge(df_tmp,
                                      how=v.get('how_to_merge'),
                                      left_on=v.get('left_on'),
                                      right_on=v.get('right_on'))
                        logger.info('After {} merge:{}'.format(
                            v.get('how_to_merge'), df.shape))
                    elif how_to == 'append':
                        df = pd.concat([df, df_tmp], sort=False, ignore_index=True)
                        logger.info("After append: {}".format(df.shape))
            elif 'interaction_columns' in action_key:
                action_configs = process_configs.get(action_key, [])
                if action_configs:
                    df = FeatureTransformer.process_interaction(
                        df, action_configs=action_configs)
            elif 'kbins' in action_key:
                action_configs = process_configs.get(action_key, [])
                if action_configs:
                    df = self.feature_transformer.process_kbins(
                        df, process_key, action_key, action_configs, self.datatype)
            elif 'onehot_encoding' in action_key:
                action_configs = process_configs.get(action_key, [])
                df = self.feature_transformer.process_one_hot_encoder(
                    df, process_key, action_key, self.datatype)
            elif 'pca' in action_key:
                action_configs = process_configs.get(action_key, [])
                df = self.feature_transformer.process_pca(
                    df, process_key, action_key, action_configs, self.datatype)
            elif 'remove_duplicate' in action_key:
                action_configs = process_configs.get(action_key, {})
                logger.info("df shape before removing duplicates:{}".format(df.shape))
                df = df[~df.duplicated(
                    subset=action_configs['duplicated_index_columns'],
                    keep=action_configs['keep'])]
                logger.info("df shape after removing duplicates:{}".format(df.shape))
            elif 'replace_values' in action_key:
                action_configs = process_configs.get(action_key, {})
                if action_configs:
                    df = FeatureTransformer.process_replace_values(
                        df, action_configs=action_configs)
            elif 'result' in action_key:
                action_configs = process_configs.get(action_key, [])
                for f in action_configs:
                    t = f.get('dict', '')
                    key = f.get('key', '')
                    include_columns = f.get('include_columns', '')
                    exclude_columns = f.get('exclude_columns', '')
                    if t == 'train_test':
                        if include_columns:
                            self.data_train_test[key] = \
                                FeatureTransformer.process_select_columns(
                                    df, include_columns)
                        elif exclude_columns:
                            self.data_train_test[key] = \
                                FeatureTransformer.process_drop_columns(
                                    df, exclude_columns)
                        else:
                            self.data_train_test[key] = df
                        self.data_train_test[key] = \
                            FeatureTransformer.reduce_mem_usage(
                                self.data_train_test[key])
                    else:
                        if include_columns:
                            self.data_processed[key] = \
                                FeatureTransformer.process_select_columns(
                                    df, include_columns)
                        elif exclude_columns:
                            self.data_processed[key] = \
                                FeatureTransformer.process_drop_columns(
                                    df, exclude_columns)
                        else:
                            self.data_processed[key] = df
                        self.data_processed[key] = \
                            FeatureTransformer.reduce_mem_usage(
                                self.data_processed[key])
            elif 'select_columns' in action_key:
                cols_to_select = process_configs.get(action_key, [])
                if cols_to_select:
                    df = FeatureTransformer.process_select_columns(
                        df, select_columns=cols_to_select)
            elif 'simple_impute' in action_key:
                action_configs = process_configs.get(action_key, [])
                if action_configs:
                    df = self.feature_transformer.process_simple_impute(
                        df, process_key, action_key, action_configs, self.datatype)
            elif 'standardization' in action_key:
                action_configs = process_configs.get(action_key, [])
                if action_configs:
                    df = self.feature_transformer.process_standardization(
                        df, process_key, action_key, action_configs, self.datatype)
            else:
                logger.error('This action {} is not supported!'.format(action_key))
    return

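# A minimal, hypothetical data_configs fragment showing the structure that
# preprocess() walks: 'process_sequence' lists process blocks to run in order,
# and each block lists its actions in 'action_sequence'. Table keys, column
# names and merge keys below are illustrative only.
# data_configs = {
#     'process_sequence': ['build_features'],
#     'build_features': {
#         'action_sequence': ['get_data1', 'drop_columns1', 'result1'],
#         'get_data1': [
#             {'dict': 'raw', 'key': 'fact', 'how_to': 'first_table'},
#             {'dict': 'raw', 'key': 'transactions', 'how_to': 'merge',
#              'how_to_merge': 'left', 'left_on': 'card_id', 'right_on': 'card_id'},
#         ],
#         'drop_columns1': ['first_active_month'],
#         'result1': [
#             {'dict': 'train_test', 'key': 'x'},
#             {'dict': 'train_test', 'key': 'y', 'include_columns': ['target']},
#         ],
#     },
# }
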
def load_and_transform_data(self, source):
    '''This is the method exposed to the caller.

    It is a 'controller' method: it invokes 'check_syntax' to check whether
    there are errors in the data configs, then decides where to load the data
    from (e.g. from csv or from cached files). When refresh_cache is False,
    data is loaded from the train_test cache first (level 3); if there are no
    files at level 3 it falls back to level 2, and so on. When refresh_cache
    is True, data is reloaded from csv (level 1) to refresh the cache from
    scratch. It then invokes 'preprocess' to do the feature engineering if
    there are no files at level 3 or refresh_cache is True. Finally it
    returns (x, y).

    Parameters:
        source (str): a key of data_refresh_configs that sets the refresh
            level, e.g. 'from_train_test' starts from the cached train/test
            data (level 3) while 'from_csv' rebuilds everything from the
            original csv files (level 1).

    Returns:
        tuple: the first item is x, the second item is y.
    '''
    configs_table = pd.DataFrame(self.data_refresh_configs).T
    configs_table['level'] = configs_table['level'].astype(int)
    configs_table.set_index('level', inplace=True)
    configs_table['filename'] = configs_table['filename'].apply(
        lambda x: x.format(prefix=self.cache_prefix, datatype=self.datatype)
        if isinstance(x, str) else None)
    self.feature_transformer_filename = self.feature_transformer_filename.format(
        cache_path=self.cache_path, prefix=self.cache_prefix)
    # check syntax of the config files
    self.check_syntax()
    refresh_level = self.data_refresh_configs.get(source).get('level')
    if refresh_level == 3:
        logger.info("loading data from train_test......")
        filename = '{}/{}'.format(self.cache_path,
                                  configs_table.loc[refresh_level, 'filename'])
        self.data_train_test = load_pickle(filename=filename)
        if FeatureTransformer.any_empty_dataframe(self.data_train_test):
            refresh_level = 2
            logger.warning(
                'No train_test cache for {} to load. Trying to refresh at level {}'
                .format(self.datatype, refresh_level))
            self.data_train_test = {}
        else:
            self.data_train_test['x'] = self.get_debug(self.data_train_test['x'])
            self.data_train_test['y'] = self.get_debug(self.data_train_test['y'])
    if refresh_level == 2:
        logger.info("loading data from raw......")
        filename = '{}/{}'.format(self.cache_path,
                                  configs_table.loc[refresh_level, 'filename'])
        self.data_raw = load_pickle(filename=filename)
        if FeatureTransformer.any_empty_dataframe(self.data_raw):
            refresh_level = 1
            logger.warning(
                'No raw cache for {} to load. Trying to refresh at level {}'
                .format(self.datatype, refresh_level))
            self.data_raw = {}
        else:
            for k, v in self.data_raw.items():
                self.data_raw[k] = self.get_debug(v)
    if refresh_level == 1:
        logger.info("loading data from csv......")
        self.load_data_from_csv()
        filename = '{}/{}'.format(self.cache_path, configs_table.loc[2, 'filename'])
        save_pickle(filename, self.data_raw)
    if refresh_level <= 2:
        if self.datatype == 'test':
            self.feature_transformer = load_pickle(
                filename=self.feature_transformer_filename)
            if self.feature_transformer is None:
                logger.error('No feature transformer for transforming test data!')
                exit(0)
        self.preprocess()
        filename = '{}/{}'.format(self.cache_path, configs_table.loc[3, 'filename'])
        save_pickle(filename, self.data_train_test)
        if self.datatype == 'train':
            save_pickle(self.feature_transformer_filename, self.feature_transformer)
    if self.datatype == 'train':
        self.data_train_test['x'] = FeatureTransformer.process_drop_columns(
            self.data_train_test['x'], config.main_table_pk)
    return (self.data_train_test['x'], self.data_train_test['y'])

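# A hypothetical data_refresh_configs illustrating the three cache levels this
# method walks: level 1 rebuilds from csv, level 2 uses the pickled raw tables,
# level 3 uses the pickled train_test result. The 'from_raw' key and the
# file-name patterns are assumptions; only 'from_csv' and 'from_train_test'
# appear as sources elsewhere in this code.
# data_refresh_configs = {
#     'from_csv':        {'level': 1, 'filename': None},
#     'from_raw':        {'level': 2, 'filename': '{prefix}_{datatype}_raw.pkl'},
#     'from_train_test': {'level': 3, 'filename': '{prefix}_{datatype}_train_test.pkl'},
# }
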