def parse_command_line():
    """Parse command-line arguments for the task runner.

    Rewrites ``args.data_configs`` and ``args.model_configs`` to full paths
    under the configs directory (taken from the module-level ``file_dir_path``
    mapping) and logs the effective settings.

    Returns:
        argparse.Namespace: parsed arguments.
    """
    default_cache_prefix = 'default'
    default_data_configs = 'default_data_configs.py'
    default_model_configs = 'default_model_configs.py'
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('-p', '--cache_prefix', type=str,
                            default=default_cache_prefix,
                            help='cache file prefix')
    arg_parser.add_argument('-d', '--data_configs', type=str,
                            default=default_data_configs,
                            help='data provider configs')
    arg_parser.add_argument('-m', '--model_configs', type=str,
                            default=default_model_configs,
                            help='model training configs')
    # Fixed typo in help text: 'orignal' -> 'original'.
    arg_parser.add_argument('--refresh_cache', action='store_true',
                            default=False,
                            help='refresh cache from original data')
    arg_parser.add_argument('--debug', action='store_true', default=False,
                            help='debug mode using {} samples'.format(debug_nrows))
    arg_parser.add_argument('--training', action='store_true', default=False,
                            help='training model')
    arg_parser.add_argument('--predict', action='store_true', default=False,
                            help='predicting')
    args = arg_parser.parse_args()

    # Resolve config file names relative to the configured configs directory.
    configs_loc = file_dir_path.get('configs', './configs')
    args.data_configs = '{}/{}'.format(configs_loc, args.data_configs)
    args.model_configs = '{}/{}'.format(configs_loc, args.model_configs)

    logger.info('-' * 30)
    logger.info('running task with prefix={}'.format(args.cache_prefix))
    logger.info('running task with data configs={}'.format(args.data_configs))
    logger.info('running task with model configs={}'.format(args.model_configs))
    logger.info('Refreshing cache from original data?:{}'.format(args.refresh_cache))
    if args.debug:
        logger.warning('**Debug Mode**')
    return args
def load_and_transform_data(args, datatype):
    """Build a DataProvider from *args* and return the loaded (x, y) pair.

    When ``args.refresh_cache`` is set, data is rebuilt from the CSV source;
    otherwise it is loaded from the cached train/test pickles.
    """
    configs = initialize_configs(args.data_configs).data_configs
    provider = DataProvider(data_configs=configs,
                            cache_prefix=args.cache_prefix,
                            datatype=datatype,
                            debug=args.debug)
    source = 'from_csv' if args.refresh_cache else 'from_train_test'
    x, y = provider.load_and_transform_data(source=source)
    if args.debug:
        logger.warning('debug mode,x={}'.format(x.shape))
    else:
        logger.info('normal mode,x={}'.format(x.shape))
    return (x, y)
def get_debug(self, df):
    '''Return a debug-sized slice of df, or df unchanged.

    Parameters:
        df(DataFrame) : the original DataFrame

    Returns:
        DataFrame : If debug mode is ON, the first
            min(config.debug_nrows, len(df)) rows of df.
            If debug mode is OFF, df itself.
    '''
    if not self.debug:
        return df
    n_rows = min(df.shape[0], config.debug_nrows)
    logger.warning('Debug mode, get {} records'.format(n_rows))
    return df.iloc[:n_rows]
def load_and_transform_data(self, source):
    '''This is the method exposed to caller.

    This is a 'controller' method: it invokes 'check_syntax' to check whether
    there're some errors in data configs. Then it decides where to load the
    sources (e.g. from csv or from cached files): when refresh_cache is set to
    False, data is loaded from train_test first (level=3), then data will be
    loaded from level 2 if there's no files in level 3, and so on. When
    refresh_cache is set to True, data will be reloaded from csv (level 1) to
    refresh the cache from scratch. Then it invokes 'preprocess' to do feature
    engineering if there's no files in level 3 or refresh_cache is set to True.
    Finally it returns (x, y).

    Parameters:
        source(str) : source='train' means load and transform training data.
            source='test' means loading and transforming testing data.

    Returns:
        tuple: the first item is X, the second item is Y.
    '''
    # Build a level-indexed table of cache filenames from the refresh configs.
    configs_table = pd.DataFrame(self.data_refresh_configs).T
    configs_table['level'] = configs_table['level'].astype(int)
    configs_table.set_index('level', inplace=True)
    configs_table['filename'] = configs_table['filename'].apply(
        lambda x: x.format(prefix=self.cache_prefix, datatype=self.datatype)
        if isinstance(x, str) else None)
    self.feature_transformer_filename = self.feature_transformer_filename.format(
        cache_path=self.cache_path, prefix=self.cache_prefix)

    # check syntax of config files
    self.check_syntax()
    refresh_level = self.data_refresh_configs.get(source).get('level')

    # Level 3: fully preprocessed train/test cache.
    if refresh_level == 3:
        logger.info("loading data from train_test......")
        filename = '{}/{}'.format(
            self.cache_path, configs_table.loc[refresh_level, 'filename'])
        self.data_train_test = load_pickle(filename=filename)
        if FeatureTransformer.any_empty_dataframe(self.data_train_test):
            # Cache miss: fall back one level and rebuild from raw.
            refresh_level = 2
            logger.warning(
                'No train_test cache for {} to load. Trying to refresh at level {}'
                .format(self.datatype, refresh_level))
            self.data_train_test = {}
        else:
            self.data_train_test['x'] = self.get_debug(
                self.data_train_test['x'])
            self.data_train_test['y'] = self.get_debug(
                self.data_train_test['y'])

    # Level 2: raw (un-preprocessed) cache.
    if refresh_level == 2:
        logger.info("loading data from raw......")
        filename = '{}/{}'.format(
            self.cache_path, configs_table.loc[refresh_level, 'filename'])
        self.data_raw = load_pickle(filename=filename)
        if FeatureTransformer.any_empty_dataframe(self.data_raw):
            # Cache miss: fall back to reloading from the CSV source.
            refresh_level = 1
            logger.warning(
                'No raw cache for {} to load. Trying to refresh at level {}'
                .format(self.datatype, refresh_level))
            self.data_raw = {}
        else:
            for k, v in self.data_raw.items():
                self.data_raw[k] = self.get_debug(v)

    # Level 1: original CSV files; refresh the level-2 cache afterwards.
    if refresh_level == 1:
        logger.info("loading data from csv......")
        self.load_data_from_csv()
        filename = '{}/{}'.format(self.cache_path,
                                  configs_table.loc[2, 'filename'])
        save_pickle(filename, self.data_raw)

    # Anything below level 3 still needs feature engineering.
    if refresh_level <= 2:
        if self.datatype == 'test':
            # Test data must reuse the transformer fitted on training data.
            self.feature_transformer = load_pickle(
                filename=self.feature_transformer_filename)
            # BUGFIX: was '== None' (use identity test) and 'exit(0)'
            # (exit code 0 signalled success on a fatal error path).
            if self.feature_transformer is None:
                logger.error(
                    'No feature transformer for transforming test data!')
                raise SystemExit(1)
        self.preprocess()
        filename = '{}/{}'.format(self.cache_path,
                                  configs_table.loc[3, 'filename'])
        save_pickle(filename, self.data_train_test)
        if self.datatype == 'train':
            save_pickle(self.feature_transformer_filename,
                        self.feature_transformer)

    if self.datatype == 'train':
        # Drop primary-key columns from the training features.
        self.data_train_test[
            'x'] = FeatureTransformer.process_drop_columns(
                self.data_train_test['x'], config.main_table_pk)
    return (self.data_train_test['x'], self.data_train_test['y'])