コード例 #1
0
ファイル: elo_predict.py プロジェクト: davidwang9527/12_elo
def parse_command_line():
    '''Parse the command-line options and log the resulting settings.

    The data/model config file names given on the command line are prefixed
    with the configs directory, taken from file_dir_path['configs'] and
    falling back to './configs' when that key is absent.

    Returns:
        argparse.Namespace : the parsed options, with data_configs and
          model_configs rewritten to '<configs dir>/<file name>'.
    '''
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Options carrying a string value: (short flag, long flag, default, help).
    valued_options = (
        ('-p', '--cache_prefix', 'default', 'cache file prefix'),
        ('-d', '--data_configs', 'default_data_configs.py', 'data provider configs'),
        ('-m', '--model_configs', 'default_model_configs.py', 'model training configs'),
    )
    for short_flag, long_flag, fallback, description in valued_options:
        parser.add_argument(short_flag, long_flag, type=str, default=fallback, help=description)

    # Boolean switches, all off unless given on the command line: (flag, help).
    switches = (
        ('--refresh_cache', 'refresh cache from orignal data'),
        ('--debug', 'debug mode using {} samples'.format(debug_nrows)),
        ('--training', 'training model'),
        ('--predict', 'predicting'),
    )
    for flag, description in switches:
        parser.add_argument(flag, action='store_true', default=False, help=description)

    args = parser.parse_args()

    # Turn the bare config file names into paths below the configs directory.
    configs_dir = file_dir_path.get('configs', './configs')
    for option in ('data_configs', 'model_configs'):
        setattr(args, option, '{}/{}'.format(configs_dir, getattr(args, option)))

    summary = (
        '-' * 30,
        'running task with prefix={}'.format(args.cache_prefix),
        'running task with data configs={}'.format(args.data_configs),
        'running task with model configs={}'.format(args.model_configs),
        'Refreshing cache from original data?:{}'.format(args.refresh_cache),
    )
    for line in summary:
        logger.info(line)

    if args.debug:
        logger.warning('**Debug Mode**')
    return args
コード例 #2
0
ファイル: elo_predict.py プロジェクト: davidwang9527/12_elo
def load_and_transform_data(args,datatype):
    '''Build a DataProvider from the parsed options and return its (x, y).

    Parameters:
        args : parsed command-line options; data_configs, cache_prefix,
               refresh_cache and debug are read from it.
        datatype : forwarded unchanged to DataProvider as `datatype`.

    Returns:
        tuple : (x, y) as returned by DataProvider.load_and_transform_data.
    '''
    provider = DataProvider(
        data_configs=initialize_configs(args.data_configs).data_configs,
        cache_prefix=args.cache_prefix,
        datatype=datatype,
        debug=args.debug,
    )

    # A cache refresh starts again from the csv files; otherwise start from
    # the train_test source.
    source = 'from_csv' if args.refresh_cache else 'from_train_test'
    features, target = provider.load_and_transform_data(source=source)

    # Same shape report in both modes, only the severity and wording differ.
    if args.debug:
        report, template = logger.warning, 'debug mode,x={}'
    else:
        report, template = logger.info, 'normal mode,x={}'
    report(template.format(features.shape))

    return (features, target)
コード例 #3
0
    def get_debug(self, df):
        '''Limit df to the debugging sample when debug mode is ON.

        Parameters:
            df(DataFrame) : the original DataFrame

        Returns:
            DataFrame : If debug mode is OFF, df itself. If debug mode is ON,
              the leading rows of df, where the row count is the smaller of
              config.debug_nrows and the number of rows in df.
        '''
        # Normal mode: hand the frame back untouched.
        if not self.debug:
            return df

        row_limit = min(df.shape[0], config.debug_nrows)
        logger.warning('Debug mode, get {} records'.format(row_limit))
        return df.iloc[:row_limit]
コード例 #4
0
    def load_and_transform_data(self, source):
        '''Load the data starting at the configured cache level and return (x, y).

            This is the 'controller' method exposed to callers. It resolves the
            cache file names, invokes 'check_syntax' on the config files, and
            then loads data starting from the level configured for `source` in
            self.data_refresh_configs:

              * level 3: the pickled train_test cache. If
                FeatureTransformer.any_empty_dataframe flags the loaded
                content, fall back to level 2.
              * level 2: the pickled raw cache. If
                FeatureTransformer.any_empty_dataframe flags the loaded
                content, fall back to level 1.
              * level 1: reload through 'load_data_from_csv' and rewrite the
                level 2 cache file.

            Whenever the data did not come from the level 3 cache (final
            level <= 2), 'preprocess' is invoked and the level 3 cache file is
            rewritten. For datatype 'train', self.feature_transformer is
            pickled afterwards; for datatype 'test', the pickled transformer
            is loaded before 'preprocess' and the process is terminated when
            none can be loaded.

            For datatype 'train', x is additionally passed through
            FeatureTransformer.process_drop_columns with config.main_table_pk
            before being returned.

        Parameters:
            source(str) : a key of self.data_refresh_configs; the 'level'
                          entry stored under that key selects the starting
                          level (e.g. 'from_csv' or 'from_train_test').

        Returns:
             tuple: the first item is X, the second item is Y.

        Raises:
            SystemExit: with exit status 1 when datatype is 'test' and no
                        feature transformer could be loaded.
        '''
        # Index the refresh configs by integer level and fill the
        # prefix/datatype placeholders of every cache file name.
        configs_table = pd.DataFrame(self.data_refresh_configs).T
        configs_table['level'] = configs_table['level'].astype(int)
        configs_table.set_index('level', inplace=True)
        configs_table['filename'] = configs_table['filename'].apply(
            lambda x: x.format(prefix=self.cache_prefix,
                               datatype=self.datatype)
            if isinstance(x, str) else None)

        self.feature_transformer_filename = self.feature_transformer_filename.format(
            cache_path=self.cache_path, prefix=self.cache_prefix)

        #check syntax of config files
        self.check_syntax()

        refresh_level = self.data_refresh_configs.get(source).get('level')

        # The three checks below are deliberately independent `if`s rather
        # than an if/elif chain: a failed load lowers refresh_level so that
        # the next check picks it up.
        if refresh_level == 3:
            logger.info("loading data from train_test......")
            filename = '{}/{}'.format(
                self.cache_path, configs_table.loc[refresh_level, 'filename'])
            self.data_train_test = load_pickle(filename=filename)
            if FeatureTransformer.any_empty_dataframe(self.data_train_test):
                refresh_level = 2
                logger.warning(
                    'No train_test cache for {}  to load. Trying to refresh at level {}'
                    .format(self.datatype, refresh_level))
                self.data_train_test = {}
            else:
                self.data_train_test['x'] = self.get_debug(
                    self.data_train_test['x'])
                self.data_train_test['y'] = self.get_debug(
                    self.data_train_test['y'])

        if refresh_level == 2:
            logger.info("loading data from raw......")
            filename = '{}/{}'.format(
                self.cache_path, configs_table.loc[refresh_level, 'filename'])
            self.data_raw = load_pickle(filename=filename)
            if FeatureTransformer.any_empty_dataframe(self.data_raw):
                refresh_level = 1
                logger.warning(
                    'No raw cache for {} to load. Trying to refresh at level {}'
                    .format(self.datatype, refresh_level))
                self.data_raw = {}
            else:
                for k, v in self.data_raw.items():
                    self.data_raw[k] = self.get_debug(v)

        if refresh_level == 1:
            logger.info("loading data from csv......")
            self.load_data_from_csv()
            # Rewrite the level 2 (raw) cache file with what was just loaded.
            filename = '{}/{}'.format(self.cache_path,
                                      configs_table.loc[2, 'filename'])
            save_pickle(filename, self.data_raw)

        if refresh_level <= 2:
            if self.datatype == 'test':
                self.feature_transformer = load_pickle(
                    filename=self.feature_transformer_filename)
                # Identity check: equality against None could be intercepted
                # by a custom __eq__ on the loaded object.
                if self.feature_transformer is None:
                    logger.error(
                        'No feature transformer for transforming test data!')
                    # Fail with a non-zero status so the calling shell sees
                    # the error; SystemExit is raised directly because the
                    # `exit` helper is only installed by the site module.
                    raise SystemExit(1)
            self.preprocess()
            # Rewrite the level 3 (train_test) cache file.
            filename = '{}/{}'.format(self.cache_path,
                                      configs_table.loc[3, 'filename'])
            save_pickle(filename, self.data_train_test)
            if self.datatype == 'train':
                save_pickle(self.feature_transformer_filename,
                            self.feature_transformer)

        if self.datatype == 'train':
            self.data_train_test[
                'x'] = FeatureTransformer.process_drop_columns(
                    self.data_train_test['x'], config.main_table_pk)

        return (self.data_train_test['x'], self.data_train_test['y'])