def predict(experiment_dir, config_map, predict_meta_file):
    """Build the predict_all.py command line and execute it via util.run().

    Args:
        experiment_dir: experiment directory from which the result, data,
            model and imputer locations are derived.
        config_map: experiment config; the keys read here are
            'use_classification', 'train_window', 'predict_window' and
            'delay_window' (plus whatever getModelName() consumes).
        predict_meta_file: path forwarded as --predict_meta_file.
    """
    results_root = getResultDir(experiment_dir)
    util.maybeMakeDir(results_root)
    output_path = getResultPath(results_root)

    inputs_root = getDataDir(experiment_dir)
    features_path = getDataPath(inputs_root)
    # The label file depends on the 'use_classification' switch.
    labels_path = (getLabelPath(inputs_root)
                   if config_map['use_classification']
                   else getRlabelPath(inputs_root))
    meta_path = getMetaPath(inputs_root)

    models_root = getModelDir(experiment_dir)
    imputers_root = getImputerDir(experiment_dir)

    # Model and imputer file names share the same train-window suffix.
    model_tag = '%s-' % getModelName(config_map)
    window_tag = '-%d' % config_map['train_window']

    template = ''.join((
        '%s/predict_all.py --data_file=%s --label_file=%s ',
        '--meta_file=%s --model_dir=%s --model_prefix="%s" ',
        '--model_suffix="%s" --imputer_dir=%s --imputer_prefix="%s" ',
        '--imputer_suffix="%s" --prediction_window=%d ',
        '--delay_window=%d --predict_meta_file=%s --result_file=%s',
    ))
    arguments = (
        CODE_DIR,
        features_path,
        labels_path,
        meta_path,
        models_root,
        model_tag,
        window_tag,
        imputers_root,
        'imputer-',
        window_tag,
        config_map['predict_window'],
        config_map['delay_window'],
        predict_meta_file,
        output_path,
    )
    util.run(template % arguments)
def predict(experiment_dir, config_map, predict_meta_file):
    """Assemble the predict_all_2.py invocation and hand it to util.run().

    Args:
        experiment_dir: experiment directory; the result, data, model and
            imputer paths are all derived from it.
        config_map: experiment config. Reads 'use_classification',
            'train_window', 'predict_window' and 'delay_window'; it is also
            passed to getModelName().
        predict_meta_file: value of the --predict_meta_file flag.
    """
    out_dir = getResultDir(experiment_dir)
    util.maybeMakeDir(out_dir)
    out_file = getResultPath(out_dir)

    src_dir = getDataDir(experiment_dir)
    src_file = getDataPath(src_dir)
    if config_map['use_classification']:
        target_file = getLabelPath(src_dir)
    else:
        target_file = getRlabelPath(src_dir)
    src_meta = getMetaPath(src_dir)

    trained_dir = getModelDir(experiment_dir)
    fill_dir = getImputerDir(experiment_dir)

    # Prefix/suffix pairs let the script locate model and imputer files.
    trained_prefix = '%s-' % getModelName(config_map)
    trained_suffix = '-%d' % config_map['train_window']
    fill_prefix = 'imputer-'
    fill_suffix = '-%d' % config_map['train_window']

    flags = (
        '%s/predict_all_2.py --data_file=%s --label_file=%s '
        '--meta_file=%s --model_dir=%s --model_prefix="%s" '
        '--model_suffix="%s" --imputer_dir=%s --imputer_prefix="%s" '
        '--imputer_suffix="%s" --prediction_window=%d '
        '--delay_window=%d --predict_meta_file=%s --result_file=%s')
    values = (
        CODE_DIR, src_file, target_file,
        src_meta, trained_dir, trained_prefix,
        trained_suffix, fill_dir, fill_prefix,
        fill_suffix, config_map['predict_window'],
        config_map['delay_window'], predict_meta_file, out_file,
    )
    util.run(flags % values)
def collectData(experiment_dir, config_map):
    """Run collect_data.py to produce the experiment's data files.

    The command is given output paths for the data, label, rlabel, meta and
    weight files under the experiment's data directory, which is created
    first if needed.

    Args:
        experiment_dir: experiment directory; the data directory and the
            feature list path are derived from it.
        config_map: experiment config. Reads 'label', 'predict_date_file',
            'max_neg', 'min_pos', 'min_date', 'max_date', 'feature_window',
            'min_feature_perc' and 'weight_power'.
    """
    out_dir = getDataDir(experiment_dir)
    util.maybeMakeDir(out_dir)

    gains = getLabelDir(config_map['label'])
    features = getFeatureListPath(experiment_dir)

    # Output files, all located under the data directory.
    out_data = getDataPath(out_dir)
    out_label = getLabelPath(out_dir)
    out_rlabel = getRlabelPath(out_dir)
    out_meta = getMetaPath(out_dir)
    out_weight = getWeightPath(out_dir)

    dates_path = getDatePath(config_map['predict_date_file'])

    template = ''.join((
        '%s/collect_data.py --gain_dir=%s --max_neg=%f --min_pos=%f ',
        '--feature_base_dir=%s --feature_list=%s --feature_stats=%s ',
        '--min_date=%s --max_date=%s --window=%d --min_feature_perc=%f ',
        '--data_file=%s --label_file=%s --rlabel_file=%s --meta_file=%s ',
        '--weight_power=%f --weight_file=%s --date_file=%s',
    ))
    arguments = (
        CODE_DIR,
        gains,
        config_map['max_neg'],
        config_map['min_pos'],
        FEATURE_DIR,
        features,
        FEATURE_STATS_FILE,
        config_map['min_date'],
        config_map['max_date'],
        config_map['feature_window'],
        config_map['min_feature_perc'],
        out_data,
        out_label,
        out_rlabel,
        out_meta,
        config_map['weight_power'],
        out_weight,
        dates_path,
    )
    util.run(template % arguments)
def runExperiment(config_file):
    """Run every stage of the experiment described by config_file.

    The experiment name is the config file's base name with CONFIG_SUFFIX
    stripped. Each stage is guarded by util.checkDone()/util.markDone(), so
    a stage already marked done is not executed again.

    Args:
        config_file: path to the experiment config; must end with
            CONFIG_SUFFIX and contain a '/' that is not its first character.
    """
    assert config_file.endswith(CONFIG_SUFFIX)
    slash = config_file.rfind('/')
    assert slash > 0
    experiment = config_file[slash + 1:-len(CONFIG_SUFFIX)]
    experiment_dir = '%s/%s' % (EXPERIMENT_BASE_DIR, experiment)
    util.maybeMakeDir(experiment_dir)
    config_map = getConfig(config_file)
    makeFeatureList(experiment_dir, config_map)

    def runOnce(step_pattern, action):
        # Execute action only when its step has not been marked done yet.
        # action is a thunk so its arguments are evaluated lazily.
        step = step_pattern % experiment
        if util.checkDone(step):
            return
        action()
        util.markDone(step)

    runOnce('%s_collect_data',
            lambda: collectData(experiment_dir, config_map))
    runOnce('%s_collect_label',
            lambda: collectLabels(experiment_dir, config_map))

    data_dir = getDataDir(experiment_dir)
    # For classification, negative labels indicate gain between max_neg and
    # min_pos and should be removed from training (but not prediction).
    # Negative regression labels are not filtered out, hence None.
    label_file = (getLabelPath(data_dir)
                  if config_map['use_classification'] else None)
    train_meta_file = getTrainingMetaPath(data_dir)
    predict_meta_file = getPredictionMetaPath(data_dir)

    runOnce('%s_filter_train',
            lambda: filterMetadata(experiment_dir, config_map,
                                   config_map['train_filter'], label_file,
                                   train_meta_file))
    runOnce('%s_filter_predict',
            lambda: filterMetadata(experiment_dir, config_map,
                                   config_map['predict_filter'], None,
                                   predict_meta_file))
    runOnce('%s_train_models',
            lambda: trainModels(experiment_dir, config_map, train_meta_file))
    runOnce('%s_predict',
            lambda: predict(experiment_dir, config_map, predict_meta_file))
    runOnce('%s_analyze',
            lambda: analyze(experiment_dir, config_map))
def runExperiment(config_file):
    """Drive the experiment pipeline for the given config file.

    Stages run in a fixed order: collect data, collect labels, filter
    training metadata, filter prediction metadata, train models, predict,
    analyze. A stage is skipped when util.checkDone() reports it finished,
    and is recorded with util.markDone() once it completes.

    Args:
        config_file: path to the experiment config; must end with
            CONFIG_SUFFIX and contain a '/' that is not its first character.
    """
    assert config_file.endswith(CONFIG_SUFFIX)
    sep_index = config_file.rfind('/')
    assert sep_index > 0
    experiment = config_file[sep_index + 1:-len(CONFIG_SUFFIX)]
    experiment_dir = '%s/%s' % (EXPERIMENT_BASE_DIR, experiment)
    util.maybeMakeDir(experiment_dir)
    config_map = getConfig(config_file)
    makeFeatureList(experiment_dir, config_map)

    # Each entry pairs a step-name pattern with a thunk, so nothing about a
    # stage is evaluated unless the stage actually has to run.
    collection_stages = (
        ('%s_collect_data',
         lambda: collectData(experiment_dir, config_map)),
        ('%s_collect_label',
         lambda: collectLabels(experiment_dir, config_map)),
    )
    for pattern, stage in collection_stages:
        step = pattern % experiment
        if not util.checkDone(step):
            stage()
            util.markDone(step)

    data_dir = getDataDir(experiment_dir)
    if config_map['use_classification']:
        # For classification, negative labels indicate gain between
        # max_neg and min_pos and should be removed from training
        # (but not prediction).
        label_file = getLabelPath(data_dir)
    else:
        # Do not filter out negative regression labels.
        label_file = None
    train_meta_file = getTrainingMetaPath(data_dir)
    predict_meta_file = getPredictionMetaPath(data_dir)

    modelling_stages = (
        ('%s_filter_train',
         lambda: filterMetadata(experiment_dir, config_map,
                                config_map['train_filter'], label_file,
                                train_meta_file)),
        ('%s_filter_predict',
         lambda: filterMetadata(experiment_dir, config_map,
                                config_map['predict_filter'], None,
                                predict_meta_file)),
        ('%s_train_models',
         lambda: trainModels(experiment_dir, config_map, train_meta_file)),
        ('%s_predict',
         lambda: predict(experiment_dir, config_map, predict_meta_file)),
        ('%s_analyze',
         lambda: analyze(experiment_dir, config_map)),
    )
    for pattern, stage in modelling_stages:
        step = pattern % experiment
        if not util.checkDone(step):
            stage()
            util.markDone(step)
def analyze(experiment_dir, config_map):
    """Run analyze_all.py (with --skip_trans) on the experiment's results.

    Args:
        experiment_dir: experiment directory; the analyze directory (created
            if needed) and the result file are derived from it.
        config_map: experiment config, passed to getMarketGainPath().
    """
    market_gains = getMarketGainPath(config_map)

    report_dir = getAnalyzeDir(experiment_dir)
    util.maybeMakeDir(report_dir)

    predictions = getResultPath(getResultDir(experiment_dir))

    template = ''.join((
        '%s/analyze_all.py --result_file=%s --skip_trans ',
        '--analyze_dir=%s --market_gain_file=%s',
    ))
    util.run(template % (CODE_DIR, predictions, report_dir, market_gains))
def analyze(experiment_dir, config_map):
    """Run analyze_all.py on the experiment's result file.

    config_map['predict_window'] is passed as --hold_period.

    Args:
        experiment_dir: experiment directory; the analyze directory (created
            if needed) and the result file are derived from it.
        config_map: experiment config; reads 'predict_window' and is passed
            to getMarketGainPath().
    """
    market_gains = getMarketGainPath(config_map)

    report_dir = getAnalyzeDir(experiment_dir)
    util.maybeMakeDir(report_dir)

    predictions = getResultPath(getResultDir(experiment_dir))

    template = ''.join((
        '%s/analyze_all.py --result_file=%s --hold_period=%d ',
        '--analyze_dir=%s --market_gain_file=%s',
    ))
    arguments = (
        CODE_DIR,
        predictions,
        config_map['predict_window'],
        report_dir,
        market_gains,
    )
    util.run(template % arguments)
def analyze(experiment_dir, config_map):
    """Invoke analyze_all.py for this experiment via util.run().

    The hold period handed to the script is config_map['predict_window'].

    Args:
        experiment_dir: experiment directory used to locate the result file
            and the analyze directory (which is created when needed).
        config_map: experiment config; reads 'predict_window' and is passed
            to getMarketGainPath().
    """
    gain_path = getMarketGainPath(config_map)
    output_dir = getAnalyzeDir(experiment_dir)
    util.maybeMakeDir(output_dir)
    results_dir = getResultDir(experiment_dir)
    results_path = getResultPath(results_dir)

    command_parts = [
        '%s/analyze_all.py --result_file=%s --hold_period=%d '
        % (CODE_DIR, results_path, config_map['predict_window']),
        '--analyze_dir=%s --market_gain_file=%s'
        % (output_dir, gain_path),
    ]
    util.run(''.join(command_parts))
def collectData(experiment_dir, config_map):
    """Run collect_data.py to write the experiment's data and meta files.

    Args:
        experiment_dir: experiment directory; the data directory (created if
            needed) and the feature list path are derived from it.
        config_map: experiment config. Reads 'label', 'min_date',
            'max_date', 'feature_window' and 'min_feature_perc'.
    """
    out_dir = getDataDir(experiment_dir)
    util.maybeMakeDir(out_dir)

    gains = getLabelDir(config_map['label'])
    features = getFeatureListPath(experiment_dir)
    out_data = getDataPath(out_dir)
    out_meta = getMetaPath(out_dir)

    template = ''.join((
        '%s/collect_data.py --gain_dir=%s --feature_base_dir=%s ',
        '--feature_list=%s --feature_stats=%s --min_date=%s --max_date=%s ',
        '--window=%d --min_feature_perc=%f --data_file=%s --meta_file=%s',
    ))
    arguments = (
        CODE_DIR,
        gains,
        FEATURE_DIR,
        features,
        FEATURE_STATS_FILE,
        config_map['min_date'],
        config_map['max_date'],
        config_map['feature_window'],
        config_map['min_feature_perc'],
        out_data,
        out_meta,
    )
    util.run(template % arguments)
def collectData(experiment_dir, config_map):
    """Assemble the collect_data.py command line and execute it.

    Args:
        experiment_dir: experiment directory used to derive the data
            directory (created when needed) and the feature list path.
        config_map: experiment config. Reads 'label', 'min_date',
            'max_date', 'feature_window' and 'min_feature_perc'.
    """
    target_dir = getDataDir(experiment_dir)
    util.maybeMakeDir(target_dir)
    label_dir = getLabelDir(config_map['label'])
    feature_list_path = getFeatureListPath(experiment_dir)
    data_path = getDataPath(target_dir)
    meta_path = getMetaPath(target_dir)

    flags = (
        '%s/collect_data.py --gain_dir=%s --feature_base_dir=%s '
        '--feature_list=%s --feature_stats=%s --min_date=%s --max_date=%s '
        '--window=%d --min_feature_perc=%f --data_file=%s --meta_file=%s')
    values = (
        CODE_DIR, label_dir, FEATURE_DIR,
        feature_list_path, FEATURE_STATS_FILE,
        config_map['min_date'], config_map['max_date'],
        config_map['feature_window'], config_map['min_feature_perc'],
        data_path, meta_path,
    )
    util.run(flags % values)
if logDo('project_yahoo'): cmd = ( '%s/project_yahoo.py --raw_dir=%s --trading_day_file=%s ' '--projected_dir=%s' % (CODE_DIR, YAHOO_SF1_DIR, YAHOO_TRADING_DAY_FILE, YAHOO_PROJECTED_DIR)) run(cmd, 'project_yahoo') if logDo('adjust_yahoo'): cmd = '%s/adjust_yahoo.py --yahoo_dir=%s --output_dir=%s' % ( CODE_DIR, YAHOO_PROJECTED_DIR, YAHOO_ADJUSTED_DIR) run(cmd, 'adjust_yahoo') if logDo('compute_rolling_window_volumed'): output_dir = '%s/volumed_mean_%d' % (YAHOO_ADJUSTED_DIR, VOLUMED_K) util.maybeMakeDir(output_dir) cmd = ('%s/compute_rolling_window_feature.py --input_dir=%s/volumed ' '--window=%d --method=mean --output_dir=%s' % (CODE_DIR, YAHOO_ADJUSTED_DIR, VOLUMED_K, output_dir)) run(cmd, 'compute_rolling_window_volumed') if logDo('compute_window_features'): cmd = ('%s/compute_window_features.py --adjusted_dir=%s ' '--feature_base_dir=%s --computer=%s/compute_window_feature.py') % ( CODE_DIR, YAHOO_ADJUSTED_DIR, FEATURE_DIR, CODE_DIR) run(cmd, 'compute_window_features') if logDo('compute_basic_features'): cmd = ('%s/compute_basic_features.py --processed_dir=%s --ticker_file=%s ' '--feature_base_dir=%s --info_dir=%s ' '--computer=%s/compute_basic_feature.py') % (
if logDo('get_yahoo_logadjprice'): cmd = ('%s/get_price_volume.py --processed_dir=%s --column=adjprice ' '--take_log --output_dir=%s' % ( CODE_DIR, YAHOO_PROCESSED_DIR, YAHOO_LOGADJPRICE_DIR)) run(cmd, 'get_yahoo_logadjprice') if logDo('get_yahoo_logadjvolume'): cmd = ('%s/get_price_volume.py --processed_dir=%s --column=adjvolume ' '--take_log --output_dir=%s' % ( CODE_DIR, YAHOO_PROCESSED_DIR, YAHOO_LOGADJVOLUME_DIR)) run(cmd, 'get_yahoo_logadjvolume') if logDo('get_eod_gain_feature'): for k in GAIN_K_LIST: gain_dir = '%s/%d' % (EOD_GAIN_DIR, k) util.maybeMakeDir(gain_dir) cmd = '%s/compute_gain.py --price_dir=%s --k=%d --gain_dir=%s' % ( CODE_DIR, EOD_ADJPRICE_DIR, k, gain_dir) run(cmd) markDone('get_eod_gain_feature') if logDo('get_yahoo_gain_feature'): for k in GAIN_K_LIST: gain_dir = '%s/%d' % (YAHOO_GAIN_DIR, k) util.maybeMakeDir(gain_dir) cmd = '%s/compute_gain.py --price_dir=%s --k=%d --gain_dir=%s' % ( CODE_DIR, YAHOO_ADJPRICE_DIR, k, gain_dir) run(cmd) markDone('get_yahoo_gain_feature') if logDo('get_membership'):
def trainModels(experiment_dir, config_map, train_meta_file):
    """Train one model per date and record classification stats.

    Dates are read (one per line) from the file that
    config_map['train_date_file'] resolves to, sorted, and every date that
    compares less than config_map['start_date'] is skipped. For each
    remaining date train_model.py is run through util.run(). When the run
    produced a model file and config_map['use_classification'] is set, the
    model is scored with evaluateModel() and one tab-separated row is
    written to the stats file.

    Args:
        experiment_dir: experiment directory; the data, model and imputer
            directories are derived from it (the latter two are created if
            needed).
        config_map: experiment config. Reads 'train_date_file',
            'use_classification', 'use_weight', 'start_date',
            'train_window', 'model_spec', 'train_perc' and
            'imputer_strategy'.
        train_meta_file: path forwarded as --train_meta_file.
    """
    date_file = getDatePath(config_map['train_date_file'])
    with open(date_file, 'r') as fp:
        dates = sorted(fp.read().splitlines())

    data_dir = getDataDir(experiment_dir)
    data_file = getDataPath(data_dir)
    if config_map['use_classification']:
        label_file = getLabelPath(data_dir)
    else:
        label_file = getRlabelPath(data_dir)
    meta_file = getMetaPath(data_dir)

    model_dir = getModelDir(experiment_dir)
    util.maybeMakeDir(model_dir)
    imputer_dir = getImputerDir(experiment_dir)
    util.maybeMakeDir(imputer_dir)

    stats_file = getStatsPath(experiment_dir, config_map)

    # Weight flags are only passed when weighting is enabled.
    weight_args = ''
    if config_map['use_weight']:
        weight_args = '--weight_file=%s --tmp_weight_file=%s' % (
            getWeightPath(data_dir), TMP_WEIGHT_FILE)

    with open(stats_file, 'w') as fp:
        # Keep in sync with evaluateModel(): the column names are the keys
        # looked up in its result. The percentile columns are generated from
        # EVAL_PERCS, exactly like the data rows below, so header and rows
        # cannot drift apart.
        header = ['date', 'f1', 'auc']
        for perc in EVAL_PERCS:
            header.append('%dperc-precision' % perc)
            header.append('%dperc-recall' % perc)
        # fp.write() emits the same bytes as the Python 2 `print >> fp`
        # statement and, unlike it, also works on Python 3.
        fp.write('\t'.join(header) + '\n')

        for date in dates:
            if date < config_map['start_date']:
                continue
            model_file = getModelPath(model_dir, date, config_map)
            imputer_file = getImputerPath(imputer_dir, date, config_map)
            cmd = (
                '%s/train_model.py --data_file=%s --label_file=%s '
                '--meta_file=%s %s '
                '--date=%s --months=%d --model_def="%s" --perc=%f '
                '--model_file=%s '
                '--train_meta_file=%s --tmp_data_file=%s '
                '--tmp_label_file=%s '
                '--imputer_strategy=%s --imputer_file=%s' % (
                    CODE_DIR, data_file, label_file, meta_file, weight_args,
                    date, config_map['train_window'],
                    config_map['model_spec'], config_map['train_perc'],
                    model_file, train_meta_file, TMP_DATA_FILE,
                    TMP_LABEL_FILE, config_map['imputer_strategy'],
                    imputer_file))
            util.run(cmd)

            # No model file after the run: nothing to evaluate for this date.
            if not os.path.isfile(model_file):
                continue
            # Stats rows are only produced for classification experiments.
            if not config_map['use_classification']:
                continue

            result = evaluateModel(model_file, imputer_file,
                                   TMP_DATA_FILE, TMP_LABEL_FILE)
            # Keep in sync with evaluateModel().
            values = [date, '%.4f' % result['f1'], '%.4f' % result['auc']]
            for perc in EVAL_PERCS:
                values.append('%.4f' % result['%dperc-precision' % perc])
                values.append('%.4f' % result['%dperc-recall' % perc])
            fp.write('\t'.join(values) + '\n')
            # Flush per row so partial results are on disk while later
            # dates are still being trained.
            fp.flush()
def trainModels(experiment_dir, config_map, train_meta_file):
    """Train one model per month and record classification stats.

    Months are generated from config_map['start_date'] up to and including
    config_map['end_date']; the code parses each value as a 4-digit year
    followed by the month number. For each month train_model.py is run
    through util.run(). When the run produced a model file and
    config_map['use_classification'] is set, the model is scored with
    evaluateModel() and one tab-separated row is written to the stats file.

    Args:
        experiment_dir: experiment directory; the data, model and imputer
            directories are derived from it (the latter two are created if
            needed).
        config_map: experiment config. Reads 'start_date', 'end_date',
            'use_classification', 'use_weight', 'train_window',
            'model_spec', 'train_perc' and 'imputer_strategy'.
        train_meta_file: path forwarded as --train_meta_file.
    """
    # Enumerate consecutive months, rolling over to January of the next
    # year after month 12.
    dates = []
    date = config_map['start_date']
    while date <= config_map['end_date']:
        dates.append(date)
        year = int(date[:4])
        month = int(date[4:])
        if month < 12:
            month += 1
        else:
            month = 1
            year += 1
        date = '%04d%02d' % (year, month)

    data_dir = getDataDir(experiment_dir)
    data_file = getDataPath(data_dir)
    if config_map['use_classification']:
        label_file = getLabelPath(data_dir)
    else:
        label_file = getRlabelPath(data_dir)
    meta_file = getMetaPath(data_dir)

    model_dir = getModelDir(experiment_dir)
    util.maybeMakeDir(model_dir)
    imputer_dir = getImputerDir(experiment_dir)
    util.maybeMakeDir(imputer_dir)

    stats_file = getStatsPath(experiment_dir, config_map)

    # Weight flags are only passed when weighting is enabled.
    weight_args = ''
    if config_map['use_weight']:
        weight_args = '--weight_file=%s --tmp_weight_file=%s' % (
            getWeightPath(data_dir), TMP_WEIGHT_FILE)

    with open(stats_file, 'w') as fp:
        # Keep in sync with evaluateModel(): the column names are the keys
        # looked up in its result. The percentile columns are generated from
        # EVAL_PERCS, exactly like the data rows below, so header and rows
        # cannot drift apart.
        header = ['date', 'f1', 'auc']
        for perc in EVAL_PERCS:
            header.append('%dperc-precision' % perc)
            header.append('%dperc-recall' % perc)
        # fp.write() emits the same bytes as the Python 2 `print >> fp`
        # statement and, unlike it, also works on Python 3.
        fp.write('\t'.join(header) + '\n')

        for date in dates:
            model_file = getModelPath(model_dir, date, config_map)
            imputer_file = getImputerPath(imputer_dir, date, config_map)
            cmd = (
                '%s/train_model.py --data_file=%s --label_file=%s '
                '--meta_file=%s %s '
                '--yyyymm=%s --months=%d --model_def="%s" --perc=%f '
                '--model_file=%s '
                '--train_meta_file=%s --tmp_data_file=%s '
                '--tmp_label_file=%s '
                '--imputer_strategy=%s --imputer_file=%s' % (
                    CODE_DIR, data_file, label_file, meta_file, weight_args,
                    date, config_map['train_window'],
                    config_map['model_spec'], config_map['train_perc'],
                    model_file, train_meta_file, TMP_DATA_FILE,
                    TMP_LABEL_FILE, config_map['imputer_strategy'],
                    imputer_file))
            util.run(cmd)

            # No model file after the run: nothing to evaluate this month.
            if not os.path.isfile(model_file):
                continue
            # Stats rows are only produced for classification experiments.
            if not config_map['use_classification']:
                continue

            result = evaluateModel(model_file, imputer_file,
                                   TMP_DATA_FILE, TMP_LABEL_FILE)
            # Keep in sync with evaluateModel().
            values = [date, '%.4f' % result['f1'], '%.4f' % result['auc']]
            for perc in EVAL_PERCS:
                values.append('%.4f' % result['%dperc-precision' % perc])
                values.append('%.4f' % result['%dperc-recall' % perc])
            fp.write('\t'.join(values) + '\n')
            # Flush per row so partial results are on disk while later
            # months are still being trained.
            fp.flush()