def train_and_test(training_start_date, training_end_date, testing_start_date, testing_end_date, folder_path):
    best_params = model_training(folder_path, training_start_date, training_end_date)

    # get testing Y
    tot_test_Y = loadY(best_params['chg_pct'], best_params['chg_threshold'], tot_start_date, tot_end_date)
    tot_test_Y.loc[:, 'Y'] = tot_test_Y['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    tot_test_Y = tot_test_Y.loc[~tot_test_Y['Y'].isnull()]  # drop nan
    test_Y = tot_test_Y.loc[(tot_test_Y['date'] >= testing_start_date) &
                            (tot_test_Y['date'] <= testing_end_date)]  # trim Y for test dates

    prediction, prsc = model_testing(test_Y, folder_path, testing_start_date, testing_end_date,
                                     best_params['chain_len'])
    return prediction, prsc
def train_and_test(tot_test_Y, training_start_date, training_end_date, testing_start_date, testing_end_date,
                   chg_pct, chg_threshold, chain_len, folder_path):
    # load training Y
    train_Y = loadY(chg_pct, chg_threshold, training_start_date, training_end_date)
    train_Y.loc[:, 'Y'] = train_Y['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    train_Y = train_Y.loc[~train_Y['Y'].isnull()]  # drop nan

    model_training(train_Y, folder_path, training_start_date, training_end_date, chain_len)

    # get testing Y
    test_Y = tot_test_Y.loc[(tot_test_Y['date'] >= testing_start_date) &
                            (tot_test_Y['date'] <= testing_end_date)]

    prediction, prsc = model_testing(test_Y, folder_path, testing_start_date, testing_end_date, chain_len)
    return prediction, prsc
def train_and_test(training_start_date, training_end_date, testing_start_date, testing_end_date, folder_path,
                   year, season):
    best_params = model_training(folder_path, training_start_date, training_end_date, year, season)

    # get testing Y
    tot_test_Y = loadY(best_params['chg_pct'], best_params['chg_threshold'], tot_start_date, tot_end_date)
    tot_test_Y.loc[:, 'Y'] = tot_test_Y['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    tot_test_Y = tot_test_Y.loc[~tot_test_Y['Y'].isnull()]  # drop nan
    tot_test_Y.loc[:, 'Y'] = tot_test_Y['Y'].replace({-1: 0})  # relabel -1 as 0 for the binary objective
    test_Y = tot_test_Y.loc[(tot_test_Y['date'] >= testing_start_date) &
                            (tot_test_Y['date'] <= testing_end_date)]  # trim Y for test dates

    prediction, prsc = model_testing(test_Y, folder_path, testing_start_date, testing_end_date,
                                     best_params['boost_round_num'], year, season)
    return prediction, prsc
def train_and_test(tot_Y, training_start_date, training_end_date, testing_start_date, testing_end_date, folder_path):
    # load training Y
    chg_pct = 0.2
    chg_threshold = 0.15
    Y_train = loadY(chg_pct, chg_threshold, training_start_date, training_end_date)
    Y_train.loc[:, 'Y'] = Y_train['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    Y_train = Y_train.loc[~Y_train['Y'].isnull()]  # drop nan

    best_params = model_training(Y_train, folder_path, training_start_date, training_end_date)

    test_Y = tot_Y.loc[(tot_Y['date'] >= testing_start_date) &
                       (tot_Y['date'] <= testing_end_date)]  # trim Y for test dates

    prediction, prsc = model_testing(test_Y, folder_path, testing_start_date, testing_end_date,
                                     best_params['chain_len'])
    return prediction, prsc
    prediction, prsc = model_testing(test_Y, folder_path, testing_start_date, testing_end_date, chain_len)
    return prediction, prsc


if __name__ == '__main__':
    tot_start_date = '2007-01-01'
    tot_end_date = '2018-08-31'
    folder_path = 'D:/FeatureAlgorithm/Timing/'

    # load testing Y
    chg_pct = 0.2
    chg_threshold = 0.15
    tot_test_Y = loadY(chg_pct, chg_threshold, tot_start_date, tot_end_date)
    tot_test_Y.loc[:, 'Y'] = tot_test_Y['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    tot_test_Y = tot_test_Y.loc[~tot_test_Y['Y'].isnull()]

    # loop over seasons
    training_duration = 3
    Years = list(range(2013 - training_duration, 2018 + 1 - training_duration))
    Seasons = list(range(1, 5))
    season_start_dates = {1: '-01-01', 2: '-04-01', 3: '-07-01', 4: '-10-01'}
    season_end_dates = {1: '-03-31', 2: '-06-30', 3: '-09-30', 4: '-12-31'}
    chain_len = 3
    total_prediction = pd.DataFrame([])
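    # -----------------------------------------------------------------------
    # The season-by-season loop that consumes the settings above is not part of
    # this listing. The sketch below is an assumed illustration of how the
    # pieces could be wired together, using the train_and_test variant that
    # takes (tot_test_Y, ..., chg_pct, chg_threshold, chain_len, folder_path);
    # the exact rolling-window date arithmetic is a guess, not the original logic.
    for year in Years:
        for season in Seasons:
            training_start_date = str(year) + season_start_dates[season]
            training_end_date = str(year + training_duration - 1) + season_end_dates[season]
            testing_start_date = str(year + training_duration) + season_start_dates[season]
            testing_end_date = str(year + training_duration) + season_end_dates[season]

            prediction, prsc = train_and_test(tot_test_Y, training_start_date, training_end_date,
                                              testing_start_date, testing_end_date,
                                              chg_pct, chg_threshold, chain_len, folder_path)
            total_prediction = pd.concat([total_prediction, prediction])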
def objective(params):
    global X_train, sub_train_data, sub_val_data, sub_val_x, sub_val_y

    chg_pct = params['chg_pct']
    chg_threshold = params['chg_threshold']
    training_start_date = params['training_start_date']
    training_end_date = params['training_end_date']

    Y_train = loadY(chg_pct, chg_threshold, training_start_date, training_end_date)
    if Y_train is None:  # failed to load Y
        return {'loss': 9999, 'status': STATUS_FAIL,
                'learning_rate': np.nan, 'max_depth': np.nan,
                'bagging_fraction': np.nan, 'feature_fraction': np.nan}

    Y_train.loc[:, 'Y'] = Y_train['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    Y_train = Y_train.loc[~Y_train['Y'].isnull()]  # drop nan
    Y_train.loc[:, 'Y'] = Y_train['Y'].replace({-1: 0})

    tmp_columns = X_train.columns.tolist()
    tmp_columns.remove('date')

    all_data = X_train.merge(Y_train, on='date', how='inner')
    sub_whole_x = all_data[tmp_columns]
    sub_whole_y = all_data['Y']
    del all_data
    gc.collect()

    # sub optimization for params of light-gbm
    sub_train_x, sub_val_x, sub_train_y, sub_val_y = train_test_split(sub_whole_x, sub_whole_y,
                                                                      test_size=0.1, random_state=68)
    sub_train_data = lgb.Dataset(sub_train_x, label=sub_train_y)
    sub_val_data = lgb.Dataset(sub_val_x, label=sub_val_y, reference=sub_train_data)

    sub_params_space = {
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.15),
        'max_depth': hp.randint('max_depth', 10),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.1, 0.9),
        'feature_fraction': hp.uniform('feature_fraction', 0.1, 0.9),
    }
    tmp_sub_trails = Trials()
    best_sub_params = fmin(subObjective, space=sub_params_space, algo=tpe.suggest, max_evals=100,
                           trials=tmp_sub_trails)

    # get boost round
    tmp_idx = np.argmin(np.array(tmp_sub_trails.losses()))
    best_boost_round_num = tmp_sub_trails.results[tmp_idx]['iteration']
    print('best sub cv score:', tmp_sub_trails.results[tmp_idx]['loss'])
    print('best num of round:', best_boost_round_num)

    params = {
        'task': 'train',
        'num_threads': 45,
        'objective': 'binary',
        'boosting': 'dart',
        'verbosity': -1,
        'tree_learner': 'data',
        'seed': 66,
        'min_data_in_leaf': 200,
        'metric': 'auc',
        'learning_rate': best_sub_params['learning_rate'],
        'feature_fraction': best_sub_params['feature_fraction'],
        'max_depth': best_sub_params['max_depth'] + 6,
        'bagging_fraction': best_sub_params['bagging_fraction'],
        'num_leaves': np.math.floor(2 ** (best_sub_params['max_depth'] + 6) * 0.7),
    }
    whole_data = lgb.Dataset(sub_whole_x, label=sub_whole_y)
    clf = lgb.train(params, whole_data, num_boost_round=best_boost_round_num, verbose_eval=1000)

    # calculate predict P&L
    y_pred = clf.predict(sub_whole_x, num_iteration=best_boost_round_num)
    Y_train.loc[:, 'predict'] = y_pred
    Y_train.loc[:, 'predict'] = Y_train['predict'].shift(1)  # predict tomorrow
    tmp_buy = Y_train.loc[Y_train['predict'] > 0.5]
    tmp_buy.loc[:, 'cum_ret'] = tmp_buy['ret'].cumprod()
    final_pnl = tmp_buy['cum_ret'].iloc[-1]

    obj_result = {'loss': -final_pnl, 'status': STATUS_OK, 'best_boost_round_num': best_boost_round_num}
    obj_result.update(best_sub_params)  # subObjective params
    return obj_result
def model_training(output_path, training_start_date, training_end_date, year, season):
    global X_train

    X_train = loadX(training_start_date, training_end_date)

    # drop features with too many missing values
    max_nan_rate = 0.7
    nan_rate = X_train.isnull().sum(axis=0) / X_train.shape[0]
    cols_to_drop = nan_rate[nan_rate > max_nan_rate].index.tolist()
    if len(cols_to_drop) > 0:
        print('drop nan columns:', cols_to_drop)
        X_train = X_train.drop(cols_to_drop, axis=1)

    # ==== hyperopt, outer optimization for determining Y
    params = {
        'chg_pct': hp.uniform('chg_pct', 0.05, 0.3),
        'chg_threshold': hp.uniform('chg_threshold', 0.05, 0.3),
        'training_start_date': training_start_date,
        'training_end_date': training_end_date
    }
    tmp_trial = Trials()
    best_params = fmin(objective, space=params, algo=tpe.suggest, max_evals=50, trials=tmp_trial)

    # get sub-params
    tmp_idx = np.argmin(np.array(tmp_trial.losses()))
    best_params['learning_rate'] = tmp_trial.results[tmp_idx]['learning_rate']
    best_params['feature_fraction'] = tmp_trial.results[tmp_idx]['feature_fraction']
    best_params['max_depth'] = tmp_trial.results[tmp_idx]['max_depth']
    best_params['bagging_fraction'] = tmp_trial.results[tmp_idx]['bagging_fraction']
    best_params['boost_round_num'] = tmp_trial.results[tmp_idx]['best_boost_round_num']
    print('best cv score:', tmp_trial.results[tmp_idx]['loss'])
    print('best params:', best_params)

    # ==== train with the best params (final)
    Y_train = loadY(best_params['chg_pct'], best_params['chg_threshold'], training_start_date, training_end_date)
    Y_train.loc[:, 'Y'] = Y_train['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    Y_train = Y_train.loc[~Y_train['Y'].isnull()]  # drop nan
    Y_train.loc[:, 'Y'] = Y_train['Y'].replace({-1: 0})

    tmp_columns = X_train.columns.tolist()
    tmp_columns.remove('date')
    with open('%sx_name_list_%d_%s.pkl' % (output_path, year, season), 'wb') as tmp_fo:
        pickle.dump(tmp_columns, tmp_fo)  # record columns used in training

    all_data = X_train.merge(Y_train, on='date', how='inner')
    X_train = all_data[tmp_columns]
    Y_train = all_data['Y']

    params = {
        'task': 'train',
        'num_threads': 45,
        'objective': 'binary',
        'boosting': 'dart',
        'verbosity': -1,
        'tree_learner': 'data',
        'seed': 66,
        'min_data_in_leaf': 200,
        'metric': 'auc',
        'learning_rate': best_params['learning_rate'],
        'feature_fraction': best_params['feature_fraction'],
        'max_depth': best_params['max_depth'] + 6,
        'bagging_fraction': best_params['bagging_fraction'],
        'num_leaves': np.math.floor(2 ** (best_params['max_depth'] + 6) * 0.7),
    }
    final_whole_data = lgb.Dataset(X_train, label=Y_train)
    clf = lgb.train(params, final_whole_data, num_boost_round=best_params['boost_round_num'], verbose_eval=1000)

    joblib.dump(clf, '%smodel_%s_%s.m' % (output_path, year, season))

    # feature importance
    importance = pd.DataFrame({'feature': clf.feature_name(),
                               'importance': clf.feature_importance('gain')})
    importance.to_csv('%sfeature_importance_%s_%s.csv' % (output_path, year, season), index=False)

    return best_params
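# Hypothetical illustration only: model_testing for this LightGBM variant is not
# included in the listing. The helper below (predict_from_saved_model is an assumed
# name) sketches one plausible way the artifacts persisted by model_training above
# (the pickled column list and the joblib-dumped booster) could be reloaded and
# applied to a test window; the column-alignment step is also an assumption.
def predict_from_saved_model(output_path, year, season, start_date, end_date):
    with open('%sx_name_list_%d_%s.pkl' % (output_path, year, season), 'rb') as tmp_fi:
        x_columns = pickle.load(tmp_fi)  # columns used in training
    clf = joblib.load('%smodel_%s_%s.m' % (output_path, year, season))

    X_test = loadX(start_date, end_date)
    X_test = X_test.reindex(columns=['date'] + x_columns)  # align to the training columns
    proba = clf.predict(X_test[x_columns])  # predicted probability of tomorrow being labelled 1
    return pd.DataFrame({'date': X_test['date'].values, 'predict': proba})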
def objective(params):
    global X_train

    chg_pct = params['chg_pct']
    chg_threshold = params['chg_threshold']
    chain_len = 2 + params['chain_len']  # 2 ~ 10
    training_start_date = params['training_start_date']
    training_end_date = params['training_end_date']

    Y_train = loadY(chg_pct, chg_threshold, training_start_date, training_end_date)
    if Y_train is None:  # failed to load Y
        return {'loss': 9999, 'status': STATUS_FAIL, 'c1': np.nan, 'c2': np.nan}

    Y_train.loc[:, 'Y'] = Y_train['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    Y_train = Y_train.loc[~Y_train['Y'].isnull()]  # drop nan

    tmp_columns = X_train.columns.tolist()
    tmp_columns.remove('date')

    all_data = X_train.merge(Y_train, on='date', how='inner')
    chain_X_train = all_data[tmp_columns]
    chain_Y_train = all_data['Y']
    chain_X_train = Xpoint2Set(chain_X_train, chain_len)
    chain_Y_train = Ypoint2Set(chain_Y_train, chain_len)

    # search CRF regularization parameters by cross validation
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs', max_iterations=100, all_possible_transitions=True)
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }
    labels = ['-1.0', '1.0']
    val_scorer = make_scorer(metrics.flat_precision_score, average='micro', labels=labels)
    rs_cv = RandomizedSearchCV(crf, params_space, cv=3, verbose=0, n_jobs=3, n_iter=50,
                               scoring=val_scorer)  # searching
    rs_cv.fit(chain_X_train, chain_Y_train)

    # calculate predict P&L
    tmp_crf = rs_cv.best_estimator_
    y_pred = tmp_crf.predict(chain_X_train)
    single_y_pred = y_pred[0][:-1]
    single_y_pred.extend([x[-1] for x in y_pred])  # rebuild one prediction per day from the chains
    Y_train.loc[:, 'predict'] = single_y_pred
    Y_train.loc[:, 'predict'] = Y_train['predict'].astype('float')
    Y_train.loc[:, 'predict'] = Y_train['predict'].shift(1)  # predict tomorrow
    tmp_buy = Y_train.loc[Y_train['predict'] == 1]
    tmp_buy.loc[:, 'cum_ret'] = tmp_buy['ret'].cumprod()
    final_pnl = tmp_buy['cum_ret'].iloc[-1]

    best_sub_params = rs_cv.best_params_
    obj_result = {'loss': -final_pnl, 'status': STATUS_OK}
    obj_result.update(best_sub_params)  # c1, c2
    return obj_result
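# The helpers Xpoint2Set / Ypoint2Set used above are not included in this listing.
# The stand-ins below are an assumed minimal implementation, inferred from how the
# predictions are unrolled above (first chain minus its last element, then the last
# element of every chain): each chain is taken to be a sliding window of chain_len
# consecutive days, with per-day features passed to sklearn_crfsuite as string-keyed
# dicts and labels as strings such as '-1.0' / '1.0'. The real helpers may differ.
def Xpoint2Set(X, chain_len):
    rows = [{k: str(v) for k, v in rec.items()} for rec in X.to_dict('records')]
    return [rows[i:i + chain_len] for i in range(len(rows) - chain_len + 1)]


def Ypoint2Set(Y, chain_len):
    labels = [str(float(v)) for v in Y.tolist()]
    return [labels[i:i + chain_len] for i in range(len(labels) - chain_len + 1)]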
def model_training(output_path, training_start_date, training_end_date):
    global X_train

    X_train = loadX(training_start_date, training_end_date)
    X_train = dataFillNA(X_train)  # fill na

    # ==== hyperopt validation: search labelling params and chain length jointly
    params = {
        'chg_pct': hp.uniform('chg_pct', 0, 0.3),
        'chg_threshold': hp.uniform('chg_threshold', 0, 0.3),
        'chain_len': hp.randint('chain_len', 9),
        'training_start_date': training_start_date,
        'training_end_date': training_end_date
    }
    tmp_trial = Trials()
    best_params = fmin(objective, space=params, algo=tpe.suggest, max_evals=100, trials=tmp_trial)

    # get sub-params
    tmp_idx = np.argmin(np.array(tmp_trial.losses()))
    best_params['c1'] = tmp_trial.results[tmp_idx]['c1']
    best_params['c2'] = tmp_trial.results[tmp_idx]['c2']
    best_params['chain_len'] += 2  # adjust chain len: hp.randint gives 0~8, actual chain length is 2~10
    print('best cv score:', tmp_trial.results[tmp_idx]['loss'])
    print('best params:', best_params)

    # ==== train with the best params
    Y_train = loadY(best_params['chg_pct'], best_params['chg_threshold'], training_start_date, training_end_date)
    Y_train.loc[:, 'Y'] = Y_train['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    Y_train = Y_train.loc[~Y_train['Y'].isnull()]  # drop nan

    tmp_columns = X_train.columns.tolist()
    tmp_columns.remove('date')

    all_data = X_train.merge(Y_train, on='date', how='inner')
    X_train = all_data[tmp_columns]
    Y_train = all_data['Y']
    X_train = Xpoint2Set(X_train, best_params['chain_len'])
    Y_train = Ypoint2Set(Y_train, best_params['chain_len'])

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=best_params['c1'], c2=best_params['c2'],
                               max_iterations=100, all_possible_transitions=True)
    crf.fit(X_train, Y_train)

    with open(output_path + 'crf_model.pkl', 'wb') as tmp_fo:  # dump model
        pickle.dump(crf, tmp_fo)

    return best_params