Example #1
def train_and_test(training_start_date, training_end_date, testing_start_date,
                   testing_end_date, folder_path):
    # load training Y
    # chg_pct = 0.2
    # chg_threshold = 0.15
    # train_Y = loadY(chg_pct, chg_threshold, training_start_date, training_end_date)
    # tot_Y = tot_Y.rename(columns={'PeakTrough': 'Y'})
    # train_Y.loc[:, 'Y'] = train_Y['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    # train_Y = train_Y.loc[~train_Y['Y'].isnull()]  # drop nan
    # train_Y = tot_Y.loc[(tot_Y['date'] >= training_start_date) & (tot_Y['date'] <= training_end_date)]
    # test_Y = tot_Y.loc[(tot_Y['date'] >= testing_start_date) & (tot_Y['date'] <= testing_end_date)]

    # model_training(train_Y, folder_path, training_start_date, training_end_date, chain_len)
    best_params = model_training(folder_path, training_start_date,
                                 training_end_date)

    # get testing Y
    tot_test_Y = loadY(best_params['chg_pct'], best_params['chg_threshold'],
                       tot_start_date, tot_end_date)
    tot_test_Y.loc[:, 'Y'] = tot_test_Y['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    tot_test_Y = tot_test_Y.loc[~tot_test_Y['Y'].isnull()]  # drop nan

    test_Y = tot_test_Y.loc[(tot_test_Y['date'] >= testing_start_date) & (
        tot_test_Y['date'] <= testing_end_date)]  # trim Y for test dates
    prediction, prsc = model_testing(test_Y, folder_path, testing_start_date,
                                     testing_end_date,
                                     best_params['chain_len'])

    return prediction, prsc
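
These examples are excerpts and omit their import headers. Below is a reconstruction of what the snippets on this page appear to rely on; loadX, loadY, model_testing, dataFillNA, Xpoint2Set, Ypoint2Set and subObjective are project-local helpers that the listing never shows (hedged sketches of some of them appear further down).

# Reconstructed imports (not part of the original snippets)
import gc
import pickle

import joblib
import numpy as np
import pandas as pd
import scipy.stats
import lightgbm as lgb
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK, STATUS_FAIL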
Example #2
def train_and_test(tot_test_Y, training_start_date, training_end_date,
                   testing_start_date, testing_end_date, chg_pct,
                   chg_threshold, chain_len, folder_path):
    # load training Y
    # chg_pct = 0.2
    # chg_threshold = 0.15
    train_Y = loadY(chg_pct, chg_threshold, training_start_date,
                    training_end_date)
    # tot_Y = tot_Y.rename(columns={'PeakTrough': 'Y'})
    train_Y.loc[:, 'Y'] = train_Y['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    train_Y = train_Y.loc[~train_Y['Y'].isnull()]  # drop nan
    # train_Y = tot_Y.loc[(tot_Y['date'] >= training_start_date) & (tot_Y['date'] <= training_end_date)]
    # test_Y = tot_Y.loc[(tot_Y['date'] >= testing_start_date) & (tot_Y['date'] <= testing_end_date)]

    model_training(train_Y, folder_path, training_start_date,
                   training_end_date, chain_len)

    # get testing Y
    test_Y = tot_test_Y.loc[(tot_test_Y['date'] >= testing_start_date)
                            & (tot_test_Y['date'] <= testing_end_date)]
    prediction, prsc = model_testing(test_Y, folder_path, testing_start_date,
                                     testing_end_date, chain_len)

    return prediction, prsc
Example #3
def train_and_test(training_start_date, training_end_date, testing_start_date, testing_end_date,
                   folder_path, year, season):

    best_params = model_training(folder_path, training_start_date, training_end_date, year, season)

    # get testing Y
    tot_test_Y = loadY(best_params['chg_pct'], best_params['chg_threshold'], tot_start_date, tot_end_date)
    tot_test_Y.loc[:, 'Y'] = tot_test_Y['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    tot_test_Y = tot_test_Y.loc[~tot_test_Y['Y'].isnull()]  # drop nan
    tot_test_Y.loc[:, 'Y'] = tot_test_Y['Y'].replace({-1: 0})  # map labels to {0, 1} for binary training

    test_Y = tot_test_Y.loc[(tot_test_Y['date'] >= testing_start_date) & (tot_test_Y['date'] <= testing_end_date)] # trim Y for test dates
    prediction, prsc = model_testing(test_Y, folder_path, testing_start_date, testing_end_date, best_params['boost_round_num'], year, season)

    return prediction, prsc
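
None of the examples define loadY. Judging from the call sites, it returns a DataFrame with a 'date' column, a gross daily return column 'ret', and a 'PeakTrough' column of +1/-1 swing labels controlled by chg_pct and chg_threshold, or None on failure. A minimal stand-in under those assumptions; the price file and the forward-move labeling rule here are illustrative, not the original logic:

def loadY(chg_pct, chg_threshold, start_date, end_date,
          price_file='index_close.csv'):
    # Hypothetical stand-in for the project's loadY; the real labeling
    # algorithm is not shown. Labels a day +1 when the forward move over a
    # fixed window exceeds chg_pct, else -1 (chg_threshold is ignored here).
    px = pd.read_csv(price_file, parse_dates=['date'])
    px = px.loc[(px['date'] >= start_date) & (px['date'] <= end_date)].copy()
    if px.empty:
        return None  # callers treat None as "failed to load Y"
    px['ret'] = px['close'] / px['close'].shift(1)  # gross daily return
    fwd_move = px['close'].shift(-20) / px['close'] - 1  # 20-day look-ahead
    px['PeakTrough'] = np.where(fwd_move >= chg_pct, 1.0, -1.0)
    return px[['date', 'ret', 'PeakTrough']].reset_index(drop=True)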
Example #4
def train_and_test(tot_Y, training_start_date, training_end_date, testing_start_date, testing_end_date,
                   folder_path):
    # load training Y
    chg_pct = 0.2
    chg_threshold = 0.15
    Y_train = loadY(chg_pct, chg_threshold, training_start_date, training_end_date)
    # tot_Y = tot_Y.rename(columns={'PeakTrough': 'Y'})
    Y_train.loc[:, 'Y'] = Y_train['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    Y_train = Y_train.loc[~Y_train['Y'].isnull()]  # drop nan
    # train_Y = tot_Y.loc[(tot_Y['date'] >= training_start_date) & (tot_Y['date'] <= training_end_date)]
    # test_Y = tot_Y.loc[(tot_Y['date'] >= testing_start_date) & (tot_Y['date'] <= testing_end_date)]

    # model_training(train_Y, folder_path, training_start_date, training_end_date, chain_len)
    # Y_train = tot_Y.loc[(tot_Y['date'] >= training_start_date) & (tot_Y['date'] <= training_end_date)]
    best_params = model_training(Y_train, folder_path, training_start_date, training_end_date)

    test_Y = tot_Y.loc[(tot_Y['date'] >= testing_start_date) & (tot_Y['date'] <= testing_end_date)] # trim Y for test dates
    prediction, prsc = model_testing(test_Y, folder_path, testing_start_date, testing_end_date, best_params['chain_len'])

    return prediction, prsc
Example #5
    prediction, prsc = model_testing(test_Y, folder_path, testing_start_date,
                                     testing_end_date, chain_len)

    return prediction, prsc


if __name__ == '__main__':
    tot_start_date = '2007-01-01'
    tot_end_date = '2018-08-31'

    folder_path = 'D:/FeatureAlgorithm/Timing/'

    # load testing Y
    chg_pct = 0.2
    chg_threshold = 0.15
    tot_test_Y = loadY(chg_pct, chg_threshold, tot_start_date, tot_end_date)
    tot_test_Y.loc[:, 'Y'] = tot_test_Y['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    tot_test_Y = tot_test_Y.loc[~tot_test_Y['Y'].isnull()]

    # loop over seasons
    training_duration = 3
    Years = list(range(2013 - training_duration, 2018 + 1 - training_duration))
    Seasons = list(range(1, 5))

    season_start_dates = {1: '-01-01', 2: '-04-01', 3: '-07-01', 4: '-10-01'}

    season_end_dates = {1: '-03-31', 2: '-06-30', 3: '-09-30', 4: '-12-31'}

    chain_len = 3
    total_prediction = pd.DataFrame([])
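
    # Plausible continuation of this truncated __main__ block (the original
    # loop body is not shown): walk forward season by season, training on
    # the preceding `training_duration` years and testing on one season,
    # using the train_and_test signature from Example #2.
    for tmp_year in Years:
        for tmp_season in Seasons:
            testing_year = tmp_year + training_duration
            training_start_date = str(tmp_year) + season_start_dates[tmp_season]
            testing_start_date = str(testing_year) + season_start_dates[tmp_season]
            testing_end_date = str(testing_year) + season_end_dates[tmp_season]
            # train up to the day before the test season starts
            training_end_date = (pd.Timestamp(testing_start_date)
                                 - pd.Timedelta(days=1)).strftime('%Y-%m-%d')

            prediction, prsc = train_and_test(
                tot_test_Y, training_start_date, training_end_date,
                testing_start_date, testing_end_date, chg_pct,
                chg_threshold, chain_len, folder_path)
            total_prediction = pd.concat([total_prediction, prediction])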
Example #6
def objective(params):
    global X_train, sub_train_data, sub_val_data, sub_val_x, sub_val_y
    # X_train = params['X_train']
    chg_pct = params['chg_pct']
    chg_threshold = params['chg_threshold']
    training_start_date = params['training_start_date']
    training_end_date = params['training_end_date']

    Y_train = loadY(chg_pct, chg_threshold, training_start_date, training_end_date)
    if Y_train is None:  # failed to load Y
        return {'loss': 9999, 'status': STATUS_FAIL, 'learning_rate': np.nan,
                'max_depth': np.nan, 'bagging_fraction': np.nan,
                'feature_fraction': np.nan}

    Y_train.loc[:, 'Y'] = Y_train['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    Y_train = Y_train.loc[~Y_train['Y'].isnull()]  # drop nan
    Y_train.loc[:, 'Y'] = Y_train['Y'].replace({-1:0})

    tmp_columns = X_train.columns.tolist()
    tmp_columns.remove('date')

    all_data = X_train.merge(Y_train, on='date', how='inner')
    sub_whole_x = all_data[tmp_columns]
    sub_whole_y = all_data['Y']
    del all_data
    gc.collect()

    # sub optimization for params of light-gbm
    sub_train_x, sub_val_x, sub_train_y, sub_val_y = train_test_split(sub_whole_x, sub_whole_y, test_size=0.1, random_state=68)
    sub_train_data = lgb.Dataset(sub_train_x, label=sub_train_y)
    sub_val_data = lgb.Dataset(sub_val_x, label=sub_val_y, reference=sub_train_data)

    sub_params_space = {
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.15),
        'max_depth': hp.randint('max_depth', 10),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.1, 0.9),
        'feature_fraction': hp.uniform('feature_fraction', 0.1, 0.9),
    }

    tmp_sub_trials = Trials()
    best_sub_params = fmin(subObjective, space=sub_params_space, algo=tpe.suggest, max_evals=100, trials=tmp_sub_trials)

    # get boost round
    tmp_idx = np.argmin(np.array(tmp_sub_trials.losses()))
    best_boost_round_num = tmp_sub_trials.results[tmp_idx]['iteration']
    print('best sub cv score:', tmp_sub_trials.results[tmp_idx]['loss'])
    print('best num of round:', best_boost_round_num)

    params = {
        'task': 'train',
        'num_threads': 45,
        'objective': 'binary',
        'boosting': 'dart',
        'verbosity': -1,
        'tree_learner': 'data',
        'seed': 66,
        'min_data_in_leaf': 200,
        'metric': 'auc',
        'learning_rate': best_sub_params['learning_rate'],
        'feature_fraction': best_sub_params['feature_fraction'],
        'max_depth': best_sub_params['max_depth'] + 6,
        'bagging_fraction': best_sub_params['bagging_fraction'],
        'num_leaves': int(2 ** (best_sub_params['max_depth'] + 6) * 0.7),
    }

    whole_data = lgb.Dataset(sub_whole_x, label=sub_whole_y)
    clf = lgb.train(params, whole_data, num_boost_round=best_boost_round_num, verbose_eval=1000)

    # calculate predicted P&L on the training window
    y_pred = clf.predict(sub_whole_x, num_iteration=best_boost_round_num)
    Y_train.loc[:, 'predict'] = y_pred
    Y_train.loc[:, 'predict'] = Y_train['predict'].shift(1)  # align yesterday's prediction with today's return
    tmp_buy = Y_train.loc[Y_train['predict'] > 0.5].copy()  # copy to avoid chained-assignment warning
    tmp_buy.loc[:, 'cum_ret'] = tmp_buy['ret'].cumprod()
    final_pnl = tmp_buy['cum_ret'].iloc[-1] if len(tmp_buy) else 0.0  # guard against an empty buy set

    # best_cv_score = rs_cv.best_score_

    obj_result = {'loss':-final_pnl, 'status':STATUS_OK, 'best_boost_round_num': best_boost_round_num}
    obj_result.update(best_sub_params)  # subObjective parmas

    return obj_result
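
subObjective, referenced by the inner fmin call above, is never shown in the listing. A minimal sketch of a compatible implementation, assuming the old-style LightGBM training API used elsewhere in these examples; the early-stopping setup and the AUC-based loss are assumptions, but the returned keys ('loss', 'status', 'iteration') match what the outer code reads back:

def subObjective(sub_params):
    # Hypothetical inner objective (the original is not shown). Trains on
    # the module-level sub_train_data with early stopping on sub_val_data
    # and returns the iteration count the outer code reads as 'iteration'.
    global sub_train_data, sub_val_data

    tmp_params = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'seed': 66,
        'learning_rate': sub_params['learning_rate'],
        'max_depth': sub_params['max_depth'] + 6,  # same offset as the outer code
        'num_leaves': int(2 ** (sub_params['max_depth'] + 6) * 0.7),
        'bagging_fraction': sub_params['bagging_fraction'],
        'feature_fraction': sub_params['feature_fraction'],
    }
    clf = lgb.train(tmp_params, sub_train_data, num_boost_round=2000,
                    valid_sets=[sub_val_data], early_stopping_rounds=50,
                    verbose_eval=False)
    val_auc = clf.best_score['valid_0']['auc']
    return {'loss': 1 - val_auc, 'status': STATUS_OK,
            'iteration': clf.best_iteration}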
Example #7
def model_training(output_path, training_start_date, training_end_date, year, season):
    global X_train
    X_train = loadX(training_start_date, training_end_date)

    max_nan_rate = 0.7
    nan_rate = X_train.isnull().sum(axis=0) / X_train.shape[0]
    cols_to_drop = nan_rate[nan_rate > max_nan_rate].index.tolist()
    if len(cols_to_drop) > 0:
        print('drop nan columns:', cols_to_drop)
        X_train = X_train.drop(cols_to_drop, axis=1)

    # ==== hyperopt, outer optimization for determining Y
    params = {
        'chg_pct': hp.uniform('chg_pct', 0.05, 0.3),
        'chg_threshold': hp.uniform('chg_threshold', 0.05, 0.3),
        'training_start_date': training_start_date,
        'training_end_date': training_end_date
    }

    tmp_trial = Trials()
    best_params = fmin(objective, space=params, algo=tpe.suggest, max_evals=50, trials=tmp_trial)

    # get sub-params
    tmp_idx = np.argmin(np.array(tmp_trial.losses()))
    best_params['learning_rate'] = tmp_trial.results[tmp_idx]['learning_rate']
    best_params['feature_fraction'] = tmp_trial.results[tmp_idx]['feature_fraction']
    best_params['max_depth'] = tmp_trial.results[tmp_idx]['max_depth']
    best_params['bagging_fraction'] = tmp_trial.results[tmp_idx]['bagging_fraction']
    best_params['boost_round_num'] = tmp_trial.results[tmp_idx]['best_boost_round_num']

    print('best cv score:', tmp_trial.results[tmp_idx]['loss'])
    print('best params:', best_params)

    # ==== train with the best params (final)
    Y_train = loadY(best_params['chg_pct'], best_params['chg_threshold'], training_start_date, training_end_date)
    Y_train.loc[:, 'Y'] = Y_train['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    Y_train = Y_train.loc[~Y_train['Y'].isnull()]  # drop nan
    Y_train.loc[:, 'Y'] = Y_train['Y'].replace({-1:0})

    tmp_columns = X_train.columns.tolist()
    tmp_columns.remove('date')

    with open('%sx_name_list_%d_%s.pkl' % (output_path, year, season), 'wb') as tmp_fo:  # record columns used in training
        pickle.dump(tmp_columns, tmp_fo)

    all_data = X_train.merge(Y_train, on='date', how='inner')
    X_train = all_data[tmp_columns]
    Y_train = all_data['Y']

    params = {
        'task': 'train',
        'num_threads': 45,
        'objective': 'binary',
        'boosting': 'dart',
        'verbosity': -1,
        'tree_learner': 'data',
        'seed': 66,
        'min_data_in_leaf': 200,
        'metric': 'auc',
        'learning_rate': best_params['learning_rate'],
        'feature_fraction': best_params['feature_fraction'],
        'max_depth': best_params['max_depth'] + 6,
        'bagging_fraction': best_params['bagging_fraction'],
        'num_leaves': int(2 ** (best_params['max_depth'] + 6) * 0.7),
    }

    final_whole_data = lgb.Dataset(X_train, label=Y_train)
    clf = lgb.train(params, final_whole_data, num_boost_round=best_params['boost_round_num'], verbose_eval=1000)

    joblib.dump(clf, '%smodel_%s_%s.m' % (output_path, year, season))
    importance = pd.DataFrame({'feature': clf.feature_name(), 'importance': clf.feature_importance('gain')})  # feature importance
    importance.to_csv('%sfeature_importance_%s_%s.csv' % (output_path, year, season), index=False)

    return best_params
Example #8
def objective(params):
    global X_train
    # X_train = params['X_train']
    chg_pct = params['chg_pct']
    chg_threshold = params['chg_threshold']
    chain_len = 2 + params['chain_len']  # 2 ~ 10
    training_start_date = params['training_start_date']
    training_end_date = params['training_end_date']

    Y_train = loadY(chg_pct, chg_threshold, training_start_date,
                    training_end_date)
    if Y_train is None:  # failed to load Y
        return {
            'loss': 9999,
            'status': STATUS_FAIL,
            'c1': np.nan,
            'c2': np.nan
        }

    Y_train.loc[:, 'Y'] = Y_train['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    Y_train = Y_train.loc[~Y_train['Y'].isnull()]  # drop nan

    tmp_columns = X_train.columns.tolist()
    tmp_columns.remove('date')

    all_data = X_train.merge(Y_train, on='date', how='inner')
    chain_X_train = all_data[tmp_columns]
    chain_Y_train = all_data['Y']

    chain_X_train = Xpoint2Set(chain_X_train, chain_len)
    chain_Y_train = Ypoint2Set(chain_Y_train, chain_len)

    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        # c1=0.1,
        # c2=0.1,
        max_iterations=100,
        all_possible_transitions=True)

    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    labels = ['-1.0', '1.0']
    val_scorer = make_scorer(metrics.flat_precision_score,
                             average='micro',
                             labels=labels)

    rs_cv = RandomizedSearchCV(crf,
                               params_space,
                               cv=3,
                               verbose=0,
                               n_jobs=3,
                               n_iter=50,
                               scoring=val_scorer)  # searching
    rs_cv.fit(chain_X_train, chain_Y_train)

    # calculate predict P&L
    tmp_crf = rs_cv.best_estimator_
    y_pred = tmp_crf.predict(chain_X_train)
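    # stitch the overlapping chains back into one label per day: the first
    # chain supplies its leading chain_len-1 labels, then every chain
    # contributes its final label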
    single_y_pred = y_pred[0][:-1]
    single_y_pred.extend([x[-1] for x in y_pred])
    Y_train.loc[:, 'predict'] = single_y_pred
    Y_train.loc[:, 'predict'] = Y_train['predict'].astype('float')
    Y_train.loc[:, 'predict'] = Y_train['predict'].shift(1)  # align yesterday's prediction with today's return
    tmp_buy = Y_train.loc[Y_train['predict'] == 1].copy()  # copy to avoid chained-assignment warning
    tmp_buy.loc[:, 'cum_ret'] = tmp_buy['ret'].cumprod()
    final_pnl = tmp_buy['cum_ret'].iloc[-1] if len(tmp_buy) else 0.0  # guard against an empty buy set

    # best_cv_score = rs_cv.best_score_
    best_sub_params = rs_cv.best_params_

    obj_result = {'loss': -final_pnl, 'status': STATUS_OK}
    obj_result.update(best_sub_params)  # c1 c2

    return obj_result
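
Xpoint2Set and Ypoint2Set are also never shown. The stitching logic above (the first chain's leading labels plus the last label of every chain) implies sliding windows of length chain_len with stride 1, and sklearn_crfsuite expects each sequence as a list of per-step feature dicts with string labels. A sketch under those assumptions:

def Xpoint2Set(X_df, chain_len):
    # Hypothetical helper: overlapping chains of length chain_len, each time
    # step encoded as a {feature_name: value} dict for sklearn_crfsuite.
    rows = X_df.to_dict('records')
    return [rows[i:i + chain_len] for i in range(len(rows) - chain_len + 1)]


def Ypoint2Set(Y_series, chain_len):
    # Companion helper: matching chains of string labels ('-1.0' / '1.0'),
    # which is why the code above scores against labels = ['-1.0', '1.0'].
    labels = [str(v) for v in Y_series.tolist()]
    return [labels[i:i + chain_len] for i in range(len(labels) - chain_len + 1)]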
Example #9
def model_training(output_path, training_start_date, training_end_date):
    global X_train
    X_train = loadX(training_start_date, training_end_date)
    X_train = dataFillNA(X_train)  # fill na

    # ==== hyperopt validation
    params = {
        'chg_pct': hp.uniform('chg_pct', 0, 0.3),
        'chg_threshold': hp.uniform('chg_threshold', 0, 0.3),
        'chain_len': hp.randint('chain_len', 9),
        'training_start_date': training_start_date,
        'training_end_date': training_end_date
    }

    # chg_pct = scipy.stats.uniform(scale=0.3)
    # chg_threshold = scipy.stats.uniform(scale=0.3)
    # chain_len = scipy.stats.randint(low=2, high =10)

    # ==== cross validation
    # best_cv_score = 0
    # for i in range(500):
    #     tmp_chg_pct = chg_pct.rvs()
    #     tmp_chg_threshold = chg_threshold.rvs()
    #     tmp_chain_len = chain_len.rvs()
    #     tmp_sub_params, tmp_cv_score = objective(X_train, tmp_chg_pct, tmp_chg_threshold, tmp_chain_len, training_start_date, training_end_date)
    #     if tmp_cv_score > best_cv_score:
    #         best_cv_score = tmp_cv_score
    #         tmp_sub_params['chg_pct'] = tmp_chg_pct
    #         tmp_sub_params['chg_threshold'] = tmp_chg_threshold
    #         tmp_sub_params['chain_len'] = tmp_chain_len
    #         best_params = tmp_sub_params.copy()
    tmp_trial = Trials()
    best_params = fmin(objective,
                       space=params,
                       algo=tpe.suggest,
                       max_evals=100,
                       trials=tmp_trial)

    # get sub-params
    tmp_idx = np.argmin(np.array(tmp_trial.losses()))
    best_params['c1'] = tmp_trial.results[tmp_idx]['c1']
    best_params['c2'] = tmp_trial.results[tmp_idx]['c2']
    best_params['chain_len'] += 2  # adjust chain len

    print('best cv score:', tmp_trial.results[tmp_idx]['loss'])
    print('best params:', best_params)

    # ==== train with the best params
    Y_train = loadY(best_params['chg_pct'], best_params['chg_threshold'],
                    training_start_date, training_end_date)
    Y_train.loc[:, 'Y'] = Y_train['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    Y_train = Y_train.loc[~Y_train['Y'].isnull()]  # drop nan

    tmp_columns = X_train.columns.tolist()
    tmp_columns.remove('date')

    all_data = X_train.merge(Y_train, on='date', how='inner')
    X_train = all_data[tmp_columns]
    Y_train = all_data['Y']

    X_train = Xpoint2Set(X_train, best_params['chain_len'])
    Y_train = Ypoint2Set(Y_train, best_params['chain_len'])

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=best_params['c1'],
                               c2=best_params['c2'],
                               max_iterations=100,
                               all_possible_transitions=True)

    crf.fit(X_train, Y_train)

    with open(output_path + 'crf_model.pkl', 'wb') as tmp_fo:  # dump model
        pickle.dump(crf, tmp_fo)

    return best_params