Example #1
File: PlotHelper.py Project: jy247/Thesis
def plot_quartiles(data, dates_test, title, true_values_test = '', all_predictions = '', experts =''):

    labels = ['min', 'lower_quartile', 'median', 'upper_quartile', 'max']  # quartile band labels (not used below)
    quartiles = dh.get_quartiles(data)

    fig = plt.figure(98)

    # default fill colour (the empty-string defaults act as "not supplied" sentinels)
    sparecolour = 'blue'
    if not isinstance(true_values_test, str):
        plt.plot(dates_test, true_values_test, 'g', label='actual')
    if not isinstance(all_predictions, str):
        plt.plot(dates_test, all_predictions, 'darkorange', label='predictions')
        sparecolour = 'blue'
    if not isinstance(experts, str):
        plt.plot(dates_test, experts, 'b', label='experts')
        sparecolour = 'darkorange'

    plt.fill_between(dates_test, quartiles[1], quartiles[2], color=sparecolour, alpha=0.4)
    plt.fill_between(dates_test, quartiles[2], quartiles[3], color=sparecolour, alpha=0.4)
    plt.fill_between(dates_test, quartiles[0], quartiles[1], color=sparecolour, alpha=0.2)
    plt.fill_between(dates_test, quartiles[3], quartiles[4], color=sparecolour, alpha=0.2)

    stds = np.std(data, axis=1)
    #print(stds)
    print('mean std: ' + str(round(np.mean(stds),4)))

    plt.ylabel("growth %")
    plt.xlabel('Date')
    plt.legend()
    plt.title(title)

    plt.show()
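A minimal call sketch for plot_quartiles, assuming the orientation implied by np.std(data, axis=1) (one row per date, one column per expert) and that PlotHelper's module-level imports (plt, np, dh) are available; the forecast matrix, dates, and expert count below are fabricated for illustration.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
dates = pd.date_range('2010-03-31', periods=20, freq='Q')    # 20 quarters
expert_forecasts = rng.normal(0.5, 0.3, size=(20, 15))       # 15 hypothetical experts
actual = rng.normal(0.5, 0.2, size=20)

plot_quartiles(expert_forecasts, dates, 'Expert forecast spread',
               true_values_test=actual)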
Example #2
File: PlotHelper.py Project: jy247/Thesis
def plot_percentage_beaten(data, dates_test, true_values_test, all_predictions):

    percentage_beaten = dh.percentage_beaten(true_values_test, all_predictions, data)
    fig = plt.figure(99)
    plt.plot(dates_test, percentage_beaten, 'r', label='percentage beaten')
    average = np.ones(dates_test.shape[0]) * np.mean(percentage_beaten)
    plt.plot(dates_test, average, 'b', label='average beaten')
    plt.ylabel("% experts")
    plt.legend()
    plt.title('Model Prediction Closer to the True Value than % of Experts')
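dh.percentage_beaten is project code not shown here; a plausible reading, offered only to make the plot's y-axis concrete, is the share of experts whose absolute error exceeds the model's at each date. A self-contained numpy sketch of that interpretation (an assumption, not the project's definition):

import numpy as np

def percentage_beaten_sketch(true_values, predictions, expert_matrix):
    # expert_matrix: one row per date, one column per expert (assumed layout)
    model_err = np.abs(true_values - predictions)                     # (n_dates,)
    expert_err = np.abs(expert_matrix - true_values[:, None])         # (n_dates, n_experts)
    return 100.0 * np.mean(expert_err > model_err[:, None], axis=1)   # % of experts beaten per date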
Example #3
def get_validation_score(x_train, y_train, model):
    n_train = x_train.shape[0]
    n_valid = int(n_train / 4)
    n_train = n_train - n_valid

    # hold out the last quarter of the samples as a time-ordered validation block;
    # slicing the training data to n_train - 1 leaves a one-sample gap before it
    x_valid = x_train[n_train:]
    x_train = x_train[0:n_train - 1]
    y_valid = y_train[n_train:]
    y_train = y_train[0:n_train - 1]

    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_valid = scaler.transform(x_valid)

    model.fit(x_train, y_train)
    predictions = model.predict(x_valid)
    if VALID_SCORE_MODEL == 'corr':
        score = np.corrcoef(y_valid, predictions)[0, 1]
    elif VALID_SCORE_MODEL == 'r2':
        score = metrics.r2_score(y_valid, predictions)
        if score > 0.5:
            ph.plot_forecast_performance(1, predictions, y_valid, y_valid,
                                         range(n_valid), 'test')
            ph.show_plots()
    elif VALID_SCORE_MODEL == 'mse':
        score = -metrics.mean_squared_error(y_valid, predictions)
    elif VALID_SCORE_MODEL == 'mae':
        score = -metrics.mean_absolute_error(y_valid, predictions)
    elif VALID_SCORE_MODEL == 'hinge':
        score = -dh.hinge_loss(y_valid, predictions, 0.25)
    elif VALID_SCORE_MODEL == 'joint':
        score = np.corrcoef(y_valid, predictions)[0, 1] - dh.hinge_loss(
            y_valid, predictions, 0.25)
    elif VALID_SCORE_MODEL == 'median':
        score = -np.median(np.abs(y_valid - predictions))
    else:
        raise ValueError('unknown VALID_SCORE_MODEL: ' + str(VALID_SCORE_MODEL))

    return score
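A minimal call sketch, assuming it runs in the same module as get_validation_score (so the VALID_SCORE_MODEL constant the function reads is the one set here) and that the module-level imports used above (numpy as np, sklearn preprocessing and metrics) are in scope; the data is synthetic.

import numpy as np
from sklearn.svm import SVR

rng = np.random.default_rng(0)
x = rng.normal(size=(120, 8))                        # 120 quarters, 8 hypothetical features
y = 0.4 * x[:, 0] + rng.normal(scale=0.1, size=120)

VALID_SCORE_MODEL = 'mse'                            # selects the negative-MSE branch above
print(get_validation_score(x, y, SVR(kernel='rbf', C=100, epsilon=0.1)))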
Example #4
File: Main.py Project: jy247/Thesis
def main():

    import os
    data_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + '/data/'

    #data_file = data_dir + 'input_data_full.csv'
    #data_file = data_dir + 'InputData_small.csv'
    #results_file = data_dir + 'consumer_spending.csv'
    #results_file = data_dir + 'Fake_Results.csv'
    data = dh.get_all_data(LOAD_FROM_FILE, LOAD_DELTAS, data_dir)
    results = data[target_col]

    num_data_items = data.shape[0]
    # one-hot encode the quarter of each row: row i gets a 1 in column i mod 4
    season_info = np.zeros([num_data_items, 4])
    j = 0
    for i in range(num_data_items):
        season_info[i, j] = 1
        j += 1
        if j > 3:
            j = 0

    seasons_df = pd.DataFrame(data=season_info, columns=['SEASON_1', 'SEASON_2', 'SEASON_3', 'SEASON_4'])
    seasons_df.index = data.index

    if USE_SEASONS:
        data = pd.concat([data, seasons_df], axis=1)

    if DO_VALIDATION:
        model = valid.do_validation(data, results, MODEL_TYPE, USE_ENSEMBLE)
        if DO_TEST:
            run_tests(model, data, results)
    elif DO_FUTURE_FORECAST:
        model = mh.get_model_fixed(MODEL_TYPE)
        do_fwd_prediction(model,data,results)
    else:
        model = mh.get_model_fixed(MODEL_TYPE)
        #do_fwd_prediction(model,data,results)
        run_tests(model, data, results)

    end = timer()
    print('finished in: ' + str(end - start))
    ph.show_plots()
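The season loop in main just one-hot encodes each row's quarter in rotation; an equivalent vectorized construction, a sketch that reuses the `data` frame loaded above and assumes (as the loop does) that rows are consecutive quarters starting at SEASON_1:

import numpy as np
import pandas as pd

n = data.shape[0]
seasons_df = pd.DataFrame(
    np.eye(4)[np.arange(n) % 4],          # row i gets a 1 in column i % 4
    columns=['SEASON_1', 'SEASON_2', 'SEASON_3', 'SEASON_4'],
    index=data.index,
)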
Example #5
def DisplayFactors():

    use_deltas = True
    data = dh.get_all_data(LOAD_FROM_FILE, use_deltas)
    titles = data.columns
    data = data[data.index < START_TEST_DATE]
    check_one_forecast_gap(data, 1)
    [corr_1fwd, corr_1fwd_delta] = check_one_forecast_gap(data, 1, use_deltas)
    [corr_4fwd, corr_4fwd_delta] = check_one_forecast_gap(data, 4, use_deltas)

    for i in range(corr_1fwd.shape[0]):
        if use_deltas:
            print(titles[i] + ' & pch & ' + str(round(corr_1fwd[i, 0], 4)) +
                  ' &  ' + str(round(corr_1fwd_delta[i, 0], 4)) + ' &  ' +
                  str(round(corr_4fwd[i, 0], 4)) + ' & ' +
                  str(round(corr_4fwd_delta[i, 0], 4)) + ' \\\\')
        else:
            print(titles[i] + ' & pch & ' + str(round(corr_1fwd[i, 0], 4)) +
                  ' &  ' + str(round(corr_4fwd[i, 0], 4)) + '\\\\')
    ph.draw_hist(
        corr_1fwd,
        '1 Period Forward Correlations Between Input Variables and Target',
        'Correlation Coefficient')
    plt.show()
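Each print in the deltas branch emits one row of a LaTeX correlation table; with made-up variable name and values, a row looks like:

GDP & pch & 0.3124 & 0.2987 & 0.1542 & 0.1498 \\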
Example #6
File: Main.py Project: jy247/Thesis
def do_test(model, data, results):

    base_model = model
    if USE_ENSEMBLE:
        model = EnsembleSVR(MODEL_TYPE)

    for fwd_index in range(1, FORECAST_QUARTERS+1):

        start_train = 1
        data, results = dh.shift_data_one_quarter(data, results)
        num_data_items = data.shape[0]

        if num_data_items != results.shape[0]:
            raise ValueError('Number of items in data does not match number of items in target!')

        if TEST_ON_TRAIN:
            num_test_items = 1
        else:
            num_test_items = data[data.index >= START_TEST_DATE].shape[0]

        end_train = num_data_items - num_test_items - 1
        start_test = end_train + 1

        dates_train = data.index[start_train:end_train]
        dates_test = data.index[start_test:]

        all_predictions = np.zeros([num_test_items])
        all_std = np.zeros([num_test_items])
        true_values_test = results[start_test:].values.ravel()

        for j in range(num_test_items):

            # expanding window: grow the training slice by one observation each step;
            # rolling window: also advance start_train so the window length stays fixed
            if EXPANDING_WINDOW:
                end_train = num_data_items - num_test_items - 1 + j
                start_test = end_train + 1
                end_test = start_test + 1
            elif ROLLING_WINDOW:
                start_train += 1
                end_train = num_data_items - num_test_items - 1 + j
                start_test = end_train + 1
                end_test = start_test + 1

            x_train = data[start_train:end_train]  # data.iloc[1:num_train,2:]
            y_train = results[start_train:end_train].values.ravel()
            x_test = data[start_test:end_test]  # data.iloc[num_train + 1:,2:]

            if USE_ENSEMBLE and not ENSEMBLE_EQUALLY_WEIGHTED and j == 0:
                model.reweight(x_train, y_train)

            # normalise columns
            scaler = preprocessing.StandardScaler().fit(x_train)
            x_train = scaler.transform(x_train)
            x_test = scaler.transform(x_test)

            if TEST_ON_TRAIN:
                x_test = x_train
                dates_test = dates_train
                true_values_test = y_train

            model.fit(x_train, y_train)
            # temp = model.coef_
            # temp2 = model.dual_coef_
            prediction = model.predict(x_test)
            if TEST_ON_TRAIN:
                all_predictions = prediction
            else:
                all_predictions[j] = prediction


        end_date = dates_test[-1]
        experts = dh.get_experts(LOAD_FROM_FILE)
        expert_col = experts['fwd' + str(fwd_index)]
        experts_test = expert_col[(expert_col.index >= START_TEST_DATE) & (expert_col.index <= end_date)]

        if ANALYSE_VARIANCE:
            return all_predictions, true_values_test, dates_test, experts_test
        else:
            process_predictions(all_predictions, true_values_test, dates_test, base_model, fwd_index, experts_test)
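The index bookkeeping inside do_test is easiest to see with small numbers. The sketch below just replays the expanding-window arithmetic with illustrative sizes (the rolling variant additionally advances start_train by one each step); because the slices are half-open, there is a one-observation gap between the end of the training slice and the test point, as in the original indices.

# Illustrative only: 10 observations, 3 test points (not project data).
num_data_items, num_test_items, start_train = 10, 3, 1
for j in range(num_test_items):
    end_train = num_data_items - num_test_items - 1 + j   # expanding window
    start_test, end_test = end_train + 1, end_train + 2
    print('j=%d  train slice [%d:%d)  test slice [%d:%d)'
          % (j, start_train, end_train, start_test, end_test))
# j=0  train slice [1:6)  test slice [7:8)
# j=1  train slice [1:7)  test slice [8:9)
# j=2  train slice [1:8)  test slice [9:10)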
Example #7
File: Main.py Project: jy247/Thesis
def process_predictions(all_predictions, true_values_test, dates_test, model, fwd_index, experts_test):

    end_date = dates_test[-1]
    if USE_ALL_EXPERTS:
        experts = dh.get_all_experts(LOAD_FROM_FILE)
        expert_col = experts[fwd_index - 1]
        experts_test = expert_col[(expert_col.index >= START_TEST_DATE) & (expert_col.index <= end_date)]
        title = str(fwd_index) + ' periods forward, Model=Ensemble SVR, KERNEL=' + model.get_params()['kernel']
        ph.plot_quartiles(experts_test,dates_test,title,true_values_test=true_values_test,all_predictions=all_predictions)
        ph.plot_percentage_beaten(experts_test,dates_test,true_values_test,all_predictions)

    mse = metrics.mean_squared_error(true_values_test, all_predictions)
    expert_mse = metrics.mean_squared_error(true_values_test, experts_test)

    mae = metrics.mean_absolute_error(true_values_test, all_predictions)
    expert_mae = metrics.mean_absolute_error(true_values_test, experts_test)

    hinge_loss = str(round(dh.hinge_loss(true_values_test, all_predictions, 0.5), 4))
    expert_hinge_loss = str(round(dh.hinge_loss(true_values_test, experts_test, 0.5), 4))

    right_direction_score = dh.right_direction_score(true_values_test, all_predictions)
    expert_right_direction_score = dh.right_direction_score(true_values_test, experts_test)

    r_squared = metrics.r2_score(true_values_test, all_predictions)
    expert_r_squared = metrics.r2_score(true_values_test, experts_test)

    # print(np.var(all_predictions))
    # print(np.var(experts_test))

    correlation = np.corrcoef(true_values_test, all_predictions)
    expert_correlation = np.corrcoef(true_values_test, experts_test)

    if EXAMINE_RESIDUALS:
        residuals = true_values_test - all_predictions
        title = str(fwd_index) + ' periods forward residuals, correlation = ' + str(round(np.corrcoef(all_predictions, residuals)[0, 1],4))
        ph.plot_one_scatter(all_predictions, residuals, title, fwd_index + FORECAST_QUARTERS * 2, 'Predicted CS Growth', 'True Value - Prediction')

    if MODEL_TYPE == mh.RBF:
        gamma_string = str(round(model.get_params()['gamma'],5))
    else:
        gamma_string = ' - '

    if MODEL_TYPE == mh.Rand_F or MODEL_TYPE == mh.DT:
        epsilon_string = ' - '
        C_string = ' - '
        model_string = 'Random Forest'
    else:
        epsilon_string = str(round(model.get_params()['epsilon'],4))
        C_string = str(round(model.get_params()['C'],4))
        model_string = model.get_params()['kernel'] + ' SVR'

    title = str(fwd_index) + ' periods forward, Model=' + model_string
    ph.plot_forecast_performance(fwd_index, all_predictions, true_values_test, experts_test, dates_test, title)

    # print(str(i) + ' & Mean Experts & - & - & - & ' + str(round(expert_mse,4)) + ' & ' + str(round(expert_mae,4)) + ' & ' + expert_hinge_loss + ' & '
    #             + str(round(expert_correlation[0,1],4)) + ' & ' + str(round(expert_r_squared,4)))
    #print(str(round(correlation[0,1],4)))

    print(str(fwd_index) + ' & ' + model_string + ' & ' + C_string +
          ' & ' + epsilon_string + ' & ' + gamma_string +
          ' & ' + str(round(mse,4)) + ' & ' + str(round(mae,4)) + ' & ' + hinge_loss + ' & '
                + str(round(correlation[0,1],4)) + ' & ' + str(round(r_squared,4)))
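dh.hinge_loss and dh.right_direction_score are project helpers not reproduced here. One common reading of a hinge-style forecast loss, offered only as a guess at what the 0.5 / 0.25 thresholds mean, is to ignore absolute errors below the threshold and penalize only the excess:

import numpy as np

def hinge_loss_sketch(true_values, predictions, threshold):
    # errors within `threshold` of the truth cost nothing; only the excess counts
    excess = np.abs(np.asarray(true_values) - np.asarray(predictions)) - threshold
    return float(np.mean(np.maximum(excess, 0.0)))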
Example #8
def do_validation(data, results, model_type, use_ensemble):

    best_sum_score = float('-inf')
    best_c = 0
    best_epsilon = 0
    best_gamma = 0
    all_c = []
    all_epsilon = []
    all_score = []
    all_gamma = []
    #residuals_correlation_scorer = make_scorer(residuals_correlation_score)
    for i in range(VALID_QUARTERS):
        data, results = dh.shift_data_one_quarter(data, results)

    for precision_loop in range(2):

        if precision_loop == 1:
            print('Do more precise grid search')
            # more precise
            epsilon_increment = best_epsilon / 10
            low_gamma = best_gamma / 8
            low_epsilon = best_epsilon - (5 * epsilon_increment)
            low_c = best_c / 8

        for c_factor in range(1):
            for epsilon_factor in range(10):
                for gamma_factor in range(get_gamma_range(model_type)):

                    if precision_loop == 0:
                        gamma = 0.0001 * 10**gamma_factor
                        epsilon = 0.1 * epsilon_factor
                        c = 100 * 10**c_factor
                    else:
                        # more precise
                        gamma = low_gamma * 2**gamma_factor
                        epsilon = low_epsilon + epsilon_increment * epsilon_factor
                        c = low_c * 2**c_factor

                    model = mh.get_model(model_type, c, epsilon, gamma)
                    if use_ensemble:
                        #model = AdaBoostRegressor(model)
                        #model = BaggingRegressor(model, max_features=10, n_estimators=20, max_samples=80)
                        model = EnsembleSVR(model_type)

                    #pipeline = make_pipeline(preprocessing.StandardScaler(), model)
                    #scores = cross_val_score(pipeline, data, results, cv=5, scoring=corr_test)
                    score = get_validation_score(data, results, model)

                    all_c.append(c)
                    all_epsilon.append(epsilon)
                    all_gamma.append(gamma)
                    all_score.append(score)
                    if score > best_sum_score:
                        print(score)
                        best_sum_score = score
                        best_c = c
                        best_epsilon = epsilon
                        best_gamma = gamma

                    # model = mh.get_model_fixed(MODEL_TYPE)
                    # pipeline = make_pipeline(preprocessing.StandardScaler(), model)
                    #score = get_validation_score(pipeline, data, results, cv=5, scoring=X_VALID_SCORE_MODEL)
                    #print(score)

            print('iter: ' + str(c_factor))
        #ph.plot_surf(all_c, all_epsilon, all_score, 1)
        if EXPORT_VALID and precision_loop == 0:
            #try:
            #   df = pd.read_csv('xvalid_analysis3.csv')
            #except:
            df = pd.DataFrame()
            df['c'] = all_c
            df['epsilon'] = all_epsilon
            df['gamma'] = all_gamma
            df['score' + VALID_SCORE_MODEL + ' ' +
               str(VALID_QUARTERS)] = all_score
            df.to_csv('xvalid_analysis.csv', index=False)

    if use_ensemble:
        kernel = 'RBF'  #model.models[0].get_params()['kernel']
    else:
        kernel = model.get_params()['kernel']

    print(kernel + ' & ' + VALID_SCORE_MODEL + ' & ' + str(round(best_c, 3)) +
          ' & ' + str(round(best_epsilon, 3)) + ' & ' +
          str(round(best_gamma, 5)) + ' & ' + str(round(best_sum_score, 3)))

    return mh.get_model(model_type, best_c, best_epsilon, best_gamma)
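For reference, the coarse pass (precision_loop == 0) visits the grid below before the second pass rescans a finer grid around the best point found; this just reproduces the loop arithmetic, with the number of gamma steps returned by get_gamma_range assumed to be 6 for illustration.

cs = [100 * 10**k for k in range(1)]               # -> [100]
epsilons = [round(0.1 * k, 1) for k in range(10)]  # 0.0, 0.1, ..., 0.9
gammas = [0.0001 * 10**k for k in range(6)]        # 1e-4, 1e-3, ..., 10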