Ejemplo n.º 1
0
def _report_metrics(model, train_x, train_y, test_x, test_y):
    """Predict on the test split with *model* and print its quality metrics.

    Prints R-squared for the train and test splits, followed by the MSE and
    RMSE of the test predictions.

    Returns:
        Whatever ``model.predict(test_x)`` returned.
    """
    predictions = model.predict(test_x)
    print("R-squared for Train: %.2f" % model.score(train_x, train_y))
    print("R-squared for Test: %.2f" % model.score(test_x, test_y))
    # NOTE(review): both operands are multiplied by 2, which doubles the
    # reported RMSE (and quadruples the MSE) relative to the plain residuals.
    # Kept exactly as the original computed it; confirm whether `** 2` or no
    # scaling at all was intended.
    rmse = np.sqrt(np.mean((test_y * 2 - predictions * 2)**2))
    mse = rmse**2
    print("MSE ={}".format(mse))
    print("RMSE = {}".format(rmse))
    return predictions


def predict(data):
    """
    Preprocess raw data, split it, and print evaluation metrics of a model.

    The model is taken from the first source that works, in this order:

    1. a module-level ``clf`` object, if one exists;
    2. the model persisted in ``model.pkl``;
    3. a model freshly fitted by ``train_and_persist``.

    A source that raises is reported on stdout and the next one is tried.

    Args:

    data: raw input forwarded unchanged to ``whole_preprocessing``

    Returns:

    str: always the literal ``"predictions-available"``, even when every
    model source failed. The predictions themselves are not returned.

    """

    hour_d = whole_preprocessing(data)

    hour_d_train_x, hour_d_train_y, hour_d_test_x, hour_d_test_y = test_train_split(
        hour_d)

    # Look the in-memory model up through globals(): referring to a bare `clf`
    # here while also assigning it in this function would make the name local
    # and raise UnboundLocalError before any module-level model could be used.
    model_sources = (
        ("in-memory model", lambda: globals().get("clf")),
        ("model.pkl", lambda: load("model.pkl")),
        ("freshly trained model",
         lambda: train_and_persist(hour_d_train_x, hour_d_train_y,
                                   hour_d_test_x, hour_d_test_y)),
    )

    for label, get_model in model_sources:
        try:
            model = get_model()
            if model is None:
                # No module-level model has been set; move on silently.
                continue
            _report_metrics(model, hour_d_train_x, hour_d_train_y,
                            hour_d_test_x, hour_d_test_y)
        except Exception as exc:
            # Best-effort fallback chain: report the failure instead of
            # swallowing it, then try the next source.
            print("Could not predict with {}: {!r}".format(label, exc))
        else:
            break

    return "predictions-available"
Ejemplo n.º 2
0
def test_train_split(data=None):
    """Split the preprocessed dataset into train/test features and targets.

    The first 15211 rows form the train split and the rows from 15211 up to
    (not including) 17379 form the test split. The columns ``dteday``,
    ``casual``, ``atemp`` and ``registered`` are dropped from both splits, and
    ``cnt`` is separated out as the target.

    Args:
        data: DataFrame to split. When omitted, the dataset is built on demand
            with ``whole_preprocessing(loading_data())`` instead of at import
            time.

    Returns:
        tuple: ``(train_x, train_y, test_x, test_y)``.
    """
    if data is None:
        data = whole_preprocessing(loading_data())

    train_end = 15211
    test_end = 17379
    unused_columns = ["dteday", "casual", "atemp", "registered"]

    # iloc slices are half-open, so the test split must start exactly where
    # the train split ends; starting at train_end + 1 would drop one row.
    hour_d_train = data.iloc[:train_end].drop(columns=unused_columns)
    hour_d_test = data.iloc[train_end:test_end].drop(columns=unused_columns)

    # separate the independent and target variable on train data
    hour_d_train_x = hour_d_train.drop(columns=["cnt"])
    hour_d_train_y = hour_d_train["cnt"]

    # separate the independent and target variable on test data
    hour_d_test_x = hour_d_test.drop(columns=["cnt"])
    hour_d_test_y = hour_d_test["cnt"]
    return hour_d_train_x, hour_d_train_y, hour_d_test_x, hour_d_test_y
Ejemplo n.º 3
0
def test_predict_data_cols_len():
    """The preprocessed dataset must have exactly 85 columns."""
    preprocessed = whole_preprocessing(loading_data())
    column_count = len(preprocessed.columns)
    assert column_count == 85
Ejemplo n.º 4
0
def test_predict_data_rows_len():
    """The preprocessed dataset must have exactly 17379 rows."""
    preprocessed = whole_preprocessing(loading_data())
    row_count = len(preprocessed)
    assert row_count == 17379
Ejemplo n.º 5
0
def test_test_train_split(data=None):
    """``test_train_split`` must return a tuple.

    Args:
        data: dataset to split; built with
            ``whole_preprocessing(loading_data())`` when omitted, so the
            pipeline runs when the test executes rather than at import time.
    """
    if data is None:
        data = whole_preprocessing(loading_data())
    output = test_train_split(data)
    assert isinstance(output, tuple)
Ejemplo n.º 6
0
def test_test_train_split_train_not_cnt(data=None):
    """The target column ``cnt`` must not appear in either feature frame.

    Args:
        data: dataset to split; built with
            ``whole_preprocessing(loading_data())`` when omitted, so the
            pipeline runs when the test executes rather than at import time.
    """
    if data is None:
        data = whole_preprocessing(loading_data())
    split = test_train_split(data)
    expected_output = "cnt"
    # Index 0 holds the train features and index 2 the test features; the
    # original only inspected index 2 despite the test's name.
    assert expected_output not in split[0].columns
    assert expected_output not in split[2].columns
Ejemplo n.º 7
0
def test_test_train_split_target_type(data=None):
    """The train target returned by ``test_train_split`` must be a Series.

    Args:
        data: dataset to split; built with
            ``whole_preprocessing(loading_data())`` when omitted, so the
            pipeline runs when the test executes rather than at import time.
    """
    if data is None:
        data = whole_preprocessing(loading_data())
    cnt = test_train_split(data)[1]
    assert isinstance(cnt, pd.Series)
Ejemplo n.º 8
0
def test_test_train_split_column_length(data=None):
    """The train feature frame must have exactly 80 columns.

    Args:
        data: dataset to split; built with
            ``whole_preprocessing(loading_data())`` when omitted, so the
            pipeline runs when the test executes rather than at import time.
    """
    if data is None:
        data = whole_preprocessing(loading_data())
    train_x = test_train_split(data)[0]
    output = len(train_x.columns)
    expected_output = 80
    assert output == expected_output
Ejemplo n.º 9
0
def test_test_train_split_length(data=None):
    """The train feature frame must have exactly 15211 rows.

    Args:
        data: dataset to split; built with
            ``whole_preprocessing(loading_data())`` when omitted, so the
            pipeline runs when the test executes rather than at import time.
    """
    if data is None:
        data = whole_preprocessing(loading_data())
    output = len(test_train_split(data)[0])
    expected_output = 15211
    assert output == expected_output
Ejemplo n.º 10
0
def test_test_train_split_length_tuple(data=None):
    """``test_train_split`` must return exactly four items.

    Args:
        data: dataset to split; built with
            ``whole_preprocessing(loading_data())`` when omitted, so the
            pipeline runs when the test executes rather than at import time.
    """
    if data is None:
        data = whole_preprocessing(loading_data())
    output = len(test_train_split(data))
    expected_output = 4
    assert output == expected_output