Example #1
0
def cleaning_reg_ex(data=None):
    """Replace ``[``, ``]`` and ``<`` in column names with underscores.

    The frame is modified in place (its ``columns`` attribute is reassigned)
    and the same object is returned.

    Args:
        data: DataFrame whose column labels should be sanitised. When
            omitted, ``loading_data()`` is called at call time, so importing
            this module does not load data and no frame is shared between
            calls.

    Returns:
        The same DataFrame with sanitised column labels.
    """
    if data is None:
        data = loading_data()

    forbidden = re.compile(r"[\[\]<]")
    # Only string labels can contain the forbidden characters; any other
    # label (e.g. an int) is passed through untouched.
    data.columns = [
        forbidden.sub("_", label) if isinstance(label, str) else label
        for label in data.columns
    ]
    return data
Example #2
0
def test_loading_data_return_numberColumns():
    """loading_data() should yield as many columns as the CSV read directly."""
    loaded = loading_data()
    source_url = "https://ndownloader.figshare.com/files/20976540"
    # Read the reference copy with the same index/date settings as before.
    reference = pd.read_csv(source_url, parse_dates=True, index_col="instant")
    assert len(loaded.columns) == len(reference.columns)
Example #3
0
def test_loading_data_retur_describe():
    """loading_data() should yield the same column dtypes as the CSV read directly."""
    output = loading_data()
    CSV = pd.read_csv(
        "https://ndownloader.figshare.com/files/20976540",
        index_col="instant",
        parse_dates=True,
    )
    # Compare the dtype Series themselves. Wrapping them in print() would
    # compare None with None and make the assertion impossible to fail.
    assert output.dtypes.equals(CSV.dtypes), (
        f"dtype mismatch:\n{output.dtypes}\n!=\n{CSV.dtypes}"
    )
Example #4
0
def test_train_split(data=None):
    """Split the preprocessed data positionally into train/test features and targets.

    The first 15211 rows form the training set; the rows from position 15211
    up to (not including) 17379 form the test set. The columns ``dteday``,
    ``casual``, ``atemp`` and ``registered`` are dropped from both parts, and
    ``cnt`` is separated out as the target.

    NOTE(review): because of its ``test_`` prefix pytest will collect this
    helper as a test when it lives in a test module — confirm that is
    intended.

    Args:
        data: DataFrame containing at least the columns ``dteday``,
            ``casual``, ``atemp``, ``registered`` and ``cnt``. When omitted,
            ``whole_preprocessing(loading_data())`` is computed at call time
            rather than once at import time.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)``.
    """
    if data is None:
        data = whole_preprocessing(loading_data())

    split_at = 15211
    unused_columns = ["dteday", "casual", "atemp", "registered"]

    hour_d_train = data.iloc[:split_at].drop(columns=unused_columns)
    # iloc slices are half-open, so the test part has to start at split_at;
    # starting at split_at + 1 would silently discard one row.
    hour_d_test = data.iloc[split_at:17379].drop(columns=unused_columns)

    # separate the independent and target variable on training data
    hour_d_train_x = hour_d_train.drop(columns=["cnt"])
    hour_d_train_y = hour_d_train["cnt"]

    # separate the independent and target variable on test data
    hour_d_test_x = hour_d_test.drop(columns=["cnt"])
    hour_d_test_y = hour_d_test["cnt"]
    return hour_d_train_x, hour_d_train_y, hour_d_test_x, hour_d_test_y
Example #5
0
def feature_engeneering(data=None):
    """Add hour/season indicator columns and binned temperature/humidity.

    New columns (added in place): ``IsOfficeHour``, ``IsDaytime``,
    ``IsRushHourMorning``, ``IsRushHourEvening``, ``IsHighSeason`` (all 0/1),
    plus ``temp_binned`` and ``hum_binned`` produced by ``pd.cut``.

    Args:
        data: DataFrame with the columns ``hr``, ``weekday``, ``season``,
            ``temp`` and ``hum``. When omitted, ``loading_data()`` is called
            at call time, so no frame is shared (and re-mutated) across calls.

    Returns:
        The same DataFrame with the additional columns.
    """
    if data is None:
        data = loading_data()

    hour = data["hr"]
    # NOTE(review): ``weekday == 1`` selects a single weekday value; if the
    # intent is "any working day" this may need a different column — confirm.
    weekday_is_one = data["weekday"] == 1

    data["IsOfficeHour"] = np.where(
        (hour >= 9) & (hour < 17) & weekday_is_one, 1, 0
    )
    data["IsDaytime"] = np.where((hour >= 6) & (hour < 22), 1, 0)
    data["IsRushHourMorning"] = np.where(
        (hour >= 6) & (hour < 10) & weekday_is_one, 1, 0
    )
    data["IsRushHourEvening"] = np.where(
        (hour >= 15) & (hour < 19) & weekday_is_one, 1, 0
    )
    data["IsHighSeason"] = np.where(data["season"] == 3, 1, 0)

    # pd.cut builds right-closed intervals by default, so a value of exactly
    # 0 falls outside the first bin and becomes NaN.
    # NOTE(review): confirm that this is intended for temp/hum.
    bins = [0, 0.19, 0.49, 0.69, 0.89, 1]
    data["temp_binned"] = pd.cut(data["temp"], bins)
    data["hum_binned"] = pd.cut(data["hum"], bins)
    return data
Example #6
0
def convert_to_category(data=None):
    """Cast the discrete feature columns to pandas ``category`` dtype.

    The listed columns are converted in place; every other column is left
    untouched.

    Args:
        data: DataFrame containing every column named in the list below.
            When omitted, ``loading_data()`` is called at call time instead
            of once at import time.

    Returns:
        The same DataFrame with the listed columns converted.

    Raises:
        KeyError: If one of the listed columns is missing from ``data``.
    """
    if data is None:
        data = loading_data()

    # Named differently from the function so the local does not shadow it.
    categorical_columns = [
        "season",
        "yr",
        "mnth",
        "hr",
        "holiday",
        "weekday",
        "workingday",
        "weathersit",
        "IsOfficeHour",
        "IsDaytime",
        "IsRushHourMorning",
        "IsRushHourEvening",
        "IsHighSeason",
        "temp_binned",
        "hum_binned",
    ]
    for col in categorical_columns:
        data[col] = data[col].astype("category")
    return data
Example #7
0
def test_predict_data_cols_len():
    """The fully preprocessed frame is expected to have 85 columns."""
    processed = whole_preprocessing(loading_data())
    assert len(processed.columns) == 85
Example #8
0
def test_predict_output_len():
    """predict() on the loaded data is expected to return a value of length 21."""
    prediction = predict(loading_data())
    assert len(prediction) == 21
Example #9
0
def test_predict_type():
    """predict() on the loaded data is expected to return exactly a ``str``."""
    prediction = predict(loading_data())
    assert type(prediction) == str
Example #10
0
def test_loading_data_return_DF():
    """loading_data() must return exactly a pandas DataFrame (not a subclass)."""
    loaded_type = type(loading_data())
    assert loaded_type is pd.DataFrame
Example #11
0
def test_predict_data_rows_len():
    """The fully preprocessed frame is expected to have 17379 rows."""
    processed = whole_preprocessing(loading_data())
    assert len(processed) == 17379
Example #12
0
def test_test_train_split_column_length(data=None):
    """The training feature frame is expected to have 80 columns.

    Args:
        data: Preprocessed DataFrame to split. When omitted it is built at
            call time, so a download or preprocessing failure fails this
            test only instead of breaking import/collection of the module.
    """
    if data is None:
        data = whole_preprocessing(loading_data())
    train_features = test_train_split(data)[0]
    assert len(train_features.columns) == 80
Example #13
0
def test_test_train_split_length(data=None):
    """The training feature frame is expected to have 15211 rows.

    Args:
        data: Preprocessed DataFrame to split. When omitted it is built at
            call time, so a download or preprocessing failure fails this
            test only instead of breaking import/collection of the module.
    """
    if data is None:
        data = whole_preprocessing(loading_data())
    train_features = test_train_split(data)[0]
    assert len(train_features) == 15211
Example #14
0
def get_dummies(data=None):
    """Return ``pd.get_dummies(data)``: a dummy/indicator-encoded copy of the frame.

    Args:
        data: DataFrame to encode. When omitted, ``loading_data()`` is
            called at call time instead of once at import time.

    Returns:
        A new DataFrame as produced by ``pandas.get_dummies``.
    """
    if data is None:
        data = loading_data()
    return pd.get_dummies(data)
Example #15
0
def test_test_train_split_target_type(data=None):
    """The training target returned by the split should be a pandas Series.

    Args:
        data: Preprocessed DataFrame to split. When omitted it is built at
            call time, so a download or preprocessing failure fails this
            test only instead of breaking import/collection of the module.
    """
    if data is None:
        data = whole_preprocessing(loading_data())
    train_target = test_train_split(data)[1]
    # Use the public pd.Series name rather than the internal module path.
    assert isinstance(train_target, pd.Series)
Example #16
0
def test_test_train_split_length_tuple(data=None):
    """The split is expected to return four elements.

    Args:
        data: Preprocessed DataFrame to split. When omitted it is built at
            call time, so a download or preprocessing failure fails this
            test only instead of breaking import/collection of the module.
    """
    if data is None:
        data = whole_preprocessing(loading_data())
    assert len(test_train_split(data)) == 4
Example #17
0
def test_test_train_split_train_not_cnt(data=None):
    """The target column ``cnt`` must not appear in the feature frames.

    Args:
        data: Preprocessed DataFrame to split. When omitted it is built at
            call time, so a download or preprocessing failure fails this
            test only instead of breaking import/collection of the module.
    """
    if data is None:
        data = whole_preprocessing(loading_data())
    split = test_train_split(data)
    # Index 0 holds the training features and index 2 the test features;
    # check both so the "train" in this test's name is actually covered.
    for features in (split[0], split[2]):
        assert "cnt" not in features.columns
Example #18
0
def skewness(data=None):
    """Apply ``log1p`` to ``windspeed`` and a square root to ``cnt``.

    Both columns are overwritten in place.

    Args:
        data: DataFrame with the columns ``windspeed`` and ``cnt``. When
            omitted, ``loading_data()`` is called at call time; a
            definition-time default would be shared, so repeated calls
            without arguments would transform the same frame again.

    Returns:
        The same DataFrame with the two columns transformed.
    """
    if data is None:
        data = loading_data()
    data["windspeed"] = np.log1p(data["windspeed"])
    data["cnt"] = np.sqrt(data["cnt"])
    return data
Example #19
0
def convert_date_time(data=None):
    """Convert the ``dteday`` column to pandas datetimes, in place.

    Args:
        data: DataFrame with a ``dteday`` column. When omitted,
            ``loading_data()`` is called at call time instead of once at
            import time.

    Returns:
        The same DataFrame with ``dteday`` converted via ``pd.to_datetime``.
    """
    if data is None:
        data = loading_data()
    data["dteday"] = pd.to_datetime(data["dteday"])
    return data
Example #20
0
def test_test_train_split(data=None):
    """The split should return a tuple.

    Args:
        data: Preprocessed DataFrame to split. When omitted it is built at
            call time, so a download or preprocessing failure fails this
            test only instead of breaking import/collection of the module.
    """
    if data is None:
        data = whole_preprocessing(loading_data())
    assert isinstance(test_train_split(data), tuple)