def cleaning_reg_ex(data=None):
    """Replace ``[``, ``]`` and ``<`` in column labels with underscores.

    Args:
        data: DataFrame whose column labels are cleaned in place. When
            omitted, the frame returned by ``loading_data()`` is used.

    Returns:
        The same DataFrame object, with its ``columns`` reassigned.
    """
    # Resolve the default lazily: a ``loading_data()`` default would run once
    # at definition time and the resulting frame would be shared (and
    # mutated) across every call that relies on it.
    if data is None:
        data = loading_data()

    forbidden = re.compile(r"[\[\]<]")
    renamed = []
    for col in data.columns.values:
        label = str(col)
        cleaned = forbidden.sub("_", label)
        # Labels that need no cleaning keep their original object (and type).
        # Labels that do are replaced by the cleaned string, which also
        # covers non-string labels whose text contains a bracket.
        renamed.append(col if cleaned == label else cleaned)
    data.columns = renamed
    return data
def test_loading_data_return_numberColumns():
    """``loading_data()`` should expose as many columns as the raw CSV."""
    reference_url = "https://ndownloader.figshare.com/files/20976540"
    loaded = loading_data()
    # Read the same file directly to obtain the reference column count.
    reference = pd.read_csv(
        reference_url,
        index_col="instant",
        parse_dates=True,
    )
    assert len(loaded.columns) == len(reference.columns)
def test_loading_data_retur_describe():
    """``loading_data()`` should yield the same column dtypes as the raw CSV."""
    loaded = loading_data()
    reference = pd.read_csv(
        "https://ndownloader.figshare.com/files/20976540",
        index_col="instant",
        parse_dates=True,
    )
    # Compare the dtypes themselves (column name -> dtype). Comparing the
    # return values of two ``print`` calls would be ``None == None`` and
    # could never fail.
    assert loaded.dtypes.to_dict() == reference.dtypes.to_dict()
def test_train_split(data=None):
    """Split the preprocessed frame into train/test features and targets.

    Rows are taken by position: the first 15211 rows form the training set
    and the rows from position 15211 up to (not including) 17379 form the
    test set, so the two parts are contiguous. The columns ``dteday``,
    ``casual``, ``registered`` and ``atemp`` are dropped from both parts;
    ``cnt`` is the target.

    Args:
        data: Preprocessed DataFrame. When omitted, it is built with
            ``whole_preprocessing(loading_data())``.

    Returns:
        Tuple ``(train_x, train_y, test_x, test_y)``: feature frames without
        ``cnt`` and the matching ``cnt`` Series.
    """
    # NOTE(review): the ``test_`` prefix presumably makes pytest collect this
    # helper as a test as well -- confirm whether that is intended.
    # Build the default lazily instead of at definition time.
    if data is None:
        data = whole_preprocessing(loading_data())

    train_end = 15211
    test_end = 17379
    unused_columns = ["dteday", "casual", "registered", "atemp"]

    # The test slice starts exactly where the training slice stops; starting
    # one position later would silently discard the row at ``train_end``.
    train_part = data.iloc[:train_end].drop(columns=unused_columns)
    test_part = data.iloc[train_end:test_end].drop(columns=unused_columns)

    # Separate the independent variables from the target on the training data.
    train_x = train_part.drop(columns=["cnt"])
    train_y = train_part["cnt"]

    # Separate the independent variables from the target on the test data.
    test_x = test_part.drop(columns=["cnt"])
    test_y = test_part["cnt"]

    return train_x, train_y, test_x, test_y
def feature_engeneering(data=None):
    """Add hour/season indicator columns and binned temperature/humidity.

    The following columns are written onto ``data`` in place:

    * ``IsOfficeHour``, ``IsDaytime``, ``IsRushHourMorning``,
      ``IsRushHourEvening``, ``IsHighSeason``: 0/1 flags derived from
      ``hr``, ``weekday`` and ``season``.
    * ``temp_binned``, ``hum_binned``: ``pd.cut`` of ``temp`` and ``hum``
      over the edges ``[0, 0.19, 0.49, 0.69, 0.89, 1]``.

    Args:
        data: DataFrame with ``hr``, ``weekday``, ``season``, ``temp`` and
            ``hum`` columns. When omitted, ``loading_data()`` is used.

    Returns:
        The same DataFrame object with the new columns added.
    """
    # Resolve the default lazily; a ``loading_data()`` default would be
    # evaluated once at definition time and then mutated by every call.
    if data is None:
        data = loading_data()

    hour = data["hr"]
    # NOTE(review): only ``weekday == 1`` is selected below -- confirm that a
    # single weekday (rather than every working day) is intended.
    is_weekday_one = data["weekday"] == 1

    def as_flag(mask):
        # Turn a boolean mask into a 0/1 integer array.
        return np.where(mask, 1, 0)

    data["IsOfficeHour"] = as_flag((hour >= 9) & (hour < 17) & is_weekday_one)
    data["IsDaytime"] = as_flag((hour >= 6) & (hour < 22))
    data["IsRushHourMorning"] = as_flag(
        (hour >= 6) & (hour < 10) & is_weekday_one
    )
    data["IsRushHourEvening"] = as_flag(
        (hour >= 15) & (hour < 19) & is_weekday_one
    )
    data["IsHighSeason"] = as_flag(data["season"] == 3)

    # NOTE(review): with pd.cut's defaults the intervals are right-closed and
    # the lowest edge is excluded, so a value of exactly 0 is binned as NaN.
    # Confirm whether ``include_lowest=True`` is wanted (it would change the
    # label of the first bin).
    bin_edges = [0, 0.19, 0.49, 0.69, 0.89, 1]
    data["temp_binned"] = pd.cut(data["temp"], bin_edges)
    data["hum_binned"] = pd.cut(data["hum"], bin_edges)
    return data
def convert_to_category(data=None):
    """Cast the categorical feature columns of ``data`` to ``category`` dtype.

    Args:
        data: DataFrame containing every column listed below, including the
            engineered flag and binned columns. When omitted,
            ``loading_data()`` is used.

    Returns:
        The same DataFrame object with the listed columns converted in place.

    Raises:
        KeyError: If any of the listed columns is missing from ``data``.
    """
    # Resolve the default lazily instead of at definition time.
    if data is None:
        data = loading_data()

    # Named differently from the function so the local list does not shadow it.
    categorical_columns = [
        "season",
        "yr",
        "mnth",
        "hr",
        "holiday",
        "weekday",
        "workingday",
        "weathersit",
        "IsOfficeHour",
        "IsDaytime",
        "IsRushHourMorning",
        "IsRushHourEvening",
        "IsHighSeason",
        "temp_binned",
        "hum_binned",
    ]
    for column in categorical_columns:
        data[column] = data[column].astype("category")
    return data
def test_predict_data_cols_len():
    """The fully preprocessed frame is expected to have 85 columns."""
    processed = whole_preprocessing(loading_data())
    assert len(processed.columns) == 85
def test_predict_output_len():
    """``predict`` on the loaded data should return a value of length 21."""
    prediction = predict(loading_data())
    assert len(prediction) == 21
def test_predict_type():
    """``predict`` on the loaded data should return exactly a ``str``."""
    prediction = predict(loading_data())
    # Exact type comparison, as in the original check (subclasses fail).
    assert type(prediction) == str
def test_loading_data_return_DF():
    """``loading_data()`` should return exactly a ``pandas.DataFrame``."""
    loaded = loading_data()
    result_type = type(loaded)
    assert result_type is pd.DataFrame
def test_predict_data_rows_len():
    """The fully preprocessed frame is expected to keep all 17379 rows."""
    processed = whole_preprocessing(loading_data())
    assert len(processed) == 17379
def test_test_train_split_column_length(data=None):
    """The training feature frame is expected to have 80 columns.

    Args:
        data: Preprocessed DataFrame to split. When omitted, it is built
            with ``whole_preprocessing(loading_data())``.
    """
    # Build the input when the test runs, not when the module is imported:
    # a call in the default would execute the whole pipeline at definition
    # time.
    if data is None:
        data = whole_preprocessing(loading_data())

    train_features = test_train_split(data)[0]
    assert len(train_features.columns) == 80
def test_test_train_split_length(data=None):
    """The training feature frame is expected to have 15211 rows.

    Args:
        data: Preprocessed DataFrame to split. When omitted, it is built
            with ``whole_preprocessing(loading_data())``.
    """
    # Build the input when the test runs, not when the module is imported.
    if data is None:
        data = whole_preprocessing(loading_data())

    train_features = test_train_split(data)[0]
    assert len(train_features) == 15211
def get_dummies(data=None):
    """Return ``pd.get_dummies(data)``, i.e. ``data`` with dummy columns.

    Args:
        data: DataFrame to encode. When omitted, ``loading_data()`` is used.

    Returns:
        The new DataFrame produced by ``pd.get_dummies``.
    """
    # Resolve the default lazily instead of at definition time.
    if data is None:
        data = loading_data()
    return pd.get_dummies(data)
def test_test_train_split_target_type(data=None):
    """The training target returned by the split should be a pandas Series.

    Args:
        data: Preprocessed DataFrame to split. When omitted, it is built
            with ``whole_preprocessing(loading_data())``.
    """
    # Build the input when the test runs, not when the module is imported.
    if data is None:
        data = whole_preprocessing(loading_data())

    train_target = test_train_split(data)[1]
    # Use the public ``pd.Series`` name rather than the internal
    # ``pd.core.series`` module path.
    assert isinstance(train_target, pd.Series)
def test_test_train_split_length_tuple(data=None):
    """The split should return four items (train x/y and test x/y).

    Args:
        data: Preprocessed DataFrame to split. When omitted, it is built
            with ``whole_preprocessing(loading_data())``.
    """
    # Build the input when the test runs, not when the module is imported.
    if data is None:
        data = whole_preprocessing(loading_data())

    parts = test_train_split(data)
    assert len(parts) == 4
def test_test_train_split_train_not_cnt(data=None):
    """Neither feature frame returned by the split may contain ``cnt``.

    Args:
        data: Preprocessed DataFrame to split. When omitted, it is built
            with ``whole_preprocessing(loading_data())``.
    """
    # Build the input when the test runs, not when the module is imported.
    if data is None:
        data = whole_preprocessing(loading_data())

    train_features, _, test_features, _ = test_train_split(data)
    # Index 2 (the test features) is the frame this test has always checked.
    # The training features are checked too, as the test name suggests.
    assert "cnt" not in train_features.columns
    assert "cnt" not in test_features.columns
def skewness(data=None):
    """Apply ``log1p`` to ``windspeed`` and a square root to ``cnt`` in place.

    The transforms are applied to whatever values the columns currently
    hold, so calling this twice on the same frame transforms them twice.

    Args:
        data: DataFrame with ``windspeed`` and ``cnt`` columns. When omitted,
            ``loading_data()`` is used.

    Returns:
        The same DataFrame object with both columns overwritten.
    """
    # Resolve the default lazily: a frame created once at definition time
    # would be shared by every default call and re-transformed each time.
    if data is None:
        data = loading_data()

    data["windspeed"] = np.log1p(data["windspeed"])
    data["cnt"] = np.sqrt(data["cnt"])
    return data
def convert_date_time(data=None):
    """Convert the ``dteday`` column to datetimes with ``pd.to_datetime``.

    Args:
        data: DataFrame with a ``dteday`` column. When omitted,
            ``loading_data()`` is used.

    Returns:
        The same DataFrame object with ``dteday`` overwritten.
    """
    # Resolve the default lazily instead of at definition time.
    if data is None:
        data = loading_data()

    data["dteday"] = pd.to_datetime(data["dteday"])
    return data
def test_test_train_split(data=None):
    """The split helper should return its four parts as a tuple.

    Args:
        data: Preprocessed DataFrame to split. When omitted, it is built
            with ``whole_preprocessing(loading_data())``.
    """
    # Build the input when the test runs, not when the module is imported.
    if data is None:
        data = whole_preprocessing(loading_data())

    parts = test_train_split(data)
    assert isinstance(parts, tuple)