def load_input_data_frame(traffic_level_label):
    df_tratte = pd.read_csv("./d/data_frame_tratte_meteo_suedtirol_fixed.csv",
                            sep=",",
                            encoding="ISO-8859-1")
    # Load the dataset for Ora, i.e., the first station available
    siti_codsito = get_siti_codsito_given_index(0, df_tratte)
    input_data_frame = load_df_traffic_station(
        siti_codsito, input_path="./d/Stations_Past_2018")
    input_data_frame_processed = preprocess_data_frame(input_data_frame)
    input_data_frame = input_data_frame_processed.dropna()
    assert len(input_data_frame) > 0
    # Double-check that no NAs remain after dropna()
    assert sum(len(input_data_frame) - input_data_frame.count()) == 0
    traffic_y_df = input_data_frame[[traffic_level_label]]
    traffic_X_df = input_data_frame[[
        "TEMPERATURE", "NIEDERSCHLAG", "HOUR", "WEEK_DAY"
    ]]
    # The data is unbalanced across traffic levels; re-balance it via random resampling
    traffic_X_df_balanced, traffic_y_df_balanced = random_resample_from_categories(
        traffic_X_df, traffic_y_df, traffic_level_label)
    return (traffic_X_df, traffic_y_df, traffic_X_df_balanced,
            traffic_y_df_balanced)
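
# For reference, a minimal sketch of what `random_resample_from_categories`
# could look like: random oversampling with replacement so that every traffic
# level ends up with as many rows as the largest category. This is an
# illustrative assumption, not the project's actual implementation.
def random_resample_from_categories_sketch(X_df, y_df, label):
    import numpy as np
    max_count = y_df[label].value_counts().max()
    X_parts, y_parts = [], []
    for category in y_df[label].unique():
        idx = y_df.index[y_df[label] == category]
        # Sample with replacement up to the size of the largest category
        resampled_idx = np.random.choice(idx, size=max_count, replace=True)
        X_parts.append(X_df.loc[resampled_idx])
        y_parts.append(y_df.loc[resampled_idx])
    return (pd.concat(X_parts, ignore_index=True),
            pd.concat(y_parts, ignore_index=True))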
def test_preprocess_data_frame():
    # Check that we can correctly preprocess a data frame and add meta-data to it
    df_tratte = pd.read_csv("./d/data_frame_tratte_meteo_suedtirol_fixed.csv",
                            sep=",",
                            encoding="ISO-8859-1")
    # Get the first siti_codsito
    siti_codsito_init = get_siti_codsito_given_index(0, df_tratte)
    input_data_frame = load_df_traffic_station(
        siti_codsito_init, input_path="./d/Stations_Past_2018")
    preprocessed_data_frame = preprocess_data_frame(input_data_frame)
    display_info_about_station(0, "000000002", 1, df_tratte)
    display_info_about_station(0, "000000002", 2, df_tratte)
    # The preprocessed data frame has more than one row
    assert len(preprocessed_data_frame) > 1
    # Preprocessing must not drop rows: same row count as the original loaded frame
    assert len(preprocessed_data_frame) == len(input_data_frame)
    # The derived columns "WEEK_DAY" and "HOUR" must exist in the preprocessed frame
    assert "WEEK_DAY" in preprocessed_data_frame.columns
    assert "HOUR" in preprocessed_data_frame.columns
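
# A minimal sketch of the contract the assertions above rely on: preprocessing
# derives "HOUR" (0-23) and "WEEK_DAY" (0 = Monday .. 6 = Sunday, matching the
# encoding checks further below) from a raw timestamp column. The column name
# "DATE" is a placeholder assumption; the real data may use a different name.
def preprocess_data_frame_sketch(raw_df):
    out_df = raw_df.copy()
    timestamps = pd.to_datetime(out_df["DATE"])  # hypothetical column name
    out_df["HOUR"] = timestamps.dt.hour          # 0-23, later one-hot encoded
    out_df["WEEK_DAY"] = timestamps.dt.weekday   # 0 = Monday
    return out_df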
def test_train_nn_numerical_output():
    traffic_num_label = "COUNT_1"
    df_tratte = pd.read_csv("./d/data_frame_tratte_meteo_suedtirol_fixed.csv",
                            sep=",",
                            encoding="ISO-8859-1")
    siti_codsito = get_siti_codsito_given_index(0, df_tratte)
    input_data_frame = load_df_traffic_station(
        siti_codsito, input_path="./d/Stations_Past_2018")
    input_data_frame_processed = preprocess_data_frame(input_data_frame)
    train_nn_model_numerical_output(input_data_frame_processed,
                                    traffic_num_label, siti_codsito, "./d")
    # Check that the scalers, encoders and the numeric NN model have been
    # successfully dumped to disk
    with not_raises(Exception):
        check_list_files_exist([
            "./d/scaler_temperature_COUNT_1_00000002.pck",
            "./d/scaler_niederschlag_COUNT_1_00000002.pck",
            "./d/encoder_week_day_COUNT_1_00000002.pck",
            "./d/encoder_hours_COUNT_1_00000002.pck",
            "./d/scaler_number_vehicles_COUNT_1_00000002.pck",
            "./d/nn_numeric_COUNT_1_00000002.pck"
        ])
    # Clean up the generated scalers, encoders and the NN model
    os.remove("./d/scaler_temperature_COUNT_1_00000002.pck")
    os.remove("./d/scaler_niederschlag_COUNT_1_00000002.pck")
    os.remove("./d/encoder_week_day_COUNT_1_00000002.pck")
    os.remove("./d/encoder_hours_COUNT_1_00000002.pck")
    os.remove("./d/scaler_number_vehicles_COUNT_1_00000002.pck")
    os.remove("./d/nn_numeric_COUNT_1_00000002.pck")
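
# For reference, minimal sketches of the two test helpers used above, under the
# assumption that they behave as their names suggest (illustrative only, not
# necessarily the project's exact implementations).
from contextlib import contextmanager

@contextmanager
def not_raises_sketch(exception_type):
    # Inverse of pytest.raises: fail the test if the block raises
    try:
        yield
    except exception_type as err:
        pytest.fail("Block raised {}: {}".format(exception_type.__name__, err))

def check_list_files_exist_sketch(file_paths):
    # Raise FileNotFoundError for the first missing file
    for file_path in file_paths:
        if not os.path.isfile(file_path):
            raise FileNotFoundError(file_path)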
def test_nn_numeric_scalers_encoders():
    traffic_num_label = "COUNT_1"
    df_tratte = pd.read_csv("./d/data_frame_tratte_meteo_suedtirol_fixed.csv",
                            sep=",",
                            encoding="ISO-8859-1")
    # Load the dataset for Ora, i.e., the first station available
    siti_codsito = get_siti_codsito_given_index(0, df_tratte)
    input_data_frame = load_df_traffic_station(
        siti_codsito, input_path="./d/Stations_Past_2018")
    input_data_frame_processed = preprocess_data_frame(input_data_frame)
    traffic_y_df_numeric = input_data_frame_processed[[traffic_num_label]]
    traffic_X_df = input_data_frame_processed[[
        "TEMPERATURE", "NIEDERSCHLAG", "HOUR", "WEEK_DAY"
    ]]
    # All y labels must be integers (np.integer also covers pandas' int64 dtype,
    # which plain isinstance(..., int) would miss)
    assert all(
        isinstance(y_label, (int, np.integer))
        for y_label in traffic_y_df_numeric[traffic_num_label])
    # All y labels must be >= 0
    assert all(y_label >= 0
               for y_label in traffic_y_df_numeric[traffic_num_label])
    assert (len(traffic_X_df) == len(input_data_frame)
            and len(traffic_y_df_numeric) == len(input_data_frame))
    # Check that the 'fit_scalers_encoders' method does indeed work
    scaler_temperature, scaler_niederschlag, encoder_week_day, encoder_hours = fit_scalers_encoders(
        traffic_X_df)
    # Check that the scalers have been successfully generated
    scaler_number_vehicles = MinMaxScaler()
    scaler_number_vehicles.fit(traffic_y_df_numeric)
    assert (scaler_temperature is not None and scaler_niederschlag is not None
            and encoder_week_day is not None and encoder_hours is not None
            and scaler_number_vehicles is not None)
    assert (type(scaler_temperature) == StandardScaler
            and type(scaler_niederschlag) == StandardScaler
            and type(encoder_week_day) == OneHotEncoder
            and type(encoder_hours) == OneHotEncoder
            and type(scaler_number_vehicles) == MinMaxScaler)
    # Now use the scalers and encoders to scale & encode the input data
    traffic_y_df_scaled_balanced = scaler_number_vehicles.transform(
        traffic_y_df_numeric).flatten("C")
    # Check that the shape of the scaled y labels is correct
    assert (traffic_y_df_scaled_balanced.shape[0] == len(input_data_frame)
            and len(traffic_y_df_scaled_balanced) == len(input_data_frame))
    assert traffic_y_df_scaled_balanced.ndim == 1
    # After min-max scaling, all y values must lie between 0 and 1
    assert all(0 <= y_label <= 1 for y_label in traffic_y_df_scaled_balanced)
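
# A minimal sketch of `fit_scalers_encoders`, consistent with the type
# assertions above: StandardScaler for the numeric columns, OneHotEncoder for
# the categorical ones. Illustrative only; the real function may differ.
# Note: `sparse=False` applies to older scikit-learn releases; newer ones use
# `sparse_output=False` instead.
def fit_scalers_encoders_sketch(traffic_X_df):
    scaler_temperature = StandardScaler().fit(traffic_X_df[["TEMPERATURE"]])
    scaler_niederschlag = StandardScaler().fit(traffic_X_df[["NIEDERSCHLAG"]])
    encoder_week_day = OneHotEncoder(sparse=False).fit(traffic_X_df[["WEEK_DAY"]])
    encoder_hours = OneHotEncoder(sparse=False).fit(traffic_X_df[["HOUR"]])
    return scaler_temperature, scaler_niederschlag, encoder_week_day, encoder_hours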
def test_load_df_traffic_station():
    ### Test loading a valid dataset
    df_tratte = pd.read_csv("./d/data_frame_tratte_meteo_suedtirol_fixed.csv",
                            sep=",",
                            encoding="ISO-8859-1")
    # Get the first siti_codsito
    siti_codsito_init = get_siti_codsito_given_index(0, df_tratte)
    input_data_frame = load_df_traffic_station(
        siti_codsito_init, input_path="./d/Stations_Past_2018")
    # Check that a data frame was indeed loaded
    assert type(input_data_frame) == pd.core.frame.DataFrame
    # The loaded data frame has more than one row
    assert len(input_data_frame) > 1
    ### Test loading a file from a non-existing folder
    with pytest.raises(FileNotFoundError):
        input_data_frame = load_df_traffic_station(
            siti_codsito_init, input_path="./d/Stations_Dummy_2018")
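
# A minimal sketch of `load_df_traffic_station`, assuming each station's data
# lives in a CSV named after its siti_codsito inside `input_path`. The file
# naming pattern is a guess for illustration; pd.read_csv raises
# FileNotFoundError on a missing path, which is what the negative test expects.
def load_df_traffic_station_sketch(siti_codsito, input_path):
    return pd.read_csv("{}/{}.csv".format(input_path, siti_codsito))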
def test_fit_scalers_encoders():
    # Load one dummy dataset
    df_tratte = pd.read_csv("./d/data_frame_tratte_meteo_suedtirol_fixed.csv",
                            sep=",",
                            encoding="ISO-8859-1")
    # Get the first siti_codsito
    siti_codsito_init = get_siti_codsito_given_index(0, df_tratte)
    input_data_frame = load_df_traffic_station(
        siti_codsito_init, input_path="./d/Stations_Past_2018")
    traffic_X_preprocessed = preprocess_data_frame(input_data_frame)
    # Keep the X data only
    traffic_X_df = traffic_X_preprocessed[[
        "TEMPERATURE", "NIEDERSCHLAG", "HOUR", "WEEK_DAY"
    ]]
    # Create scalers and encoders for the X data
    scaler_temperature, scaler_niederschlag, encoder_week_day, encoder_hours = fit_scalers_encoders(
        traffic_X_df)
    assert (scaler_temperature is not None and scaler_niederschlag is not None
            and encoder_week_day is not None and encoder_hours is not None)
    # Apply the scalers and encoders to the X data
    X_data_concat = preprocess_X_data_nn(traffic_X_df, scaler_temperature,
                                         scaler_niederschlag, encoder_week_day,
                                         encoder_hours)
    # The preprocessed X data must have the same length as the input X df
    assert len(X_data_concat) == len(traffic_X_df)
    # Check that the scalers and encoders have been applied correctly, namely:
    # HOUR is categorical --> one-hot encoded, so exactly one of the 24 hour slots must be 1
    assert sum(X_data_concat[0][0:24]) == 1
    # Midnight (HOUR == 0) must be encoded as [1, 0, 0, ..., 0]
    assert X_data_concat[0][0] == 1 and traffic_X_df.iloc[0]["HOUR"] == 0
    # WEEK_DAY is categorical --> one-hot encoded, so exactly one of the 7 day slots must be 1
    assert sum(X_data_concat[0][24:31]) == 1
    # Monday (WEEK_DAY == 0) must be encoded as [1, 0, 0, ..., 0]
    assert X_data_concat[0][24] == 1 and traffic_X_df.iloc[0]["WEEK_DAY"] == 0
    # NIEDERSCHLAG is numeric --> check that it has been Z-scaled to a known value
    assert -0.144775 < X_data_concat[0][31] < -0.144774
    # TEMPERATURE is numeric --> check that it has been Z-scaled to a known value
    assert -1.40344 < X_data_concat[0][32] < -1.403439
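
# A minimal sketch of `preprocess_X_data_nn`, consistent with the feature
# layout the index-based assertions above probe: columns 0-23 one-hot hours,
# 24-30 one-hot week days, 31 Z-scaled NIEDERSCHLAG, 32 Z-scaled TEMPERATURE.
# Illustrative only; assumes the encoders return dense arrays.
def preprocess_X_data_nn_sketch(traffic_X_df, scaler_temperature,
                                scaler_niederschlag, encoder_week_day,
                                encoder_hours):
    import numpy as np
    hours_encoded = encoder_hours.transform(traffic_X_df[["HOUR"]])
    week_days_encoded = encoder_week_day.transform(traffic_X_df[["WEEK_DAY"]])
    niederschlag_scaled = scaler_niederschlag.transform(traffic_X_df[["NIEDERSCHLAG"]])
    temperature_scaled = scaler_temperature.transform(traffic_X_df[["TEMPERATURE"]])
    # Column order matters: it is what the index-based assertions rely on
    return np.concatenate([hours_encoded, week_days_encoded,
                           niederschlag_scaled, temperature_scaled], axis=1)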
def test_train_models_categorical_output():
    df_tratte = pd.read_csv("./d/data_frame_tratte_meteo_suedtirol_fixed.csv",
                            sep=",",
                            encoding="ISO-8859-1")
    # Load the dataset for Ora, i.e., the first station available
    siti_codsito = get_siti_codsito_given_index(0, df_tratte)
    input_data_frame = load_df_traffic_station(
        siti_codsito, input_path="./d/Stations_Past_2018")
    input_data_frame_preprocessed = preprocess_data_frame(input_data_frame)
    list_traffic_labels = ["TRAFFIC_1", "TRAFFIC_2"]
    for traffic_level_label in list_traffic_labels:
        list_models = train_models_categorical_output(
            input_data_frame_preprocessed, traffic_level_label, "00000002",
            "./d/")
        # Check that the models have indeed been successfully created
        assert list_models is not None
        assert len(list_models) == 3
        assert type(list_models[0]) == DecisionTreeClassifier
        assert type(list_models[1]) == RandomForestClassifier
        assert type(list_models[2]) == KNeighborsClassifier
        # Check that the models have been dumped to disk, then delete them
        with not_raises(Exception):
            check_list_files_exist([
                "./d/kn_model_" + traffic_level_label + "_00000002.pck",
                "./d/opt_dec_tree_" + traffic_level_label + "_00000002.pck",
                "./d/simple_rf_" + traffic_level_label + "_00000002.pck"
            ])
        os.remove("./d/kn_model_" + traffic_level_label + "_00000002.pck")
        os.remove("./d/opt_dec_tree_" + traffic_level_label + "_00000002.pck")
        os.remove("./d/simple_rf_" + traffic_level_label + "_00000002.pck")
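
# For reference, the ".pck" artifacts checked above are presumably written with
# pickle; a minimal sketch of such a dump, following the observed naming
# convention (illustrative assumption only):
import pickle

def dump_model_sketch(model, model_name, traffic_level_label, siti_codsito,
                      out_dir):
    out_path = "{}/{}_{}_{}.pck".format(out_dir, model_name,
                                        traffic_level_label, siti_codsito)
    with open(out_path, "wb") as out_file:
        pickle.dump(model, out_file)
    return out_path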
def test_nn_numeric_model_creation():
    # Data loading - Ora Nord
    traffic_num_label = "COUNT_1"
    df_tratte = pd.read_csv("./d/data_frame_tratte_meteo_suedtirol_fixed.csv",
                            sep=",",
                            encoding="ISO-8859-1")
    siti_codsito = get_siti_codsito_given_index(0, df_tratte)
    input_data_frame = load_df_traffic_station(
        siti_codsito, input_path="./d/Stations_Past_2018")
    input_data_frame_processed = preprocess_data_frame(input_data_frame)
    traffic_X_df = input_data_frame_processed[[
        "TEMPERATURE", "NIEDERSCHLAG", "HOUR", "WEEK_DAY"
    ]]
    # Take the y labels from the processed frame, consistently with the X data
    traffic_y_df_numeric = input_data_frame_processed[[traffic_num_label]]
    # Scalers and encoders
    scaler_temperature, scaler_niederschlag, encoder_week_day, encoder_hours = fit_scalers_encoders(
        traffic_X_df)
    scaler_number_vehicles = MinMaxScaler()
    scaler_number_vehicles.fit(traffic_y_df_numeric)
    traffic_y_df_scaled_balanced = scaler_number_vehicles.transform(
        traffic_y_df_numeric).flatten("C")
    traffic_X_df_processed = preprocess_X_data_nn(traffic_X_df,
                                                  scaler_temperature,
                                                  scaler_niederschlag,
                                                  encoder_week_day,
                                                  encoder_hours)
    # Holdout split, 80-20
    X_train, X_test, y_train, y_test = train_test_split(
        traffic_X_df_processed,
        traffic_y_df_scaled_balanced,
        test_size=0.2,
        random_state=101)
    # Neural network model generation
    nn_model = Sequential()
    nn_model.add(
        Dense(100,
              input_shape=(X_train.shape[1], ),
              kernel_initializer='normal',
              activation='relu'))
    nn_model.add(Dense(100, activation="relu"))
    nn_model.add(Dense(100, activation="relu"))
    nn_model.add(Dense(1, kernel_initializer='normal'))
    # Compile and fit the model
    nn_model.compile(loss='mse', optimizer=Adam())
    run_hist = nn_model.fit(X_train,
                            y_train,
                            validation_data=(X_test, y_test),
                            epochs=10,
                            verbose=1)
    plot_train_valid_loss(run_hist)
    # Check that the NN model has been successfully generated and fitted
    assert nn_model is not None
    assert type(nn_model) == Sequential
    # The model must have been trained for exactly 10 epochs
    assert (len(run_hist.history["val_loss"]) == 10
            and len(run_hist.history["loss"]) == 10)
    # These are MSE losses, not accuracies: both must be strictly positive
    loss_train = round(run_hist.history["loss"][-1], 3)
    loss_test = round(run_hist.history["val_loss"][-1], 3)
    assert loss_train > 0
    assert loss_test > 0
    mean_diff_perc_rounded_train = compute_percentage_error(
        nn_model, X_train, y_train, scaler_number_vehicles, "Training")
    mean_diff_perc_rounded_test = compute_percentage_error(
        nn_model, X_test, y_test, scaler_number_vehicles, "Test")
    assert (mean_diff_perc_rounded_test > 0
            and mean_diff_perc_rounded_train > 0)
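
# A minimal sketch of `compute_percentage_error`, assuming it inverse-scales
# predictions and targets back to vehicle counts and returns the rounded mean
# absolute percentage difference. Illustrative only; the real helper may differ.
def compute_percentage_error_sketch(model, X_data, y_scaled, scaler, split_name):
    import numpy as np
    y_pred = scaler.inverse_transform(
        model.predict(X_data).reshape(-1, 1)).flatten()
    y_true = scaler.inverse_transform(y_scaled.reshape(-1, 1)).flatten()
    # Avoid division by zero on hours with no traffic
    nonzero = y_true > 0
    mean_diff_perc = np.mean(
        np.abs(y_pred[nonzero] - y_true[nonzero]) / y_true[nonzero]) * 100
    print("{} mean percentage error: {:.3f}%".format(split_name, mean_diff_perc))
    return round(mean_diff_perc, 3)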