Example #1
def test_get_siti_codsito_given_index():
    
    #Load up the df_tratte table
    df_tratte = pd.read_csv("./d/data_frame_tratte_meteo_suedtirol_fixed.csv",
                            sep=",",
                            encoding="ISO-8859-1")

    assert(df_tratte is not None)
    
    #Single-digit station code, zero-padded to eight characters; first one in the folder
    assert(get_siti_codsito_given_index(0, df_tratte) == "00000002")

    #Two-digit station code, also zero-padded
    assert(get_siti_codsito_given_index(11, df_tratte) == "00000013")
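The helper under test is not shown on this page; a minimal sketch of get_siti_codsito_given_index as the assertions imply it behaves (the SITI_CODSITO column name is an assumption):

def get_siti_codsito_given_index(index, df_tratte):
    # Return the station code in the given row, zero-padded to eight characters.
    # NOTE: the column name "SITI_CODSITO" is assumed, not confirmed by this page.
    return str(df_tratte.iloc[index]["SITI_CODSITO"]).zfill(8)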
Example #2
def load_input_data_frame(traffic_level_label):
    df_tratte = pd.read_csv("./d/data_frame_tratte_meteo_suedtirol_fixed.csv",
                            sep=",",
                            encoding="ISO-8859-1")

    #Let's load up the dataset for Ora, i.e., the first station in the folder
    siti_codsito = get_siti_codsito_given_index(0, df_tratte)
    input_data_frame = load_df_traffic_station(
        siti_codsito, input_path="./d/Stations_Past_2018")
    input_data_frame_processed = preprocess_data_frame(input_data_frame)
    input_data_frame = input_data_frame_processed.dropna()

    assert (len(input_data_frame) > 0)

    #Let's check if there are indeed no NAs
    assert (sum(len(input_data_frame) - input_data_frame.count()) == 0)

    traffic_y_df = input_data_frame[[traffic_level_label]]
    traffic_X_df = input_data_frame[[
        "TEMPERATURE", "NIEDERSCHLAG", "HOUR", "WEEK_DAY"
    ]]
    #Our data is imbalanced across traffic-level categories; let's re-balance it via random resampling (sketched below)
    traffic_X_df_balanced, traffic_y_df_balanced = random_resample_from_categories(
        traffic_X_df, traffic_y_df, traffic_level_label)

    return (traffic_X_df, traffic_y_df, traffic_X_df_balanced,
            traffic_y_df_balanced)
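random_resample_from_categories is not shown here; a minimal sketch of random oversampling per category, assuming the helper samples every label group up to the majority class size:

def random_resample_from_categories(X_df, y_df, label):
    # Hypothetical sketch: oversample each traffic-level category to the majority size
    df = X_df.join(y_df)
    max_size = df[label].value_counts().max()
    balanced = df.groupby(label, group_keys=False).apply(
        lambda group: group.sample(max_size, replace=True, random_state=101))
    return balanced.drop(columns=[label]), balanced[[label]]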
Example #3
def test_preprocess_data_frame():
    
    #Let's check that we can correctly preprocess a data frame and add meta-data to it
    df_tratte = pd.read_csv("./d/data_frame_tratte_meteo_suedtirol_fixed.csv",
                            sep=",",
                            encoding="ISO-8859-1")
    
    #Let's get the first siti_codsito
    siti_codsito_init = get_siti_codsito_given_index(0, df_tratte)
    
    input_data_frame = load_df_traffic_station(siti_codsito_init, input_path="./d/Stations_Past_2018")
    
    preprocessed_data_frame = preprocess_data_frame(input_data_frame)
    
    display_info_about_station(0, "000000002", 1, df_tratte)
    
    display_info_about_station(0, "000000002", 2, df_tratte)

    
    #the preprocessed data frame has more than 1 row
    assert(len(preprocessed_data_frame) > 1)
    
    #the number of rows in the pre-processed data frame is the same as in the original loaded data frame
    assert(len(preprocessed_data_frame) == len(input_data_frame))
    
    #Let's check if the columns "WEEK_DAY" and "HOUR" do exist in the preprocessed data frame
    assert("WEEK_DAY" in preprocessed_data_frame.columns)
    
    assert("HOUR" in preprocessed_data_frame.columns)
Example #4
def test_train_nn_numerical_output():
    traffic_num_label = "COUNT_1"

    df_tratte = pd.read_csv("./d/data_frame_tratte_meteo_suedtirol_fixed.csv",
                            sep=",",
                            encoding="ISO-8859-1")
    siti_codsito = get_siti_codsito_given_index(0, df_tratte)
    input_data_frame = load_df_traffic_station(
        siti_codsito, input_path="./d/Stations_Past_2018")

    input_data_frame_processed = preprocess_data_frame(input_data_frame)

    train_nn_model_numerical_output(input_data_frame_processed,
                                    traffic_num_label, siti_codsito, "./d")

    #Let's check if the scalers, encoders and the numeric NN model have been successfully dumped to disk
    dumped_files = [
        "./d/scaler_temperature_COUNT_1_00000002.pck",
        "./d/scaler_niederschlag_COUNT_1_00000002.pck",
        "./d/encoder_week_day_COUNT_1_00000002.pck",
        "./d/encoder_hours_COUNT_1_00000002.pck",
        "./d/scaler_number_vehicles_COUNT_1_00000002.pck",
        "./d/nn_numeric_COUNT_1_00000002.pck"
    ]
    with not_raises(Exception):
        check_list_files_exist(dumped_files)

    #Let's remove the generated scalers, encoders and the NN model
    for dumped_file in dumped_files:
        os.remove(dumped_file)
Example #5
def test_nn_numeric_scalers_encoders():
    traffic_num_label = "COUNT_1"
    df_tratte = pd.read_csv("./d/data_frame_tratte_meteo_suedtirol_fixed.csv",
                            sep=",",
                            encoding="ISO-8859-1")
    #Let's load up the dataset for Ora, i.e., the first station in the folder
    siti_codsito = get_siti_codsito_given_index(0, df_tratte)

    input_data_frame = load_df_traffic_station(
        siti_codsito, input_path="./d/Stations_Past_2018")
    input_data_frame_processed = preprocess_data_frame(input_data_frame)

    traffic_y_df_numeric = input_data_frame_processed[[traffic_num_label]]
    traffic_X_df = input_data_frame_processed[[
        "TEMPERATURE", "NIEDERSCHLAG", "HOUR", "WEEK_DAY"
    ]]

    #Let's check that all values in the y labels are indeed numeric (pandas yields numpy integer scalars, not plain ints)
    assert (all(
        isinstance(y_label, (int, np.integer))
        for y_label in traffic_y_df_numeric[traffic_num_label]))
    #Let's check that all values in the y labels are >= 0
    assert (all(y_label >= 0
                for y_label in traffic_y_df_numeric[traffic_num_label]))

    assert (len(traffic_X_df) == len(input_data_frame)
            and len(traffic_y_df_numeric) == len(input_data_frame))

    #Test if the 'fit_scalers_encoders' method does indeed work
    scaler_temperature, scaler_niederschlag, encoder_week_day, encoder_hours = fit_scalers_encoders(
        traffic_X_df)

    #Let's create and fit the scaler for the y labels (number of vehicles)
    scaler_number_vehicles = MinMaxScaler()
    scaler_number_vehicles.fit(traffic_y_df_numeric)

    assert (scaler_temperature is not None and scaler_niederschlag is not None
            and encoder_week_day is not None and encoder_hours is not None
            and scaler_number_vehicles is not None)
    assert (type(scaler_temperature) == StandardScaler
            and type(scaler_niederschlag) == StandardScaler
            and type(encoder_week_day) == OneHotEncoder
            and type(encoder_hours) == OneHotEncoder
            and type(scaler_number_vehicles) == MinMaxScaler)

    #Now let's use the scaler for scaling the y labels
    traffic_y_scaled = scaler_number_vehicles.transform(
        traffic_y_df_numeric).flatten("C")

    #Let's check if the shape of the scaled y labels is correct
    assert (traffic_y_scaled.shape[0] == len(input_data_frame)
            and len(traffic_y_scaled) == len(input_data_frame))
    assert (traffic_y_scaled.ndim == 1)

    #Let's check that the values in the scaled y labels are indeed between 0 and 1
    assert (all(0 <= y_label <= 1 for y_label in traffic_y_scaled))
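A hypothetical sketch of fit_scalers_encoders consistent with the type assertions above; the per-column wiring is an assumption:

from sklearn.preprocessing import StandardScaler, OneHotEncoder

def fit_scalers_encoders(traffic_X_df):
    # Z-scale the numeric columns, one-hot encode the categorical ones
    scaler_temperature = StandardScaler().fit(traffic_X_df[["TEMPERATURE"]])
    scaler_niederschlag = StandardScaler().fit(traffic_X_df[["NIEDERSCHLAG"]])
    # use sparse_output=False instead on scikit-learn >= 1.2
    encoder_week_day = OneHotEncoder(sparse=False).fit(traffic_X_df[["WEEK_DAY"]])
    encoder_hours = OneHotEncoder(sparse=False).fit(traffic_X_df[["HOUR"]])
    return scaler_temperature, scaler_niederschlag, encoder_week_day, encoder_hours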
Example #6
def test_fit_scalers_encoders():
    
    #Let's load up a sample dataset
    df_tratte = pd.read_csv("./d/data_frame_tratte_meteo_suedtirol_fixed.csv",
                            sep=",",
                            encoding="ISO-8859-1")
    #Let's get the first siti_codsito
    siti_codsito_init = get_siti_codsito_given_index(0, df_tratte)
    input_data_frame = load_df_traffic_station(siti_codsito_init, input_path="./d/Stations_Past_2018")
    
    traffic_X_preprocessed = preprocess_data_frame(input_data_frame)
    
    #Let's get the X data only
    traffic_X_df = traffic_X_preprocessed[["TEMPERATURE", "NIEDERSCHLAG", "HOUR", "WEEK_DAY"]]
    
    #create scalers and encoders for X data
    scaler_temperature, scaler_niederschlag, encoder_week_day, encoder_hours = fit_scalers_encoders(traffic_X_df)
    
    assert(scaler_temperature is not None and scaler_niederschlag is not None
           and encoder_week_day is not None and encoder_hours is not None)
    
    #Let's apply the scalers and encoders to the X data
    X_data_concat = preprocess_X_data_nn(traffic_X_df, scaler_temperature, scaler_niederschlag, encoder_week_day, encoder_hours)
    
    #Let's check if the X data preprocessed has the same length as the input X df
    assert(len(X_data_concat) == len(traffic_X_df))
    
    #Let's check if the scalers and encoders have been applied correctly, namely:

    #HOUR is categorical --> one-hot encoded, so exactly one of the 24 hour dummies must be 1
    assert(sum(X_data_concat[0][0:24]) == 1)

    #Let's check if the hour has been correctly encoded, namely midnight maps to [1, 0, 0, ..., 0]
    assert(X_data_concat[0][0] == 1 and traffic_X_df.iloc[0]["HOUR"] == 0)

    #WEEK_DAY is categorical --> one-hot encoded, so exactly one of the 7 week-day dummies must be 1
    assert(sum(X_data_concat[0][24:31]) == 1)

    #Let's check if the WEEK_DAY has been correctly encoded, namely Monday maps to [1, 0, 0, ..., 0]
    assert(X_data_concat[0][24] == 1 and traffic_X_df.iloc[0]["WEEK_DAY"] == 0)

    #NIEDERSCHLAG is numeric --> check if it has been Z-scaled properly to a known value
    assert(-0.144775 < X_data_concat[0][31] < -0.144774)

    #TEMPERATURE is numeric --> check if it has been Z-scaled properly to a known value
    assert(-1.40344 < X_data_concat[0][32] < -1.403439)
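A sketch of preprocess_X_data_nn matching the column layout the index assertions rely on (24 hour dummies, 7 week-day dummies, then scaled NIEDERSCHLAG and TEMPERATURE); the exact implementation is an assumption:

import numpy as np

def preprocess_X_data_nn(X_df, scaler_temperature, scaler_niederschlag,
                         encoder_week_day, encoder_hours):
    # Columns 0-23: hour dummies; 24-30: week-day dummies; 31: rain; 32: temperature
    hours = encoder_hours.transform(X_df[["HOUR"]])
    week_days = encoder_week_day.transform(X_df[["WEEK_DAY"]])
    niederschlag = scaler_niederschlag.transform(X_df[["NIEDERSCHLAG"]])
    temperature = scaler_temperature.transform(X_df[["TEMPERATURE"]])
    return np.concatenate([hours, week_days, niederschlag, temperature], axis=1)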
Example #7
def test_load_df_traffic_station():
    
    ###Test for loading up a valid dataset
    df_tratte = pd.read_csv("./d/data_frame_tratte_meteo_suedtirol_fixed.csv",
                            sep=",",
                            encoding="ISO-8859-1")
    
    #Let's get the first siti_codsito
    siti_codsito_init = get_siti_codsito_given_index(0, df_tratte)
    
    input_data_frame = load_df_traffic_station(siti_codsito_init, input_path="./d/Stations_Past_2018")
    
    #Check if a data frame was indeed loaded up
    assert(isinstance(input_data_frame, pd.DataFrame))
    
    #Let's see if the loaded dataframe has more than 1 row
    assert(len(input_data_frame) > 1)
    
    ###Test for loading up a file in a non-existing folder
    with pytest.raises(FileNotFoundError):
        input_data_frame = load_df_traffic_station(siti_codsito_init, input_path="./d/Stations_Dummy_2018")
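A minimal sketch of load_df_traffic_station; the file-name pattern is an assumption, but pd.read_csv does raise FileNotFoundError for a missing path, which the pytest.raises check relies on:

import os
import pandas as pd

def load_df_traffic_station(siti_codsito, input_path):
    # Hypothetical file-name pattern: one CSV per station code
    return pd.read_csv(os.path.join(input_path, siti_codsito + ".csv"))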
Example #8
def test_train_models_categorical_output():
    df_tratte = pd.read_csv("./d/data_frame_tratte_meteo_suedtirol_fixed.csv",
                            sep=",",
                            encoding="ISO-8859-1")

    #Let's load up the dataset for Ora, i.e., the first station in the folder
    siti_codsito = get_siti_codsito_given_index(0, df_tratte)
    input_data_frame = load_df_traffic_station(
        siti_codsito, input_path="./d/Stations_Past_2018")

    input_data_frame_preprocessed = preprocess_data_frame(input_data_frame)

    list_traffic_labels = ["TRAFFIC_1", "TRAFFIC_2"]

    for traffic_level_label in list_traffic_labels:
        list_models = train_models_categorical_output(
            input_data_frame_preprocessed, traffic_level_label, "00000002",
            "./d/")

        #Let's check if the models have indeed been successfully created
        assert (list_models is not None)
        assert (len(list_models) == 3)

        assert (type(list_models[0]) == DecisionTreeClassifier)
        assert (type(list_models[1]) == RandomForestClassifier)
        assert (type(list_models[2]) == KNeighborsClassifier)

        #Let's check that the models have been dumped to disk
        with not_raises(Exception):
            check_list_files_exist([
                "./d/kn_model_" + traffic_level_label + "_00000002.pck",
                "./d/opt_dec_tree_" + traffic_level_label + "_00000002.pck",
                "./d/simple_rf_" + traffic_level_label + "_00000002.pck"
            ])

        #Let's delete the created models
        os.remove("./d/kn_model_" + traffic_level_label + "_00000002.pck")
        os.remove("./d/opt_dec_tree_" + traffic_level_label + "_00000002.pck")
        os.remove("./d/simple_rf_" + traffic_level_label + "_00000002.pck")
Example #9
def test_nn_numeric_model_creation():
    #Data loading - Ora Nord.
    traffic_num_label = "COUNT_1"

    df_tratte = pd.read_csv("./d/data_frame_tratte_meteo_suedtirol_fixed.csv",
                            sep=",",
                            encoding="ISO-8859-1")
    siti_codsito = get_siti_codsito_given_index(0, df_tratte)
    input_data_frame = load_df_traffic_station(
        siti_codsito, input_path="./d/Stations_Past_2018")

    input_data_frame_processed = preprocess_data_frame(input_data_frame)
    traffic_X_df = input_data_frame_processed[[
        "TEMPERATURE", "NIEDERSCHLAG", "HOUR", "WEEK_DAY"
    ]]
    traffic_y_df_numeric = input_data_frame_processed[[traffic_num_label]]

    #Scalers and encoders
    scaler_temperature, scaler_niederschlag, encoder_week_day, encoder_hours = fit_scalers_encoders(
        traffic_X_df)
    scaler_number_vehicles = MinMaxScaler()
    scaler_number_vehicles.fit(traffic_y_df_numeric)
    traffic_y_scaled = scaler_number_vehicles.transform(
        traffic_y_df_numeric).flatten("C")
    traffic_X_df_processed = preprocess_X_data_nn(traffic_X_df,
                                                  scaler_temperature,
                                                  scaler_niederschlag,
                                                  encoder_week_day,
                                                  encoder_hours)

    #Holdout split: 80% train, 20% test
    X_train, X_test, y_train, y_test = train_test_split(
        traffic_X_df_processed,
        traffic_y_scaled,
        test_size=0.2,
        random_state=101)

    #Neural Network model generation
    nn_model = Sequential()
    nn_model.add(
        Dense(100,
              input_shape=(X_train.shape[1], ),
              kernel_initializer='normal',
              activation='relu'))
    nn_model.add(Dense(100, activation="relu"))
    nn_model.add(Dense(100, activation="relu"))
    nn_model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    nn_model.compile(loss='mse', optimizer=Adam())
    run_hist = nn_model.fit(X_train,
                            y_train,
                            validation_data=(X_test, y_test),
                            epochs=10,
                            verbose=1)

    plot_train_valid_loss(run_hist)

    #Let's check if the NN model has been successfully generated and fitted
    assert (nn_model is not None)
    assert (type(nn_model) == Sequential)

    #The model has been trained for the requested number of epochs
    assert (len(run_hist.history["val_loss"]) == 10
            and len(run_hist.history["loss"]) == 10)

    #The final training and validation losses (MSE) must be positive
    loss_train = round(run_hist.history["loss"][-1], 3)
    loss_test = round(run_hist.history["val_loss"][-1], 3)

    assert (loss_train > 0)
    assert (loss_test > 0)

    mean_diff_perc_rounded_train = compute_percentage_error(
        nn_model, X_train, y_train, scaler_number_vehicles, "Training")
    mean_diff_perc_rounded_test = compute_percentage_error(
        nn_model, X_test, y_test, scaler_number_vehicles, "Test")

    assert (mean_diff_perc_rounded_test > 0
            and mean_diff_perc_rounded_train > 0)
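A hypothetical sketch of compute_percentage_error: undo the MinMax scaling, then report the rounded mean absolute percentage deviation (the zero-count guard is an assumption):

import numpy as np

def compute_percentage_error(model, X, y_scaled, scaler, split_name):
    # Invert the scaling on both predictions and ground truth
    predictions = scaler.inverse_transform(model.predict(X).reshape(-1, 1)).flatten()
    actuals = scaler.inverse_transform(y_scaled.reshape(-1, 1)).flatten()
    # Guard against division by zero for hours with no recorded vehicles
    mean_diff_perc = np.mean(np.abs(predictions - actuals) / np.maximum(actuals, 1.0)) * 100
    print(split_name, "mean percentage error:", round(mean_diff_perc, 2))
    return round(mean_diff_perc, 2)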