def test(self,
             tabular_path: str,
             join_result_path: str,
             model_path: str,
             model_weights_path=None,
             histogram_path=None) -> (float, float, float, float):
        """
        Evaluate the accuracy metrics of a trained  model for spatial join cost estimator
        :return mean_squared_error, mean_absolute_percentage_error, mean_squared_logarithmic_error, mean_absolute_error
        """

        # Load the test data (the commented-out loaders are alternative feature sources)
        # X_train, y_train, X_test, y_test = datasets.load_tabular_features_hadoop(RegressionModel.DISTRIBUTION, RegressionModel.MATCHED, RegressionModel.SCALE, RegressionModel.MINUS_ONE)
        # X_train, y_train, X_test, y_test = datasets.load_tabular_features(join_result_path, tabular_path, RegressionModel.NORMALIZE, RegressionModel.MINUS_ONE, RegressionModel.TARGET)
        X_test, y_test, join_df = datasets.load_data(
            tabular_path, RegressionModel.TARGET, RegressionModel.DROP_COLUMNS,
            RegressionModel.SELECTED_COLUMNS)

        # Load the pickled model and use it for prediction
        with open(model_path, 'rb') as f:
            loaded_model = pickle.load(f)
        y_pred = loaded_model.predict(X_test)

        # Convert back to 1 - y if needed
        if RegressionModel.MINUS_ONE:
            y_test, y_pred = 1 - y_test, 1 - y_pred

        # TODO: delete this dumping action. This is just for debugging
        test_df = pd.DataFrame()
        test_df['y_test'] = y_test
        test_df['y_pred'] = y_pred
        test_df.to_csv('data/temp/test_df.csv')

        # Compute accuracy metrics
        mae = mean_absolute_error(y_test, y_pred)
        mape = mean_absolute_percentage_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        msle = np.mean(mean_squared_logarithmic_error(y_test, y_pred))

        return mae, mape, mse, msle
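
A note on the np.mean(...) wrapper above: the Keras mean_squared_logarithmic_error function returns one value per sample rather than a single scalar, so it is reduced explicitly. A minimal sketch, assuming a TF 2.x eager setup (all numbers are made up):

import numpy as np
from tensorflow.keras.losses import mean_squared_logarithmic_error

# Two samples, three outputs each
y_true = np.array([[1.0, 2.0, 3.0],
                   [2.0, 4.0, 6.0]])
y_pred = np.array([[1.1, 1.9, 3.3],
                   [2.2, 3.8, 6.6]])

per_sample = mean_squared_logarithmic_error(y_true, y_pred)  # shape (2,)
msle = float(np.mean(per_sample))  # single scalar, as in the snippet above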
Example no. 2
def root_mean_squared_log_error(y_true, y_pred):
    # RMSLE: square root of the Keras MSLE loss
    return K.sqrt(mean_squared_logarithmic_error(y_true, y_pred))
# Plot predictions against targets on equal axes
# ('lims' is defined earlier in the original source)
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)

# Histogram of the prediction errors
error = pred_test - target_test
plt.hist(error, bins=25)
plt.xlabel("Prediction Error [OIL_RATE]")
_ = plt.ylabel("Count")

# Predict on a new data point
# Feature order: ['CHOKE','WHP','FLP','WHT']
new_test_data = [[39, 1719, 339, 189]]
new_pred_test = model.predict(pd.DataFrame(new_test_data, columns=['CHOKE','WHP','FLP','WHT']))

print("Shape: {}".format(pred_test.shape))
print(pred_test)

# The mean squared error
mean_square_error = mean_squared_error(target_test, pred_test)
print('Mean squared error: %.2f' % mean_square_error)

# The mean squared logarithmic error
mean_square_logarithmic_error = mean_squared_logarithmic_error(target_test, pred_test)
print('Mean squared logarithmic error: %.2f' % mean_square_logarithmic_error)


# NOTE: classification accuracy is not meaningful for a continuous regression
# target, so this value should be read with caution
acc = metrics.accuracy(target_test, pred_test)
print('Accuracy: {}'.format(acc))


#print_summary(model, line_length=None, positions=None, print_fn=None)
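
MSLE is reported alongside MSE here because it scores relative rather than absolute error, which suits a target like OIL_RATE that can span orders of magnitude. A small illustrative sketch with made-up numbers, in plain NumPy but mirroring the Keras definition:

import numpy as np

def msle(y_true, y_pred):
    # mean of squared log1p differences, the same quantity Keras computes
    return np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2)

# Two predictions that are each 10% low, at very different scales
print(msle(np.array([100.0]), np.array([90.0])))      # ~0.011
print(msle(np.array([10000.0]), np.array([9000.0])))  # ~0.011 again
# MSE would differ by a factor of 10,000 between these two cases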
Example no. 4
def run(tabular_path,
        histogram_path,
        join_result_path,
        model_path,
        model_weights_path,
        is_train=True):
    print('Running the join cardinality estimator')
    print('Tabular data: {}'.format(tabular_path))
    print('Histogram path: {}'.format(histogram_path))
    print('Join result data: {}'.format(join_result_path))

    target = 'join_selectivity'
    num_rows, num_columns = 16, 16
    tabular_features_df = datasets.load_datasets_feature(tabular_path)
    join_data, ds1_histograms, ds2_histograms, ds_all_histogram, ds_bops_histogram = datasets.load_join_data(
        tabular_features_df, join_result_path, histogram_path, num_rows,
        num_columns)

    # The trailing 11 columns are assumed to be metadata/target columns, not features
    num_features = len(join_data.columns) - 11

    if is_train:
        train_attributes, test_attributes, ds1_histograms_train, ds1_histograms_test, ds2_histograms_train, ds2_histograms_test, ds_all_histogram_train, ds_all_histogram_test, ds_bops_histogram_train, ds_bops_histogram_test = train_test_split(
            join_data,
            ds1_histograms,
            ds2_histograms,
            ds_all_histogram,
            ds_bops_histogram,
            test_size=0.20,
            random_state=42)
        X_train = pd.DataFrame.to_numpy(
            train_attributes[[i for i in range(num_features)]])
        X_test = pd.DataFrame.to_numpy(
            test_attributes[[i for i in range(num_features)]])
        y_train = train_attributes[target]
        y_test = test_attributes[target]
    else:
        X_test = pd.DataFrame.to_numpy(
            join_data[[i for i in range(num_features)]])
        y_test = join_data[target]
        ds_bops_histogram_test = ds_bops_histogram

    mlp = create_mlp(X_test.shape[1], regress=False)
    cnn1 = create_cnn(num_rows, num_columns, 1, regress=False)
    # cnn2 = models.create_cnn(num_rows, num_columns, 1, regress=False)
    # cnn3 = models.create_cnn(num_rows, num_columns, 1, regress=False)

    # combined_input = concatenate([mlp.output, cnn1.output, cnn2.output, cnn3.output])
    combined_input = concatenate([mlp.output, cnn1.output])

    x = Dense(4, activation="relu")(combined_input)
    x = Dense(1, activation="linear")(x)

    # model = Model(inputs=[mlp.input, cnn1.input, cnn2.input, cnn3.input], outputs=x)
    model = Model(inputs=[mlp.input, cnn1.input], outputs=x)

    EPOCHS = 40
    LR = 1e-2
    opt = Adam(lr=LR, decay=LR / EPOCHS)
    model.compile(loss="mean_absolute_percentage_error", optimizer=opt)

    if is_train:
        print('Training the model')
        model.fit([X_train, ds_bops_histogram_train],
                  y_train,
                  validation_data=([X_test, ds_bops_histogram_test], y_test),
                  epochs=EPOCHS,
                  batch_size=256)

        model.save(model_path)
        model.save_weights(model_weights_path)
    else:
        print('Loading the saved model and model weights')
        model = load_model(model_path)
        model.load_weights(model_weights_path)

    print('Testing')
    y_pred = model.predict([X_test, ds_bops_histogram_test])

    print('r2 score: {}'.format(r2_score(y_test, y_pred)))

    # diff = y_pred.flatten() - y_test
    # percent_diff = (diff / y_test)
    # abs_percent_diff = np.abs(percent_diff)
    #
    # # Compute the mean and standard deviation of the absolute percentage difference
    # mean = np.mean(abs_percent_diff)
    # std = np.std(abs_percent_diff)

    # NOTICE: mean is the MAPE value, which is the target we want to minimize
    # print ('mean = {}, std = {}'.format(mean, std))
    minus_one = True
    if minus_one:
        y_test, y_pred = 1 - y_test, 1 - y_pred

    # Compute accuracy metrics
    mse = metrics.mean_squared_error(y_test, y_pred)
    mape = metrics.mean_absolute_percentage_error(y_test, y_pred)
    msle = np.mean(mean_squared_logarithmic_error(y_test, y_pred))
    mae = metrics.mean_absolute_error(y_test, y_pred)
    print('mae: {}\nmape: {}\nmse: {}\nmsle: {}'.format(mae, mape, mse, msle))
    print('{}\t{}\t{}\t{}'.format(mae, mape, mse, msle))
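
For reference, a hypothetical invocation of run; every path below is a placeholder, not a value from the original source:

run(tabular_path='data/tabular_features.csv',
    histogram_path='data/histograms',
    join_result_path='data/join_results.csv',
    model_path='models/join_cardinality.h5',
    model_weights_path='models/join_cardinality_weights.h5',
    is_train=True)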
Example no. 5
def mean_squared_logarithmic_error(y_true, y_pred):
    # Thin wrapper around the Keras loss of the same name
    return losses.mean_squared_logarithmic_error(y_true, y_pred)

def loss2(y_true, y_pred):
    # Root MSLE, built on the wrapper above
    return tf.sqrt(mean_squared_logarithmic_error(y_true, y_pred))
Example no. 7
def root_mean_squared_logarithmic_error(y_true, y_pred):
    ret = losses.mean_squared_logarithmic_error(y_true, y_pred)
    return K.sqrt(ret)
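
A function like this can also be tracked as a metric instead of serving as the loss. A minimal sketch, assuming a tf.keras environment (the one-layer model is a placeholder, not from the original snippet):

from tensorflow.keras import Sequential, losses, backend as K
from tensorflow.keras.layers import Dense

def root_mean_squared_logarithmic_error(y_true, y_pred):
    # same definition as above, repeated so the sketch stands alone
    return K.sqrt(losses.mean_squared_logarithmic_error(y_true, y_pred))

model = Sequential([Dense(1, input_dim=8)])
# Optimize plain MSE while reporting RMSLE each epoch
model.compile(optimizer='adam', loss='mse',
              metrics=[root_mean_squared_logarithmic_error])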
Example no. 8

    def train(self,
              tabular_path: str,
              join_result_path: str,
              model_path: str,
              model_weights_path=None,
              histogram_path=None) -> None:
        """
        Train a regression model for spatial join cost estimator, then save the trained model to file
        """

        # Load the histogram features and split them into train and test sets
        # target = 'join_selectivity'
        num_rows, num_columns = 32, 32
        y, ds1_histograms, ds2_histograms, ds_bops_histogram = datasets.load_histogram_features(
            join_result_path, tabular_path, histogram_path, num_rows,
            num_columns)
        y_train, y_test, ds1_histograms_train, ds1_histograms_test, ds2_histograms_train, ds2_histograms_test, ds_bops_histogram_train, ds_bops_histogram_test \
            = train_test_split(y, ds1_histograms, ds2_histograms, ds_bops_histogram, test_size=0.2, random_state=42)

        # model = HistogramDNNModel.create_cnn(num_rows, num_columns, 1, regress=True)

        # create CNN model
        model = Sequential()
        model.add(
            Conv2D(16,
                   kernel_size=3,
                   activation='relu',
                   input_shape=(num_rows, num_columns, 1)))
        model.add(Conv2D(8, kernel_size=3, activation='relu'))
        model.add(Flatten())
        model.add(Dense(4, activation='relu'))
        model.add(Dense(1, activation='linear'))

        EPOCHS = 40
        LR = 1e-2
        opt = Adam(lr=LR, decay=LR / EPOCHS)

        early_stopping = EarlyStopping(
            monitor="val_loss",
            min_delta=0,
            patience=10,
            verbose=1,
            mode="auto",
            baseline=None,
            restore_best_weights=True,
        )

        model.compile(metrics=['mean_absolute_percentage_error'],
                      loss="mean_absolute_percentage_error",
                      optimizer=opt)
        model.fit(ds_bops_histogram_train,
                  y_train,
                  validation_data=(ds_bops_histogram_test, y_test),
                  epochs=EPOCHS,
                  batch_size=256,
                  callbacks=[early_stopping])

        # Save the trained model and weights, as the docstring promises
        model.save(model_path)
        if model_weights_path:
            model.save_weights(model_weights_path)

        y_pred = model.predict(ds_bops_histogram_test)

        # Convert back to 1 - y if needed
        if HistogramDNNModel.MINUS_ONE:
            y_test, y_pred = 1 - y_test, 1 - y_pred

        # Compute accuracy metrics
        mse = metrics.mean_squared_error(y_test, y_pred)
        mape = metrics.mean_absolute_percentage_error(y_test, y_pred)
        msle = np.mean(mean_squared_logarithmic_error(y_test, y_pred))
        mae = metrics.mean_absolute_error(y_test, y_pred)
        print('mae: {}\nmape: {}\nmse: {}\nmsle: {}'.format(
            mae, mape, mse, msle))
        print('{}\t{}\t{}\t{}'.format(mae, mape, mse, msle))
Example no. 9
    def train(self,
              tabular_path: str,
              join_result_path: str,
              model_path: str,
              model_weights_path=None,
              histogram_path=None) -> None:
        """
        Train a regression model for spatial join cost estimator, then save the trained model to file
        """

        # Load the train and test splits of the tabular features
        X_train, y_train, X_test, y_test = datasets.load_tabular_features_hadoop(
            DNNModel.DISTRIBUTION, DNNModel.MATCHED, DNNModel.SCALE,
            DNNModel.MINUS_ONE)
        # X_train, y_train, X_test, y_test = datasets.load_tabular_features(join_result_path, tabular_path, DNNModel.NORMALIZE)

        # Define a sequential deep neural network model
        model = Sequential()
        model.add(Dense(16, input_dim=X_train.shape[1], activation='relu'))
        # model.add(Dense(4, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))

        # Compile and fit the model
        LR = 1e-2
        opt = Adam(lr=LR)
        model.compile(optimizer=opt,
                      loss=mean_absolute_percentage_error,
                      metrics=[mean_absolute_error, mean_squared_error])
        early_stopping = EarlyStopping(
            monitor="val_loss",
            min_delta=0,
            patience=10,
            verbose=1,
            mode="auto",
            baseline=None,
            restore_best_weights=True,
        )
        history = model.fit(
            x=X_train,
            y=y_train,
            batch_size=BATCH_SIZE,
            epochs=EPOCHS,
            verbose=1,
            callbacks=[early_stopping],
            validation_split=VAL_SIZE,
        )

        # plt.plot(history.history['loss'])
        # plt.plot(history.history['val_loss'])
        # plt.title('model loss')
        # plt.ylabel('loss')
        # plt.xlabel('epoch')
        # plt.legend(['train', 'test'], loc='upper left')
        # plt.show()

        test_loss = model.evaluate(X_test, y_test)
        print(test_loss)
        # print('Accuracy: %.2f' % (test_loss * 100))

        # Save the trained model and weights, as the docstring promises
        model.save(model_path)
        if model_weights_path:
            model.save_weights(model_weights_path)

        y_pred = model.predict(X_test)
        # TODO: delete this dumping action. This is just for debugging
        test_df = pd.DataFrame()
        test_df['y_test'] = y_test
        test_df['y_pred'] = y_pred.flatten()
        test_df.to_csv('data/temp/test_df.csv')

        # Convert back to 1 - y if needed
        if DNNModel.MINUS_ONE:
            y_test, y_pred = 1 - y_test, 1 - y_pred

        # Compute accuracy metrics
        mse = metrics.mean_squared_error(y_test, y_pred)
        mape = metrics.mean_absolute_percentage_error(y_test, y_pred)
        msle = np.mean(mean_squared_logarithmic_error(y_test, y_pred))
        mae = metrics.mean_absolute_error(y_test, y_pred)
        print('mae: {}\nmape: {}\nmse: {}\nmsle: {}'.format(
            mae, mape, mse, msle))
        print('{}\t{}\t{}\t{}'.format(mae, mape, mse, msle))
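
The 1 - y inversion right before the metrics matters: MAPE (and MSLE) are scale-dependent, so the same absolute error looks very different on the transformed and original target scales. A quick numeric illustration with made-up values:

import numpy as np

def mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

y_true = np.array([0.9])   # transformed target (1 - y)
y_pred = np.array([0.88])

print(mape(y_true, y_pred))          # ~2.2% on the transformed scale
print(mape(1 - y_true, 1 - y_pred))  # 20.0% on the original scale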
Example no. 10
        # Stack the flattened per-file Wiener features into one batch
        X = np.zeros((len(wiener_all), 1, 257 * 5))
        for i in range(np.size(X, 0)):
            X[i][0] = wiener_all[i].reshape((1, 257 * 5))

        # Matching reference frames (257 bins each) from real_wiener_all
        Y = np.zeros((len(wiener_all), 1, 257))
        for i in range(np.size(Y, 0)):
            Y[i][0] = real_wiener_all[i].reshape((1, 257))

        # Per-file MSLE between reference and predicted frames
        # (.eval() implies a TF 1.x session context)
        Y1 = loaded_model.predict(X, verbose=1)
        losses_stat[filename] = np.mean(
            losses.mean_squared_logarithmic_error(Y, Y1).eval())
        noise_type_losses[noise_type].append(losses_stat[filename])

# Rank the per-file losses from best to worst (and worst to best)
sorted_by_value = sorted(losses_stat.items(), key=lambda kv: kv[1])
sorted_by_value_reverse = sorted(losses_stat.items(),
                                 key=lambda kv: kv[1],
                                 reverse=True)
print("10 best")
for i in range(10):
    print(sorted_by_value[i])

print("10 worst")
for i in range(10):
    print(sorted_by_value_reverse[i])
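
A side note on the .eval() call above: it implies a TF 1.x session/graph context. Under TF 2.x eager execution the same per-file MSLE converts with .numpy() instead; a minimal sketch with dummy data shaped like the snippet's 257-bin frames:

import numpy as np
from tensorflow.keras import losses

Y = np.random.rand(4, 1, 257)   # reference frames (dummy data)
Y1 = np.random.rand(4, 1, 257)  # predicted frames (dummy data)

per_frame = losses.mean_squared_logarithmic_error(Y, Y1)  # shape (4, 1)
print(float(np.mean(per_frame.numpy())))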

plt.figure(1)