def test(self, tabular_path: str, join_result_path: str, model_path: str,
         model_weights_path=None, histogram_path=None) -> (float, float, float, float):
    """
    Evaluate the accuracy metrics of a trained model for spatial join cost estimator

    The test features and targets are loaded from ``tabular_path``, the model is
    un-pickled from ``model_path`` and used to predict the targets, and the
    predictions are compared against the ground truth.

    :param tabular_path: path handed to ``datasets.load_data`` to load features and targets
    :param join_result_path: not used by this implementation
    :param model_path: path of the pickled model to load
    :param model_weights_path: not used by this implementation
    :param histogram_path: not used by this implementation
    :return: mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, mean_squared_logarithmic_error
    """
    # The third value returned by load_data is not needed here
    X_test, y_test, _ = datasets.load_data(
        tabular_path, RegressionModel.TARGET, RegressionModel.DROP_COLUMNS, RegressionModel.SELECTED_COLUMNS)

    # Load the model and use it for prediction.
    # NOTE(review): unpickling executes arbitrary code embedded in the file;
    # model_path must only ever point to a trusted model file.
    with open(model_path, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    y_pred = loaded_model.predict(X_test)

    # Convert back to 1 - y if need
    if RegressionModel.MINUS_ONE:
        y_test, y_pred = 1 - y_test, 1 - y_pred

    # TODO: delete this dumping action. This is just for debugging
    test_df = pd.DataFrame()
    test_df['y_test'] = y_test
    test_df['y_pred'] = y_pred
    test_df.to_csv('data/temp/test_df.csv')

    # Compute accuracy metrics
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # np.mean reduces a per-sample result to a scalar (no-op if already scalar)
    msle = np.mean(mean_squared_logarithmic_error(y_test, y_pred))

    return mae, mape, mse, msle
def root_mean_squared_log_error(y_true, y_prod):
    """Return the square root of the mean squared logarithmic error.

    ``y_true`` holds the targets and ``y_prod`` the predictions; both are
    forwarded unchanged to ``mean_squared_logarithmic_error`` and the result
    is passed through ``K.sqrt``.
    """
    msle = mean_squared_logarithmic_error(y_true, y_prod)
    return K.sqrt(msle)
# Apply the same limits to both axes and draw a line through (lims, lims)
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)
# Histogram (25 bins) of the signed error: prediction minus target
error = pred_test - target_test
plt.hist(error, bins = 25)
plt.xlabel("Prediction Error [OIL_RATE]")
_ = plt.ylabel("Count")
# Predict for a single new, hand-written sample
# column order: [['CHOKE','WHP','FLP','WHT']]
new_test_data =[[39,1719,339,189]]
new_pred_test = model.predict(pd.DataFrame(new_test_data, columns=['CHOKE','WHP','FLP','WHT']))
# NOTE(review): new_pred_test is not read anywhere in the visible code; the
# prints below report pred_test, not the new prediction - confirm intent.
print("Shape: {}".format(pred_test.shape))
print(pred_test)
mean_square_error = mean_squared_error(target_test,pred_test)
# The mean squared error
print('Mean squared error: %.2f' % mean_square_error)
mean_square_logarithmic_error = mean_squared_logarithmic_error(target_test, pred_test)
# The mean squared logarithmic error
print('Mean squared logarithmic error: %.2f' % mean_square_logarithmic_error)
# NOTE(review): `metrics.accuracy` is defined outside this chunk; what it
# measures for a regression target cannot be told from here - TODO confirm.
acc = metrics.accuracy(target_test, pred_test)
print('Accuracy: {}'.format(acc))
#print_summary(model, line_length=None, positions=None, print_fn=None)
def run(tabular_path, histogram_path, join_result_path, model_path, model_weights_path, is_train=True):
    """
    Train or evaluate the combined MLP + CNN join cardinality estimator.

    With ``is_train=True`` the join data is split 80/20, a two-branch model
    (tabular MLP + histogram CNN) is built, fitted on the training part and
    saved to ``model_path`` / ``model_weights_path``. With ``is_train=False``
    a previously saved model is loaded from those paths and the whole data
    set is used for evaluation; no throw-away model is constructed.
    In both modes the r2 score and MAE / MAPE / MSE / MSLE are printed.

    :param tabular_path: path passed to ``datasets.load_datasets_feature``
    :param histogram_path: histogram location passed to ``datasets.load_join_data``
    :param join_result_path: join results location passed to ``datasets.load_join_data``
    :param model_path: where the model is saved to (train) or loaded from (test)
    :param model_weights_path: where the weights are saved to (train) or loaded from (test)
    :param is_train: train and save a new model when True, otherwise load and evaluate
    """
    print('Training the join cardinality estimator')
    print('Tabular data: {}'.format(tabular_path))
    print('Histogram path: {}'.format(histogram_path))
    print('Join result data: {}'.format(join_result_path))

    target = 'join_selectivity'
    num_rows, num_columns = 16, 16
    tabular_features_df = datasets.load_datasets_feature(tabular_path)
    join_data, ds1_histograms, ds2_histograms, ds_all_histogram, ds_bops_histogram = datasets.load_join_data(
        tabular_features_df, join_result_path, histogram_path, num_rows, num_columns)

    # NOTE(review): the last 11 columns are treated as non-feature columns;
    # the feature columns are addressed by the integer labels 0..num_features-1.
    num_features = len(join_data.columns) - 11
    feature_columns = list(range(num_features))

    if is_train:
        # Every array is split with the same indices; only the tabular
        # attributes and the "bops" histograms are used afterwards.
        (train_attributes, test_attributes,
         _, _, _, _, _, _,
         ds_bops_histogram_train, ds_bops_histogram_test) = train_test_split(
            join_data, ds1_histograms, ds2_histograms, ds_all_histogram, ds_bops_histogram,
            test_size=0.20, random_state=42)
        X_train = train_attributes[feature_columns].to_numpy()
        X_test = test_attributes[feature_columns].to_numpy()
        y_train = train_attributes[target]
        y_test = test_attributes[target]

        # Two-branch network: MLP over tabular features, CNN over the histogram,
        # joined by a small dense regression head.
        mlp = create_mlp(X_test.shape[1], regress=False)
        cnn1 = create_cnn(num_rows, num_columns, 1, regress=False)
        combined_input = concatenate([mlp.output, cnn1.output])
        x = Dense(4, activation="relu")(combined_input)
        x = Dense(1, activation="linear")(x)
        model = Model(inputs=[mlp.input, cnn1.input], outputs=x)

        EPOCHS = 40
        LR = 1e-2
        opt = Adam(lr=LR, decay=LR / EPOCHS)
        model.compile(loss="mean_absolute_percentage_error", optimizer=opt)

        print('Training the model')
        model.fit([X_train, ds_bops_histogram_train], y_train,
                  validation_data=([X_test, ds_bops_histogram_test], y_test),
                  epochs=EPOCHS, batch_size=256)

        model.save(model_path)
        model.save_weights(model_weights_path)
    else:
        # Evaluation only: use the full data set and the persisted model.
        X_test = join_data[feature_columns].to_numpy()
        y_test = join_data[target]
        ds_bops_histogram_test = ds_bops_histogram

        print('Loading the saved model and model weights')
        model = load_model(model_path)
        model.load_weights(model_weights_path)

    print('Testing')
    y_pred = model.predict([X_test, ds_bops_histogram_test])

    print('r2 score: {}'.format(r2_score(y_test, y_pred)))

    # The remaining metrics are reported on 1 - y (r2 above is on the raw values)
    minus_one = True
    if minus_one:
        y_test, y_pred = 1 - y_test, 1 - y_pred

    # Compute accuracy metrics
    mse = metrics.mean_squared_error(y_test, y_pred)
    mape = metrics.mean_absolute_percentage_error(y_test, y_pred)
    msle = np.mean(mean_squared_logarithmic_error(y_test, y_pred))
    mae = metrics.mean_absolute_error(y_test, y_pred)
    print('mae: {}\nmape: {}\nmse: {}\nmlse: {}'.format(mae, mape, mse, msle))
    print('{}\t{}\t{}\t{}'.format(mae, mape, mse, msle))
def mean_squared_logarithmic_error(y_true, y_pred):
    """Delegate to ``losses.mean_squared_logarithmic_error`` and return its result unchanged."""
    msle = losses.mean_squared_logarithmic_error(y_true, y_pred)
    return msle
def loss2(y_true, y_pred):
    """Root mean squared logarithmic error.

    Computes ``mean_squared_logarithmic_error(y_true, y_pred)`` and returns
    its square root via ``tf.sqrt``.
    """
    msle = mean_squared_logarithmic_error(y_true, y_pred)
    return tf.sqrt(msle)
def root_mean_squared_logarithmic_error(y_true, y_pred):
    """Return ``K.sqrt`` of ``losses.mean_squared_logarithmic_error(y_true, y_pred)``."""
    return K.sqrt(losses.mean_squared_logarithmic_error(y_true, y_pred))
def train(self, tabular_path: str, join_result_path: str, model_path: str,
          model_weights_path=None, histogram_path=None) -> None:
    """
    Train a regression model for spatial join cost estimator, then save the trained model to file

    A small CNN is fitted on the 32x32 "bops" histograms with an 80/20
    train/test split. After training the model is written to ``model_path``
    (and its weights to ``model_weights_path`` when given), and MAE, MAPE,
    MSE and MSLE on the held-out part are printed.

    :param tabular_path: passed to ``datasets.load_histogram_features``
    :param join_result_path: passed to ``datasets.load_histogram_features``
    :param model_path: destination of the saved model
    :param model_weights_path: optional destination of the saved weights
    :param histogram_path: passed to ``datasets.load_histogram_features``
    """
    # Load targets and histograms, then hold out 20% for validation/testing
    num_rows, num_columns = 32, 32
    y, ds1_histograms, ds2_histograms, ds_bops_histogram = datasets.load_histogram_features(
        join_result_path, tabular_path, histogram_path, num_rows, num_columns)
    # All arrays are split with the same indices; only the targets and the
    # "bops" histograms are used below.
    (y_train, y_test,
     _, _, _, _,
     ds_bops_histogram_train, ds_bops_histogram_test) = train_test_split(
        y, ds1_histograms, ds2_histograms, ds_bops_histogram, test_size=0.2, random_state=42)

    # create CNN model
    model = Sequential()
    model.add(Conv2D(16, kernel_size=3, activation='relu', input_shape=(num_rows, num_columns, 1)))
    model.add(Conv2D(8, kernel_size=3, activation='relu'))
    model.add(Flatten())
    model.add(Dense(4, activation='relu'))
    model.add(Dense(1, activation='linear'))

    EPOCHS = 40
    LR = 1e-2
    opt = Adam(lr=LR, decay=LR / EPOCHS)

    # Stop when val_loss has not improved for 10 epochs and keep the best weights
    early_stopping = EarlyStopping(
        monitor="val_loss",
        min_delta=0,
        patience=10,
        verbose=1,
        mode="auto",
        baseline=None,
        restore_best_weights=True,
    )

    model.compile(metrics=['mean_absolute_percentage_error'],
                  loss="mean_absolute_percentage_error", optimizer=opt)
    model.fit(ds_bops_histogram_train, y_train,
              validation_data=(ds_bops_histogram_test, y_test),
              epochs=EPOCHS, batch_size=256, callbacks=[early_stopping])

    # Persist the trained model as promised by the contract of this method
    model.save(model_path)
    if model_weights_path is not None:
        model.save_weights(model_weights_path)

    y_pred = model.predict(ds_bops_histogram_test)

    # Convert back to 1 - y if need
    if HistogramDNNModel.MINUS_ONE:
        y_test, y_pred = 1 - y_test, 1 - y_pred

    # Compute accuracy metrics
    mse = metrics.mean_squared_error(y_test, y_pred)
    mape = metrics.mean_absolute_percentage_error(y_test, y_pred)
    msle = np.mean(mean_squared_logarithmic_error(y_test, y_pred))
    mae = metrics.mean_absolute_error(y_test, y_pred)
    print('mae: {}\nmape: {}\nmse: {}\nmlse: {}'.format(
        mae, mape, mse, msle))
    print('{}\t{}\t{}\t{}'.format(mae, mape, mse, msle))
def train(self, tabular_path: str, join_result_path: str, model_path: str,
          model_weights_path=None, histogram_path=None) -> None:
    """
    Train a regression model for spatial join cost estimator, then save the trained model to file

    A single-hidden-layer dense network is fitted on the tabular features
    returned by ``datasets.load_tabular_features_hadoop``. After training the
    model is written to ``model_path`` (and its weights to
    ``model_weights_path`` when given), then the test loss and MAE, MAPE, MSE
    and MSLE on the test split are printed.

    :param tabular_path: not used by this implementation
    :param join_result_path: not used by this implementation
    :param model_path: destination of the saved model
    :param model_weights_path: optional destination of the saved weights
    :param histogram_path: not used by this implementation
    """
    # Load the train and test splits
    X_train, y_train, X_test, y_test = datasets.load_tabular_features_hadoop(
        DNNModel.DISTRIBUTION, DNNModel.MATCHED, DNNModel.SCALE, DNNModel.MINUS_ONE)

    # Define a sequential deep neural network model
    model = Sequential()
    model.add(Dense(16, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Compile and fit the model
    LR = 1e-2
    opt = Adam(lr=LR)
    model.compile(optimizer=opt, loss=mean_absolute_percentage_error,
                  metrics=[mean_absolute_error, mean_squared_error])

    # Stop when val_loss has not improved for 10 epochs and keep the best weights
    early_stopping = EarlyStopping(
        monitor="val_loss",
        min_delta=0,
        patience=10,
        verbose=1,
        mode="auto",
        baseline=None,
        restore_best_weights=True,
    )
    model.fit(
        x=X_train,
        y=y_train,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        verbose=1,
        callbacks=[early_stopping],
        validation_split=VAL_SIZE,
    )

    # Persist the trained model as promised by the contract of this method
    model.save(model_path)
    if model_weights_path is not None:
        model.save_weights(model_weights_path)

    test_loss = model.evaluate(X_test, y_test)
    print(test_loss)

    y_pred = model.predict(X_test)

    # TODO: delete this dumping action. This is just for debugging
    test_df = pd.DataFrame()
    test_df['y_test'] = y_test
    test_df['y_pred'] = y_pred
    test_df.to_csv('data/temp/test_df.csv')

    # Convert back to 1 - y if need
    if DNNModel.MINUS_ONE:
        y_test, y_pred = 1 - y_test, 1 - y_pred

    # Compute accuracy metrics
    mse = metrics.mean_squared_error(y_test, y_pred)
    mape = metrics.mean_absolute_percentage_error(y_test, y_pred)
    msle = np.mean(mean_squared_logarithmic_error(y_test, y_pred))
    mae = metrics.mean_absolute_error(y_test, y_pred)
    print('mae: {}\nmape: {}\nmse: {}\nmlse: {}'.format(
        mae, mape, mse, msle))
    print('{}\t{}\t{}\t{}'.format(mae, mape, mse, msle))
# NOTE(review): `filename`, `noise_type`, `wiener_all`, `real_wiener_all` and
# `loaded_model` are defined outside this chunk. Presumably the statements down
# to the `noise_type_losses` append run once per file inside an enclosing loop
# and the ranking below runs after it - confirm the indentation when merging.

# Pack every input frame into row i of a (n, 1, 257 * 5) array
X = np.zeros((len(wiener_all), 1, 257 * 5))
for i in range(np.size(X, 0)):
    x = wiener_all[i].reshape((1, 257 * 5))
    X[i][0] = x

# Pack every reference frame into row i of a (n, 1, 257) array
Y = np.zeros((len(wiener_all), 1, 257))
for i in range(np.size(Y, 0)):
    y = real_wiener_all[i].reshape((1, 257))
    Y[i][0] = y

Y1 = loaded_model.predict(X, verbose=1)

# Mean MSLE between reference and prediction, stored per file and per noise type
losses_stat[filename] = np.mean(
    losses.mean_squared_logarithmic_error(Y, Y1).eval())
noise_type_losses[noise_type].append(losses_stat[filename])

# Rank the files by loss, ascending and descending
sorted_by_value = sorted(losses_stat.items(), key=lambda kv: kv[1])
sorted_by_value_reverse = sorted(losses_stat.items(), key=lambda kv: kv[1], reverse=True)

# Slicing instead of indexing 0..9 so fewer than 10 entries do not raise IndexError
print("10 best")
for entry in sorted_by_value[:10]:
    print(entry)
print("10 worst")
for entry in sorted_by_value_reverse[:10]:
    print(entry)

plt.figure(1)