def _record_results(self, x, y, y_pred, label):
    """Record the prediction results

    :param x: the input data
    :param y: the actual output
    :param y_pred: the predicted output
    :param label: the result label ("resource" or "impact")
    """
    # Result files
    metrics_path = "{}/global_{}_model_metrics.csv".format(self.model_results_path, label)
    prediction_path = "{}/global_{}_model_prediction.csv".format(self.model_results_path, label)
    result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path)

    # Log the prediction results
    ratio_error = np.average(np.abs(y - y_pred) / (y + 1e-6), axis=0)
    io_util.write_csv_result(metrics_path, "Ratio Error", ratio_error)
    result_writing_util.record_predictions((x, y_pred, y), prediction_path)

    # Print Error summary to command line
    if label == "resource":
        original_ratio_error = np.average(np.abs(y - x[:, :y.shape[1]]) / (y + 1e-6), axis=0)
    else:
        original_ratio_error = np.average(np.abs(1 / (y + 1e-6) - 1), axis=0)
    logging.info('Model Original Ratio Error ({}): {}'.format(label, original_ratio_error))
    logging.info('Model Ratio Error ({}): {}'.format(label, ratio_error))
    logging.info('')
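
# The ratio-error metric above recurs throughout this module, so here is a
# minimal, self-contained sketch of it. Only numpy is assumed, and the helper
# name is illustrative rather than part of the repo; the small bias keeps the
# division finite when an actual label is zero.
import numpy as np

def _ratio_error_sketch(y_actual, y_pred, bias=1e-6):
    """Per-target average ratio error: mean(|y - y_pred| / (y + bias), axis=0)."""
    return np.average(np.abs(y_actual - y_pred) / (y_actual + bias), axis=0)

# Example: two samples, two targets.
# _ratio_error_sketch(np.array([[10.0, 2.0], [20.0, 4.0]]),
#                     np.array([[12.0, 2.0], [18.0, 5.0]]))  # -> [0.15, 0.125] (approx.)
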
def _global_model_training_process(x, y, methods, test_ratio, metrics_path, prediction_path):
    """Training process for the global models

    :param x: input feature
    :param y: labels
    :param methods: ML models to enumerate
    :param test_ratio: train-test split ratio
    :param metrics_path: to store the prediction metrics
    :param prediction_path: to store the raw prediction results
    :return: (the best model, the indices of the test data for additional metric calculation)
    """
    global_model = None
    result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path, False)
    n_samples = x.shape[0]
    indices = np.arange(n_samples)
    x_train, x_test, y_train, y_test, indices_train, indices_test = model_selection.train_test_split(
        x, y, indices, test_size=test_ratio, random_state=0)

    min_percentage_error = 1
    pred_results = None
    elapsed_us_index = data_info.instance.target_csv_index[Target.ELAPSED_US]

    for method in methods:
        # Train the model
        logging.info("Training the global model with {}".format(method))
        regressor = model.Model(method)
        regressor.train(x_train, y_train)

        # Evaluate on both the training and test set
        results = []
        evaluate_data = [(x_train, y_train), (x_test, y_test)]
        train_test_label = ["Train", "Test"]
        for i, d in enumerate(evaluate_data):
            evaluate_x = d[0]
            evaluate_y = d[1]

            y_pred = regressor.predict(evaluate_x)
            logging.debug("x shape: {}".format(evaluate_x.shape))
            logging.debug("y shape: {}".format(y_pred.shape))
            percentage_error = np.average(np.abs(evaluate_y - y_pred) / (evaluate_y + 1), axis=0)
            results += list(percentage_error) + [""]
            logging.info('{} Ratio Error: {}'.format(train_test_label[i], percentage_error))

            # Record the model with the lowest elapsed time prediction error (since that might be
            # the most important prediction)
            if i == 1 and percentage_error[elapsed_us_index] < min_percentage_error:
                min_percentage_error = percentage_error[elapsed_us_index]
                global_model = regressor
                pred_results = (evaluate_x, y_pred, evaluate_y)

        io_util.write_csv_result(metrics_path, method, results)
        logging.info("")

    # Record the best prediction results on the test data
    result_writing_util.record_predictions(pred_results, prediction_path)

    return global_model, indices_test
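
# _global_model_training_process splits an explicit index array alongside x and
# y so that test predictions can later be mapped back to their original rows.
# A self-contained sketch of that trick, assuming only numpy and scikit-learn:
import numpy as np
from sklearn import model_selection

x = np.random.rand(100, 4)
y = np.random.rand(100, 2)
indices = np.arange(x.shape[0])
x_train, x_test, y_train, y_test, idx_train, idx_test = model_selection.train_test_split(
    x, y, indices, test_size=0.2, random_state=0)
# idx_test identifies which original rows landed in the test set, e.g. for
# computing additional metrics on the untransformed data afterwards.
assert np.array_equal(x[idx_test], x_test)
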
def _record_results(self, x, y, y_pred, raw_y, mini_model_y_pred, label):
    """Record the prediction results

    :param x: the input data
    :param y: the actual output
    :param y_pred: the predicted output
    :param raw_y: the actual raw output (used for the accumulated ratio error)
    :param mini_model_y_pred: the raw mini-model prediction (used for the accumulated ratio error)
    :param label: the result label ("resource" or "impact")
    """
    # Result files
    metrics_path = "{}/global_{}_model_metrics.csv".format(self.model_results_path, label)
    prediction_path = "{}/global_{}_model_prediction.csv".format(self.model_results_path, label)
    result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path, True)

    # Log the prediction results
    ratio_error = np.average(np.abs(y - y_pred) / (y + 1e-6), axis=0)
    io_util.write_csv_result(metrics_path, "Model Ratio Error", ratio_error)
    result_writing_util.record_predictions((x, y_pred, y), prediction_path)

    # Print the error summary to the command line
    if label == "resource":
        original_ratio_error = np.average(np.abs(y - x[:, :y.shape[1]]) / (y + 1e-6), axis=0)
    else:
        original_ratio_error = np.average(np.abs(1 / (y + 1e-6) - 1), axis=0)
    logging.info('Model Original Ratio Error ({}): {}'.format(label, original_ratio_error))
    logging.info('Model Ratio Error ({}): {}'.format(label, ratio_error))
    logging.info('')

    if label != "resource":
        # Calculate the accumulated ratio error
        epsilon = global_model_config.RATIO_DIVISION_EPSILON
        mini_model_y_pred = np.array(mini_model_y_pred)
        raw_y = np.array(raw_y)
        raw_y_pred = (mini_model_y_pred + epsilon) * y_pred
        accumulated_raw_y = np.sum(raw_y, axis=0)
        accumulated_raw_y_pred = np.sum(raw_y_pred, axis=0)
        original_ratio_error = np.average(np.abs(raw_y - mini_model_y_pred) / (raw_y + epsilon), axis=0)
        ratio_error = np.average(np.abs(raw_y - raw_y_pred) / (raw_y + epsilon), axis=0)
        accumulated_percentage_error = np.abs(accumulated_raw_y - accumulated_raw_y_pred) / (
            accumulated_raw_y + epsilon)
        original_accumulated_percentage_error = np.abs(accumulated_raw_y - np.sum(mini_model_y_pred, axis=0)) / (
            accumulated_raw_y + epsilon)

        logging.info('Original Ratio Error: {}'.format(original_ratio_error))
        io_util.write_csv_result(metrics_path, "Original Ratio Error", original_ratio_error)
        logging.info('Ratio Error: {}'.format(ratio_error))
        io_util.write_csv_result(metrics_path, "Ratio Error", ratio_error)
        logging.info('Original Accumulated Ratio Error: {}'.format(original_accumulated_percentage_error))
        io_util.write_csv_result(metrics_path, "Original Accumulated Ratio Error",
                                 original_accumulated_percentage_error)
        logging.info('Accumulated Ratio Error: {}'.format(accumulated_percentage_error))
        io_util.write_csv_result(metrics_path, "Accumulated Ratio Error", accumulated_percentage_error)
        logging.info('Accumulated Actual: {}'.format(accumulated_raw_y))
        logging.info('Original Accumulated Predict: {}'.format(np.sum(mini_model_y_pred, axis=0)))
        logging.info('Accumulated Predict: {}'.format(accumulated_raw_y_pred))
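
# A minimal sketch of the accumulated-error arithmetic in the branch above.
# The global model predicts a multiplicative correction (y_pred) on top of the
# mini-model output, so the raw prediction is reconstructed by multiplying the
# two before summing across samples. EPSILON here is a stand-in for
# global_model_config.RATIO_DIVISION_EPSILON; its exact value is an assumption.
import numpy as np

EPSILON = 1e-6  # assumed value; the repo reads this from global_model_config

def _accumulated_ratio_error_sketch(raw_y, mini_model_y_pred, y_pred):
    """Ratio error of the summed (accumulated) predictions against the summed actuals."""
    raw_y_pred = (mini_model_y_pred + EPSILON) * y_pred  # undo the mini-model normalization
    accumulated_raw_y = np.sum(raw_y, axis=0)
    accumulated_raw_y_pred = np.sum(raw_y_pred, axis=0)
    return np.abs(accumulated_raw_y - accumulated_raw_y_pred) / (accumulated_raw_y + EPSILON)
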
def _train_data(self, data, summary_file):
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        data.x, data.y, test_size=self.test_ratio, random_state=0)

    # Write the first header row to the result file
    metrics_path = "{}/{}.csv".format(self.model_metrics_path, data.opunit.name.lower())
    prediction_path = "{}/{}_prediction.csv".format(self.model_metrics_path, data.opunit.name.lower())
    result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path, False)

    methods = self.ml_models

    # Test the prediction with/without the target transformer
    y_transformers = [None, data_transforming_util.OPUNIT_Y_TRANSFORMER_MAP[data.opunit]]
    # modeling_transformer = data_transforming_util.OPUNIT_MODELING_TRANSFORMER_MAP[data.opunit]
    # if modeling_transformer is not None:
    #     transformers.append(modeling_transformer)
    x_transformer = data_transforming_util.OPUNIT_X_TRANSFORMER_MAP[data.opunit]

    error_bias = 1
    min_percentage_error = 2
    pred_results = None
    elapsed_us_index = data_info.TARGET_CSV_INDEX[Target.ELAPSED_US]
    memory_b_index = data_info.TARGET_CSV_INDEX[Target.MEMORY_B]
    best_y_transformer = -1
    best_method = -1
    for i, y_transformer in enumerate(y_transformers):
        for m, method in enumerate(methods):
            # Train the model
            label = method if i == 0 else method + " transform"
            logging.info("{} {}".format(data.opunit.name, label))
            regressor = model.Model(method, y_transformer=y_transformer, x_transformer=x_transformer)
            regressor.train(x_train, y_train)

            # Evaluate on both the training and test set
            results = []
            evaluate_data = [(x_train, y_train), (x_test, y_test)]
            train_test_label = ["Train", "Test"]
            for j, d in enumerate(evaluate_data):
                evaluate_x = d[0]
                evaluate_y = d[1]

                y_pred = regressor.predict(evaluate_x)
                logging.debug("x shape: {}".format(evaluate_x.shape))
                logging.debug("y shape: {}".format(y_pred.shape))

                # To keep the percentage error from exploding when the actual label is very small,
                # we omit data points with actual label <= 5 when calculating the percentage error
                # (by giving those data points a near-zero weight)
                evaluate_threshold = 5
                weights = np.where(evaluate_y > evaluate_threshold, np.ones(evaluate_y.shape),
                                   np.full(evaluate_y.shape, 1e-6))
                percentage_error = np.average(np.abs(evaluate_y - y_pred) / (evaluate_y + error_bias),
                                              axis=0, weights=weights)
                results += list(percentage_error) + [""]
                logging.info('{} Percentage Error: {}'.format(train_test_label[j], percentage_error))

                # The default way to determine whether a model is better is to compare the model
                # error on elapsed_us. For any opunit in MEM_EVALUATE_OPUNITS, we compare the model
                # error on memory_b instead.
                eval_error = percentage_error[elapsed_us_index]
                if data.opunit in data_info.MEM_EVALUATE_OPUNITS:
                    eval_error = percentage_error[memory_b_index]

                # Record the model with the lowest error on the evaluation target (since that might
                # be the most important prediction)
                # Only use linear regression for the arithmetic operating units
                if (j == 1 and eval_error < min_percentage_error
                        and y_transformer == y_transformers[-1]
                        and (data.opunit not in data_info.ARITHMETIC_OPUNITS or method == 'lr')):
                    min_percentage_error = eval_error
                    if self.expose_all:
                        best_y_transformer = i
                        best_method = m
                    else:
                        self.model_map[data.opunit] = regressor
                    pred_results = (evaluate_x, y_pred, evaluate_y)

                if j == 1:
                    io_util.write_csv_result(summary_file, data.opunit.name,
                                             [label] + list(percentage_error))

            # Dump the prediction results
            io_util.write_csv_result(metrics_path, label, results)

            logging.info("")

        io_util.write_csv_result(metrics_path, "", [])

    # Record the best prediction results on the test data
    result_writing_util.record_predictions(pred_results, prediction_path)

    return best_y_transformer, best_method
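
# A standalone sketch of the thresholded weighting used in _train_data above:
# rows whose actual label is at or below the threshold get a near-zero weight,
# so tiny labels cannot blow up the percentage error. Only numpy is assumed;
# the helper name is illustrative.
import numpy as np

def _weighted_percentage_error_sketch(y_actual, y_pred, threshold=5, bias=1):
    weights = np.where(y_actual > threshold, np.ones(y_actual.shape),
                       np.full(y_actual.shape, 1e-6))
    return np.average(np.abs(y_actual - y_pred) / (y_actual + bias), axis=0, weights=weights)

# Example: the second row (actual label 1 <= 5) is effectively ignored.
# _weighted_percentage_error_sketch(np.array([[100.0], [1.0]]),
#                                   np.array([[90.0], [50.0]]))  # -> [0.099] (approx.)
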
def _record_results(self, x, y, y_pred, raw_y, mini_model_y_pred, label, data_list):
    """Record the prediction results

    :param x: the input data
    :param y: the actual output
    :param y_pred: the predicted output
    :param raw_y: the actual raw output (used for the accumulated ratio error)
    :param mini_model_y_pred: the raw mini-model prediction (used for the accumulated ratio error)
    :param label: the result label ("resource" or "impact")
    :param data_list: the list of GroupedOpunitData used for the per-pipeline and per-interval summaries
    """
    # Result files
    metrics_path = "{}/global_{}_model_metrics.csv".format(self.model_results_path, label)
    prediction_path = "{}/global_{}_model_prediction.csv".format(self.model_results_path, label)
    result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path, True)

    # Log the prediction results
    ratio_error = np.average(np.abs(y - y_pred) / (y + 1e-6), axis=0)
    io_util.write_csv_result(metrics_path, "Model Ratio Error", ratio_error)
    result_writing_util.record_predictions((x, y_pred, y), prediction_path)

    # Print the error summary to the command line
    if label == "resource":
        avg_original_ratio_error = np.average(np.abs(y - x[:, :y.shape[1]]) / (y + 1e-6), axis=0)
    else:
        avg_original_ratio_error = np.average(np.abs(1 / (y + 1e-6) - 1), axis=0)
    logging.info('Model Original Ratio Error ({}): {}'.format(label, avg_original_ratio_error))
    logging.info('Model Ratio Error ({}): {}'.format(label, ratio_error))
    logging.info('')

    if label != "resource":
        # Calculate the accumulated ratio error
        epsilon = global_model_config.RATIO_DIVISION_EPSILON
        mini_model_y_pred = np.array(mini_model_y_pred)
        raw_y = np.array(raw_y)
        raw_y_pred = (mini_model_y_pred + epsilon) * y_pred
        accumulated_raw_y = np.sum(raw_y, axis=0)
        accumulated_raw_y_pred = np.sum(raw_y_pred, axis=0)
        original_ratio_error = np.abs(raw_y - mini_model_y_pred) / (raw_y + epsilon)
        avg_original_ratio_error = np.average(original_ratio_error, axis=0)
        ratio_error = np.abs(raw_y - raw_y_pred) / (raw_y + epsilon)
        avg_ratio_error = np.average(ratio_error, axis=0)
        accumulated_percentage_error = np.abs(accumulated_raw_y - accumulated_raw_y_pred) / (
            accumulated_raw_y + epsilon)
        original_accumulated_percentage_error = np.abs(accumulated_raw_y - np.sum(mini_model_y_pred, axis=0)) / (
            accumulated_raw_y + epsilon)

        logging.info('Original Ratio Error: {}'.format(avg_original_ratio_error))
        io_util.write_csv_result(metrics_path, "Original Ratio Error", avg_original_ratio_error)
        logging.info('Ratio Error: {}'.format(avg_ratio_error))
        io_util.write_csv_result(metrics_path, "Ratio Error", avg_ratio_error)
        logging.info('Original Accumulated Ratio Error: {}'.format(original_accumulated_percentage_error))
        io_util.write_csv_result(metrics_path, "Original Accumulated Ratio Error",
                                 original_accumulated_percentage_error)
        logging.info('Accumulated Ratio Error: {}'.format(accumulated_percentage_error))
        io_util.write_csv_result(metrics_path, "Accumulated Ratio Error", accumulated_percentage_error)
        logging.info('Accumulated Actual: {}'.format(accumulated_raw_y))
        logging.info('Original Accumulated Predict: {}'.format(np.sum(mini_model_y_pred, axis=0)))
        logging.info('Accumulated Predict: {}'.format(accumulated_raw_y_pred))

        if label == 'direct':
            prediction_path = "{}/grouped_opunit_prediction.csv".format(self.model_results_path)
            io_util.create_csv_file(prediction_path,
                                    ["Pipeline", "", "Actual", "", "Predicted", "", "Ratio Error"])
            for i, data in enumerate(data_list):
                io_util.write_csv_result(prediction_path, data.name,
                                         [""] + list(raw_y[i]) + [""] + list(raw_y_pred[i]) + [""] +
                                         list(ratio_error[i]))

            average_result_path = "{}/interval_average_prediction.csv".format(self.model_results_path)
            io_util.create_csv_file(average_result_path,
                                    ["Timestamp", "Actual Average", "Predicted Average"])

            interval_y_map = {}
            interval_y_pred_map = {}
            mark_list = None
            # mark_list = _generate_mark_list(data_list)
            for i, data in enumerate(data_list):
                # Don't count the create index OU
                # TODO(lin): needs a better way to evaluate... maybe add an id_query field to
                #  GroupedOpunitData
                if data.concurrency > 0:
                    continue
                if mark_list is not None and not mark_list[i]:
                    continue
                interval_time = _round_to_interval(data.start_time,
                                                   global_model_config.AVERAGING_INTERVAL)
                if interval_time not in interval_y_map:
                    interval_y_map[interval_time] = []
                    interval_y_pred_map[interval_time] = []
                interval_y_map[interval_time].append(raw_y[i][-5])
                interval_y_pred_map[interval_time].append(raw_y_pred[i][-5])

            for time in sorted(interval_y_map.keys()):
                if mark_list is None:
                    io_util.write_csv_result(average_result_path, time,
                                             [np.average(interval_y_map[time]),
                                              np.average(interval_y_pred_map[time])])
                else:
                    io_util.write_csv_result(average_result_path, time,
                                             [np.sum(interval_y_map[time]),
                                              np.sum(interval_y_pred_map[time])])
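
# _round_to_interval is defined elsewhere in the module; a plausible sketch of
# its behavior is flooring each timestamp to the nearest interval boundary so
# predictions within the same window can be averaged together. This is an
# assumption about the helper, not a copy of it.
def _round_to_interval_sketch(timestamp, interval):
    """Floor the timestamp to the start of its interval bucket."""
    return timestamp - timestamp % interval

# _round_to_interval_sketch(1000123, 10000)  # -> 1000000
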
def _train_data(self, data, summary_file):
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        data.x, data.y, test_size=self.test_ratio, random_state=0)

    # Write the first header row to the result file
    metrics_path = "{}/{}.csv".format(self.model_metrics_path, data.opunit.name.lower())
    prediction_path = "{}/{}_prediction.csv".format(self.model_metrics_path, data.opunit.name.lower())
    result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path)

    methods = self.ml_models

    # Test the prediction with/without the target transformer
    transformers = [None, data_transforming_util.OPUNIT_MODELING_TRANSFORMER_MAP[data.opunit]]
    # modeling_transformer = data_transforming_util.OPUNIT_MODELING_TRANSFORMER_MAP[data.opunit]
    # if modeling_transformer is not None:
    #     transformers.append(modeling_transformer)

    error_bias = 1
    min_percentage_error = 2
    pred_results = None
    elapsed_us_index = data_info.TARGET_CSV_INDEX[Target.ELAPSED_US]
    for i, transformer in enumerate(transformers):
        for method in methods:
            # Train the model
            label = method if i == 0 else method + " transform"
            logging.info("{} {}".format(data.opunit.name, label))
            regressor = model.Model(method, modeling_transformer=transformer)
            regressor.train(x_train, y_train)

            # Evaluate on both the training and test set
            results = []
            evaluate_data = [(x_train, y_train), (x_test, y_test)]
            train_test_label = ["Train", "Test"]
            for j, d in enumerate(evaluate_data):
                evaluate_x = d[0]
                evaluate_y = d[1]

                # Cache the (opunit, input) -> output mapping for both splits
                for x, y in zip(evaluate_x, evaluate_y):
                    stat_vec = [data.opunit]
                    stat_vec.extend(x)
                    self.stats_map[tuple(stat_vec)] = y

                y_pred = regressor.predict(evaluate_x)
                logging.debug("x shape: {}".format(evaluate_x.shape))
                logging.debug("y shape: {}".format(y_pred.shape))
                percentage_error = np.average(np.abs(evaluate_y - y_pred) / (evaluate_y + 1 + error_bias),
                                              axis=0)
                results += list(percentage_error) + [""]
                logging.info('{} Percentage Error: {}'.format(train_test_label[j], percentage_error))

                # Record the model with the lowest elapsed time prediction error (since that might
                # be the most important prediction)
                # Only use linear regression for the arithmetic operating units
                if (j == 1 and percentage_error[elapsed_us_index] < min_percentage_error
                        and transformer == transformers[-1]
                        and (data.opunit not in data_info.ARITHMETIC_OPUNITS or method == 'lr')):
                    min_percentage_error = percentage_error[elapsed_us_index]
                    self.model_map[data.opunit] = regressor
                    pred_results = (evaluate_x, y_pred, evaluate_y)

                if j == 1:
                    io_util.write_csv_result(summary_file, data.opunit.name,
                                             [label] + list(percentage_error))

            # Dump the prediction results
            io_util.write_csv_result(metrics_path, label, results)

            logging.info("")

        io_util.write_csv_result(metrics_path, "", [])

    # Record the best prediction results on the test data
    result_writing_util.record_predictions(pred_results, prediction_path)
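
# A minimal sketch of the stats_map keying scheme inside _train_data above:
# each row is stored under an immutable (opunit, feature...) tuple so identical
# inputs can be looked up exactly later. The opunit value below is an
# illustrative stand-in for the real OpUnit enum member held by data.opunit.
import numpy as np

stats_map = {}
opunit = "SEQ_SCAN"  # assumption: stand-in for the real opunit enum value
x_rows = np.array([[1.0, 2.0], [3.0, 4.0]])
y_rows = np.array([[10.0], [20.0]])
for x_row, y_row in zip(x_rows, y_rows):
    stat_vec = [opunit]
    stat_vec.extend(x_row)
    stats_map[tuple(stat_vec)] = y_row
# stats_map[("SEQ_SCAN", 1.0, 2.0)]  # -> array([10.])
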
def train(self):
    """Train the mini-models

    :return: the map of the trained models
    """
    data_list = []

    # First get the data for all mini runners
    for filename in glob.glob(os.path.join(self.input_path, '*.csv')):
        logging.info(filename)
        data_list += opunit_data.get_mini_runner_data(filename)

    model_map = {}
    # Train the models for all the operating units
    for data in data_list:
        x_train, x_test, y_train, y_test = model_selection.train_test_split(
            data.x, data.y, test_size=self.test_ratio, random_state=0)

        # Write the first header row to the result file
        metrics_path = "{}/{}.csv".format(self.model_metrics_path, data.opunit.name.lower())
        prediction_path = "{}/{}_prediction.csv".format(self.model_metrics_path, data.opunit.name.lower())
        result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path)

        methods = self.ml_models
        # Only use linear regression for the arithmetic operating units
        if data.opunit in data_info.ARITHMETIC_OPUNITS:
            methods = ["lr"]

        # Also test the prediction with the target transformer (if specified for the operating unit)
        transformers = [None]
        modeling_transformer = data_transforming_util.OPUNIT_MODELING_TRANSFORMER_MAP[data.opunit]
        if modeling_transformer is not None:
            transformers.append(modeling_transformer)

        min_percentage_error = 1
        pred_results = None
        elapsed_us_index = data_info.TARGET_CSV_INDEX[Target.ELAPSED_US]

        for transformer in transformers:
            for method in methods:
                # Train the model
                logging.info("{} {}".format(data.opunit.name, method))
                regressor = model.Model(method, modeling_transformer=transformer)
                regressor.train(x_train, y_train)

                # Evaluate on both the training and test set
                results = []
                evaluate_data = [(x_train, y_train), (x_test, y_test)]
                train_test_label = ["Train", "Test"]
                for i, d in enumerate(evaluate_data):
                    evaluate_x = d[0]
                    evaluate_y = d[1]

                    y_pred = regressor.predict(evaluate_x)
                    logging.debug("x shape: {}".format(evaluate_x.shape))
                    logging.debug("y shape: {}".format(y_pred.shape))
                    percentage_error = np.average(np.abs(evaluate_y - y_pred) / (evaluate_y + 1), axis=0)
                    results += list(percentage_error) + [""]
                    logging.info('{} Percentage Error: {}'.format(train_test_label[i], percentage_error))

                    # Record the model with the lowest elapsed time prediction error (since that
                    # might be the most important prediction)
                    if (i == 1 and percentage_error[elapsed_us_index] < min_percentage_error
                            and transformer == transformers[-1]):
                        min_percentage_error = percentage_error[elapsed_us_index]
                        model_map[data.opunit] = regressor
                        pred_results = (evaluate_x, y_pred, evaluate_y)

                # Dump the prediction results
                transform = " "
                if transformer is not None:
                    transform = " transform"
                io_util.write_csv_result(metrics_path, method + transform, results)

                logging.info("")

            io_util.write_csv_result(metrics_path, "", [])

        # Record the best prediction results on the test data
        result_writing_util.record_predictions(pred_results, prediction_path)

    return model_map
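
# A hedged usage sketch for the model_map returned by train() above. The
# trainer construction below is illustrative only (the real constructor
# arguments live elsewhere in the repo); the point is that each operating unit
# maps to its best regressor, which predicts from a 2-D feature array.
#
# trainer = MiniTrainer(input_path, model_metrics_path, ml_models, test_ratio)  # assumed signature
# model_map = trainer.train()
# regressor = model_map[some_opunit]
# y_pred = regressor.predict(x_new)  # x_new: 2-D numpy array of opunit features
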