def train(self):
    """Train the mini-models

    :return: the map of the trained models
    """
    self.model_map = {}

    # Create the results files for the paper
    header = ["OpUnit", "Method"] + [target.name for target in data_info.MINI_MODEL_TARGET_LIST]
    summary_file = "{}/mini_runner.csv".format(self.model_metrics_path)
    io_util.create_csv_file(summary_file, header)

    # First get the data for all mini runners
    for filename in sorted(glob.glob(os.path.join(self.input_path, '*.csv'))):
        print(filename)
        data_list = opunit_data.get_mini_runner_data(filename, self.model_metrics_path,
                                                     self.txn_sample_interval, self.model_map,
                                                     self.stats_map, self.trim)
        for data in data_list:
            best_y_transformer, best_method = self._train_data(data, summary_file)
            if self.expose_all:
                self._train_specific_model(data, best_y_transformer, best_method)

    return self.model_map
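# A minimal usage sketch for train(), assuming a trainer class exposing the
# attributes referenced above (input_path, model_metrics_path,
# txn_sample_interval, expose_all, ...). The class and constructor argument
# names here are illustrative, not confirmed by this file:
#
#   trainer = MiniTrainer(input_path="mini_runner_data",
#                         model_metrics_path="mini_model_results",
#                         txn_sample_interval=0,
#                         expose_all=False)
#   model_map = trainer.train()
#   # model_map maps each OpUnit to its trained mini-model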
def _txn_get_mini_runner_data(filename, model_results_path, txn_sample_interval):
    """Get the training data for a transaction mini-model

    :param filename: the input data file
    :param model_results_path: directory path to log the result information
    :param txn_sample_interval: sampling interval for the transaction operating units
    :return: the list of OpUnitData for the transaction opunit
    """
    # In the default case, the data does not need any pre-processing and the file name indicates the opunit
    df = pd.read_csv(filename)
    file_name = os.path.splitext(os.path.basename(filename))[0]

    # Prepend a column of ones as the base transaction data feature
    base_x = pd.DataFrame(data=np.ones((df.shape[0], 1), dtype=int))
    df = pd.concat([base_x, df], axis=1)
    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[:, data_info.TARGET_CSV_INDEX[data_info.Target.START_TIME]].values
    cpu_ids = df.iloc[:, data_info.TARGET_CSV_INDEX[data_info.Target.CPU_ID]].values
    logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))

    # Change the data based on the interval for the periodically invoked operating units
    prediction_path = "{}/{}_txn_converted_data.csv".format(model_results_path, file_name)
    io_util.create_csv_file(prediction_path, [""])
    interval = data_info.CONTENDING_OPUNIT_INTERVAL

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    interval_id_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
            interval_id_map[rounded_time] = set()
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])
        interval_id_map[rounded_time].add(cpu_ids[i])

    # Construct the new data
    x_list = []
    y_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Concatenate the number of different threads
        x_new = np.concatenate((x_new, [len(interval_id_map[rounded_time])]))
        # Scale the features back up to account for the sampling rate
        x_new *= txn_sample_interval + 1
        x_list.append(x_new)
        # The prediction is the average behavior
        y_list.append(np.average(interval_y_map[rounded_time], axis=0))
        io_util.write_csv_result(prediction_path, rounded_time, np.concatenate((x_list[-1], y_list[-1])))

    return [OpUnitData(OpUnit[file_name.upper()], np.array(x_list), np.array(y_list))]
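# The interval grouping above collapses per-invocation rows into one row per
# time bucket: features are summed, targets averaged. A self-contained sketch
# of the same idea (round_to_interval is assumed to floor a timestamp to its
# bucket, matching how data_util.round_to_interval is used above):
#
#   import numpy as np
#
#   def round_to_interval(timestamp, interval):
#       return timestamp - timestamp % interval
#
#   start_times = np.array([1000, 1400, 2100])   # timestamps (illustrative)
#   features = np.array([[1, 10], [1, 20], [1, 5]])
#   buckets = {}
#   for t, f in zip(start_times, features):
#       buckets.setdefault(round_to_interval(t, 1000), []).append(f)
#   # buckets -> {1000: [[1, 10], [1, 20]], 2000: [[1, 5]]}
#   summed = {t: np.sum(fs, axis=0) for t, fs in buckets.items()}
#   # summed -> {1000: array([2, 30]), 2000: array([1, 5])}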
def _construct_interval_based_global_model_data(data_list, model_results_path):
    """Construct the GlobalImpactData used for the global model training

    :param data_list: The list of GroupedOpUnitData objects
    :param model_results_path: directory path to log the result information
    :return: (GlobalResourceData list, GlobalImpactData list)
    """
    prediction_path = "{}/global_resource_data.csv".format(model_results_path)
    io_util.create_csv_file(prediction_path, ["Elapsed us", "# Concurrent OpUnit Groups"])

    start_time_list = sorted([d.get_start_time(ConcurrentCountingMode.INTERVAL) for d in data_list])
    rounded_start_time_list = [_round_to_second(start_time_list[0])]
    # Map from interval start time to the data in this interval
    interval_data_map = {rounded_start_time_list[0]: []}
    # Get all the interval start times and initialize the map
    for t in start_time_list:
        rounded_time = _round_to_second(t)
        if rounded_time > rounded_start_time_list[-1]:
            rounded_start_time_list.append(rounded_time)
            interval_data_map[rounded_time] = []

    for data in tqdm.tqdm(data_list, desc="Find Interval Data"):
        # For each data, find the intervals that might overlap with it
        interval_start_time = _round_to_second(data.get_start_time(ConcurrentCountingMode.EXACT) -
                                               global_model_config.INTERVAL_SIZE +
                                               global_model_config.INTERVAL_SEGMENT)
        while interval_start_time <= data.get_end_time(ConcurrentCountingMode.ESTIMATED):
            if interval_start_time in interval_data_map:
                interval_data_map[interval_start_time].append(data)
            interval_start_time += global_model_config.INTERVAL_SEGMENT

    # Get the global resource data
    resource_data_map = {}
    for start_time in tqdm.tqdm(rounded_start_time_list, desc="Construct GlobalResourceData"):
        resource_data_map[start_time] = _get_global_resource_data(start_time, interval_data_map[start_time],
                                                                  prediction_path)

    # Now construct the global impact data
    impact_data_list = []
    for data in data_list:
        interval_start_time = _round_to_second(data.get_start_time(ConcurrentCountingMode.INTERVAL))
        resource_data_list = []
        while interval_start_time <= data.get_end_time(ConcurrentCountingMode.ESTIMATED):
            if interval_start_time in resource_data_map:
                resource_data_list.append(resource_data_map[interval_start_time])
            interval_start_time += global_model_config.INTERVAL_SIZE
        impact_data_list.append(global_model_data.GlobalImpactData(data, resource_data_list))

    return list(resource_data_map.values()), impact_data_list
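# _round_to_second is defined elsewhere in this module. A plausible sketch of
# what it does, assuming timestamps are in microseconds (the unit is an
# assumption here, inferred from the "Elapsed us" header above):
#
#   def _round_to_second(time_us):
#       # Floor a microsecond timestamp to the start of its second
#       return time_us - time_us % 1000000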
def _predict_grouped_opunit_data(data_list, mini_model_map, model_results_path):
    """Use the mini-runner to predict the resource consumptions for all the GlobalData,
    and record the prediction results in place

    :param data_list: The list of the GroupedOpUnitData objects
    :param mini_model_map: The trained mini models
    :param model_results_path: file path to log the prediction results
    """
    prediction_path = "{}/grouped_opunit_prediction.csv".format(model_results_path)
    io_util.create_csv_file(prediction_path, ["Pipeline", "Actual Us", "Predicted Us", "", "Ratio Error"])

    # Have to use a prediction cache when having lots of global data...
    prediction_cache = {}

    # First run a prediction on the global running data with the mini model results
    for i, data in enumerate(tqdm.tqdm(data_list, desc="Predict GroupedOpUnitData")):
        y = data.y
        logging.debug("{} pipeline elapsed time: {}".format(data.name, y[-1]))

        pipeline_y_pred = 0
        x = None
        for opunit_feature in data.opunit_features:
            opunit = opunit_feature[0]
            opunit_model = mini_model_map[opunit]
            x = np.array(opunit_feature[1]).reshape(1, -1)
            key = (opunit, x.tobytes())
            if key not in prediction_cache:
                y_pred = opunit_model.predict(x)
                # Subtract the scan prediction from certain double-counted opunits
                if opunit in data_info.SCAN_SUBSTRACT_UNITS:
                    scan_y_pred = mini_model_map[OpUnit.SEQ_SCAN].predict(x)
                    y_pred -= scan_y_pred
                y_pred = np.clip(y_pred, 0, None)
                prediction_cache[key] = y_pred
            else:
                y_pred = prediction_cache[key]
            logging.debug("Predicted {} elapsed time with feature {}: {}".format(
                opunit_feature[0].name, x[0], y_pred[0, -1]))
            pipeline_y_pred += y_pred[0]

        # Record the prediction
        data.y_pred = pipeline_y_pred
        logging.debug("{} pipeline predicted time: {}".format(data.name, pipeline_y_pred[-1]))
        ratio_error = abs(y - pipeline_y_pred) / (y + 1e-6)
        logging.debug("|Actual - Predict| / Actual: {}".format(ratio_error[-1]))
        io_util.write_csv_result(prediction_path, data.name + " " + str(x[0][-1]),
                                 [y[-1], pipeline_y_pred[-1], "", ratio_error])
        logging.debug("")
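# The cache key above pairs the opunit with the raw bytes of its feature
# vector, so each distinct (opunit, feature) pair is predicted only once.
# A self-contained sketch of the same memoization pattern (the model is
# assumed to expose a scikit-learn-style predict()):
#
#   import numpy as np
#
#   cache = {}
#
#   def cached_predict(model, opunit, feature_vector):
#       x = np.array(feature_vector).reshape(1, -1)
#       key = (opunit, x.tobytes())   # ndarrays are unhashable; their bytes are
#       if key not in cache:
#           cache[key] = model.predict(x)
#       return cache[key]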
def create_metrics_and_prediction_files(metrics_path, prediction_path):
    """Create the prediction result files

    :param metrics_path: the file to store the prediction metrics
    :param prediction_path: the file to store the raw predictions
    """
    # First write the header to the result files
    io_util.create_csv_file(metrics_path, ["Method"] + _get_result_labels())
    io_util.create_csv_file(prediction_path, None)
def _interval_get_mini_runner_data(filename, model_results_path):
    """Get the training data for a periodically invoked mini-model

    :param filename: the input data file
    :param model_results_path: directory path to log the result information
    :return: the list of OpUnitData for the periodically invoked opunit
    """
    # In the default case, the data does not need any pre-processing and the file name indicates the opunit
    df = pd.read_csv(filename, skipinitialspace=True)
    headers = list(df.columns.values)
    data_info.parse_csv_header(headers, False)
    file_name = os.path.splitext(os.path.basename(filename))[0]

    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[:, data_info.RAW_TARGET_CSV_INDEX[Target.START_TIME]].values
    logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))

    # Change the data based on the interval for the periodically invoked operating units
    prediction_path = "{}/{}_interval_converted_data.csv".format(model_results_path, file_name)
    io_util.create_csv_file(prediction_path, [""])
    interval = data_info.PERIODIC_OPUNIT_INTERVAL

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])

    # Construct the new data
    x_list = []
    y_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Keep the interval parameter the same
        # TODO: currently the interval parameter is always the last. Change the hard-coding later
        x_new[-1] /= len(interval_x_map[rounded_time])
        x_list.append(x_new)
        # The prediction is the average behavior
        y_list.append(np.average(interval_y_map[rounded_time], axis=0))
        io_util.write_csv_result(prediction_path, rounded_time, np.concatenate((x_list[-1], y_list[-1])))

    return [OpUnitData(OpUnit[file_name.upper()], np.array(x_list), np.array(y_list))]
def _predict_grouped_opunit_data(data_list, mini_model_map, model_results_path):
    """Use the mini-runner to predict the resource consumptions for all the GlobalData,
    and record the prediction results in place

    :param data_list: The list of the GroupedOpUnitData objects
    :param mini_model_map: The trained mini models
    :param model_results_path: file path to log the prediction results
    """
    prediction_path = "{}/grouped_opunit_prediction.csv".format(model_results_path)
    io_util.create_csv_file(prediction_path, ["Pipeline", "", "Actual", "", "Predicted", "", "Ratio Error"])

    # Have to use a prediction cache when having lots of global data...
    prediction_cache = {}

    # First run a prediction on the global running data with the mini model results
    for i, data in enumerate(tqdm.tqdm(data_list, desc="Predict GroupedOpUnitData")):
        y = data.y
        logging.debug("{} pipeline elapsed time: {}".format(data.name, y[-1]))

        pipeline_y_pred = 0
        x = None
        for opunit_feature in data.opunit_features:
            opunit = opunit_feature[0]
            opunit_model = mini_model_map[opunit]
            x = np.array(opunit_feature[1]).reshape(1, -1)
            key = (opunit, x.tobytes())
            if key not in prediction_cache:
                y_pred = opunit_model.predict(x)
                y_pred = np.clip(y_pred, 0, None)
                prediction_cache[key] = y_pred
            else:
                y_pred = prediction_cache[key]
            logging.debug("Predicted {} elapsed time with feature {}: {}".format(
                opunit_feature[0].name, x[0], y_pred[0, -1]))

            if opunit in data_info.MEM_ADJUST_OPUNITS:
                # Compute the number of "slots" (based on the row feature or the cardinality feature)
                num_tuple = opunit_feature[1][data_info.TUPLE_NUM_INDEX]
                if opunit == OpUnit.AGG_BUILD:
                    num_tuple = opunit_feature[1][data_info.CARDINALITY_INDEX]

                # SORT/AGG/HASHJOIN_BUILD all allocate a "pointer" buffer
                # that contains the first pow2 larger than num_tuple entries
                pow_high = 2 ** math.ceil(math.log(num_tuple, 2))
                buffer_size = pow_high * data_info.POINTER_SIZE
                if opunit == OpUnit.AGG_BUILD and num_tuple <= 256:
                    # For AGG_BUILD, if slots <= AggregationHashTable::K_DEFAULT_INITIAL_TABLE_SIZE
                    # the buffer is not recorded as part of the pipeline
                    buffer_size = 0

                pred_mem = y_pred[0][data_info.TARGET_CSV_INDEX[Target.MEMORY_B]]
                if pred_mem <= buffer_size:
                    logging.warning("{} feature {} {} with prediction {} does not exceed buffer size {}"
                                    .format(data.name, opunit_feature, opunit_feature[1], y_pred[0], buffer_size))

                # Poorly encapsulated, but the memory scaling factor is the 2nd last feature.
                # Slightly inaccurate since it ignores load factors for hash tables
                adj_mem = (pred_mem - buffer_size) * opunit_feature[1][-2] + buffer_size

                # Don't modify the prediction cache
                y_pred = copy.deepcopy(y_pred)
                y_pred[0][data_info.TARGET_CSV_INDEX[Target.MEMORY_B]] = adj_mem

            pipeline_y_pred += y_pred[0]

        # Record the prediction
        data.y_pred = pipeline_y_pred
        logging.debug("{} pipeline predicted time: {}".format(data.name, pipeline_y_pred[-1]))
        ratio_error = abs(y - pipeline_y_pred) / (y + 1)
        logging.debug("|Actual - Predict| / Actual: {}".format(ratio_error[-1]))
        io_util.write_csv_result(prediction_path, data.name,
                                 [""] + list(y) + [""] + list(pipeline_y_pred) + [""] + list(ratio_error))
        logging.debug("")
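# A worked example of the pointer-buffer sizing above, assuming 8-byte
# pointers (POINTER_SIZE comes from data_info; 8 is an assumption here):
#
#   import math
#
#   num_tuple = 1000
#   pow_high = 2 ** math.ceil(math.log(num_tuple, 2))   # 1024, first pow2 >= 1000
#   buffer_size = pow_high * 8                          # 8192 bytes
#
#   # A memory scaling factor of 0.5 then adjusts only the non-buffer part:
#   pred_mem = 20000
#   adj_mem = (pred_mem - buffer_size) * 0.5 + buffer_size   # 14096.0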
def _record_results(self, x, y, y_pred, raw_y, mini_model_y_pred, label, data_list):
    """Record the prediction results

    :param x: the input data
    :param y: the actual output
    :param y_pred: the predicted output
    :param raw_y: the actual raw outputs for the grouped opunits
    :param mini_model_y_pred: the mini-model predictions for the grouped opunits
    :param label: the result label ("resource", "impact", or "direct")
    :param data_list: the list of GroupedOpUnitData objects
    """
    # Result files
    metrics_path = "{}/global_{}_model_metrics.csv".format(self.model_results_path, label)
    prediction_path = "{}/global_{}_model_prediction.csv".format(self.model_results_path, label)
    result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path, True)

    # Log the prediction results
    ratio_error = np.average(np.abs(y - y_pred) / (y + 1e-6), axis=0)
    io_util.write_csv_result(metrics_path, "Model Ratio Error", ratio_error)
    result_writing_util.record_predictions((x, y_pred, y), prediction_path)

    # Print the error summary to the command line
    if label == "resource":
        avg_original_ratio_error = np.average(np.abs(y - x[:, :y.shape[1]]) / (y + 1e-6), axis=0)
    else:
        avg_original_ratio_error = np.average(np.abs(1 / (y + 1e-6) - 1), axis=0)
    logging.info('Model Original Ratio Error ({}): {}'.format(label, avg_original_ratio_error))
    logging.info('Model Ratio Error ({}): {}'.format(label, ratio_error))
    logging.info('')

    if label != "resource":
        # Calculate the accumulated ratio error
        epsilon = global_model_config.RATIO_DIVISION_EPSILON
        mini_model_y_pred = np.array(mini_model_y_pred)
        raw_y = np.array(raw_y)
        raw_y_pred = (mini_model_y_pred + epsilon) * y_pred
        accumulated_raw_y = np.sum(raw_y, axis=0)
        accumulated_raw_y_pred = np.sum(raw_y_pred, axis=0)
        original_ratio_error = np.abs(raw_y - mini_model_y_pred) / (raw_y + epsilon)
        avg_original_ratio_error = np.average(original_ratio_error, axis=0)
        ratio_error = np.abs(raw_y - raw_y_pred) / (raw_y + epsilon)
        avg_ratio_error = np.average(ratio_error, axis=0)
        accumulated_percentage_error = np.abs(accumulated_raw_y - accumulated_raw_y_pred) / (accumulated_raw_y +
                                                                                             epsilon)
        original_accumulated_percentage_error = np.abs(accumulated_raw_y - np.sum(mini_model_y_pred, axis=0)) / (
            accumulated_raw_y + epsilon)

        logging.info('Original Ratio Error: {}'.format(avg_original_ratio_error))
        io_util.write_csv_result(metrics_path, "Original Ratio Error", avg_original_ratio_error)
        logging.info('Ratio Error: {}'.format(avg_ratio_error))
        io_util.write_csv_result(metrics_path, "Ratio Error", avg_ratio_error)
        logging.info('Original Accumulated Ratio Error: {}'.format(original_accumulated_percentage_error))
        io_util.write_csv_result(metrics_path, "Original Accumulated Ratio Error",
                                 original_accumulated_percentage_error)
        logging.info('Accumulated Ratio Error: {}'.format(accumulated_percentage_error))
        io_util.write_csv_result(metrics_path, "Accumulated Ratio Error", accumulated_percentage_error)
        logging.info('Accumulated Actual: {}'.format(accumulated_raw_y))
        logging.info('Original Accumulated Predict: {}'.format(np.sum(mini_model_y_pred, axis=0)))
        logging.info('Accumulated Predict: {}'.format(accumulated_raw_y_pred))

        if label == 'direct':
            prediction_path = "{}/grouped_opunit_prediction.csv".format(self.model_results_path)
            io_util.create_csv_file(prediction_path,
                                    ["Pipeline", "", "Actual", "", "Predicted", "", "Ratio Error"])
            for i, data in enumerate(data_list):
                io_util.write_csv_result(prediction_path, data.name,
                                         [""] + list(raw_y[i]) + [""] + list(raw_y_pred[i]) + [""] +
                                         list(ratio_error[i]))

            average_result_path = "{}/interval_average_prediction.csv".format(self.model_results_path)
            io_util.create_csv_file(average_result_path, ["Timestamp", "Actual Average", "Predicted Average"])

            interval_y_map = {}
            interval_y_pred_map = {}
            mark_list = None
            # mark_list = _generate_mark_list(data_list)
            for i, data in enumerate(data_list):
                # Don't count the create index OU
                # TODO(lin): needs a better way to evaluate... maybe add an id_query field to GroupedOpUnitData
                if data.concurrency > 0:
                    continue
                if mark_list is not None and not mark_list[i]:
                    continue
                interval_time = _round_to_interval(data.start_time, global_model_config.AVERAGING_INTERVAL)
                if interval_time not in interval_y_map:
                    interval_y_map[interval_time] = []
                    interval_y_pred_map[interval_time] = []
                interval_y_map[interval_time].append(raw_y[i][-5])
                interval_y_pred_map[interval_time].append(raw_y_pred[i][-5])

            for time in sorted(interval_y_map.keys()):
                if mark_list is None:
                    io_util.write_csv_result(average_result_path, time,
                                             [np.average(interval_y_map[time]),
                                              np.average(interval_y_pred_map[time])])
                else:
                    io_util.write_csv_result(average_result_path, time,
                                             [np.sum(interval_y_map[time]),
                                              np.sum(interval_y_pred_map[time])])
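# The reconstruction above treats the global model output as a multiplicative
# correction on the mini-model predictions: raw_y_pred = (mini_model_y_pred +
# epsilon) * y_pred. A small numeric sketch, simplified to one target per
# group and with epsilon omitted for readability (values are illustrative):
#
#   import numpy as np
#
#   mini_model_y_pred = np.array([100.0, 200.0])   # per-group mini-model predictions
#   y_pred = np.array([1.2, 0.9])                  # predicted impact factors
#   raw_y = np.array([130.0, 170.0])               # actual measurements
#
#   raw_y_pred = mini_model_y_pred * y_pred        # [120., 180.]
#   ratio_error = np.abs(raw_y - raw_y_pred) / raw_y
#   # ratio_error -> [0.0769..., 0.0588...]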
def _predict_grouped_opunit_data(data_list, mini_model_map, model_results_path):
    """Use the mini-runner to predict the resource consumptions for all the GlobalData,
    and record the prediction results in place

    :param data_list: The list of the GroupedOpUnitData objects
    :param mini_model_map: The trained mini models
    :param model_results_path: file path to log the prediction results
    """
    prediction_path = "{}/grouped_opunit_prediction.csv".format(model_results_path)
    pipeline_path = "{}/grouped_pipeline.csv".format(model_results_path)
    io_util.create_csv_file(prediction_path, ["Pipeline", "", "Actual", "", "Predicted", "", "Ratio Error"])
    io_util.create_csv_file(pipeline_path, ["Pipeline", "Number", "Percentage", "Actual Us", "Predicted Us",
                                            "Us Error", "Absolute Us", "Absolute Us %"])

    # Track the pipeline cumulative numbers
    num_pipelines = 0
    total_actual = None
    total_predicted = None
    actual_pipelines = {}
    predicted_pipelines = {}
    count_pipelines = {}

    query_prediction_path = "{}/grouped_query_prediction.csv".format(model_results_path)
    io_util.create_csv_file(query_prediction_path, ["Query", "", "Actual", "", "Predicted", "", "Ratio Error"])
    current_query_id = None
    query_y = None
    query_y_pred = None

    # Have to use a prediction cache when having lots of global data...
    prediction_cache = {}

    # First run a prediction on the global running data with the mini model results
    for i, data in enumerate(tqdm.tqdm(data_list, desc="Predict GroupedOpUnitData")):
        y = data.y
        logging.debug("{} pipeline elapsed time: {}".format(data.name, y[-1]))

        pipeline_y_pred = 0
        x = None
        for opunit_feature in data.opunit_features:
            opunit = opunit_feature[0]
            opunit_model = mini_model_map[opunit]
            x = np.array(opunit_feature[1]).reshape(1, -1)
            key = (opunit, x.tobytes())
            if key not in prediction_cache:
                y_pred = opunit_model.predict(x)
                y_pred = np.clip(y_pred, 0, None)
                prediction_cache[key] = y_pred
            else:
                y_pred = prediction_cache[key]
            logging.debug("Predicted {} elapsed time with feature {}: {}".format(
                opunit_feature[0].name, x[0], y_pred[0, -1]))

            if opunit in data_info.MEM_ADJUST_OPUNITS:
                # Compute the number of "slots" (based on the row feature or the cardinality feature)
                num_tuple = opunit_feature[1][data_info.TUPLE_NUM_INDEX]
                if opunit == OpUnit.AGG_BUILD:
                    num_tuple = opunit_feature[1][data_info.CARDINALITY_INDEX]

                # SORT/AGG/HASHJOIN_BUILD all allocate a "pointer" buffer
                # that contains the first pow2 larger than num_tuple entries
                pow_high = 2 ** math.ceil(math.log(num_tuple, 2))
                buffer_size = pow_high * data_info.POINTER_SIZE
                if opunit == OpUnit.AGG_BUILD and num_tuple <= 256:
                    # For AGG_BUILD, if slots <= AggregationHashTable::K_DEFAULT_INITIAL_TABLE_SIZE
                    # the buffer is not recorded as part of the pipeline
                    buffer_size = 0

                pred_mem = y_pred[0][data_info.TARGET_CSV_INDEX[Target.MEMORY_B]]
                if pred_mem <= buffer_size:
                    logging.warning("{} feature {} {} with prediction {} does not exceed buffer size {}"
                                    .format(data.name, opunit_feature, opunit_feature[1], y_pred[0], buffer_size))

                # Poorly encapsulated, but the memory scaling factor is the 3rd last feature.
                # Slightly inaccurate since it ignores load factors for hash tables
                adj_mem = (pred_mem - buffer_size) * opunit_feature[1][-3] + buffer_size

                # Don't modify the prediction cache
                y_pred = copy.deepcopy(y_pred)
                y_pred[0][data_info.TARGET_CSV_INDEX[Target.MEMORY_B]] = adj_mem

            pipeline_y_pred += y_pred[0]

        pipeline_y = copy.deepcopy(pipeline_y_pred)

        # Group the pipelines by query when we're predicting queries
        if data.name[0] == 'q':
            query_id = data.name[1:data.name.rfind(" p")]
            if query_id != current_query_id:
                if current_query_id is not None:
                    io_util.write_csv_result(query_prediction_path, current_query_id,
                                             [""] + list(query_y) + [""] + list(query_y_pred) + [""] +
                                             list(abs(query_y - query_y_pred) / (query_y + 1)))
                current_query_id = query_id
                query_y = y
                query_y_pred = pipeline_y_pred
            else:
                query_y += y
                query_y_pred += pipeline_y_pred

        # Record the prediction
        data.y_pred = pipeline_y
        logging.debug("{} pipeline prediction: {}".format(data.name, pipeline_y))
        logging.debug("{} pipeline predicted time: {}".format(data.name, pipeline_y[-1]))
        ratio_error = abs(y - pipeline_y) / (y + 1)
        logging.debug("|Actual - Predict| / Actual: {}".format(ratio_error[-1]))
        io_util.write_csv_result(prediction_path, data.name,
                                 [""] + list(y) + [""] + list(pipeline_y) + [""] + list(ratio_error))
        logging.debug("")

        # Record the cumulative numbers
        if data.name not in actual_pipelines:
            actual_pipelines[data.name] = copy.deepcopy(y)
            predicted_pipelines[data.name] = copy.deepcopy(pipeline_y)
            count_pipelines[data.name] = 1
        else:
            actual_pipelines[data.name] += y
            predicted_pipelines[data.name] += pipeline_y
            count_pipelines[data.name] += 1

        # Update the totals
        if total_actual is None:
            total_actual = copy.deepcopy(y)
            total_predicted = copy.deepcopy(pipeline_y)
        else:
            total_actual += y
            total_predicted += pipeline_y
        num_pipelines += 1

    # Flush the last query's accumulated results (it would otherwise be dropped)
    if current_query_id is not None:
        io_util.write_csv_result(query_prediction_path, current_query_id,
                                 [""] + list(query_y) + [""] + list(query_y_pred) + [""] +
                                 list(abs(query_y - query_y_pred) / (query_y + 1)))

    total_elapsed_err = 0
    for pipeline in actual_pipelines:
        actual = actual_pipelines[pipeline]
        predicted = predicted_pipelines[pipeline]
        total_elapsed_err = total_elapsed_err + (abs(actual - predicted))[-1]

    for pipeline in actual_pipelines:
        actual = actual_pipelines[pipeline]
        predicted = predicted_pipelines[pipeline]
        num = count_pipelines[pipeline]
        ratio_error = abs(actual - predicted) / (actual + 1)
        abs_error = abs(actual - predicted)[-1]
        pabs_error = abs_error / total_elapsed_err
        io_util.write_csv_result(pipeline_path, pipeline,
                                 [num, num * 1.0 / num_pipelines, actual[-1], predicted[-1], ratio_error[-1],
                                  abs_error, pabs_error] + [""] + list(actual) + [""] + list(predicted) +
                                 [""] + list(ratio_error))

    ratio_error = abs(total_actual - total_predicted) / (total_actual + 1)
    io_util.write_csv_result(pipeline_path, "Total Pipeline",
                             [num_pipelines, 1, total_actual[-1], total_predicted[-1], ratio_error[-1],
                              total_elapsed_err, 1] + [""] + list(total_actual) + [""] + list(total_predicted) +
                             [""] + list(ratio_error))
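# The query grouping above relies on pipeline names like "q3 p1" (query 3,
# pipeline 1; the exact naming convention is an inference from this code).
# data.name[1:data.name.rfind(" p")] extracts the query id. An illustrative
# check:
#
#   name = "q3 p1"
#   query_id = name[1:name.rfind(" p")]   # "3"
#
#   name = "q12 p4"
#   query_id = name[1:name.rfind(" p")]   # "12"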