def _default_get_mini_runner_data(filename):
    # In the default case, the data does not need any pre-processing and the file name indicates the opunit
    df = pd.read_csv(filename, skipinitialspace=True)
    headers = list(df.columns.values)
    data_info.parse_csv_header(headers, False)
    file_name = os.path.splitext(os.path.basename(filename))[0]

    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values

    return [OpUnitData(OpUnit[file_name.upper()], x, y)]

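# Illustrative sketch (not part of the loaders): demonstrates the feature/target split used
# above on a toy frame. The column names and the counts (2 metrics columns, 2 target columns)
# are assumptions standing in for data_info.METRICS_OUTPUT_NUM / data_info.MINI_MODEL_TARGET_NUM.
def _sketch_feature_target_split():
    import pandas as pd  # local import keeps the sketch self-contained
    toy = pd.DataFrame([[64, 1000, 1, 1200, 900]],
                       columns=["tuple_size", "num_rows", "interval", "cpu_cycles", "elapsed_us"])
    metrics_output_num = 2
    mini_model_target_num = 2
    x = toy.iloc[:, :-metrics_output_num].values     # -> [[64, 1000, 1]]
    y = toy.iloc[:, -mini_model_target_num:].values  # -> [[1200, 900]]
    return x, y
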
def _interval_get_mini_runner_data(filename, model_results_path):
    # The file name indicates the opunit; the raw data is then regrouped by the periodic invocation interval below
    df = pd.read_csv(filename, skipinitialspace=True)
    headers = list(df.columns.values)
    data_info.parse_csv_header(headers, False)
    file_name = os.path.splitext(os.path.basename(filename))[0]

    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[:, data_info.RAW_TARGET_CSV_INDEX[Target.START_TIME]].values
    logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))

    # change the data based on the interval for the periodically invoked operating units
    prediction_path = "{}/{}_interval_converted_data.csv".format(model_results_path, file_name)
    io_util.create_csv_file(prediction_path, [""])

    interval = data_info.PERIODIC_OPUNIT_INTERVAL

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])

    # Construct the new data
    x_list = []
    y_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Keep the interval parameter the same
        # TODO: currently the interval parameter is always the last. Change the hard-coding later
        x_new[-1] /= len(interval_x_map[rounded_time])
        x_list.append(x_new)
        # The prediction is the average behavior
        y_list.append(np.average(interval_y_map[rounded_time], axis=0))
        io_util.write_csv_result(prediction_path, rounded_time, np.concatenate((x_list[-1], y_list[-1])))

    return [OpUnitData(OpUnit[file_name.upper()], np.array(x_list), np.array(y_list))]

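# The interval grouping above assumes data_util.round_to_interval floors a start time to the
# beginning of its interval bucket. A minimal stand-in with that behavior (an assumption, not
# the real data_util implementation):
def _sketch_round_to_interval(start_time, interval):
    # e.g. _sketch_round_to_interval(1234567, 1000) -> 1234000
    return start_time // interval * interval
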
def _interval_get_grouped_op_unit_data(filename):
    # The file name indicates the opunit; the raw data is grouped by the periodic invocation interval
    # and re-emitted as GroupedOpUnitData
    df = pd.read_csv(filename, skipinitialspace=True)
    headers = list(df.columns.values)
    data_info.parse_csv_header(headers, False)
    file_name = os.path.splitext(os.path.basename(filename))[0]

    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[:, data_info.TARGET_CSV_INDEX[Target.START_TIME]].values
    cpu_ids = df.iloc[:, data_info.TARGET_CSV_INDEX[Target.CPU_ID]].values

    interval = data_info.PERIODIC_OPUNIT_INTERVAL

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    interval_cpu_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])
        interval_cpu_map[rounded_time] = cpu_ids[i]

    # Construct the new data
    opunit = OpUnit[file_name.upper()]
    data_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Keep the interval parameter the same
        # TODO: currently the interval parameter is always the last. Change the hard-coding later
        x_new[-1] /= len(interval_x_map[rounded_time])
        # Change all the opunits in the group for this interval to be the new feature
        opunits = [(opunit, x_new)]
        # The prediction is the average behavior
        y_new = np.average(interval_y_map[rounded_time], axis=0)
        n = len(interval_x_map[rounded_time])
        for i in range(n):
            metrics = np.concatenate(([rounded_time + i * interval // n], [interval_cpu_map[rounded_time]], y_new))
            data_list.append(GroupedOpUnitData("{}".format(file_name), opunits, metrics))

    return data_list

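# Sketch of the synthetic timestamps emitted above: the n rows collapsed into one interval are
# re-assigned start times spread evenly across that interval. The default values are a toy example.
def _sketch_spread_timestamps(rounded_time=10000, interval=1000, n=4):
    # -> [10000, 10250, 10500, 10750]
    return [rounded_time + i * interval // n for i in range(n)]
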
def _execution_get_mini_runner_data(filename, model_map, predict_cache, trim):
    """Get the training data from the mini runner

    :param filename: the input data file
    :param model_map: the map from OpUnit to the mini model
    :param predict_cache: cache for the mini model prediction
    :param trim: % of too high/too low anomalies to prune
    :return: the list of Data for execution operating units
    """
    # Get the mini runner data for the execution engine
    data_map = {}
    raw_data_map = {}
    input_output_boundary = math.nan

    with open(filename, "r") as f:
        reader = csv.reader(f, delimiter=",", skipinitialspace=True)
        indexes = next(reader)
        data_info.parse_csv_header(indexes, True)
        features_vector_index = data_info.RAW_FEATURES_CSV_INDEX[ExecutionFeature.FEATURES]
        raw_boundary = data_info.RAW_FEATURES_CSV_INDEX[data_info.INPUT_OUTPUT_BOUNDARY]
        input_output_boundary = len(data_info.INPUT_CSV_INDEX)

        for line in reader:
            # drop query_id, pipeline_id, num_features, features_vector
            record = [d for i, d in enumerate(line) if i >= raw_boundary]
            data = list(map(data_util.convert_string_to_numeric, record))
            x_multiple = data[:input_output_boundary]
            y_merged = np.array(data[-data_info.MINI_MODEL_TARGET_NUM:])

            # Get the opunits located within
            opunits = []
            features = line[features_vector_index].split(';')
            for idx, feature in enumerate(features):
                opunit = OpUnit[feature]
                x_loc = [v[idx] if type(v) == list else v for v in x_multiple]

                if opunit in model_map:
                    # Subtract the prediction of an already-modelled opunit from the merged targets
                    key = [opunit] + x_loc
                    if tuple(key) not in predict_cache:
                        predict = model_map[opunit].predict(np.array(x_loc).reshape(1, -1))[0]
                        predict_cache[tuple(key)] = predict
                    else:
                        predict = predict_cache[tuple(key)]
                    assert len(predict) == len(y_merged)
                    y_merged = y_merged - predict
                    y_merged = np.clip(y_merged, 0, None)
                else:
                    opunits.append((opunit, x_loc))

            if len(opunits) > 1:
                raise Exception('Unmodelled OperatingUnits detected: {}'.format(opunits))

            # Record into predict_cache
            key = tuple([opunits[0][0]] + opunits[0][1])
            if key not in raw_data_map:
                raw_data_map[key] = []
            raw_data_map[key].append(y_merged)

    # Postprocess the raw_data_map -> data_map
    # We need to do this here since we need to have seen all the data
    # before we can start pruning. This step is done here so dropped
    # data don't actually become a part of the model.
    for key in raw_data_map:
        len_vec = len(raw_data_map[key])
        raw_data_map[key].sort(key=lambda x: x[-1])

        # compute how much to trim
        trim_side = trim * len_vec
        low = int(math.ceil(trim_side))
        high = len_vec - low
        if low >= high:
            # if bounds are bad, just take the median
            raw_data_map[key] = np.median(raw_data_map[key], axis=0)
        else:
            # otherwise, x% trimmed mean
            raw_data_map[key] = np.average(raw_data_map[key][low:high], axis=0)

        # Expose the singular data point
        opunit = key[0]
        if opunit not in data_map:
            data_map[opunit] = []

        predict = raw_data_map[key]
        predict_cache[key] = predict
        data_map[opunit].append(list(key[1:]) + list(predict))

    data_list = []
    for opunit, values in data_map.items():
        np_value = np.array(values)
        x = np_value[:, :input_output_boundary]
        y = np_value[:, -data_info.MINI_MODEL_TARGET_NUM:]
        data_list.append(OpUnitData(opunit, x, y))

    return data_list

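# Minimal sketch of the trimming step above, assuming trim is a fraction (e.g. 0.05 for 5%):
# sort by the last target column, drop ceil(trim * n) rows from each end, and fall back to the
# median when the bounds cross. Hypothetical helper, not used by the loader itself.
def _sketch_trimmed_aggregate(rows, trim):
    import math
    import numpy as np
    rows = sorted(rows, key=lambda r: r[-1])
    low = int(math.ceil(trim * len(rows)))
    high = len(rows) - low
    if low >= high:
        return np.median(rows, axis=0)
    return np.average(rows[low:high], axis=0)
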
def _pipeline_get_grouped_op_unit_data(filename, warmup_period, ee_sample_interval):
    # Get the global running data for the execution engine
    start_time = None

    data_list = []
    with open(filename, "r") as f:
        reader = csv.reader(f, delimiter=",", skipinitialspace=True)
        indexes = next(reader)
        data_info.parse_csv_header(indexes, True)
        features_vector_index = data_info.RAW_FEATURES_CSV_INDEX[ExecutionFeature.FEATURES]
        input_output_boundary = data_info.RAW_FEATURES_CSV_INDEX[data_info.INPUT_OUTPUT_BOUNDARY]
        input_end_boundary = len(data_info.INPUT_CSV_INDEX)

        for line in reader:
            # extract the time
            cpu_time = line[data_info.RAW_TARGET_CSV_INDEX[Target.START_TIME]]
            if start_time is None:
                start_time = cpu_time
            if int(cpu_time) - int(start_time) < warmup_period * 1000000:
                continue

            sample_interval = ee_sample_interval

            # drop query_id, pipeline_id, num_features, features_vector
            record = [d for i, d in enumerate(line) if i >= input_output_boundary]
            data = list(map(data_util.convert_string_to_numeric, record))
            x_multiple = data[:input_end_boundary]
            metrics = np.array(data[-data_info.METRICS_OUTPUT_NUM:])

            # Get the opunits located within
            opunits = []
            features = line[features_vector_index].split(';')
            concurrency = 0
            for idx, feature in enumerate(features):
                if feature == 'LIMIT':
                    continue
                x_loc = [v[idx] if type(v) == list else v for v in x_multiple]
                opunit = OpUnit[feature]
                if x_loc[data_info.INPUT_CSV_INDEX[ExecutionFeature.NUM_ROWS]] == 0:
                    logging.info("Skipping {} OU with 0 tuple num".format(opunit.name))
                    continue
                if opunit == OpUnit.CREATE_INDEX:
                    concurrency = x_loc[data_info.CONCURRENCY_INDEX]
                    # TODO(lin): we won't do sampling for CREATE_INDEX. We probably should encapsulate this when
                    #  generating the data
                    sample_interval = 0
                # TODO(lin): skip the main thing for interference model for now
                if opunit == OpUnit.CREATE_INDEX_MAIN:
                    continue
                opunits.append((opunit, x_loc))

            if len(opunits) == 0:
                continue

            # TODO(lin): Again, we won't do sampling for TPCH queries (with the assumption that the query id < 10).
            #  Should encapsulate this with the metrics
            if int(line[0]) < 10:
                sample_interval = 0

            data_list.append(GroupedOpUnitData("q{} p{}".format(line[0], line[1]), opunits, np.array(metrics),
                                               sample_interval, concurrency))

    return data_list

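# Sketch of the warmup filter above, assuming warmup_period is given in seconds while the
# START_TIME column is in microseconds (hence the 1000000 factor). Hypothetical helper.
def _sketch_past_warmup(cpu_time_us, trace_start_us, warmup_period_s):
    return int(cpu_time_us) - int(trace_start_us) >= warmup_period_s * 1000000
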
def _pipeline_get_grouped_op_unit_data(filename, warmup_period, tpcc_hack, ee_sample_interval):
    # Get the global running data for the execution engine
    start_time = None

    data_list = []
    with open(filename, "r") as f:
        reader = csv.reader(f, delimiter=",", skipinitialspace=True)
        indexes = next(reader)
        data_info.parse_csv_header(indexes, True)
        features_vector_index = data_info.RAW_FEATURES_CSV_INDEX[ExecutionFeature.FEATURES]
        input_output_boundary = data_info.RAW_FEATURES_CSV_INDEX[data_info.INPUT_OUTPUT_BOUNDARY]
        input_end_boundary = len(data_info.INPUT_CSV_INDEX)

        for line in reader:
            # extract the time
            cpu_time = line[data_info.RAW_TARGET_CSV_INDEX[Target.START_TIME]]
            if start_time is None:
                start_time = cpu_time
            if int(cpu_time) - int(start_time) < warmup_period * 1000000:
                continue

            # drop query_id, pipeline_id, num_features, features_vector
            record = [d for i, d in enumerate(line) if i >= input_output_boundary]
            data = list(map(data_util.convert_string_to_numeric, record))
            x_multiple = data[:input_end_boundary]
            metrics = np.array(data[-data_info.METRICS_OUTPUT_NUM:])

            # Get the opunits located within
            opunits = []
            features = line[features_vector_index].split(';')
            for idx, feature in enumerate(features):
                if feature == 'LIMIT':
                    continue
                opunit = OpUnit[feature]
                x_loc = [v[idx] if type(v) == list else v for v in x_multiple]
                q_id = int(line[0])
                p_id = int(line[1])
                if tpcc_hack:
                    x_loc = tpcc_fixer.transform_feature(feature, q_id, p_id, x_loc)
                if x_loc[data_info.INPUT_CSV_INDEX[ExecutionFeature.NUM_ROWS]] == 0:
                    logging.info("Skipping {} OU with 0 tuple num".format(opunit.name))
                    continue
                opunits.append((opunit, x_loc))

            if len(opunits) == 0:
                continue

            data_list.append(GroupedOpUnitData("q{} p{}".format(line[0], line[1]), opunits, np.array(metrics),
                                               ee_sample_interval))

    return data_list

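# Hypothetical usage sketch: how a caller might invoke the pipeline loader defined directly
# above. The 3-second warmup and both flag values are assumed examples, not module defaults.
def _sketch_load_pipeline_trace(filename):
    return _pipeline_get_grouped_op_unit_data(filename, warmup_period=3, tpcc_hack=False,
                                              ee_sample_interval=0)
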