import pandas as pd
# project-local: Dataset (and, in the usage sketch below, DATASET_PATHS and LOG)


def split_online_b(online_b_data, test_idx):
    """Split online_b into a primer set and one held-out eval row per workload."""
    primer_rows, eval_rows = [], []
    for wl_id in online_b_data.get_workload_ids():
        curr_df = online_b_data.get_specific_workload(wl_id).get_dataframe()
        for idx in range(curr_df.shape[0]):
            row = curr_df.iloc[idx:idx + 1]
            # hold out the test_idx-th row of each workload for evaluation
            if idx == test_idx:
                eval_rows.append(row)
            else:
                primer_rows.append(row)
    # collect the rows and concatenate once (pandas removed DataFrame.append in 2.0)
    primer = Dataset(dataframe=pd.concat(primer_rows, ignore_index=True))
    eval_set = Dataset(dataframe=pd.concat(eval_rows, ignore_index=True))
    latency_gt = eval_set.get_column_values('latency')
    eval_set = eval_set.prune_columns(
        ['workload id'] + eval_set.get_tuning_knob_headers())
    return primer, eval_set, latency_gt
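# A minimal usage sketch (hedged: the 'online_workload_B' key is an assumption
# patterned after DATASET_PATHS['offline_workload'] used below; adjust it to the
# project's actual key).
if __name__ == '__main__':
    online_b = Dataset(file_path=DATASET_PATHS['online_workload_B'])
    # hold out row 0 of every workload; prime on the rest
    primer, eval_set, latency_gt = split_online_b(online_b, test_idx=0)
    # eval_set now carries only the workload id and tuning knobs, while
    # latency_gt holds the ground-truth latencies for scoring predictions
    LOG.info('primer rows: %s, eval rows: %s',
             len(primer.get_dataframe()), len(eval_set.get_dataframe()))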
from time import time
# project-local: Dataset, DATASET_PATHS, WorkloadGPR, LOG


def main():
    # only training GPRs for offline workloads
    dataset = Dataset(file_path=DATASET_PATHS['offline_workload'])

    # load the pruned metric headers
    pruned_metrics = Dataset.load_pruned_metrics()

    # prune the dataset down to the pruned metrics, workload id, and tuning knobs
    dataset = dataset.prune_columns(
        pruned_metrics + ['workload id'] + dataset.get_tuning_knob_headers())

    # build the GPRs
    start = time()
    gprs = WorkloadGPR(dataset=dataset)
    LOG.info(f"Finished building GPRs in {round(time() - start)} seconds.")

    # pickle 'em
    LOG.info("Pickling GPRs...")
    start = time()
    gprs.pickle_models()
    LOG.info(f"Finished pickling models in {round(time() - start)} seconds.")
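# For orientation, a minimal sketch of what a WorkloadGPR-style wrapper could
# look like: one scikit-learn GaussianProcessRegressor per (workload id, metric)
# pair, with get_model() lookup and pickle_models() persistence. This is an
# assumption for illustration, not the project's actual implementation.
import pickle

from sklearn.gaussian_process import GaussianProcessRegressor


class SimpleWorkloadGPR:
    def __init__(self, dataset):
        df = dataset.get_dataframe()
        knobs = dataset.get_tuning_knob_headers()
        metrics = [c for c in df.columns if c not in knobs + ['workload id']]
        self.models = {}
        for wid in dataset.get_workload_ids():
            wl_df = df[df['workload id'] == wid]
            X = wl_df[knobs].values
            for pm in metrics:
                # one GPR per (workload, metric), fit on that workload's knob settings
                gpr = GaussianProcessRegressor(normalize_y=True)
                gpr.fit(X, wl_df[pm].values)
                self.models[(wid, pm)] = gpr

    def get_model(self, wid, pm):
        return self.models[(wid, pm)]

    def pickle_models(self, path='gprs.pkl'):
        with open(path, 'wb') as f:
            pickle.dump(self.models, f)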
import numpy as np
import pandas as pd
# project-local: Dataset, DATASET_PATHS, WorkloadGPR, LOG, clear_wl_models


def main():
    LOG.debug('Clearing out all of the workload models.')
    clear_wl_models()

    dataset = Dataset(file_path=DATASET_PATHS['offline_workload'])
    pruned_metrics = Dataset.load_pruned_metrics()
    pruned_dataset = dataset.prune_columns(
        pruned_metrics + ['workload id'] + dataset.get_tuning_knob_headers())
    df = pruned_dataset.get_dataframe()

    # hold out the rows at positions 1, 3, ..., 19 of each workload for validation
    i = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
    workload_ids = pruned_dataset.get_workload_ids()
    validation_df = pd.concat(
        [df[df['workload id'] == wid].iloc[i] for wid in workload_ids])
    validation_idx = validation_df.index
    valid_dataset = Dataset(dataframe=validation_df)

    # everything not held out becomes the training set
    # (.loc, not .iloc: the index difference yields labels, not positions)
    train_df = df.loc[df.index.difference(validation_idx)]
    train_dataset = Dataset(dataframe=train_df)

    # input scaling is currently disabled
    # LOG.info("Fitting input scaler...")
    # scaler = StandardScaler()
    # scaler.fit(train_df[dataset.get_tuning_knob_headers()].values)
    scaler = None

    LOG.info("Training workload GPRs...")
    gprs = WorkloadGPR(dataset=train_dataset, scaler=scaler)

    LOG.info("Validating GPRs...")
    train = {}
    result = {}
    for pm in pruned_metrics:
        for wid in workload_ids:
            name = f"{pm}|{wid}"
            model = gprs.get_model(wid, pm)

            # training-set MAPE (disabled)
            # X = train_df[dataset.get_tuning_knob_headers()].values
            # X = scaler.transform(X)
            # y = train_df[pm].values
            # y_hat = model.predict(X)
            # mape = np.mean(np.abs((y - y_hat) / y)) * 100
            # train[name] = mape

            # validation-set MAPE for this (metric, workload) pair
            X = validation_df[dataset.get_tuning_knob_headers()].values
            if scaler is not None:
                X = scaler.transform(X)
            y = validation_df[pm].values
            y_hat = model.predict(X)
            mape = np.mean(np.abs((y - y_hat) / y)) * 100
            result[name] = mape
            # LOG.info('%s: %s', name, mape)

    # LOG.info('Training average MAPE: %s',
    #          np.array(list(train.values())).mean())
    LOG.info('Validation average MAPE: %s',
             np.array(list(result.values())).mean())
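# One caveat worth noting: the MAPE above divides by the raw metric values, so a
# single zero in the validation targets yields inf/NaN. A guarded variant (a
# sketch; safe_mape and its eps parameter are not part of the original code):
import numpy as np


def safe_mape(y, y_hat, eps=1e-12):
    """Mean absolute percentage error, guarding against zero ground truth."""
    y = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_hat, dtype=float)
    return np.mean(np.abs((y - y_hat) / np.maximum(np.abs(y), eps))) * 100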