def main():
    """Train GPR models on the offline workload dataset and pickle them.

    Only the offline workloads are used for training. The dataset is first
    reduced to the pruned metrics, the workload id, and the tunable knob
    columns; one GPR is then fit per (workload, metric) pair and the fitted
    models are persisted to disk.
    """
    # only training GPRs for offline loads
    dataset = Dataset(file_path=DATASET_PATHS['offline_workload'])

    # Restrict the dataset to the pruned metric headers plus the columns the
    # models need (workload id and the tuning knobs).
    pruned_metrics = Dataset.load_pruned_metrics()
    keep_columns = pruned_metrics + ['workload id'] + dataset.get_tuning_knob_headers()
    dataset = dataset.prune_columns(keep_columns)

    # Fit the GPRs and report how long it took.
    start = time()
    gprs = WorkloadGPR(dataset=dataset)
    LOG.info(f"Finished building GPRs in {round(time() - start)} seconds.")

    # Persist the fitted models.
    LOG.info("Pickling GPRs...")
    start = time()
    gprs.pickle_models()
    LOG.info(f"Finished pickling models in {round(time() - start)} seconds.")
def split_online_b(online_b_data, test_idx):
    """Split the online-B data into a primer set and an evaluation set.

    For each workload, the observation at positional index ``test_idx`` is
    held out for evaluation; every other observation goes into the primer set.

    Args:
        online_b_data: Dataset wrapping the online-B workload observations.
        test_idx: positional row index (within each workload) to hold out.

    Returns:
        Tuple of (primer Dataset, evaluation Dataset pruned to the workload id
        and tuning-knob columns, ground-truth 'latency' values of the held-out
        rows).
    """
    # Collect one-row slices and concatenate once at the end:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
    # repeated appends are quadratic.
    primer_frames = []
    eval_frames = []
    for wl_id in online_b_data.get_workload_ids():
        wl_df = online_b_data.get_specific_workload(wl_id).get_dataframe()
        for idx in range(wl_df.values.shape[0]):
            row = wl_df.iloc[idx:idx + 1]
            if idx == test_idx:
                eval_frames.append(row)
            else:
                primer_frames.append(row)

    # Fall back to an empty frame with the right columns when a partition
    # received no rows (mirrors the original empty-DataFrame initialization).
    columns = online_b_data.get_dataframe().columns
    primer_df = (pd.concat(primer_frames, ignore_index=True)
                 if primer_frames else pd.DataFrame(columns=columns))
    eval_df = (pd.concat(eval_frames, ignore_index=True)
               if eval_frames else pd.DataFrame(columns=columns))

    primer = Dataset(dataframe=primer_df)
    eval_ds = Dataset(dataframe=eval_df)  # renamed: avoid shadowing builtin eval
    # Capture the ground truth before the metric columns are pruned away.
    latency_gt = eval_ds.get_column_values('latency')
    eval_ds = eval_ds.prune_columns(
        ['workload id'] + eval_ds.get_tuning_knob_headers())
    return primer, eval_ds, latency_gt
def _build_models_from_dataset(self, dataset: Dataset, scaler=None):
    """
    Build all of the GPR models from scratch
    """
    df = dataset.get_dataframe()
    metric_headers = dataset.get_metric_headers()
    workload_ids = dataset.get_workload_ids()
    knob_headers = dataset.get_tuning_knob_headers()

    # One model per (workload, metric) pair.
    with tqdm(total=len(workload_ids) * len(metric_headers)) as pbar:
        for w in workload_ids:
            wl_rows = df[df['workload id'] == w]
            # The design matrix depends only on the workload, not the metric,
            # so it is built (and optionally scaled) once per workload.
            X = wl_rows[knob_headers].values
            if scaler is not None:
                X = scaler.transform(X)
            for m in metric_headers:
                y = wl_rows[m].values
                # Sanitize the metric name so it is safe in a pickle filename.
                m_file_name = (m.replace('_', '-')
                                .replace('/', '-')
                                .replace('%', '-'))

                # Hyperparameter choices follow
                # krasserm.github.io/2018/03/19/gaussian-processes#effect-of-kernel-parameters-and-noise-parameter
                restarts = 10
                kernel = ConstantKernel(10.0) * RBF(y.std())  # sigma_f, l
                alpha = 0.1  # sigma_y
                model = GaussianProcessRegressor(
                    kernel=kernel,
                    n_restarts_optimizer=restarts,
                    alpha=alpha,
                    normalize_y=True)
                model.fit(X, y)

                self.models[f"wl_{w}_{m_file_name}.pickle"] = model
                pbar.update(1)
def main():
    """Train workload GPRs on a train split and report validation MAPE.

    Clears any persisted workload models, splits the offline workload data
    into train/validation partitions (every other row per workload is held
    out), trains one GPR per (metric, workload) pair, and logs the average
    validation MAPE.
    """
    LOG.debug('Clearing out all of the workload models.')
    clear_wl_models()

    dataset = Dataset(file_path=DATASET_PATHS['offline_workload'])
    pruned_metrics = Dataset.load_pruned_metrics()
    pruned_dataset = dataset.prune_columns(
        pruned_metrics + ['workload id'] + dataset.get_tuning_knob_headers())
    df = pruned_dataset.get_dataframe()

    # Hold out the odd positions (1, 3, ..., 19) of each workload as
    # validation data; assumes each workload has at least 20 observations.
    holdout_positions = list(range(1, 20, 2))
    workload_ids = pruned_dataset.get_workload_ids()
    validation_df = pd.concat(
        [df[df['workload id'] == wid].iloc[holdout_positions]
         for wid in workload_ids])

    # BUG FIX: the original did df.iloc[df.index.difference(...)], feeding
    # index *labels* into positional .iloc — only accidentally correct when
    # the index is a default RangeIndex. Use label-based .loc instead.
    train_df = df.loc[df.index.difference(validation_df.index)]
    train_dataset = Dataset(dataframe=train_df)

    # Input scaling is currently disabled; supply a fitted scaler (e.g. a
    # StandardScaler fit on the training knob values) to re-enable it.
    scaler = None

    LOG.info("Training workload GPRs...")
    gprs = WorkloadGPR(dataset=train_dataset, scaler=scaler)

    LOG.info("Validating GPRs...")
    knob_headers = dataset.get_tuning_knob_headers()
    # The validation design matrix is identical for every model, so build
    # (and optionally scale) it once outside the loops.
    X = validation_df[knob_headers].values
    if scaler is not None:
        X = scaler.transform(X)

    result = {}
    for pm in pruned_metrics:
        # Ground truth depends only on the metric, not the workload.
        y = validation_df[pm].values
        for wid in workload_ids:
            model = gprs.get_model(wid, pm)
            y_hat = model.predict(X)
            # NOTE(review): MAPE divides by y — undefined if a metric value
            # is 0; the original had the same issue. Confirm metrics are
            # strictly positive.
            mape = np.mean(np.abs((y - y_hat) / y)) * 100
            result[f"{pm}|{wid}"] = mape

    LOG.info('Validation average MAPE: %s',
             np.array(list(result.values())).mean())