def map_workload(target_data):
    # Get the latest version of pipeline data that's been computed so far.
    latest_pipeline_run = PipelineRun.objects.get_latest()
    if target_data['bad']:
        assert target_data is not None
        return target_data
    assert latest_pipeline_run is not None

    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    target_workload = newest_result.workload
    X_columnlabels = np.array(target_data['X_columnlabels'])
    y_columnlabels = np.array(target_data['y_columnlabels'])

    # Find all pipeline data belonging to the latest version with the same
    # DBMS and hardware as the target
    pipeline_data = PipelineData.objects.filter(
        pipeline_run=latest_pipeline_run,
        workload__dbms=target_workload.dbms,
        workload__hardware=target_workload.hardware)

    # FIXME (dva): we should also compute the global (i.e., overall) ranked_knobs
    # and pruned metrics but we just use those from the first workload for now
    initialized = False
    global_ranked_knobs = None
    global_pruned_metrics = None
    ranked_knob_idxs = None
    pruned_metric_idxs = None

    # Compute workload mapping data for each unique workload
    unique_workloads = pipeline_data.values_list('workload', flat=True).distinct()
    assert len(unique_workloads) > 0
    workload_data = {}
    for unique_workload in unique_workloads:
        workload_obj = Workload.objects.get(pk=unique_workload)
        wkld_results = Result.objects.filter(workload=workload_obj)
        if not wkld_results.exists():
            # Delete any workload that no longer has results
            workload_obj.delete()
            continue

        # Load knob & metric data for this workload
        knob_data = load_data_helper(pipeline_data, unique_workload,
                                     PipelineTaskType.KNOB_DATA)
        metric_data = load_data_helper(pipeline_data, unique_workload,
                                       PipelineTaskType.METRIC_DATA)
        X_matrix = np.array(knob_data["data"])
        y_matrix = np.array(metric_data["data"])
        rowlabels = np.array(knob_data["rowlabels"])
        assert np.array_equal(rowlabels, metric_data["rowlabels"])

        if not initialized:
            # For now set ranked knobs & pruned metrics to be those computed
            # for the first workload
            global_ranked_knobs = load_data_helper(
                pipeline_data, unique_workload,
                PipelineTaskType.RANKED_KNOBS)[:IMPORTANT_KNOB_NUMBER]
            global_pruned_metrics = load_data_helper(
                pipeline_data, unique_workload, PipelineTaskType.PRUNED_METRICS)
            ranked_knob_idxs = [
                i for i in range(X_matrix.shape[1])
                if X_columnlabels[i] in global_ranked_knobs
            ]
            pruned_metric_idxs = [
                i for i in range(y_matrix.shape[1])
                if y_columnlabels[i] in global_pruned_metrics
            ]

            # Filter X & y columnlabels by top ranked_knobs & pruned_metrics
            X_columnlabels = X_columnlabels[ranked_knob_idxs]
            y_columnlabels = y_columnlabels[pruned_metric_idxs]
            initialized = True

        # Filter X & y matrices by top ranked_knobs & pruned_metrics
        X_matrix = X_matrix[:, ranked_knob_idxs]
        y_matrix = y_matrix[:, pruned_metric_idxs]

        # Combine duplicate rows (rows with same knob settings)
        X_matrix, y_matrix, rowlabels = DataUtil.combine_duplicate_rows(
            X_matrix, y_matrix, rowlabels)

        workload_data[unique_workload] = {
            'X_matrix': X_matrix,
            'y_matrix': y_matrix,
            'rowlabels': rowlabels,
        }

    assert len(workload_data) > 0

    # Stack all X & y matrices for preprocessing
    Xs = np.vstack([entry['X_matrix'] for entry in list(workload_data.values())])
    ys = np.vstack([entry['y_matrix'] for entry in list(workload_data.values())])

    # Scale the X & y values, then compute the deciles for each column in y
    X_scaler = StandardScaler(copy=False)
    X_scaler.fit(Xs)
    y_scaler = StandardScaler(copy=False)
    y_scaler.fit_transform(ys)
    y_binner = Bin(bin_start=1, axis=0)
    y_binner.fit(ys)
    del Xs
    del ys

    # Filter the target's X & y data by the ranked knobs & pruned metrics.
    X_target = target_data['X_matrix'][:, ranked_knob_idxs]
    y_target = target_data['y_matrix'][:, pruned_metric_idxs]

    # Now standardize the target's data and bin it by the deciles we just
    # calculated
    X_target = X_scaler.transform(X_target)
    y_target = y_scaler.transform(y_target)
    y_target = y_binner.transform(y_target)

    scores = {}
    for workload_id, workload_entry in list(workload_data.items()):
        predictions = np.empty_like(y_target)
        X_workload = workload_entry['X_matrix']
        X_scaled = X_scaler.transform(X_workload)
        y_workload = workload_entry['y_matrix']
        y_scaled = y_scaler.transform(y_workload)
        for j, y_col in enumerate(y_scaled.T):
            # Using this workload's data, train a Gaussian process model
            # and then predict the performance of each metric for each of
            # the knob configurations attempted so far by the target.
            y_col = y_col.reshape(-1, 1)
            model = GPRNP(length_scale=DEFAULT_LENGTH_SCALE,
                          magnitude=DEFAULT_MAGNITUDE,
                          max_train_size=MAX_TRAIN_SIZE,
                          batch_size=BATCH_SIZE)
            model.fit(X_scaled, y_col, ridge=DEFAULT_RIDGE)
            predictions[:, j] = model.predict(X_target).ypreds.ravel()
        # Bin each of the predicted metric columns by deciles and then
        # compute the score (i.e., distance) between the target workload
        # and each of the known workloads
        predictions = y_binner.transform(predictions)
        dists = np.sqrt(np.sum(np.square(np.subtract(predictions, y_target)),
                               axis=1))
        scores[workload_id] = np.mean(dists)

    # Find the best (minimum) score
    best_score = np.inf
    best_workload_id = None
    best_workload_name = None
    # scores_info = {workload_id: (workload_name, score)}
    scores_info = {}
    for workload_id, similarity_score in list(scores.items()):
        workload_name = Workload.objects.get(pk=workload_id).name
        if similarity_score < best_score:
            best_score = similarity_score
            best_workload_id = workload_id
            best_workload_name = workload_name
        scores_info[workload_id] = (workload_name, similarity_score)
    target_data['mapped_workload'] = (best_workload_id, best_workload_name,
                                      best_score)
    target_data['scores'] = scores_info
    return target_data

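# --- Illustration (not part of the original module) ---------------------
# A minimal, self-contained sketch of the similarity score computed above:
# standardized metric values are binned by deciles, then the score is the
# mean per-row Euclidean distance between the binned predictions and the
# binned target. `bin_by_deciles` is a hypothetical numpy-only stand-in for
# the project's `Bin(bin_start=1, axis=0)` preprocessor; the real class may
# differ in detail.
import numpy as np


def bin_by_deciles(reference, values, bin_start=1):
    """Map each column of `values` onto decile bins fit on `reference`."""
    # Decile edges per column, computed on the reference data.
    deciles = np.percentile(reference, np.arange(10, 100, 10), axis=0)
    binned = np.empty_like(values)
    for j in range(values.shape[1]):
        # np.digitize returns 0..9; shift so bins start at `bin_start`.
        binned[:, j] = np.digitize(values[:, j], deciles[:, j]) + bin_start
    return binned


def workload_distance(y_reference, y_predicted, y_target):
    """Mean Euclidean distance between decile-binned predictions and target."""
    preds_binned = bin_by_deciles(y_reference, y_predicted)
    target_binned = bin_by_deciles(y_reference, y_target)
    dists = np.sqrt(np.sum(np.square(preds_binned - target_binned), axis=1))
    return np.mean(dists)
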
def create_workload_mapping_data():
    agg_datas = PipelineResult.objects.filter(
        task_type=PipelineTaskType.AGGREGATED_DATA)
    dbmss = set([ad.dbms.pk for ad in agg_datas])
    hardwares = set([ad.hardware.pk for ad in agg_datas])

    for dbms_id, hw_id in itertools.product(dbmss, hardwares):
        data = PipelineResult.get_latest(dbms_id, hw_id,
                                         PipelineTaskType.AGGREGATED_DATA)
        file_info = JSONUtil.loads(data.value)
        cluster_data = OrderedDict()
        for cluster, path in file_info['data'].items():
            compressed_data = np.load(path)
            X_matrix = compressed_data['X_matrix']
            y_matrix = compressed_data['y_matrix']
            X_columnlabels = compressed_data['X_columnlabels']
            y_columnlabels = compressed_data['y_columnlabels']
            rowlabels = compressed_data['rowlabels']

            # Filter metrics and knobs
            ranked_knobs = JSONUtil.loads(
                PipelineResult.get_latest(
                    dbms_id, hw_id,
                    PipelineTaskType.RANKED_KNOBS).value)[:10]  # FIXME
            pruned_metrics = JSONUtil.loads(
                PipelineResult.get_latest(
                    dbms_id, hw_id, PipelineTaskType.PRUNED_METRICS).value)
            knob_idxs = [
                i for i in range(X_matrix.shape[1])
                if X_columnlabels[i] in ranked_knobs
            ]
            metric_idxs = [
                i for i in range(y_matrix.shape[1])
                if y_columnlabels[i] in pruned_metrics
            ]
            X_matrix = X_matrix[:, knob_idxs]
            X_columnlabels = X_columnlabels[knob_idxs]
            y_matrix = y_matrix[:, metric_idxs]
            y_columnlabels = y_columnlabels[metric_idxs]

            # Combine duplicate rows
            X_matrix, y_matrix, rowlabels = DataUtil.combine_duplicate_rows(
                X_matrix, y_matrix, rowlabels)

            cluster_data[cluster] = {
                'X_matrix': X_matrix,
                'y_matrix': y_matrix,
                'X_columnlabels': X_columnlabels,
                'y_columnlabels': y_columnlabels,
                'rowlabels': rowlabels,
            }

        Xs = np.vstack([entry['X_matrix'] for entry in cluster_data.values()])
        ys = np.vstack([entry['y_matrix'] for entry in cluster_data.values()])

        X_scaler = StandardScaler(copy=False)
        X_scaler.fit(Xs)
        y_scaler = StandardScaler(copy=False)
        y_scaler.fit_transform(ys)
        y_binner = Bin(axis=0)
        y_binner.fit(ys)
        del Xs
        del ys

        task_name = PipelineTaskType.TYPE_NAMES[
            PipelineTaskType.WORKLOAD_MAPPING_DATA].replace(' ', '').upper()
        timestamp = data.creation_timestamp
        tsf = timestamp.strftime("%Y%m%d-%H%M%S")
        savepaths = {}
        for cluster, entry in cluster_data.items():
            # With copy=False the scalers standardize the matrices in place,
            # so the transformed data is what gets saved below.
            X_scaler.transform(entry['X_matrix'])
            y_scaler.transform(entry['y_matrix'])
            fname = '{}_{}_{}_{}_{}.npz'.format(task_name, dbms_id, hw_id,
                                                cluster, tsf)
            savepath = os.path.join(PIPELINE_DIR, fname)
            savepaths[cluster] = savepath
            np.savez_compressed(savepath, **entry)

        X_scaler_path = os.path.join(
            PIPELINE_DIR,
            '{}_XSCALER_{}_{}_{}.npz'.format(task_name, dbms_id, hw_id, tsf))
        np.savez_compressed(X_scaler_path, mean=X_scaler.mean_,
                            scale=X_scaler.scale_)
        y_scaler_path = os.path.join(
            PIPELINE_DIR,
            '{}_YSCALER_{}_{}_{}.npz'.format(task_name, dbms_id, hw_id, tsf))
        np.savez_compressed(y_scaler_path, mean=y_scaler.mean_,
                            scale=y_scaler.scale_)
        y_deciles_path = os.path.join(
            PIPELINE_DIR,
            '{}_YDECILES_{}_{}_{}.npz'.format(task_name, dbms_id, hw_id, tsf))
        np.savez_compressed(y_deciles_path, deciles=y_binner.deciles_)

        value = {
            'data': savepaths,
            'X_scaler': X_scaler_path,
            'y_scaler': y_scaler_path,
            'y_deciles': y_deciles_path,
            'X_columnlabels':
                list(cluster_data.values())[0]['X_columnlabels'].tolist(),
            'y_columnlabels':
                list(cluster_data.values())[0]['y_columnlabels'].tolist(),
        }
        new_res = PipelineResult()
        new_res.dbms = DBMSCatalog.objects.get(pk=dbms_id)
        new_res.hardware = Hardware.objects.get(pk=hw_id)
        new_res.creation_timestamp = timestamp
        new_res.task_type = PipelineTaskType.WORKLOAD_MAPPING_DATA
        new_res.value = JSONUtil.dumps(value, pprint=True)
        new_res.save()

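# --- Illustration (not part of the original module) ---------------------
# A hedged sketch of how a consumer might re-apply the scaler and decile
# artifacts persisted above to a new target's metric data. The paths are
# placeholders; it assumes the YSCALER .npz holds 'mean' and 'scale' and the
# YDECILES .npz holds per-column decile edges under 'deciles' with shape
# (9, n_metrics), matching the np.savez_compressed calls above.
import numpy as np


def transform_target_metrics(y_target, y_scaler_path, y_deciles_path):
    scaler = np.load(y_scaler_path)
    deciles = np.load(y_deciles_path)['deciles']
    # Standardize with the stored StandardScaler parameters.
    y_scaled = (y_target - scaler['mean']) / scaler['scale']
    # Bin each metric column by the stored decile edges.
    binned = np.empty_like(y_scaled)
    for j in range(y_scaled.shape[1]):
        binned[:, j] = np.digitize(y_scaled[:, j], deciles[:, j])
    return binned
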
def map_workload(map_workload_input):
    start_ts = time.time()
    target_data, algorithm = map_workload_input

    if target_data['bad']:
        assert target_data is not None
        target_data['pipeline_run'] = None
        LOG.debug('%s: Skipping workload mapping.\n\ndata=%s\n',
                  AlgorithmType.name(algorithm),
                  JSONUtil.dumps(target_data, pprint=True))
        return target_data, algorithm

    # Get the latest version of pipeline data that's been computed so far.
    latest_pipeline_run = PipelineRun.objects.get_latest()
    assert latest_pipeline_run is not None
    target_data['pipeline_run'] = latest_pipeline_run.pk

    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    session = newest_result.session
    params = JSONUtil.loads(session.hyperparameters)
    target_workload = newest_result.workload
    X_columnlabels = np.array(target_data['X_columnlabels'])
    y_columnlabels = np.array(target_data['y_columnlabels'])

    # Find all pipeline data belonging to the latest version with the same
    # DBMS and hardware as the target
    pipeline_data = PipelineData.objects.filter(
        pipeline_run=latest_pipeline_run,
        workload__dbms=target_workload.dbms,
        workload__hardware=target_workload.hardware,
        workload__project=target_workload.project)

    # FIXME (dva): we should also compute the global (i.e., overall) ranked_knobs
    # and pruned metrics but we just use those from the first workload for now
    initialized = False
    global_ranked_knobs = None
    global_pruned_metrics = None
    ranked_knob_idxs = None
    pruned_metric_idxs = None

    unique_workloads = pipeline_data.values_list('workload', flat=True).distinct()
    workload_data = {}
    # Compute workload mapping data for each unique workload
    for unique_workload in unique_workloads:
        workload_obj = Workload.objects.get(pk=unique_workload)
        wkld_results = Result.objects.filter(workload=workload_obj)
        if not wkld_results.exists():
            # Delete any workload that no longer has results
            workload_obj.delete()
            continue

        # Load knob & metric data for this workload
        knob_data = load_data_helper(pipeline_data, unique_workload,
                                     PipelineTaskType.KNOB_DATA)
        knob_data["data"], knob_data["columnlabels"] = clean_knob_data(
            knob_data["data"], knob_data["columnlabels"], newest_result.session)
        metric_data = load_data_helper(pipeline_data, unique_workload,
                                       PipelineTaskType.METRIC_DATA)
        X_matrix = np.array(knob_data["data"])
        y_matrix = np.array(metric_data["data"])
        rowlabels = np.array(knob_data["rowlabels"])
        assert np.array_equal(rowlabels, metric_data["rowlabels"])

        if not initialized:
            # For now set ranked knobs & pruned metrics to be those computed
            # for the first workload
            global_ranked_knobs = load_data_helper(
                pipeline_data, unique_workload,
                PipelineTaskType.RANKED_KNOBS)[:params['IMPORTANT_KNOB_NUMBER']]
            global_pruned_metrics = load_data_helper(
                pipeline_data, unique_workload, PipelineTaskType.PRUNED_METRICS)
            ranked_knob_idxs = [
                i for i in range(X_matrix.shape[1])
                if X_columnlabels[i] in global_ranked_knobs
            ]
            pruned_metric_idxs = [
                i for i in range(y_matrix.shape[1])
                if y_columnlabels[i] in global_pruned_metrics
            ]

            # Filter X & y columnlabels by top ranked_knobs & pruned_metrics
            X_columnlabels = X_columnlabels[ranked_knob_idxs]
            y_columnlabels = y_columnlabels[pruned_metric_idxs]
            initialized = True

        # Filter X & y matrices by top ranked_knobs & pruned_metrics
        X_matrix = X_matrix[:, ranked_knob_idxs]
        y_matrix = y_matrix[:, pruned_metric_idxs]

        # Combine duplicate rows (rows with same knob settings)
        X_matrix, y_matrix, rowlabels = DataUtil.combine_duplicate_rows(
            X_matrix, y_matrix, rowlabels)

        workload_data[unique_workload] = {
            'X_matrix': X_matrix,
            'y_matrix': y_matrix,
            'rowlabels': rowlabels,
        }

    if len(workload_data) == 0:
        # The background task that aggregates the data has not finished running yet
        target_data.update(mapped_workload=None, scores=None)
        LOG.debug('%s: Skipping workload mapping because there is no parsed workload.\n',
                  AlgorithmType.name(algorithm))
        return target_data, algorithm

    # Stack all X & y matrices for preprocessing
    Xs = np.vstack([entry['X_matrix'] for entry in list(workload_data.values())])
    ys = np.vstack([entry['y_matrix'] for entry in list(workload_data.values())])

    # Scale the X & y values, then compute the deciles for each column in y
    X_scaler = StandardScaler(copy=False)
    X_scaler.fit(Xs)
    y_scaler = StandardScaler(copy=False)
    y_scaler.fit_transform(ys)
    y_binner = Bin(bin_start=1, axis=0)
    y_binner.fit(ys)
    del Xs
    del ys

    # Filter the target's X & y data by the ranked knobs & pruned metrics.
    X_target = target_data['X_matrix'][:, ranked_knob_idxs]
    y_target = target_data['y_matrix'][:, pruned_metric_idxs]

    # Now standardize the target's data and bin it by the deciles we just
    # calculated
    X_target = X_scaler.transform(X_target)
    y_target = y_scaler.transform(y_target)
    y_target = y_binner.transform(y_target)

    scores = {}
    for workload_id, workload_entry in list(workload_data.items()):
        predictions = np.empty_like(y_target)
        X_workload = workload_entry['X_matrix']
        X_scaled = X_scaler.transform(X_workload)
        y_workload = workload_entry['y_matrix']
        y_scaled = y_scaler.transform(y_workload)
        for j, y_col in enumerate(y_scaled.T):
            # Using this workload's data, train a Gaussian process model
            # and then predict the performance of each metric for each of
            # the knob configurations attempted so far by the target.
            y_col = y_col.reshape(-1, 1)
            if params['GPR_USE_GPFLOW']:
                model_kwargs = {
                    'lengthscales': params['GPR_LENGTH_SCALE'],
                    'variance': params['GPR_MAGNITUDE'],
                    'noise_variance': params['GPR_RIDGE'],
                }
                tf.reset_default_graph()
                graph = tf.get_default_graph()
                gpflow.reset_default_session(graph=graph)
                m = gpr_models.create_model(params['GPR_MODEL_NAME'], X=X_scaled,
                                            y=y_col, **model_kwargs)
                gpr_result = gpflow_predict(m.model, X_target)
            else:
                model = GPRNP(length_scale=params['GPR_LENGTH_SCALE'],
                              magnitude=params['GPR_MAGNITUDE'],
                              max_train_size=params['GPR_MAX_TRAIN_SIZE'],
                              batch_size=params['GPR_BATCH_SIZE'])
                model.fit(X_scaled, y_col, ridge=params['GPR_RIDGE'])
                gpr_result = model.predict(X_target)
            predictions[:, j] = gpr_result.ypreds.ravel()
        # Bin each of the predicted metric columns by deciles and then
        # compute the score (i.e., distance) between the target workload
        # and each of the known workloads
        predictions = y_binner.transform(predictions)
        dists = np.sqrt(np.sum(np.square(np.subtract(predictions, y_target)),
                               axis=1))
        scores[workload_id] = np.mean(dists)

    # Find the best (minimum) score
    best_score = np.inf
    best_workload_id = None
    best_workload_name = None
    scores_info = {}
    for workload_id, similarity_score in list(scores.items()):
        workload_name = Workload.objects.get(pk=workload_id).name
        if similarity_score < best_score:
            best_score = similarity_score
            best_workload_id = workload_id
            best_workload_name = workload_name
        scores_info[workload_id] = (workload_name, similarity_score)
    target_data.update(mapped_workload=(best_workload_id, best_workload_name,
                                        best_score),
                       scores=scores_info)
    LOG.debug('%s: Finished mapping the workload.\n\ndata=%s\n',
              AlgorithmType.name(algorithm),
              JSONUtil.dumps(target_data, pprint=True))
    save_execution_time(start_ts, "map_workload", newest_result)
    return target_data, algorithm

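# --- Illustration (not part of the original module) ---------------------
# A sketch of the calling contract for this version of map_workload(),
# collected from the fields the function actually reads. The concrete
# values are placeholders; in the real pipeline the (target_data, algorithm)
# tuple is produced by the preceding aggregation step and the hyperparameters
# come from the session.
import numpy as np

example_target_data = {
    'bad': False,                  # when True, the mapping is skipped entirely
    'newest_result_id': 42,        # pk of the Result being tuned (placeholder)
    'X_matrix': np.empty((0, 0)),  # knob configurations tried so far by the target
    'y_matrix': np.empty((0, 0)),  # observed metrics for those configurations
    'X_columnlabels': [],          # knob names, aligned with X_matrix columns
    'y_columnlabels': [],          # metric names, aligned with y_matrix columns
}
# Hyperparameters read from session.hyperparameters (JSON):
#   IMPORTANT_KNOB_NUMBER, GPR_USE_GPFLOW, GPR_MODEL_NAME, GPR_LENGTH_SCALE,
#   GPR_MAGNITUDE, GPR_RIDGE, GPR_MAX_TRAIN_SIZE, GPR_BATCH_SIZE
#
# mapped_data, algorithm = map_workload((example_target_data, algorithm))
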
def map_workload(target_data):
    # Get the latest version of pipeline data that's been computed so far.
    latest_pipeline_run = PipelineRun.objects.get_latest()
    if target_data['bad']:
        assert target_data is not None
        return target_data
    assert latest_pipeline_run is not None

    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    target_workload = newest_result.workload
    X_columnlabels = np.array(target_data['X_columnlabels'])
    y_columnlabels = np.array(target_data['y_columnlabels'])

    # Find all pipeline data belonging to the latest version with the same
    # DBMS and hardware as the target
    pipeline_data = PipelineData.objects.filter(
        pipeline_run=latest_pipeline_run,
        workload__dbms=target_workload.dbms,
        workload__hardware=target_workload.hardware)

    # FIXME (dva): we should also compute the global (i.e., overall) ranked_knobs
    # and pruned metrics but we just use those from the first workload for now
    initialized = False
    global_ranked_knobs = None
    global_pruned_metrics = None
    ranked_knob_idxs = None
    pruned_metric_idxs = None

    # Compute workload mapping data for each unique workload
    unique_workloads = pipeline_data.values_list('workload', flat=True).distinct()
    assert len(unique_workloads) > 0
    workload_data = {}
    for unique_workload in unique_workloads:
        # Load knob & metric data for this workload
        knob_data = load_data_helper(pipeline_data, unique_workload,
                                     PipelineTaskType.KNOB_DATA)
        metric_data = load_data_helper(pipeline_data, unique_workload,
                                       PipelineTaskType.METRIC_DATA)
        X_matrix = np.array(knob_data["data"])
        y_matrix = np.array(metric_data["data"])
        rowlabels = np.array(knob_data["rowlabels"])
        assert np.array_equal(rowlabels, metric_data["rowlabels"])

        if not initialized:
            # For now set ranked knobs & pruned metrics to be those computed
            # for the first workload
            global_ranked_knobs = load_data_helper(
                pipeline_data, unique_workload,
                PipelineTaskType.RANKED_KNOBS)[:IMPORTANT_KNOB_NUMBER]
            global_pruned_metrics = load_data_helper(
                pipeline_data, unique_workload, PipelineTaskType.PRUNED_METRICS)
            ranked_knob_idxs = [
                i for i in range(X_matrix.shape[1])
                if X_columnlabels[i] in global_ranked_knobs
            ]
            pruned_metric_idxs = [
                i for i in range(y_matrix.shape[1])
                if y_columnlabels[i] in global_pruned_metrics
            ]

            # Filter X & y columnlabels by top ranked_knobs & pruned_metrics
            X_columnlabels = X_columnlabels[ranked_knob_idxs]
            y_columnlabels = y_columnlabels[pruned_metric_idxs]
            initialized = True

        # Filter X & y matrices by top ranked_knobs & pruned_metrics
        X_matrix = X_matrix[:, ranked_knob_idxs]
        y_matrix = y_matrix[:, pruned_metric_idxs]

        # Combine duplicate rows (rows with same knob settings)
        X_matrix, y_matrix, rowlabels = DataUtil.combine_duplicate_rows(
            X_matrix, y_matrix, rowlabels)

        workload_data[unique_workload] = {
            'X_matrix': X_matrix,
            'y_matrix': y_matrix,
            'rowlabels': rowlabels,
        }

    # Stack all X & y matrices for preprocessing
    Xs = np.vstack([entry['X_matrix'] for entry in list(workload_data.values())])
    ys = np.vstack([entry['y_matrix'] for entry in list(workload_data.values())])

    # Scale the X & y values, then compute the deciles for each column in y
    X_scaler = StandardScaler(copy=False)
    X_scaler.fit(Xs)
    y_scaler = StandardScaler(copy=False)
    y_scaler.fit_transform(ys)
    y_binner = Bin(bin_start=1, axis=0)
    y_binner.fit(ys)
    del Xs
    del ys

    # Filter the target's X & y data by the ranked knobs & pruned metrics.
    X_target = target_data['X_matrix'][:, ranked_knob_idxs]
    y_target = target_data['y_matrix'][:, pruned_metric_idxs]

    # Now standardize the target's data and bin it by the deciles we just
    # calculated
    X_target = X_scaler.transform(X_target)
    y_target = y_scaler.transform(y_target)
    y_target = y_binner.transform(y_target)

    scores = {}
    for workload_id, workload_entry in list(workload_data.items()):
        predictions = np.empty_like(y_target)
        X_workload = workload_entry['X_matrix']
        X_scaled = X_scaler.transform(X_workload)
        y_workload = workload_entry['y_matrix']
        y_scaled = y_scaler.transform(y_workload)
        for j, y_col in enumerate(y_scaled.T):
            # Using this workload's data, train a Gaussian process model
            # and then predict the performance of each metric for each of
            # the knob configurations attempted so far by the target.
            y_col = y_col.reshape(-1, 1)
            model = GPRNP(length_scale=DEFAULT_LENGTH_SCALE,
                          magnitude=DEFAULT_MAGNITUDE,
                          max_train_size=MAX_TRAIN_SIZE,
                          batch_size=BATCH_SIZE)
            model.fit(X_scaled, y_col, ridge=DEFAULT_RIDGE)
            predictions[:, j] = model.predict(X_target).ypreds.ravel()
        # Bin each of the predicted metric columns by deciles and then
        # compute the score (i.e., distance) between the target workload
        # and each of the known workloads
        predictions = y_binner.transform(predictions)
        dists = np.sqrt(np.sum(np.square(np.subtract(predictions, y_target)),
                               axis=1))
        scores[workload_id] = np.mean(dists)

    # Find the best (minimum) score
    best_score = np.inf
    best_workload_id = None
    for workload_id, similarity_score in list(scores.items()):
        if similarity_score < best_score:
            best_score = similarity_score
            best_workload_id = workload_id
    target_data['mapped_workload'] = (best_workload_id, best_score)
    target_data['scores'] = scores
    return target_data

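# --- Illustration (not part of the original module) ---------------------
# A minimal sketch of the per-metric Gaussian-process step used in the
# mapping loops above: for every metric column of a known workload, fit a GP
# on (scaled knob config -> scaled metric) and predict that metric at the
# target's knob configurations. sklearn's GaussianProcessRegressor is a
# generic stand-in for the project's GPRNP / GPFlow models; the kernel and
# its parameters are illustrative only.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel


def predict_target_metrics(X_workload_scaled, y_workload_scaled, X_target_scaled):
    """Return a (n_target_configs, n_metrics) matrix of GP predictions."""
    n_target = X_target_scaled.shape[0]
    n_metrics = y_workload_scaled.shape[1]
    predictions = np.empty((n_target, n_metrics))
    for j in range(n_metrics):
        # One independent GP per metric column, mirroring the loops above.
        kernel = (ConstantKernel(1.0) * RBF(length_scale=1.0)
                  + WhiteKernel(noise_level=1.0))
        model = GaussianProcessRegressor(kernel=kernel, normalize_y=False)
        model.fit(X_workload_scaled, y_workload_scaled[:, j])
        predictions[:, j] = model.predict(X_target_scaled)
    return predictions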