Code example #1
File: async_tasks.py  Project: xixicat/ottertune
import os

import numpy as np
from django.utils.timezone import now  # assumed source of now()

# Project-internal names used below (models and utilities from the
# surrounding ottertune code, not reproduced in this excerpt):
# WorkloadCluster, ResultData, PipelineResult, DBMSCatalog, Hardware,
# JSONUtil, DataUtil, PipelineTaskType, PIPELINE_DIR.


def aggregate_results():
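    """Aggregate the results of every non-default workload cluster.

    For each cluster with at least two results, build knob and metric
    matrices with DataUtil.aggregate_data, save them to one compressed
    .npz file per cluster, and record the file paths in a new
    PipelineResult of type AGGREGATED_DATA.
    """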
    unique_clusters = WorkloadCluster.objects.all()
    unique_clusters = [c for c in unique_clusters if c.isdefault is False]
    all_data = {}
    all_labels = {}
    for cluster in unique_clusters:
        results = ResultData.objects.filter(cluster=cluster)
        if len(results) < 2:
            continue
        if cluster.dbms.pk not in all_labels:
            knob_labels = np.asarray(
                sorted(JSONUtil.loads(results[0].param_data).keys()))
            metric_labels = np.asarray(
                sorted(JSONUtil.loads(results[0].metric_data).keys()))
            all_labels[cluster.dbms.pk] = (knob_labels, metric_labels)
        else:
            knob_labels, metric_labels = all_labels[cluster.dbms.pk]
        entry = DataUtil.aggregate_data(results, knob_labels, metric_labels)
        key = (cluster.dbms.pk, cluster.hardware.pk)
        if key not in all_data:
            all_data[key] = {}
        all_data[key][cluster.pk] = entry

    # Use a single shared timestamp for every file written in this pass.
    ts = now()
    tsf = ts.strftime("%Y%m%d-%H%M%S")
    for (dbkey, hwkey), cluster_data in all_data.items():
        task_name = PipelineTaskType.TYPE_NAMES[
            PipelineTaskType.AGGREGATED_DATA].replace(' ', '').upper()
        savepaths = {}
        for clusterkey, entry in cluster_data.items():
            fname = '{}_{}_{}_{}_{}.npz'.format(task_name, dbkey, hwkey,
                                                clusterkey, tsf)
            savepath = os.path.join(PIPELINE_DIR, fname)
            savepaths[clusterkey] = savepath
            np.savez_compressed(savepath, **entry)

        value = {'data': savepaths}

        new_res = PipelineResult()
        new_res.dbms = DBMSCatalog.objects.get(pk=dbkey)
        new_res.hardware = Hardware.objects.get(pk=hwkey)
        new_res.creation_timestamp = ts
        new_res.task_type = PipelineTaskType.AGGREGATED_DATA
        new_res.value = JSONUtil.dumps(value)
        new_res.save()
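For reference, a downstream task can recover a cluster's aggregated matrices from the paths recorded in PipelineResult.value. The sketch below is not part of async_tasks.py; the helper name is hypothetical, and it relies only on the keys that code example #2 reads back from these files (X_matrix, y_matrix, X_columnlabels, y_columnlabels, rowlabels).

import numpy as np

def load_aggregated_cluster(savepath):
    # Read back the arrays that aggregate_results() wrote with
    # np.savez_compressed.
    with np.load(savepath) as archive:
        return {name: archive[name] for name in archive.files}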
Code example #2
File: async_tasks.py  Project: xixicat/ottertune
import itertools
import os
from collections import OrderedDict

import numpy as np
from sklearn.preprocessing import StandardScaler

# Project-internal names used below (not reproduced in this excerpt):
# PipelineResult, DBMSCatalog, Hardware, JSONUtil, DataUtil,
# PipelineTaskType, PIPELINE_DIR, and the Bin decile transformer.


def create_workload_mapping_data():
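    """Build workload-mapping training data per (DBMS, hardware) pair.

    From the latest AGGREGATED_DATA result, prune each cluster's matrices
    down to the top ranked knobs and the pruned metrics, merge duplicate
    rows, fit one StandardScaler for knobs and one for metrics (plus a
    decile binner) across all clusters, write the scaled per-cluster data
    and the scaler parameters to .npz files, and record everything in a
    WORKLOAD_MAPPING_DATA PipelineResult.
    """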
    agg_datas = PipelineResult.objects.filter(
        task_type=PipelineTaskType.AGGREGATED_DATA)
    dbmss = {ad.dbms.pk for ad in agg_datas}
    hardwares = {ad.hardware.pk for ad in agg_datas}

    for dbms_id, hw_id in itertools.product(dbmss, hardwares):
        data = PipelineResult.get_latest(dbms_id, hw_id,
                                         PipelineTaskType.AGGREGATED_DATA)
        file_info = JSONUtil.loads(data.value)
        cluster_data = OrderedDict()
        for cluster, path in file_info['data'].items():
            compressed_data = np.load(path)
            X_matrix = compressed_data['X_matrix']
            y_matrix = compressed_data['y_matrix']
            X_columnlabels = compressed_data['X_columnlabels']
            y_columnlabels = compressed_data['y_columnlabels']
            rowlabels = compressed_data['rowlabels']

            # Filter metrics and knobs
            ranked_knobs = JSONUtil.loads(
                PipelineResult.get_latest(
                    dbms_id, hw_id,
                    PipelineTaskType.RANKED_KNOBS).value)[:10]  # FIXME
            pruned_metrics = JSONUtil.loads(
                PipelineResult.get_latest(
                    dbms_id, hw_id, PipelineTaskType.PRUNED_METRICS).value)
            knob_idxs = [
                i for i in range(X_matrix.shape[1])
                if X_columnlabels[i] in ranked_knobs
            ]
            metric_idxs = [
                i for i in range(y_matrix.shape[1])
                if y_columnlabels[i] in pruned_metrics
            ]
            X_matrix = X_matrix[:, knob_idxs]
            X_columnlabels = X_columnlabels[knob_idxs]
            y_matrix = y_matrix[:, metric_idxs]
            y_columnlabels = y_columnlabels[metric_idxs]

            # Combine duplicate rows
            X_matrix, y_matrix, rowlabels = DataUtil.combine_duplicate_rows(
                X_matrix, y_matrix, rowlabels)
            cluster_data[cluster] = {
                'X_matrix': X_matrix,
                'y_matrix': y_matrix,
                'X_columnlabels': X_columnlabels,
                'y_columnlabels': y_columnlabels,
                'rowlabels': rowlabels,
            }

        # Stack every cluster's rows so the scalers are fit across the
        # whole (DBMS, hardware) workload rather than per cluster.
        Xs = np.vstack([entry['X_matrix'] for entry in cluster_data.values()])
        ys = np.vstack([entry['y_matrix'] for entry in cluster_data.values()])

        # With copy=False the scalers work in place: X_scaler is only fit
        # here, while fit_transform() standardizes ys so the decile binner
        # is fit on scaled metric values.
        X_scaler = StandardScaler(copy=False)
        X_scaler.fit(Xs)
        y_scaler = StandardScaler(copy=False)
        y_scaler.fit_transform(ys)
        y_binner = Bin(axis=0)
        y_binner.fit(ys)
        del Xs
        del ys

        task_name = PipelineTaskType.TYPE_NAMES[
            PipelineTaskType.WORKLOAD_MAPPING_DATA].replace(' ', '').upper()
        timestamp = data.creation_timestamp
        tsf = timestamp.strftime("%Y%m%d-%H%M%S")
        savepaths = {}
        for cluster, entry in cluster_data.items():
            # copy=False means these transform() calls scale the matrices
            # in place before they are saved.
            X_scaler.transform(entry['X_matrix'])
            y_scaler.transform(entry['y_matrix'])
            fname = '{}_{}_{}_{}_{}.npz'.format(task_name, dbms_id, hw_id,
                                                cluster, tsf)
            savepath = os.path.join(PIPELINE_DIR, fname)
            savepaths[cluster] = savepath
            np.savez_compressed(savepath, **entry)

        # Persist the fitted scaler parameters and metric deciles so later
        # stages can reproduce the transforms without refitting.
        X_scaler_path = os.path.join(
            PIPELINE_DIR,
            '{}_XSCALER_{}_{}_{}.npz'.format(task_name, dbms_id, hw_id, tsf))
        np.savez_compressed(X_scaler_path,
                            mean=X_scaler.mean_,
                            scale=X_scaler.scale_)
        y_scaler_path = os.path.join(
            PIPELINE_DIR,
            '{}_YSCALER_{}_{}_{}.npz'.format(task_name, dbms_id, hw_id, tsf))
        np.savez_compressed(y_scaler_path,
                            mean=y_scaler.mean_,
                            scale=y_scaler.scale_)
        y_deciles_path = os.path.join(
            PIPELINE_DIR,
            '{}_YDECILES_{}_{}_{}.npz'.format(task_name, dbms_id, hw_id, tsf))
        np.savez_compressed(y_deciles_path, deciles=y_binner.deciles_)

        # Every cluster shares the same column labels after filtering, so
        # take them from the first entry.
        first_entry = next(iter(cluster_data.values()))
        value = {
            'data': savepaths,
            'X_scaler': X_scaler_path,
            'y_scaler': y_scaler_path,
            'y_deciles': y_deciles_path,
            'X_columnlabels': first_entry['X_columnlabels'].tolist(),
            'y_columnlabels': first_entry['y_columnlabels'].tolist(),
        }

        new_res = PipelineResult()
        new_res.dbms = DBMSCatalog.objects.get(pk=dbms_id)
        new_res.hardware = Hardware.objects.get(pk=hw_id)
        new_res.creation_timestamp = timestamp
        new_res.task_type = PipelineTaskType.WORKLOAD_MAPPING_DATA
        new_res.value = JSONUtil.dumps(value, pprint=True)
        new_res.save()
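Because only the fitted parameters (mean and scale) are saved, a later stage can reapply the standardization without refitting. Below is a minimal sketch under that assumption; the helper name is hypothetical, and it mirrors StandardScaler's transform, (x - mean_) / scale_.

import numpy as np

def apply_saved_scaler(matrix, scaler_path):
    # Equivalent to StandardScaler.transform with the saved parameters.
    with np.load(scaler_path) as params:
        return (matrix - params['mean']) / params['scale']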