Beispiel #1
0
def configuration_recommendation_ddpg(result_info):  # pylint: disable=invalid-name
    LOG.info('Use ddpg to recommend configuration')
    result_id = result_info['newest_result_id']
    result = Result.objects.filter(pk=result_id)
    session = Result.objects.get(pk=result_id).session
    agg_data = DataUtil.aggregate_data(result)
    metric_data = agg_data['y_matrix'].flatten()
    cleaned_agg_data = clean_knob_data(agg_data['X_matrix'], agg_data['X_columnlabels'],
                                       session)
    knob_labels = np.array(cleaned_agg_data[1]).flatten()
    knob_num = len(knob_labels)
    metric_num = len(metric_data)

    ddpg = DDPG(n_actions=knob_num, n_states=metric_num, alr=ACTOR_LEARNING_RATE,
                clr=CRITIC_LEARNING_RATE, gamma=GAMMA, batch_size=DDPG_BATCH_SIZE, tau=TAU)
    if session.ddpg_actor_model is not None and session.ddpg_critic_model is not None:
        ddpg.set_model(session.ddpg_actor_model, session.ddpg_critic_model)
    if session.ddpg_reply_memory is not None:
        ddpg.replay_memory.set(session.ddpg_reply_memory)
    knob_data = ddpg.choose_action(metric_data)
    LOG.info('recommended knob: %s', knob_data)

    knob_bounds = np.vstack(DataUtil.get_knob_bounds(knob_labels, session))
    knob_data = MinMaxScaler().fit(knob_bounds).inverse_transform(knob_data.reshape(1, -1))[0]
    conf_map = {k: knob_data[i] for i, k in enumerate(knob_labels)}
    conf_map_res = {}
    conf_map_res['status'] = 'good'
    conf_map_res['result_id'] = result_id
    conf_map_res['recommendation'] = conf_map
    conf_map_res['info'] = 'INFO: ddpg'
    for k in knob_labels:
        LOG.info('%s: %f', k, conf_map[k])
    return conf_map_res
Beispiel #2
0
def aggregate_target_results(result_id):
    # Check that we've completed the background tasks at least once. We need
    # this data in order to make a configuration recommendation (until we
    # implement a sampling technique to generate new training data).
    latest_pipeline_run = PipelineRun.objects.get_latest()

    if latest_pipeline_run is None:
        result = Result.objects.filter(pk=result_id)
        knobs_ = KnobCatalog.objects.filter(dbms=result[0].dbms, tunable=True)
        knobs_catalog = {k.name: k for k in knobs_}
        knobs = {k: v for k, v in
                 list(knobs_catalog.items())}
        # generate a config randomly
        random_knob_result = gen_random_data(knobs)
        agg_data = DataUtil.aggregate_data(result)
        agg_data['newest_result_id'] = result_id
        agg_data['bad'] = True
        agg_data['config_recommend'] = random_knob_result
        return agg_data

    # Aggregate all knob config results tried by the target so far in this
    # tuning session and this tuning workload.
    newest_result = Result.objects.get(pk=result_id)
    target_results = Result.objects.filter(session=newest_result.session,
                                           dbms=newest_result.dbms,
                                           workload=newest_result.workload)
    if len(target_results) == 0:
        raise Exception('Cannot find any results for session_id={}, dbms_id={}'
                        .format(newest_result.session, newest_result.dbms))
    agg_data = DataUtil.aggregate_data(target_results)
    agg_data['newest_result_id'] = result_id
    agg_data['bad'] = False
    return agg_data
Beispiel #3
0
def configuration_recommendation_ddpg(result_info):  # pylint: disable=invalid-name
    LOG.info('Use ddpg to recommend configuration')
    result_id = result_info['newest_result_id']
    result = Result.objects.filter(pk=result_id)
    session = Result.objects.get(pk=result_id).session
    agg_data = DataUtil.aggregate_data(result)
    metric_data = agg_data['y_matrix'].flatten()
    metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
    normalized_metric_data = metric_scalar.transform(metric_data.reshape(
        1, -1))[0]
    cleaned_knob_data = clean_knob_data(agg_data['X_matrix'],
                                        agg_data['X_columnlabels'], session)
    knob_labels = np.array(cleaned_knob_data[1]).flatten()
    knob_num = len(knob_labels)
    metric_num = len(metric_data)

    ddpg = DDPG(n_actions=knob_num, n_states=metric_num)
    if session.ddpg_actor_model is not None and session.ddpg_critic_model is not None:
        ddpg.set_model(session.ddpg_actor_model, session.ddpg_critic_model)
    if session.ddpg_reply_memory is not None:
        ddpg.replay_memory.set(session.ddpg_reply_memory)
    knob_data = ddpg.choose_action(normalized_metric_data)

    knob_bounds = np.vstack(DataUtil.get_knob_bounds(knob_labels, session))
    knob_data = MinMaxScaler().fit(knob_bounds).inverse_transform(
        knob_data.reshape(1, -1))[0]
    conf_map = {k: knob_data[i] for i, k in enumerate(knob_labels)}
    conf_map_res = {}
    conf_map_res['status'] = 'good'
    conf_map_res['result_id'] = result_id
    conf_map_res['recommendation'] = conf_map
    conf_map_res['info'] = 'INFO: ddpg'
    return conf_map_res
Beispiel #4
0
def aggregate_target_results(result_id):
    # Check that we've completed the background tasks at least once. We need
    # this data in order to make a configuration recommendation (until we
    # implement a sampling technique to generate new training data).
    latest_pipeline_run = PipelineRun.objects.get_latest()
    newest_result = Result.objects.get(pk=result_id)
    if latest_pipeline_run is None or newest_result.session.tuning_session == 'randomly_generate':
        result = Result.objects.filter(pk=result_id)
        knobs_ = KnobCatalog.objects.filter(dbms=result[0].dbms, tunable=True)
        knobs_catalog = {k.name: k for k in knobs_}
        knobs = {k: v for k, v in list(knobs_catalog.items())}
        # generate a config randomly
        random_knob_result = gen_random_data(knobs)
        agg_data = DataUtil.aggregate_data(result)
        agg_data['newest_result_id'] = result_id
        agg_data['bad'] = True
        agg_data['config_recommend'] = random_knob_result
        return agg_data

    # Aggregate all knob config results tried by the target so far in this
    # tuning session and this tuning workload.
    target_results = Result.objects.filter(session=newest_result.session,
                                           dbms=newest_result.dbms,
                                           workload=newest_result.workload)
    if len(target_results) == 0:
        raise Exception(
            'Cannot find any results for session_id={}, dbms_id={}'.format(
                newest_result.session, newest_result.dbms))
    agg_data = DataUtil.aggregate_data(target_results)
    agg_data['newest_result_id'] = result_id
    agg_data['bad'] = False
    return agg_data
Beispiel #5
0
    def test_combine(self):
        test_dedup_row_labels = np.array(["Workload-0", "Workload-1"])
        test_dedup_x = np.matrix([[0.22, 5, "string", "11:11", "fsync", True],
                                  [0.21, 6, "string", "11:12", "fsync", True]])
        test_dedup_y = np.matrix([[30, 30, 40],
                                  [10, 10, 40]])

        test_x, test_y, row_labels = DataUtil.combine_duplicate_rows(
            test_dedup_x, test_dedup_y, test_dedup_row_labels)

        self.assertEqual(len(test_x), len(test_y))
        self.assertEqual(len(test_x), len(row_labels))

        self.assertEqual(row_labels[0], tuple([test_dedup_row_labels[0]]))
        self.assertEqual(row_labels[1], tuple([test_dedup_row_labels[1]]))
        self.assertTrue((test_x[0] == test_dedup_x[0]).all())
        self.assertTrue((test_x[1] == test_dedup_x[1]).all())
        self.assertTrue((test_y[0] == test_dedup_y[0]).all())
        self.assertTrue((test_y[1] == test_dedup_y[1]).all())

        test_row_labels = np.array(["Workload-0",
                                    "Workload-1",
                                    "Workload-2",
                                    "Workload-3"])
        test_x_matrix = np.matrix([[0.22, 5, "string", "timestamp", "enum", True],
                                   [0.3, 5, "rstring", "timestamp2", "enum", False],
                                   [0.22, 5, "string", "timestamp", "enum", True],
                                   [0.3, 5, "r", "timestamp2", "enum", False]])
        test_y_matrix = np.matrix([[20, 30, 40],
                                   [30, 30, 40],
                                   [20, 30, 40],
                                   [32, 30, 40]])

        test_x, test_y, row_labels = DataUtil.combine_duplicate_rows(
            test_x_matrix, test_y_matrix, test_row_labels)

        self.assertTrue(len(test_x) <= len(test_x_matrix))
        self.assertTrue(len(test_y) <= len(test_y_matrix))
        self.assertEqual(len(test_x), len(test_y))
        self.assertEqual(len(test_x), len(row_labels))

        row_labels_set = set(row_labels)
        self.assertTrue(tuple(["Workload-0", "Workload-2"]) in row_labels_set)
        self.assertTrue(("Workload-1",) in row_labels_set)
        self.assertTrue(("Workload-3",) in row_labels_set)

        rows = set()
        for i in test_x:
            self.assertTrue(tuple(i) not in rows)
            self.assertTrue(i in test_x_matrix)
            rows.add(tuple(i))

        rowys = set()
        for i in test_y:
            self.assertTrue(tuple(i) not in rowys)
            self.assertTrue(i in test_y_matrix)
            rowys.add(tuple(i))
Beispiel #6
0
    def test_combine(self):
        test_dedup_row_labels = np.array(["Workload-0", "Workload-1"])
        test_dedup_x = np.matrix([[0.22, 5, "string", "11:11", "fsync", True],
                                  [0.21, 6, "string", "11:12", "fsync", True]])
        test_dedup_y = np.matrix([[30, 30, 40],
                                  [10, 10, 40]])

        test_x, test_y, row_labels = DataUtil.combine_duplicate_rows(
            test_dedup_x, test_dedup_y, test_dedup_row_labels)

        self.assertEqual(len(test_x), len(test_y))
        self.assertEqual(len(test_x), len(row_labels))

        self.assertEqual(row_labels[0], tuple([test_dedup_row_labels[0]]))
        self.assertEqual(row_labels[1], tuple([test_dedup_row_labels[1]]))
        self.assertTrue((test_x[0] == test_dedup_x[0]).all())
        self.assertTrue((test_x[1] == test_dedup_x[1]).all())
        self.assertTrue((test_y[0] == test_dedup_y[0]).all())
        self.assertTrue((test_y[1] == test_dedup_y[1]).all())

        test_row_labels = np.array(["Workload-0",
                                    "Workload-1",
                                    "Workload-2",
                                    "Workload-3"])
        test_x_matrix = np.matrix([[0.22, 5, "string", "timestamp", "enum", True],
                                   [0.3, 5, "rstring", "timestamp2", "enum", False],
                                   [0.22, 5, "string", "timestamp", "enum", True],
                                   [0.3, 5, "r", "timestamp2", "enum", False]])
        test_y_matrix = np.matrix([[20, 30, 40],
                                   [30, 30, 40],
                                   [20, 30, 40],
                                   [32, 30, 40]])

        test_x, test_y, row_labels = DataUtil.combine_duplicate_rows(
            test_x_matrix, test_y_matrix, test_row_labels)

        self.assertTrue(len(test_x) <= len(test_x_matrix))
        self.assertTrue(len(test_y) <= len(test_y_matrix))
        self.assertEqual(len(test_x), len(test_y))
        self.assertEqual(len(test_x), len(row_labels))

        row_labels_set = set(row_labels)
        self.assertTrue(tuple(["Workload-0", "Workload-2"]) in row_labels_set)
        self.assertTrue(("Workload-1",) in row_labels_set)
        self.assertTrue(("Workload-3",) in row_labels_set)

        rows = set()
        for i in test_x:
            self.assertTrue(tuple(i) not in rows)
            self.assertTrue(i in test_x_matrix)
            rows.add(tuple(i))

        rowys = set()
        for i in test_y:
            self.assertTrue(tuple(i) not in rowys)
            self.assertTrue(i in test_y_matrix)
            rowys.add(tuple(i))
Beispiel #7
0
def aggregate_target_results(result_id, algorithm):
    # Check that we've completed the background tasks at least once. We need
    # this data in order to make a configuration recommendation (until we
    # implement a sampling technique to generate new training data).
    newest_result = Result.objects.get(pk=result_id)
    has_pipeline_data = PipelineData.objects.filter(
        workload=newest_result.workload).exists()
    if not has_pipeline_data or newest_result.session.tuning_session == 'randomly_generate':
        if not has_pipeline_data and newest_result.session.tuning_session == 'tuning_session':
            LOG.debug(
                "Background tasks haven't ran for this workload yet, picking random data."
            )

        result = Result.objects.filter(pk=result_id)
        knobs = SessionKnob.objects.get_knobs_for_session(
            newest_result.session)

        # generate a config randomly
        random_knob_result = gen_random_data(knobs)
        agg_data = DataUtil.aggregate_data(result)
        agg_data['newest_result_id'] = result_id
        agg_data['bad'] = True
        agg_data['config_recommend'] = random_knob_result
        LOG.debug('%s: Finished generating a random config.\n\ndata=%s\n',
                  AlgorithmType.name(algorithm),
                  JSONUtil.dumps(agg_data, pprint=True))

    else:
        # Aggregate all knob config results tried by the target so far in this
        # tuning session and this tuning workload.
        target_results = Result.objects.filter(session=newest_result.session,
                                               dbms=newest_result.dbms,
                                               workload=newest_result.workload)
        if len(target_results) == 0:
            raise Exception(
                'Cannot find any results for session_id={}, dbms_id={}'.format(
                    newest_result.session, newest_result.dbms))
        agg_data = DataUtil.aggregate_data(target_results)
        agg_data['newest_result_id'] = result_id
        agg_data['bad'] = False

        # Clean knob data
        cleaned_agg_data = clean_knob_data(agg_data['X_matrix'],
                                           agg_data['X_columnlabels'],
                                           newest_result.session)
        agg_data['X_matrix'] = np.array(cleaned_agg_data[0])
        agg_data['X_columnlabels'] = np.array(cleaned_agg_data[1])

        LOG.debug('%s: Finished aggregating target results.\n\ndata=%s\n',
                  AlgorithmType.name(algorithm),
                  JSONUtil.dumps(agg_data, pprint=True))

    return agg_data, algorithm
def configuration_recommendation_ddpg(result_info):  # pylint: disable=invalid-name
    start_ts = time.time()
    LOG.info('Use ddpg to recommend configuration')
    result_id = result_info['newest_result_id']
    result_list = Result.objects.filter(pk=result_id)
    result = result_list.first()
    session = result.session
    params = JSONUtil.loads(session.hyperparameters)
    agg_data = DataUtil.aggregate_data(result_list)
    metric_data, _ = clean_metric_data(agg_data['y_matrix'],
                                       agg_data['y_columnlabels'], session)
    metric_data = metric_data.flatten()
    metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
    normalized_metric_data = metric_scalar.transform(metric_data.reshape(
        1, -1))[0]
    cleaned_knob_data = clean_knob_data(agg_data['X_matrix'],
                                        agg_data['X_columnlabels'], session)
    knob_labels = np.array(cleaned_knob_data[1]).flatten()
    knob_num = len(knob_labels)
    metric_num = len(metric_data)

    ddpg = DDPG(n_actions=knob_num,
                n_states=metric_num,
                a_hidden_sizes=params['DDPG_ACTOR_HIDDEN_SIZES'],
                c_hidden_sizes=params['DDPG_CRITIC_HIDDEN_SIZES'],
                use_default=params['DDPG_USE_DEFAULT'])
    if session.ddpg_actor_model is not None and session.ddpg_critic_model is not None:
        ddpg.set_model(session.ddpg_actor_model, session.ddpg_critic_model)
    if session.ddpg_reply_memory is not None:
        ddpg.replay_memory.set(session.ddpg_reply_memory)
    knob_data = ddpg.choose_action(normalized_metric_data)

    knob_bounds = np.vstack(DataUtil.get_knob_bounds(knob_labels, session))
    knob_data = MinMaxScaler().fit(knob_bounds).inverse_transform(
        knob_data.reshape(1, -1))[0]
    conf_map = {k: knob_data[i] for i, k in enumerate(knob_labels)}

    conf_map_res = create_and_save_recommendation(recommended_knobs=conf_map,
                                                  result=result,
                                                  status='good',
                                                  info='INFO: ddpg')

    save_execution_time(start_ts, "configuration_recommendation_ddpg", result)
    return conf_map_res
Beispiel #9
0
 def test_no_featured_categorical(self):
     featured_knobs = ['global.backend_flush_after',
                       'global.bgwriter_delay',
                       'global.wal_writer_delay',
                       'global.work_mem']
     postgresdb = DBMSCatalog.objects.get(pk=1)
     categorical_info = DataUtil.dummy_encoder_helper(featured_knobs,
                                                      dbms=postgresdb)
     self.assertEqual(len(categorical_info['n_values']), 0)
     self.assertEqual(len(categorical_info['categorical_features']), 0)
     self.assertEqual(categorical_info['cat_columnlabels'], [])
     self.assertEqual(categorical_info['noncat_columnlabels'], featured_knobs)
Beispiel #10
0
 def test_no_featured_categorical(self):
     featured_knobs = ['global.backend_flush_after',
                       'global.bgwriter_delay',
                       'global.wal_writer_delay',
                       'global.work_mem']
     postgres96 = DBMSCatalog.objects.get(pk=1)
     categorical_info = DataUtil.dummy_encoder_helper(featured_knobs,
                                                      dbms=postgres96)
     self.assertEqual(len(categorical_info['n_values']), 0)
     self.assertEqual(len(categorical_info['categorical_features']), 0)
     self.assertEqual(categorical_info['cat_columnlabels'], [])
     self.assertEqual(categorical_info['noncat_columnlabels'], featured_knobs)
Beispiel #11
0
 def test_featured_categorical(self):
     featured_knobs = [
         'global.backend_flush_after', 'global.bgwriter_delay',
         'global.wal_writer_delay', 'global.work_mem',
         'global.wal_sync_method'
     ]  # last knob categorical
     postgres96 = DBMSCatalog.objects.get(pk=1)
     categorical_info = DataUtil.dummy_encoder_helper(featured_knobs,
                                                      dbms=postgres96)
     self.assertEqual(len(categorical_info['n_values']), 1)
     self.assertEqual(categorical_info['n_values'][0], 4)
     self.assertEqual(len(categorical_info['categorical_features']), 1)
     self.assertEqual(categorical_info['categorical_features'][0], 4)
     self.assertEqual(categorical_info['cat_columnlabels'],
                      ['global.wal_sync_method'])
     self.assertEqual(categorical_info['noncat_columnlabels'],
                      featured_knobs[:-1])
Beispiel #12
0
def aggregate_results():
    unique_clusters = WorkloadCluster.objects.all()
    unique_clusters = filter(lambda x: x.isdefault is False, unique_clusters)
    all_data = {}
    all_labels = {}
    for cluster in unique_clusters:
        results = ResultData.objects.filter(cluster=cluster)
        if len(results) < 2:
            continue
        if cluster.dbms.pk not in all_labels:
            knob_labels = np.asarray(
                sorted(JSONUtil.loads(results[0].param_data).keys()))
            metric_labels = np.asarray(
                sorted(JSONUtil.loads(results[0].metric_data).keys()))
            all_labels[cluster.dbms.pk] = (knob_labels, metric_labels)
        else:
            knob_labels, metric_labels = all_labels[cluster.dbms.pk]
        entry = DataUtil.aggregate_data(results, knob_labels, metric_labels)
        key = (cluster.dbms.pk, cluster.hardware.pk)
        if key not in all_data:
            all_data[key] = {}
        all_data[key][cluster.pk] = entry

    ts = now()
    tsf = ts.strftime("%Y%m%d-%H%M%S")
    for (dbkey, hwkey), cluster_data in all_data.iteritems():
        task_name = PipelineTaskType.TYPE_NAMES[
            PipelineTaskType.AGGREGATED_DATA].replace(' ', '').upper()
        savepaths = {}
        for clusterkey, entry in cluster_data.iteritems():
            fname = '{}_{}_{}_{}_{}.npz'.format(task_name, dbkey, hwkey,
                                                clusterkey, tsf)
            savepath = os.path.join(PIPELINE_DIR, fname)
            savepaths[clusterkey] = savepath
            np.savez_compressed(savepath, **entry)

        value = {'data': savepaths}

        new_res = PipelineResult()
        new_res.dbms = DBMSCatalog.objects.get(pk=dbkey)
        new_res.hardware = Hardware.objects.get(pk=hwkey)
        new_res.creation_timestamp = ts
        new_res.task_type = PipelineTaskType.AGGREGATED_DATA
        new_res.value = JSONUtil.dumps(value)
        new_res.save()
Beispiel #13
0
def aggregate_data(workload):
    # Aggregates both the knob & metric data for the given workload.
    #
    # Parameters:
    #   workload: aggregate data belonging to this specific workload
    #
    # Returns: two dictionaries containing the knob & metric data as
    # a tuple

    # Find the results for this workload
    wkld_results = Result.objects.filter(workload=workload)

    # Now call the aggregate_data helper function to combine all knob &
    # metric data into matrices and also create row/column labels
    # (see the DataUtil class in website/utils.py)
    #
    # The aggregate_data helper function returns a dictionary of the form:
    #   - 'X_matrix': the knob data as a 2D numpy matrix (results x knobs)
    #   - 'y_matrix': the metric data as a 2D numpy matrix (results x metrics)
    #   - 'rowlabels': list of result ids that correspond to the rows in
    #         both X_matrix & y_matrix
    #   - 'X_columnlabels': a list of the knob names corresponding to the
    #         columns in the knob_data matrix
    #   - 'y_columnlabels': a list of the metric names corresponding to the
    #         columns in the metric_data matrix
    aggregated_data = DataUtil.aggregate_data(wkld_results)

    # Separate knob & workload data into two "standard" dictionaries of the
    # same form
    knob_data = {
        'data': aggregated_data['X_matrix'],
        'rowlabels': aggregated_data['rowlabels'],
        'columnlabels': aggregated_data['X_columnlabels']
    }

    metric_data = {
        'data': aggregated_data['y_matrix'],
        'rowlabels': copy.deepcopy(aggregated_data['rowlabels']),
        'columnlabels': aggregated_data['y_columnlabels']
    }

    # Return the knob & metric data
    return knob_data, metric_data
Beispiel #14
0
def aggregate_target_results(result_id):
    # Check that we've completed the background tasks at least once. We need
    # this data in order to make a configuration recommendation (until we
    # implement a sampling technique to generate new training data).
    latest_pipeline_run = PipelineRun.objects.get_latest()
    if latest_pipeline_run is None:
        raise Exception("No previous data available. Implement me!")

    # Aggregate all knob config results tried by the target so far in this
    # tuning session.
    newest_result = Result.objects.get(pk=result_id)
    target_results = Result.objects.filter(
        session=newest_result.session, dbms=newest_result.dbms)
    if len(target_results) == 0:
        raise Exception('Cannot find any results for session_id={}, dbms_id={}'
                        .format(newest_result.session, newest_result.dbms))
    agg_data = DataUtil.aggregate_data(target_results)
    agg_data['newest_result_id'] = result_id
    return agg_data
Beispiel #15
0
def aggregate_data(wkld_results):
    # Aggregates both the knob & metric data for the given workload.
    #
    # Parameters:
    #   wkld_results: result data belonging to this specific workload
    #
    # Returns: two dictionaries containing the knob & metric data as
    # a tuple

    # Now call the aggregate_data helper function to combine all knob &
    # metric data into matrices and also create row/column labels
    # (see the DataUtil class in website/utils.py)
    #
    # The aggregate_data helper function returns a dictionary of the form:
    #   - 'X_matrix': the knob data as a 2D numpy matrix (results x knobs)
    #   - 'y_matrix': the metric data as a 2D numpy matrix (results x metrics)
    #   - 'rowlabels': list of result ids that correspond to the rows in
    #         both X_matrix & y_matrix
    #   - 'X_columnlabels': a list of the knob names corresponding to the
    #         columns in the knob_data matrix
    #   - 'y_columnlabels': a list of the metric names corresponding to the
    #         columns in the metric_data matrix
    start_ts = time.time()
    aggregated_data = DataUtil.aggregate_data(
        wkld_results, ignore=['range_test', 'default', '*'])

    # Separate knob & workload data into two "standard" dictionaries of the
    # same form
    knob_data = {
        'data': aggregated_data['X_matrix'],
        'rowlabels': aggregated_data['rowlabels'],
        'columnlabels': aggregated_data['X_columnlabels']
    }

    metric_data = {
        'data': aggregated_data['y_matrix'],
        'rowlabels': copy.deepcopy(aggregated_data['rowlabels']),
        'columnlabels': aggregated_data['y_columnlabels']
    }

    # Return the knob & metric data
    save_execution_time(start_ts, "aggregate_data")
    return knob_data, metric_data
def aggregate_data(workload):
    # Aggregates both the knob & metric data for the given workload.
    #
    # Parameters:
    #   workload: aggregate data belonging to this specific workload
    #
    # Returns: two dictionaries containing the knob & metric data as
    # a tuple

    # Find the results for this workload
    wkld_results = Result.objects.filter(workload=workload)

    # Now call the aggregate_data helper function to combine all knob &
    # metric data into matrices and also create row/column labels
    # (see the DataUtil class in website/utils.py)
    #
    # The aggregate_data helper function returns a dictionary of the form:
    #   - 'X_matrix': the knob data as a 2D numpy matrix (results x knobs)
    #   - 'y_matrix': the metric data as a 2D numpy matrix (results x metrics)
    #   - 'rowlabels': list of result ids that correspond to the rows in
    #         both X_matrix & y_matrix
    #   - 'X_columnlabels': a list of the knob names corresponding to the
    #         columns in the knob_data matrix
    #   - 'y_columnlabels': a list of the metric names corresponding to the
    #         columns in the metric_data matrix
    aggregated_data = DataUtil.aggregate_data(wkld_results)

    # Separate knob & workload data into two "standard" dictionaries of the
    # same form
    knob_data = {
        'data': aggregated_data['X_matrix'],
        'rowlabels': aggregated_data['rowlabels'],
        'columnlabels': aggregated_data['X_columnlabels']
    }

    metric_data = {
        'data': aggregated_data['y_matrix'],
        'rowlabels': copy.deepcopy(aggregated_data['rowlabels']),
        'columnlabels': aggregated_data['y_columnlabels']
    }

    # Return the knob & metric data
    return knob_data, metric_data
Beispiel #17
0
def aggregate_target_results(result_id):
    newest_result = Result.objects.get(pk=result_id)
    target_results = Result.objects.filter(
        application=newest_result.application, dbms=newest_result.dbms)
    if len(target_results) == 0:
        raise Exception(
            'Cannot find any results for app_id={}, dbms_id={}'.format(
                newest_result.application, newest_result.dbms))
    target_result_datas = [
        ResultData.objects.get(result=tres) for tres in target_results
    ]

    knob_labels = np.asarray(
        sorted(JSONUtil.loads(target_result_datas[0].param_data).keys()))
    metric_labels = np.asarray(
        sorted(JSONUtil.loads(target_result_datas[0].metric_data).keys()))
    agg_data = DataUtil.aggregate_data(target_result_datas, knob_labels,
                                       metric_labels)
    agg_data['newest_result_id'] = result_id
    return agg_data
Beispiel #18
0
    def test_aggregate(self):

        workload2 = Result.objects.filter(workload=2)
        num_results = Result.objects.filter(workload=2).count()
        knobs = list(JSONUtil.loads(workload2[0].knob_data.data).keys())
        metrics = list(JSONUtil.loads(workload2[0].metric_data.data).keys())
        num_knobs = len(knobs)
        num_metrics = len(metrics)

        test_result = DataUtil.aggregate_data(workload2)

        self.assertTrue('X_matrix' in list(test_result.keys()))
        self.assertTrue('y_matrix' in list(test_result.keys()))
        self.assertTrue('rowlabels' in list(test_result.keys()))
        self.assertTrue('X_columnlabels' in list(test_result.keys()))
        self.assertTrue('y_columnlabels' in list(test_result.keys()))

        self.assertEqual(test_result['X_columnlabels'], knobs)
        self.assertEqual(test_result['y_columnlabels'], metrics)
        self.assertEqual(test_result['X_matrix'].shape[0], num_results)
        self.assertEqual(test_result['y_matrix'].shape[0], num_results)
        self.assertEqual(test_result['X_matrix'].shape[1], num_knobs)
        self.assertEqual(test_result['y_matrix'].shape[1], num_metrics)
Beispiel #19
0
    def test_aggregate(self):

        workload2 = Result.objects.filter(workload=2)
        num_results = Result.objects.filter(workload=2).count()
        knobs = list(JSONUtil.loads(workload2[0].knob_data.data).keys())
        metrics = list(JSONUtil.loads(workload2[0].metric_data.data).keys())
        num_knobs = len(knobs)
        num_metrics = len(metrics)

        test_result = DataUtil.aggregate_data(workload2)

        self.assertTrue('X_matrix' in list(test_result.keys()))
        self.assertTrue('y_matrix' in list(test_result.keys()))
        self.assertTrue('rowlabels' in list(test_result.keys()))
        self.assertTrue('X_columnlabels' in list(test_result.keys()))
        self.assertTrue('y_columnlabels' in list(test_result.keys()))

        self.assertEqual(test_result['X_columnlabels'], knobs)
        self.assertEqual(test_result['y_columnlabels'], metrics)
        self.assertEqual(test_result['X_matrix'].shape[0], num_results)
        self.assertEqual(test_result['y_matrix'].shape[0], num_results)
        self.assertEqual(test_result['X_matrix'].shape[1], num_knobs)
        self.assertEqual(test_result['y_matrix'].shape[1], num_metrics)
Beispiel #20
0
def run_knob_identification(knob_data, metric_data, dbms):
    # Performs knob identification on the knob & metric data and returns
    # a set of ranked knobs.
    #
    # Parameters:
    #   knob_data & metric_data are dictionaries of the form:
    #     - 'data': 2D numpy matrix of knob/metric data
    #     - 'rowlabels': a list of identifiers for the rows in the matrix
    #     - 'columnlabels': a list of the knob/metric names corresponding
    #           to the columns in the data matrix
    #   dbms is the foreign key pointing to target dbms in DBMSCatalog
    #
    # When running the lasso algorithm, the knob_data matrix is set of
    # independent variables (X) and the metric_data is the set of
    # dependent variables (y).

    knob_matrix = knob_data['data']
    knob_columnlabels = knob_data['columnlabels']

    metric_matrix = metric_data['data']
    metric_columnlabels = metric_data['columnlabels']

    # remove constant columns from knob_matrix and metric_matrix
    nonconst_knob_matrix = []
    nonconst_knob_columnlabels = []

    for col, cl in zip(knob_matrix.T, knob_columnlabels):
        if np.any(col != col[0]):
            nonconst_knob_matrix.append(col.reshape(-1, 1))
            nonconst_knob_columnlabels.append(cl)
    assert len(nonconst_knob_matrix) > 0, "Need more data to train the model"
    nonconst_knob_matrix = np.hstack(nonconst_knob_matrix)

    nonconst_metric_matrix = []
    nonconst_metric_columnlabels = []

    for col, cl in zip(metric_matrix.T, metric_columnlabels):
        if np.any(col != col[0]):
            nonconst_metric_matrix.append(col.reshape(-1, 1))
            nonconst_metric_columnlabels.append(cl)
    nonconst_metric_matrix = np.hstack(nonconst_metric_matrix)

    # determine which knobs need encoding (enums with >2 possible values)

    categorical_info = DataUtil.dummy_encoder_helper(
        nonconst_knob_columnlabels, dbms)
    # encode categorical variable first (at least, before standardize)
    dummy_encoder = DummyEncoder(categorical_info['n_values'],
                                 categorical_info['categorical_features'],
                                 categorical_info['cat_columnlabels'],
                                 categorical_info['noncat_columnlabels'])
    encoded_knob_matrix = dummy_encoder.fit_transform(nonconst_knob_matrix)
    encoded_knob_columnlabels = dummy_encoder.new_labels

    # standardize values in each column to N(0, 1)
    standardizer = StandardScaler()
    standardized_knob_matrix = standardizer.fit_transform(encoded_knob_matrix)
    standardized_metric_matrix = standardizer.fit_transform(
        nonconst_metric_matrix)

    # shuffle rows (note: same shuffle applied to both knob and metric matrices)
    shuffle_indices = get_shuffle_indices(standardized_knob_matrix.shape[0],
                                          seed=17)
    shuffled_knob_matrix = standardized_knob_matrix[shuffle_indices, :]
    shuffled_metric_matrix = standardized_metric_matrix[shuffle_indices, :]

    # run lasso algorithm
    lasso_model = LassoPath()
    lasso_model.fit(shuffled_knob_matrix, shuffled_metric_matrix,
                    encoded_knob_columnlabels)

    # consolidate categorical feature columns, and reset to original names
    encoded_knobs = lasso_model.get_ranked_features()
    consolidated_knobs = consolidate_columnlabels(encoded_knobs)

    return consolidated_knobs
Beispiel #21
0
def configuration_recommendation(target_data):
    if target_data['scores'] is None:
        raise NotImplementedError('Implement me!')
    best_wkld_id = target_data['mapped_workload'][0]

    # Load specific workload data
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    target_obj = newest_result.application.target_objective
    dbms_id = newest_result.dbms.pk
    hw_id = newest_result.application.hardware.pk
    agg_data = PipelineResult.get_latest(dbms_id, hw_id,
                                         PipelineTaskType.AGGREGATED_DATA)
    if agg_data is None:
        return None
    data_map = JSONUtil.loads(agg_data.value)
    if best_wkld_id not in data_map['data']:
        raise Exception(('Cannot find mapped workload'
                         '(id={}) in aggregated data').format(best_wkld_id))
    workload_data = np.load(data_map['data'][best_wkld_id])

    # Mapped workload data
    X_wkld_matrix = workload_data['X_matrix']
    y_wkld_matrix = workload_data['y_matrix']
    wkld_rowlabels = workload_data['rowlabels']
    X_columnlabels = workload_data['X_columnlabels']
    y_columnlabels = workload_data['y_columnlabels']

    # Target workload data
    X_target_matrix = target_data['X_matrix']
    y_target_matrix = target_data['y_matrix']
    target_rowlabels = target_data['rowlabels']

    if not np.array_equal(X_columnlabels, target_data['X_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical X columnlabels (sorted knob names)'))
    if not np.array_equal(y_columnlabels, target_data['y_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical y columnlabels (sorted metric names)'))

    # Filter knobs
    ranked_knobs = JSONUtil.loads(
        PipelineResult.get_latest(
            dbms_id, hw_id, PipelineTaskType.RANKED_KNOBS).value)[:10]  # FIXME
    X_idxs = [
        i for i in range(X_columnlabels.shape[0])
        if X_columnlabels[i] in ranked_knobs
    ]
    X_wkld_matrix = X_wkld_matrix[:, X_idxs]
    X_target_matrix = X_target_matrix[:, X_idxs]
    X_columnlabels = X_columnlabels[X_idxs]

    # Filter metrics by current target objective metric
    y_idx = [
        i for i in range(y_columnlabels.shape[0])
        if y_columnlabels[i] == target_obj
    ]
    if len(y_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_obj))
    elif len(y_idx) > 1:
        raise Exception(
            ('Found {} instances of target objective in '
             'metrics (target_obj={})').format(len(y_idx), target_obj))
    y_wkld_matrix = y_wkld_matrix[:, y_idx]
    y_target_matrix = y_target_matrix[:, y_idx]
    y_columnlabels = y_columnlabels[y_idx]

    # Combine duplicate rows in the target/workload data (separately)
    X_wkld_matrix, y_wkld_matrix, wkld_rowlabels = DataUtil.combine_duplicate_rows(
        X_wkld_matrix, y_wkld_matrix, wkld_rowlabels)
    X_target_matrix, y_target_matrix, target_rowlabels = DataUtil.combine_duplicate_rows(
        X_target_matrix, y_target_matrix, target_rowlabels)

    # Delete any rows that appear in both the workload data and the target
    # data from the workload data
    dups_filter = np.ones(X_wkld_matrix.shape[0], dtype=bool)
    target_row_tups = [tuple(row) for row in X_target_matrix]
    for i, row in enumerate(X_wkld_matrix):
        if tuple(row) in target_row_tups:
            dups_filter[i] = False
    X_wkld_matrix = X_wkld_matrix[dups_filter, :]
    y_wkld_matrix = y_wkld_matrix[dups_filter, :]
    wkld_rowlabels = wkld_rowlabels[dups_filter]

    # Combine Xs and scale
    X_matrix = np.vstack([X_target_matrix, X_wkld_matrix])
    X_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(X_matrix)
    if y_target_matrix.shape[0] < 5:  # FIXME
        y_target_scaler = None
        y_wkld_scaler = StandardScaler()
        y_matrix = np.vstack([y_target_matrix, y_wkld_matrix])
        y_scaled = y_wkld_scaler.fit_transform(y_matrix)
    else:
        try:
            y_target_scaler = StandardScaler()
            y_wkld_scaler = StandardScaler()
            y_target_scaled = y_target_scaler.fit_transform(y_target_matrix)
            y_wkld_scaled = y_wkld_scaler.fit_transform(y_wkld_matrix)
            y_scaled = np.vstack([y_target_scaled, y_wkld_scaled])
        except ValueError:
            y_target_scaler = None
            y_wkld_scaler = StandardScaler()
            y_matrix = np.vstack([y_target_matrix, y_wkld_matrix])
            y_scaled = y_wkld_scaler.fit_transform(y_matrix)

    ridge = np.empty(X_scaled.shape[0])
    ridge[:X_target_matrix.shape[0]] = 0.01
    ridge[X_target_matrix.shape[0]:] = 0.1

    # FIXME
    num_samples = 5
    X_samples = np.empty((num_samples, X_scaled.shape[1]))
    for i in range(X_scaled.shape[1]):
        col_min = X_scaled[:, i].min()
        col_max = X_scaled[:, i].max()
        X_samples[:, i] = np.random.rand(num_samples) * (col_max -
                                                         col_min) + col_min

    model = GPR_GD()
    model.fit(X_scaled, y_scaled, ridge)
    res = model.predict(X_samples)
    best_idx = np.argmin(res.minL.ravel())
    best_conf = res.minL_conf[best_idx, :]
    best_conf = X_scaler.inverse_transform(best_conf)

    conf_map = {k: best_conf[i] for i, k in enumerate(X_columnlabels)}
    return conf_map
Beispiel #22
0
def map_workload(target_data):
    # Get the latest version of pipeline data that's been computed so far.
    latest_pipeline_run = PipelineRun.objects.get_latest()
    if target_data['bad']:
        assert target_data is not None
        return target_data
    assert latest_pipeline_run is not None

    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    target_workload = newest_result.workload
    X_columnlabels = np.array(target_data['X_columnlabels'])
    y_columnlabels = np.array(target_data['y_columnlabels'])

    # Find all pipeline data belonging to the latest version with the same
    # DBMS and hardware as the target
    pipeline_data = PipelineData.objects.filter(
        pipeline_run=latest_pipeline_run,
        workload__dbms=target_workload.dbms,
        workload__hardware=target_workload.hardware)

    # FIXME (dva): we should also compute the global (i.e., overall) ranked_knobs
    # and pruned metrics but we just use those from the first workload for now
    initialized = False
    global_ranked_knobs = None
    global_pruned_metrics = None
    ranked_knob_idxs = None
    pruned_metric_idxs = None

    # Compute workload mapping data for each unique workload
    unique_workloads = pipeline_data.values_list('workload',
                                                 flat=True).distinct()
    assert len(unique_workloads) > 0
    workload_data = {}
    for unique_workload in unique_workloads:

        workload_obj = Workload.objects.get(pk=unique_workload)
        wkld_results = Result.objects.filter(workload=workload_obj)
        if wkld_results.exists() is False:
            # delete the workload
            workload_obj.delete()
            continue

        # Load knob & metric data for this workload
        knob_data = load_data_helper(pipeline_data, unique_workload,
                                     PipelineTaskType.KNOB_DATA)

        metric_data = load_data_helper(pipeline_data, unique_workload,
                                       PipelineTaskType.METRIC_DATA)
        X_matrix = np.array(knob_data["data"])
        y_matrix = np.array(metric_data["data"])
        rowlabels = np.array(knob_data["rowlabels"])
        assert np.array_equal(rowlabels, metric_data["rowlabels"])

        if not initialized:
            # For now set ranked knobs & pruned metrics to be those computed
            # for the first workload
            global_ranked_knobs = load_data_helper(
                pipeline_data, unique_workload,
                PipelineTaskType.RANKED_KNOBS)[:IMPORTANT_KNOB_NUMBER]
            global_pruned_metrics = load_data_helper(
                pipeline_data, unique_workload,
                PipelineTaskType.PRUNED_METRICS)
            ranked_knob_idxs = [
                i for i in range(X_matrix.shape[1])
                if X_columnlabels[i] in global_ranked_knobs
            ]
            pruned_metric_idxs = [
                i for i in range(y_matrix.shape[1])
                if y_columnlabels[i] in global_pruned_metrics
            ]

            # Filter X & y columnlabels by top ranked_knobs & pruned_metrics
            X_columnlabels = X_columnlabels[ranked_knob_idxs]
            y_columnlabels = y_columnlabels[pruned_metric_idxs]
            initialized = True

        # Filter X & y matrices by top ranked_knobs & pruned_metrics
        X_matrix = X_matrix[:, ranked_knob_idxs]
        y_matrix = y_matrix[:, pruned_metric_idxs]

        # Combine duplicate rows (rows with same knob settings)
        X_matrix, y_matrix, rowlabels = DataUtil.combine_duplicate_rows(
            X_matrix, y_matrix, rowlabels)

        workload_data[unique_workload] = {
            'X_matrix': X_matrix,
            'y_matrix': y_matrix,
            'rowlabels': rowlabels,
        }

    assert len(workload_data) > 0

    # Stack all X & y matrices for preprocessing
    Xs = np.vstack(
        [entry['X_matrix'] for entry in list(workload_data.values())])
    ys = np.vstack(
        [entry['y_matrix'] for entry in list(workload_data.values())])

    # Scale the X & y values, then compute the deciles for each column in y
    X_scaler = StandardScaler(copy=False)
    X_scaler.fit(Xs)
    y_scaler = StandardScaler(copy=False)
    y_scaler.fit_transform(ys)
    y_binner = Bin(bin_start=1, axis=0)
    y_binner.fit(ys)
    del Xs
    del ys

    # Filter the target's X & y data by the ranked knobs & pruned metrics.
    X_target = target_data['X_matrix'][:, ranked_knob_idxs]
    y_target = target_data['y_matrix'][:, pruned_metric_idxs]

    # Now standardize the target's data and bin it by the deciles we just
    # calculated
    X_target = X_scaler.transform(X_target)
    y_target = y_scaler.transform(y_target)
    y_target = y_binner.transform(y_target)

    scores = {}
    for workload_id, workload_entry in list(workload_data.items()):
        predictions = np.empty_like(y_target)
        X_workload = workload_entry['X_matrix']
        X_scaled = X_scaler.transform(X_workload)
        y_workload = workload_entry['y_matrix']
        y_scaled = y_scaler.transform(y_workload)
        for j, y_col in enumerate(y_scaled.T):
            # Using this workload's data, train a Gaussian process model
            # and then predict the performance of each metric for each of
            # the knob configurations attempted so far by the target.
            y_col = y_col.reshape(-1, 1)
            model = GPRNP(length_scale=DEFAULT_LENGTH_SCALE,
                          magnitude=DEFAULT_MAGNITUDE,
                          max_train_size=MAX_TRAIN_SIZE,
                          batch_size=BATCH_SIZE)
            model.fit(X_scaled, y_col, ridge=DEFAULT_RIDGE)
            predictions[:, j] = model.predict(X_target).ypreds.ravel()
        # Bin each of the predicted metric columns by deciles and then
        # compute the score (i.e., distance) between the target workload
        # and each of the known workloads
        predictions = y_binner.transform(predictions)
        dists = np.sqrt(
            np.sum(np.square(np.subtract(predictions, y_target)), axis=1))
        scores[workload_id] = np.mean(dists)

    # Find the best (minimum) score
    best_score = np.inf
    best_workload_id = None
    # scores_info = {workload_id: (workload_name, score)}
    scores_info = {}
    for workload_id, similarity_score in list(scores.items()):
        workload_name = Workload.objects.get(pk=workload_id).name
        if similarity_score < best_score:
            best_score = similarity_score
            best_workload_id = workload_id
            best_workload_name = workload_name
        scores_info[workload_id] = (workload_name, similarity_score)
    target_data['mapped_workload'] = (best_workload_id, best_workload_name,
                                      best_score)
    target_data['scores'] = scores_info
    return target_data
def map_workload(map_workload_input):
    start_ts = time.time()
    target_data, algorithm = map_workload_input

    if target_data['bad']:
        assert target_data is not None
        target_data['pipeline_run'] = None
        LOG.debug('%s: Skipping workload mapping.\n\ndata=%s\n',
                  AlgorithmType.name(algorithm),
                  JSONUtil.dumps(target_data, pprint=True))

        return target_data, algorithm

    # Get the latest version of pipeline data that's been computed so far.
    latest_pipeline_run = PipelineRun.objects.get_latest()
    assert latest_pipeline_run is not None
    target_data['pipeline_run'] = latest_pipeline_run.pk

    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    session = newest_result.session
    params = JSONUtil.loads(session.hyperparameters)
    target_workload = newest_result.workload
    X_columnlabels = np.array(target_data['X_columnlabels'])
    y_columnlabels = np.array(target_data['y_columnlabels'])

    # Find all pipeline data belonging to the latest version with the same
    # DBMS and hardware as the target
    pipeline_data = PipelineData.objects.filter(
        pipeline_run=latest_pipeline_run,
        workload__dbms=target_workload.dbms,
        workload__hardware=target_workload.hardware,
        workload__project=target_workload.project)

    # FIXME (dva): we should also compute the global (i.e., overall) ranked_knobs
    # and pruned metrics but we just use those from the first workload for now
    initialized = False
    global_ranked_knobs = None
    global_pruned_metrics = None
    ranked_knob_idxs = None
    pruned_metric_idxs = None

    unique_workloads = pipeline_data.values_list('workload',
                                                 flat=True).distinct()

    workload_data = {}
    # Compute workload mapping data for each unique workload
    for unique_workload in unique_workloads:

        workload_obj = Workload.objects.get(pk=unique_workload)
        wkld_results = Result.objects.filter(workload=workload_obj)
        if wkld_results.exists() is False:
            # delete the workload
            workload_obj.delete()
            continue

        # Load knob & metric data for this workload
        knob_data = load_data_helper(pipeline_data, unique_workload,
                                     PipelineTaskType.KNOB_DATA)
        knob_data["data"], knob_data["columnlabels"] = clean_knob_data(
            knob_data["data"], knob_data["columnlabels"],
            newest_result.session)
        metric_data = load_data_helper(pipeline_data, unique_workload,
                                       PipelineTaskType.METRIC_DATA)
        X_matrix = np.array(knob_data["data"])
        y_matrix = np.array(metric_data["data"])
        rowlabels = np.array(knob_data["rowlabels"])
        assert np.array_equal(rowlabels, metric_data["rowlabels"])

        if not initialized:
            # For now set ranked knobs & pruned metrics to be those computed
            # for the first workload
            global_ranked_knobs = load_data_helper(
                pipeline_data, unique_workload, PipelineTaskType.RANKED_KNOBS
            )[:params['IMPORTANT_KNOB_NUMBER']]
            global_pruned_metrics = load_data_helper(
                pipeline_data, unique_workload,
                PipelineTaskType.PRUNED_METRICS)
            ranked_knob_idxs = [
                i for i in range(X_matrix.shape[1])
                if X_columnlabels[i] in global_ranked_knobs
            ]
            pruned_metric_idxs = [
                i for i in range(y_matrix.shape[1])
                if y_columnlabels[i] in global_pruned_metrics
            ]

            # Filter X & y columnlabels by top ranked_knobs & pruned_metrics
            X_columnlabels = X_columnlabels[ranked_knob_idxs]
            y_columnlabels = y_columnlabels[pruned_metric_idxs]
            initialized = True

        # Filter X & y matrices by top ranked_knobs & pruned_metrics
        X_matrix = X_matrix[:, ranked_knob_idxs]
        y_matrix = y_matrix[:, pruned_metric_idxs]

        # Combine duplicate rows (rows with same knob settings)
        X_matrix, y_matrix, rowlabels = DataUtil.combine_duplicate_rows(
            X_matrix, y_matrix, rowlabels)

        workload_data[unique_workload] = {
            'X_matrix': X_matrix,
            'y_matrix': y_matrix,
            'rowlabels': rowlabels,
        }

    if len(workload_data) == 0:
        # The background task that aggregates the data has not finished running yet
        target_data.update(mapped_workload=None, scores=None)
        LOG.debug(
            '%s: Skipping workload mapping because there is no parsed workload.\n',
            AlgorithmType.name(algorithm))
        return target_data, algorithm

    # Stack all X & y matrices for preprocessing
    Xs = np.vstack(
        [entry['X_matrix'] for entry in list(workload_data.values())])
    ys = np.vstack(
        [entry['y_matrix'] for entry in list(workload_data.values())])

    # Scale the X & y values, then compute the deciles for each column in y
    X_scaler = StandardScaler(copy=False)
    X_scaler.fit(Xs)
    y_scaler = StandardScaler(copy=False)
    y_scaler.fit_transform(ys)
    y_binner = Bin(bin_start=1, axis=0)
    y_binner.fit(ys)
    del Xs
    del ys

    # Filter the target's X & y data by the ranked knobs & pruned metrics.
    X_target = target_data['X_matrix'][:, ranked_knob_idxs]
    y_target = target_data['y_matrix'][:, pruned_metric_idxs]

    # Now standardize the target's data and bin it by the deciles we just
    # calculated
    X_target = X_scaler.transform(X_target)
    y_target = y_scaler.transform(y_target)
    y_target = y_binner.transform(y_target)

    scores = {}
    for workload_id, workload_entry in list(workload_data.items()):
        predictions = np.empty_like(y_target)
        X_workload = workload_entry['X_matrix']
        X_scaled = X_scaler.transform(X_workload)
        y_workload = workload_entry['y_matrix']
        y_scaled = y_scaler.transform(y_workload)
        for j, y_col in enumerate(y_scaled.T):
            # Using this workload's data, train a Gaussian process model
            # and then predict the performance of each metric for each of
            # the knob configurations attempted so far by the target.
            y_col = y_col.reshape(-1, 1)
            if params['GPR_USE_GPFLOW']:
                model_kwargs = {
                    'lengthscales': params['GPR_LENGTH_SCALE'],
                    'variance': params['GPR_MAGNITUDE'],
                    'noise_variance': params['GPR_RIDGE']
                }
                tf.reset_default_graph()
                graph = tf.get_default_graph()
                gpflow.reset_default_session(graph=graph)
                m = gpr_models.create_model(params['GPR_MODEL_NAME'],
                                            X=X_scaled,
                                            y=y_col,
                                            **model_kwargs)
                gpr_result = gpflow_predict(m.model, X_target)
            else:
                model = GPRNP(length_scale=params['GPR_LENGTH_SCALE'],
                              magnitude=params['GPR_MAGNITUDE'],
                              max_train_size=params['GPR_MAX_TRAIN_SIZE'],
                              batch_size=params['GPR_BATCH_SIZE'])
                model.fit(X_scaled, y_col, ridge=params['GPR_RIDGE'])
                gpr_result = model.predict(X_target)
            predictions[:, j] = gpr_result.ypreds.ravel()
        # Bin each of the predicted metric columns by deciles and then
        # compute the score (i.e., distance) between the target workload
        # and each of the known workloads
        predictions = y_binner.transform(predictions)
        dists = np.sqrt(
            np.sum(np.square(np.subtract(predictions, y_target)), axis=1))
        scores[workload_id] = np.mean(dists)

    # Find the best (minimum) score
    best_score = np.inf
    best_workload_id = None
    best_workload_name = None
    scores_info = {}
    for workload_id, similarity_score in list(scores.items()):
        workload_name = Workload.objects.get(pk=workload_id).name
        if similarity_score < best_score:
            best_score = similarity_score
            best_workload_id = workload_id
            best_workload_name = workload_name
        scores_info[workload_id] = (workload_name, similarity_score)
    target_data.update(mapped_workload=(best_workload_id, best_workload_name,
                                        best_score),
                       scores=scores_info)
    LOG.debug('%s: Finished mapping the workload.\n\ndata=%s\n',
              AlgorithmType.name(algorithm),
              JSONUtil.dumps(target_data, pprint=True))

    save_execution_time(start_ts, "map_workload", newest_result)
    return target_data, algorithm
Beispiel #24
0
def run_workload_characterization(metric_data, dbms=None):
    # Performs workload characterization on the metric_data and returns
    # a set of pruned metrics.
    #
    # Parameters:
    #   metric_data is a dictionary of the form:
    #     - 'data': 2D numpy matrix of metric data (results x metrics)
    #     - 'rowlabels': a list of identifiers for the rows in the matrix
    #     - 'columnlabels': a list of the metric names corresponding to
    #                       the columns in the data matrix
    start_ts = time.time()

    matrix = metric_data['data']
    columnlabels = metric_data['columnlabels']
    LOG.debug("Workload characterization ~ initial data size: %s",
              matrix.shape)

    views = None if dbms is None else VIEWS_FOR_PRUNING.get(dbms.type, None)
    matrix, columnlabels = DataUtil.clean_metric_data(matrix, columnlabels,
                                                      views)
    LOG.debug("Workload characterization ~ cleaned data size: %s",
              matrix.shape)

    # Bin each column (metric) in the matrix by its decile
    binner = Bin(bin_start=1, axis=0)
    binned_matrix = binner.fit_transform(matrix)

    # Remove any constant columns
    nonconst_matrix = []
    nonconst_columnlabels = []
    for col, cl in zip(binned_matrix.T, columnlabels):
        if np.any(col != col[0]):
            nonconst_matrix.append(col.reshape(-1, 1))
            nonconst_columnlabels.append(cl)
    assert len(nonconst_matrix) > 0, "Need more data to train the model"
    nonconst_matrix = np.hstack(nonconst_matrix)
    LOG.debug("Workload characterization ~ nonconst data size: %s",
              nonconst_matrix.shape)

    # Remove any duplicate columns
    unique_matrix, unique_idxs = np.unique(nonconst_matrix,
                                           axis=1,
                                           return_index=True)
    unique_columnlabels = [nonconst_columnlabels[idx] for idx in unique_idxs]

    LOG.debug("Workload characterization ~ final data size: %s",
              unique_matrix.shape)
    n_rows, n_cols = unique_matrix.shape

    # Shuffle the matrix rows
    shuffle_indices = get_shuffle_indices(n_rows)
    shuffled_matrix = unique_matrix[shuffle_indices, :]

    # Fit factor analysis model
    fa_model = FactorAnalysis()
    # For now we use 5 latent variables
    fa_model.fit(shuffled_matrix, unique_columnlabels, n_components=5)

    # Components: metrics * factors
    components = fa_model.components_.T.copy()
    LOG.info("Workload characterization first part costs %.0f seconds.",
             time.time() - start_ts)

    # Run Kmeans for # clusters k in range(1, num_nonduplicate_metrics - 1)
    # K should be much smaller than n_cols in detK, For now max_cluster <= 20
    kmeans_models = KMeansClusters()
    kmeans_models.fit(components,
                      min_cluster=1,
                      max_cluster=min(n_cols - 1, 20),
                      sample_labels=unique_columnlabels,
                      estimator_params={'n_init': 50})

    # Compute optimal # clusters, k, using gap statistics
    gapk = create_kselection_model("gap-statistic")
    gapk.fit(components, kmeans_models.cluster_map_)

    LOG.debug("Found optimal number of clusters: %d",
              gapk.optimal_num_clusters_)

    # Get pruned metrics, cloest samples of each cluster center
    pruned_metrics = kmeans_models.cluster_map_[
        gapk.optimal_num_clusters_].get_closest_samples()

    # Return pruned metrics
    save_execution_time(start_ts, "run_workload_characterization")
    LOG.info("Workload characterization finished in %.0f seconds.",
             time.time() - start_ts)
    return pruned_metrics
Beispiel #25
0
def create_workload_mapping_data():
    agg_datas = PipelineResult.objects.filter(
        task_type=PipelineTaskType.AGGREGATED_DATA)
    dbmss = set([ad.dbms.pk for ad in agg_datas])
    hardwares = set([ad.hardware.pk for ad in agg_datas])

    for dbms_id, hw_id in itertools.product(dbmss, hardwares):
        data = PipelineResult.get_latest(dbms_id, hw_id,
                                         PipelineTaskType.AGGREGATED_DATA)
        file_info = JSONUtil.loads(data.value)
        cluster_data = OrderedDict()
        for cluster, path in file_info['data'].iteritems():
            compressed_data = np.load(path)
            X_matrix = compressed_data['X_matrix']
            y_matrix = compressed_data['y_matrix']
            X_columnlabels = compressed_data['X_columnlabels']
            y_columnlabels = compressed_data['y_columnlabels']
            rowlabels = compressed_data['rowlabels']

            # Filter metrics and knobs
            ranked_knobs = JSONUtil.loads(
                PipelineResult.get_latest(
                    dbms_id, hw_id,
                    PipelineTaskType.RANKED_KNOBS).value)[:10]  # FIXME
            pruned_metrics = JSONUtil.loads(
                PipelineResult.get_latest(
                    dbms_id, hw_id, PipelineTaskType.PRUNED_METRICS).value)
            knob_idxs = [
                i for i in range(X_matrix.shape[1])
                if X_columnlabels[i] in ranked_knobs
            ]
            metric_idxs = [
                i for i in range(y_matrix.shape[1])
                if y_columnlabels[i] in pruned_metrics
            ]
            X_matrix = X_matrix[:, knob_idxs]
            X_columnlabels = X_columnlabels[knob_idxs]
            y_matrix = y_matrix[:, metric_idxs]
            y_columnlabels = y_columnlabels[metric_idxs]

            # Combine duplicate rows
            X_matrix, y_matrix, rowlabels = DataUtil.combine_duplicate_rows(
                X_matrix, y_matrix, rowlabels)
            cluster_data[cluster] = {
                'X_matrix': X_matrix,
                'y_matrix': y_matrix,
                'X_columnlabels': X_columnlabels,
                'y_columnlabels': y_columnlabels,
                'rowlabels': rowlabels,
            }

        Xs = np.vstack([entry['X_matrix'] for entry in cluster_data.values()])
        ys = np.vstack([entry['y_matrix'] for entry in cluster_data.values()])

        X_scaler = StandardScaler(copy=False)
        X_scaler.fit(Xs)
        y_scaler = StandardScaler(copy=False)
        y_scaler.fit_transform(ys)
        y_binner = Bin(axis=0)
        y_binner.fit(ys)
        del Xs
        del ys

        task_name = PipelineTaskType.TYPE_NAMES[
            PipelineTaskType.WORKLOAD_MAPPING_DATA].replace(' ', '').upper()
        timestamp = data.creation_timestamp
        tsf = timestamp.strftime("%Y%m%d-%H%M%S")
        savepaths = {}
        for cluster, entry in cluster_data.iteritems():
            X_scaler.transform(entry['X_matrix'])
            y_scaler.transform(entry['y_matrix'])
            fname = '{}_{}_{}_{}_{}.npz'.format(task_name, dbms_id, hw_id,
                                                cluster, tsf)
            savepath = os.path.join(PIPELINE_DIR, fname)
            savepaths[cluster] = savepath
            np.savez_compressed(savepath, **entry)

        X_scaler_path = os.path.join(
            PIPELINE_DIR,
            '{}_XSCALER_{}_{}_{}.npz'.format(task_name, dbms_id, hw_id, tsf))
        np.savez_compressed(X_scaler_path,
                            mean=X_scaler.mean_,
                            scale=X_scaler.scale_)
        y_scaler_path = os.path.join(
            PIPELINE_DIR,
            '{}_YSCALER_{}_{}_{}.npz'.format(task_name, dbms_id, hw_id, tsf))
        np.savez_compressed(y_scaler_path,
                            mean=y_scaler.mean_,
                            scale=y_scaler.scale_)
        y_deciles_path = os.path.join(
            PIPELINE_DIR,
            '{}_YDECILES_{}_{}_{}.npz'.format(task_name, dbms_id, hw_id, tsf))
        np.savez_compressed(y_deciles_path, deciles=y_binner.deciles_)

        value = {
            'data': savepaths,
            'X_scaler': X_scaler_path,
            'y_scaler': y_scaler_path,
            'y_deciles': y_deciles_path,
            'X_columnlabels':
            cluster_data.values()[0]['X_columnlabels'].tolist(),
            'y_columnlabels':
            cluster_data.values()[0]['y_columnlabels'].tolist(),
        }

        new_res = PipelineResult()
        new_res.dbms = DBMSCatalog.objects.get(pk=dbms_id)
        new_res.hardware = Hardware.objects.get(pk=hw_id)
        new_res.creation_timestamp = timestamp
        new_res.task_type = PipelineTaskType.WORKLOAD_MAPPING_DATA
        new_res.value = JSONUtil.dumps(value, pprint=True)
        new_res.save()
def run_knob_identification(knob_data, metric_data, dbms):
    # Performs knob identification on the knob & metric data and returns
    # a set of ranked knobs.
    #
    # Parameters:
    #   knob_data & metric_data are dictionaries of the form:
    #     - 'data': 2D numpy matrix of knob/metric data
    #     - 'rowlabels': a list of identifiers for the rows in the matrix
    #     - 'columnlabels': a list of the knob/metric names corresponding
    #           to the columns in the data matrix
    #   dbms is the foreign key pointing to target dbms in DBMSCatalog
    #
    # When running the lasso algorithm, the knob_data matrix is set of
    # independent variables (X) and the metric_data is the set of
    # dependent variables (y).

    knob_matrix = knob_data['data']
    knob_columnlabels = knob_data['columnlabels']

    metric_matrix = metric_data['data']
    metric_columnlabels = metric_data['columnlabels']

    # remove constant columns from knob_matrix and metric_matrix
    nonconst_knob_matrix = []
    nonconst_knob_columnlabels = []

    for col, cl in zip(knob_matrix.T, knob_columnlabels):
        if np.any(col != col[0]):
            nonconst_knob_matrix.append(col.reshape(-1, 1))
            nonconst_knob_columnlabels.append(cl)
    assert len(nonconst_knob_matrix) > 0, "Need more data to train the model"
    nonconst_knob_matrix = np.hstack(nonconst_knob_matrix)

    nonconst_metric_matrix = []
    nonconst_metric_columnlabels = []

    for col, cl in zip(metric_matrix.T, metric_columnlabels):
        if np.any(col != col[0]):
            nonconst_metric_matrix.append(col.reshape(-1, 1))
            nonconst_metric_columnlabels.append(cl)
    nonconst_metric_matrix = np.hstack(nonconst_metric_matrix)

    # determine which knobs need encoding (enums with >2 possible values)

    categorical_info = DataUtil.dummy_encoder_helper(nonconst_knob_columnlabels,
                                                     dbms)
    # encode categorical variable first (at least, before standardize)
    dummy_encoder = DummyEncoder(categorical_info['n_values'],
                                 categorical_info['categorical_features'],
                                 categorical_info['cat_columnlabels'],
                                 categorical_info['noncat_columnlabels'])
    encoded_knob_matrix = dummy_encoder.fit_transform(
        nonconst_knob_matrix)
    encoded_knob_columnlabels = dummy_encoder.new_labels

    # standardize values in each column to N(0, 1)
    standardizer = StandardScaler()
    standardized_knob_matrix = standardizer.fit_transform(encoded_knob_matrix)
    standardized_metric_matrix = standardizer.fit_transform(nonconst_metric_matrix)

    # shuffle rows (note: same shuffle applied to both knob and metric matrices)
    shuffle_indices = get_shuffle_indices(standardized_knob_matrix.shape[0], seed=17)
    shuffled_knob_matrix = standardized_knob_matrix[shuffle_indices, :]
    shuffled_metric_matrix = standardized_metric_matrix[shuffle_indices, :]

    # run lasso algorithm
    lasso_model = LassoPath()
    lasso_model.fit(shuffled_knob_matrix, shuffled_metric_matrix, encoded_knob_columnlabels)

    # consolidate categorical feature columns, and reset to original names
    encoded_knobs = lasso_model.get_ranked_features()
    consolidated_knobs = consolidate_columnlabels(encoded_knobs)

    return consolidated_knobs
Beispiel #27
0
def train_ddpg(result_id):
    LOG.info('Add training data to ddpg and train ddpg')
    result = Result.objects.get(pk=result_id)
    session = Result.objects.get(pk=result_id).session
    params = JSONUtil.loads(session.hyperparameters)
    session_results = Result.objects.filter(
        session=session, creation_time__lt=result.creation_time)
    result_info = {}
    result_info['newest_result_id'] = result_id

    # Extract data from result and previous results
    result = Result.objects.filter(pk=result_id)
    if len(session_results) == 0:
        base_result_id = result_id
        prev_result_id = result_id
    else:
        base_result_id = session_results[0].pk
        prev_result_id = session_results[len(session_results) - 1].pk
    base_result = Result.objects.filter(pk=base_result_id)
    prev_result = Result.objects.filter(pk=prev_result_id)

    agg_data = DataUtil.aggregate_data(result)
    base_metric_data = (
        DataUtil.aggregate_data(base_result))['y_matrix'].flatten()
    prev_metric_data = (
        DataUtil.aggregate_data(prev_result))['y_matrix'].flatten()

    result = Result.objects.get(pk=result_id)
    target_objective = result.session.target_objective
    prev_obj_idx = [
        i for i, n in enumerate(agg_data['y_columnlabels'])
        if n == target_objective
    ]

    # Clean metric data
    metric_data, metric_labels = clean_metric_data(agg_data['y_matrix'],
                                                   agg_data['y_columnlabels'],
                                                   session)
    metric_data = metric_data.flatten()
    metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
    normalized_metric_data = metric_scalar.transform(metric_data.reshape(
        1, -1))[0]

    # Clean knob data
    cleaned_knob_data = clean_knob_data(agg_data['X_matrix'],
                                        agg_data['X_columnlabels'], session)
    knob_data = np.array(cleaned_knob_data[0])
    knob_labels = np.array(cleaned_knob_data[1])
    knob_bounds = np.vstack(
        DataUtil.get_knob_bounds(knob_labels.flatten(), session))
    knob_data = MinMaxScaler().fit(knob_bounds).transform(knob_data)[0]
    knob_num = len(knob_data)
    metric_num = len(metric_data)
    LOG.info('knob_num: %d, metric_num: %d', knob_num, metric_num)

    # Filter ys by current target objective metric
    target_obj_idx = [
        i for i, n in enumerate(metric_labels) if n == target_objective
    ]
    if len(target_obj_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(
            ('Found {} instances of target objective in '
             'metrics (target_obj={})').format(len(target_obj_idx),
                                               target_objective))
    objective = metric_data[target_obj_idx]
    base_objective = base_metric_data[prev_obj_idx]
    prev_objective = prev_metric_data[prev_obj_idx]
    metric_meta = db.target_objectives.get_metric_metadata(
        result.session.dbms.pk, result.session.target_objective)

    # Calculate the reward
    if params['DDPG_SIMPLE_REWARD']:
        objective = objective / base_objective
        if metric_meta[target_objective].improvement == '(less is better)':
            reward = -objective
        else:
            reward = objective
    else:
        if metric_meta[target_objective].improvement == '(less is better)':
            if objective - base_objective <= 0:  # positive reward
                reward = (np.square((2 * base_objective - objective) / base_objective) - 1)\
                    * abs(2 * prev_objective - objective) / prev_objective
            else:  # negative reward
                reward = -(np.square(objective / base_objective) -
                           1) * objective / prev_objective
        else:
            if objective - base_objective > 0:  # positive reward
                reward = (np.square(objective / base_objective) -
                          1) * objective / prev_objective
            else:  # negative reward
                reward = -(np.square((2 * base_objective - objective) / base_objective) - 1)\
                    * abs(2 * prev_objective - objective) / prev_objective
    LOG.info('reward: %f', reward)

    # Update ddpg
    ddpg = DDPG(n_actions=knob_num,
                n_states=metric_num,
                alr=params['DDPG_ACTOR_LEARNING_RATE'],
                clr=params['DDPG_CRITIC_LEARNING_RATE'],
                gamma=params['DDPG_GAMMA'],
                batch_size=params['DDPG_BATCH_SIZE'],
                a_hidden_sizes=params['DDPG_ACTOR_HIDDEN_SIZES'],
                c_hidden_sizes=params['DDPG_CRITIC_HIDDEN_SIZES'],
                use_default=params['DDPG_USE_DEFAULT'])
    if session.ddpg_actor_model and session.ddpg_critic_model:
        ddpg.set_model(session.ddpg_actor_model, session.ddpg_critic_model)
    if session.ddpg_reply_memory:
        ddpg.replay_memory.set(session.ddpg_reply_memory)
    ddpg.add_sample(normalized_metric_data, knob_data, reward,
                    normalized_metric_data)
    for _ in range(params['DDPG_UPDATE_EPOCHS']):
        ddpg.update()
    session.ddpg_actor_model, session.ddpg_critic_model = ddpg.get_model()
    session.ddpg_reply_memory = ddpg.replay_memory.get()
    session.save()
    return result_info
Beispiel #28
0
def combine_workload(target_data):
    # Load mapped workload data
    mapped_workload_id = target_data['mapped_workload'][0]

    latest_pipeline_run = PipelineRun.objects.get(
        pk=target_data['pipeline_run'])
    mapped_workload = Workload.objects.get(pk=mapped_workload_id)
    workload_knob_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.KNOB_DATA)
    workload_knob_data = JSONUtil.loads(workload_knob_data.data)
    workload_metric_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.METRIC_DATA)
    workload_metric_data = JSONUtil.loads(workload_metric_data.data)

    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    session = newest_result.session
    params = JSONUtil.loads(session.hyperparameters)
    cleaned_workload_knob_data = clean_knob_data(
        workload_knob_data["data"], workload_knob_data["columnlabels"],
        newest_result.session)

    X_workload = np.array(cleaned_workload_knob_data[0])
    X_columnlabels = np.array(cleaned_workload_knob_data[1])
    y_workload = np.array(workload_metric_data['data'])
    y_columnlabels = np.array(workload_metric_data['columnlabels'])
    rowlabels_workload = np.array(workload_metric_data['rowlabels'])

    # Target workload data
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    X_target = target_data['X_matrix']
    y_target = target_data['y_matrix']
    rowlabels_target = np.array(target_data['rowlabels'])

    if not np.array_equal(X_columnlabels, target_data['X_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical X columnlabels (sorted knob names)'),
                        X_columnlabels, target_data['X_columnlabels'])
    if not np.array_equal(y_columnlabels, target_data['y_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical y columnlabels (sorted metric names)'),
                        y_columnlabels, target_data['y_columnlabels'])

    # Filter Xs by top 10 ranked knobs
    ranked_knobs = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.RANKED_KNOBS)
    ranked_knobs = JSONUtil.loads(
        ranked_knobs.data)[:params['IMPORTANT_KNOB_NUMBER']]
    ranked_knob_idxs = [
        i for i, cl in enumerate(X_columnlabels) if cl in ranked_knobs
    ]
    X_workload = X_workload[:, ranked_knob_idxs]
    X_target = X_target[:, ranked_knob_idxs]
    X_columnlabels = X_columnlabels[ranked_knob_idxs]

    # Filter ys by current target objective metric
    target_objective = newest_result.session.target_objective
    target_obj_idx = [
        i for i, cl in enumerate(y_columnlabels) if cl == target_objective
    ]
    if len(target_obj_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(
            ('Found {} instances of target objective in '
             'metrics (target_obj={})').format(len(target_obj_idx),
                                               target_objective))

    y_workload = y_workload[:, target_obj_idx]
    y_target = y_target[:, target_obj_idx]
    y_columnlabels = y_columnlabels[target_obj_idx]

    # Combine duplicate rows in the target/workload data (separately)
    X_workload, y_workload, rowlabels_workload = DataUtil.combine_duplicate_rows(
        X_workload, y_workload, rowlabels_workload)
    X_target, y_target, rowlabels_target = DataUtil.combine_duplicate_rows(
        X_target, y_target, rowlabels_target)

    # Delete any rows that appear in both the workload data and the target
    # data from the workload data
    dups_filter = np.ones(X_workload.shape[0], dtype=bool)
    target_row_tups = [tuple(row) for row in X_target]
    for i, row in enumerate(X_workload):
        if tuple(row) in target_row_tups:
            dups_filter[i] = False
    X_workload = X_workload[dups_filter, :]
    y_workload = y_workload[dups_filter, :]
    rowlabels_workload = rowlabels_workload[dups_filter]

    # Combine target & workload Xs for preprocessing
    X_matrix = np.vstack([X_target, X_workload])

    # Dummy encode categorial variables
    if ENABLE_DUMMY_ENCODER:
        categorical_info = DataUtil.dummy_encoder_helper(
            X_columnlabels, mapped_workload.dbms)
        dummy_encoder = DummyEncoder(categorical_info['n_values'],
                                     categorical_info['categorical_features'],
                                     categorical_info['cat_columnlabels'],
                                     categorical_info['noncat_columnlabels'])
        X_matrix = dummy_encoder.fit_transform(X_matrix)
        binary_encoder = categorical_info['binary_vars']
        # below two variables are needed for correctly determing max/min on dummies
        binary_index_set = set(categorical_info['binary_vars'])
        total_dummies = dummy_encoder.total_dummies()
    else:
        dummy_encoder = None
        binary_encoder = None
        binary_index_set = []
        total_dummies = 0

    # Scale to N(0, 1)
    X_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(X_matrix)
    if y_target.shape[0] < 5:  # FIXME
        # FIXME (dva): if there are fewer than 5 target results so far
        # then scale the y values (metrics) using the workload's
        # y_scaler. I'm not sure if 5 is the right cutoff.
        y_target_scaler = None
        y_workload_scaler = StandardScaler()
        y_matrix = np.vstack([y_target, y_workload])
        y_scaled = y_workload_scaler.fit_transform(y_matrix)
    else:
        # FIXME (dva): otherwise try to compute a separate y_scaler for
        # the target and scale them separately.
        try:
            y_target_scaler = StandardScaler()
            y_workload_scaler = StandardScaler()
            y_target_scaled = y_target_scaler.fit_transform(y_target)
            y_workload_scaled = y_workload_scaler.fit_transform(y_workload)
            y_scaled = np.vstack([y_target_scaled, y_workload_scaled])
        except ValueError:
            y_target_scaler = None
            y_workload_scaler = StandardScaler()
            y_scaled = y_workload_scaler.fit_transform(y_target)

    metric_meta = db.target_objectives.get_metric_metadata(
        newest_result.session.dbms.pk, newest_result.session.target_objective)
    lessisbetter = metric_meta[
        target_objective].improvement == db.target_objectives.LESS_IS_BETTER
    # Maximize the throughput, moreisbetter
    # Use gradient descent to minimize -throughput
    if not lessisbetter:
        y_scaled = -y_scaled

    # Set up constraint helper
    constraint_helper = ParamConstraintHelper(
        scaler=X_scaler,
        encoder=dummy_encoder,
        binary_vars=binary_encoder,
        init_flip_prob=params['INIT_FLIP_PROB'],
        flip_prob_decay=params['FLIP_PROB_DECAY'])

    # FIXME (dva): check if these are good values for the ridge
    # ridge = np.empty(X_scaled.shape[0])
    # ridge[:X_target.shape[0]] = 0.01
    # ridge[X_target.shape[0]:] = 0.1

    X_min = np.empty(X_scaled.shape[1])
    X_max = np.empty(X_scaled.shape[1])
    X_scaler_matrix = np.zeros([1, X_scaled.shape[1]])

    session_knobs = SessionKnob.objects.get_knobs_for_session(
        newest_result.session)

    # Set min/max for knob values
    for i in range(X_scaled.shape[1]):
        if i < total_dummies or i in binary_index_set:
            col_min = 0
            col_max = 1
        else:
            col_min = X_scaled[:, i].min()
            col_max = X_scaled[:, i].max()
            for knob in session_knobs:
                if X_columnlabels[i] == knob["name"]:
                    X_scaler_matrix[0][i] = knob["minval"]
                    col_min = X_scaler.transform(X_scaler_matrix)[0][i]
                    X_scaler_matrix[0][i] = knob["maxval"]
                    col_max = X_scaler.transform(X_scaler_matrix)[0][i]
        X_min[i] = col_min
        X_max[i] = col_max

    return X_columnlabels, X_scaler, X_scaled, y_scaled, X_max, X_min,\
        dummy_encoder, constraint_helper
Beispiel #29
0
def train_ddpg(result_id):
    LOG.info('Add training data to ddpg and train ddpg')
    result = Result.objects.get(pk=result_id)
    session = Result.objects.get(pk=result_id).session
    session_results = Result.objects.filter(
        session=session, creation_time__lt=result.creation_time)
    result_info = {}
    result_info['newest_result_id'] = result_id
    if len(session_results) == 0:
        LOG.info('No previous result. Abort.')
        return result_info

    # Extract data from result
    result = Result.objects.filter(pk=result_id)
    base_result_id = session_results[0].pk
    base_result = Result.objects.filter(pk=base_result_id)

    agg_data = DataUtil.aggregate_data(result)
    metric_data = agg_data['y_matrix'].flatten()
    base_metric_data = (
        DataUtil.aggregate_data(base_result))['y_matrix'].flatten()
    metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
    normalized_metric_data = metric_scalar.transform(metric_data.reshape(
        1, -1))[0]

    # Clean knob data
    cleaned_knob_data = clean_knob_data(agg_data['X_matrix'],
                                        agg_data['X_columnlabels'], session)
    knob_data = np.array(cleaned_knob_data[0])
    knob_labels = np.array(cleaned_knob_data[1])
    knob_bounds = np.vstack(
        DataUtil.get_knob_bounds(knob_labels.flatten(), session))
    knob_data = MinMaxScaler().fit(knob_bounds).transform(knob_data)[0]
    knob_num = len(knob_data)
    metric_num = len(metric_data)
    LOG.info('knob_num: %d, metric_num: %d', knob_num, metric_num)

    # Filter ys by current target objective metric
    result = Result.objects.get(pk=result_id)
    target_objective = result.session.target_objective
    target_obj_idx = [
        i for i, n in enumerate(agg_data['y_columnlabels'])
        if n == target_objective
    ]
    if len(target_obj_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(
            ('Found {} instances of target objective in '
             'metrics (target_obj={})').format(len(target_obj_idx),
                                               target_objective))
    objective = metric_data[target_obj_idx]
    base_objective = base_metric_data[target_obj_idx]
    metric_meta = db.target_objectives.get_metric_metadata(
        result.session.dbms.pk, result.session.target_objective)

    # Calculate the reward
    objective = objective / base_objective
    if metric_meta[target_objective].improvement == '(less is better)':
        reward = -objective
    else:
        reward = objective
    LOG.info('reward: %f', reward)

    # Update ddpg
    ddpg = DDPG(n_actions=knob_num,
                n_states=metric_num,
                alr=ACTOR_LEARNING_RATE,
                clr=CRITIC_LEARNING_RATE,
                gamma=0,
                batch_size=DDPG_BATCH_SIZE)
    if session.ddpg_actor_model and session.ddpg_critic_model:
        ddpg.set_model(session.ddpg_actor_model, session.ddpg_critic_model)
    if session.ddpg_reply_memory:
        ddpg.replay_memory.set(session.ddpg_reply_memory)
    ddpg.add_sample(normalized_metric_data, knob_data, reward,
                    normalized_metric_data)
    for _ in range(25):
        ddpg.update()
    session.ddpg_actor_model, session.ddpg_critic_model = ddpg.get_model()
    session.ddpg_reply_memory = ddpg.replay_memory.get()
    session.save()
    return result_info
Beispiel #30
0
def configuration_recommendation(target_data):
    LOG.info('configuration_recommendation called')
    latest_pipeline_run = PipelineRun.objects.get_latest()

    if target_data['bad'] is True:
        target_data_res = {}
        target_data_res['status'] = 'bad'
        target_data_res['info'] = 'WARNING: no training data, the config is generated randomly'
        target_data_res['recommendation'] = target_data['config_recommend']
        return target_data_res

    # Load mapped workload data
    mapped_workload_id = target_data['mapped_workload'][0]

    mapped_workload = Workload.objects.get(pk=mapped_workload_id)
    workload_knob_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.KNOB_DATA)
    workload_knob_data = JSONUtil.loads(workload_knob_data.data)
    workload_metric_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.METRIC_DATA)
    workload_metric_data = JSONUtil.loads(workload_metric_data.data)

    X_workload = np.array(workload_knob_data['data'])
    X_columnlabels = np.array(workload_knob_data['columnlabels'])
    y_workload = np.array(workload_metric_data['data'])
    y_columnlabels = np.array(workload_metric_data['columnlabels'])
    rowlabels_workload = np.array(workload_metric_data['rowlabels'])

    # Target workload data
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    X_target = target_data['X_matrix']
    y_target = target_data['y_matrix']
    rowlabels_target = np.array(target_data['rowlabels'])

    if not np.array_equal(X_columnlabels, target_data['X_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical X columnlabels (sorted knob names)'))
    if not np.array_equal(y_columnlabels, target_data['y_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical y columnlabels (sorted metric names)'))

    # Filter Xs by top 10 ranked knobs
    ranked_knobs = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.RANKED_KNOBS)
    ranked_knobs = JSONUtil.loads(ranked_knobs.data)[:IMPORTANT_KNOB_NUMBER]
    ranked_knob_idxs = [i for i, cl in enumerate(X_columnlabels) if cl in ranked_knobs]
    X_workload = X_workload[:, ranked_knob_idxs]
    X_target = X_target[:, ranked_knob_idxs]
    X_columnlabels = X_columnlabels[ranked_knob_idxs]

    # Filter ys by current target objective metric
    target_objective = newest_result.session.target_objective
    target_obj_idx = [i for i, cl in enumerate(y_columnlabels) if cl == target_objective]
    if len(target_obj_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(('Found {} instances of target objective in '
                         'metrics (target_obj={})').format(len(target_obj_idx),
                                                           target_objective))

    metric_meta = MetricCatalog.objects.get_metric_meta(newest_result.session.dbms,
                                                        newest_result.session.target_objective)
    if metric_meta[target_objective] == '(less is better)':
        lessisbetter = True
    else:
        lessisbetter = False

    y_workload = y_workload[:, target_obj_idx]
    y_target = y_target[:, target_obj_idx]
    y_columnlabels = y_columnlabels[target_obj_idx]

    # Combine duplicate rows in the target/workload data (separately)
    X_workload, y_workload, rowlabels_workload = DataUtil.combine_duplicate_rows(
        X_workload, y_workload, rowlabels_workload)
    X_target, y_target, rowlabels_target = DataUtil.combine_duplicate_rows(
        X_target, y_target, rowlabels_target)

    # Delete any rows that appear in both the workload data and the target
    # data from the workload data
    dups_filter = np.ones(X_workload.shape[0], dtype=bool)
    target_row_tups = [tuple(row) for row in X_target]
    for i, row in enumerate(X_workload):
        if tuple(row) in target_row_tups:
            dups_filter[i] = False
    X_workload = X_workload[dups_filter, :]
    y_workload = y_workload[dups_filter, :]
    rowlabels_workload = rowlabels_workload[dups_filter]

    # Combine target & workload Xs for preprocessing
    X_matrix = np.vstack([X_target, X_workload])

    # Dummy encode categorial variables
    categorical_info = DataUtil.dummy_encoder_helper(X_columnlabels,
                                                     mapped_workload.dbms)
    dummy_encoder = DummyEncoder(categorical_info['n_values'],
                                 categorical_info['categorical_features'],
                                 categorical_info['cat_columnlabels'],
                                 categorical_info['noncat_columnlabels'])
    X_matrix = dummy_encoder.fit_transform(X_matrix)

    # below two variables are needed for correctly determing max/min on dummies
    binary_index_set = set(categorical_info['binary_vars'])
    total_dummies = dummy_encoder.total_dummies()

    # Scale to N(0, 1)
    X_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(X_matrix)
    if y_target.shape[0] < 5:  # FIXME
        # FIXME (dva): if there are fewer than 5 target results so far
        # then scale the y values (metrics) using the workload's
        # y_scaler. I'm not sure if 5 is the right cutoff.
        y_target_scaler = None
        y_workload_scaler = StandardScaler()
        y_matrix = np.vstack([y_target, y_workload])
        y_scaled = y_workload_scaler.fit_transform(y_matrix)
    else:
        # FIXME (dva): otherwise try to compute a separate y_scaler for
        # the target and scale them separately.
        try:
            y_target_scaler = StandardScaler()
            y_workload_scaler = StandardScaler()
            y_target_scaled = y_target_scaler.fit_transform(y_target)
            y_workload_scaled = y_workload_scaler.fit_transform(y_workload)
            y_scaled = np.vstack([y_target_scaled, y_workload_scaled])
        except ValueError:
            y_target_scaler = None
            y_workload_scaler = StandardScaler()
            y_scaled = y_workload_scaler.fit_transform(y_target)

    # Set up constraint helper
    constraint_helper = ParamConstraintHelper(scaler=X_scaler,
                                              encoder=dummy_encoder,
                                              binary_vars=categorical_info['binary_vars'],
                                              init_flip_prob=INIT_FLIP_PROB,
                                              flip_prob_decay=FLIP_PROB_DECAY)

    # FIXME (dva): check if these are good values for the ridge
    # ridge = np.empty(X_scaled.shape[0])
    # ridge[:X_target.shape[0]] = 0.01
    # ridge[X_target.shape[0]:] = 0.1

    # FIXME: we should generate more samples and use a smarter sampling
    # technique
    num_samples = NUM_SAMPLES
    X_samples = np.empty((num_samples, X_scaled.shape[1]))
    X_min = np.empty(X_scaled.shape[1])
    X_max = np.empty(X_scaled.shape[1])
    knobs_mem = KnobCatalog.objects.filter(
        dbms=newest_result.session.dbms, tunable=True, resource=1)
    knobs_mem_catalog = {k.name: k for k in knobs_mem}
    mem_max = newest_result.workload.hardware.memory
    X_mem = np.zeros([1, X_scaled.shape[1]])
    X_default = np.empty(X_scaled.shape[1])

    # Get default knob values
    for i, k_name in enumerate(X_columnlabels):
        k = KnobCatalog.objects.filter(dbms=newest_result.session.dbms, name=k_name)[0]
        X_default[i] = k.default

    X_default_scaled = X_scaler.transform(X_default.reshape(1, X_default.shape[0]))[0]

    # Determine min/max for knob values
    for i in range(X_scaled.shape[1]):
        if i < total_dummies or i in binary_index_set:
            col_min = 0
            col_max = 1
        else:
            col_min = X_scaled[:, i].min()
            col_max = X_scaled[:, i].max()
            if X_columnlabels[i] in knobs_mem_catalog:
                X_mem[0][i] = mem_max * 1024 * 1024 * 1024  # mem_max GB
                col_max = X_scaler.transform(X_mem)[0][i]

            # Set min value to the default value
            # FIXME: support multiple methods can be selected by users
            col_min = X_default_scaled[i]

        X_min[i] = col_min
        X_max[i] = col_max
        X_samples[:, i] = np.random.rand(num_samples) * (col_max - col_min) + col_min

    # Maximize the throughput, moreisbetter
    # Use gradient descent to minimize -throughput
    if not lessisbetter:
        y_scaled = -y_scaled

    q = queue.PriorityQueue()
    for x in range(0, y_scaled.shape[0]):
        q.put((y_scaled[x][0], x))

    i = 0
    while i < TOP_NUM_CONFIG:
        try:
            item = q.get_nowait()
            # Tensorflow get broken if we use the training data points as
            # starting points for GPRGD. We add a small bias for the
            # starting points. GPR_EPS default value is 0.001
            X_samples = np.vstack((X_samples, X_scaled[item[1]] + GPR_EPS))
            i = i + 1
        except queue.Empty:
            break

    model = GPRGD(length_scale=DEFAULT_LENGTH_SCALE,
                  magnitude=DEFAULT_MAGNITUDE,
                  max_train_size=MAX_TRAIN_SIZE,
                  batch_size=BATCH_SIZE,
                  num_threads=NUM_THREADS,
                  learning_rate=DEFAULT_LEARNING_RATE,
                  epsilon=DEFAULT_EPSILON,
                  max_iter=MAX_ITER,
                  sigma_multiplier=DEFAULT_SIGMA_MULTIPLIER,
                  mu_multiplier=DEFAULT_MU_MULTIPLIER)
    model.fit(X_scaled, y_scaled, X_min, X_max, ridge=DEFAULT_RIDGE)
    res = model.predict(X_samples, constraint_helper=constraint_helper)

    best_config_idx = np.argmin(res.minl.ravel())
    best_config = res.minl_conf[best_config_idx, :]
    best_config = X_scaler.inverse_transform(best_config)
    # Decode one-hot encoding into categorical knobs
    best_config = dummy_encoder.inverse_transform(best_config)

    # Although we have max/min limits in the GPRGD training session, it may
    # lose some precisions. e.g. 0.99..99 >= 1.0 may be True on the scaled data,
    # when we inversely transform the scaled data, the different becomes much larger
    # and cannot be ignored. Here we check the range on the original data
    # directly, and make sure the recommended config lies within the range
    X_min_inv = X_scaler.inverse_transform(X_min)
    X_max_inv = X_scaler.inverse_transform(X_max)
    best_config = np.minimum(best_config, X_max_inv)
    best_config = np.maximum(best_config, X_min_inv)

    conf_map = {k: best_config[i] for i, k in enumerate(X_columnlabels)}
    conf_map_res = {}
    conf_map_res['status'] = 'good'
    conf_map_res['recommendation'] = conf_map
    conf_map_res['info'] = 'INFO: training data size is {}'.format(X_scaled.shape[0])
    return conf_map_res
Beispiel #31
0
def map_workload(target_data):
    # Get the latest version of pipeline data that's been computed so far.
    latest_pipeline_run = PipelineRun.objects.get_latest()
    if target_data['bad']:
        assert target_data is not None
        return target_data
    assert latest_pipeline_run is not None

    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    target_workload = newest_result.workload
    X_columnlabels = np.array(target_data['X_columnlabels'])
    y_columnlabels = np.array(target_data['y_columnlabels'])

    # Find all pipeline data belonging to the latest version with the same
    # DBMS and hardware as the target
    pipeline_data = PipelineData.objects.filter(
        pipeline_run=latest_pipeline_run,
        workload__dbms=target_workload.dbms,
        workload__hardware=target_workload.hardware)

    # FIXME (dva): we should also compute the global (i.e., overall) ranked_knobs
    # and pruned metrics but we just use those from the first workload for now
    initialized = False
    global_ranked_knobs = None
    global_pruned_metrics = None
    ranked_knob_idxs = None
    pruned_metric_idxs = None

    # Compute workload mapping data for each unique workload
    unique_workloads = pipeline_data.values_list('workload', flat=True).distinct()
    assert len(unique_workloads) > 0
    workload_data = {}
    for unique_workload in unique_workloads:
        # Load knob & metric data for this workload
        knob_data = load_data_helper(pipeline_data, unique_workload, PipelineTaskType.KNOB_DATA)

        metric_data = load_data_helper(pipeline_data, unique_workload, PipelineTaskType.METRIC_DATA)
        X_matrix = np.array(knob_data["data"])
        y_matrix = np.array(metric_data["data"])
        rowlabels = np.array(knob_data["rowlabels"])
        assert np.array_equal(rowlabels, metric_data["rowlabels"])

        if not initialized:
            # For now set ranked knobs & pruned metrics to be those computed
            # for the first workload
            global_ranked_knobs = load_data_helper(
                pipeline_data, unique_workload,
                PipelineTaskType.RANKED_KNOBS)[:IMPORTANT_KNOB_NUMBER]
            global_pruned_metrics = load_data_helper(
                pipeline_data, unique_workload, PipelineTaskType.PRUNED_METRICS)
            ranked_knob_idxs = [i for i in range(X_matrix.shape[1]) if X_columnlabels[
                i] in global_ranked_knobs]
            pruned_metric_idxs = [i for i in range(y_matrix.shape[1]) if y_columnlabels[
                i] in global_pruned_metrics]

            # Filter X & y columnlabels by top ranked_knobs & pruned_metrics
            X_columnlabels = X_columnlabels[ranked_knob_idxs]
            y_columnlabels = y_columnlabels[pruned_metric_idxs]
            initialized = True

        # Filter X & y matrices by top ranked_knobs & pruned_metrics
        X_matrix = X_matrix[:, ranked_knob_idxs]
        y_matrix = y_matrix[:, pruned_metric_idxs]

        # Combine duplicate rows (rows with same knob settings)
        X_matrix, y_matrix, rowlabels = DataUtil.combine_duplicate_rows(
            X_matrix, y_matrix, rowlabels)

        workload_data[unique_workload] = {
            'X_matrix': X_matrix,
            'y_matrix': y_matrix,
            'rowlabels': rowlabels,
        }

    # Stack all X & y matrices for preprocessing
    Xs = np.vstack([entry['X_matrix'] for entry in list(workload_data.values())])
    ys = np.vstack([entry['y_matrix'] for entry in list(workload_data.values())])

    # Scale the X & y values, then compute the deciles for each column in y
    X_scaler = StandardScaler(copy=False)
    X_scaler.fit(Xs)
    y_scaler = StandardScaler(copy=False)
    y_scaler.fit_transform(ys)
    y_binner = Bin(bin_start=1, axis=0)
    y_binner.fit(ys)
    del Xs
    del ys

    # Filter the target's X & y data by the ranked knobs & pruned metrics.
    X_target = target_data['X_matrix'][:, ranked_knob_idxs]
    y_target = target_data['y_matrix'][:, pruned_metric_idxs]

    # Now standardize the target's data and bin it by the deciles we just
    # calculated
    X_target = X_scaler.transform(X_target)
    y_target = y_scaler.transform(y_target)
    y_target = y_binner.transform(y_target)

    scores = {}
    for workload_id, workload_entry in list(workload_data.items()):
        predictions = np.empty_like(y_target)
        X_workload = workload_entry['X_matrix']
        X_scaled = X_scaler.transform(X_workload)
        y_workload = workload_entry['y_matrix']
        y_scaled = y_scaler.transform(y_workload)
        for j, y_col in enumerate(y_scaled.T):
            # Using this workload's data, train a Gaussian process model
            # and then predict the performance of each metric for each of
            # the knob configurations attempted so far by the target.
            y_col = y_col.reshape(-1, 1)
            model = GPRNP(length_scale=DEFAULT_LENGTH_SCALE,
                          magnitude=DEFAULT_MAGNITUDE,
                          max_train_size=MAX_TRAIN_SIZE,
                          batch_size=BATCH_SIZE)
            model.fit(X_scaled, y_col, ridge=DEFAULT_RIDGE)
            predictions[:, j] = model.predict(X_target).ypreds.ravel()
        # Bin each of the predicted metric columns by deciles and then
        # compute the score (i.e., distance) between the target workload
        # and each of the known workloads
        predictions = y_binner.transform(predictions)
        dists = np.sqrt(np.sum(np.square(
            np.subtract(predictions, y_target)), axis=1))
        scores[workload_id] = np.mean(dists)

    # Find the best (minimum) score
    best_score = np.inf
    best_workload_id = None
    for workload_id, similarity_score in list(scores.items()):
        if similarity_score < best_score:
            best_score = similarity_score
            best_workload_id = workload_id
    target_data['mapped_workload'] = (best_workload_id, best_score)
    target_data['scores'] = scores
    return target_data
Beispiel #32
0
def configuration_recommendation(target_data):
    LOG.info('configuration_recommendation called')
    latest_pipeline_run = PipelineRun.objects.get_latest()

    if target_data['bad'] is True:
        target_data_res = {}
        target_data_res['status'] = 'bad'
        target_data_res[
            'info'] = 'WARNING: no training data, the config is generated randomly'
        target_data_res['recommendation'] = target_data['config_recommend']
        return target_data_res

    # Load mapped workload data
    mapped_workload_id = target_data['mapped_workload'][0]

    mapped_workload = Workload.objects.get(pk=mapped_workload_id)
    workload_knob_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.KNOB_DATA)
    workload_knob_data = JSONUtil.loads(workload_knob_data.data)
    workload_metric_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.METRIC_DATA)
    workload_metric_data = JSONUtil.loads(workload_metric_data.data)

    X_workload = np.array(workload_knob_data['data'])
    X_columnlabels = np.array(workload_knob_data['columnlabels'])
    y_workload = np.array(workload_metric_data['data'])
    y_columnlabels = np.array(workload_metric_data['columnlabels'])
    rowlabels_workload = np.array(workload_metric_data['rowlabels'])

    # Target workload data
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    X_target = target_data['X_matrix']
    y_target = target_data['y_matrix']
    rowlabels_target = np.array(target_data['rowlabels'])

    if not np.array_equal(X_columnlabels, target_data['X_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical X columnlabels (sorted knob names)'))
    if not np.array_equal(y_columnlabels, target_data['y_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical y columnlabels (sorted metric names)'))

    # Filter Xs by top 10 ranked knobs
    ranked_knobs = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.RANKED_KNOBS)
    ranked_knobs = JSONUtil.loads(ranked_knobs.data)[:IMPORTANT_KNOB_NUMBER]
    ranked_knob_idxs = [
        i for i, cl in enumerate(X_columnlabels) if cl in ranked_knobs
    ]
    X_workload = X_workload[:, ranked_knob_idxs]
    X_target = X_target[:, ranked_knob_idxs]
    X_columnlabels = X_columnlabels[ranked_knob_idxs]

    # Filter ys by current target objective metric
    target_objective = newest_result.session.target_objective
    target_obj_idx = [
        i for i, cl in enumerate(y_columnlabels) if cl == target_objective
    ]
    if len(target_obj_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(
            ('Found {} instances of target objective in '
             'metrics (target_obj={})').format(len(target_obj_idx),
                                               target_objective))

    metric_meta = MetricCatalog.objects.get_metric_meta(
        newest_result.session.dbms, newest_result.session.target_objective)
    if metric_meta[target_objective].improvement == '(less is better)':
        lessisbetter = True
    else:
        lessisbetter = False

    y_workload = y_workload[:, target_obj_idx]
    y_target = y_target[:, target_obj_idx]
    y_columnlabels = y_columnlabels[target_obj_idx]

    # Combine duplicate rows in the target/workload data (separately)
    X_workload, y_workload, rowlabels_workload = DataUtil.combine_duplicate_rows(
        X_workload, y_workload, rowlabels_workload)
    X_target, y_target, rowlabels_target = DataUtil.combine_duplicate_rows(
        X_target, y_target, rowlabels_target)

    # Delete any rows that appear in both the workload data and the target
    # data from the workload data
    dups_filter = np.ones(X_workload.shape[0], dtype=bool)
    target_row_tups = [tuple(row) for row in X_target]
    for i, row in enumerate(X_workload):
        if tuple(row) in target_row_tups:
            dups_filter[i] = False
    X_workload = X_workload[dups_filter, :]
    y_workload = y_workload[dups_filter, :]
    rowlabels_workload = rowlabels_workload[dups_filter]

    # Combine target & workload Xs for preprocessing
    X_matrix = np.vstack([X_target, X_workload])

    # Dummy encode categorial variables
    categorical_info = DataUtil.dummy_encoder_helper(X_columnlabels,
                                                     mapped_workload.dbms)
    dummy_encoder = DummyEncoder(categorical_info['n_values'],
                                 categorical_info['categorical_features'],
                                 categorical_info['cat_columnlabels'],
                                 categorical_info['noncat_columnlabels'])
    X_matrix = dummy_encoder.fit_transform(X_matrix)

    # below two variables are needed for correctly determing max/min on dummies
    binary_index_set = set(categorical_info['binary_vars'])
    total_dummies = dummy_encoder.total_dummies()

    # Scale to N(0, 1)
    X_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(X_matrix)
    if y_target.shape[0] < 5:  # FIXME
        # FIXME (dva): if there are fewer than 5 target results so far
        # then scale the y values (metrics) using the workload's
        # y_scaler. I'm not sure if 5 is the right cutoff.
        y_target_scaler = None
        y_workload_scaler = StandardScaler()
        y_matrix = np.vstack([y_target, y_workload])
        y_scaled = y_workload_scaler.fit_transform(y_matrix)
    else:
        # FIXME (dva): otherwise try to compute a separate y_scaler for
        # the target and scale them separately.
        try:
            y_target_scaler = StandardScaler()
            y_workload_scaler = StandardScaler()
            y_target_scaled = y_target_scaler.fit_transform(y_target)
            y_workload_scaled = y_workload_scaler.fit_transform(y_workload)
            y_scaled = np.vstack([y_target_scaled, y_workload_scaled])
        except ValueError:
            y_target_scaler = None
            y_workload_scaler = StandardScaler()
            y_scaled = y_workload_scaler.fit_transform(y_target)

    # Set up constraint helper
    constraint_helper = ParamConstraintHelper(
        scaler=X_scaler,
        encoder=dummy_encoder,
        binary_vars=categorical_info['binary_vars'],
        init_flip_prob=INIT_FLIP_PROB,
        flip_prob_decay=FLIP_PROB_DECAY)

    # FIXME (dva): check if these are good values for the ridge
    # ridge = np.empty(X_scaled.shape[0])
    # ridge[:X_target.shape[0]] = 0.01
    # ridge[X_target.shape[0]:] = 0.1

    # FIXME: we should generate more samples and use a smarter sampling
    # technique
    num_samples = NUM_SAMPLES
    X_samples = np.empty((num_samples, X_scaled.shape[1]))
    X_min = np.empty(X_scaled.shape[1])
    X_max = np.empty(X_scaled.shape[1])
    knobs_mem = KnobCatalog.objects.filter(dbms=newest_result.session.dbms,
                                           tunable=True,
                                           resource=1)
    knobs_mem_catalog = {k.name: k for k in knobs_mem}
    mem_max = newest_result.workload.hardware.memory
    X_mem = np.zeros([1, X_scaled.shape[1]])
    X_default = np.empty(X_scaled.shape[1])

    # Get default knob values
    for i, k_name in enumerate(X_columnlabels):
        k = KnobCatalog.objects.filter(dbms=newest_result.session.dbms,
                                       name=k_name)[0]
        X_default[i] = k.default

    X_default_scaled = X_scaler.transform(
        X_default.reshape(1, X_default.shape[0]))[0]

    # Determine min/max for knob values
    for i in range(X_scaled.shape[1]):
        if i < total_dummies or i in binary_index_set:
            col_min = 0
            col_max = 1
        else:
            col_min = X_scaled[:, i].min()
            col_max = X_scaled[:, i].max()
            if X_columnlabels[i] in knobs_mem_catalog:
                X_mem[0][i] = mem_max * 1024 * 1024 * 1024  # mem_max GB
                col_max = min(col_max, X_scaler.transform(X_mem)[0][i])

            # Set min value to the default value
            # FIXME: support multiple methods can be selected by users
            col_min = X_default_scaled[i]

        X_min[i] = col_min
        X_max[i] = col_max
        X_samples[:, i] = np.random.rand(num_samples) * (col_max -
                                                         col_min) + col_min

    # Maximize the throughput, moreisbetter
    # Use gradient descent to minimize -throughput
    if not lessisbetter:
        y_scaled = -y_scaled

    q = queue.PriorityQueue()
    for x in range(0, y_scaled.shape[0]):
        q.put((y_scaled[x][0], x))

    i = 0
    while i < TOP_NUM_CONFIG:
        try:
            item = q.get_nowait()
            # Tensorflow get broken if we use the training data points as
            # starting points for GPRGD. We add a small bias for the
            # starting points. GPR_EPS default value is 0.001
            # if the starting point is X_max, we minus a small bias to
            # make sure it is within the range.
            dist = sum(np.square(X_max - X_scaled[item[1]]))
            if dist < 0.001:
                X_samples = np.vstack(
                    (X_samples, X_scaled[item[1]] - abs(GPR_EPS)))
            else:
                X_samples = np.vstack(
                    (X_samples, X_scaled[item[1]] + abs(GPR_EPS)))
            i = i + 1
        except queue.Empty:
            break

    model = GPRGD(length_scale=DEFAULT_LENGTH_SCALE,
                  magnitude=DEFAULT_MAGNITUDE,
                  max_train_size=MAX_TRAIN_SIZE,
                  batch_size=BATCH_SIZE,
                  num_threads=NUM_THREADS,
                  learning_rate=DEFAULT_LEARNING_RATE,
                  epsilon=DEFAULT_EPSILON,
                  max_iter=MAX_ITER,
                  sigma_multiplier=DEFAULT_SIGMA_MULTIPLIER,
                  mu_multiplier=DEFAULT_MU_MULTIPLIER)
    model.fit(X_scaled, y_scaled, X_min, X_max, ridge=DEFAULT_RIDGE)
    res = model.predict(X_samples, constraint_helper=constraint_helper)

    best_config_idx = np.argmin(res.minl.ravel())
    best_config = res.minl_conf[best_config_idx, :]
    best_config = X_scaler.inverse_transform(best_config)
    # Decode one-hot encoding into categorical knobs
    best_config = dummy_encoder.inverse_transform(best_config)

    # Although we have max/min limits in the GPRGD training session, it may
    # lose some precisions. e.g. 0.99..99 >= 1.0 may be True on the scaled data,
    # when we inversely transform the scaled data, the different becomes much larger
    # and cannot be ignored. Here we check the range on the original data
    # directly, and make sure the recommended config lies within the range
    X_min_inv = X_scaler.inverse_transform(X_min)
    X_max_inv = X_scaler.inverse_transform(X_max)
    best_config = np.minimum(best_config, X_max_inv)
    best_config = np.maximum(best_config, X_min_inv)

    conf_map = {k: best_config[i] for i, k in enumerate(X_columnlabels)}
    conf_map_res = {}
    conf_map_res['status'] = 'good'
    conf_map_res['recommendation'] = conf_map
    conf_map_res['info'] = 'INFO: training data size is {}'.format(
        X_scaled.shape[0])
    return conf_map_res
Beispiel #33
0
def configuration_recommendation(recommendation_input):
    target_data, algorithm = recommendation_input
    LOG.info('configuration_recommendation called')

    if target_data['bad'] is True:
        target_data_res = dict(
            status='bad',
            result_id=target_data['newest_result_id'],
            info='WARNING: no training data, the config is generated randomly',
            recommendation=target_data['config_recommend'],
            pipeline_run=target_data['pipeline_run'])
        LOG.debug('%s: Skipping configuration recommendation.\n\ndata=%s\n',
                  AlgorithmType.name(algorithm),
                  JSONUtil.dumps(target_data, pprint=True))
        return target_data_res

    # Load mapped workload data
    mapped_workload_id = target_data['mapped_workload'][0]

    latest_pipeline_run = PipelineRun.objects.get(
        pk=target_data['pipeline_run'])
    mapped_workload = Workload.objects.get(pk=mapped_workload_id)
    workload_knob_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.KNOB_DATA)
    workload_knob_data = JSONUtil.loads(workload_knob_data.data)
    workload_metric_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.METRIC_DATA)
    workload_metric_data = JSONUtil.loads(workload_metric_data.data)

    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    cleaned_workload_knob_data = clean_knob_data(
        workload_knob_data["data"], workload_knob_data["columnlabels"],
        newest_result.session)

    X_workload = np.array(cleaned_workload_knob_data[0])
    X_columnlabels = np.array(cleaned_workload_knob_data[1])
    y_workload = np.array(workload_metric_data['data'])
    y_columnlabels = np.array(workload_metric_data['columnlabels'])
    rowlabels_workload = np.array(workload_metric_data['rowlabels'])

    # Target workload data
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    X_target = target_data['X_matrix']
    y_target = target_data['y_matrix']
    rowlabels_target = np.array(target_data['rowlabels'])

    if not np.array_equal(X_columnlabels, target_data['X_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical X columnlabels (sorted knob names)'))
    if not np.array_equal(y_columnlabels, target_data['y_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical y columnlabels (sorted metric names)'))

    # Filter Xs by top 10 ranked knobs
    ranked_knobs = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.RANKED_KNOBS)
    ranked_knobs = JSONUtil.loads(ranked_knobs.data)[:IMPORTANT_KNOB_NUMBER]
    ranked_knob_idxs = [
        i for i, cl in enumerate(X_columnlabels) if cl in ranked_knobs
    ]
    X_workload = X_workload[:, ranked_knob_idxs]
    X_target = X_target[:, ranked_knob_idxs]
    X_columnlabels = X_columnlabels[ranked_knob_idxs]

    # Filter ys by current target objective metric
    target_objective = newest_result.session.target_objective
    target_obj_idx = [
        i for i, cl in enumerate(y_columnlabels) if cl == target_objective
    ]
    if len(target_obj_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(
            ('Found {} instances of target objective in '
             'metrics (target_obj={})').format(len(target_obj_idx),
                                               target_objective))

    metric_meta = db.target_objectives.get_metric_metadata(
        newest_result.session.dbms.pk, newest_result.session.target_objective)
    lessisbetter = metric_meta[
        target_objective].improvement == db.target_objectives.LESS_IS_BETTER

    y_workload = y_workload[:, target_obj_idx]
    y_target = y_target[:, target_obj_idx]
    y_columnlabels = y_columnlabels[target_obj_idx]

    # Combine duplicate rows in the target/workload data (separately)
    X_workload, y_workload, rowlabels_workload = DataUtil.combine_duplicate_rows(
        X_workload, y_workload, rowlabels_workload)
    X_target, y_target, rowlabels_target = DataUtil.combine_duplicate_rows(
        X_target, y_target, rowlabels_target)

    # Delete any rows that appear in both the workload data and the target
    # data from the workload data
    dups_filter = np.ones(X_workload.shape[0], dtype=bool)
    target_row_tups = [tuple(row) for row in X_target]
    for i, row in enumerate(X_workload):
        if tuple(row) in target_row_tups:
            dups_filter[i] = False
    X_workload = X_workload[dups_filter, :]
    y_workload = y_workload[dups_filter, :]
    rowlabels_workload = rowlabels_workload[dups_filter]

    # Combine target & workload Xs for preprocessing
    X_matrix = np.vstack([X_target, X_workload])

    # Dummy encode categorial variables
    categorical_info = DataUtil.dummy_encoder_helper(X_columnlabels,
                                                     mapped_workload.dbms)
    dummy_encoder = DummyEncoder(categorical_info['n_values'],
                                 categorical_info['categorical_features'],
                                 categorical_info['cat_columnlabels'],
                                 categorical_info['noncat_columnlabels'])
    X_matrix = dummy_encoder.fit_transform(X_matrix)

    # below two variables are needed for correctly determing max/min on dummies
    binary_index_set = set(categorical_info['binary_vars'])
    total_dummies = dummy_encoder.total_dummies()

    # Scale to N(0, 1)
    X_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(X_matrix)
    if y_target.shape[0] < 5:  # FIXME
        # FIXME (dva): if there are fewer than 5 target results so far
        # then scale the y values (metrics) using the workload's
        # y_scaler. I'm not sure if 5 is the right cutoff.
        y_target_scaler = None
        y_workload_scaler = StandardScaler()
        y_matrix = np.vstack([y_target, y_workload])
        y_scaled = y_workload_scaler.fit_transform(y_matrix)
    else:
        # FIXME (dva): otherwise try to compute a separate y_scaler for
        # the target and scale them separately.
        try:
            y_target_scaler = StandardScaler()
            y_workload_scaler = StandardScaler()
            y_target_scaled = y_target_scaler.fit_transform(y_target)
            y_workload_scaled = y_workload_scaler.fit_transform(y_workload)
            y_scaled = np.vstack([y_target_scaled, y_workload_scaled])
        except ValueError:
            y_target_scaler = None
            y_workload_scaler = StandardScaler()
            y_scaled = y_workload_scaler.fit_transform(y_target)

    # Set up constraint helper
    constraint_helper = ParamConstraintHelper(
        scaler=X_scaler,
        encoder=dummy_encoder,
        binary_vars=categorical_info['binary_vars'],
        init_flip_prob=INIT_FLIP_PROB,
        flip_prob_decay=FLIP_PROB_DECAY)

    # FIXME (dva): check if these are good values for the ridge
    # ridge = np.empty(X_scaled.shape[0])
    # ridge[:X_target.shape[0]] = 0.01
    # ridge[X_target.shape[0]:] = 0.1

    # FIXME: we should generate more samples and use a smarter sampling
    # technique
    num_samples = NUM_SAMPLES
    X_samples = np.empty((num_samples, X_scaled.shape[1]))
    X_min = np.empty(X_scaled.shape[1])
    X_max = np.empty(X_scaled.shape[1])
    X_scaler_matrix = np.zeros([1, X_scaled.shape[1]])

    session_knobs = SessionKnob.objects.get_knobs_for_session(
        newest_result.session)

    # Set min/max for knob values
    for i in range(X_scaled.shape[1]):
        if i < total_dummies or i in binary_index_set:
            col_min = 0
            col_max = 1
        else:
            col_min = X_scaled[:, i].min()
            col_max = X_scaled[:, i].max()
            for knob in session_knobs:
                if X_columnlabels[i] == knob["name"]:
                    X_scaler_matrix[0][i] = knob["minval"]
                    col_min = X_scaler.transform(X_scaler_matrix)[0][i]
                    X_scaler_matrix[0][i] = knob["maxval"]
                    col_max = X_scaler.transform(X_scaler_matrix)[0][i]
        X_min[i] = col_min
        X_max[i] = col_max
        X_samples[:, i] = np.random.rand(num_samples) * (col_max -
                                                         col_min) + col_min

    # Maximize the throughput, moreisbetter
    # Use gradient descent to minimize -throughput
    if not lessisbetter:
        y_scaled = -y_scaled

    q = queue.PriorityQueue()
    for x in range(0, y_scaled.shape[0]):
        q.put((y_scaled[x][0], x))

    i = 0
    while i < TOP_NUM_CONFIG:
        try:
            item = q.get_nowait()
            # Tensorflow get broken if we use the training data points as
            # starting points for GPRGD. We add a small bias for the
            # starting points. GPR_EPS default value is 0.001
            # if the starting point is X_max, we minus a small bias to
            # make sure it is within the range.
            dist = sum(np.square(X_max - X_scaled[item[1]]))
            if dist < 0.001:
                X_samples = np.vstack(
                    (X_samples, X_scaled[item[1]] - abs(GPR_EPS)))
            else:
                X_samples = np.vstack(
                    (X_samples, X_scaled[item[1]] + abs(GPR_EPS)))
            i = i + 1
        except queue.Empty:
            break

    session = newest_result.session
    res = None

    if algorithm == AlgorithmType.DNN:
        # neural network model
        model_nn = NeuralNet(n_input=X_samples.shape[1],
                             batch_size=X_samples.shape[0],
                             explore_iters=DNN_EXPLORE_ITER,
                             noise_scale_begin=DNN_NOISE_SCALE_BEGIN,
                             noise_scale_end=DNN_NOISE_SCALE_END,
                             debug=DNN_DEBUG,
                             debug_interval=DNN_DEBUG_INTERVAL)
        if session.dnn_model is not None:
            model_nn.set_weights_bin(session.dnn_model)
        model_nn.fit(X_scaled, y_scaled, fit_epochs=DNN_TRAIN_ITER)
        res = model_nn.recommend(X_samples,
                                 X_min,
                                 X_max,
                                 explore=DNN_EXPLORE,
                                 recommend_epochs=MAX_ITER)
        session.dnn_model = model_nn.get_weights_bin()
        session.save()

    elif algorithm == AlgorithmType.GPR:
        # default gpr model
        model = GPRGD(length_scale=DEFAULT_LENGTH_SCALE,
                      magnitude=DEFAULT_MAGNITUDE,
                      max_train_size=MAX_TRAIN_SIZE,
                      batch_size=BATCH_SIZE,
                      num_threads=NUM_THREADS,
                      learning_rate=DEFAULT_LEARNING_RATE,
                      epsilon=DEFAULT_EPSILON,
                      max_iter=MAX_ITER,
                      sigma_multiplier=DEFAULT_SIGMA_MULTIPLIER,
                      mu_multiplier=DEFAULT_MU_MULTIPLIER)
        model.fit(X_scaled, y_scaled, X_min, X_max, ridge=DEFAULT_RIDGE)
        res = model.predict(X_samples, constraint_helper=constraint_helper)

    best_config_idx = np.argmin(res.minl.ravel())
    best_config = res.minl_conf[best_config_idx, :]
    best_config = X_scaler.inverse_transform(best_config)
    # Decode one-hot encoding into categorical knobs
    best_config = dummy_encoder.inverse_transform(best_config)

    # Although we have max/min limits in the GPRGD training session, it may
    # lose some precisions. e.g. 0.99..99 >= 1.0 may be True on the scaled data,
    # when we inversely transform the scaled data, the different becomes much larger
    # and cannot be ignored. Here we check the range on the original data
    # directly, and make sure the recommended config lies within the range
    X_min_inv = X_scaler.inverse_transform(X_min)
    X_max_inv = X_scaler.inverse_transform(X_max)
    best_config = np.minimum(best_config, X_max_inv)
    best_config = np.maximum(best_config, X_min_inv)

    conf_map = {k: best_config[i] for i, k in enumerate(X_columnlabels)}
    conf_map_res = dict(status='good',
                        result_id=target_data['newest_result_id'],
                        recommendation=conf_map,
                        info='INFO: training data size is {}'.format(
                            X_scaled.shape[0]),
                        pipeline_run=latest_pipeline_run.pk)
    LOG.debug('%s: Finished selecting the next config.\n\ndata=%s\n',
              AlgorithmType.name(algorithm),
              JSONUtil.dumps(conf_map_res, pprint=True))

    return conf_map_res
Beispiel #34
0
def configuration_recommendation(target_data):
    LOG.info('configuration_recommendation called')
    latest_pipeline_run = PipelineRun.objects.get_latest()

    if target_data['bad'] is True:
        target_data_res = {}
        target_data_res['status'] = 'bad'
        target_data_res['info'] = 'WARNING: no training data, the config is generated randomly'
        target_data_res['recommendation'] = target_data['config_recommend']
        return target_data_res

    # Load mapped workload data
    mapped_workload_id = target_data['mapped_workload'][0]

    mapped_workload = Workload.objects.get(pk=mapped_workload_id)
    workload_knob_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.KNOB_DATA)
    workload_knob_data = JSONUtil.loads(workload_knob_data.data)
    workload_metric_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.METRIC_DATA)
    workload_metric_data = JSONUtil.loads(workload_metric_data.data)

    X_workload = np.array(workload_knob_data['data'])
    X_columnlabels = np.array(workload_knob_data['columnlabels'])
    y_workload = np.array(workload_metric_data['data'])
    y_columnlabels = np.array(workload_metric_data['columnlabels'])
    rowlabels_workload = np.array(workload_metric_data['rowlabels'])

    # Target workload data
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    X_target = target_data['X_matrix']
    y_target = target_data['y_matrix']
    rowlabels_target = np.array(target_data['rowlabels'])

    if not np.array_equal(X_columnlabels, target_data['X_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical X columnlabels (sorted knob names)'))
    if not np.array_equal(y_columnlabels, target_data['y_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical y columnlabels (sorted metric names)'))

    # Filter Xs by top 10 ranked knobs
    ranked_knobs = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.RANKED_KNOBS)
    ranked_knobs = JSONUtil.loads(ranked_knobs.data)[:10]  # FIXME
    ranked_knob_idxs = [i for i, cl in enumerate(X_columnlabels) if cl in ranked_knobs]
    X_workload = X_workload[:, ranked_knob_idxs]
    X_target = X_target[:, ranked_knob_idxs]
    X_columnlabels = X_columnlabels[ranked_knob_idxs]

    # Filter ys by current target objective metric
    target_objective = newest_result.session.target_objective
    target_obj_idx = [i for i, cl in enumerate(y_columnlabels) if cl == target_objective]
    if len(target_obj_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(('Found {} instances of target objective in '
                         'metrics (target_obj={})').format(len(target_obj_idx),
                                                           target_objective))
    y_workload = y_workload[:, target_obj_idx]
    y_target = y_target[:, target_obj_idx]
    y_columnlabels = y_columnlabels[target_obj_idx]

    # Combine duplicate rows in the target/workload data (separately)
    X_workload, y_workload, rowlabels_workload = DataUtil.combine_duplicate_rows(
        X_workload, y_workload, rowlabels_workload)
    X_target, y_target, rowlabels_target = DataUtil.combine_duplicate_rows(
        X_target, y_target, rowlabels_target)

    # Delete any rows that appear in both the workload data and the target
    # data from the workload data
    dups_filter = np.ones(X_workload.shape[0], dtype=bool)
    target_row_tups = [tuple(row) for row in X_target]
    for i, row in enumerate(X_workload):
        if tuple(row) in target_row_tups:
            dups_filter[i] = False
    X_workload = X_workload[dups_filter, :]
    y_workload = y_workload[dups_filter, :]
    rowlabels_workload = rowlabels_workload[dups_filter]

    # Combine target & workload Xs then scale
    X_matrix = np.vstack([X_target, X_workload])
    X_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(X_matrix)
    if y_target.shape[0] < 5:  # FIXME
        # FIXME (dva): if there are fewer than 5 target results so far
        # then scale the y values (metrics) using the workload's
        # y_scaler. I'm not sure if 5 is the right cutoff.
        y_target_scaler = None
        y_workload_scaler = StandardScaler()
        y_matrix = np.vstack([y_target, y_workload])
        y_scaled = y_workload_scaler.fit_transform(y_matrix)
    else:
        # FIXME (dva): otherwise try to compute a separate y_scaler for
        # the target and scale them separately.
        try:
            y_target_scaler = StandardScaler()
            y_workload_scaler = StandardScaler()
            y_target_scaled = y_target_scaler.fit_transform(y_target)
            y_workload_scaled = y_workload_scaler.fit_transform(y_workload)
            y_scaled = np.vstack([y_target_scaled, y_workload_scaled])
        except ValueError:
            y_target_scaler = None
            y_workload_scaler = StandardScaler()
            y_scaled = y_workload_scaler.fit_transform(y_target)

    # FIXME (dva): check if these are good values for the ridge
    ridge = np.empty(X_scaled.shape[0])
    ridge[:X_target.shape[0]] = 0.01
    ridge[X_target.shape[0]:] = 0.1

    # FIXME: we should generate more samples and use a smarter sampling
    # technique
    num_samples = 20
    X_samples = np.empty((num_samples, X_scaled.shape[1]))
    X_min = np.empty(X_scaled.shape[1])
    X_max = np.empty(X_scaled.shape[1])
    for i in range(X_scaled.shape[1]):
        col_min = X_scaled[:, i].min()
        col_max = X_scaled[:, i].max()
        X_min[i] = col_min
        X_max[i] = col_max
        X_samples[:, i] = np.random.rand(
            num_samples) * (col_max - col_min) + col_min

    # FIXME: Maximize the throughput, hardcode
    # Use gradient descent to minimize -throughput
    y_scaled = -y_scaled

    model = GPRGD()
    model.fit(X_scaled, y_scaled, X_min, X_max, ridge)
    res = model.predict(X_samples)

    # FIXME: whether we select the min/max for the best config depends
    # on the target objective
    best_config_idx = np.argmin(res.minl.ravel())
    best_config = res.minl_conf[best_config_idx, :]
    best_config = X_scaler.inverse_transform(best_config)

    conf_map = {k: best_config[i] for i, k in enumerate(X_columnlabels)}
    conf_map_res = {}
    conf_map_res['status'] = 'good'
    conf_map_res['recommendation'] = conf_map
    conf_map_res['info'] = 'INFO: training data size is {}'.format(X_scaled.shape[0])
    return conf_map_res
Beispiel #35
0
def run_background_tasks():
    start_ts = time.time()
    LOG.info("Starting background tasks...")
    # Find modified and not modified workloads, we only have to calculate for the
    # modified workloads.
    modified_workloads = Workload.objects.filter(
        status=WorkloadStatusType.MODIFIED)
    num_modified = modified_workloads.count()
    non_modified_workloads = Workload.objects.filter(
        status=WorkloadStatusType.PROCESSED)
    non_modified_workloads = list(
        non_modified_workloads.values_list('pk', flat=True))
    last_pipeline_run = PipelineRun.objects.get_latest()
    LOG.debug("Workloads: # modified: %s, # processed: %s, # total: %s",
              num_modified, len(non_modified_workloads),
              Workload.objects.all().count())

    if num_modified == 0:
        # No previous workload data yet. Try again later.
        LOG.info("No modified workload data yet. Ending background tasks.")
        return

    # Create new entry in PipelineRun table to store the output of each of
    # the background tasks
    pipeline_run_obj = PipelineRun(start_time=now(), end_time=None)
    pipeline_run_obj.save()

    for i, workload in enumerate(modified_workloads):
        workload.status = WorkloadStatusType.PROCESSING
        workload.save()
        wkld_results = Result.objects.filter(workload=workload)
        num_wkld_results = wkld_results.count()
        workload_name = '{}@{}.{}'.format(workload.dbms.key,
                                          workload.project.name, workload.name)

        LOG.info("Starting workload %s (%s/%s, # results: %s)...",
                 workload_name, i + 1, num_modified, num_wkld_results)

        if num_wkld_results == 0:
            # delete the workload
            LOG.info("Deleting workload %s because it has no results.",
                     workload_name)
            workload.delete()
            continue

        if num_wkld_results < MIN_WORKLOAD_RESULTS_COUNT:
            # Check that there are enough results in the workload
            LOG.info(
                "Not enough results in workload %s (# results: %s, # required: %s).",
                workload_name, num_wkld_results, MIN_WORKLOAD_RESULTS_COUNT)
            workload.status = WorkloadStatusType.PROCESSED
            workload.save()
            continue

        LOG.info("Aggregating data for workload %s...", workload_name)
        # Aggregate the knob & metric data for this workload
        knob_data, metric_data = aggregate_data(wkld_results)
        LOG.debug(
            "Aggregated knob data: rowlabels=%s, columnlabels=%s, data=%s.",
            len(knob_data['rowlabels']), len(knob_data['columnlabels']),
            knob_data['data'].shape)
        LOG.debug(
            "Aggregated metric data: rowlabels=%s, columnlabels=%s, data=%s.",
            len(metric_data['rowlabels']), len(metric_data['columnlabels']),
            metric_data['data'].shape)
        LOG.info("Done aggregating data for workload %s.", workload_name)

        num_valid_results = knob_data['data'].shape[0]  # pylint: disable=unsubscriptable-object
        if num_valid_results < MIN_WORKLOAD_RESULTS_COUNT:
            # Check that there are enough valid results in the workload
            LOG.info(
                "Not enough valid results in workload %s (# valid results: "
                "%s, # required: %s).", workload_name, num_valid_results,
                MIN_WORKLOAD_RESULTS_COUNT)
            workload.status = WorkloadStatusType.PROCESSED
            workload.save()
            continue

        # Knob_data and metric_data are 2D numpy arrays. Convert them into a
        # JSON-friendly (nested) lists and then save them as new PipelineData
        # objects.
        knob_data_copy = copy.deepcopy(knob_data)
        knob_data_copy['data'] = knob_data_copy['data'].tolist()
        knob_data_copy = JSONUtil.dumps(knob_data_copy)
        knob_entry = PipelineData(pipeline_run=pipeline_run_obj,
                                  task_type=PipelineTaskType.KNOB_DATA,
                                  workload=workload,
                                  data=knob_data_copy,
                                  creation_time=now())
        knob_entry.save()

        metric_data_copy = copy.deepcopy(metric_data)
        metric_data_copy['data'] = metric_data_copy['data'].tolist()
        metric_data_copy = JSONUtil.dumps(metric_data_copy)
        metric_entry = PipelineData(pipeline_run=pipeline_run_obj,
                                    task_type=PipelineTaskType.METRIC_DATA,
                                    workload=workload,
                                    data=metric_data_copy,
                                    creation_time=now())
        metric_entry.save()

        # Execute the Workload Characterization task to compute the list of
        # pruned metrics for this workload and save them in a new PipelineData
        # object.
        LOG.info("Pruning metrics for workload %s...", workload_name)
        pruned_metrics = run_workload_characterization(metric_data=metric_data,
                                                       dbms=workload.dbms)
        LOG.info(
            "Done pruning metrics for workload %s (# pruned metrics: %s).\n\n"
            "Pruned metrics: %s\n", workload_name, len(pruned_metrics),
            pruned_metrics)
        pruned_metrics_entry = PipelineData(
            pipeline_run=pipeline_run_obj,
            task_type=PipelineTaskType.PRUNED_METRICS,
            workload=workload,
            data=JSONUtil.dumps(pruned_metrics),
            creation_time=now())
        pruned_metrics_entry.save()

        # Workload target objective data
        ranked_knob_metrics = sorted(
            wkld_results.distinct('session').values_list(
                'session__target_objective', flat=True).distinct())
        LOG.debug("Target objectives for workload %s: %s", workload_name,
                  ', '.join(ranked_knob_metrics))

        if KNOB_IDENT_USE_PRUNED_METRICS:
            ranked_knob_metrics = sorted(
                set(ranked_knob_metrics) + set(pruned_metrics))

        # Use the set of metrics to filter the metric_data
        metric_idxs = [
            i for i, metric_name in enumerate(metric_data['columnlabels'])
            if metric_name in ranked_knob_metrics
        ]
        ranked_metric_data = {
            'data': metric_data['data'][:, metric_idxs],
            'rowlabels': copy.deepcopy(metric_data['rowlabels']),
            'columnlabels':
            [metric_data['columnlabels'][i] for i in metric_idxs]
        }

        # Execute the Knob Identification task to compute an ordered list of knobs
        # ranked by their impact on the DBMS's performance. Save them in a new
        # PipelineData object.
        LOG.info(
            "Ranking knobs for workload %s (use pruned metric data: %s)...",
            workload_name, KNOB_IDENT_USE_PRUNED_METRICS)
        sessions = []
        for result in wkld_results:
            if result.session not in sessions:
                sessions.append(result.session)
        rank_knob_data = copy.deepcopy(knob_data)
        rank_knob_data['data'], rank_knob_data['columnlabels'] =\
            DataUtil.clean_knob_data(knob_data['data'], knob_data['columnlabels'], sessions)
        ranked_knobs = run_knob_identification(knob_data=rank_knob_data,
                                               metric_data=ranked_metric_data,
                                               dbms=workload.dbms)
        LOG.info(
            "Done ranking knobs for workload %s (# ranked knobs: %s).\n\n"
            "Ranked knobs: %s\n", workload_name, len(ranked_knobs),
            ranked_knobs)
        ranked_knobs_entry = PipelineData(
            pipeline_run=pipeline_run_obj,
            task_type=PipelineTaskType.RANKED_KNOBS,
            workload=workload,
            data=JSONUtil.dumps(ranked_knobs),
            creation_time=now())
        ranked_knobs_entry.save()

        workload.status = WorkloadStatusType.PROCESSED
        workload.save()
        LOG.info("Done processing workload %s (%s/%s).", workload_name, i + 1,
                 num_modified)

    LOG.info("Finished processing %s modified workloads.", num_modified)

    non_modified_workloads = Workload.objects.filter(
        pk__in=non_modified_workloads)
    # Update the latest pipeline data for the non modified workloads to have this pipeline run
    PipelineData.objects.filter(workload__in=non_modified_workloads,
                                pipeline_run=last_pipeline_run)\
        .update(pipeline_run=pipeline_run_obj)

    # Set the end_timestamp to the current time to indicate that we are done running
    # the background tasks
    pipeline_run_obj.end_time = now()
    pipeline_run_obj.save()
    save_execution_time(start_ts, "run_background_tasks")
    LOG.info("Finished background tasks (%.0f seconds).",
             time.time() - start_ts)