Example #1
    def _record_results(self, x, y, y_pred, label):
        """Record the prediction results

        :param x: the input data
        :param y: the actual output
        :param y_pred: the predicted output
        :param label: the result label ("resource" or "impact")
        """
        # Result files
        metrics_path = "{}/global_{}_model_metrics.csv".format(
            self.model_results_path, label)
        prediction_path = "{}/global_{}_model_prediction.csv".format(
            self.model_results_path, label)
        result_writing_util.create_metrics_and_prediction_files(
            metrics_path, prediction_path)

        # Log the prediction results
        ratio_error = np.average(np.abs(y - y_pred) / (y + 1e-6), axis=0)
        io_util.write_csv_result(metrics_path, "Ratio Error", ratio_error)
        result_writing_util.record_predictions((x, y_pred, y), prediction_path)

        # Print the error summary to the command line
        if label == "resource":
            original_ratio_error = np.average(np.abs(y - x[:, :y.shape[1]]) /
                                              (y + 1e-6),
                                              axis=0)
        else:
            original_ratio_error = np.average(np.abs(1 / (y + 1e-6) - 1),
                                              axis=0)
        logging.info('Model Original Ratio Error ({}): {}'.format(
            label, original_ratio_error))
        logging.info('Model Ratio Error ({}): {}'.format(label, ratio_error))
        logging.info('')
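
The ratio error logged above is the per-target mean absolute relative error, with a small epsilon guarding against zero labels. A minimal, self-contained sketch of that computation (toy values, only numpy assumed):

import numpy as np

# Toy actual/predicted outputs: 3 samples, 2 target columns (illustrative values only)
y = np.array([[10.0, 0.0], [20.0, 5.0], [30.0, 10.0]])
y_pred = np.array([[12.0, 0.5], [18.0, 4.0], [33.0, 9.0]])

# Per-target mean absolute relative error; the 1e-6 term avoids division by zero
# (the zero label in the first row yields a huge but finite error instead of a warning)
ratio_error = np.average(np.abs(y - y_pred) / (y + 1e-6), axis=0)
print(ratio_error)  # one error value per target column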
Example #2
def _global_model_training_process(x, y, methods, test_ratio, metrics_path, prediction_path):
    """Training process for the global models

    :param x: input feature
    :param y: labels
    :param methods: ML models to enumerate
    :param test_ratio: train-test split ratio
    :param metrics_path: to store the prediction metrics
    :param prediction_path: to store the raw prediction results
    :return: (the best model, the indices for the test data for additional metric calculation)
    """
    global_model = None
    result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path, False)
    n_samples = x.shape[0]
    indices = np.arange(n_samples)

    x_train, x_test, y_train, y_test, indices_train, indices_test = model_selection.train_test_split(
        x, y, indices, test_size=test_ratio, random_state=0)

    min_percentage_error = 1
    pred_results = None
    elapsed_us_index = data_info.instance.target_csv_index[Target.ELAPSED_US]

    for method in methods:
        # Train the model
        logging.info("Training the global model with {}".format(method))
        regressor = model.Model(method)
        regressor.train(x_train, y_train)

        # Evaluate on both the training and test set
        results = []
        evaluate_data = [(x_train, y_train), (x_test, y_test)]
        train_test_label = ["Train", "Test"]
        for i, d in enumerate(evaluate_data):
            evaluate_x = d[0]
            evaluate_y = d[1]

            y_pred = regressor.predict(evaluate_x)
            logging.debug("x shape: {}".format(evaluate_x.shape))
            logging.debug("y shape: {}".format(y_pred.shape))
            percentage_error = np.average(np.abs(evaluate_y - y_pred) / (evaluate_y + 1), axis=0)
            results += list(percentage_error) + [""]

            logging.info('{} Ratio Error: {}'.format(train_test_label[i], percentage_error))

            # Record the model with the lowest elapsed time prediction error (since that might be
            # the most important prediction)
            if i == 1 and percentage_error[elapsed_us_index] < min_percentage_error:
                min_percentage_error = percentage_error[elapsed_us_index]
                global_model = regressor
                pred_results = (evaluate_x, y_pred, evaluate_y)

        io_util.write_csv_result(metrics_path, method, results)

        logging.info("")

    # Record the best prediction results on the test data
    result_writing_util.record_predictions(pred_results, prediction_path)

    return global_model, indices_test
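
A minimal sketch of the index-tracking split used above, with made-up arrays; passing indices through model_selection.train_test_split alongside x and y is what lets the function return indices_test for later metric calculation:

import numpy as np
from sklearn import model_selection

# Made-up feature/label arrays just to illustrate the split
x = np.random.rand(10, 3)
y = np.random.rand(10, 2)
indices = np.arange(x.shape[0])

# Splitting the index array together with x and y records which original rows
# ended up in the held-out test set
x_train, x_test, y_train, y_test, indices_train, indices_test = model_selection.train_test_split(
    x, y, indices, test_size=0.2, random_state=0)
print(indices_test)  # original row numbers of the test samples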
Example #3
    def _record_results(self, x, y, y_pred, raw_y, mini_model_y_pred, label):
        """Record the prediction results

        :param x: the input data
        :param y: the actual output
        :param y_pred: the predicted output
        :param raw_y: the actual raw (untransformed) output
        :param mini_model_y_pred: the mini-model predictions for the raw output
        :param label: the result label ("resource" or "impact")
        """
        # Result files
        metrics_path = "{}/global_{}_model_metrics.csv".format(self.model_results_path, label)
        prediction_path = "{}/global_{}_model_prediction.csv".format(self.model_results_path, label)
        result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path, True)

        # Log the prediction results
        ratio_error = np.average(np.abs(y - y_pred) / (y + 1e-6), axis=0)
        io_util.write_csv_result(metrics_path, "Model Ratio Error", ratio_error)
        result_writing_util.record_predictions((x, y_pred, y), prediction_path)

        # Print the error summary to the command line
        if label == "resource":
            original_ratio_error = np.average(np.abs(y - x[:, :y.shape[1]]) / (y + 1e-6), axis=0)
        else:
            original_ratio_error = np.average(np.abs(1 / (y + 1e-6) - 1), axis=0)
        logging.info('Model Original Ratio Error ({}): {}'.format(label, original_ratio_error))
        logging.info('Model Ratio Error ({}): {}'.format(label, ratio_error))
        logging.info('')

        if label != "resource":
            # Calculate the accumulated ratio error
            epsilon = global_model_config.RATIO_DIVISION_EPSILON
            mini_model_y_pred = np.array(mini_model_y_pred)
            raw_y = np.array(raw_y)
            raw_y_pred = (mini_model_y_pred + epsilon) * y_pred
            accumulated_raw_y = np.sum(raw_y, axis=0)
            accumulated_raw_y_pred = np.sum(raw_y_pred, axis=0)
            original_ratio_error = np.average(np.abs(raw_y - mini_model_y_pred) / (raw_y + epsilon), axis=0)
            ratio_error = np.average(np.abs(raw_y - raw_y_pred) / (raw_y + epsilon), axis=0)
            accumulated_percentage_error = np.abs(accumulated_raw_y - accumulated_raw_y_pred) / (
                        accumulated_raw_y + epsilon)
            original_accumulated_percentage_error = np.abs(accumulated_raw_y - np.sum(mini_model_y_pred, axis=0)) / (
                    accumulated_raw_y + epsilon)

            logging.info('Original Ratio Error: {}'.format(original_ratio_error))
            io_util.write_csv_result(metrics_path, "Original Ratio Error", original_ratio_error)
            logging.info('Ratio Error: {}'.format(ratio_error))
            io_util.write_csv_result(metrics_path, "Ratio Error", ratio_error)
            logging.info('Original Accumulated Ratio Error: {}'.format(original_accumulated_percentage_error))
            io_util.write_csv_result(metrics_path, "Original Accumulated Ratio Error",
                                     original_accumulated_percentage_error)
            logging.info('Accumulated Ratio Error: {}'.format(accumulated_percentage_error))
            io_util.write_csv_result(metrics_path, "Accumulated Ratio Error", accumulated_percentage_error)
            logging.info('Accumulated Actual: {}'.format(accumulated_raw_y))
            logging.info('Original Accumulated Predict: {}'.format(np.sum(mini_model_y_pred, axis=0)))
            logging.info('Accumulated Predict: {}'.format(accumulated_raw_y_pred))
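
The accumulated-error block above compares per-sample ratio errors with the error on totals summed over all samples. A self-contained sketch with made-up values (epsilon stands in for global_model_config.RATIO_DIVISION_EPSILON):

import numpy as np

epsilon = 1e-6  # stand-in for global_model_config.RATIO_DIVISION_EPSILON

# Made-up raw labels, mini-model predictions, and predicted correction ratios
raw_y = np.array([[100.0, 10.0], [200.0, 20.0]])
mini_model_y_pred = np.array([[90.0, 12.0], [210.0, 18.0]])
y_pred = np.array([[1.1, 0.8], [0.95, 1.05]])

# The global model predicts a correction ratio, so the raw prediction is the
# mini-model output scaled by that ratio
raw_y_pred = (mini_model_y_pred + epsilon) * y_pred

# Per-sample ratio error vs. error on the per-target accumulated totals
ratio_error = np.average(np.abs(raw_y - raw_y_pred) / (raw_y + epsilon), axis=0)
accumulated_error = (np.abs(np.sum(raw_y, axis=0) - np.sum(raw_y_pred, axis=0)) /
                     (np.sum(raw_y, axis=0) + epsilon))
print(ratio_error, accumulated_error)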
Example #4
    def _train_data(self, data, summary_file):
        x_train, x_test, y_train, y_test = model_selection.train_test_split(
            data.x, data.y, test_size=self.test_ratio, random_state=0)

        # Write the first header row to the result file
        metrics_path = "{}/{}.csv".format(self.model_metrics_path,
                                          data.opunit.name.lower())
        prediction_path = "{}/{}_prediction.csv".format(
            self.model_metrics_path, data.opunit.name.lower())
        result_writing_util.create_metrics_and_prediction_files(
            metrics_path, prediction_path, False)

        methods = self.ml_models

        # Test the prediction with/without the target transformer
        y_transformers = [
            None, data_transforming_util.OPUNIT_Y_TRANSFORMER_MAP[data.opunit]
        ]
        # modeling_transformer = data_transforming_util.OPUNIT_MODELING_TRANSFORMER_MAP[data.opunit]
        # if modeling_transformer is not None:
        #    transformers.append(modeling_transformer)
        x_transformer = data_transforming_util.OPUNIT_X_TRANSFORMER_MAP[
            data.opunit]

        error_bias = 1
        min_percentage_error = 2
        pred_results = None
        elapsed_us_index = data_info.TARGET_CSV_INDEX[Target.ELAPSED_US]
        memory_b_index = data_info.TARGET_CSV_INDEX[Target.MEMORY_B]

        best_y_transformer = -1
        best_method = -1
        for i, y_transformer in enumerate(y_transformers):
            for m, method in enumerate(methods):
                # Train the model
                label = method if i == 0 else method + " transform"
                logging.info("{} {}".format(data.opunit.name, label))
                regressor = model.Model(method,
                                        y_transformer=y_transformer,
                                        x_transformer=x_transformer)
                regressor.train(x_train, y_train)

                # Evaluate on both the training and test set
                results = []
                evaluate_data = [(x_train, y_train), (x_test, y_test)]
                train_test_label = ["Train", "Test"]
                for j, d in enumerate(evaluate_data):
                    evaluate_x = d[0]
                    evaluate_y = d[1]

                    y_pred = regressor.predict(evaluate_x)
                    logging.debug("x shape: {}".format(evaluate_x.shape))
                    logging.debug("y shape: {}".format(y_pred.shape))
                    # To keep the percentage error from exploding when the actual label is very small,
                    # we effectively omit data points with an actual label <= 5 from the percentage
                    # error calculation by giving them a very small weight
                    evaluate_threshold = 5
                    weights = np.where(evaluate_y > evaluate_threshold,
                                       np.ones(evaluate_y.shape),
                                       np.full(evaluate_y.shape, 1e-6))
                    percentage_error = np.average(np.abs(evaluate_y - y_pred) /
                                                  (evaluate_y + error_bias),
                                                  axis=0,
                                                  weights=weights)
                    results += list(percentage_error) + [""]

                    logging.info('{} Percentage Error: {}'.format(
                        train_test_label[j], percentage_error))

                    # By default, a model is judged better if it has a lower error on elapsed_us.
                    # For any opunits in MEM_EVALUATE_OPUNITS, we compare the model error on
                    # memory_b instead.
                    eval_error = percentage_error[elapsed_us_index]
                    if data.opunit in data_info.MEM_EVALUATE_OPUNITS:
                        eval_error = percentage_error[memory_b_index]

                    # Record the model with the lowest elapsed time prediction error (since that might be
                    # the most important prediction)
                    # Only use linear regression for the arithmetic operating units
                    if (j == 1 and eval_error < min_percentage_error
                            and y_transformer == y_transformers[-1] and
                        (data.opunit not in data_info.ARITHMETIC_OPUNITS
                         or method == 'lr')):
                        min_percentage_error = eval_error
                        if self.expose_all:
                            best_y_transformer = i
                            best_method = m
                        else:
                            self.model_map[data.opunit] = regressor
                        pred_results = (evaluate_x, y_pred, evaluate_y)

                    if j == 1:
                        io_util.write_csv_result(summary_file,
                                                 data.opunit.name, [label] +
                                                 list(percentage_error))

                # Dump the prediction results
                io_util.write_csv_result(metrics_path, label, results)

                logging.info("")

            io_util.write_csv_result(metrics_path, "", [])

        # Record the best prediction results on the test data
        result_writing_util.record_predictions(pred_results, prediction_path)
        return best_y_transformer, best_method
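
The weighted average above effectively drops samples with tiny labels from the error metric instead of filtering them out explicitly. A minimal sketch of just that step, with made-up label values:

import numpy as np

# Made-up actual/predicted labels; the last row's actual label is below the threshold
evaluate_y = np.array([[100.0], [50.0], [4.0]])
y_pred = np.array([[90.0], [55.0], [40.0]])

evaluate_threshold = 5
error_bias = 1

# Rows with tiny actual labels get a near-zero weight, so the wild relative error
# on the 4.0 -> 40.0 row barely moves the weighted average
weights = np.where(evaluate_y > evaluate_threshold,
                   np.ones(evaluate_y.shape),
                   np.full(evaluate_y.shape, 1e-6))
percentage_error = np.average(np.abs(evaluate_y - y_pred) / (evaluate_y + error_bias),
                              axis=0, weights=weights)
print(percentage_error)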
Example #5
    def _record_results(self, x, y, y_pred, raw_y, mini_model_y_pred, label,
                        data_list):
        """Record the prediction results

        :param x: the input data
        :param y: the actual output
        :param y_pred: the predicted output
        :param raw_y: the actual raw (untransformed) output
        :param mini_model_y_pred: the mini-model predictions for the raw output
        :param label: the result label ("resource" or "impact")
        :param data_list: the grouped opunit data used for the per-pipeline results
        """
        # Result files
        metrics_path = "{}/global_{}_model_metrics.csv".format(
            self.model_results_path, label)
        prediction_path = "{}/global_{}_model_prediction.csv".format(
            self.model_results_path, label)
        result_writing_util.create_metrics_and_prediction_files(
            metrics_path, prediction_path, True)

        # Log the prediction results
        ratio_error = np.average(np.abs(y - y_pred) / (y + 1e-6), axis=0)
        io_util.write_csv_result(metrics_path, "Model Ratio Error",
                                 ratio_error)
        result_writing_util.record_predictions((x, y_pred, y), prediction_path)

        # Print the error summary to the command line
        if label == "resource":
            avg_original_ratio_error = np.average(
                np.abs(y - x[:, :y.shape[1]]) / (y + 1e-6), axis=0)
        else:
            avg_original_ratio_error = np.average(np.abs(1 / (y + 1e-6) - 1),
                                                  axis=0)
        logging.info('Model Original Ratio Error ({}): {}'.format(
            label, avg_original_ratio_error))
        logging.info('Model Ratio Error ({}): {}'.format(label, ratio_error))
        logging.info('')

        if label != "resource":
            # Calculate the accumulated ratio error
            epsilon = global_model_config.RATIO_DIVISION_EPSILON
            mini_model_y_pred = np.array(mini_model_y_pred)
            raw_y = np.array(raw_y)
            raw_y_pred = (mini_model_y_pred + epsilon) * y_pred
            accumulated_raw_y = np.sum(raw_y, axis=0)
            accumulated_raw_y_pred = np.sum(raw_y_pred, axis=0)
            original_ratio_error = np.abs(raw_y - mini_model_y_pred) / (
                raw_y + epsilon)
            avg_original_ratio_error = np.average(original_ratio_error, axis=0)
            ratio_error = np.abs(raw_y - raw_y_pred) / (raw_y + epsilon)
            avg_ratio_error = np.average(ratio_error, axis=0)
            accumulated_percentage_error = np.abs(
                accumulated_raw_y -
                accumulated_raw_y_pred) / (accumulated_raw_y + epsilon)
            original_accumulated_percentage_error = np.abs(
                accumulated_raw_y - np.sum(mini_model_y_pred, axis=0)) / (
                    accumulated_raw_y + epsilon)

            logging.info(
                'Original Ratio Error: {}'.format(avg_original_ratio_error))
            io_util.write_csv_result(metrics_path, "Original Ratio Error",
                                     avg_original_ratio_error)
            logging.info('Ratio Error: {}'.format(avg_ratio_error))
            io_util.write_csv_result(metrics_path, "Ratio Error",
                                     avg_ratio_error)
            logging.info('Original Accumulated Ratio Error: {}'.format(
                original_accumulated_percentage_error))
            io_util.write_csv_result(metrics_path,
                                     "Original Accumulated Ratio Error",
                                     original_accumulated_percentage_error)
            logging.info('Accumulated Ratio Error: {}'.format(
                accumulated_percentage_error))
            io_util.write_csv_result(metrics_path, "Accumulated Ratio Error",
                                     accumulated_percentage_error)
            logging.info('Accumulated Actual: {}'.format(accumulated_raw_y))
            logging.info('Original Accumulated Predict: {}'.format(
                np.sum(mini_model_y_pred, axis=0)))
            logging.info(
                'Accumulated Predict: {}'.format(accumulated_raw_y_pred))

            if label == 'direct':
                prediction_path = "{}/grouped_opunit_prediction.csv".format(
                    self.model_results_path)
                io_util.create_csv_file(prediction_path, [
                    "Pipeline", "", "Actual", "", "Predicted", "",
                    "Ratio Error"
                ])
                for i, data in enumerate(data_list):
                    io_util.write_csv_result(prediction_path, data.name,
                                             [""] + list(raw_y[i]) + [""] +
                                             list(raw_y_pred[i]) + [""] +
                                             list(ratio_error[i]))

                average_result_path = "{}/interval_average_prediction.csv".format(
                    self.model_results_path)
                io_util.create_csv_file(
                    average_result_path,
                    ["Timestamp", "Actual Average", "Predicted Average"])

                interval_y_map = {}
                interval_y_pred_map = {}
                mark_list = None
                #mark_list = _generate_mark_list(data_list)
                for i, data in enumerate(data_list):
                    # Don't count the create index OU
                    # TODO(lin): needs a better way to evaluate... maybe add an id_query field to GroupedOpunitData
                    if data.concurrency > 0:
                        continue
                    if mark_list is not None and not mark_list[i]:
                        continue
                    interval_time = _round_to_interval(
                        data.start_time,
                        global_model_config.AVERAGING_INTERVAL)
                    if interval_time not in interval_y_map:
                        interval_y_map[interval_time] = []
                        interval_y_pred_map[interval_time] = []
                    interval_y_map[interval_time].append(raw_y[i][-5])
                    interval_y_pred_map[interval_time].append(
                        raw_y_pred[i][-5])

                for time in sorted(interval_y_map.keys()):
                    if mark_list is None:
                        io_util.write_csv_result(average_result_path, time, [
                            np.average(interval_y_map[time]),
                            np.average(interval_y_pred_map[time])
                        ])
                    else:
                        io_util.write_csv_result(average_result_path, time, [
                            np.sum(interval_y_map[time]),
                            np.sum(interval_y_pred_map[time])
                        ])
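
The interval-averaging loop above buckets each pipeline by its start time and then compares the actual and predicted averages per bucket. A self-contained sketch of the bucketing, where round_to_interval is a hypothetical stand-in for _round_to_interval and the timestamps are made up:

import numpy as np

AVERAGING_INTERVAL = 1000  # made-up interval width

def round_to_interval(timestamp, interval):
    # Hypothetical stand-in for _round_to_interval: snap a timestamp to the
    # start of its interval
    return timestamp - timestamp % interval

# Made-up (start_time, actual, predicted) samples
samples = [(1010, 5.0, 6.0), (1990, 7.0, 6.5), (2500, 3.0, 2.0)]

interval_y_map, interval_y_pred_map = {}, {}
for start_time, actual, predicted in samples:
    t = round_to_interval(start_time, AVERAGING_INTERVAL)
    interval_y_map.setdefault(t, []).append(actual)
    interval_y_pred_map.setdefault(t, []).append(predicted)

for t in sorted(interval_y_map):
    print(t, np.average(interval_y_map[t]), np.average(interval_y_pred_map[t]))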
Example #6
    def _train_data(self, data, summary_file):
        x_train, x_test, y_train, y_test = model_selection.train_test_split(
            data.x, data.y, test_size=self.test_ratio, random_state=0)

        # Write the first header row to the result file
        metrics_path = "{}/{}.csv".format(self.model_metrics_path,
                                          data.opunit.name.lower())
        prediction_path = "{}/{}_prediction.csv".format(
            self.model_metrics_path, data.opunit.name.lower())
        result_writing_util.create_metrics_and_prediction_files(
            metrics_path, prediction_path)

        methods = self.ml_models

        # Test the prediction with/without the target transformer
        transformers = [
            None,
            data_transforming_util.OPUNIT_MODELING_TRANSFORMER_MAP[data.opunit]
        ]
        # modeling_transformer = data_transforming_util.OPUNIT_MODELING_TRANSFORMER_MAP[data.opunit]
        # if modeling_transformer is not None:
        #    transformers.append(modeling_transformer)

        error_bias = 1
        min_percentage_error = 2
        pred_results = None
        elapsed_us_index = data_info.TARGET_CSV_INDEX[Target.ELAPSED_US]

        for i, transformer in enumerate(transformers):
            for method in methods:
                # Train the model
                label = method if i == 0 else method + " transform"
                logging.info("{} {}".format(data.opunit.name, label))
                regressor = model.Model(method,
                                        modeling_transformer=transformer)
                regressor.train(x_train, y_train)

                # Evaluate on both the training and test set
                results = []
                evaluate_data = [(x_train, y_train), (x_test, y_test)]
                train_test_label = ["Train", "Test"]
                for j, d in enumerate(evaluate_data):
                    evaluate_x = d[0]
                    evaluate_y = d[1]

                    for x, y in zip(evaluate_x, evaluate_y):
                        stat_vec = [data.opunit]
                        stat_vec.extend(x)
                        self.stats_map[tuple(stat_vec)] = y

                    y_pred = regressor.predict(evaluate_x)
                    logging.debug("x shape: {}".format(evaluate_x.shape))
                    logging.debug("y shape: {}".format(y_pred.shape))
                    percentage_error = np.average(
                        np.abs(evaluate_y - y_pred) /
                        (evaluate_y + 1 + error_bias),
                        axis=0)
                    results += list(percentage_error) + [""]

                    logging.info('{} Percentage Error: {}'.format(
                        train_test_label[j], percentage_error))

                    # Record the model with the lowest elapsed time prediction error (since that might be
                    # the most important prediction)
                    # Only use linear regression for the arithmetic operating units
                    if (j == 1 and percentage_error[elapsed_us_index] <
                            min_percentage_error
                            and transformer == transformers[-1] and
                        (data.opunit not in data_info.ARITHMETIC_OPUNITS
                         or method == 'lr')):
                        min_percentage_error = percentage_error[
                            elapsed_us_index]
                        self.model_map[data.opunit] = regressor
                        pred_results = (evaluate_x, y_pred, evaluate_y)

                    if j == 1:
                        io_util.write_csv_result(summary_file,
                                                 data.opunit.name, [label] +
                                                 list(percentage_error))

                # Dump the prediction results
                io_util.write_csv_result(metrics_path, label, results)

                logging.info("")

            io_util.write_csv_result(metrics_path, "", [])

        # Record the best prediction results on the test data
        result_writing_util.record_predictions(pred_results, prediction_path)
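
The stats_map population above keys each observed data point by its operating unit plus its feature vector. A minimal sketch of that keying pattern, using a made-up opunit tag and feature rows:

import numpy as np

# Made-up operating-unit tag and feature/label rows
opunit = "SEQ_SCAN"
evaluate_x = np.array([[1.0, 2.0], [3.0, 4.0]])
evaluate_y = np.array([[10.0], [20.0]])

stats_map = {}
for x, y in zip(evaluate_x, evaluate_y):
    # Key each observation by the opunit followed by its feature values; the
    # resulting tuple is hashable, so it can serve as a dictionary key
    stat_vec = [opunit]
    stat_vec.extend(x)
    stats_map[tuple(stat_vec)] = y
print(stats_map)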
Example #7
    def train(self):
        """Train the mini-models

        :return: the map of the trained models
        """

        data_list = []

        # First get the data for all mini runners
        for filename in glob.glob(os.path.join(self.input_path, '*.csv')):
            print(filename)
            data_list += opunit_data.get_mini_runner_data(filename)

        model_map = {}
        # train the models for all the operating units
        for data in data_list:
            x_train, x_test, y_train, y_test = model_selection.train_test_split(data.x, data.y,
                                                                                test_size=self.test_ratio,
                                                                                random_state=0)

            # Write the first header row to the result file
            metrics_path = "{}/{}.csv".format(self.model_metrics_path, data.opunit.name.lower())
            prediction_path = "{}/{}_prediction.csv".format(self.model_metrics_path, data.opunit.name.lower())
            result_writing_util.create_metrics_and_prediction_files(metrics_path, prediction_path)

            methods = self.ml_models
            # Only use linear regression for the arithmetic operating units
            if data.opunit in data_info.ARITHMETIC_OPUNITS:
                methods = ["lr"]

            # Also test the prediction with the target transformer (if specified for the operating unit)
            transformers = [None]
            modeling_transformer = data_transforming_util.OPUNIT_MODELING_TRANSFORMER_MAP[data.opunit]
            if modeling_transformer is not None:
                transformers.append(modeling_transformer)

            min_percentage_error = 1
            pred_results = None
            elapsed_us_index = data_info.TARGET_CSV_INDEX[Target.ELAPSED_US]

            for transformer in transformers:
                for method in methods:
                    # Train the model
                    logging.info("{} {}".format(data.opunit.name, method))
                    regressor = model.Model(method, modeling_transformer=transformer)
                    regressor.train(x_train, y_train)

                    # Evaluate on both the training and test set
                    results = []
                    evaluate_data = [(x_train, y_train), (x_test, y_test)]
                    train_test_label = ["Train", "Test"]
                    for i, d in enumerate(evaluate_data):
                        evaluate_x = d[0]
                        evaluate_y = d[1]

                        y_pred = regressor.predict(evaluate_x)
                        logging.debug("x shape: {}".format(evaluate_x.shape))
                        logging.debug("y shape: {}".format(y_pred.shape))
                        percentage_error = np.average(np.abs(evaluate_y - y_pred) / (evaluate_y + 1), axis=0)
                        results += list(percentage_error) + [""]

                        logging.info('{} Percentage Error: {}'.format(train_test_label[i], percentage_error))

                        # Record the model with the lowest elapsed time prediction error (since that
                        # might be the most important prediction)
                        if (i == 1 and percentage_error[elapsed_us_index] < min_percentage_error and transformer ==
                                transformers[-1]):
                            min_percentage_error = percentage_error[elapsed_us_index]
                            model_map[data.opunit] = regressor
                            pred_results = (evaluate_x, y_pred, evaluate_y)

                    # Dump the prediction results
                    transform = " "
                    if transformer is not None:
                        transform = " transform"
                    io_util.write_csv_result(metrics_path, method + transform, results)

                    logging.info("")

                io_util.write_csv_result(metrics_path, "", [])

            # Record the best prediction results on the test data
            result_writing_util.record_predictions(pred_results, prediction_path)

        return model_map
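
The model selection in the inner loops simply keeps the regressor with the lowest test error on the elapsed-time target. A stripped-down sketch of that bookkeeping, with made-up per-method errors:

import numpy as np

# Made-up per-target test errors for a few candidate methods
candidate_errors = {
    "lr": np.array([0.30, 0.10]),
    "huber": np.array([0.25, 0.40]),
    "rf": np.array([0.20, 0.35]),
}
elapsed_us_index = 0  # column used to judge which model is best

best_method = None
min_percentage_error = 1
for method, percentage_error in candidate_errors.items():
    # Mirror the selection rule above: keep the method with the lowest test
    # error on the elapsed-time target
    if percentage_error[elapsed_us_index] < min_percentage_error:
        min_percentage_error = percentage_error[elapsed_us_index]
        best_method = method
print(best_method, min_percentage_error)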