Example #1
    def train(self):
        """Train the mini-models

        :return: the map of the trained models
        """

        self.model_map = {}

        # Create the results files for the paper
        header = ["OpUnit", "Method"] + [
            target.name for target in data_info.MINI_MODEL_TARGET_LIST
        ]
        summary_file = "{}/mini_runner.csv".format(self.model_metrics_path)
        io_util.create_csv_file(summary_file, header)

        # First get the data for all mini runners
        for filename in sorted(
                glob.glob(os.path.join(self.input_path, '*.csv'))):
            print(filename)
            data_list = opunit_data.get_mini_runner_data(
                filename, self.model_metrics_path, self.txn_sample_interval,
                self.model_map, self.stats_map, self.trim)
            for data in data_list:
                best_y_transformer, best_method = self._train_data(
                    data, summary_file)
                if self.expose_all:
                    self._train_specific_model(data, best_y_transformer,
                                               best_method)

        return self.model_map
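    # Usage sketch (illustrative; the enclosing class and its constructor are not shown
    # in this snippet, so the names below are assumptions):
    #   trainer = MiniTrainer(input_path, model_metrics_path, ...)
    #   model_map = trainer.train()  # {OpUnit: trained mini-model}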
Example #2
def _txn_get_mini_runner_data(filename, model_results_path,
                              txn_sample_interval):
    # In the default case, the data does not need any pre-processing and the file name indicates the opunit
    df = pd.read_csv(filename)
    file_name = os.path.splitext(os.path.basename(filename))[0]

    # prepending a column of ones as the base transaction data feature
    base_x = pd.DataFrame(data=np.ones((df.shape[0], 1), dtype=int))
    df = pd.concat([base_x, df], axis=1)
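    # Illustrative effect: a row that was [start_time, cpu_id, ..., metrics] becomes
    # [1, start_time, cpu_id, ..., metrics]; after the per-interval summation below,
    # that leading column effectively counts the transactions in each interval.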
    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[:, data_info.TARGET_CSV_INDEX[
        data_info.Target.START_TIME]].values
    cpu_ids = df.iloc[:, data_info.TARGET_CSV_INDEX[
        data_info.Target.CPU_ID]].values

    logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))

    # Aggregate the data by time interval for the contending operating units
    prediction_path = "{}/{}_txn_converted_data.csv".format(
        model_results_path, file_name)
    io_util.create_csv_file(prediction_path, [""])

    interval = data_info.CONTENDING_OPUNIT_INTERVAL
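    # Grouping example (hypothetical numbers): if the interval were 1_000_000 us, start
    # times 1_234_567 and 1_890_123 would fall into the same bucket (1_000_000, assuming
    # round_to_interval rounds down), so their rows are aggregated together below.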

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    interval_id_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
            interval_id_map[rounded_time] = set()
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])
        interval_id_map[rounded_time].add(cpu_ids[i])

    # Construct the new data
    x_list = []
    y_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Concatenate the number of different threads
        x_new = np.concatenate((x_new, [len(interval_id_map[rounded_time])]))
        x_new *= txn_sample_interval + 1
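        # Assumption based on the parameter name: only 1 out of every
        # (txn_sample_interval + 1) transactions is present in the input, so the summed
        # features are scaled back up to approximate the full workload.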
        x_list.append(x_new)
        # The prediction is the average behavior
        y_list.append(np.average(interval_y_map[rounded_time], axis=0))
        io_util.write_csv_result(prediction_path, rounded_time,
                                 np.concatenate((x_list[-1], y_list[-1])))

    return [
        OpUnitData(OpUnit[file_name.upper()], np.array(x_list),
                   np.array(y_list))
    ]
Example #3
def _construct_interval_based_global_model_data(data_list, model_results_path):
    """Construct the GlobalImpactData used for the global model training

    :param data_list: The list of GroupedOpUnitData objects
    :param model_results_path: directory path to log the result information
    :return: (GlobalResourceData list, GlobalImpactData list)
    """
    prediction_path = "{}/global_resource_data.csv".format(model_results_path)
    io_util.create_csv_file(prediction_path,
                            ["Elapsed us", "# Concurrent OpUnit Groups"])

    start_time_list = sorted(
        [d.get_start_time(ConcurrentCountingMode.INTERVAL) for d in data_list])
    rounded_start_time_list = [_round_to_second(start_time_list[0])]
    # Map from interval start time to the data in this interval
    interval_data_map = {rounded_start_time_list[0]: []}
    # Get all the interval start times and initialize the map
    for t in start_time_list:
        rounded_time = _round_to_second(t)
        if rounded_time > rounded_start_time_list[-1]:
            rounded_start_time_list.append(rounded_time)
            interval_data_map[rounded_time] = []

    for data in tqdm.tqdm(data_list, desc="Find Interval Data"):
        # For each data, find the intervals that might overlap with it
        interval_start_time = _round_to_second(
            data.get_start_time(ConcurrentCountingMode.EXACT) -
            global_model_config.INTERVAL_SIZE +
            global_model_config.INTERVAL_SEGMENT)
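        # Intuition (inferred from the arithmetic above): this is roughly the earliest
        # interval whose window of INTERVAL_SIZE could still overlap this group's
        # execution; the loop below advances one INTERVAL_SEGMENT at a time until it
        # passes the group's estimated end time.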
        while interval_start_time <= data.get_end_time(
                ConcurrentCountingMode.ESTIMATED):
            if interval_start_time in interval_data_map:
                interval_data_map[interval_start_time].append(data)
            interval_start_time += global_model_config.INTERVAL_SEGMENT

    # Get the global resource data
    resource_data_map = {}
    for start_time in tqdm.tqdm(rounded_start_time_list,
                                desc="Construct GlobalResourceData"):
        resource_data_map[start_time] = _get_global_resource_data(
            start_time, interval_data_map[start_time], prediction_path)

    # Now construct the global impact data
    impact_data_list = []
    for data in data_list:
        interval_start_time = _round_to_second(
            data.get_start_time(ConcurrentCountingMode.INTERVAL))
        resource_data_list = []
        while interval_start_time <= data.get_end_time(
                ConcurrentCountingMode.ESTIMATED):
            if interval_start_time in resource_data_map:
                resource_data_list.append(
                    resource_data_map[interval_start_time])
            interval_start_time += global_model_config.INTERVAL_SIZE

        impact_data_list.append(
            global_model_data.GlobalImpactData(data, resource_data_list))

    return list(resource_data_map.values()), impact_data_list
Example #4
def _predict_grouped_opunit_data(data_list, mini_model_map,
                                 model_results_path):
    """Use the mini-runner to predict the resource consumptions for all the GlobalData, and record the prediction
    result in place

    :param data_list: The list of the GroupedOpUnitData objects
    :param mini_model_map: The trained mini models
    :param model_results_path: file path to log the prediction results
    """
    prediction_path = "{}/grouped_opunit_prediction.csv".format(
        model_results_path)
    io_util.create_csv_file(
        prediction_path,
        ["Pipeline", "Actual Us", "Predicted Us", "", "Ratio Error"])

    # Have to use a prediction cache when having lots of global data...
    prediction_cache = {}

    # First run a prediction on the global running data with the mini model results
    for i, data in enumerate(
            tqdm.tqdm(data_list, desc="Predict GroupedOpUnitData")):
        y = data.y
        logging.debug("{} pipeline elapsed time: {}".format(data.name, y[-1]))
        pipeline_y_pred = 0
        x = None
        for opunit_feature in data.opunit_features:
            opunit = opunit_feature[0]
            opunit_model = mini_model_map[opunit]
            x = np.array(opunit_feature[1]).reshape(1, -1)
            key = (opunit, x.tobytes())
            if key not in prediction_cache:
                y_pred = opunit_model.predict(x)
                # subtract scan from certain double-counted opunits
                if opunit in data_info.SCAN_SUBSTRACT_UNITS:
                    scan_y_pred = mini_model_map[OpUnit.SEQ_SCAN].predict(x)
                    y_pred -= scan_y_pred
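                    # Illustrative reasoning: OUs in SCAN_SUBSTRACT_UNITS are assumed to
                    # already include a sequential scan in their measurements, so the
                    # SEQ_SCAN prediction for the same feature vector is removed to
                    # avoid counting the scan cost twice.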
                y_pred = np.clip(y_pred, 0, None)
                prediction_cache[key] = y_pred
            else:
                y_pred = prediction_cache[key]
            logging.debug(
                "Predicted {} elapsed time with feature {}: {}".format(
                    opunit_feature[0].name, x[0], y_pred[0, -1]))
            pipeline_y_pred += y_pred[0]

        # Record the prediction
        data.y_pred = pipeline_y_pred
        logging.debug("{} pipeline predicted time: {}".format(
            data.name, pipeline_y_pred[-1]))
        ratio_error = abs(y - pipeline_y_pred) / (y + 1e-6)
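        # The 1e-6 term only guards against division by zero when the actual value is 0;
        # it barely changes the ratio otherwise.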
        logging.debug("|Actual - Predict| / Actual: {}".format(
            ratio_error[-1]))

        io_util.write_csv_result(prediction_path,
                                 data.name + " " + str(x[0][-1]),
                                 [y[-1], pipeline_y_pred[-1], "", ratio_error])

        logging.debug("")
Example #5
def create_metrics_and_prediction_files(metrics_path, prediction_path):
    """Create the prediction result files

    :param metrics_path: the file to store the prediction metrics
    :param prediction_path: the file to store the raw predictions
    :return:
    """
    # First write the header to the result files
    io_util.create_csv_file(metrics_path, ["Method"] + _get_result_labels())
    io_util.create_csv_file(prediction_path, None)
Example #6
def _interval_get_mini_runner_data(filename, model_results_path):
    # In the default case, the data does not need any pre-processing and the file name indicates the opunit
    df = pd.read_csv(filename, skipinitialspace=True)
    headers = list(df.columns.values)
    data_info.parse_csv_header(headers, False)
    file_name = os.path.splitext(os.path.basename(filename))[0]

    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
    start_times = df.iloc[:, data_info.RAW_TARGET_CSV_INDEX[
        Target.START_TIME]].values
    logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))

    # change the data based on the interval for the periodically invoked operating units
    prediction_path = "{}/{}_interval_converted_data.csv".format(
        model_results_path, file_name)
    io_util.create_csv_file(prediction_path, [""])

    interval = data_info.PERIODIC_OPUNIT_INTERVAL

    # Map from interval start time to the data in this interval
    interval_x_map = {}
    interval_y_map = {}
    n = x.shape[0]
    for i in tqdm.tqdm(list(range(n)), desc="Group data by interval"):
        rounded_time = data_util.round_to_interval(start_times[i], interval)
        if rounded_time not in interval_x_map:
            interval_x_map[rounded_time] = []
            interval_y_map[rounded_time] = []
        interval_x_map[rounded_time].append(x[i])
        interval_y_map[rounded_time].append(y[i])

    # Construct the new data
    x_list = []
    y_list = []
    for rounded_time in interval_x_map:
        # Sum the features
        x_new = np.sum(interval_x_map[rounded_time], axis=0)
        # Keep the interval parameter the same
        # TODO: currently the interval parameter is always the last. Change the hard-coding later
        x_new[-1] /= len(interval_x_map[rounded_time])
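        # The summation above added the interval value once per row; dividing by the row
        # count restores the original value, assuming the interval is constant within a group.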
        x_list.append(x_new)
        # The prediction is the average behavior
        y_list.append(np.average(interval_y_map[rounded_time], axis=0))
        io_util.write_csv_result(prediction_path, rounded_time,
                                 np.concatenate((x_list[-1], y_list[-1])))

    return [
        OpUnitData(OpUnit[file_name.upper()], np.array(x_list),
                   np.array(y_list))
    ]
Example #7
def _predict_grouped_opunit_data(data_list, mini_model_map, model_results_path):
    """Use the mini-runner to predict the resource consumptions for all the GlobalData, and record the prediction
    result in place

    :param data_list: The list of the GroupedOpUnitData objects
    :param mini_model_map: The trained mini models
    :param model_results_path: file path to log the prediction results
    """
    prediction_path = "{}/grouped_opunit_prediction.csv".format(model_results_path)
    io_util.create_csv_file(prediction_path, ["Pipeline", "", "Actual", "", "Predicted", "", "Ratio Error"])

    # Have to use a prediction cache when having lots of global data...
    prediction_cache = {}

    # First run a prediction on the global running data with the mini model results
    for i, data in enumerate(tqdm.tqdm(data_list, desc="Predict GroupedOpUnitData")):
        y = data.y
        logging.debug("{} pipeline elapsed time: {}".format(data.name, y[-1]))
        pipeline_y_pred = 0
        x = None
        for opunit_feature in data.opunit_features:
            opunit = opunit_feature[0]
            opunit_model = mini_model_map[opunit]
            x = np.array(opunit_feature[1]).reshape(1, -1)
            key = (opunit, x.tobytes())
            if key not in prediction_cache:
                y_pred = opunit_model.predict(x)
                y_pred = np.clip(y_pred, 0, None)
                prediction_cache[key] = y_pred
            else:
                y_pred = prediction_cache[key]
            logging.debug("Predicted {} elapsed time with feature {}: {}".format(opunit_feature[0].name,
                                                                                 x[0], y_pred[0, -1]))

            if opunit in data_info.MEM_ADJUST_OPUNITS:
                # Compute the number of "slots" (based on the row feature or the cardinality feature)
                num_tuple = opunit_feature[1][data_info.TUPLE_NUM_INDEX]
                if opunit == OpUnit.AGG_BUILD:
                    num_tuple = opunit_feature[1][data_info.CARDINALITY_INDEX]

                # SORT/AGG/HASHJOIN_BUILD all allocate a "pointer" buffer
                # that contains the first pow2 larger than num_tuple entries
                pow_high = 2 ** math.ceil(math.log(num_tuple, 2))
                buffer_size = pow_high * data_info.POINTER_SIZE
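                # Worked example (illustrative numbers): num_tuple = 1000 gives
                # pow_high = 1024, so buffer_size = 1024 * POINTER_SIZE
                # (8192 bytes if POINTER_SIZE is 8).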
                if opunit == OpUnit.AGG_BUILD and num_tuple <= 256:
                    # For AGG_BUILD, if slots <= AggregationHashTable::K_DEFAULT_INITIAL_TABLE_SIZE
                    # the buffer is not recorded as part of the pipeline
                    buffer_size = 0

                pred_mem = y_pred[0][data_info.TARGET_CSV_INDEX[Target.MEMORY_B]]
                if pred_mem <= buffer_size:
                    logging.warning("{} feature {} {} with prediction {} exceeds buffer {}"
                            .format(data.name, opunit_feature, opunit_feature[1], y_pred[0], buffer_size))

                # Poorly encapsulated, but the memory scaling factor is the second-to-last entry of the feature
                # vector; slightly inaccurate since it ignores load factors for hash tables
                adj_mem = (pred_mem - buffer_size) * opunit_feature[1][-2] + buffer_size
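                # Worked example (illustrative numbers): pred_mem = 10000 B,
                # buffer_size = 8192 B, and a scaling factor of 2 give
                # adj_mem = (10000 - 8192) * 2 + 8192 = 11808 B.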

                # Don't modify prediction cache
                y_pred = copy.deepcopy(y_pred)
                y_pred[0][data_info.TARGET_CSV_INDEX[Target.MEMORY_B]] = adj_mem

            pipeline_y_pred += y_pred[0]

        # Record the prediction
        data.y_pred = pipeline_y_pred
        logging.debug("{} pipeline predicted time: {}".format(data.name, pipeline_y_pred[-1]))
        ratio_error = abs(y - pipeline_y_pred) / (y + 1)
        logging.debug("|Actual - Predict| / Actual: {}".format(ratio_error[-1]))

        io_util.write_csv_result(prediction_path, data.name, [""] + list(y) + [""] + list(pipeline_y_pred) + [""] +
                                 list(ratio_error))

        logging.debug("")
Example #8
    def _record_results(self, x, y, y_pred, raw_y, mini_model_y_pred, label,
                        data_list):
        """Record the prediction results

        :param x: the input data
        :param y: the actual output
        :param y_pred: the predicted output
        :param label: the result label ("resource" or "impact")
        """
        # Result files
        metrics_path = "{}/global_{}_model_metrics.csv".format(
            self.model_results_path, label)
        prediction_path = "{}/global_{}_model_prediction.csv".format(
            self.model_results_path, label)
        result_writing_util.create_metrics_and_prediction_files(
            metrics_path, prediction_path, True)

        # Log the prediction results
        ratio_error = np.average(np.abs(y - y_pred) / (y + 1e-6), axis=0)
        io_util.write_csv_result(metrics_path, "Model Ratio Error",
                                 ratio_error)
        result_writing_util.record_predictions((x, y_pred, y), prediction_path)

        # Print Error summary to command line
        if label == "resource":
            avg_original_ratio_error = np.average(
                np.abs(y - x[:, :y.shape[1]]) / (y + 1e-6), axis=0)
        else:
            avg_original_ratio_error = np.average(np.abs(1 / (y + 1e-6) - 1),
                                                  axis=0)
        logging.info('Model Original Ratio Error ({}): {}'.format(
            label, avg_original_ratio_error))
        logging.info('Model Ratio Error ({}): {}'.format(label, ratio_error))
        logging.info('')

        if label != "resource":
            # Calculate the accumulated ratio error
            epsilon = global_model_config.RATIO_DIVISION_EPSILON
            mini_model_y_pred = np.array(mini_model_y_pred)
            raw_y = np.array(raw_y)
            raw_y_pred = (mini_model_y_pred + epsilon) * y_pred
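            # As the formula above implies, the model output y_pred acts as a multiplicative
            # correction on top of the mini-model prediction; the epsilon presumably mirrors
            # the epsilon used when the ratio targets were computed, so the raw prediction is
            # reconstructed by multiplying the two.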
            accumulated_raw_y = np.sum(raw_y, axis=0)
            accumulated_raw_y_pred = np.sum(raw_y_pred, axis=0)
            original_ratio_error = np.abs(raw_y - mini_model_y_pred) / (
                raw_y + epsilon)
            avg_original_ratio_error = np.average(original_ratio_error, axis=0)
            ratio_error = np.abs(raw_y - raw_y_pred) / (raw_y + epsilon)
            avg_ratio_error = np.average(ratio_error, axis=0)
            accumulated_percentage_error = np.abs(
                accumulated_raw_y -
                accumulated_raw_y_pred) / (accumulated_raw_y + epsilon)
            original_accumulated_percentage_error = np.abs(
                accumulated_raw_y - np.sum(mini_model_y_pred, axis=0)) / (
                    accumulated_raw_y + epsilon)

            logging.info(
                'Original Ratio Error: {}'.format(avg_original_ratio_error))
            io_util.write_csv_result(metrics_path, "Original Ratio Error",
                                     avg_original_ratio_error)
            logging.info('Ratio Error: {}'.format(avg_ratio_error))
            io_util.write_csv_result(metrics_path, "Ratio Error",
                                     avg_ratio_error)
            logging.info('Original Accumulated Ratio Error: {}'.format(
                original_accumulated_percentage_error))
            io_util.write_csv_result(metrics_path,
                                     "Original Accumulated Ratio Error",
                                     original_accumulated_percentage_error)
            logging.info('Accumulated Ratio Error: {}'.format(
                accumulated_percentage_error))
            io_util.write_csv_result(metrics_path, "Accumulated Ratio Error",
                                     accumulated_percentage_error)
            logging.info('Accumulated Actual: {}'.format(accumulated_raw_y))
            logging.info('Original Accumulated Predict: {}'.format(
                np.sum(mini_model_y_pred, axis=0)))
            logging.info(
                'Accumulated Predict: {}'.format(accumulated_raw_y_pred))

            if label == 'direct':
                prediction_path = "{}/grouped_opunit_prediction.csv".format(
                    self.model_results_path)
                io_util.create_csv_file(prediction_path, [
                    "Pipeline", "", "Actual", "", "Predicted", "",
                    "Ratio Error"
                ])
                for i, data in enumerate(data_list):
                    io_util.write_csv_result(prediction_path, data.name,
                                             [""] + list(raw_y[i]) + [""] +
                                             list(raw_y_pred[i]) + [""] +
                                             list(ratio_error[i]))

                average_result_path = "{}/interval_average_prediction.csv".format(
                    self.model_results_path)
                io_util.create_csv_file(
                    average_result_path,
                    ["Timestamp", "Actual Average", "Predicted Average"])

                interval_y_map = {}
                interval_y_pred_map = {}
                mark_list = None
                #mark_list = _generate_mark_list(data_list)
                for i, data in enumerate(data_list):
                    # Don't count the create index OU
                    # TODO(lin): needs a better way to evaluate... maybe add an id_query field to GroupedOpunitData
                    if data.concurrency > 0:
                        continue
                    if mark_list is not None and not mark_list[i]:
                        continue
                    interval_time = _round_to_interval(
                        data.start_time,
                        global_model_config.AVERAGING_INTERVAL)
                    if interval_time not in interval_y_map:
                        interval_y_map[interval_time] = []
                        interval_y_pred_map[interval_time] = []
                    interval_y_map[interval_time].append(raw_y[i][-5])
                    interval_y_pred_map[interval_time].append(
                        raw_y_pred[i][-5])

                for time in sorted(interval_y_map.keys()):
                    if mark_list is None:
                        io_util.write_csv_result(average_result_path, time, [
                            np.average(interval_y_map[time]),
                            np.average(interval_y_pred_map[time])
                        ])
                    else:
                        io_util.write_csv_result(average_result_path, time, [
                            np.sum(interval_y_map[time]),
                            np.sum(interval_y_pred_map[time])
                        ])
Example #9
def _predict_grouped_opunit_data(data_list, mini_model_map, model_results_path):
    """Use the mini-runner to predict the resource consumptions for all the GlobalData, and record the prediction
    result in place

    :param data_list: The list of the GroupedOpUnitData objects
    :param mini_model_map: The trained mini models
    :param model_results_path: file path to log the prediction results
    """
    prediction_path = "{}/grouped_opunit_prediction.csv".format(model_results_path)
    pipeline_path = "{}/grouped_pipeline.csv".format(model_results_path)
    io_util.create_csv_file(prediction_path, ["Pipeline", "", "Actual", "", "Predicted", "", "Ratio Error"])
    io_util.create_csv_file(pipeline_path, ["Pipeline", "Number", "Percentage", "Actual Us", "Predicted Us", "Us Error", "Absolute Us", "Absolute Us %"])

    # Track pipeline cumulative numbers
    num_pipelines = 0
    total_actual = None
    total_predicted = []
    actual_pipelines = {}
    predicted_pipelines = {}
    count_pipelines = {}

    query_prediction_path = "{}/grouped_query_prediction.csv".format(model_results_path)
    io_util.create_csv_file(query_prediction_path, ["Query", "", "Actual", "", "Predicted", "", "Ratio Error"])
    current_query_id = None
    query_y = None
    query_y_pred = None

    # Have to use a prediction cache when having lots of global data...
    prediction_cache = {}

    # First run a prediction on the global running data with the mini model results
    for i, data in enumerate(tqdm.tqdm(data_list, desc="Predict GroupedOpUnitData")):
        y = data.y
        logging.debug("{} pipeline elapsed time: {}".format(data.name, y[-1]))

        pipeline_y_pred = 0
        x = None
        for opunit_feature in data.opunit_features:
            opunit = opunit_feature[0]
            opunit_model = mini_model_map[opunit]
            x = np.array(opunit_feature[1]).reshape(1, -1)
            key = (opunit, x.tobytes())
            if key not in prediction_cache:
                y_pred = opunit_model.predict(x)
                y_pred = np.clip(y_pred, 0, None)
                prediction_cache[key] = y_pred
            else:
                y_pred = prediction_cache[key]
            logging.debug("Predicted {} elapsed time with feature {}: {}".format(opunit_feature[0].name,
                                                                                 x[0], y_pred[0, -1]))

            if opunit in data_info.MEM_ADJUST_OPUNITS:
                # Compute the number of "slots" (based on the row feature or the cardinality feature)
                num_tuple = opunit_feature[1][data_info.TUPLE_NUM_INDEX]
                if opunit == OpUnit.AGG_BUILD:
                    num_tuple = opunit_feature[1][data_info.CARDINALITY_INDEX]

                # SORT/AGG/HASHJOIN_BUILD all allocate a "pointer" buffer
                # that contains the first pow2 larger than num_tuple entries
                pow_high = 2 ** math.ceil(math.log(num_tuple, 2))
                buffer_size = pow_high * data_info.POINTER_SIZE
                if opunit == OpUnit.AGG_BUILD and num_tuple <= 256:
                    # For AGG_BUILD, if slots <= AggregationHashTable::K_DEFAULT_INITIAL_TABLE_SIZE
                    # the buffer is not recorded as part of the pipeline
                    buffer_size = 0

                pred_mem = y_pred[0][data_info.TARGET_CSV_INDEX[Target.MEMORY_B]]
                if pred_mem <= buffer_size:
                    logging.warning("{} feature {} {} with prediction {} exceeds buffer {}"
                                    .format(data.name, opunit_feature, opunit_feature[1], y_pred[0], buffer_size))

                # Poorly encapsulated, but the memory scaling factor is the third-to-last entry of the feature
                # vector; slightly inaccurate since it ignores load factors for hash tables
                adj_mem = (pred_mem - buffer_size) * opunit_feature[1][-3] + buffer_size

                # Don't modify prediction cache
                y_pred = copy.deepcopy(y_pred)
                y_pred[0][data_info.TARGET_CSV_INDEX[Target.MEMORY_B]] = adj_mem

            pipeline_y_pred += y_pred[0]

        pipeline_y = copy.deepcopy(pipeline_y_pred)

        # Grouping when we're predicting queries
        if data.name[0] == 'q':
            query_id = data.name[1:data.name.rfind(" p")]
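            # e.g. a name like "q3 p1" (illustrative) yields query_id "3"; consecutive
            # pipelines with the same query id are accumulated below.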
            if query_id != current_query_id:
                if current_query_id is not None:
                    io_util.write_csv_result(query_prediction_path, current_query_id, [""] + list(query_y) + [""] +
                                             list(query_y_pred) + [""] +
                                             list(abs(query_y - query_y_pred) / (query_y + 1)))

                current_query_id = query_id
                query_y = y
                query_y_pred = pipeline_y_pred
            else:
                query_y += y
                query_y_pred += pipeline_y_pred

        data.y_pred = pipeline_y
        logging.debug("{} pipeline prediction: {}".format(data.name, pipeline_y))
        logging.debug("{} pipeline predicted time: {}".format(data.name, pipeline_y[-1]))
        ratio_error = abs(y - pipeline_y) / (y + 1)
        logging.debug("|Actual - Predict| / Actual: {}".format(ratio_error[-1]))

        io_util.write_csv_result(prediction_path, data.name, [""] + list(y) + [""] + list(pipeline_y) + [""] +
                                 list(ratio_error))

        logging.debug("")

        # Record cumulative numbers
        if data.name not in actual_pipelines:
            actual_pipelines[data.name] = copy.deepcopy(y)
            predicted_pipelines[data.name] = copy.deepcopy(pipeline_y)
            count_pipelines[data.name] = 1
        else:
            actual_pipelines[data.name] += y
            predicted_pipelines[data.name] += pipeline_y
            count_pipelines[data.name] += 1

        # Update totals
        if total_actual is None:
            total_actual = copy.deepcopy(y)
            total_predicted = copy.deepcopy(pipeline_y)
        else:
            total_actual += y
            total_predicted += pipeline_y

        num_pipelines += 1

    total_elapsed_err = 0
    for pipeline in actual_pipelines:
        actual = actual_pipelines[pipeline]
        predicted = predicted_pipelines[pipeline]
        total_elapsed_err = total_elapsed_err + (abs(actual - predicted))[-1]

    for pipeline in actual_pipelines:
        actual = actual_pipelines[pipeline]
        predicted = predicted_pipelines[pipeline]
        num = count_pipelines[pipeline]

        ratio_error = abs(actual - predicted) / (actual + 1)
        abs_error = abs(actual - predicted)[-1]
        pabs_error = abs_error / total_elapsed_err
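        # pabs_error is this pipeline's share of the total absolute elapsed-time error
        # accumulated above, so the column sums to 1 across all pipelines.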
        io_util.write_csv_result(pipeline_path, pipeline, [num, num*1.0/num_pipelines, actual[-1],
                                 predicted[-1], ratio_error[-1], abs_error, pabs_error] +
                                 [""] + list(actual) + [""] + list(predicted) + [""] + list(ratio_error))

    ratio_error = abs(total_actual - total_predicted) / (total_actual + 1)
    io_util.write_csv_result(pipeline_path, "Total Pipeline", [num_pipelines, 1, total_actual[-1],
                             total_predicted[-1], ratio_error[-1], total_elapsed_err, 1] +
                             [""] + list(total_actual) + [""] + list(total_predicted) + [""] + list(ratio_error))