Example #1
def get_full_data():
    df, features = data_loader.get_dataset("data/darshan_theta_2017_2020.csv",
                                           "POSIX")

    df.reset_index(inplace=True)
    df.drop(columns=['index', 'level_0'], inplace=True)

    X_train, X_test, y_train, y_test = test_set_utils.random_split(
        df,
        "POSIX_AGG_PERF_BY_SLOWEST_LOG10",
        keep_columns=features,
        test_size=0.3)

    regressor = xgb.XGBRegressor(obj=huber_approx_obj,
                                 n_estimators=2**11,
                                 max_depth=7,
                                 colsample_bytree=0.8,
                                 subsample=1)
    regressor.fit(X_train, y_train, eval_metric=huber_approx_obj)
    y_pred_test = regressor.predict(X_test)

    df = pd.DataFrame({
        'POSIX_AGG_PERF_BY_SLOWEST_LOG10': y_test,
        'prediction': y_pred_test
    })

    return df
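
The custom objective huber_approx_obj used above is referenced but not defined in this snippet. A minimal sketch of a pseudo-Huber objective matching XGBoost's custom-objective convention (per-sample gradient and Hessian) could look like the following; the delta value and the exact signature are assumptions, not the original repository's definition.

import numpy as np

def huber_approx_obj(y_true, y_pred):
    # Pseudo-Huber loss (assumed delta = 1.0): a smooth approximation of the
    # Huber loss. XGBoost expects the gradient and Hessian of the loss with
    # respect to the predictions.
    delta = 1.0
    residual = y_pred - y_true
    scale = 1.0 + (residual / delta) ** 2
    scale_sqrt = np.sqrt(scale)
    grad = residual / scale_sqrt
    hess = 1.0 / (scale * scale_sqrt)
    return grad, hess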
Example #2
def predict(split_time):
    df, columns = data_loader.get_dataset('data/darshan_theta_2017_2020.csv',
                                          'POSIX',
                                          min_job_volume=0)

    df_before = df[df.START_TIME <= split_time]
    df_after = df[df.START_TIME > split_time]

    X_train, X_test_before, y_train, y_test_before = \
        sklearn.model_selection.train_test_split(
            df_before[columns + ["START_TIME"]],
            df_before.POSIX_AGG_PERF_BY_SLOWEST_LOG10,
            test_size=0.3)

    timestamps_before = X_test_before.START_TIME.to_numpy()
    timestamps_after = df_after.START_TIME.to_numpy()
    X_train, X_test_before = X_train[columns], X_test_before[columns]

    X_test_after = df_after[columns]
    y_test_after = df_after.POSIX_AGG_PERF_BY_SLOWEST_LOG10

    X_train, X_test_before, X_test_after = \
        X_train.to_numpy(), X_test_before.to_numpy(), X_test_after.to_numpy()
    y_train, y_test_before, y_test_after = \
        y_train.to_numpy(), y_test_before.to_numpy(), y_test_after.to_numpy()

    len_before = len(y_test_before)

    y_pred_train, y_pred_test = prediction_results(
        X_train, y_train, np.concatenate((X_test_before, X_test_after)),
        np.concatenate((y_test_before, y_test_after)))
    y_pred_test_before = y_pred_test[:len_before]
    y_pred_test_after = y_pred_test[len_before:]

    return timestamps_before, y_test_before, y_pred_test_before, timestamps_after, y_test_after, y_pred_test_after
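
The helper prediction_results is called above but not shown. Assuming it simply trains a regressor on the training split and returns predictions for both splits, a minimal sketch (hyperparameters copied from get_full_data above, everything else assumed) might be:

def prediction_results(X_train, y_train, X_test, y_test):
    # Assumed helper: fit an XGBoost regressor and return predictions for the
    # training and test sets. y_test is accepted to match the call site above
    # but is not needed to produce predictions.
    regressor = xgb.XGBRegressor(n_estimators=2**11,
                                 max_depth=7,
                                 colsample_bytree=0.8,
                                 subsample=1)
    regressor.fit(X_train, y_train)
    return regressor.predict(X_train), regressor.predict(X_test)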
Example #3
def load_dataset():
    df, features = data_loader.get_dataset('data/darshan_theta_2017_2020.csv',
                                           'POSIX',
                                           min_job_volume=0)

    df = df[df.POSIX_TOTAL_BYTES >= 10 * 1024**2]
    df = df.sample(100000, random_state=0)

    return df, features
Example #4
    def load_data(self):
        dataset = get_dataset(self.args.data, normalize=self.args.normalize)
        self.args.num_features = dataset.num_features
        self.args.num_classes = dataset.num_classes
        self.args.avg_num_nodes = np.ceil(
            np.mean([data.num_nodes for data in dataset]))
        print('# %s: [FEATURES]-%d [NUM_CLASSES]-%d [AVG_NODES]-%d' %
              (dataset, self.args.num_features, self.args.num_classes,
               self.args.avg_num_nodes))

        return dataset
Example #5
def load_dataset(module, remove_runtime):
    df, features = data_loader.get_dataset('data/darshan_theta_2017_2020.csv',
                                           module,
                                           min_job_volume=0)

    if module == "POSIX":
        features.remove("POSIX_FDSYNCS_LOG10")

    if remove_runtime:
        features.remove("RUNTIME_LOG10")

    return df, features
Example #6
def get_duplicate_data():
    df, features = data_loader.get_dataset("data/darshan_theta_2017_2020.csv", "POSIX")

    df = df[df.duplicated(features, keep=False)]
    df['prediction'] = -1
    df['time_diff'] = -1
    df.reset_index(inplace=True)
    df.drop(columns=['index', 'level_0'], inplace=True)

    for f, duplicate_set in df.groupby(features):
        group_size = duplicate_set.shape[0]
        sum_throughput = duplicate_set.POSIX_AGG_PERF_BY_SLOWEST_LOG10.sum()
        sum_time       = duplicate_set.START_TIME.sum()
        for idx, row in duplicate_set.iterrows():
            # Mean (log10) throughput of the other duplicates in the group
            df.iloc[idx, -2] = ((sum_throughput - row.POSIX_AGG_PERF_BY_SLOWEST_LOG10)
                                / (group_size - 1))
            # Absolute gap between this run's start time and the mean start
            # time of the other duplicates
            df.iloc[idx, -1] = np.abs((sum_time - row.START_TIME) / (group_size - 1)
                                      - row.START_TIME)

    return df
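
One way to use the frame returned by get_duplicate_data is to compare the observed throughput of each duplicated run against the leave-one-out mean stored in the prediction column. The usage below is a sketch, not part of the original example; both columns hold log10 values, so the exponentiated absolute difference is a multiplicative error factor.

dup = get_duplicate_data()
error_factor = 10**np.abs(dup['POSIX_AGG_PERF_BY_SLOWEST_LOG10'] - dup['prediction'])
print("median duplicate error factor:", error_factor.median())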
Example #7
def grid_search(max_log2_trees, max_log2_depth):
    """
    Run a grid search over two parameters: tree depth and number of trees. For
    each configuration, train an XGBoost regressor and evaluate its performance
    on the test set. Return the results as a DataFrame that can be plotted as a
    matrix.
    """
    df, features = data_loader.get_dataset('data/darshan_theta_2017_2020.csv',
                                           'POSIX',
                                           min_job_volume=0)

    # Don't include runtime in the set of input features
    features = [f for f in features if f != 'RUNTIME_LOG10']

    df_train, df_test = sklearn.model_selection.train_test_split(df,
                                                                 test_size=0.2)

    X_train, X_test = df_train[features], df_test[features]
    # POSIX_AGG_PERF_BY_SLOWEST_LOG10 is log10 of Darshan's I/O throughput estimate
    y_train = df_train["POSIX_AGG_PERF_BY_SLOWEST_LOG10"]
    y_test = df_test["POSIX_AGG_PERF_BY_SLOWEST_LOG10"]

    results = {"depth": [], "trees": [], "error": []}

    def evaluate_configuration(depth, trees):
        regressor = xgb.XGBRegressor(obj=huber_approx_obj,
                                     n_estimators=trees,
                                     max_depth=depth)
        regressor.fit(X_train, y_train, eval_metric=huber_approx_obj)
        y_pred_test = regressor.predict(X_test)

        error = np.median(10**np.abs(y_test - y_pred_test))
        return error

    for trees in [2**x for x in range(1, max_log2_trees + 1)]:
        for depth in range(1, max_log2_depth + 1):
            error = evaluate_configuration(depth, trees)
            print(f"Trees: {trees}, depth: {depth}, error: {error}")

            results['depth'].append(depth)
            results['trees'].append(trees)
            results['error'].append(error)

    return pd.DataFrame(results)
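
grid_search returns its results in long format; to view them as the depth-by-trees matrix described in the docstring, the frame can be pivoted and drawn with matplotlib. The call below uses illustrative arguments, and matplotlib is an assumed dependency, not part of the original example.

import matplotlib.pyplot as plt

results = grid_search(max_log2_trees=11, max_log2_depth=10)
matrix = results.pivot(index='depth', columns='trees', values='error')

# Heatmap of the median error factor per (depth, number of trees) configuration
plt.imshow(matrix.values, aspect='auto')
plt.xticks(range(len(matrix.columns)), matrix.columns, rotation=45)
plt.yticks(range(len(matrix.index)), matrix.index)
plt.xlabel('number of trees')
plt.ylabel('max depth')
plt.colorbar(label='median error factor')
plt.show()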
Example #8
def calculate_duplicate_errors():
    """
    Fit a regressor on all duplicated runs and predict their throughput.

    Returns:
        the target (log10) throughputs, the predicted throughputs, and the
        short application names of the duplicated runs
    """
    df, features = data_loader.get_dataset("data/darshan_theta_2017_2020.csv",
                                           "POSIX")

    duplicated = df.duplicated(features, keep=False)
    apps = df[duplicated]["apps_short"]

    regressor = xgb.XGBRegressor(n_estimators=4000, max_depth=8)
    regressor.fit(df[duplicated][features],
                  df[duplicated]['POSIX_AGG_PERF_BY_SLOWEST_LOG10'])
    y_pred = regressor.predict(df[duplicated][features])

    return df[duplicated]['POSIX_AGG_PERF_BY_SLOWEST_LOG10'], y_pred, apps
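
Both throughput arrays returned by calculate_duplicate_errors are in log10 space, so relative prediction errors can be recovered by exponentiating the absolute difference. The post-processing below is a sketch, not part of the original example.

y_true_log10, y_pred_log10, apps = calculate_duplicate_errors()
# |log10 difference| exponentiated gives a multiplicative (relative) error
relative_errors = 10**np.abs(y_true_log10.to_numpy() - y_pred_log10)
print("median relative error:", np.median(relative_errors))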
Example #9
def load_dataset():
    df, features = data_loader.get_dataset('data/darshan_theta_2017_2020.csv',
                                           'POSIX',
                                           min_job_volume=0)

    return df, features