def main():
    from sklearn.linear_model import LogisticRegression

    # Fetch the benchmark's train/test split.
    X_train, X_test, y_train, y_test = bench.load_data(params)

    params.n_classes = len(np.unique(y_train))

    # Resolve 'auto' into a concrete multiclass strategy.
    if params.multiclass == 'auto':
        params.multiclass = 'multinomial' if params.n_classes != 2 else 'ovr'

    # Solver-dependent default tolerance when none was supplied.
    if not params.tol:
        params.tol = 1e-10 if params.solver != 'newton-cg' else 1e-3

    # Build the classifier from benchmark-supplied hyperparameters.
    clf = LogisticRegression(
        penalty='l2', C=params.C, n_jobs=params.n_jobs,
        fit_intercept=params.fit_intercept, verbose=params.verbose,
        tol=params.tol, max_iter=params.maxiter, solver=params.solver,
        multi_class=params.multiclass)

    # Time the fit stage.
    fit_time, _ = bench.measure_function_time(
        clf.fit, X_train, y_train, params=params)

    # Quality metrics on the training split (untimed).
    labels_train = clf.predict(X_train)
    proba_train = clf.predict_proba(X_train)
    train_acc = bench.accuracy_score(y_train, labels_train)
    train_log_loss = bench.log_loss(y_train, proba_train)
    train_roc_auc = bench.roc_auc_score(y_train, proba_train)

    # Time the predict stage, then score the test split.
    predict_time, labels_test = bench.measure_function_time(
        clf.predict, X_test, params=params)
    proba_test = clf.predict_proba(X_test)
    test_acc = bench.accuracy_score(y_test, labels_test)
    test_log_loss = bench.log_loss(y_test, proba_test)
    test_roc_auc = bench.roc_auc_score(y_test, proba_test)

    # Emit one row per stage in the benchmark's common output format.
    bench.print_output(
        library='sklearn',
        algorithm='logistic_regression',
        stages=['training', 'prediction'],
        params=params,
        functions=['LogReg.fit', 'LogReg.predict'],
        times=[fit_time, predict_time],
        metric_type=['accuracy', 'log_loss', 'roc_auc'],
        metrics=[
            [train_acc, test_acc],
            [train_log_loss, test_log_loss],
            [train_roc_auc, test_roc_auc],
        ],
        data=[X_train, X_test],
        alg_instance=clf,
    )
# Example #2
# 0
def main():
    from sklearn.ensemble import RandomForestClassifier

    # Fetch the benchmark's train/test split.
    X_train, X_test, y_train, y_test = bench.load_data(params)

    # Assemble the forest from benchmark-supplied hyperparameters.
    clf = RandomForestClassifier(criterion=params.criterion,
                                 n_estimators=params.num_trees,
                                 max_depth=params.max_depth,
                                 max_features=params.max_features,
                                 min_samples_split=params.min_samples_split,
                                 max_leaf_nodes=params.max_leaf_nodes,
                                 min_impurity_decrease=params.min_impurity_decrease,
                                 bootstrap=params.bootstrap,
                                 random_state=params.seed,
                                 n_jobs=params.n_jobs)

    params.n_classes = len(np.unique(y_train))

    # Time the fit stage.
    fit_time, _ = bench.measure_function_time(
        clf.fit, X_train, y_train, params=params)

    # Quality metrics on the training split (untimed).
    labels = clf.predict(X_train)
    proba = clf.predict_proba(X_train)
    train_acc = bench.accuracy_score(y_train, labels)
    train_log_loss = bench.log_loss(y_train, proba)
    train_roc_auc = bench.roc_auc_score(y_train, proba)

    # Time the predict stage, then score the test split.
    predict_time, labels = bench.measure_function_time(
        clf.predict, X_test, params=params)
    proba = clf.predict_proba(X_test)
    test_acc = bench.accuracy_score(y_test, labels)
    test_log_loss = bench.log_loss(y_test, proba)
    test_roc_auc = bench.roc_auc_score(y_test, proba)

    # Emit one row per stage in the benchmark's common output format.
    bench.print_output(
        library='sklearn',
        algorithm='df_clsf',
        stages=['training', 'prediction'],
        params=params,
        functions=['df_clsf.fit', 'df_clsf.predict'],
        times=[fit_time, predict_time],
        metric_type=['accuracy', 'log_loss', 'roc_auc'],
        metrics=[
            [train_acc, test_acc],
            [train_log_loss, test_log_loss],
            [train_roc_auc, test_roc_auc],
        ],
        data=[X_train, X_test],
        alg_instance=clf,
    )
# Example #3
# 0
def main():
    from sklearn.neighbors import KNeighborsClassifier

    # Load generated data
    X_train, X_test, y_train, y_test = bench.load_data(params)
    params.n_classes = len(np.unique(y_train))

    # Build the k-NN estimator from benchmark parameters.
    knn_clsf = KNeighborsClassifier(n_neighbors=params.n_neighbors,
                                    weights=params.weights,
                                    algorithm=params.method,
                                    metric=params.metric,
                                    n_jobs=params.n_jobs)

    # Time the fit stage.
    train_time, _ = bench.measure_function_time(
        knn_clsf.fit, X_train, y_train, params=params)

    if params.task == 'classification':
        # Training-set quality metrics (untimed).
        labels = knn_clsf.predict(X_train)
        proba = knn_clsf.predict_proba(X_train)
        train_acc = bench.accuracy_score(y_train, labels)
        train_log_loss = bench.log_loss(y_train, proba)
        train_roc_auc = bench.roc_auc_score(y_train, proba)

        # Timed prediction plus test-set quality metrics.
        predict_time, yp = bench.measure_function_time(
            knn_clsf.predict, X_test, params=params)
        proba = knn_clsf.predict_proba(X_test)
        test_acc = bench.accuracy_score(y_test, yp)
        test_log_loss = bench.log_loss(y_test, proba)
        test_roc_auc = bench.roc_auc_score(y_test, proba)

        bench.print_output(
            library='sklearn',
            algorithm=knn_clsf._fit_method + '_knn_classification',
            stages=['training', 'prediction'],
            params=params,
            functions=['knn_clsf.fit', 'knn_clsf.predict'],
            times=[train_time, predict_time],
            metric_type=['accuracy', 'log_loss', 'roc_auc'],
            metrics=[
                [train_acc, test_acc],
                [train_log_loss, test_log_loss],
                [train_roc_auc, test_roc_auc],
            ],
            data=[X_train, X_test],
            alg_instance=knn_clsf,
        )
    else:
        # Search task: time kneighbors() only; no accuracy metrics apply.
        predict_time, _ = bench.measure_function_time(
            knn_clsf.kneighbors, X_test, params=params)

        bench.print_output(
            library='sklearn',
            algorithm=knn_clsf._fit_method + '_knn_search',
            stages=['training', 'search'],
            params=params,
            functions=['knn_clsf.fit', 'knn_clsf.kneighbors'],
            times=[train_time, predict_time],
            metric_type=None,
            metrics=[],
            data=[X_train, X_test],
            alg_instance=knn_clsf,
        )
 def metric_call(x, y):
     return 100 * bench.accuracy_score(x, y)
def main():
    from sklearn.svm import SVC

    X_train, X_test, y_train, y_test = bench.load_data(params)
    y_train = np.asfortranarray(y_train).ravel()

    # Default gamma: one over the feature count.
    if params.gamma is None:
        params.gamma = 1.0 / X_train.shape[1]

    # Kernel cache sized from the training data, capped by the CLI limit.
    cache_size_bytes = bench.get_optimal_cache_size(
        X_train.shape[0], max_cache=params.max_cache_size)
    params.cache_size_mb = cache_size_bytes / 1024**2
    params.n_classes = len(np.unique(y_train))

    clf = SVC(C=params.C,
              kernel=params.kernel,
              cache_size=params.cache_size_mb,
              tol=params.tol,
              gamma=params.gamma,
              probability=params.probability,
              random_state=43,
              degree=params.degree)

    # Time the fit stage and record the support-vector count.
    fit_time, _ = bench.measure_function_time(
        clf.fit, X_train, y_train, params=params)
    params.sv_len = clf.support_.shape[0]

    # Metrics not produced by the chosen mode stay None in the report.
    train_acc = test_acc = None
    train_log_loss = test_log_loss = None
    train_roc_auc = test_roc_auc = None

    if params.probability:
        # Probability mode: time predict_proba, report log-loss / ROC-AUC.
        state_predict = 'predict_proba'
        clf_predict = clf.predict_proba

        predict_train_time, out = bench.measure_function_time(
            clf_predict, X_train, params=params)
        train_log_loss = bench.log_loss(y_train, out)
        train_roc_auc = bench.roc_auc_score(y_train, out)

        _, out = bench.measure_function_time(
            clf_predict, X_test, params=params)
        test_log_loss = bench.log_loss(y_test, out)
        test_roc_auc = bench.roc_auc_score(y_test, out)
    else:
        # Label mode: time predict, report accuracy.
        state_predict = 'prediction'
        clf_predict = clf.predict

        predict_train_time, out = bench.measure_function_time(
            clf_predict, X_train, params=params)
        train_acc = bench.accuracy_score(y_train, out)

        _, out = bench.measure_function_time(
            clf_predict, X_test, params=params)
        test_acc = bench.accuracy_score(y_test, out)

    # Support-vector total is reported once for each stage column.
    n_sv = int(clf.n_support_.sum())
    bench.print_output(
        library='sklearn',
        algorithm='SVC',
        stages=['training', state_predict],
        params=params,
        functions=['SVM.fit', f'SVM.{state_predict}'],
        times=[fit_time, predict_train_time],
        metric_type=['accuracy', 'log_loss', 'roc_auc', 'n_sv'],
        metrics=[
            [train_acc, test_acc],
            [train_log_loss, test_log_loss],
            [train_roc_auc, test_roc_auc],
            [n_sv, n_sv],
        ],
        data=[X_train, X_train],
        alg_instance=clf,
    )
# Example #6
# 0
    xgb_params.update({'nthread': params.threads})

# OMP_NUM_THREADS, when set, overrides any nthread chosen above.
if 'OMP_NUM_THREADS' in os.environ.keys():
    xgb_params['nthread'] = int(os.environ['OMP_NUM_THREADS'])

# Base columns reported for every benchmark row.
columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
           'num_trees')

# Derive the task and its metric from the objective name
# (e.g. 'reg:squarederror' -> regression).
if params.objective.startswith('reg'):
    task = 'regression'
    metric_name, metric_func = 'rmse', rmse_score
    columns += ('rmse', 'time')
else:
    task = 'classification'
    metric_name = 'accuracy[%]'
    metric_func = lambda y1, y2: 100 * accuracy_score(y1, y2)
    columns += ('n_classes', 'accuracy', 'time')
    # cuDF labels need a column-wise nunique(); NumPy arrays use np.unique.
    if 'cudf' in str(type(y_train)):
        params.n_classes = y_train[y_train.columns[0]].nunique()
    else:
        params.n_classes = len(np.unique(y_train))
    # XGBoost requires num_class only for multi-class objectives.
    if params.n_classes > 2:
        xgb_params['num_class'] = params.n_classes

# Wrap the splits in XGBoost's DMatrix containers.
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test, y_test)
fit_time, booster = measure_function_time(xgb.train,
                                          xgb_params,
                                          dtrain,
                                          params.n_estimators,
# Example #7
# 0
# Fall back to 1/n_features when no explicit gamma was given.
if params.gamma is None:
    params.gamma = 1.0 / X_train.shape[1]

# Size the kernel cache from the training set, capped by --max-cache-size.
cache_size_bytes = bench.get_optimal_cache_size(X_train.shape[0],
                                                max_cache=params.max_cache_size)
params.cache_size_mb = cache_size_bytes / 1024**2
# Labels appear to be a cuDF DataFrame here (column-wise nunique) — TODO confirm.
params.n_classes = y_train[y_train.columns[0]].nunique()

# Create our C-SVM classifier
clf = SVC(C=params.C, kernel=params.kernel, max_iter=params.maxiter,
          cache_size=params.cache_size_mb, tol=params.tol,
          gamma=params.gamma)

# Time fit and predict
fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params)
params.sv_len = clf.support_.shape[0]  # number of support vectors found

# Prediction time is measured on the training set; accuracy in percent.
predict_time, y_pred = bench.measure_function_time(
    clf.predict, X_train, params=params)
train_acc = 100 * bench.accuracy_score(y_pred, y_train)

# Test accuracy is computed untimed.
y_pred = clf.predict(X_test)
test_acc = 100 * bench.accuracy_score(y_pred, y_test)

bench.print_output(library='cuml', algorithm='svc',
                   stages=['training', 'prediction'], params=params,
                   functions=['SVM.fit', 'SVM.predict'],
                   times=[fit_time, predict_time], accuracy_type='accuracy[%]',
                   accuracies=[train_acc, test_acc], data=[X_train, X_train],
                   alg_instance=clf)
                                  max_iter=params.maxiter,
                                  solver=params.solver,
                                  outer_loops=params.fit_outer_loops,
                                  inner_loops=params.fit_inner_loops)

    # Unpack the solver output; the multiclass strategy actually used is recorded.
    beta, intercept, solver_result, params.multiclass = res
    print_row(columns, params, function='LogReg.fit', time=fit_time)

    # Time prediction over the configured outer/inner benchmark loops.
    predict_time, yp = time_mean_min(test_predict,
                                     X,
                                     beta,
                                     intercept=intercept,
                                     multi_class=params.multiclass,
                                     outer_loops=params.predict_outer_loops,
                                     inner_loops=params.predict_inner_loops)
    # yp holds per-class scores; argmax along axis 1 gives the predicted label.
    y_pred = np.argmax(yp, axis=1)
    acc = 100 * accuracy_score(y_pred, y)
    print_row(columns,
              params,
              function='LogReg.predict',
              time=predict_time,
              accuracy=acc)

    # Optional diagnostics: solver iteration count and fitted coefficients.
    if params.verbose:
        print()
        print("@ Number of iterations: {}".format(solver_result.nit))
        print("@ fit coefficients:")
        print("@ {}".format(beta.tolist()))
        print("@ fit intercept:")
        print("@ {}".format(intercept.tolist()))
# Example #9
# 0
# Load generated data
X_train, X_test, y_train, y_test = load_data(params)
params.n_classes = len(np.unique(y_train))

# Create classification object
knn_clsf = KNeighborsClassifier(n_neighbors=params.n_neighbors,
                                weights=params.weights,
                                algorithm=params.method,
                                metric=params.metric)

# Fit is untimed here; only prediction is benchmarked.
knn_clsf.fit(X_train, y_train)
# Time predict
# NOTE(review): 'time' shadows the stdlib module name; rename if 'time' is imported.
time, yp = measure_function_time(knn_clsf.predict, X_test, params=params)

# Test accuracy, reported as a percentage.
acc = 100 * accuracy_score(yp, y_test)

# Columns emitted for each benchmark row.
columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
           'n_neighbors', 'n_classes', 'time')

print_output(library='sklearn',
             algorithm='knn_classification',
             stages=['prediction'],
             columns=columns,
             params=params,
             functions=['knn_clsf.predict'],
             times=[time],
             accuracies=[acc],
             accuracy_type='accuracy[%]',
             data=[X_test],
             alg_instance=knn_clsf)
# Example #10
# 0
def main():
    """Benchmark daal4py SVC: parse CLI options, time fit/predict, report accuracy."""
    parser = argparse.ArgumentParser(description='daal4py SVC benchmark with '
                                     'linear kernel')
    parser.add_argument('-C',
                        dest='C',
                        type=float,
                        default=1.0,
                        help='SVM regularization parameter')
    parser.add_argument('--kernel',
                        choices=('linear', 'rbf'),
                        default='linear',
                        help='SVM kernel function')
    parser.add_argument('--gamma',
                        type=float,
                        default=None,
                        help='Parameter for kernel="rbf"')
    parser.add_argument('--maxiter',
                        type=int,
                        default=100000,
                        help='Maximum iterations for the iterative solver. ')
    parser.add_argument('--max-cache-size',
                        type=int,
                        default=8,
                        help='Maximum cache size, in gigabytes, for SVM.')
    parser.add_argument('--tau',
                        type=float,
                        default=1e-12,
                        help='Tau parameter for working set selection scheme')
    parser.add_argument('--tol', type=float, default=1e-3, help='Tolerance')
    parser.add_argument('--no-shrinking',
                        action='store_false',
                        default=True,
                        dest='shrinking',
                        help="Don't use shrinking heuristic")
    params = parse_args(parser, prefix='daal4py')

    # Load data
    X_train, X_test, y_train, y_test = load_data(params,
                                                 add_dtype=True,
                                                 label_2d=True)

    # Default gamma: one over the feature count.
    if params.gamma is None:
        params.gamma = 1 / X_train.shape[1]

    # Kernel cache sized from the training set, capped by --max-cache-size.
    cache_size_bytes = get_optimal_cache_size(X_train.shape[0],
                                              max_cache=params.max_cache_size)
    params.cache_size_mb = cache_size_bytes / 2**20
    params.cache_size_bytes = cache_size_bytes
    params.n_classes = np.unique(y_train).size

    columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype',
               'size', 'kernel', 'cache_size_mb', 'C', 'sv_len', 'n_classes',
               'accuracy', 'time')

    # Time fit and predict
    fit_time, res = measure_function_time(test_fit,
                                          X_train,
                                          y_train,
                                          params,
                                          params=params)
    res, support, indices, n_support = res
    params.sv_len = support.shape[0]  # number of support vectors found

    # Training-set accuracy (untimed), as a percentage.
    yp = test_predict(X_train, res, params)
    train_acc = 100 * accuracy_score(yp, y_train)

    predict_time, yp = measure_function_time(test_predict,
                                             X_test,
                                             res,
                                             params,
                                             params=params)

    # BUG FIX: test predictions must be scored against y_test, not y_train
    # (the original compared the X_test predictions to the training labels).
    test_acc = 100 * accuracy_score(yp, y_test)

    print_output(library='daal4py',
                 algorithm='svc',
                 stages=['training', 'prediction'],
                 columns=columns,
                 params=params,
                 functions=['SVM.fit', 'SVM.predict'],
                 times=[fit_time, predict_time],
                 accuracy_type='accuracy[%]',
                 accuracies=[train_acc, test_acc],
                 data=[X_train, X_test])
    # Time fit and predict
    fit_time, res = measure_function_time(
        df_clsf_fit,
        X_train,
        y_train,
        params.n_classes,
        n_trees=params.num_trees,
        n_features_per_node=params.max_features,
        max_depth=params.max_depth,
        min_impurity=params.min_impurity_decrease,
        bootstrap=params.bootstrap,
        seed=params.seed,
        params=params)

    # Training-set accuracy (untimed), as a percentage.
    yp = df_clsf_predict(X_train, res, params.n_classes)
    train_acc = 100 * accuracy_score(yp, y_train)

    # Timed prediction on the held-out test split.
    predict_time, yp = measure_function_time(df_clsf_predict,
                                             X_test,
                                             res,
                                             params.n_classes,
                                             params=params)
    test_acc = 100 * accuracy_score(yp, y_test)

    # NOTE(review): this call is truncated in the source at this point.
    print_output(library='daal4py',
                 algorithm='decision_forest_classification',
                 stages=['training', 'prediction'],
                 columns=columns,
                 params=params,
                 functions=['df_clsf.fit', 'df_clsf.predict'],
                 times=[fit_time, predict_time],
# Example #12
# 0
    # Emit the table header before the per-function rows.
    print_header(columns, params)

    # Time fit and predict
    fit_time, res = time_mean_min(df_clsf_fit,
                                  X,
                                  y,
                                  params.n_classes,
                                  n_trees=params.num_trees,
                                  seed=params.seed,
                                  n_features_per_node=params.max_features,
                                  max_depth=params.max_depth,
                                  verbose=params.verbose,
                                  outer_loops=params.fit_outer_loops,
                                  inner_loops=params.fit_inner_loops)
    print_row(columns, params, function='df_clsf.fit', time=fit_time)

    # Time prediction on the same data used for fitting (X).
    predict_time, yp = time_mean_min(df_clsf_predict,
                                     X,
                                     res,
                                     params.n_classes,
                                     verbose=params.verbose,
                                     outer_loops=params.predict_outer_loops,
                                     inner_loops=params.predict_inner_loops)
    # Accuracy is reported as a percentage.
    acc = 100 * accuracy_score(yp, y)
    print_row(columns,
              params,
              function='df_clsf.predict',
              time=predict_time,
              accuracy=acc)