Example #1
def main():
    from sklearn.decomposition import PCA

    # Load random data
    X_train, X_test, _, _ = bench.load_data(params, generated_data=['X_train'])

    if params.n_components is None:
        p, n = X_train.shape
        params.n_components = min((n, (2 + min((n, p))) // 3))

    # Create our PCA object
    pca = PCA(svd_solver=params.svd_solver, whiten=params.whiten,
              n_components=params.n_components)

    # Time fit
    fit_time, _ = bench.measure_function_time(pca.fit, X_train, params=params)

    # Time transform
    transform_time, _ = bench.measure_function_time(
        pca.transform, X_train, params=params)

    bench.print_output(library='sklearn', algorithm='pca',
                       stages=['training', 'transformation'],
                       params=params, functions=['PCA.fit', 'PCA.transform'],
                       times=[fit_time, transform_time], accuracy_type=None,
                       accuracies=[None, None], data=[X_train, X_test],
                       alg_instance=pca)
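
The bench.* helpers above come from the benchmark's shared module and are not shown here. As a rough, self-contained sketch of the same fit/transform timing pattern using only NumPy and scikit-learn (synthetic data and time.perf_counter standing in for bench.load_data and bench.measure_function_time):

import time

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X_train = rng.standard_normal((10_000, 50))

pca = PCA(svd_solver='full', whiten=False, n_components=10)

t0 = time.perf_counter()
pca.fit(X_train)                             # time fit
fit_time = time.perf_counter() - t0

t0 = time.perf_counter()
X_t = pca.transform(X_train)                 # time transform
transform_time = time.perf_counter() - t0

print(f'fit: {fit_time:.3f}s, transform: {transform_time:.3f}s, '
      f'output shape: {X_t.shape}')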
Example #2
def main():
    from sklearn.ensemble import RandomForestClassifier

    # Load and convert data
    X_train, X_test, y_train, y_test = bench.load_data(params)

    # Create our random forest classifier
    clf = RandomForestClassifier(criterion=params.criterion,
                                 n_estimators=params.num_trees,
                                 max_depth=params.max_depth,
                                 max_features=params.max_features,
                                 min_samples_split=params.min_samples_split,
                                 max_leaf_nodes=params.max_leaf_nodes,
                                 min_impurity_decrease=params.min_impurity_decrease,
                                 bootstrap=params.bootstrap,
                                 random_state=params.seed,
                                 n_jobs=params.n_jobs)

    params.n_classes = len(np.unique(y_train))

    fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params)
    y_pred = clf.predict(X_train)
    train_acc = 100 * accuracy_score(y_pred, y_train)

    predict_time, y_pred = bench.measure_function_time(
        clf.predict, X_test, params=params)
    test_acc = 100 * accuracy_score(y_pred, y_test)

    bench.print_output(library='sklearn', algorithm='decision_forest_classification',
                       stages=['training', 'prediction'], params=params,
                       functions=['df_clsf.fit', 'df_clsf.predict'],
                       times=[fit_time, predict_time], accuracy_type='accuracy[%]',
                       accuracies=[train_acc, test_acc], data=[X_train, X_test],
                       alg_instance=clf)
Example #3
def main():
    from sklearn.linear_model import LogisticRegression

    # Load generated data
    X_train, X_test, y_train, y_test = bench.load_data(params)

    params.n_classes = len(np.unique(y_train))

    if params.multiclass == 'auto':
        params.multiclass = 'ovr' if params.n_classes == 2 else 'multinomial'

    if not params.tol:
        params.tol = 1e-3 if params.solver == 'newton-cg' else 1e-10

    # Create our classifier object
    clf = LogisticRegression(penalty='l2',
                             C=params.C,
                             n_jobs=params.n_jobs,
                             fit_intercept=params.fit_intercept,
                             verbose=params.verbose,
                             tol=params.tol,
                             max_iter=params.maxiter,
                             solver=params.solver,
                             multi_class=params.multiclass)
    # Time fit and predict
    fit_time, _ = bench.measure_function_time(clf.fit,
                                              X_train,
                                              y_train,
                                              params=params)

    y_pred = clf.predict(X_train)
    y_proba = clf.predict_proba(X_train)
    train_acc = bench.accuracy_score(y_train, y_pred)
    train_log_loss = bench.log_loss(y_train, y_proba)
    train_roc_auc = bench.roc_auc_score(y_train, y_proba)

    predict_time, y_pred = bench.measure_function_time(clf.predict,
                                                       X_test,
                                                       params=params)
    y_proba = clf.predict_proba(X_test)
    test_acc = bench.accuracy_score(y_test, y_pred)
    test_log_loss = bench.log_loss(y_test, y_proba)
    test_roc_auc = bench.roc_auc_score(y_test, y_proba)

    bench.print_output(
        library='sklearn',
        algorithm='logistic_regression',
        stages=['training', 'prediction'],
        params=params,
        functions=['LogReg.fit', 'LogReg.predict'],
        times=[fit_time, predict_time],
        metric_type=['accuracy', 'log_loss', 'roc_auc'],
        metrics=[
            [train_acc, test_acc],
            [train_log_loss, test_log_loss],
            [train_roc_auc, test_roc_auc],
        ],
        data=[X_train, X_test],
        alg_instance=clf,
    )
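
A hedged sketch of how the accuracy, log-loss, and ROC-AUC values above could be computed with plain sklearn.metrics; the bench.accuracy_score / bench.log_loss / bench.roc_auc_score calls are assumed to be thin wrappers over these functions.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score

X, y = make_classification(n_samples=2000, n_classes=3, n_informative=6,
                           random_state=0)
clf = LogisticRegression(max_iter=200).fit(X, y)

y_pred = clf.predict(X)
y_proba = clf.predict_proba(X)

acc = accuracy_score(y, y_pred)
ll = log_loss(y, y_proba)
# roc_auc_score takes the positive-class column for binary problems and the
# full probability matrix plus a multi_class strategy otherwise.
auc = (roc_auc_score(y, y_proba[:, 1]) if y_proba.shape[1] == 2
       else roc_auc_score(y, y_proba, multi_class='ovr'))
print(acc, ll, auc)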
Example #4
def main():
    from sklearn.ensemble import RandomForestRegressor

    # Load and convert data
    X_train, X_test, y_train, y_test = bench.load_data(params)

    # Create our random forest regressor
    regr = RandomForestRegressor(criterion=params.criterion,
                                 n_estimators=params.num_trees,
                                 max_depth=params.max_depth,
                                 max_features=params.max_features,
                                 min_samples_split=params.min_samples_split,
                                 max_leaf_nodes=params.max_leaf_nodes,
                                 min_impurity_decrease=params.min_impurity_decrease,
                                 bootstrap=params.bootstrap,
                                 random_state=params.seed,
                                 n_jobs=params.n_jobs)

    fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params)

    y_pred = regr.predict(X_train)
    train_rmse = bench.rmse_score(y_pred, y_train)

    predict_time, y_pred = bench.measure_function_time(
        regr.predict, X_test, params=params)
    test_rmse = bench.rmse_score(y_pred, y_test)

    bench.print_output(library='sklearn', algorithm='decision_forest_regression',
                       stages=['training', 'prediction'], params=params,
                       functions=['df_regr.fit', 'df_regr.predict'],
                       times=[fit_time, predict_time], accuracy_type='rmse',
                       accuracies=[train_rmse, test_rmse], data=[X_train, X_test],
                       alg_instance=regr)
Example #5
def main():
    from sklearn.linear_model import ElasticNet

    # Load data
    X_train, X_test, y_train, y_test = bench.load_data(params)

    # Create our regression object
    regr = ElasticNet(fit_intercept=params.fit_intercept, l1_ratio=params.l1_ratio,
                      alpha=params.alpha, tol=params.tol,
                      max_iter=params.maxiter, copy_X=False)
    # Time fit
    fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params)

    # Time predict
    predict_time, pred_train = bench.measure_function_time(regr.predict,
                                                           X_train, params=params)

    train_rmse = bench.rmse_score(pred_train, y_train)
    pred_test = regr.predict(X_test)
    test_rmse = bench.rmse_score(pred_test, y_test)

    bench.print_output(library='sklearn', algorithm='elastic-net',
                       stages=['training', 'prediction'], params=params,
                       functions=['ElasticNet.fit', 'ElasticNet.predict'],
                       times=[fit_time, predict_time], accuracy_type='rmse',
                       accuracies=[train_rmse, test_rmse], data=[X_train, X_train],
                       alg_instance=regr)
Example #6
def main():
    from sklearn.neighbors import KNeighborsRegressor

    # Load generated data
    X_train, X_test, y_train, y_test = bench.load_data(params)
    params.n_classes = len(np.unique(y_train))

    # Create a regression object
    knn_regr = KNeighborsRegressor(n_neighbors=params.n_neighbors,
                                   weights=params.weights,
                                   algorithm=params.method,
                                   metric=params.metric,
                                   n_jobs=params.n_jobs)

    # Measure time and accuracy on fitting
    train_time, _ = bench.measure_function_time(
        knn_regr.fit, X_train, y_train, params=params)
    if params.task == 'regression':
        y_pred = knn_regr.predict(X_train)
        train_rmse = bench.rmse_score(y_train, y_pred)
        train_r2 = bench.r2_score(y_train, y_pred)

    # Measure time and accuracy on prediction
    if params.task == 'regression':
        predict_time, yp = bench.measure_function_time(knn_regr.predict, X_test,
                                                       params=params)
        test_rmse = bench.rmse_score(y_test, yp)
        test_r2 = bench.r2_score(y_test, yp)
    else:
        predict_time, _ = bench.measure_function_time(knn_regr.kneighbors, X_test,
                                                      params=params)

    if params.task == 'regression':
        bench.print_output(
            library='sklearn',
            algorithm=knn_regr._fit_method + '_knn_regr',
            stages=['training', 'prediction'],
            params=params,
            functions=['knn_regr.fit', 'knn_regr.predict'],
            times=[train_time, predict_time],
            metric_type=['rmse', 'r2_score'],
            metrics=[[train_rmse, test_rmse], [train_r2, test_r2]],
            data=[X_train, X_test],
            alg_instance=knn_regr,
        )
    else:
        bench.print_output(
            library='sklearn',
            algorithm=knn_regr._fit_method + '_knn_search',
            stages=['training', 'search'],
            params=params,
            functions=['knn_regr.fit', 'knn_regr.kneighbors'],
            times=[train_time, predict_time],
            metric_type=None,
            metrics=[],
            data=[X_train, X_test],
            alg_instance=knn_regr,
        )
Example #7
def main():
    from sklearn.svm import NuSVR

    X_train, X_test, y_train, y_test = bench.load_data(params)
    y_train = np.asfortranarray(y_train).ravel()

    if params.gamma is None:
        params.gamma = 1.0 / X_train.shape[1]

    cache_size_bytes = bench.get_optimal_cache_size(
        X_train.shape[0], max_cache=params.max_cache_size)
    params.cache_size_mb = cache_size_bytes / 1024**2
    params.n_classes = len(np.unique(y_train))

    regr = NuSVR(C=params.C,
                 nu=params.nu,
                 kernel=params.kernel,
                 cache_size=params.cache_size_mb,
                 tol=params.tol,
                 gamma=params.gamma,
                 degree=params.degree)

    fit_time, _ = bench.measure_function_time(regr.fit,
                                              X_train,
                                              y_train,
                                              params=params)
    params.sv_len = regr.support_.shape[0]

    predict_train_time, y_pred = bench.measure_function_time(regr.predict,
                                                             X_train,
                                                             params=params)
    train_rmse = bench.rmse_score(y_train, y_pred)
    train_r2 = bench.r2_score(y_train, y_pred)

    _, y_pred = bench.measure_function_time(regr.predict,
                                            X_test,
                                            params=params)
    test_rmse = bench.rmse_score(y_test, y_pred)
    test_r2 = bench.r2_score(y_test, y_pred)

    bench.print_output(
        library='sklearn',
        algorithm='nuSVR',
        stages=['training', 'prediction'],
        params=params,
        functions=['NuSVR.fit', 'NuSVR.predict'],
        times=[fit_time, predict_train_time],
        metric_type=['rmse', 'r2_score', 'n_sv'],
        metrics=[
            [train_rmse, test_rmse],
            [train_r2, test_r2],
            [int(regr.n_support_.sum()),
             int(regr.n_support_.sum())],
        ],
        data=[X_train, X_train],
        alg_instance=regr,
    )
Example #8
def main():
    from sklearn.cluster import KMeans
    from sklearn.metrics.cluster import davies_bouldin_score

    # Load and convert generated data
    X_train, X_test, _, _ = bench.load_data(params)

    X_init: Any
    if params.filei == 'k-means++':
        X_init = 'k-means++'
    # Load initial centroids from specified path
    elif params.filei is not None:
        X_init = {k: v.astype(params.dtype) for k, v in np.load(params.filei).items()}
        if isinstance(X_init, np.ndarray):
            params.n_clusters = X_init.shape[0]
    # or choose random centroids from training data
    else:
        np.random.seed(params.seed)
        centroids_idx = np.random.randint(low=0, high=X_train.shape[0],
                                          size=params.n_clusters)
        if hasattr(X_train, "iloc"):
            X_init = X_train.iloc[centroids_idx].values
        else:
            X_init = X_train[centroids_idx]

    def fit_kmeans(X, X_init):
        alg = KMeans(n_clusters=params.n_clusters, tol=params.tol,
                     max_iter=params.maxiter, init=X_init, n_init=1)
        alg.fit(X)
        return alg

    # Time fit
    fit_time, kmeans = bench.measure_function_time(fit_kmeans, X_train,
                                                   X_init, params=params)

    train_predict = kmeans.predict(X_train)
    acc_train = davies_bouldin_score(X_train, train_predict)

    # Time predict
    predict_time, test_predict = bench.measure_function_time(
        kmeans.predict, X_test, params=params)

    acc_test = davies_bouldin_score(X_test, test_predict)

    bench.print_output(library='sklearn', algorithm='kmeans',
                       stages=['training', 'prediction'],
                       params=params, functions=['KMeans.fit', 'KMeans.predict'],
                       times=[fit_time, predict_time],
                       accuracy_type='davies_bouldin_score',
                       accuracies=[acc_train, acc_test], data=[X_train, X_test],
                       alg_instance=kmeans)
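
The notable detail here is that KMeans is started from explicit centroids. A self-contained sketch of the random-centroid branch above, assuming X_train is a plain NumPy array (so the .iloc branch is not needed):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score

rng = np.random.default_rng(12345)
X_train = rng.standard_normal((1000, 8))
n_clusters = 5

# pick n_clusters random training rows as the starting centroids
centroids_idx = rng.integers(low=0, high=X_train.shape[0], size=n_clusters)
X_init = X_train[centroids_idx]

# n_init=1 because the initial centroids are given explicitly
kmeans = KMeans(n_clusters=n_clusters, init=X_init, n_init=1,
                max_iter=300, tol=1e-4).fit(X_train)
labels = kmeans.predict(X_train)
print(davies_bouldin_score(X_train, labels))   # lower is better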
Example #9
def main():
    from sklearn.cluster import DBSCAN
    from sklearn.metrics.cluster import davies_bouldin_score

    # Load generated data
    X, _, _, _ = bench.load_data(params, add_dtype=True)

    # Create our clustering object
    dbscan = DBSCAN(eps=params.eps, n_jobs=params.n_jobs,
                    min_samples=params.min_samples, metric='euclidean',
                    algorithm='auto')

    # N.B. algorithm='auto' will select the oneAPI Data Analytics Library (oneDAL)
    # brute-force method when running daal4py-patched scikit-learn, and probably
    # 'kd_tree' when running unpatched scikit-learn.

    # Time fit
    time, _ = bench.measure_function_time(dbscan.fit, X, params=params)
    labels = dbscan.labels_

    params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    acc = davies_bouldin_score(X, labels)

    bench.print_output(library='sklearn', algorithm='dbscan', stages=['training'],
                       params=params, functions=['DBSCAN'], times=[time],
                       accuracies=[acc], accuracy_type='davies_bouldin_score',
                       data=[X], alg_instance=dbscan)
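
The cluster count above subtracts one whenever -1 appears because DBSCAN labels noise points -1, and noise is not a cluster. A tiny illustration:

import numpy as np

labels = np.array([0, 0, 1, 1, 2, -1, -1])        # three clusters plus noise
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
assert n_clusters == 3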
Example #10
def main():
    from sklearn.manifold import TSNE

    # Load and convert data
    X, _, _, _ = bench.load_data(params)

    # Create our TSNE model
    tsne = TSNE(n_components=params.n_components,
                early_exaggeration=params.early_exaggeration,
                learning_rate=params.learning_rate,
                angle=params.angle,
                min_grad_norm=params.min_grad_norm,
                random_state=params.random_state)

    fit_time, _ = bench.measure_function_time(tsne.fit, X, params=params)
    divergence = tsne.kl_divergence_

    bench.print_output(
        library='sklearn',
        algorithm='TSNE',
        stages=['training'],
        params=params,
        functions=['TSNE.fit'],
        times=[fit_time],
        metric_type='divergence',
        metrics=[divergence],
        data=[X],
        alg_instance=tsne,
    )
Example #11
def main():
    from sklearn.model_selection import train_test_split

    # Load generated data
    X, y, _, _ = bench.load_data(params)

    data_args: Iterable
    if params.include_y:
        data_args = (X, y)
    else:
        data_args = (X, )

    tts_params = {
        'train_size': params.train_size,
        'test_size': params.test_size,
        'shuffle': not params.do_not_shuffle,
        'random_state': params.seed
    }

    if params.rng is not None:
        tts_params['rng'] = params.rng

    time, _ = bench.measure_function_time(
        train_test_split, *data_args, params=params, **tts_params)

    bench.print_output(library='sklearn', algorithm='train_test_split',
                       stages=['training'], params=params,
                       functions=['train_test_split'], times=[time], metrics=[None],
                       metric_type=None, data=[X], alg_params=tts_params)
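
A sketch of the call being timed, with the keyword dictionary expanded; train_size, test_size, shuffle, and random_state are standard train_test_split keywords, while the optional 'rng' key above is only forwarded when params.rng is set and may not be accepted by every scikit-learn release.

import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(100).reshape(50, 2)
y = np.arange(50)

tts_params = {'train_size': 0.8, 'test_size': 0.2,
              'shuffle': True, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **tts_params)
print(X_train.shape, X_test.shape)     # (40, 2) (10, 2)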
Example #12
def main():
    from sklearn.linear_model import Lasso

    # Load data
    X_train, X_test, y_train, y_test = bench.load_data(params)

    # Create our regression object
    regr = Lasso(fit_intercept=params.fit_intercept,
                 alpha=params.alpha,
                 tol=params.tol,
                 max_iter=params.maxiter,
                 copy_X=False)

    # Time fit
    fit_time, _ = bench.measure_function_time(regr.fit,
                                              X_train,
                                              y_train,
                                              params=params)

    # Time predict
    predict_time, yp = bench.measure_function_time(regr.predict,
                                                   X_train,
                                                   params=params)

    train_rmse = bench.rmse_score(y_train, yp)
    train_r2 = bench.r2_score(y_train, yp)
    yp = regr.predict(X_test)
    test_rmse = bench.rmse_score(y_test, yp)
    test_r2 = bench.r2_score(y_test, yp)

    bench.print_output(
        library='sklearn',
        algorithm='lasso',
        stages=['training', 'prediction'],
        params=params,
        functions=['Lasso.fit', 'Lasso.predict'],
        times=[fit_time, predict_time],
        metric_type=['rmse', 'r2_score', 'iter'],
        metrics=[
            [train_rmse, test_rmse],
            [train_r2, test_r2],
            [int(regr.n_iter_), int(regr.n_iter_)],
        ],
        data=[X_train, X_test],
        alg_instance=regr,
    )
Example #13
def main():
    from sklearn.neighbors import KNeighborsClassifier

    # Load generated data
    X_train, X_test, y_train, y_test = bench.load_data(params)
    params.n_classes = len(np.unique(y_train))

    # Create classification object
    knn_clsf = KNeighborsClassifier(n_neighbors=params.n_neighbors,
                                    weights=params.weights,
                                    algorithm=params.method,
                                    metric=params.metric,
                                    n_jobs=params.n_jobs)

    # Measure time and accuracy on fitting
    train_time, _ = bench.measure_function_time(knn_clsf.fit, X_train, y_train, params=params)
    if params.task == 'classification':
        y_pred = knn_clsf.predict(X_train)
        train_acc = 100 * accuracy_score(y_pred, y_train)

    # Measure time and accuracy on prediction
    if params.task == 'classification':
        predict_time, yp = bench.measure_function_time(knn_clsf.predict, X_test,
                                                       params=params)
        test_acc = 100 * accuracy_score(yp, y_test)
    else:
        predict_time, _ = bench.measure_function_time(knn_clsf.kneighbors, X_test,
                                                      params=params)

    if params.task == 'classification':
        bench.print_output(library='sklearn',
                           algorithm=knn_clsf._fit_method + '_knn_classification',
                           stages=['training', 'prediction'], params=params,
                           functions=['knn_clsf.fit', 'knn_clsf.predict'],
                           times=[train_time, predict_time],
                           accuracies=[train_acc, test_acc], accuracy_type='accuracy[%]',
                           data=[X_train, X_test], alg_instance=knn_clsf)
    else:
        bench.print_output(library='sklearn',
                           algorithm=knn_clsf._fit_method + '_knn_search',
                           stages=['training', 'search'], params=params,
                           functions=['knn_clsf.fit', 'knn_clsf.kneighbors'],
                           times=[train_time, predict_time],
                           accuracies=[], accuracy_type=None,
                           data=[X_train, X_test], alg_instance=knn_clsf)
Example #14
def main():
    from sklearn.linear_model import LinearRegression

    # Load data
    X_train, X_test, y_train, y_test = bench.load_data(
        params, generated_data=['X_train', 'y_train'])

    # Create our regression object
    regr = LinearRegression(fit_intercept=params.fit_intercept,
                            n_jobs=params.n_jobs,
                            copy_X=False)

    # Time fit
    fit_time, _ = bench.measure_function_time(regr.fit,
                                              X_train,
                                              y_train,
                                              params=params)

    # Time predict
    predict_time, yp = bench.measure_function_time(regr.predict,
                                                   X_test,
                                                   params=params)

    test_rmse = bench.rmse_score(y_test, yp)
    test_r2 = bench.r2_score(y_test, yp)
    yp = regr.predict(X_train)
    train_rmse = bench.rmse_score(y_train, yp)
    train_r2 = bench.r2_score(y_train, yp)

    bench.print_output(
        library='sklearn',
        algorithm='lin_reg',
        stages=['training', 'prediction'],
        params=params,
        functions=['Linear.fit', 'Linear.predict'],
        times=[fit_time, predict_time],
        metric_type=['rmse', 'r2_score'],
        metrics=[[train_rmse, test_rmse], [train_r2, test_r2]],
        data=[X_train, X_test],
        alg_instance=regr,
    )
Example #15
def main():
    from sklearn.linear_model import ElasticNet

    # Load data
    X_train, X_test, y_train, y_test = bench.load_data(params)

    # Create our regression object
    regr = ElasticNet(fit_intercept=params.fit_intercept, l1_ratio=params.l1_ratio,
                      alpha=params.alpha, tol=params.tol,
                      max_iter=params.maxiter)
    # Time fit
    fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params)

    # Time predict
    predict_time, y_pred = bench.measure_function_time(regr.predict,
                                                       X_train, params=params)

    train_rmse = bench.rmse_score(y_train, y_pred)
    train_r2 = bench.r2_score(y_train, y_pred)
    y_pred = regr.predict(X_test)
    test_rmse = bench.rmse_score(y_test, y_pred)
    test_r2 = bench.r2_score(y_test, y_pred)

    bench.print_output(
        library='sklearn',
        algorithm='elasticnet',
        stages=['training', 'prediction'],
        params=params,
        functions=['ElasticNet.fit', 'ElasticNet.predict'],
        times=[fit_time, predict_time],
        metric_type=['rmse', 'r2_score', 'iter'],
        metrics=[
            [train_rmse, test_rmse],
            [train_r2, test_r2],
            [int(regr.n_iter_), int(regr.n_iter_)],
        ],
        data=[X_train, X_train],
        alg_instance=regr,
    )
Example #16
def main():
    from sklearn.linear_model import Ridge

    # Load data
    X_train, X_test, y_train, y_test = bench.load_data(
        params, generated_data=['X_train', 'y_train'])

    # Create our regression object
    regr = Ridge(fit_intercept=params.fit_intercept,
                 alpha=params.alpha,
                 solver=params.solver)

    # Time fit
    fit_time, _ = bench.measure_function_time(regr.fit,
                                              X_train,
                                              y_train,
                                              params=params)

    # Time predict
    predict_time, yp = bench.measure_function_time(regr.predict,
                                                   X_test,
                                                   params=params)

    test_rmse = bench.rmse_score(yp, y_test)
    yp = regr.predict(X_train)
    train_rmse = bench.rmse_score(yp, y_train)

    bench.print_output(library='sklearn',
                       algorithm='ridge_regression',
                       stages=['training', 'prediction'],
                       params=params,
                       functions=['Ridge.fit', 'Ridge.predict'],
                       times=[fit_time, predict_time],
                       accuracy_type='rmse',
                       accuracies=[train_rmse, test_rmse],
                       data=[X_train, X_test],
                       alg_instance=regr)
Example #17
def main():
    from sklearn.metrics.pairwise import pairwise_distances

    # Load data
    X, _, _, _ = bench.load_data(params,
                                 generated_data=['X_train'],
                                 add_dtype=True)

    time, _ = bench.measure_function_time(pairwise_distances,
                                          X,
                                          metric=params.metric,
                                          n_jobs=params.n_jobs,
                                          params=params)

    bench.print_output(library='sklearn',
                       algorithm='distances',
                       stages=['computation'],
                       params=params,
                       functions=[params.metric.capitalize()],
                       times=[time],
                       metric_type=None,
                       metrics=[None],
                       data=[X],
                       alg_params={'metric': params.metric})
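
For scale, the call being timed builds a full n_samples x n_samples distance matrix; a minimal sketch:

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

X = np.random.default_rng(0).standard_normal((500, 16))
D = pairwise_distances(X, metric='cosine', n_jobs=-1)
print(D.shape)   # (500, 500); D[i, j] is the cosine distance between rows i and j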
Example #18
    min_impurity_decrease=params.min_impurity_decrease,
    bootstrap=params.bootstrap,
)


def fit(regr, X, y):
    return regr.fit(X, y)


def predict(regr, X):
    return regr.predict(X, predict_model='GPU')


fit_time, _ = bench.measure_function_time(fit,
                                          regr,
                                          X_train,
                                          y_train,
                                          params=params)

y_pred = predict(regr, X_train)
train_rmse = bench.rmse_score(y_pred, y_train)

predict_time, y_pred = bench.measure_function_time(predict,
                                                   regr,
                                                   X_test,
                                                   params=params)
test_rmse = bench.rmse_score(y_pred, y_test)

bench.print_output(library='cuml',
                   algorithm='df_regr',
                   stages=['training', 'prediction'],
Example #19
parser = argparse.ArgumentParser(description='daal4py pairwise distances '
                                 'benchmark')
parser.add_argument('--metric',
                    default='cosine',
                    choices=['cosine', 'correlation'],
                    help='Metric to test for pairwise distances')
params = bench.parse_args(parser)

# Load data
X, _, _, _ = bench.load_data(params,
                             generated_data=['X_train'],
                             add_dtype=True)

pairwise_distances = cosine_distance if params.metric == 'cosine' else correlation_distance

time, _ = bench.measure_function_time(compute_distances,
                                      pairwise_distances,
                                      X,
                                      params=params)

bench.print_output(library='daal4py',
                   algorithm='distances',
                   stages=['computation'],
                   params=params,
                   functions=[params.metric.capitalize()],
                   times=[time],
                   metric_type=None,
                   metrics=[None],
                   data=[X],
                   alg_params={'metric': params.metric})
Example #20
def main():
    from sklearn.svm import SVC

    X_train, X_test, y_train, y_test = bench.load_data(params)
    y_train = np.asfortranarray(y_train).ravel()

    if params.gamma is None:
        params.gamma = 1.0 / X_train.shape[1]

    cache_size_bytes = bench.get_optimal_cache_size(
        X_train.shape[0], max_cache=params.max_cache_size)
    params.cache_size_mb = cache_size_bytes / 1024**2
    params.n_classes = len(np.unique(y_train))

    clf = SVC(C=params.C,
              kernel=params.kernel,
              cache_size=params.cache_size_mb,
              tol=params.tol,
              gamma=params.gamma,
              probability=params.probability,
              random_state=43,
              degree=params.degree)

    fit_time, _ = bench.measure_function_time(clf.fit,
                                              X_train,
                                              y_train,
                                              params=params)
    params.sv_len = clf.support_.shape[0]

    if params.probability:
        state_predict = 'predict_proba'
        clf_predict = clf.predict_proba
        train_acc = None
        test_acc = None

        predict_train_time, y_pred = bench.measure_function_time(clf_predict,
                                                                 X_train,
                                                                 params=params)
        train_log_loss = bench.log_loss(y_train, y_pred)
        train_roc_auc = bench.roc_auc_score(y_train, y_pred)

        _, y_pred = bench.measure_function_time(clf_predict,
                                                X_test,
                                                params=params)
        test_log_loss = bench.log_loss(y_test, y_pred)
        test_roc_auc = bench.roc_auc_score(y_test, y_pred)
    else:
        state_predict = 'prediction'
        clf_predict = clf.predict
        train_log_loss = None
        test_log_loss = None
        train_roc_auc = None
        test_roc_auc = None

        predict_train_time, y_pred = bench.measure_function_time(clf_predict,
                                                                 X_train,
                                                                 params=params)
        train_acc = bench.accuracy_score(y_train, y_pred)

        _, y_pred = bench.measure_function_time(clf_predict,
                                                X_test,
                                                params=params)
        test_acc = bench.accuracy_score(y_test, y_pred)

    bench.print_output(
        library='sklearn',
        algorithm='SVC',
        stages=['training', state_predict],
        params=params,
        functions=['SVM.fit', f'SVM.{state_predict}'],
        times=[fit_time, predict_train_time],
        metric_type=['accuracy', 'log_loss', 'roc_auc', 'n_sv'],
        metrics=[
            [train_acc, test_acc],
            [train_log_loss, test_log_loss],
            [train_roc_auc, test_roc_auc],
            [int(clf.n_support_.sum()),
             int(clf.n_support_.sum())],
        ],
        data=[X_train, X_train],
        alg_instance=clf,
    )
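
A small sketch of what the 'n_sv' metric and params.sv_len above report: n_support_ holds the per-class support-vector counts, and its sum equals the number of indices in support_.

from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=500, random_state=0)
clf = SVC(C=1.0, kernel='rbf', gamma=1.0 / X.shape[1]).fit(X, y)

print(clf.n_support_)                    # support vectors per class
print(int(clf.n_support_.sum()),         # total support vectors...
      clf.support_.shape[0])             # ...equals len(support_)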
Example #21
params = bench.parse_args(parser)

from sklearn.linear_model import Ridge

# Load data
X_train, X_test, y_train, y_test = bench.load_data(
    params, generated_data=['X_train', 'y_train'])

# Create our regression object
regr = Ridge(fit_intercept=params.fit_intercept,
             alpha=params.alpha,
             solver=params.solver)

# Time fit
fit_time, _ = bench.measure_function_time(regr.fit,
                                          X_train,
                                          y_train,
                                          params=params)

# Time predict
predict_time, yp = bench.measure_function_time(regr.predict,
                                               X_test,
                                               params=params)

test_rmse = bench.rmse_score(yp, y_test)
yp = regr.predict(X_train)
train_rmse = bench.rmse_score(yp, y_train)

bench.print_output(library='sklearn',
                   algorithm='ridge_regression',
                   stages=['training', 'prediction'],
                   params=params,
Example #22
def test_transform(Xp, pca_result, eigenvalues, eigenvectors):
    return pca_transform_daal(pca_result,
                              Xp,
                              params.n_components,
                              X_train.shape[0],
                              eigenvalues,
                              eigenvectors,
                              whiten=params.whiten)


columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
           'svd_solver', 'n_components', 'whiten', 'time')

# Time fit
fit_time, res = measure_function_time(test_fit, X_train, params=params)

# Time transform
transform_time, tr = measure_function_time(test_transform,
                                           X_test,
                                           *res[:3],
                                           params=params)

print_output(library='daal4py',
             algorithm='pca',
             stages=['training', 'transformation'],
             columns=columns,
             params=params,
             functions=['PCA.fit', 'PCA.transform'],
             times=[fit_time, transform_time],
             accuracy_type=None,
Example #23
if params.objective.startswith('reg'):
    task = 'regression'
    metric_name, metric_func = 'rmse', bench.rmse_score
else:
    task = 'classification'
    metric_name, metric_func = 'accuracy[%]', utils.get_accuracy
    if 'cudf' in str(type(y_train)):
        params.n_classes = y_train[y_train.columns[0]].nunique()
    else:
        params.n_classes = len(np.unique(y_train))
    if params.n_classes > 2:
        lgbm_params['num_class'] = params.n_classes

t_creat_train, lgbm_train = bench.measure_function_time(lgbm.Dataset,
                                                        X_train,
                                                        y_train,
                                                        params=params,
                                                        free_raw_data=False)

t_creat_test, lgbm_test = bench.measure_function_time(lgbm.Dataset,
                                                      X_test,
                                                      y_test,
                                                      params=params,
                                                      reference=lgbm_train,
                                                      free_raw_data=False)

t_train, model_lgbm = bench.measure_function_time(
    lgbm.train,
    lgbm_params,
    lgbm_train,
    params=params,
Example #24
# Load data
X_train, X_test, y_train, y_test = load_data(params)

# Create our regression object
regr = Lasso(fit_intercept=params.fit_intercept,
             alpha=params.alpha,
             tol=params.tol,
             max_iter=params.maxiter,
             copy_X=False)

columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
           'time')

# Time fit
fit_time, _ = measure_function_time(regr.fit, X_train, y_train, params=params)

# Time predict
predict_time, pred_train = measure_function_time(regr.predict,
                                                 X_train,
                                                 params=params)

train_rmse = rmse_score(pred_train, y_train)
pred_test = regr.predict(X_test)
test_rmse = rmse_score(pred_test, y_test)

print_output(library='sklearn',
             algorithm='lasso',
             stages=['training', 'prediction'],
             columns=columns,
             params=params,
Example #25
clf = RandomForestClassifier(
    criterion=params.criterion,
    n_estimators=params.num_trees,
    max_depth=params.max_depth,
    max_features=params.max_features,
    min_samples_split=params.min_samples_split,
    max_leaf_nodes=params.max_leaf_nodes,
    min_impurity_decrease=params.min_impurity_decrease,
    bootstrap=params.bootstrap,
    random_state=params.seed,
    n_jobs=params.n_jobs)

params.n_classes = len(np.unique(y_train))

fit_time, _ = bench.measure_function_time(clf.fit,
                                          X_train,
                                          y_train,
                                          params=params)
y_pred = clf.predict(X_train)
train_acc = 100 * accuracy_score(y_pred, y_train)

predict_time, y_pred = bench.measure_function_time(clf.predict,
                                                   X_test,
                                                   params=params)
test_acc = 100 * accuracy_score(y_pred, y_test)

bench.print_output(library='sklearn',
                   algorithm='decision_forest_classification',
                   stages=['training', 'prediction'],
                   params=params,
                   functions=['df_clsf.fit', 'df_clsf.predict'],
                   times=[fit_time, predict_time],
Example #26
    params = bench.parse_args(parser, prefix='daal4py')

    # Load data
    X_train, X_test, y_train, y_test = bench.load_data(
        params, add_dtype=True, label_2d=True)

    params.n_classes = len(np.unique(y_train))
    if isinstance(params.max_features, float):
        params.max_features = int(X_train.shape[1] * params.max_features)

    # Time fit and predict
    fit_time, res = bench.measure_function_time(
        df_clsf_fit, X_train, y_train,
        params.n_classes,
        n_trees=params.num_trees,
        n_features_per_node=params.max_features,
        max_depth=params.max_depth,
        min_impurity=params.min_impurity_decrease,
        bootstrap=params.bootstrap,
        seed=params.seed,
        params=params)

    yp = df_clsf_predict(X_train, res, params.n_classes)
    train_acc = 100 * accuracy_score(yp, y_train)

    predict_time, yp = bench.measure_function_time(
        df_clsf_predict, X_test, res, params.n_classes, params=params)
    test_acc = 100 * accuracy_score(yp, y_test)

    bench.print_output(library='daal4py', algorithm='decision_forest_classification',
                       stages=['training', 'prediction'], params=params,
                       functions=['df_clsf.fit', 'df_clsf.predict'],
Example #27
def test_fit(X, y):
    regr_train = ridge_regression_training(fptype=getFPType(X),
                                           ridgeParameters=np.array(
                                               [[params.alpha]]),
                                           interceptFlag=params.fit_intercept)
    return regr_train.compute(X, y)


def test_predict(Xp, model):
    regr_predict = ridge_regression_prediction(fptype=getFPType(Xp))
    return regr_predict.compute(Xp, model)


# Time fit
fit_time, res = bench.measure_function_time(test_fit,
                                            X_train,
                                            y_train,
                                            params=params)

# Time predict
predict_time, yp = bench.measure_function_time(test_predict,
                                               X_test,
                                               res.model,
                                               params=params)

test_rmse = bench.rmse_score(yp.prediction, y_test)
pres = test_predict(X_train, res.model)
train_rmse = bench.rmse_score(pres.prediction, y_train)

bench.print_output(library='daal4py',
                   algorithm='ridge_regression',
                   stages=['training', 'prediction'],
Example #28
    metric_name = 'accuracy[%]'
    metric_func = lambda y1, y2: 100 * accuracy_score(y1, y2)
    columns += ('n_classes', 'accuracy', 'time')
    if 'cudf' in str(type(y_train)):
        params.n_classes = y_train[y_train.columns[0]].nunique()
    else:
        params.n_classes = len(np.unique(y_train))
    if params.n_classes > 2:
        xgb_params['num_class'] = params.n_classes

dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test, y_test)

fit_time, booster = measure_function_time(xgb.train,
                                          xgb_params,
                                          dtrain,
                                          params.n_estimators,
                                          params=params)
y_pred = convert_xgb_predictions(booster.predict(dtrain), params.objective)
train_metric = metric_func(y_pred, y_train)

predict_time, y_pred = measure_function_time(booster.predict,
                                             dtest,
                                             params=params)
test_metric = metric_func(convert_xgb_predictions(y_pred, params.objective),
                          y_test)

print_output(library='xgboost',
             algorithm=f'gradient_boosted_trees_{task}',
             stages=['training', 'prediction'],
             columns=columns,
Example #29

# Workaround for cuML kmeans fail
# when second call of 'fit' method causes AttributeError
def kmeans_fit(X):
    alg = KMeans(n_clusters=params.n_clusters,
                 tol=params.tol,
                 max_iter=params.maxiter,
                 init=X_init,
                 max_samples_per_batch=params.samples_per_batch)
    alg.fit(X)
    return alg


# Time fit
fit_time, kmeans = measure_function_time(kmeans_fit, X_train, params=params)
train_predict = kmeans.predict(X_train)

# Time predict
predict_time, test_predict = measure_function_time(kmeans.predict,
                                                   X_test,
                                                   params=params)

X_train_host = convert_to_numpy(X_train)
train_predict_host = convert_to_numpy(train_predict)
acc_train = davies_bouldin_score(X_train_host, train_predict_host)

X_test_host = convert_to_numpy(X_test)
test_predict_host = convert_to_numpy(test_predict)

acc_test = davies_bouldin_score(X_test_host, test_predict_host)
Example #30
parser.add_argument('-m',
                    '--min-samples',
                    default=5,
                    type=int,
                    help='The minimum number of samples required in a '
                    'neighborhood to consider a point a core point')
params = bench.parse_args(parser)

# Load generated data
X, _, _, _ = bench.load_data(params)

# Create our clustering object
dbscan = DBSCAN(eps=params.eps, min_samples=params.min_samples)

# Time fit
time, _ = bench.measure_function_time(dbscan.fit, X, params=params)
labels = dbscan.labels_

X_host = bench.convert_to_numpy(X)
labels_host = bench.convert_to_numpy(labels)

acc = davies_bouldin_score(X_host, labels_host)
params.n_clusters = len(set(labels_host)) - (1 if -1 in labels_host else 0)

bench.print_output(library='cuml',
                   algorithm='dbscan',
                   stages=['training'],
                   params=params,
                   functions=['DBSCAN'],
                   times=[time],
                   metrics=[acc],