Example #1
def main():
    from sklearn.linear_model import Lasso

    # Load data
    X_train, X_test, y_train, y_test = bench.load_data(params)

    # Create our regression object
    regr = Lasso(fit_intercept=params.fit_intercept,
                 alpha=params.alpha,
                 tol=params.tol,
                 max_iter=params.maxiter,
                 copy_X=False)

    # Time fit
    fit_time, _ = bench.measure_function_time(regr.fit,
                                              X_train,
                                              y_train,
                                              params=params)

    # Time predict
    predict_time, yp = bench.measure_function_time(regr.predict,
                                                   X_train,
                                                   params=params)

    train_rmse = bench.rmse_score(y_train, yp)
    train_r2 = bench.r2_score(y_train, yp)
    yp = regr.predict(X_test)
    test_rmse = bench.rmse_score(y_test, yp)
    test_r2 = bench.r2_score(y_test, yp)

    bench.print_output(
        library='sklearn',
        algorithm='lasso',
        stages=['training', 'prediction'],
        params=params,
        functions=['Lasso.fit', 'Lasso.predict'],
        times=[fit_time, predict_time],
        metric_type=['rmse', 'r2_score', 'iter'],
        metrics=[
            [train_rmse, test_rmse],
            [train_r2, test_r2],
            [int(regr.n_iter_), int(regr.n_iter_)],
        ],
        data=[X_train, X_test],
        alg_instance=regr,
    )
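
These excerpts all come from benchmark scripts built around a shared `bench` helper module, and they omit module-level imports (`numpy as np`, `Any`/`Iterable` from `typing`, metric imports such as `accuracy_score`) as well as the `params` namespace produced by `bench.parse_args`. As a rough sketch of the timing contract the call sites rely on, an assumption inferred from usage rather than the repository's actual implementation (which also handles repeated runs and reporting), `measure_function_time` behaves like:

import timeit

def measure_function_time(func, *args, params=None, **kwargs):
    # Time one call of func(*args, **kwargs) and return (seconds, result).
    # The real helper reads `params` to decide how many repetitions to run
    # and which timing to report; this sketch simply ignores it.
    start = timeit.default_timer()
    result = func(*args, **kwargs)
    return timeit.default_timer() - start, result

Under that contract, `fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params)` times the fit and discards the fitted estimator that `fit` returns, while the predict calls keep the second element as the predictions.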
Example #2
def main():
    from sklearn.neighbors import KNeighborsClassifier

    # Load generated data
    X_train, X_test, y_train, y_test = bench.load_data(params)
    params.n_classes = len(np.unique(y_train))

    # Create classification object
    knn_clsf = KNeighborsClassifier(n_neighbors=params.n_neighbors,
                                    weights=params.weights,
                                    algorithm=params.method,
                                    metric=params.metric,
                                    n_jobs=params.n_jobs)

    # Measure time and accuracy on fitting
    train_time, _ = bench.measure_function_time(knn_clsf.fit, X_train, y_train, params=params)
    if params.task == 'classification':
        y_pred = knn_clsf.predict(X_train)
        train_acc = 100 * accuracy_score(y_train, y_pred)

    # Measure time and accuracy on prediction
    if params.task == 'classification':
        predict_time, yp = bench.measure_function_time(knn_clsf.predict, X_test,
                                                       params=params)
        test_acc = 100 * accuracy_score(y_test, yp)
    else:
        predict_time, _ = bench.measure_function_time(knn_clsf.kneighbors, X_test,
                                                      params=params)

    if params.task == 'classification':
        bench.print_output(library='sklearn',
                           algorithm=knn_clsf._fit_method + '_knn_classification',
                           stages=['training', 'prediction'], params=params,
                           functions=['knn_clsf.fit', 'knn_clsf.predict'],
                           times=[train_time, predict_time],
                           accuracies=[train_acc, test_acc], accuracy_type='accuracy[%]',
                           data=[X_train, X_test], alg_instance=knn_clsf)
    else:
        bench.print_output(library='sklearn',
                           algorithm=knn_clsf._fit_method + '_knn_search',
                           stages=['training', 'search'], params=params,
                           functions=['knn_clsf.fit', 'knn_clsf.kneighbors'],
                           times=[train_time, predict_time],
                           accuracies=[], accuracy_type=None,
                           data=[X_train, X_test], alg_instance=knn_clsf)
Example #3
def main():
    from sklearn.linear_model import ElasticNet

    # Load data
    X_train, X_test, y_train, y_test = bench.load_data(params)

    # Create our regression object
    regr = ElasticNet(fit_intercept=params.fit_intercept,
                      l1_ratio=params.l1_ratio,
                      alpha=params.alpha,
                      tol=params.tol,
                      max_iter=params.maxiter,
                      copy_X=False)
    # Time fit
    fit_time, _ = bench.measure_function_time(regr.fit,
                                              X_train,
                                              y_train,
                                              params=params)

    # Time predict
    predict_time, y_pred = bench.measure_function_time(regr.predict,
                                                       X_train,
                                                       params=params)

    train_rmse = bench.rmse_score(y_train, y_pred)
    train_r2 = bench.r2_score(y_train, y_pred)
    y_pred = regr.predict(X_test)
    test_rmse = bench.rmse_score(y_test, y_pred)
    test_r2 = bench.r2_score(y_test, y_pred)

    bench.print_output(
        library='sklearn',
        algorithm='elastic-net',
        stages=['training', 'prediction'],
        params=params,
        functions=['ElasticNet.fit', 'ElasticNet.predict'],
        times=[fit_time, predict_time],
        metric_type=['rmse', 'r2_score'],
        metrics=[[train_rmse, test_rmse], [train_r2, test_r2]],
        data=[X_train, X_train],
        alg_instance=regr,
    )
Example #4
def main():
    from sklearn.ensemble import RandomForestRegressor

    # Load and convert data
    X_train, X_test, y_train, y_test = bench.load_data(params)

    # Create our random forest regressor
    regr = RandomForestRegressor(criterion=params.criterion,
                                 n_estimators=params.num_trees,
                                 max_depth=params.max_depth,
                                 max_features=params.max_features,
                                 min_samples_split=params.min_samples_split,
                                 max_leaf_nodes=params.max_leaf_nodes,
                                 min_impurity_decrease=params.min_impurity_decrease,
                                 bootstrap=params.bootstrap,
                                 random_state=params.seed,
                                 n_jobs=params.n_jobs)

    fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params)

    y_pred = regr.predict(X_train)
    train_rmse = bench.rmse_score(y_train, y_pred)
    train_r2 = bench.r2_score(y_train, y_pred)

    predict_time, y_pred = bench.measure_function_time(
        regr.predict, X_test, params=params)
    test_rmse = bench.rmse_score(y_test, y_pred)
    test_r2 = bench.r2_score(y_test, y_pred)

    bench.print_output(
        library='sklearn',
        algorithm='df_regr',
        stages=['training', 'prediction'],
        params=params,
        functions=['df_regr.fit', 'df_regr.predict'],
        times=[fit_time, predict_time],
        metric_type=['rmse', 'r2_score'],
        metrics=[[train_rmse, test_rmse], [train_r2, test_r2]],
        data=[X_train, X_test],
        alg_instance=regr,
    )
Example #5
def main():
    from sklearn.linear_model import LinearRegression

    # Load data
    X_train, X_test, y_train, y_test = bench.load_data(
        params, generated_data=['X_train', 'y_train'])

    # Create our regression object
    regr = LinearRegression(fit_intercept=params.fit_intercept,
                            n_jobs=params.n_jobs,
                            copy_X=False)

    # Time fit
    fit_time, _ = bench.measure_function_time(regr.fit,
                                              X_train,
                                              y_train,
                                              params=params)

    # Time predict
    predict_time, yp = bench.measure_function_time(regr.predict,
                                                   X_test,
                                                   params=params)

    test_rmse = bench.rmse_score(y_test, yp)
    test_r2 = bench.r2_score(y_test, yp)
    yp = regr.predict(X_train)
    train_rmse = bench.rmse_score(y_train, yp)
    train_r2 = bench.r2_score(y_train, yp)

    bench.print_output(
        library='sklearn',
        algorithm='lin_reg',
        stages=['training', 'prediction'],
        params=params,
        functions=['Linear.fit', 'Linear.predict'],
        times=[fit_time, predict_time],
        metric_type=['rmse', 'r2_score'],
        metrics=[[train_rmse, test_rmse], [train_r2, test_r2]],
        data=[X_train, X_test],
        alg_instance=regr,
    )
Example #6
def main():
    from sklearn.linear_model import Ridge

    # Load data
    X_train, X_test, y_train, y_test = bench.load_data(
        params, generated_data=['X_train', 'y_train'])

    # Create our regression object
    regr = Ridge(fit_intercept=params.fit_intercept,
                 alpha=params.alpha,
                 solver=params.solver)

    # Time fit
    fit_time, _ = bench.measure_function_time(regr.fit,
                                              X_train,
                                              y_train,
                                              params=params)

    # Time predict
    predict_time, yp = bench.measure_function_time(regr.predict,
                                                   X_test,
                                                   params=params)

    test_rmse = bench.rmse_score(y_test, yp)
    yp = regr.predict(X_train)
    train_rmse = bench.rmse_score(y_train, yp)

    bench.print_output(library='sklearn',
                       algorithm='ridge_regression',
                       stages=['training', 'prediction'],
                       params=params,
                       functions=['Ridge.fit', 'Ridge.predict'],
                       times=[fit_time, predict_time],
                       accuracy_type='rmse',
                       accuracies=[train_rmse, test_rmse],
                       data=[X_train, X_test],
                       alg_instance=regr)
Example #7
def main():
    from sklearn.model_selection import train_test_split

    # Load generated data
    X, y, _, _ = bench.load_data(params)

    data_args: Iterable
    if params.include_y:
        data_args = (X, y)
    else:
        data_args = (X, )

    tts_params = {
        'train_size': params.train_size,
        'test_size': params.test_size,
        'shuffle': not params.do_not_shuffle,
        'random_state': params.seed
    }

    if params.rng is not None:
        tts_params['rng'] = params.rng

    time, _ = bench.measure_function_time(train_test_split,
                                          *data_args,
                                          params=params,
                                          **tts_params)

    bench.print_output(library='sklearn',
                       algorithm='train_test_split',
                       stages=['training'],
                       params=params,
                       functions=['train_test_split'],
                       times=[time],
                       accuracies=[None],
                       accuracy_type=None,
                       data=[X],
                       alg_params=tts_params)
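
Note that `rng` is not a keyword accepted by stock scikit-learn's `train_test_split`; as the argument parser in Example #17 spells out, it selects the shuffling generator "only for IDP scikit-learn", which is why the snippet only adds it to `tts_params` when explicitly requested.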
Example #8
def main():
    from sklearn.metrics.pairwise import pairwise_distances

    # Load data
    X, _, _, _ = bench.load_data(params,
                                 generated_data=['X_train'],
                                 add_dtype=True)

    time, _ = bench.measure_function_time(pairwise_distances,
                                          X,
                                          metric=params.metric,
                                          n_jobs=params.n_jobs,
                                          params=params)

    bench.print_output(library='sklearn',
                       algorithm='distances',
                       stages=['computation'],
                       params=params,
                       functions=[params.metric.capitalize()],
                       times=[time],
                       metric_type=None,
                       metrics=[None],
                       data=[X],
                       alg_params={'metric': params.metric})
Example #9
def main():
    from sklearn.cluster import KMeans
    from sklearn.metrics.cluster import davies_bouldin_score

    # Load and convert generated data
    X_train, X_test, _, _ = bench.load_data(params)

    X_init: Any
    if params.filei == 'k-means++':
        X_init = 'k-means++'
    # Load initial centroids from specified path
    elif params.filei is not None:
        X_init = np.load(params.filei)
        if isinstance(X_init, np.ndarray):  # plain .npy array of centroids
            X_init = X_init.astype(params.dtype)
            params.n_clusters = X_init.shape[0]
        else:  # .npz archive: cast each stored array to the requested dtype
            X_init = {k: v.astype(params.dtype) for k, v in X_init.items()}
    # or choose random centroids from training data
    else:
        np.random.seed(params.seed)
        centroids_idx = np.random.randint(low=0, high=X_train.shape[0],
                                          size=params.n_clusters)
        if hasattr(X_train, "iloc"):
            X_init = X_train.iloc[centroids_idx].values
        else:
            X_init = X_train[centroids_idx]

    def fit_kmeans(X, X_init):
        alg = KMeans(n_clusters=params.n_clusters, tol=params.tol,
                     max_iter=params.maxiter, init=X_init, n_init=params.n_init,
                     algorithm=params.algorithm, random_state=params.random_state)
        alg.fit(X)
        return alg

    # Time fit
    fit_time, kmeans = bench.measure_function_time(fit_kmeans, X_train,
                                                   X_init, params=params)

    train_predict = kmeans.predict(X_train)
    acc_train = davies_bouldin_score(X_train, train_predict)

    # Time predict
    predict_time, test_predict = bench.measure_function_time(
        kmeans.predict, X_test, params=params)

    acc_test = davies_bouldin_score(X_test, test_predict)

    bench.print_output(
        library='sklearn',
        algorithm='kmeans',
        stages=['training', 'prediction'],
        params=params,
        functions=['KMeans.fit', 'KMeans.predict'],
        times=[fit_time, predict_time],
        metric_type=['davies_bouldin_score', 'inertia', 'iter'],
        metrics=[
            [acc_train, acc_test],
            [kmeans.inertia_, kmeans.inertia_],
            [kmeans.n_iter_, kmeans.n_iter_]
        ],
        data=[X_train, X_test],
        alg_instance=kmeans,
    )
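
One caveat on the names here: `davies_bouldin_score` is an internal clustering-quality index where lower values indicate better-separated clusters, so `acc_train` and `acc_test` are not accuracies in the usual higher-is-better sense.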
Example #10
    parser.add_argument('--max-depth', type=int, default=0,
                        help='Upper bound on depth of constructed trees')
    parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2,
                        help='Minimum samples number for node splitting')
    parser.add_argument('--max-leaf-nodes', type=int, default=None,
                        help='Maximum leaf nodes per tree')
    parser.add_argument('--min-impurity-decrease', type=float, default=0.,
                        help='Needed impurity decrease for node splitting')
    parser.add_argument('--no-bootstrap', dest='bootstrap', default=True,
                        action='store_false',
                        help="Don't control bootstraping")

    params = bench.parse_args(parser, prefix='daal4py')

    # Load data
    X_train, X_test, y_train, y_test = bench.load_data(
        params, add_dtype=True, label_2d=True)

    params.n_classes = len(np.unique(y_train))
    if isinstance(params.max_features, float):
        params.max_features = int(X_train.shape[1] * params.max_features)

    # Time fit and predict
    fit_time, res = bench.measure_function_time(
        df_clsf_fit, X_train, y_train,
        params.n_classes,
        n_trees=params.num_trees,
        n_features_per_node=params.max_features,
        max_depth=params.max_depth,
        min_impurity=params.min_impurity_decrease,
        bootstrap=params.bootstrap,
        seed=params.seed,
Example #11
                    type=float,
                    default=0.,
                    help='Absolute threshold')
parser.add_argument('--maxiter',
                    type=int,
                    default=100,
                    help='Maximum number of iterations')
parser.add_argument('--samples-per-batch',
                    type=int,
                    default=32768,
                    help='Number of samples per batch')
parser.add_argument('--n-clusters', type=int, help='Number of clusters')
params = bench.parse_args(parser, prefix='cuml', loop_types=('fit', 'predict'))

# Load and convert generated data
X_train, X_test, _, _ = bench.load_data(params)

X_init: Any
if params.filei == 'k-means++':
    X_init = 'k-means++'
# Load initial centroids from specified path
elif params.filei is not None:
    X_init = np.load(params.filei)
    if isinstance(X_init, np.ndarray):  # plain .npy array of centroids
        X_init = X_init.astype(params.dtype)
        params.n_clusters = X_init.shape[0]
    else:  # .npz archive: cast each stored array to the requested dtype
        X_init = {k: v.astype(params.dtype) for k, v in X_init.items()}
# or choose random centroids from training data
else:
    np.random.seed(params.seed)
Example #12
def main():
    from sklearn.svm import SVC

    X_train, X_test, y_train, y_test = bench.load_data(params)

    if params.gamma is None:
        params.gamma = 1.0 / X_train.shape[1]

    cache_size_bytes = bench.get_optimal_cache_size(
        X_train.shape[0], max_cache=params.max_cache_size)
    params.cache_size_mb = cache_size_bytes / 1024**2
    params.n_classes = len(np.unique(y_train))

    clf = SVC(C=params.C,
              kernel=params.kernel,
              cache_size=params.cache_size_mb,
              tol=params.tol,
              gamma=params.gamma,
              probability=params.probability,
              random_state=43)

    fit_time, _ = bench.measure_function_time(clf.fit,
                                              X_train,
                                              y_train,
                                              params=params)
    params.sv_len = clf.support_.shape[0]

    if params.probability:
        state_predict = 'predict_proba'
        accuracy_type = 'log_loss'

        def metric_call(x, y):
            return bench.log_loss(x, y)

        clf_predict = clf.predict_proba
    else:
        state_predict = 'predict'
        accuracy_type = 'accuracy[%]'

        def metric_call(x, y):
            return bench.accuracy_score(x, y)

        clf_predict = clf.predict

    predict_train_time, y_pred = bench.measure_function_time(clf_predict,
                                                             X_train,
                                                             params=params)
    train_acc = metric_call(y_train, y_pred)

    predict_test_time, y_pred = bench.measure_function_time(clf_predict,
                                                            X_test,
                                                            params=params)
    test_acc = metric_call(y_test, y_pred)

    bench.print_output(library='sklearn',
                       algorithm='svc',
                       stages=['training', state_predict],
                       params=params,
                       functions=['SVM.fit', f'SVM.{state_predict}'],
                       times=[fit_time, predict_train_time],
                       accuracy_type=accuracy_type,
                       accuracies=[train_acc, test_acc],
                       data=[X_train, X_train],
                       alg_instance=clf)
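
The branching above is needed because `log_loss` consumes class probabilities while plain accuracy consumes hard labels, so the predictor (`predict_proba` vs. `predict`) and the metric must be switched together. Also note that only the training-set prediction time reaches `times`, which is consistent with `data=[X_train, X_train]`; the test-set pass is timed but reported only through its accuracy.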
Example #13
                    dest='alpha',
                    type=float,
                    default=1.0,
                    help='Regularization parameter')
parser.add_argument('--maxiter',
                    type=int,
                    default=1000,
                    help='Maximum iterations for the iterative solver')
parser.add_argument('--tol',
                    type=float,
                    default=0.0,
                    help='Tolerance for solver.')
params = parse_args(parser)

# Load data
X_train, X_test, y_train, y_test = load_data(params)

# Create our regression object
regr = Lasso(fit_intercept=params.fit_intercept,
             alpha=params.alpha,
             tol=params.tol,
             max_iter=params.maxiter,
             copy_X=False)

columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
           'time')

# Time fit
fit_time, _ = measure_function_time(regr.fit, X_train, y_train, params=params)

# Time predict
Example #14
                    type=int,
                    default=None,
                    help='Number of components to find')
parser.add_argument('--whiten',
                    action='store_true',
                    default=False,
                    help='Perform whitening')
parser.add_argument('--write-results',
                    action='store_true',
                    default=False,
                    help='Write results to disk for verification')
params = parse_args(parser, size=(10000, 1000))

# Load data
X_train, X_test, _, _ = load_data(params,
                                  generated_data=['X_train'],
                                  add_dtype=True)

if params.n_components is None:
    p, n = X_train.shape
    params.n_components = min((n, (2 + min((n, p))) // 3))


# Define how to do our scikit-learn PCA using DAAL...
def pca_fit_daal(X, n_components, method):

    if n_components < 1:
        n_components = min(X.shape)

    fptype = getFPType(X)
Example #15
                    help='Upper bound on features used at each split')
parser.add_argument('--max-depth', type=int, default=None,
                    help='Upper bound on depth of constructed trees')
parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2,
                    help='Minimum samples number for node splitting')
parser.add_argument('--max-leaf-nodes', type=int, default=-1,
                    help='Maximum leaf nodes per tree')
parser.add_argument('--min-impurity-decrease', type=float, default=0.,
                    help='Needed impurity decrease for node splitting')
parser.add_argument('--no-bootstrap', dest='bootstrap', default=True,
                    action='store_false', help="Disable bootstrapping")

params = bench.parse_args(parser)

# Load and convert data
X_train, X_test, y_train, y_test = bench.load_data(params, int_label=True)

if params.criterion == 'gini':
    params.criterion = 0
else:
    params.criterion = 1

if params.split_algorithm == 'hist':
    params.split_algorithm = 0
else:
    params.split_algorithm = 1

params.n_classes = y_train[y_train.columns[0]].nunique()
clf: Any

Example #16
                    default=True,
                    action='store_false',
                    help="Don't fit intercept (assume data already centered)")
parser.add_argument('--solver',
                    default='auto',
                    help='Solver used for training')
parser.add_argument('--alpha',
                    type=float,
                    default=1.0,
                    help='Regularization strength')
params = bench.parse_args(parser)

from sklearn.linear_model import Ridge

# Load data
X_train, X_test, y_train, y_test = bench.load_data(
    params, generated_data=['X_train', 'y_train'])

# Create our regression object
regr = Ridge(fit_intercept=params.fit_intercept,
             alpha=params.alpha,
             solver=params.solver)

# Time fit
fit_time, _ = bench.measure_function_time(regr.fit,
                                          X_train,
                                          y_train,
                                          params=params)

# Time predict
predict_time, yp = bench.measure_function_time(regr.predict,
                                               X_test,
Example #17
                    help='Do not perform data shuffle before splitting')
parser.add_argument('--include-y',
                    default=False,
                    action='store_true',
                    help='Include label (Y) in splitting')
parser.add_argument('--rng',
                    default=None,
                    choices=('MT19937', 'SFMT19937', 'MT2203', 'R250', 'WH',
                             'MCG31', 'MCG59', 'MRG32K3A', 'PHILOX4X32X10',
                             'NONDETERM', None),
                    help='Random numbers generator for shuffling '
                    '(only for IDP scikit-learn)')
params = parse_args(parser)

# Load generated data
X, y, _, _ = load_data(params)

if params.include_y:
    data_args = (X, y)
else:
    data_args = (X, )

tts_params = {
    'train_size': params.train_size,
    'test_size': params.test_size,
    'shuffle': not params.do_not_shuffle,
    'random_state': params.seed
}

if params.rng is not None:
    tts_params['rng'] = params.rng
Example #18
def main():
    parser = argparse.ArgumentParser(description='daal4py SVC benchmark with '
                                     'linear kernel')
    parser.add_argument('-C',
                        dest='C',
                        type=float,
                        default=1.0,
                        help='SVM regularization parameter')
    parser.add_argument('--kernel',
                        choices=('linear', 'rbf'),
                        default='linear',
                        help='SVM kernel function')
    parser.add_argument('--gamma',
                        type=float,
                        default=None,
                        help='Parameter for kernel="rbf"')
    parser.add_argument('--maxiter',
                        type=int,
                        default=100000,
                        help='Maximum iterations for the iterative solver')
    parser.add_argument('--max-cache-size',
                        type=int,
                        default=8,
                        help='Maximum cache size, in gigabytes, for SVM.')
    parser.add_argument('--tau',
                        type=float,
                        default=1e-12,
                        help='Tau parameter for working set selection scheme')
    parser.add_argument('--tol', type=float, default=1e-3, help='Tolerance')
    parser.add_argument('--no-shrinking',
                        action='store_false',
                        default=True,
                        dest='shrinking',
                        help="Don't use shrinking heuristic")
    params = parse_args(parser, prefix='daal4py')

    # Load data
    X_train, X_test, y_train, y_test = load_data(params,
                                                 add_dtype=True,
                                                 label_2d=True)

    if params.gamma is None:
        params.gamma = 1 / X_train.shape[1]

    cache_size_bytes = get_optimal_cache_size(X_train.shape[0],
                                              max_cache=params.max_cache_size)
    params.cache_size_mb = cache_size_bytes / 2**20
    params.cache_size_bytes = cache_size_bytes
    params.n_classes = np.unique(y_train).size

    columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype',
               'size', 'kernel', 'cache_size_mb', 'C', 'sv_len', 'n_classes',
               'accuracy', 'time')

    # Time fit and predict
    fit_time, res = measure_function_time(test_fit,
                                          X_train,
                                          y_train,
                                          params,
                                          params=params)
    res, support, indices, n_support = res
    params.sv_len = support.shape[0]

    yp = test_predict(X_train, res, params)
    train_acc = 100 * accuracy_score(y_train, yp)

    predict_time, yp = measure_function_time(test_predict,
                                             X_test,
                                             res,
                                             params,
                                             params=params)

    test_acc = 100 * accuracy_score(y_test, yp)

    print_output(library='daal4py',
                 algorithm='svc',
                 stages=['training', 'prediction'],
                 columns=columns,
                 params=params,
                 functions=['SVM.fit', 'SVM.predict'],
                 times=[fit_time, predict_time],
                 accuracy_type='accuracy[%]',
                 accuracies=[train_acc, test_acc],
                 data=[X_train, X_test])
Example #19
def main():
    from sklearn.svm import SVC

    X_train, X_test, y_train, y_test = bench.load_data(params)
    y_train = np.asfortranarray(y_train).ravel()

    if params.gamma is None:
        params.gamma = 1.0 / X_train.shape[1]

    cache_size_bytes = bench.get_optimal_cache_size(
        X_train.shape[0], max_cache=params.max_cache_size)
    params.cache_size_mb = cache_size_bytes / 1024**2
    params.n_classes = len(np.unique(y_train))

    clf = SVC(C=params.C,
              kernel=params.kernel,
              cache_size=params.cache_size_mb,
              tol=params.tol,
              gamma=params.gamma,
              probability=params.probability,
              random_state=43,
              degree=params.degree)

    fit_time, _ = bench.measure_function_time(clf.fit,
                                              X_train,
                                              y_train,
                                              params=params)
    params.sv_len = clf.support_.shape[0]

    if params.probability:
        state_predict = 'predict_proba'
        clf_predict = clf.predict_proba
        train_acc = None
        test_acc = None

        predict_train_time, y_pred = bench.measure_function_time(clf_predict,
                                                                 X_train,
                                                                 params=params)
        train_log_loss = bench.log_loss(y_train, y_pred)
        train_roc_auc = bench.roc_auc_score(y_train, y_pred)

        _, y_pred = bench.measure_function_time(clf_predict,
                                                X_test,
                                                params=params)
        test_log_loss = bench.log_loss(y_test, y_pred)
        test_roc_auc = bench.roc_auc_score(y_test, y_pred)
    else:
        state_predict = 'prediction'
        clf_predict = clf.predict
        train_log_loss = None
        test_log_loss = None
        train_roc_auc = None
        test_roc_auc = None

        predict_train_time, y_pred = bench.measure_function_time(clf_predict,
                                                                 X_train,
                                                                 params=params)
        train_acc = bench.accuracy_score(y_train, y_pred)

        _, y_pred = bench.measure_function_time(clf_predict,
                                                X_test,
                                                params=params)
        test_acc = bench.accuracy_score(y_test, y_pred)

    bench.print_output(
        library='sklearn',
        algorithm='SVC',
        stages=['training', state_predict],
        params=params,
        functions=['SVM.fit', f'SVM.{state_predict}'],
        times=[fit_time, predict_train_time],
        metric_type=['accuracy', 'log_loss', 'roc_auc', 'n_sv'],
        metrics=[
            [train_acc, test_acc],
            [train_log_loss, test_log_loss],
            [train_roc_auc, test_roc_auc],
            [int(clf.n_support_.sum()),
             int(clf.n_support_.sum())],
        ],
        data=[X_train, X_train],
        alg_instance=clf,
    )
Example #20
                    type=str,
                    default='full',
                    choices=['auto', 'full', 'jacobi'],
                    help='SVD solver to use')
parser.add_argument('--n-components',
                    type=int,
                    default=None,
                    help='Number of components to find')
parser.add_argument('--whiten',
                    action='store_true',
                    default=False,
                    help='Perform whitening')
params = bench.parse_args(parser)

# Load random data
X_train, X_test, _, _ = bench.load_data(params, generated_data=['X_train'])

if params.n_components is None:
    p, n = X_train.shape
    params.n_components = min((n, (2 + min((n, p))) // 3))

# Create our PCA object
pca = PCA(svd_solver=params.svd_solver,
          whiten=params.whiten,
          n_components=params.n_components)

# Time fit
fit_time, _ = bench.measure_function_time(pca.fit, X_train, params=params)

# Time transform
transform_time, _ = bench.measure_function_time(pca.transform,
Example #21
def compute_distances(pairwise_distances, X):
    algorithm = pairwise_distances(fptype=getFPType(X))
    return algorithm.compute(X)


parser = argparse.ArgumentParser(description='daal4py pairwise distances '
                                 'benchmark')
parser.add_argument('--metric',
                    default='cosine',
                    choices=['cosine', 'correlation'],
                    help='Metric to test for pairwise distances')
params = bench.parse_args(parser)

# Load data
X, _, _, _ = bench.load_data(params,
                             generated_data=['X_train'],
                             add_dtype=True)

pairwise_distances = cosine_distance if params.metric == 'cosine' else correlation_distance

time, _ = bench.measure_function_time(compute_distances,
                                      pairwise_distances,
                                      X,
                                      params=params)

bench.print_output(library='daal4py',
                   algorithm='distances',
                   stages=['computation'],
                   params=params,
                   functions=[params.metric.capitalize()],
                   times=[time],
Example #22
parser.add_argument('-e',
                    '--eps',
                    '--epsilon',
                    type=float,
                    default=10.,
                    help='Radius of neighborhood of a point')
parser.add_argument('-m',
                    '--min-samples',
                    default=5,
                    type=int,
                    help='The minimum number of samples required in a '
                    'neighborhood to consider a point a core point')
params = bench.parse_args(parser, prefix='daal4py')

# Load generated data
X, _, _, _ = bench.load_data(params, add_dtype=True)


# Define functions to time
def test_dbscan(X):
    algorithm = dbscan(fptype=getFPType(X),
                       epsilon=params.eps,
                       minObservations=params.min_samples,
                       resultsToCompute='computeCoreIndices')
    return algorithm.compute(X)


# Time clustering
time, result = bench.measure_function_time(test_dbscan, X, params=params)
params.n_clusters = int(result.nClusters[0, 0])
Example #23
parser.add_argument('--min-impurity-decrease',
                    type=float,
                    default=0.,
                    help='Needed impurity decrease for node splitting')
parser.add_argument('--no-bootstrap',
                    dest='bootstrap',
                    default=True,
                    action='store_false',
                    help="Don't control bootstraping")

params = bench.parse_args(parser)

from sklearn.ensemble import RandomForestClassifier

# Load and convert data
X_train, X_test, y_train, y_test = bench.load_data(params)

# Create our random forest classifier
clf = RandomForestClassifier(
    criterion=params.criterion,
    n_estimators=params.num_trees,
    max_depth=params.max_depth,
    max_features=params.max_features,
    min_samples_split=params.min_samples_split,
    max_leaf_nodes=params.max_leaf_nodes,
    min_impurity_decrease=params.min_impurity_decrease,
    bootstrap=params.bootstrap,
    random_state=params.seed,
    n_jobs=params.n_jobs)

params.n_classes = len(np.unique(y_train))
Example #24
                    type=str,
                    help='Initial clusters')
parser.add_argument('-t',
                    '--tol',
                    default=0.,
                    type=float,
                    help='Absolute threshold')
parser.add_argument('--maxiter',
                    type=int,
                    default=100,
                    help='Maximum number of iterations')
parser.add_argument('--n-clusters', type=int, help='Number of clusters')
params = bench.parse_args(parser, prefix='daal4py')

# Load generated data
X_train, X_test, _, _ = bench.load_data(params, add_dtype=True)

# Load initial centroids from specified path
if params.filei is not None:
    X_init = np.load(params.filei).astype(params.dtype)
    params.n_clusters = X_init.shape[0]
# or choose random centroids from training data
else:
    np.random.seed(params.seed)
    centroids_idx = np.random.randint(0,
                                      X_train.shape[0],
                                      size=params.n_clusters)
    if hasattr(X_train, "iloc"):
        X_init = X_train.iloc[centroids_idx].values
    else:
        X_init = X_train[centroids_idx]
Example #25
                                 'benchmark')
parser.add_argument('--no-fit-intercept',
                    dest='fit_intercept',
                    default=True,
                    action='store_false',
                    help="Don't fit intercept (assume data already centered)")
parser.add_argument('--alpha',
                    type=float,
                    default=1.0,
                    help='Regularization strength')
params = bench.parse_args(parser, prefix='daal4py')

# Generate random data
X_train, X_test, y_train, y_test = bench.load_data(
    params,
    generated_data=['X_train', 'y_train'],
    add_dtype=True,
    label_2d=params.file_X_train is not None)


# Create our regression objects
def test_fit(X, y):
    regr_train = ridge_regression_training(fptype=getFPType(X),
                                           ridgeParameters=np.array(
                                               [[params.alpha]]),
                                           interceptFlag=params.fit_intercept)
    return regr_train.compute(X, y)


def test_predict(Xp, model):
    regr_predict = ridge_regression_prediction(fptype=getFPType(Xp))
Example #26
# SPDX-License-Identifier: MIT

import argparse
from bench import measure_function_time, parse_args, load_data, print_output
from sklearn.cluster import DBSCAN

parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark')
parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10.,
                    help='Radius of neighborhood of a point')
parser.add_argument('-m', '--min-samples', default=5, type=int,
                    help='The minimum number of samples required in a '
                    'neighborhood to consider a point a core point')
params = parse_args(parser, n_jobs_supported=True)

# Load generated data
X, _, _, _ = load_data(params, add_dtype=True)

# Create our clustering object
dbscan = DBSCAN(eps=params.eps, n_jobs=params.n_jobs,
                min_samples=params.min_samples, metric='euclidean',
                algorithm='auto')

# N.B. algorithm='auto' will select DAAL's brute force method when running
# daal4py-patched scikit-learn, and probably 'kd_tree' when running unpatched
# scikit-learn.

columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
           'n_clusters', 'time')

# Time fit
time, _ = measure_function_time(dbscan.fit, X, params=params)
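
The N.B. comment above assumes the script may run against either a daal4py-patched or an unpatched scikit-learn. As a hedged sketch (the exact entry point varies by daal4py version, and newer releases moved it to the separate `sklearnex` package), the patched run was typically enabled before importing any estimators:

# Version-dependent sketch: enable daal4py's scikit-learn patches so that
# estimators such as DBSCAN dispatch to oneDAL where supported.
from daal4py.sklearn import patch_sklearn
patch_sklearn()

from sklearn.cluster import DBSCAN  # resolved to the patched implementation

daal4py also documented a command-line form along the lines of `python -m daal4py benchmark_script.py`, which applies the patches without touching the script.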
Example #27
                    type=float,
                    default=0.,
                    help='Absolute threshold')
parser.add_argument('--maxiter',
                    type=int,
                    default=100,
                    help='Maximum number of iterations')
parser.add_argument('--samples-per-batch',
                    type=int,
                    default=32768,
                    help='Number of samples per batch')
parser.add_argument('--n-clusters', type=int, help='Number of clusters')
params = parse_args(parser, prefix='cuml', loop_types=('fit', 'predict'))

# Load and convert generated data
X_train, X_test, _, _ = load_data(params)

if params.filei == 'k-means++':
    X_init = 'k-means++'
# Load initial centroids from specified path
elif params.filei is not None:
    X_init = np.load(params.filei).astype(params.dtype)
    params.n_clusters = X_init.shape[0]
# or choose random centroids from training data
else:
    np.random.seed(params.seed)
    centroids_idx = np.random.randint(0,
                                      X_train.shape[0],
                                      size=params.n_clusters)
    if hasattr(X_train, "iloc"):
        X_init = X_train.iloc[centroids_idx].to_pandas().values
Example #28
parser.add_argument('-e',
                    '--eps',
                    '--epsilon',
                    type=float,
                    default=10.,
                    help='Radius of neighborhood of a point')
parser.add_argument('-m',
                    '--min-samples',
                    default=5,
                    type=int,
                    help='The minimum number of samples required in a '
                    'neighborhood to consider a point a core point')
params = bench.parse_args(parser)

# Load generated data
X, _, _, _ = bench.load_data(params)

# Create our clustering object
dbscan = DBSCAN(eps=params.eps, min_samples=params.min_samples)

# Time fit
time, _ = bench.measure_function_time(dbscan.fit, X, params=params)
labels = dbscan.labels_

X_host = bench.convert_to_numpy(X)
labels_host = bench.convert_to_numpy(labels)

acc = davies_bouldin_score(X_host, labels_host)
params.n_clusters = len(set(labels_host)) - (1 if -1 in labels_host else 0)

bench.print_output(library='cuml',