Beispiel #1
0
# List of (model parameters, test case) pairs used to drive the tests.
# The parameter tuple has eight entries, laid out as (p, d, q, P, D, Q, s, k).
# Some entries are wrapped in stress_param(...) instead of being plain tuples
# (presumably so they only run in stress mode -- confirm against the helper's
# definition, which is not visible here).
test_data = [
    # ((1, 0, 1, 0, 0, 0, 0, 1), test_101c),
    ((0, 0, 2, 0, 0, 0, 0, 1), test_002c),
    ((0, 1, 0, 0, 0, 0, 0, 1), test_010c),
    ((1, 1, 0, 0, 0, 0, 0, 0), test_110),
    ((0, 1, 1, 0, 0, 0, 0, 1), test_011c),
    ((0, 1, 1, 0, 0, 0, 0, 1), test_011c_exog),
    ((1, 2, 1, 0, 0, 0, 0, 1), test_121c),
    ((1, 1, 1, 0, 0, 0, 0, 1), test_111c_missing),
    ((1, 0, 1, 1, 1, 1, 4, 0), test_101_111_4),
    ((5, 1, 0, 0, 0, 0, 0, 0), test_510),
    ((1, 1, 1, 2, 0, 0, 4, 1), test_111_200_4c),
    ((1, 1, 1, 2, 0, 0, 4, 1), test_111_200_4c_missing),
    ((1, 1, 1, 2, 0, 0, 4, 1), test_111_200_4c_missing_exog),
    ((1, 1, 2, 0, 1, 2, 4, 0), test_112_012_4),
    stress_param((1, 1, 1, 1, 1, 1, 12, 0), test_111_111_12),
    stress_param((1, 1, 1, 1, 1, 1, 12, 0), test_111_111_12_missing),
    stress_param((1, 0, 1, 1, 1, 1, 12, 1), test_111_111_12c_missing_exog),
]

# Dictionary for lazy-loading of datasets
# (name, dtype) -> (pandas dataframe, cuDF dataframe)
lazy_data = {}

# Dictionary for lazy-evaluation of reference fits
# (p, d, q, P, D, Q, s, k, name, dtype) -> SARIMAXResults
lazy_ref_fit = {}


def extract_order(tup):
    """Extract the order from a tuple of parameters"""
Beispiel #2
0
from sklearn.linear_model import SGDClassifier
from sklearn.datasets.samples_generator import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


@pytest.mark.parametrize('lrate', ['constant', 'invscaling', 'adaptive'])
@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('input_type', ['ndarray'])
@pytest.mark.parametrize('penalty', ['none', 'l1', 'l2', 'elasticnet'])
@pytest.mark.parametrize('loss', ['hinge', 'log', 'squared_loss'])
@pytest.mark.parametrize(
    'nrows', [unit_param(500),
              quality_param(5000),
              stress_param(500000)])
@pytest.mark.parametrize('column_info', [
    unit_param([20, 10]),
    quality_param([100, 50]),
    stress_param([1000, 500])
])
def test_mbsgd_classifier(datatype, lrate, input_type, penalty, loss, nrows,
                          column_info):
    ncols, n_info = column_info
    X, y = make_classification(n_samples=nrows,
                               n_informative=n_info,
                               n_features=ncols,
                               random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X,
Beispiel #3
0
from sklearn.ensemble import RandomForestClassifier as skrfc
from sklearn.ensemble import RandomForestRegressor as skrfr
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.datasets import fetch_california_housing, \
    make_classification, make_regression
from sklearn.model_selection import train_test_split


@pytest.fixture(
    scope="session",
    params=[
        unit_param(
            {'n_samples': 350, 'n_features': 20, 'n_informative': 10}),
        quality_param(
            {'n_samples': 5000, 'n_features': 200, 'n_informative': 80}),
        stress_param(
            {'n_samples': 500000, 'n_features': 400, 'n_informative': 180}),
    ])
def small_clf(request):
    """Session-scoped fixture yielding a two-class classification dataset.

    The dataset dimensions come from the active fixture parameter, a dict
    with the keys 'n_samples', 'n_features' and 'n_informative'. The result
    is the (X, y) pair produced by make_classification with a fixed
    random_state of 123.
    """
    cfg = request.param
    features, labels = make_classification(
        n_samples=cfg['n_samples'],
        n_features=cfg['n_features'],
        n_clusters_per_class=1,
        n_informative=cfg['n_informative'],
        random_state=123,
        n_classes=2)
    return features, labels


@pytest.fixture(
    scope="session",
    params=[
        unit_param({'n_samples': 350, 'n_features': 30, 'n_informative': 15}),
        quality_param({'n_samples': 5000, 'n_features': 200,
Beispiel #4
0

@pytest.mark.mg
@pytest.mark.parametrize('chunks', ['auto', 2, 1])
def test_confusion_matrix_binary(cluster, chunks):
    """Check confusion_matrix on a fixed four-sample binary problem.

    The labels are wrapped in dask arrays using the parametrized ``chunks``
    setting, and the flattened result (tn, fp, fn, tp) is compared against
    the hand-computed reference [0, 2, 1, 1].
    """
    client = Client(cluster)
    try:
        y_true = da.from_array(cp.array([0, 1, 0, 1]), chunks=chunks)
        y_pred = da.from_array(cp.array([1, 1, 1, 0]), chunks=chunks)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        ref = cp.array([0, 2, 1, 1])
        cp.testing.assert_array_equal(ref, cp.array([tn, fp, fn, tp]))
    finally:
        # Close the client even when the computation or the assertion
        # fails, so a failing test does not leave a client connected.
        client.close()


@pytest.mark.mg
@pytest.mark.parametrize('n_samples', [50, 3000, stress_param(500000)])
@pytest.mark.parametrize('dtype', [np.int32, np.int64])
@pytest.mark.parametrize('problem_type', ['binary', 'multiclass'])
def test_confusion_matrix_random(n_samples, dtype, problem_type, cluster):
    """Compare confusion_matrix with sk_confusion_matrix on random labels.

    Labels are drawn from [0, 2) for the 'binary' case and from [0, 1000)
    otherwise, cast to ``dtype``. The dask-backed result must match the
    reference to 4 decimal places.
    """
    client = Client(cluster)
    try:
        upper_range = 2 if problem_type == 'binary' else 1000

        y_true, y_pred, np_y_true, np_y_pred = generate_random_labels(
            lambda rng: rng.randint(0, upper_range, n_samples).astype(dtype),
            as_cupy=True)
        y_true, y_pred = da.from_array(y_true), da.from_array(y_pred)

        cm = confusion_matrix(y_true, y_pred)
        ref = sk_confusion_matrix(np_y_true, np_y_pred)
        cp.testing.assert_array_almost_equal(ref, cm, decimal=4)
    finally:
        # Close the client even when the computation or the assertion
        # fails, so a failing test does not leave a client connected.
        client.close()
Beispiel #5
0
from cuml.test.utils import quality_param
from cuml.test.utils import stress_param

import dask.array as da

from cuml.metrics import adjusted_rand_score
from sklearn.metrics import adjusted_rand_score as sk_adjusted_rand_score

from cuml.dask.common.dask_arr_utils import to_dask_cudf


@pytest.mark.mg
@pytest.mark.parametrize(
    "nrows",
    [unit_param(1e3), quality_param(1e5),
     stress_param(5e6)])
@pytest.mark.parametrize("ncols", [10, 30])
@pytest.mark.parametrize(
    "nclusters",
    [unit_param(5), quality_param(10),
     stress_param(50)])
@pytest.mark.parametrize(
    "n_parts",
    [unit_param(None), quality_param(7),
     stress_param(50)])
@pytest.mark.parametrize("delayed_predict", [True, False])
@pytest.mark.parametrize("input_type", ["dataframe", "array"])
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    input_type, client):

    from cuml.dask.cluster import KMeans as cumlKMeans
Beispiel #6
0
@pytest.fixture(scope="session",
                params=[
                    unit_param({'n_samples': 350, 'n_features': 20,
                                'n_informative': 10}),
                    quality_param({'n_samples': 5000, 'n_features': 200,
                                   'n_informative': 80}),
                    stress_param({'n_samples': 500000, 'n_features': 400,
                                  'n_informative': 180}),
                ])
def small_clf(request):
    """Build a two-class dataset sized by the current fixture parameter.

    ``request.param`` is a dict providing 'n_samples', 'n_features' and
    'n_informative'; these are forwarded to make_classification together
    with one cluster per class and a fixed random_state of 123. Returns
    the resulting (X, y) pair.
    """
    size_keys = ('n_samples', 'n_features', 'n_informative')
    sizes = {key: request.param[key] for key in size_keys}
    X, y = make_classification(n_clusters_per_class=1,
                               random_state=123,
                               n_classes=2,
                               **sizes)
    return X, y


@pytest.fixture(scope="session",
                params=[
Beispiel #7
0
    cuSVC.fit(X_train, y_train)

    sklSVC = svm.SVC(**params)
    sklSVC.fit(X_train, y_train)

    compare_svm(cuSVC, sklSVC, X_train, y_train, cmp_decision_func=True)


@pytest.mark.parametrize('params', [
    {'kernel': 'linear', 'C': 1},
    {'kernel': 'rbf', 'C': 1, 'gamma': 1},
    {'kernel': 'poly', 'C': 1, 'gamma': 1},
])
@pytest.mark.parametrize('dataset', ['classification2', 'gaussian', 'blobs'])
@pytest.mark.parametrize('n_rows', [3, unit_param(100), quality_param(1000),
                                    stress_param(5000)])
@pytest.mark.parametrize('n_cols', [2, unit_param(100), quality_param(1000),
                         stress_param(1000)])
def test_svm_skl_cmp_datasets(params, dataset, n_rows, n_cols):
    if (params['kernel'] == 'linear' and
            dataset in ['gaussian', 'classification2'] and
            n_rows > 1000 and n_cols >= 1000):
        # linear kernel will not fit the gaussian dataset, but takes very long
        return
    X_train, X_test, y_train, y_test = make_dataset(dataset, n_rows, n_cols)

    # Default to numpy for testing
    with cuml.using_output_type("numpy"):

        cuSVC = cu_svm.SVC(**params)
        cuSVC.fit(X_train, y_train)
Beispiel #8
0
    X = StandardScaler().fit_transform(X)

    cu_y_pred = cuml_kmeans.fit_predict(X)
    cu_score = adjusted_rand_score(cu_y_pred, y)
    kmeans = cluster.KMeans(random_state=12, n_clusters=params['n_clusters'])
    sk_y_pred = kmeans.fit_predict(X)
    sk_score = adjusted_rand_score(sk_y_pred, y)

    assert sk_score - 1e-2 <= cu_score <= sk_score + 1e-2


@pytest.mark.parametrize('name', dataset_names)
@pytest.mark.parametrize(
    'nrows', [unit_param(500),
              quality_param(5000),
              stress_param(500000)])
def test_kmeans_sklearn_comparison_default(name, nrows):

    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, nrows)

    params = default_base.copy()
    params.update(pat[1])
    X_train_df, = dask_utils.persist_across_workers(c, [X_train_df],
                                                    workers=list(workers))

    return X_train_df


def _scale_rows(client, nrows):
    workers = list(client.scheduler_info()['workers'].keys())
    n_workers = len(workers)
    return n_workers * nrows


@pytest.mark.parametrize(
    "nrows",
    [unit_param(300), quality_param(1e6),
     stress_param(5e8)])
@pytest.mark.parametrize("ncols", [10, 30])
@pytest.mark.parametrize(
    "nclusters",
    [unit_param(5), quality_param(10),
     stress_param(15)])
@pytest.mark.parametrize(
    "n_neighbors",
    [unit_param(10), quality_param(4),
     stress_param(100)])
@pytest.mark.parametrize(
    "n_parts",
    [unit_param(1),
     unit_param(5),
     quality_param(7),
     stress_param(50)])
    knn_cu.fit(X)

    ret = knn_cu.kneighbors(X, k, return_distance=False)
    assert not isinstance(ret, tuple)
    assert ret.shape == (n_samples, k)

    ret = knn_cu.kneighbors(X, k, return_distance=True)
    assert isinstance(ret, tuple)
    assert len(ret) == 2


@pytest.mark.parametrize('input_type', ['dataframe', 'ndarray'])
@pytest.mark.parametrize(
    'nrows', [unit_param(500),
              quality_param(5000),
              stress_param(500000)])
@pytest.mark.parametrize(
    'n_feats',
    [unit_param(3), quality_param(100),
     stress_param(1000)])
@pytest.mark.parametrize(
    'k', [unit_param(3), quality_param(30),
          stress_param(50)])
@pytest.mark.parametrize("metric", valid_metrics())
def test_knn_separate_index_search(input_type, nrows, n_feats, k, metric):
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    X_index = X[:100]
    X_search = X[101:]

    p = 5  # Testing 5-norm of the minkowski metric only
Beispiel #11
0
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest
from dask.distributed import Client, wait

import numpy as np
from cuml.test.utils import array_equal, \
    unit_param, stress_param


@pytest.mark.mg
@pytest.mark.parametrize("nrows", [unit_param(6e5),
                         stress_param(5e6)])
@pytest.mark.parametrize("ncols", [unit_param(20),
                         stress_param(1000)])
@pytest.mark.parametrize("n_parts", [unit_param(67)])
def test_pca_fit(nrows, ncols, n_parts, cluster):

    client = Client(cluster)

    try:

        from cuml.dask.decomposition import TruncatedSVD as daskTPCA
        from sklearn.decomposition import TruncatedSVD

        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(n_samples=nrows,
Beispiel #12
0
                            tolerance_integration=0.001)

# List pairing a tuple of model parameters with a test case
# (a test case could be used with different models)
# Each entry: ((p, d, q, P, D, Q, s, k), ARIMAData)
test_data = [
    # ((1, 0, 1, 0, 0, 0, 0, 1), test_101c),
    ((0, 0, 2, 0, 0, 0, 0, 1), test_002c),
    ((0, 1, 0, 0, 0, 0, 0, 1), test_010c),
    ((1, 1, 0, 0, 0, 0, 0, 0), test_110),
    ((0, 1, 1, 0, 0, 0, 0, 1), test_011c),
    ((1, 2, 1, 0, 0, 0, 0, 1), test_121c),
    ((1, 0, 1, 1, 1, 1, 4, 0), test_101_111_4),
    ((1, 1, 1, 2, 0, 0, 4, 1), test_111_200_4c),
    ((1, 1, 2, 0, 1, 2, 4, 0), test_112_012_4),
    stress_param((1, 1, 1, 1, 1, 1, 12, 0), test_111_111_12),
]

# Dictionary for lazy-loading of datasets
# (name, dtype) -> (pandas dataframe, cuDF dataframe)
lazy_data = {}

# Dictionary for lazy-evaluation of reference fits
# (p, d, q, P, D, Q, s, k, name, dtype) -> SARIMAXResults
lazy_ref_fit = {}


def extract_order(tup):
    """Split a flat parameter tuple into its three components.

    ``tup`` must unpack into exactly eight values (p, d, q, P, D, Q, s, k).
    Returns ``((p, d, q), (P, D, Q, s), k)``.
    """
    p, d, q, P, D, Q, s, k = tup
    nonseasonal = (p, d, q)
    seasonal = (P, D, Q, s)
    return nonseasonal, seasonal, k
Beispiel #13
0
from cuml.test.utils import get_pattern, unit_param, \
    quality_param, stress_param, array_equal, assert_dbscan_equal

from sklearn.cluster import DBSCAN as skDBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler


@pytest.mark.parametrize('max_mbytes_per_batch', [1e3, None])
@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('use_handle', [True, False])
@pytest.mark.parametrize(
    'nrows', [unit_param(500),
              quality_param(5000),
              stress_param(500000)])
@pytest.mark.parametrize(
    'ncols',
    [unit_param(20), quality_param(100),
     stress_param(1000)])
@pytest.mark.parametrize('out_dtype', [
    unit_param("int32"),
    unit_param(np.int32),
    unit_param("int64"),
    unit_param(np.int64),
    quality_param("int32"),
    stress_param("int32")
])
def test_dbscan(datatype, use_handle, nrows, ncols, max_mbytes_per_batch,
                out_dtype):
    if nrows == 500000 and pytest.max_gpu_memory < 32:
from cuml.metrics import r2_score
from cuml.test.utils import get_handle, unit_param, \
    quality_param, stress_param

from sklearn.ensemble import RandomForestClassifier as skrfc
from sklearn.ensemble import RandomForestRegressor as skrfr
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_california_housing, \
    make_classification, make_regression
from sklearn.model_selection import train_test_split


@pytest.mark.parametrize(
    'nrows', [unit_param(500),
              quality_param(5000),
              stress_param(500000)])
@pytest.mark.parametrize('column_info', [
    unit_param([20, 10]),
    quality_param([200, 100]),
    stress_param([500, 350])
])
@pytest.mark.parametrize(
    'rows_sample',
    [unit_param(1.0), quality_param(0.90),
     stress_param(0.95)])
@pytest.mark.parametrize('datatype', [np.float32])
@pytest.mark.parametrize('split_algo', [0, 1])
@pytest.mark.parametrize('max_features', [1.0, 'auto', 'log2', 'sqrt'])
def test_rf_classification(datatype, split_algo, rows_sample, nrows,
                           column_info, max_features):
    use_handle = True
Beispiel #15
0
    cuml_metrics = cuml.neighbors.VALID_METRICS_SPARSE[cuml_algo]
    sklearn_metrics = set(sklearn.neighbors.VALID_METRICS_SPARSE[algo])
    sklearn_metrics.update(sklearn.neighbors.VALID_METRICS[algo])
    return [value for value in cuml_metrics if value in sklearn_metrics]


def metric_p_combinations():
    """Yield (metric, p) pairs for every metric from valid_metrics().

    Each metric is yielded with p=2; "minkowski" and "lp" are additionally
    yielded with p=3 immediately afterwards.
    """
    for name in valid_metrics():
        p_values = (2, 3) if name in ("minkowski", "lp") else (2,)
        for p in p_values:
            yield name, p


@pytest.mark.parametrize("datatype", ["dataframe", "numpy"])
@pytest.mark.parametrize("metric_p", metric_p_combinations())
@pytest.mark.parametrize("nrows", [1000, stress_param(10000)])
@pytest.mark.skipif(not has_scipy(), reason="Skipping test_self_neighboring"
                    " because Scipy is missing")
def test_self_neighboring(datatype, metric_p, nrows):
    """Test that searches using an indexed vector itself return sensible
    results for that vector

    For L2-derived metrics, this specifically exercises the slow high-precision
    mode used to correct for approximation errors in L2 computation during NN
    searches.
    """
    ncols = 1000
    n_clusters = 10
    n_neighbors = 3

    metric, p = metric_p
Beispiel #16
0
    """
    X = dataset.data

    tsne = TSNE(n_components=2,
                random_state=1,
                n_neighbors=DEFAULT_N_NEIGHBORS,
                learning_rate_method='none',
                method=method,
                min_grad_norm=1e-12,
                perplexity=DEFAULT_PERPLEXITY)

    Y = tsne.fit_transform(X)
    validate_embedding(X, Y)


@pytest.mark.parametrize('nrows', [stress_param(2400000)])
@pytest.mark.parametrize('ncols', [stress_param(225)])
@pytest.mark.parametrize('method', ['fft', 'barnes_hut'])
def test_tsne_large(nrows, ncols, method):
    """
    This tests how TSNE handles large input
    """
    X, y = make_blobs(n_samples=nrows,
                      centers=8,
                      n_features=ncols,
                      random_state=1).astype(np.float32)

    tsne = TSNE(random_state=1,
                exaggeration_iter=1,
                n_iter=2,
                method=method,
Beispiel #17
0
                       solver="eig")

    assert getattr(cu_clf, 'score', False)
    sk_cu_grid = GridSearchCV(cu_clf, params, cv=5, iid=False)

    gdf_data = cudf.DataFrame.from_gpu_matrix(cuda.to_device(X_train))
    gdf_train = cudf.DataFrame(dict(train=y_train))

    sk_cu_grid.fit(gdf_data, gdf_train.train)
    assert sk_cu_grid.best_params_ == {'alpha': 0.1}


@pytest.mark.parametrize(
    'nrows',
    [unit_param(30), quality_param(5000),
     stress_param(500000)])
@pytest.mark.parametrize(
    'ncols',
    [unit_param(10), quality_param(100),
     stress_param(200)])
@pytest.mark.parametrize(
    'n_info',
    [unit_param(7), quality_param(50),
     stress_param(100)])
@pytest.mark.parametrize('datatype', [np.float32])
def test_accuracy(nrows, ncols, n_info, datatype):

    use_handle = True
    train_rows = np.int32(nrows * 0.8)
    X, y = make_classification(n_samples=nrows,
                               n_features=ncols,
Beispiel #18
0
                       solver="eig")

    assert getattr(cu_clf, 'score', False)
    sk_cu_grid = GridSearchCV(cu_clf, params, cv=5, iid=False)

    gdf_data = cudf.DataFrame(X_train)
    gdf_train = cudf.DataFrame(dict(train=y_train))

    sk_cu_grid.fit(gdf_data, gdf_train.train)
    assert sk_cu_grid.best_params_ == {'alpha': 0.1}


@pytest.mark.parametrize(
    'nrows',
    [unit_param(30), quality_param(5000),
     stress_param(500000)])
@pytest.mark.parametrize(
    'ncols',
    [unit_param(10), quality_param(100),
     stress_param(200)])
@pytest.mark.parametrize(
    'n_info',
    [unit_param(7), quality_param(50),
     stress_param(100)])
@pytest.mark.parametrize('datatype', [np.float32])
def test_accuracy(nrows, ncols, n_info, datatype):

    use_handle = True
    train_rows = np.int32(nrows * 0.8)
    X, y = make_classification(n_samples=nrows,
                               n_features=ncols,
Beispiel #19
0
        n_classes=num_classes,
        random_state=0,
    )
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=10
    )

    return X_train, X_test, y_train, y_test


@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("algorithm", ["eig", "svd"])
@pytest.mark.parametrize(
    "nrows", [unit_param(1000), quality_param(5000), stress_param(500000)]
)
@pytest.mark.parametrize(
    "column_info",
    [
        unit_param([20, 10]),
        quality_param([100, 50]),
        stress_param([1000, 500])
    ],
)
def test_linear_regression_model(datatype, algorithm, nrows, column_info):

    if algorithm == "svd" and nrows > 46340:
        pytest.skip("svd solver is not supported for the data that has more"
                    "than 46340 rows or columns if you are using CUDA version"
                    "10.x")
Beispiel #20
0
from cuml.linear_model import MBSGDRegressor as cumlMBSGRegressor
from cuml.metrics import r2_score
from cuml.test.utils import unit_param, quality_param, stress_param

from sklearn.linear_model import SGDRegressor
from cuml.datasets import make_regression
from sklearn.model_selection import train_test_split


@pytest.fixture(scope="module",
                params=[
                    unit_param([500, 20, 10, np.float32]),
                    unit_param([500, 20, 10, np.float64]),
                    quality_param([5000, 100, 50, np.float32]),
                    quality_param([5000, 100, 50, np.float64]),
                    stress_param([500000, 1000, 500, np.float32]),
                    stress_param([500000, 1000, 500, np.float64]),
                ],
                ids=[
                    '500-20-10-f32', '500-20-10-f64', '5000-100-50-f32',
                    '5000-100-50-f64', '500000-1000-500-f32',
                    '500000-1000-500-f64'
                ])
def make_dataset(request):
    nrows, ncols, n_info, datatype = request.param
    if nrows == 500000 and datatype == np.float64 and \
            pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 32
        else:
            pytest.skip("Insufficient GPU memory for this test."
Beispiel #21
0
        params['eval_metric'] = 'error'
        params['objective'] = 'reg:squarederror'
        params['base_score'] = 0.0

    params['max_depth'] = 25
    params.update(xgboost_params)

    bst = xgb.train(params, dtrain, num_rounds)
    bst.save_model(model_path)
    return bst


@pytest.mark.parametrize(
    'n_rows', [unit_param(1000),
               quality_param(10000),
               stress_param(500000)])
@pytest.mark.parametrize(
    'n_columns',
    [unit_param(20), quality_param(100),
     stress_param(1000)])
@pytest.mark.parametrize(
    'num_rounds',
    [unit_param(1),
     unit_param(5),
     quality_param(50),
     stress_param(90)])
@pytest.mark.skipif(has_xgboost() is False, reason="need to install xgboost")
def test_fil_classification(n_rows, n_columns, num_rounds, tmp_path):
    # settings
    classification = True  # change this to false to use regression
    n_rows = n_rows  # we'll use 1 millions rows
Beispiel #22
0
    X = dask_cudf.from_cudf(X, npartitions=2)
    Y_ohe = cp.array([[0., 0., 1., 0.], [0., 1., 0., 1.]])
    Y_ohe = da.from_array(Y_ohe)

    enc = OneHotEncoder(handle_unknown='ignore')
    enc = enc.fit(X)
    df = enc.inverse_transform(Y_ohe)
    ref = DataFrame({'chars': [None, 'b'], 'int': [0, 2]})
    assert_frame_equal(df.compute().to_pandas(), ref.to_pandas())


@pytest.mark.mg
@pytest.mark.parametrize('drop', [None, 'first'])
@pytest.mark.parametrize('as_array', [True, False], ids=['cupy', 'cudf'])
@pytest.mark.parametrize('sparse', [True, False], ids=['sparse', 'dense'])
@pytest.mark.parametrize("n_samples", [10, 1000, stress_param(50000)])
def test_onehot_random_inputs(client, drop, as_array, sparse, n_samples):
    X, ary = generate_inputs_from_categories(n_samples=n_samples,
                                             as_array=as_array)
    if as_array:
        dX = da.from_array(X)
    else:
        dX = dask_cudf.from_cudf(X, npartitions=1)

    enc = OneHotEncoder(sparse=sparse, drop=drop, categories='auto')
    sk_enc = SkOneHotEncoder(sparse=sparse, drop=drop, categories='auto')
    ohe = enc.fit_transform(dX)
    ref = sk_enc.fit_transform(ary)
    if sparse:
        cp.testing.assert_array_equal(ohe.compute().toarray(), ref.toarray())
    else:
Beispiel #23
0
    quality_param, stress_param

from sklearn import datasets
from sklearn.datasets import make_multilabel_classification
from sklearn.decomposition import PCA as skPCA
from sklearn.datasets import make_blobs
from cuml.common.exceptions import NotFittedError


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('input_type', ['ndarray'])
@pytest.mark.parametrize('use_handle', [True, False])
@pytest.mark.parametrize(
    'name', [unit_param(None),
             quality_param('digits'),
             stress_param('blobs')])
def test_pca_fit(datatype, input_type, name, use_handle):

    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)

    elif name == 'digits':
        X, _ = datasets.load_digits(return_X_y=True)

    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)
Beispiel #24
0
import joblib

from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.datasets.samples_generator import make_blobs
from sklearn.manifold.t_sne import trustworthiness
from sklearn.metrics import adjusted_rand_score

dataset_names = ['iris', 'digits', 'wine', 'blobs']


@pytest.mark.parametrize(
    'nrows',
    [unit_param(500), quality_param(5000), stress_param(500000)])
@pytest.mark.parametrize(
    'n_feats',
    [unit_param(20), quality_param(100), stress_param(1000)])
def test_blobs_cluster(nrows, n_feats):
    """Embed five blobs with cuUMAP and check they stay separable.

    For inputs below 500000 rows, KMeans with 5 clusters is run on the
    embedding and must reproduce the generated labels exactly (adjusted
    Rand score of 1.0). At 500000 rows only the embedding step is run.
    """
    blobs, true_labels = datasets.make_blobs(n_samples=nrows,
                                             n_features=n_feats,
                                             centers=5,
                                             random_state=0)
    reducer = cuUMAP(verbose=False)
    embedding = reducer.fit_transform(blobs, convert_dtype=True)

    if nrows < 500000:
        predicted = KMeans(5).fit_predict(embedding)
        score = adjusted_rand_score(true_labels, predicted)
        assert score == 1.0
Beispiel #25
0
import pytest

import numpy as np
from cuml.test.utils import array_equal, \
    unit_param, stress_param
import cupy as cp

from cuml.dask.common.dask_arr_utils import to_dask_cudf


@pytest.mark.mg
@pytest.mark.parametrize(
    "data_info",
    [unit_param([1000, 20, 30]),
     stress_param([int(9e6), 5000, 30])])
@pytest.mark.parametrize("input_type", ["dataframe", "array"])
def test_pca_fit(data_info, input_type, client):

    nrows, ncols, n_parts = data_info
    if nrows == int(9e6) and pytest.max_gpu_memory < 48:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 256
            ncols = ncols * pytest.max_gpu_memory // 256
        else:
            pytest.skip("Insufficient GPU memory for this test."
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    from cuml.dask.decomposition import TruncatedSVD as daskTPCA
    from sklearn.decomposition import TruncatedSVD
Beispiel #26
0
from cuml.test.utils import get_handle, array_equal, unit_param, \
    quality_param, stress_param

from sklearn import datasets
from sklearn.datasets import make_multilabel_classification
from sklearn.decomposition import PCA as skPCA
from sklearn.datasets.samples_generator import make_blobs


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('input_type', ['ndarray'])
@pytest.mark.parametrize('use_handle', [True, False])
@pytest.mark.parametrize(
    'name', [unit_param(None),
             quality_param('digits'),
             stress_param('blobs')])
def test_pca_fit(datatype, input_type, name, use_handle):

    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)

    elif name == 'digits':
        X, _ = datasets.load_digits(return_X_y=True)

    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)
    cuml_metrics = cuml.neighbors.VALID_METRICS_SPARSE[cuml_algo]
    sklearn_metrics = set(sklearn.neighbors.VALID_METRICS_SPARSE[algo])
    sklearn_metrics.update(sklearn.neighbors.VALID_METRICS[algo])
    return [value for value in cuml_metrics if value in sklearn_metrics]


def metric_p_combinations():
    """Generate the (metric, p) combinations to test.

    Every metric returned by valid_metrics() is paired with p=2. The
    metrics "minkowski" and "lp" are then also paired with p=3.
    """
    takes_extra_p = ("minkowski", "lp")
    for metric in valid_metrics():
        yield (metric, 2)
        if metric not in takes_extra_p:
            continue
        yield (metric, 3)


@pytest.mark.parametrize("datatype", ["dataframe", "numpy"])
@pytest.mark.parametrize("metric_p", metric_p_combinations())
@pytest.mark.parametrize("nrows", [1000, stress_param(10000)])
@pytest.mark.skipif(not has_scipy(),
                    reason="Skipping test_self_neighboring"
                    " because Scipy is missing")
def test_self_neighboring(datatype, metric_p, nrows):
    """Test that searches using an indexed vector itself return sensible
    results for that vector

    For L2-derived metrics, this specifically exercises the slow high-precision
    mode used to correct for approximation errors in L2 computation during NN
    searches.
    """
    ncols = 1000
    n_clusters = 10
    n_neighbors = 3
Beispiel #28
0
        return model, X_test

    def assert_model(pickled_model, X_test):
        assert array_equal(result["rf_res"], pickled_model.predict(X_test))
        # Confirm no crash from score
        pickled_model.score(X_test, np.zeros(X_test.shape[0]))

    pickle_save_load(tmpdir, create_mod, assert_model)


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('keys', regression_models.keys())
@pytest.mark.parametrize(
    'data_size',
    [unit_param([500, 20, 10]),
     stress_param([500000, 1000, 500])])
@pytest.mark.parametrize('fit_intercept', [True, False])
def test_regressor_pickle(tmpdir, datatype, keys, data_size, fit_intercept):
    result = {}

    def create_mod():
        nrows, ncols, n_info = data_size
        X_train, y_train, X_test = make_dataset(datatype, nrows, ncols, n_info)
        model = regression_models[keys](fit_intercept=fit_intercept)
        model.fit(X_train, y_train)
        result["regressor"] = model.predict(X_test)
        return model, X_test

    def assert_model(pickled_model, X_test):
        assert array_equal(result["regressor"], pickled_model.predict(X_test))
    n_samples = np_array.shape[0]
    n_samples_per_part = int(n_samples / n_parts)
    chunks = [n_samples_per_part] * n_parts
    chunks[-1] += n_samples % n_samples_per_part
    chunks = tuple(chunks)
    return da.from_array(np_array, chunks=(chunks, -1))


@pytest.fixture(
    scope="module",
    params=[
        unit_param({'n_samples': 3000, 'n_features': 30,
                    'n_classes': 5, 'n_targets': 2}),
        quality_param({'n_samples': 8000, 'n_features': 35,
                       'n_classes': 12, 'n_targets': 3}),
        stress_param({'n_samples': 20000, 'n_features': 40,
                      'n_classes': 12, 'n_targets': 4})
    ])
def dataset(request):
    X, y = make_multilabel_classification(
        n_samples=int(request.param['n_samples'] * 1.2),
        n_features=request.param['n_features'],
        n_classes=request.param['n_classes'],
        n_labels=request.param['n_classes'],
        length=request.param['n_targets'])
    new_x = []
    new_y = []
    for i in range(y.shape[0]):
        a = np.argwhere(y[i] == 1)[:, 0]
        if len(a) >= request.param['n_targets']:
            new_x.append(i)
            np.random.shuffle(a)
Beispiel #30
0
import treelite


# Module-level mark: adds an "ignore" warning filter for the
# "For reproducible results" message, scoped by the `cuml[.*]` module pattern.
pytestmark = pytest.mark.filterwarnings("ignore: For reproducible results(.*)"
                                        "::cuml[.*]")


@pytest.fixture(
    scope="session",
    params=[
        unit_param(
            {"n_samples": 350, "n_features": 20, "n_informative": 10}
        ),
        quality_param(
            {"n_samples": 5000, "n_features": 200, "n_informative": 80}
        ),
        stress_param(
            {"n_samples": 500000, "n_features": 400, "n_informative": 180}
        ),
    ],
)
def small_clf(request):
    """Provide a binary classification dataset for the current parameter.

    Reads "n_samples", "n_features" and "n_informative" from
    ``request.param`` and passes them to make_classification with a single
    cluster per class, two classes and random_state=123. Returns (X, y).
    """
    spec = request.param
    n_samples = spec["n_samples"]
    n_features = spec["n_features"]
    n_informative = spec["n_informative"]
    generated = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_clusters_per_class=1,
        n_informative=n_informative,
        random_state=123,
        n_classes=2,
    )
    X, y = generated
    return X, y