def test_pipeline():
    """Fit a scaler + SVC pipeline on synthetic data and check its score.

    The data is generated and split with ``random_state=0``; the pipeline is
    fitted on the training part and must score above 0.8 on the held-out
    part.
    """
    features, labels = make_classification(random_state=0)
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, random_state=0)

    # Features go through StandardScaler before reaching the SVC.
    model = Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())])
    model.fit(train_X, train_y)

    assert model.score(test_X, test_y) > 0.8
Exemple #2
0
def make_classification_dataset(datatype, nrows, ncols, nclasses):
    """Generate a synthetic classification dataset and split it.

    ``nrows + 1000`` samples with ``ncols`` features and ``nclasses`` classes
    are generated, then split with ``train_size=nrows``. The numbers of
    informative features, redundant features and clusters per class are
    derived from ``ncols`` and ``nclasses``.

    Returns whatever ``dsel.train_test_split`` returns. If generation or
    splitting raises ``ValueError``, or element 2 of the split does not
    contain ``nclasses`` distinct values, the calling test is skipped via
    ``pytest.skip``.
    """
    informative = min(ncols, int(max(nclasses * 2, math.ceil(ncols / 10))))
    clusters = min(2, max(1, int(2**informative / nclasses)))
    redundant = min(ncols - informative, max(2, math.ceil(ncols / 20)))

    generator_args = dict(
        dtype=datatype,
        n_samples=nrows + 1000,
        n_features=ncols,
        random_state=SEED,
        class_sep=1.0,
        n_informative=informative,
        n_clusters_per_class=clusters,
        n_redundant=redundant,
        n_classes=nclasses,
    )

    try:
        features, labels = data.make_classification(**generator_args)
        split = dsel.train_test_split(
            features, labels, random_state=SEED, train_size=nrows)

        # split[2] is presumably the training labels -- confirm against
        # dsel.train_test_split's return order.
        if len(cp.unique(split[2])) >= nclasses:
            return split

        raise ValueError("Training data does not have all classes.")
    except ValueError:
        # Covers both invalid generator arguments and the explicit raise above.
        pytest.skip(
            "Skipping the test for invalid combination of ncols/nclasses")
Exemple #3
0
def create_rand_clf():
    """Return the first output of a 500-sample, 5-class ``make_classification``.

    The second returned value is discarded. No ``random_state`` is passed to
    the generator.
    """
    generator_args = dict(
        n_samples=500,
        n_features=20,
        n_clusters_per_class=1,
        n_informative=12,
        n_classes=5,
        order='F',
    )
    first_output, _ = make_classification(**generator_args)
    return first_output
Exemple #4
0
def make_dataset(request):
    """Build a train/test split from the parameters in ``request.param``.

    ``request.param`` must unpack to ``(nrows, ncols, n_info, datatype)``.
    Features and labels are generated with ``random_state=10``, cast to
    ``datatype`` and split with ``train_size=0.8``.

    Returns ``(nrows, X_train, X_test, y_train, y_test)``.
    """
    nrows, ncols, n_info, datatype = request.param

    features, labels = make_classification(
        n_samples=nrows,
        n_informative=n_info,
        n_features=ncols,
        random_state=10,
    )
    features = features.astype(datatype)
    labels = labels.astype(datatype)

    split = train_test_split(
        features, labels, train_size=0.8, random_state=10)
    X_train, X_test, y_train, y_test = split

    # The label halves are cast once more after the split.
    return (nrows, X_train, X_test,
            y_train.astype(datatype), y_test.astype(datatype))
Exemple #5
0
def test_complement_partial_fit(x_dtype, y_dtype, norm):
    """Compare chunked ``partial_fit`` of ComplementNB with skComplementNB.

    Both models are fed the same data in chunks of 500 rows, and their
    ``predict_proba`` outputs on the full data must agree within ``rtol``
    (1e-3 when ``x_dtype`` is ``cp.float32``, 1e-6 otherwise).
    """
    chunk_size = 500
    n_rows, n_cols = 1500, 100
    weights = [0.6, 0.2, 0.15, 0.05]

    if x_dtype == cp.float32:
        rtol = 1e-3
    else:
        rtol = 1e-6

    X, y = make_classification(n_rows,
                               n_cols,
                               n_classes=len(weights),
                               weights=weights,
                               dtype=x_dtype,
                               n_informative=9,
                               random_state=1)
    # Shift every column so that its minimum becomes zero.
    X -= X.min(0)
    y = y.astype(y_dtype)

    model = ComplementNB(norm=norm)
    modelsk = skComplementNB(norm=norm)

    classes = np.unique(y)

    # Slice ends past the array length are clamped, so the final chunk simply
    # holds whatever rows remain.
    for start in range(0, X.shape[0], chunk_size):
        stop = start + chunk_size
        x_chunk = X[start:stop]
        y_chunk = y[start:stop]

        model.partial_fit(x_chunk, y_chunk, classes=classes)
        modelsk.partial_fit(x_chunk.get(), y_chunk.get(),
                            classes=classes.get())

    y_hat = model.predict_proba(X).get()
    y_sk = modelsk.predict_proba(X.get())

    assert_allclose(y_hat, y_sk, rtol=rtol)
Exemple #6
0
def test_stratified_split(type, test_size, train_size):
    """Check that a stratified split keeps the class proportions.

    The per-class fractions (rounded to two decimals) of the full labels and
    of the training labels must match within ``rtol=0.1``. For the 'cupy' and
    'numba' input kinds the container type of the split features is checked
    as well.
    """
    # A large sample gives more tolerance and more reliable estimates.
    X, y = make_classification(n_samples=10000)

    if type == 'cupy':
        X, y = cp.asarray(X), cp.asarray(y)

    if type == 'numba':
        X, y = cuda.to_device(X), cuda.to_device(y)

    def class_fractions(labels):
        """Return each class's share of ``labels``, rounded to 2 decimals."""
        _, inverse = cp.unique(labels, return_inverse=True)
        per_class = cp.bincount(inverse)
        n_total = cp.sum(per_class)
        return [
            cp.around(float(n_in_class) / n_total.item(), decimals=2).item()
            for n_in_class in per_class
        ]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, test_size=test_size, stratify=y)

    fractions_before = class_fractions(y)
    fractions_after = class_fractions(y_train)
    matches = cp.isclose(fractions_before, fractions_after,
                         equal_nan=False, rtol=0.1)
    assert matches.all()

    if type == 'cupy':
        assert isinstance(X_train, cp.ndarray)
        assert isinstance(X_test, cp.ndarray)

    if type == 'numba':
        assert cuda.devicearray.is_cuda_ndarray(X_train)
        assert cuda.devicearray.is_cuda_ndarray(X_test)
def classification_dataset(request):
    """Return a seeded train/test split of a 10-sample, 5-feature dataset.

    ``request`` is accepted but not used. Both the generator and the split
    use ``random_state=0``.
    """
    features, labels = make_classification(
        n_samples=10, n_features=5, random_state=0)
    split = train_test_split(features, labels, random_state=0)
    return split
# Model hyperparameters.
# NOTE(review): the estimator that consumes these values is not part of this
# excerpt -- confirm where they are used.
n_estimators = 40
max_depth = 16
max_features = 1.0

# Other parameters
# Seed reused for both data generation and the train/test split below.
random_state = 32
"""## Generate Data"""

import cudf
import pandas as pd
from cuml.datasets import make_classification
from cuml.preprocessing.model_selection import train_test_split

# NOTE(review): n_samples, n_features, n_informative and n_clases are not
# defined in this excerpt; they are expected to be set earlier in the file.
# The spelling `n_clases` must keep matching that earlier definition.
X, y = make_classification(n_samples=n_samples,
                           n_features=n_features,
                           n_informative=n_informative,
                           n_classes=n_clases,
                           random_state=random_state)

# Create cuDF DataFrame and Series from CuPy ndarray.
X = cudf.DataFrame(X)
y = cudf.Series(y)

# Split dataset into training and testing datasets.
# The split is called with test_size=0.2 and the shared seed.
X_train_cudf, X_test_cudf, y_train_cudf, y_test_cudf = \
    train_test_split(X, y, test_size=0.2, random_state=random_state)

# Copy dataset from GPU memory to host memory.
# This is done to later compare CPU and GPU results.
X_train_skl = X_train_cudf.to_pandas()
X_test_skl = X_test_cudf.to_pandas()