def test_pipeline():
    """Fit a scaler + SVC pipeline on synthetic data and check its score.

    The data is generated and split with fixed seeds (``random_state=0``);
    the pipeline's score on the held-out split must exceed 0.8.
    """
    features, labels = make_classification(random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, random_state=0)

    stages = [('scaler', StandardScaler()), ('svc', SVC())]
    pipe = Pipeline(steps=stages)
    pipe.fit(X_train, y_train)

    assert pipe.score(X_test, y_test) > 0.8
def make_classification_dataset(datatype, nrows, ncols, nclasses):
    """Build a synthetic classification dataset and split it for a test.

    ``nrows + 1000`` samples are generated and ``nrows`` of them are
    requested for the training side of the split (``train_size=nrows``).

    Parameters
    ----------
    datatype : dtype forwarded to ``data.make_classification``.
    nrows : number of training rows requested from the split.
    ncols : total number of features.
    nclasses : number of target classes.

    Returns
    -------
    The value returned by ``dsel.train_test_split``.

    Notes
    -----
    Any ``ValueError`` raised while generating or splitting -- including
    the one raised here when the third element of the split contains
    fewer than ``nclasses`` distinct labels -- turns into ``pytest.skip``.
    """
    # Derive generator settings from the requested shape.
    informative = min(ncols, int(max(nclasses * 2, math.ceil(ncols / 10))))
    clusters = min(2, max(1, int(2**informative / nclasses)))
    redundant = min(ncols - informative, max(2, math.ceil(ncols / 20)))

    try:
        X, y = data.make_classification(
            dtype=datatype,
            n_samples=nrows + 1000,
            n_features=ncols,
            random_state=SEED,
            class_sep=1.0,
            n_informative=informative,
            n_clusters_per_class=clusters,
            n_redundant=redundant,
            n_classes=nclasses,
        )
        split = dsel.train_test_split(
            X, y, random_state=SEED, train_size=nrows)

        # Only hand back splits whose third element covers every class.
        if len(cp.unique(split[2])) >= nclasses:
            return split
        raise ValueError("Training data does not have all classes.")
    except ValueError:
        pytest.skip(
            "Skipping the test for invalid combination of ncols/nclasses")
def create_rand_clf():
    """Return the first output of a 500 x 20, 5-class ``make_classification``.

    Only the first element of the generated pair is returned; the second
    one is discarded. No random seed is passed, so successive calls are
    not tied to a fixed seed by this function.
    """
    features, _ = make_classification(
        n_samples=500,
        n_features=20,
        n_clusters_per_class=1,
        n_informative=12,
        n_classes=5,
        order='F',
    )
    return features
def make_dataset(request):
    """Create a train/test split from the parameters in ``request.param``.

    ``request.param`` is unpacked as ``(nrows, ncols, n_info, datatype)``.
    Features and labels are cast to ``datatype`` before an 80% training
    split with a fixed seed, and the label splits are cast once more.

    Returns
    -------
    tuple
        ``(nrows, X_train, X_test, y_train, y_test)``.
    """
    nrows, ncols, n_info, datatype = request.param

    features, labels = make_classification(
        n_samples=nrows,
        n_informative=n_info,
        n_features=ncols,
        random_state=10,
    )
    features = features.astype(datatype)
    labels = labels.astype(datatype)

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=10)

    return (
        nrows,
        X_train,
        X_test,
        y_train.astype(datatype),
        y_test.astype(datatype),
    )
def test_complement_partial_fit(x_dtype, y_dtype, norm):
    """Compare chunked ``partial_fit`` of ``ComplementNB`` with the
    ``skComplementNB`` reference.

    Both models are fed the same consecutive 500-row chunks of a
    1500 x 100 dataset; their ``predict_proba`` outputs on the full data
    must agree within ``rtol`` (looser for ``cp.float32`` inputs).
    """
    chunk_size = 500
    n_rows, n_cols = 1500, 100
    weights = [0.6, 0.2, 0.15, 0.05]
    rtol = 1e-3 if x_dtype == cp.float32 else 1e-6

    X, y = make_classification(n_rows, n_cols,
                               n_classes=len(weights),
                               weights=weights,
                               dtype=x_dtype,
                               n_informative=9,
                               random_state=1)
    X -= X.min(0)  # Make all inputs positive
    y = y.astype(y_dtype)

    model = ComplementNB(norm=norm)
    modelsk = skComplementNB(norm=norm)
    classes = np.unique(y)

    # Walk the rows in consecutive chunks; slicing clamps at the end of
    # the data, so the last chunk simply takes whatever rows remain.
    for start in range(0, X.shape[0], chunk_size):
        stop = start + chunk_size
        x = X[start:stop]
        y_c = y[start:stop]

        model.partial_fit(x, y_c, classes=classes)
        modelsk.partial_fit(x.get(), y_c.get(), classes=classes.get())

    y_hat = model.predict_proba(X).get()
    y_sk = modelsk.predict_proba(X.get())

    assert_allclose(y_hat, y_sk, rtol=rtol)
def test_stratified_split(type, test_size, train_size):
    """Check that a stratified split keeps the class proportions of ``y``.

    The per-class fractions (rounded to two decimals) of the full label
    vector and of ``y_train`` must be close (``rtol=0.1``). For the
    ``'cupy'`` and ``'numba'`` input kinds the container type of the
    returned feature splits is checked as well.
    """
    # For more tolerance and reliable estimates
    X, y = make_classification(n_samples=10000)

    if type == 'cupy':
        X = cp.asarray(X)
        y = cp.asarray(y)
    elif type == 'numba':
        X = cuda.to_device(X)
        y = cuda.to_device(y)

    def class_fractions(labels):
        """Return each class's share of ``labels``, rounded to 2 decimals."""
        _, inverse = cp.unique(labels, return_inverse=True)
        per_class = cp.bincount(inverse)
        total = cp.sum(per_class)
        return [
            cp.around(float(n) / total.item(), decimals=2).item()
            for n in per_class
        ]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        train_size=train_size,
        test_size=test_size,
        stratify=y,
    )

    original_counts = class_fractions(y)
    split_counts = class_fractions(y_train)
    assert cp.isclose(original_counts,
                      split_counts,
                      equal_nan=False,
                      rtol=0.1).all()

    if type == 'cupy':
        assert isinstance(X_train, cp.ndarray)
        assert isinstance(X_test, cp.ndarray)

    if type == 'numba':
        assert cuda.devicearray.is_cuda_ndarray(X_train)
        assert cuda.devicearray.is_cuda_ndarray(X_test)
def classification_dataset(request):
    """Return a seeded train/test split of a 10-sample, 5-feature dataset.

    ``request`` is accepted but not used by the body.
    """
    features, labels = make_classification(
        n_samples=10, n_features=5, random_state=0)
    split = train_test_split(features, labels, random_state=0)
    return split
n_estimators = 40
max_depth = 16
max_features = 1.0

# Other parameters
random_state = 32

"""## Generate Data"""

import cudf
import pandas as pd

from cuml.datasets import make_classification
from cuml.preprocessing.model_selection import train_test_split

# n_samples, n_features, n_informative and n_clases are expected to be
# defined earlier in the script.
# NOTE(review): `n_clases` must match the spelling used where it is defined.
X, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    n_classes=n_clases,
    random_state=random_state,
)

# Wrap the generated data in a cuDF DataFrame / Series
# (presumably CuPy ndarrays coming out of make_classification -- confirm).
X = cudf.DataFrame(X)
y = cudf.Series(y)

# Hold out 20% of the rows for testing.
(X_train_cudf,
 X_test_cudf,
 y_train_cudf,
 y_test_cudf) = train_test_split(
    X, y, test_size=0.2, random_state=random_state)

# Convert the feature splits to pandas objects.
# This is done to later compare CPU and GPU results.
X_train_skl = X_train_cudf.to_pandas()
X_test_skl = X_test_cudf.to_pandas()