def test_tsvd_fit_transform(datatype, name, use_handle):
    """Check cuTSVD.fit_transform against skTSVD.fit_transform.

    Parameters
    ----------
    datatype
        Supplied by the test parametrization; this variant never reads it.
    name : str
        Dataset selector. ``'blobs'`` builds a 500000 x 1000 ``make_blobs``
        dataset and only exercises the cuTSVD side (no comparison is made).
        ``'random'`` is skipped. Any other value builds a 500 x 5 dataset
        from ``randn`` samples scaled by 0.1 and shifted per column.
    use_handle
        Forwarded to ``get_handle``.
    """
    if name == 'blobs':
        X, _ = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        # NOTE: pytest.skip raises, so the lines below are not reached while
        # the skip is in place. They are kept so the dataset definition is
        # available once the skip is lifted.
        shape = 5000, 100
        rng = check_random_state(42)
        # np.prod replaces np.product, which NumPy 2.0 removed.
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)
    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    # The reference result is only computed when it will be compared.
    compare = name != 'blobs'
    if compare:
        sk_model = skTSVD(n_components=1)
        Xsktsvd = sk_model.fit_transform(X)

    # Only the handle is used here; the second element stays bound to a
    # throwaway name for the duration of the test.
    handle, _stream = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)
    Xcutsvd = cutsvd.fit_transform(X)
    cutsvd.handle.sync()

    if compare:
        assert array_equal(Xcutsvd, Xsktsvd, 1e-3, with_sign=True)
def run_tsvd(X, n_components, algorithm, random_state, model):
    """Fit a truncated SVD with the selected backend and transform ``X``.

    Parameters
    ----------
    X
        Data passed to the estimator's ``fit`` and ``transform``.
    n_components, random_state
        Forwarded to the estimator constructor.
    algorithm
        Forwarded to the sklearn and h2o4gpu estimators. For h2o4gpu,
        ``'arpack'`` is replaced by ``'cusolver'``. It is not passed to the
        cuml estimator.
    model : str
        One of ``'sklearn'``, ``'h2o4gpu'`` or ``'cuml'``.

    Returns
    -------
    The object returned by the timed ``fit_`` call (the estimator, assuming
    ``timer`` passes return values through), with the transformed data
    stored on it as ``transformed_result``.

    Raises
    ------
    NotImplementedError
        If ``model`` is not one of the supported backends.
    """
    if model == 'sklearn':
        tsvd = skTSVD(n_components=n_components,
                      algorithm=algorithm,
                      random_state=random_state)
    elif model == 'h2o4gpu':
        # Imported lazily so the backend is only required when selected.
        from h2o4gpu.solvers import TruncatedSVDH2O as h2oTSVD
        if algorithm == 'arpack':
            algorithm = 'cusolver'
        tsvd = h2oTSVD(n_components=n_components,
                       algorithm=algorithm,
                       random_state=random_state)
    elif model == 'cuml':
        # Imported lazily so the backend is only required when selected.
        from cuSKL import TruncatedSVD as cumlTSVD
        tsvd = cumlTSVD(n_components=n_components,
                        random_state=random_state)
    else:
        raise NotImplementedError('unsupported model: %r' % (model,))

    # `model` is not read by the wrapped bodies; presumably `timer` consumes
    # it for reporting -- confirm against the decorator.
    @timer
    def fit_(tsvd, X, model):
        tsvd.fit(X)
        return tsvd

    @timer
    def transform_(tsvd, X, model):
        return tsvd.transform(X)

    tsvd = fit_(tsvd, X, model=model)
    # Stored on the estimator so callers get the model and its output together.
    tsvd.transformed_result = transform_(tsvd, X, model=model)
    return tsvd
def test_tsvd_fit(datatype, input_type):
    """Fit cuTSVD and skTSVD on a 6 x 2 dataset and compare fitted attributes.

    ``datatype`` is the dtype of the input arrays. When ``input_type`` is
    ``'dataframe'`` the cuTSVD model is fitted on a cudf DataFrame holding
    the same two columns; otherwise it is fitted on the numpy array.
    """
    X = np.array(
        [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
        dtype=datatype,
    )

    sktsvd = skTSVD(n_components=1)
    sktsvd.fit(X)

    cutsvd = cuTSVD(n_components=1)
    if input_type != 'dataframe':
        cutsvd.fit(X)
    else:
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
        gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
        cutsvd.fit(gdf)

    fitted_attrs = ('singular_values_', 'components_',
                    'explained_variance_ratio_')
    for attr in fitted_attrs:
        cu_value = getattr(cutsvd, attr)
        sk_value = getattr(sktsvd, attr)
        # components_ is the only attribute compared with with_sign=False.
        assert array_equal(cu_value, sk_value, 0.4,
                           with_sign=(attr != 'components_'))
def test_tsvd_fit(datatype, name, use_handle):
    """Check attributes fitted by cuTSVD against those fitted by skTSVD.

    Parameters
    ----------
    datatype
        Supplied by the test parametrization; this variant never reads it.
    name : str
        Dataset selector. ``'blobs'`` builds a 500000 x 1000 ``make_blobs``
        dataset and only exercises the cuTSVD side (no comparison is made).
        ``'random'`` is skipped. Any other value builds a 500 x 5 dataset
        from ``randn`` samples scaled by 0.1 and shifted per column.
    use_handle
        Forwarded to ``get_handle``.
    """
    if name == 'blobs':
        X, _ = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        # NOTE: pytest.skip raises, so the lines below are not reached while
        # the skip is in place. They are kept so the dataset definition is
        # available once the skip is lifted.
        shape = 5000, 100
        rng = check_random_state(42)
        # np.prod replaces np.product, which NumPy 2.0 removed.
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)
    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    # The reference model is only fitted when it will be compared.
    compare = name != 'blobs'
    if compare:
        sktsvd = skTSVD(n_components=1)
        sktsvd.fit(X)

    # Only the handle is used here; the second element stays bound to a
    # throwaway name for the duration of the test.
    handle, _stream = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)
    cutsvd.fit(X)
    cutsvd.handle.sync()

    if compare:
        for attr in ('singular_values_', 'components_',
                     'explained_variance_ratio_'):
            # components_ is the only attribute compared with with_sign=False.
            with_sign = attr != 'components_'
            assert array_equal(getattr(cutsvd, attr),
                               getattr(sktsvd, attr),
                               0.4, with_sign=with_sign)
def run_cpu(X, y):
    """Return ``X`` transformed by an arpack skTSVD fitted on ``X``.

    ``n_components`` and ``random_state`` are not parameters; they are read
    from the enclosing scope. ``y`` is accepted but not used.
    """
    return skTSVD(
        n_components=n_components,
        algorithm="arpack",
        n_iter=5000,
        tol=1e-5,
        random_state=random_state,
    ).fit_transform(X)
def test_pca_fit_transform(datatype):
    """Compare cuTSVD.fit_transform on a pygdf DataFrame with skTSVD.

    The cuTSVD model receives the data as two DataFrame columns; the skTSVD
    model receives the same values as a 6 x 2 numpy array. Both use
    ``datatype`` as dtype. Results are compared with ``with_sign=False``.
    """
    X = np.array(
        [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
        dtype=datatype,
    )

    # Same values as X, laid out column by column.
    column_values = (
        ('0', [-1, -2, -3, 1, 2, 3]),
        ('1', [-1, -1, -2, 1, 1, 2]),
    )
    gdf = pygdf.DataFrame()
    for label, values in column_values:
        gdf[label] = np.asarray(values, dtype=datatype)

    print("Calling fit_transform")
    Xcutsvd = cuTSVD(n_components=1).fit_transform(gdf)
    Xsktsvd = skTSVD(n_components=1).fit_transform(X)

    assert array_equal(Xcutsvd, Xsktsvd, 1e-3, with_sign=False)
def test_tsvd_fit_transform(datatype, input_type):
    """Compare cuTSVD.fit_transform with skTSVD.fit_transform on 6 x 2 data.

    ``datatype`` is the dtype of the input arrays. When ``input_type`` is
    ``'dataframe'`` the cuTSVD model is given a cudf DataFrame holding the
    same two columns; otherwise it is given the numpy array.
    """
    X = np.array(
        [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
        dtype=datatype,
    )

    reference = skTSVD(n_components=1)
    Xsktsvd = reference.fit_transform(X)

    cutsvd = cuTSVD(n_components=1)
    if input_type != 'dataframe':
        Xcutsvd = cutsvd.fit_transform(X)
    else:
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
        gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
        Xcutsvd = cutsvd.fit_transform(gdf)

    assert array_equal(Xcutsvd, Xsktsvd, 1e-3, with_sign=True)
def test_tsvd_fit(datatype, input_type, name, use_handle):
    """Check attributes fitted by cuTSVD against those fitted by skTSVD.

    Parameters
    ----------
    datatype
        dtype of the small hand-written dataset (used only when ``name`` is
        neither ``'blobs'`` nor ``'random'``).
    input_type : str
        ``'dataframe'`` converts the data to a cudf DataFrame (via pandas)
        before fitting cuTSVD; any other value fits on the numpy array.
    name : str
        Dataset selector. ``'blobs'`` builds a 500000 x 1000 ``make_blobs``
        dataset and only exercises the cuTSVD side (no comparison is made).
        ``'random'`` is skipped. Any other value uses a fixed 6 x 2 array.
    use_handle
        Forwarded to ``get_handle``.
    """
    if name == 'blobs':
        X, _ = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        # NOTE: pytest.skip raises, so the lines below are not reached while
        # the skip is in place. They are kept so the dataset definition is
        # available once the skip is lifted.
        shape = 5000, 100
        rng = check_random_state(42)
        # np.prod replaces np.product, which NumPy 2.0 removed.
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    # The reference model is only fitted when it will be compared.
    compare = name != 'blobs'
    if compare:
        sktsvd = skTSVD(n_components=1)
        sktsvd.fit(X)

    # Only the handle is used here; the second element stays bound to a
    # throwaway name for the duration of the test.
    handle, _stream = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)

    if input_type == 'dataframe':
        # One pandas column per feature, then handed over to cudf.
        X_pandas = pd.DataFrame(
            {'fea%d' % i: X[:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_pandas)
        cutsvd.fit(X_cudf)
    else:
        cutsvd.fit(X)

    cutsvd.handle.sync()

    if compare:
        for attr in ('singular_values_', 'components_',
                     'explained_variance_ratio_'):
            # components_ is the only attribute compared with with_sign=False.
            with_sign = attr != 'components_'
            assert array_equal(getattr(cutsvd, attr),
                               getattr(sktsvd, attr),
                               0.4, with_sign=with_sign)
def test_tsvd_fit_transform(datatype, input_type, name, use_handle):
    """Check cuTSVD.fit_transform against skTSVD.fit_transform.

    Parameters
    ----------
    datatype
        dtype of the small hand-written dataset (used only when ``name`` is
        neither ``'blobs'`` nor ``'random'``).
    input_type : str
        ``'dataframe'`` converts the data to a cudf DataFrame (via pandas)
        before calling cuTSVD; any other value passes the numpy array.
    name : str
        Dataset selector. ``'blobs'`` builds a 500000 x 1000 ``make_blobs``
        dataset and only exercises the cuTSVD side (no comparison is made).
        ``'random'`` is skipped. Any other value uses a fixed 6 x 2 array.
    use_handle
        Forwarded to ``get_handle``.
    """
    if name == 'blobs':
        X, _ = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        # NOTE: pytest.skip raises, so the lines below are not reached while
        # the skip is in place. They are kept so the dataset definition is
        # available once the skip is lifted.
        shape = 5000, 100
        rng = check_random_state(42)
        # np.prod replaces np.product, which NumPy 2.0 removed.
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    # The reference result is only computed when it will be compared.
    compare = name != 'blobs'
    if compare:
        sk_model = skTSVD(n_components=1)
        Xsktsvd = sk_model.fit_transform(X)

    # Only the handle is used here; the second element stays bound to a
    # throwaway name for the duration of the test.
    handle, _stream = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)

    if input_type == 'dataframe':
        # One pandas column per feature, then handed over to cudf.
        X_pandas = pd.DataFrame(
            {'fea%d' % i: X[:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_pandas)
        Xcutsvd = cutsvd.fit_transform(X_cudf)
    else:
        Xcutsvd = cutsvd.fit_transform(X)

    cutsvd.handle.sync()

    if compare:
        assert array_equal(Xcutsvd, Xsktsvd, 1e-3, with_sign=True)