Ejemplo n.º 1
0
def test_tsvd_fit_transform(datatype, name, use_handle):
    """Check ``cuTSVD.fit_transform`` against ``skTSVD.fit_transform``.

    Parameters
    ----------
    datatype
        Not read by this test body (presumably supplied by a parametrize
        decorator outside this view -- TODO confirm whether the input
        should be cast to it).
    name
        Dataset selector. ``'blobs'`` builds a large ``make_blobs`` dataset
        and only runs ``cuTSVD`` on it (no reference comparison);
        ``'random'`` is skipped; any other value uses a small random
        normal dataset that is compared against ``skTSVD``.
    use_handle
        Forwarded to ``get_handle``; the returned handle is passed to
        ``cuTSVD``.
    """
    if name == 'blobs':
        X, _ = make_blobs(n_samples=500000, n_features=1000, random_state=0)

    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        # Not reached while the skip above is in place; kept so the dataset
        # can be restored once the skip is lifted. ``np.prod`` is used
        # because the ``np.product`` alias was removed in NumPy 2.0.
        shape = 5000, 100
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)

    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    # The reference transform is only computed for datasets that are
    # actually compared at the end of the test.
    compare_with_reference = name != 'blobs'
    if compare_with_reference:
        sktsvd = skTSVD(n_components=1)
        Xsktsvd = sktsvd.fit_transform(X)

    # Only the handle is needed here; the second returned value is unused.
    handle, _ = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)

    Xcutsvd = cutsvd.fit_transform(X)
    cutsvd.handle.sync()

    if compare_with_reference:
        assert array_equal(Xcutsvd, Xsktsvd, 1e-3, with_sign=True)
Ejemplo n.º 2
0
def test_tsvd_inverse_transform(datatype, input_type,
                                name, use_handle):
    """Round-trip data through ``cuTSVD`` and compare it with the input.

    The data is passed through ``fit_transform`` and then
    ``inverse_transform``; the result is compared with the cuDF copy of
    the input via ``array_equal(..., 0.4, with_sign=True)``.

    Parameters
    ----------
    datatype
        dtype of the small hand-written dataset used when ``name`` is
        neither ``'blobs'`` nor ``'random'``.
    input_type
        ``'dataframe'`` fits on the cuDF frame; any other value fits on
        the NumPy array. The final comparison always uses the cuDF frame.
    name
        Dataset selector; ``'blobs'`` and ``'random'`` are both skipped.
    use_handle
        Not read by this test body -- ``cuTSVD`` is created without an
        explicit handle. NOTE(review): confirm whether this is intended.
    """
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        # Not reached while the skip above is in place; kept for when the
        # skip is lifted.
        X, _ = make_blobs(n_samples=500000,
                          n_features=1000, random_state=0)

    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        # Not reached while the skip above is in place. ``np.prod`` is used
        # because the ``np.product`` alias was removed in NumPy 2.0.
        shape = 5000, 100
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)

    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    # One column per feature; built unconditionally because the cuDF frame
    # is also the expected value in the final assertion.
    X_pd = pd.DataFrame(
        {'fea%d' % i: X[:, i] for i in range(X.shape[1])})
    X_cudf = cudf.DataFrame.from_pandas(X_pd)
    cutsvd = cuTSVD(n_components=1)

    if input_type == 'dataframe':
        Xcutsvd = cutsvd.fit_transform(X_cudf)

    else:
        Xcutsvd = cutsvd.fit_transform(X)

    input_gdf = cutsvd.inverse_transform(Xcutsvd)

    cutsvd.handle.sync()

    assert array_equal(input_gdf, X_cudf, 0.4, with_sign=True)
Ejemplo n.º 3
0
def test_tsvd_fit(datatype, input_type):
    """Fit ``skTSVD`` and ``cuTSVD`` on six 2-D points and compare attributes.

    ``input_type == 'dataframe'`` feeds ``cuTSVD`` a cuDF frame holding the
    same values column by column; any other value feeds it the NumPy array.
    ``components_`` is compared with ``with_sign=False``, the remaining
    attributes with ``with_sign=True``.
    """
    points = [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]
    X = np.array(points, dtype=datatype)

    reference = skTSVD(n_components=1)
    reference.fit(X)

    model = cuTSVD(n_components=1)

    if input_type != 'dataframe':
        model.fit(X)
    else:
        frame = cudf.DataFrame()
        frame['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
        frame['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
        model.fit(frame)

    # (attribute name, with_sign flag handed to array_equal)
    checks = (
        ('singular_values_', True),
        ('components_', False),
        ('explained_variance_ratio_', True),
    )
    for attr_name, signed in checks:
        actual = getattr(model, attr_name)
        expected = getattr(reference, attr_name)
        assert array_equal(actual, expected, 0.4, with_sign=signed)
Ejemplo n.º 4
0
def test_tsvd_fit(datatype, name, use_handle):
    """Fit ``cuTSVD`` and compare its fitted attributes with ``skTSVD``.

    Parameters
    ----------
    datatype
        Not read by this test body (presumably supplied by a parametrize
        decorator outside this view -- TODO confirm whether the input
        should be cast to it).
    name
        Dataset selector. ``'blobs'`` builds a large ``make_blobs`` dataset
        and only fits ``cuTSVD`` (no reference comparison); ``'random'`` is
        skipped; any other value uses a small random normal dataset.
    use_handle
        Forwarded to ``get_handle``; the returned handle is passed to
        ``cuTSVD``.
    """
    if name == 'blobs':
        X, _ = make_blobs(n_samples=500000,
                          n_features=1000, random_state=0)

    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        # Not reached while the skip above is in place; kept so the dataset
        # can be restored once the skip is lifted. ``np.prod`` is used
        # because the ``np.product`` alias was removed in NumPy 2.0.
        shape = 5000, 100
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)

    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    # The reference model is only fitted for datasets that are compared.
    compare_with_reference = name != 'blobs'
    if compare_with_reference:
        sktsvd = skTSVD(n_components=1)
        sktsvd.fit(X)

    # Only the handle is needed here; the second returned value is unused.
    handle, _ = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)

    cutsvd.fit(X)
    cutsvd.handle.sync()

    if compare_with_reference:
        for attr in ['singular_values_', 'components_',
                     'explained_variance_ratio_']:
            # ``components_`` is the only attribute compared with
            # ``with_sign=False``.
            with_sign = attr != 'components_'
            assert array_equal(getattr(cutsvd, attr), getattr(sktsvd, attr),
                               0.4, with_sign=with_sign)
Ejemplo n.º 5
0
def test_tsvd_inverse_transform(datatype, name, use_handle):
    """Round-trip data through ``cuTSVD`` and compare it with the input.

    The data is passed through ``fit_transform`` and then
    ``inverse_transform``; the result is compared with the original array
    via ``array_equal(..., 0.4, with_sign=True)``.

    Parameters
    ----------
    datatype
        Not read by this test body.
    name
        Dataset selector; ``'blobs'`` and ``'random'`` are both skipped, any
        other value uses a small random normal dataset.
    use_handle
        Not read by this test body -- ``cuTSVD`` is created without an
        explicit handle. NOTE(review): confirm whether this is intended.
    """
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        # Not reached while the skip above is in place; kept for when the
        # skip is lifted.
        X, _ = make_blobs(n_samples=500000,
                          n_features=1000, random_state=0)

    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        # Not reached while the skip above is in place. ``np.prod`` is used
        # because the ``np.product`` alias was removed in NumPy 2.0.
        shape = 5000, 100
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)

    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    cutsvd = cuTSVD(n_components=1)
    Xcutsvd = cutsvd.fit_transform(X)
    input_gdf = cutsvd.inverse_transform(Xcutsvd)

    cutsvd.handle.sync()
    assert array_equal(input_gdf, X, 0.4, with_sign=True)
Ejemplo n.º 6
0
def test_pca_inverse_transform(datatype):
    """Round-trip a two-column cuDF frame through ``cuTSVD``.

    The frame is passed through ``fit_transform`` and then
    ``inverse_transform``; the result is compared with the original frame
    via ``array_equal(..., 0.4, with_sign=True)``.

    NOTE(review): the function name says "pca" but the body exercises
    ``cuTSVD``.
    """
    columns = {
        '0': [-1, -2, -3, 1, 2, 3],
        '1': [-1, -1, -2, 1, 1, 2],
    }
    gdf = cudf.DataFrame()
    for label, values in columns.items():
        gdf[label] = np.asarray(values, dtype=datatype)

    model = cuTSVD(n_components=1)
    reduced = model.fit_transform(gdf)

    print("Calling inverse_transform")
    restored = model.inverse_transform(reduced)
    print(restored)
    assert array_equal(restored, gdf, 0.4, with_sign=True)
Ejemplo n.º 7
0
def test_pca_fit_transform(datatype):
    """Compare ``cuTSVD.fit_transform`` on cuDF input with ``skTSVD``.

    ``cuTSVD`` receives a two-column cuDF frame and ``skTSVD`` receives a
    NumPy array holding the same six points; the two transforms are
    compared via ``array_equal(..., 1e-3, with_sign=False)``.

    NOTE(review): the function name says "pca" but the body exercises
    ``cuTSVD`` / ``skTSVD``.
    """
    first_col = [-1, -2, -3, 1, 2, 3]
    second_col = [-1, -1, -2, 1, 1, 2]

    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray(first_col, dtype=datatype)
    gdf['1'] = np.asarray(second_col, dtype=datatype)

    points = [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]
    X = np.array(points, dtype=datatype)

    print("Calling fit_transform")
    gpu_model = cuTSVD(n_components=1)
    gpu_result = gpu_model.fit_transform(gdf)

    reference = skTSVD(n_components=1)
    reference_result = reference.fit_transform(X)

    assert array_equal(gpu_result, reference_result, 1e-3, with_sign=False)
Ejemplo n.º 8
0
def test_tsvd_inverse_transform(datatype, input_type):
    """Round-trip six 2-D points through ``cuTSVD``.

    ``input_type == 'dataframe'`` fits on a cuDF frame; any other value
    fits on a NumPy array with the same values. The output of
    ``inverse_transform`` is always compared with the cuDF frame via
    ``array_equal(..., 0.4, with_sign=True)``.
    """
    # The cuDF frame is built in every case: it is the expected value of
    # the final comparison.
    gdf = cudf.DataFrame()
    for label, values in (('0', [-1, -2, -3, 1, 2, 3]),
                          ('1', [-1, -1, -2, 1, 1, 2])):
        gdf[label] = np.asarray(values, dtype=datatype)

    model = cuTSVD(n_components=1)

    if input_type != 'dataframe':
        points = [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]
        X = np.array(points, dtype=datatype)
        reduced = model.fit_transform(X)
    else:
        reduced = model.fit_transform(gdf)

    restored = model.inverse_transform(reduced)
    assert array_equal(restored, gdf, 0.4, with_sign=True)
Ejemplo n.º 9
0
def test_tsvd_fit_transform(datatype, input_type):
    """Compare ``cuTSVD.fit_transform`` with ``skTSVD.fit_transform``.

    ``skTSVD`` always receives the NumPy array. ``cuTSVD`` receives a cuDF
    frame with the same values when ``input_type == 'dataframe'`` and the
    NumPy array otherwise. The results are compared via
    ``array_equal(..., 1e-3, with_sign=True)``.
    """
    points = [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]
    X = np.array(points, dtype=datatype)

    reference = skTSVD(n_components=1)
    expected = reference.fit_transform(X)

    model = cuTSVD(n_components=1)

    if input_type != 'dataframe':
        actual = model.fit_transform(X)
    else:
        frame = cudf.DataFrame()
        frame['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
        frame['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
        actual = model.fit_transform(frame)

    assert array_equal(actual, expected, 1e-3, with_sign=True)
Ejemplo n.º 10
0
def test_tsvd_fit(datatype, input_type,
                  name, use_handle):
    """Fit ``cuTSVD`` and compare its fitted attributes with ``skTSVD``.

    Parameters
    ----------
    datatype
        dtype of the small hand-written dataset used when ``name`` is
        neither ``'blobs'`` nor ``'random'``.
    input_type
        ``'dataframe'`` converts the data to a cuDF frame (via pandas)
        before fitting ``cuTSVD``; any other value fits on the NumPy array.
    name
        Dataset selector. ``'blobs'`` builds a large ``make_blobs`` dataset
        and only fits ``cuTSVD`` (no reference comparison); ``'random'`` is
        skipped.
    use_handle
        Forwarded to ``get_handle``; the returned handle is passed to
        ``cuTSVD``.
    """
    if name == 'blobs':
        X, _ = make_blobs(n_samples=500000,
                          n_features=1000, random_state=0)

    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        # Not reached while the skip above is in place; kept so the dataset
        # can be restored once the skip is lifted. ``np.prod`` is used
        # because the ``np.product`` alias was removed in NumPy 2.0.
        shape = 5000, 100
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)

    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    # The reference model is only fitted for datasets that are compared.
    compare_with_reference = name != 'blobs'
    if compare_with_reference:
        sktsvd = skTSVD(n_components=1)
        sktsvd.fit(X)

    # Only the handle is needed here; the second returned value is unused.
    handle, _ = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)

    if input_type == 'dataframe':
        # One column per feature. A separate name is used so ``X`` keeps
        # referring to the NumPy array.
        X_pd = pd.DataFrame(
            {'fea%d' % i: X[:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_pd)
        cutsvd.fit(X_cudf)

    else:
        cutsvd.fit(X)

    cutsvd.handle.sync()

    if compare_with_reference:
        for attr in ['singular_values_', 'components_',
                     'explained_variance_ratio_']:
            # ``components_`` is the only attribute compared with
            # ``with_sign=False``.
            with_sign = attr != 'components_'
            assert array_equal(getattr(cutsvd, attr), getattr(sktsvd, attr),
                               0.4, with_sign=with_sign)