Example #1
0
def test_validity_add(nelem):
    """Check that adding two masked Series combines values and validity.

    Two operands of ``nelem`` random values each get a random bitmask.  An
    element of the sum is expected to be valid only where both operands are
    valid (bitwise AND of the two bitmasks).  Null results are replaced by a
    sentinel so the outcome can be compared with a plain NumPy reference.
    """
    # LHS
    lhs_data = np.random.random(nelem)
    lhs_mask = utils.random_bitmask(nelem)
    lhs_bitmask = utils.expand_bits_to_bytes(lhs_mask)
    lhs_null_count = utils.count_zero(lhs_bitmask)
    lhs = Series.from_masked_array(lhs_data, lhs_mask, lhs_null_count)
    # RHS
    rhs_data = np.random.random(nelem)
    rhs_mask = utils.random_bitmask(nelem)
    rhs_bitmask = utils.expand_bits_to_bytes(rhs_mask)
    rhs_null_count = utils.count_zero(rhs_bitmask)
    rhs = Series.from_masked_array(rhs_data, rhs_mask, rhs_null_count)
    # Result
    res = lhs + rhs
    # ``np.bool`` was removed from NumPy; ``np.bool_`` is the scalar type.
    # The expanded mask is trimmed to ``nelem`` entries before use.
    res_mask = np.asarray(utils.expand_bits_to_bytes(lhs_mask & rhs_mask),
                          dtype=np.bool_)[:nelem]
    # Fill NA values
    na_value = -10000
    got = res.fillna(na_value).to_array()
    expect = lhs_data + rhs_data
    expect[~res_mask] = na_value
    # Check
    print('expect')
    print(expect)
    print('got')
    print(got)

    np.testing.assert_array_equal(expect, got)
Example #2
0
def test_series_scale():
    """Series.scale() must match a NumPy min-max rescaling of the data."""
    values = np.random.randint(low=-10, high=10, size=100)
    series = Series(values)

    lowest, highest = values.min(), values.max()
    expected = (values - lowest) / (highest - lowest)
    # Sanity-check the reference itself before using it.
    assert expected.min() == 0
    assert expected.max() == 1
    np.testing.assert_equal(series.scale().to_array(), expected)
Example #3
0
def test_series_unique():
    """Exercise Series.unique_k on masked data and on an overflowing input."""
    for exponent in range(5):
        values = np.random.randint(low=-1, high=10, size=10 ** exponent)
        valid = values != -1  # entries equal to -1 are masked out
        masked = Series.from_masked_array(values, Series(valid).as_mask())
        expected = set(values[valid])
        assert expected == set(masked.unique_k(k=10).to_array())
    # test out of space
    overflowing = Series(np.arange(10))
    with pytest.raises(ValueError) as excinfo:
        overflowing.unique_k(k=7)
    excinfo.match('too many unique value')
Example #4
0
def test_categorical_unary_ceil():
    """``ceil`` must be rejected on categorical data by pandas and Series."""
    cat = pd.Categorical(list('aabca'), categories=list('abc'))
    pandas_series = pd.Series(cat)
    gpu_series = Series(cat)

    # pandas has no such method on a Series at all.
    with pytest.raises(AttributeError) as excinfo:
        pandas_series.ceil()
    excinfo.match(r'''no attribute ['"]ceil['"]''')

    # The Series under test has the method but must refuse categoricals.
    with pytest.raises(TypeError) as excinfo:
        gpu_series.ceil()
    excinfo.match('Categorical cannot perform the operation: ceil')
Example #5
0
def test_categorical_empty():
    """An empty categorical must convert with matching codes and dtype."""
    empty = pd.Categorical([])
    reference = pd.Series(empty)
    series = Series(empty)
    np.testing.assert_array_equal(empty.codes, series.to_array())
    assert series.dtype == reference.dtype

    # Test attributes
    assert tuple(reference.cat.categories) == tuple(series.cat.categories)
    assert reference.cat.ordered == series.cat.ordered

    # Codes must agree in both content and dtype.
    np.testing.assert_array_equal(reference.cat.codes.data,
                                  series.cat.codes.to_array())
    np.testing.assert_array_equal(reference.cat.codes.dtype,
                                  series.cat.codes.dtype)
Example #6
0
def test_series_unique():
    """Compare unique(), unique_count() and value_counts() with pandas."""
    for exponent in range(5):
        values = np.random.randint(low=-1, high=10, size=10 ** exponent)
        valid = values != -1  # entries equal to -1 are masked out
        masked = Series.from_masked_array(values, Series(valid).as_mask())
        kept = values[valid]
        assert set(kept) == set(masked.unique().to_array())
        assert len(set(kept)) == masked.unique_count()
        # Reference counts come from pandas on the valid values only.
        frame = pd.DataFrame(data=kept, columns=['col'])
        expect = frame.col.value_counts().sort_index()
        got = masked.value_counts().to_pandas().sort_index()
        print(expect.head())
        print(got.head())
        assert got.equals(expect)
Example #7
0
def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype):
    """Comparing Series of different dtypes must match the NumPy result.

    ``cmpop`` is called as ``cmpop(lhs, rhs)`` on both the NumPy arrays and
    the Series built from them.  ``lhs_dtype`` and ``rhs_dtype`` are the
    dtypes the two random operands are cast to.
    """
    print(cmpop, lhs_dtype, rhs_dtype)
    nelem = 5
    lhs = (np.random.random(nelem) * nelem).astype(lhs_dtype)
    rhs = (np.random.random(nelem) * nelem).astype(rhs_dtype)
    print(lhs)
    print(rhs)
    exp = cmpop(lhs, rhs)
    got = cmpop(Series(lhs), Series(rhs)).to_array()
    print('exp', exp)
    print('got', got)

    # Compare the values that were printed, not a second evaluation, so a
    # failure report always matches the diagnostic output above.
    np.testing.assert_array_equal(got, exp)
Example #8
0
def test_series(data):
    """A Series built from a pandas Series must convert to the same array."""
    reference = pd.Series(data.copy())
    converted = Series(reference)
    np.testing.assert_equal(np.array(reference), np.array(converted))
Example #9
0
def test_categorical_compare_unordered(data):
    """Comparison rules for an unordered categorical on a dask Series.

    Equality and inequality must work, while an ordering comparison must be
    rejected by both pandas and the dask-backed Series.
    """
    categorical = data.copy()
    pdsr = pd.Series(categorical)
    dsr = dgd.from_pygdf(Series(categorical), npartitions=2)

    # Test equality
    equal = dsr == dsr
    assert equal.dtype == np.bool_
    assert np.all(equal.compute())
    assert np.all(pdsr == pdsr)

    # Test inequality
    unequal = dsr != dsr
    assert not np.any(unequal.compute())
    assert not np.any(pdsr != pdsr)

    # Neither side may report itself as ordered.
    assert not dsr.cat.ordered.compute()
    assert not pdsr.cat.ordered

    rejected = (TypeError, ValueError)
    message = "Unordered Categoricals can only compare equality or not"

    with pytest.raises(rejected) as excinfo:
        pdsr < pdsr
    excinfo.match(message)

    with pytest.raises(rejected) as excinfo:
        dsr < dsr
    excinfo.match(message)
Example #10
0
def test_categorical_value_counts(num_elements):
    """value_counts() on a categorical column must agree with pandas.

    ``num_elements`` is the number of random single-character labels drawn.
    """
    from string import ascii_letters, digits

    # create categorical series
    np.random.seed(12)
    alphabet = list(ascii_letters + digits)
    labels = pd.Series(np.random.choice(alphabet, num_elements),
                       dtype='category')
    pd_cat = pd.Categorical(labels)

    # gdf
    gdf = DataFrame()
    gdf['a'] = Series.from_categorical(pd_cat)
    gdf_counts = gdf['a'].value_counts()

    # pandas
    pdf = pd.DataFrame()
    pdf['a'] = pd_cat
    pdf_counts = pdf['a'].value_counts()

    # verify: compare as plain dicts so ordering is irrelevant
    expected = pdf_counts.to_dict()
    actual = gdf_counts.to_pandas().to_dict()
    assert expected == actual
Example #11
0
def test_series_std(dtype):
    """Series.std() must agree with ``ndarray.std`` for the given ``dtype``.

    The samples come from ``np.random.random``.  For integer dtypes they are
    scaled by 100 first so the cast does not truncate every value to zero.
    """
    arr = np.random.random(100)
    # Branch on the *target* dtype: ``arr`` is always float64 at this point,
    # so inspecting ``arr.dtype`` would never select the integer branch.
    if np.issubdtype(dtype, np.integer):
        arr *= 100
    arr = arr.astype(dtype)
    sr = Series.from_any(arr)
    np.testing.assert_almost_equal(arr.std(), sr.std())
Example #12
0
def test_categorical_masking():
    """
    Test the common operation of selecting all rows that match a certain
    category.
    """
    cat = pd.Categorical(list('aabca'), categories=list('abc'))
    pdsr = pd.Series(cat)
    sr = Series(cat)

    # check scalar comparison
    expect_matches = pdsr == 'a'
    got_matches = sr == 'a'

    for banner, shown in (('---expect_matches---', expect_matches),
                          ('---got_matches---', got_matches)):
        print(banner)
        print(shown)
    np.testing.assert_array_equal(expect_matches.values,
                                  got_matches.to_array())

    # mask series
    expect_masked = pdsr[expect_matches]
    got_masked = sr[got_matches]

    for banner, shown in (('---expect_masked---', expect_masked),
                          ('---got_masked---', got_masked)):
        print(banner)
        print(shown)

    expected_length = len(expect_masked)
    assert expected_length == len(got_masked)
    assert expected_length == got_masked.valid_count
    assert list(expect_masked) == list(got_masked)
Example #13
0
def test_validity_ceil(nelem):
    """ceil() on a masked Series must keep nulls and round valid values up.

    ``nelem`` is the number of random elements in the Series.
    """
    # Data
    values = np.random.random(nelem) * 100
    packed_mask = utils.random_bitmask(nelem)
    byte_mask = utils.expand_bits_to_bytes(packed_mask)
    series = Series.from_masked_array(values, packed_mask,
                                      utils.count_zero(byte_mask))

    # Result
    ceiled = series.ceil()

    sentinel = -100000
    got = ceiled.fillna(sentinel).to_array()

    # Reference: NumPy ceil with the sentinel written over the null slots.
    valid = np.asarray(byte_mask, dtype=np.bool_)[:values.size]
    expect = np.ceil(values)
    expect[~valid] = sentinel

    # Check
    print('expect')
    print(expect)
    print('got')
    print(got)

    np.testing.assert_array_equal(expect, got)
Example #14
0
def test_categorical_compare_unordered():
    """Comparison rules for an unordered categorical Series.

    Equality and inequality must work element-wise, while an ordering
    comparison must raise ``TypeError`` for both pandas and the Series.
    """
    cat = pd.Categorical(list('aabca'), categories=list('abc'))
    pdsr = pd.Series(cat)

    sr = Series.from_any(cat)

    # test equal
    equal = sr == sr
    assert equal.dtype == np.bool_
    assert type(equal[0]) == np.bool_
    assert np.all(equal)
    assert np.all(pdsr == pdsr)

    # test inequal
    unequal = sr != sr
    assert not np.any(unequal)
    assert not np.any(pdsr != pdsr)

    assert not pdsr.cat.ordered
    assert not sr.cat.ordered

    # test using ordered operators
    message = "Unordered Categoricals can only compare equality or not"

    with pytest.raises(TypeError) as excinfo:
        pdsr < pdsr
    excinfo.match(message)

    with pytest.raises(TypeError) as excinfo:
        sr < sr
    excinfo.match(message)
Example #15
0
def test_validity_ceil(nelem):
    """Verify that ceil() respects the validity mask of a Series.

    ``nelem`` random values are paired with a random bitmask.  After
    ``ceil`` the null slots are filled with a sentinel and the result is
    compared with ``np.ceil`` treated the same way.
    """
    # Data
    raw = np.random.random(nelem) * 100
    bits = utils.random_bitmask(nelem)
    expanded = utils.expand_bits_to_bytes(bits)
    n_null = utils.count_zero(expanded)
    masked = Series.from_masked_array(raw, bits, n_null)

    # Result
    na_value = -100000
    got = masked.ceil().fillna(na_value).to_array()

    # The expanded mask is trimmed to the data length before use.
    is_valid = np.asarray(expanded, dtype=np.bool_)[:raw.size]
    expect = np.ceil(raw)
    expect[~is_valid] = na_value

    # Check
    print('expect')
    print(expect)
    print('got')
    print(got)

    np.testing.assert_array_equal(expect, got)
Example #16
0
def test_categorical_basic(data):
    """Basic attributes of a categorical Series survive a dask round trip."""
    cat = data.copy()
    pdsr = pd.Series(cat)
    dsr = dgd.from_pygdf(Series(cat), npartitions=2)
    computed = dsr.compute()
    np.testing.assert_array_equal(cat.codes, computed.to_array())
    assert dsr.dtype == pdsr.dtype

    # Test attributes
    assert pdsr.cat.ordered == dsr.cat.ordered.compute()
    # TODO: Investigate dsr.cat.categories: It raises
    # ValueError: Expected iterable of tuples of (name, dtype),
    # got ('a', 'b', 'c')
    # assert(tuple(pdsr.cat.categories) == tuple(dsr.cat.categories))

    pandas_codes = pdsr.cat.codes
    np.testing.assert_array_equal(pandas_codes.data, computed.to_array())
    np.testing.assert_array_equal(pandas_codes.dtype, dsr.cat.codes.dtype)

    expect_str = """
0 a
1 a
2 b
3 c
4 a
"""
    # Token-wise comparison, so differences in whitespace are ignored.
    actual_tokens = str(computed).split()
    expected_tokens = expect_str.split()
    assert all(x == y for x, y in zip(actual_tokens, expected_tokens))
Example #17
0
def test_dt_series(data, field):
    """A ``.dt`` field of a dask Series must equal the pandas one.

    ``field`` names the attribute read from the ``.dt`` accessor.
    """
    pd_data = pd.Series(data.copy())
    dask_gdf_data = dgd.from_pygdf(Series(pd_data), npartitions=5)
    expected = getattr(pd_data.dt, field)
    computed = getattr(dask_gdf_data.dt, field).compute()
    # The computed result is cast to int64 before comparing.
    actual = computed.to_pandas().astype('int64')
    assert_series_equal(expected, actual)
Example #18
0
def test_dt_series(data, field):
    """Compare one ``.dt`` accessor field between pandas and a dask Series.

    ``field`` is the name of the attribute looked up on ``.dt``.
    """
    reference = pd.Series(data.copy())
    distributed = dgd.from_pygdf(Series(reference), npartitions=5)
    base = getattr(reference.dt, field)
    test = (
        getattr(distributed.dt, field)
        .compute()
        .to_pandas()
        .astype('int64')
    )
    assert_series_equal(base, test)
Example #19
0
def test_dataframe_to_string():
    """Check DataFrame rendering: truncation notices and masked cells."""
    with set_options(formatting={'nrows': 5, 'ncols': 8}):
        # Test basic
        df = DataFrame([('a', list(range(1, 7))),
                        ('b', list(range(11, 17)))])
        rendered = str(df)
        print(rendered)
        assert rendered.splitlines()[-1] == '[1 more rows]'

        # Test skipped columns
        df = DataFrame([('a', list(range(1, 7))),
                        ('b', list(range(11, 17))),
                        ('c', list(range(11, 17))),
                        ('d', list(range(11, 17)))])
        rendered = df.to_string(ncols=3)
        print(rendered)
        tail = rendered.splitlines()
        assert tail[-2] == '[1 more rows]'
        assert tail[-1] == '[1 more columns]'

        # Test masked
        df = DataFrame([('a', list(range(1, 7))),
                        ('b', list(range(11, 17)))])

        data = np.arange(6)
        # Bits 0, 2, 3 and 5 are set, so rows 1 and 4 are null.
        mask = np.array([0b00101101], dtype=np.uint8)

        masked = Series.from_masked_array(data, mask)
        assert masked.null_count == 2
        df['c'] = masked

        # check data
        values = list(masked)
        validids = [0, 2, 3, 5]
        np.testing.assert_equal(data[validids], masked.to_array())
        # valid position is correct
        for idx in validids:
            assert data[idx] == values[idx]
        # null position is correct
        for idx, value in enumerate(values):
            if idx not in validids:
                assert value is None

        got = df.to_string(nrows=None)
        print(got)
        expect = '''
  a b  c
0 1 11 0
1 2 12
2 3 13 2
3 4 14 3
4 5 15
5 6 16 5
'''
        # values should match despite whitespace difference
        assert got.split() == expect.split()
Example #20
0
def test_series(data):
    """A Series split into dask partitions must compute back to the input."""
    pd_data = pd.Series(data.copy())
    dask_gdf_data = dgd.from_pygdf(Series(pd_data), npartitions=5)

    expected = np.array(pd_data)
    actual = np.array(dask_gdf_data.compute())
    np.testing.assert_equal(expected, actual)
Example #21
0
def test_series(data):
    """Round-trip pandas -> Series -> dask and compare the computed values."""
    reference = pd.Series(data.copy())
    partitioned = dgd.from_pygdf(Series(reference), npartitions=5)

    np.testing.assert_equal(np.array(reference),
                            np.array(partitioned.compute()))
Example #22
0
def test_series_scale():
    """scale() on a Series from ``from_any`` must equal a min-max rescale."""
    samples = np.random.randint(low=-10, high=10, size=100)
    series = Series.from_any(samples)

    floor_value = samples.min()
    span = samples.max() - floor_value
    reference = (samples - floor_value) / span
    # Sanity-check the reference itself before using it.
    assert reference.min() == 0
    assert reference.max() == 1
    np.testing.assert_equal(series.scale().to_array(), reference)
Example #23
0
def test_categorical_element_indexing():
    """
    Iterating over a categorical column must yield the category values
    themselves, not their numerical codes.
    """
    cat = pd.Categorical(list('aabca'), categories=list('abc'))
    reference = pd.Series(cat)
    series = Series(cat)
    assert list(reference) == list(series)
    # The codes are still reachable through the ``cat`` accessor.
    assert list(reference.cat.codes) == list(series.cat.codes)
Example #24
0
def test_to_dense_array():
    """to_array() drops nulls unless asked to fill them."""
    values = np.random.random(8)
    # 0b11010110 has three zero bits, matching null_count=3 below.
    bitmask = np.asarray([0b11010110], dtype=np.byte)

    series = Series.from_masked_array(data=values, mask=bitmask,
                                      null_count=3)
    assert series.null_count > 0
    assert series.null_count != len(series)
    filled = series.to_array(fillna='pandas')
    dense = series.to_array()
    # The dense form is shorter; the filled form keeps every row.
    assert dense.size < filled.size
    assert filled.size == len(series)
Example #25
0
def test_fillna():
    """fillna() must return a null-free Series of the same length."""
    schema, darr = read_data()
    reader = GpuArrowReader(schema, darr)
    column = reader[8]
    # The test is only meaningful if the chosen column has nulls.
    assert column.null_count
    series = Series.from_masked_array(data=column.data, mask=column.null,
                                      null_count=column.null_count)
    filled = series.fillna(123)
    np.testing.assert_equal(123, filled.to_array())
    assert len(filled) == len(series)
    assert not filled.has_null_mask
Example #26
0
def test_series_reductions(method, dtype):
    """A reduction on a masked Series must match NumPy on the valid values.

    ``method`` is the name of the reduction, looked up on both the NumPy
    array and the Series.  ``dtype`` is the element type of the data.
    """
    np.random.seed(0)
    arr = np.random.random(100)
    integer_case = np.issubdtype(dtype, np.integer)
    if integer_case:
        # Scale before the cast so integers do not all truncate to zero.
        arr *= 100
    mask = arr > (10 if integer_case else 0.5)

    arr = arr.astype(dtype)
    valid_only = arr[mask]
    sr = Series.from_masked_array(arr, Series(mask).as_mask())

    expect = getattr(valid_only, method)()
    got = getattr(sr, method)()
    print(expect, got)
    np.testing.assert_approx_equal(expect, got)
Example #27
0
def test_to_dense_array():
    """Compare the dense and the null-filled array forms of a Series."""
    data = np.random.random(8)
    mask = np.asarray([0b11010110], dtype=np.byte)  # three zero bits

    sr = Series.from_masked_array(data=data, mask=mask, null_count=3)
    # The Series must be partially, not fully, null.
    n_null = sr.null_count
    assert n_null > 0
    assert n_null != len(sr)

    filled_size = sr.to_array(fillna='pandas').size
    dense_size = sr.to_array().size
    assert dense_size < filled_size
    assert filled_size == len(sr)
Example #28
0
def test_categorical_integer():
    """Values outside the declared categories must become nulls."""
    cat = pd.Categorical(list('a__ca'), categories=list('abc'))
    pdsr = pd.Series(cat)
    sr = Series(cat)
    np.testing.assert_array_equal(cat.codes, sr.to_array(fillna='pandas'))
    # The two '_' entries are not categories, hence two nulls.
    assert sr.null_count == 2

    gpu_codes = sr.cat.codes
    np.testing.assert_array_equal(pdsr.cat.codes.data,
                                  gpu_codes.fillna(-1).to_array())
    np.testing.assert_equal(pdsr.cat.codes.dtype, sr.cat.codes.dtype)

    expect_str = """
0 a
1
2
3 c
4 a
"""
    # Token-wise comparison, so differences in whitespace are ignored.
    assert str(sr).split() == expect_str.split()
Example #29
0
def test_fillna():
    """Filling nulls must yield a Series without a null mask."""
    schema, darr = read_data()
    masked_col = GpuArrowReader(schema, darr)[8]
    assert masked_col.null_count  # the column must contain nulls
    sr = Series.from_masked_array(
        data=masked_col.data,
        mask=masked_col.null,
        null_count=masked_col.null_count,
    )
    dense = sr.fillna(123)
    np.testing.assert_equal(123, dense.to_array())
    # Length is unchanged and no null mask is left behind.
    assert len(dense) == len(sr)
    assert not dense.has_null_mask
Example #30
0
def test_categorical_missing():
    """Missing categorical entries come back as NaN when filled pandas-style.
    """
    def nan_to_minus_one(array):
        # pandas encodes a missing category as code -1.
        return np.where(np.isnan(array), -1, array)

    cat = pd.Categorical(list('a__ca'), categories=list('abc'))
    pdsr = pd.Series(cat)
    sr = Series(cat)
    np.testing.assert_array_equal(
        cat.codes, nan_to_minus_one(sr.to_array(fillna='pandas')))
    assert sr.null_count == 2

    np.testing.assert_array_equal(
        pdsr.cat.codes.data,
        nan_to_minus_one(sr.cat.codes.to_array(fillna='pandas')))
    np.testing.assert_equal(pdsr.cat.codes.dtype, sr.cat.codes.dtype)

    expect_str = """
0 a
1
2
3 c
4 a
"""
    # Token-wise comparison, so differences in whitespace are ignored.
    assert str(sr).split() == expect_str.split()
Example #31
0
def test_categorical_binary_add():
    """Adding two categorical Series must raise ``TypeError``.

    Both pandas and the Series under test are expected to reject the
    operation, each with its own error message.
    """
    cat = pd.Categorical(['a', 'a', 'b', 'c', 'a'], categories=['a', 'b', 'c'])
    pdsr = pd.Series(cat)
    sr = Series.from_any(cat)

    with pytest.raises(TypeError) as raises:
        pdsr + pdsr
    # Raw string: ``\+`` is a regex escape, not a valid Python string escape.
    raises.match(r'Categorical cannot perform the operation \+')

    with pytest.raises(TypeError) as raises:
        sr + sr
    raises.match('Categorical cannot perform the operation: add')
Example #32
0
def test_nonmatching_index_setitem(nrows):
    """Assigning a plain array to a re-indexed DataFrame adopts its index.

    ``nrows`` is the number of rows in the frame and in the new column.
    """
    np.random.seed(0)
    upper = 2147483647

    gdf = DataFrame()
    for column in ('a', 'b'):
        gdf[column] = np.random.randint(upper, size=nrows)
    gdf = gdf.set_index('b')

    test_values = np.random.randint(upper, size=nrows)
    gdf['c'] = test_values
    assert len(test_values) == len(gdf['c'])
    # Reference: the same values explicitly placed on the frame's index.
    expected = Series(test_values).set_index(gdf._index).to_pandas()
    assert gdf['c'].to_pandas().equals(expected)
Example #33
0
def test_categorical_basic():
    """Codes, dtype, attributes and rendering of a categorical Series."""
    cat = pd.Categorical(list('aabca'), categories=list('abc'))
    reference = pd.Series(cat)
    series = Series(cat)
    np.testing.assert_array_equal(cat.codes, series.to_array())
    assert series.dtype == reference.dtype

    # Test attributes
    assert tuple(reference.cat.categories) == tuple(series.cat.categories)
    assert reference.cat.ordered == series.cat.ordered

    np.testing.assert_array_equal(reference.cat.codes.data,
                                  series.cat.codes.to_array())
    np.testing.assert_array_equal(reference.cat.codes.dtype,
                                  series.cat.codes.dtype)

    expect_str = """
0 a
1 a
2 b
3 c
4 a
"""
    # Token-wise comparison, so differences in whitespace are ignored.
    actual_tokens = str(series).split()
    expected_tokens = expect_str.split()
    assert all(x == y for x, y in zip(actual_tokens, expected_tokens))
Example #34
0
def test_series_indexing():
    """Chained slices of a Series must match NumPy and introduce no nulls."""
    source = np.arange(20)
    whole = Series(source)
    # Indexing
    head = whole[:12]
    assert head.null_count == 0
    np.testing.assert_equal(head.to_array(), source[:12])
    middle = head[3:]
    assert middle.null_count == 0
    np.testing.assert_equal(middle.to_array(), source[3:12])
    # Index with stride
    strided = middle[::2]
    assert strided.null_count == 0
    np.testing.assert_equal(strided.to_array(), source[3:12:2])
Example #35
0
def test_series_indexing():
    """Slicing a Series must match NumPy slicing and add no null mask."""
    base = np.arange(20)
    full = Series.from_any(base)
    # Indexing
    first_twelve = full[:12]
    assert not first_twelve.has_null_mask
    np.testing.assert_equal(first_twelve.to_array(), base[:12])
    from_three = first_twelve[3:]
    assert not from_three.has_null_mask
    np.testing.assert_equal(from_three.to_array(), base[3:12])
    # Index with stride
    every_other = from_three[::2]
    assert not every_other.has_null_mask
    np.testing.assert_equal(every_other.to_array(), base[3:12:2])
Example #36
0
def test_onehot_generic_index():
    """One-hot encoding must work on a column carrying a GenericIndex."""
    np.random.seed(0)
    size = 33
    indices = np.random.randint(low=0, high=100, size=size)
    values = np.random.randint(low=0, high=4, size=size)
    df = DataFrame()
    df['fo'] = Series(values, index=GenericIndex(indices))
    out = df.one_hot_encoding('fo',
                              cats=df.fo.unique(),
                              prefix='fo',
                              dtype=np.int32)
    assert set(out.columns) == {'fo', 'fo_0', 'fo_1', 'fo_2', 'fo_3'}
    # Each indicator column must flag exactly the rows holding its category.
    for category in range(4):
        indicator = getattr(out, 'fo_%d' % category)
        np.testing.assert_array_equal(values == category,
                                      indicator.to_array())
Example #37
0
def test_series_floor():
    """Series.floor() must agree element-wise with ``np.floor``."""
    samples = np.random.random(100) * 100
    floored = Series(samples).floor()
    np.testing.assert_equal(floored.to_array(), np.floor(samples))