def test_dataframe_to_string():
    with set_options(formatting={'nrows': 5, 'ncols': 8}):
        # Test basic
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16])])
        string = str(df)
        print(string)
        assert string.splitlines()[-1] == '[1 more rows]'

        # Test skipped columns
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16]),
                        ('c', [11, 12, 13, 14, 15, 16]),
                        ('d', [11, 12, 13, 14, 15, 16])])
        string = df.to_string(ncols=3)
        print(string)
        assert string.splitlines()[-2] == '[1 more rows]'
        assert string.splitlines()[-1] == '[1 more columns]'

        # Test masked
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16])])

        data = np.arange(6)
        mask = np.zeros(1, dtype=np.uint8)
        mask[0] = 0b00101101
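        # bits 0, 2, 3 and 5 are set (LSB first), so rows 1 and 4 are null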

        masked = Series.from_masked_array(data, mask)
        assert masked.null_count == 2
        df['c'] = masked

        # check data
        values = list(masked)
        validids = [0, 2, 3, 5]
        densearray = masked.to_array()
        np.testing.assert_equal(data[validids], densearray)
        # valid positions are correct
        for i in validids:
            assert data[i] == values[i]
        # null position is correct
        for i in range(len(values)):
            if i not in validids:
                assert values[i] is None

        got = df.to_string(nrows=None)
        print(got)
        expect = '''
  a b  c
0 1 11 0
1 2 12
2 3 13 2
3 4 14 3
4 5 15
5 6 16 5
'''
        # values should match despite whitespace difference
        assert got.split() == expect.split()
Example #2
def test_categorical_value_counts(num_elements):
    from string import ascii_letters, digits

    # create categorical series
    np.random.seed(12)
    pd_cat = pd.Categorical(
        pd.Series(
            np.random.choice(list(ascii_letters + digits), num_elements),
            dtype='category',
        )
    )

    # gdf
    gdf = DataFrame()
    gdf['a'] = Series.from_categorical(pd_cat)
    gdf_value_counts = gdf['a'].value_counts()

    # pandas
    pdf = pd.DataFrame()
    pdf['a'] = pd_cat
    pdf_value_counts = pdf['a'].value_counts()

    # verify
    pandas_dict = pdf_value_counts.to_dict()
    gdf_dict = gdf_value_counts.to_pandas().to_dict()

    assert pandas_dict == gdf_dict
Example #3
def test_dataframe_hash_partition(nrows, nparts, nkeys):
    np.random.seed(123)
    gdf = DataFrame()
    keycols = []
    for i in range(nkeys):
        keyname = 'key{}'.format(i)
        gdf[keyname] = np.random.randint(0, 7 - i, nrows)
        keycols.append(keyname)
    gdf['val1'] = np.random.randint(0, nrows * 2, nrows)

    got = gdf.partition_by_hash(keycols, nparts=nparts)
    # Must return a list
    assert isinstance(got, list)
    # Must have correct number of partitions
    assert len(got) == nparts
    # All partitions must be DataFrame type
    assert all(isinstance(p, DataFrame) for p in got)
    # Check that all partitions have unique keys
    part_unique_keys = set()
    for p in got:
        if len(p):
            # Take rows of the key columns and build a set of the key-values
            unique_keys = set(map(tuple, p.as_matrix(columns=keycols)))
            # Ensure that none of the key-values have occurred in other groups
            assert not (unique_keys & part_unique_keys)
            part_unique_keys |= unique_keys
    assert len(part_unique_keys)
Example #4
def test_dataframe_as_gpu_matrix_null_values():
    df = DataFrame()

    nelem = 123
    na = -10000

    refvalues = {}
    for k in 'abcd':
        df[k] = data = np.random.random(nelem)
        bitmask = utils.random_bitmask(nelem)
        df[k] = df[k].set_mask(bitmask)
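        # expand the packed bitmask to one boolean per row; rows with a 0 bit
        # are null, so poison them in the host reference copy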
        boolmask = np.asarray(utils.expand_bits_to_bytes(bitmask)[:nelem],
                              dtype=np.bool_)
        data[~boolmask] = na
        refvalues[k] = data

    # Check null value causes error
    with pytest.raises(ValueError) as raises:
        df.as_gpu_matrix()
    raises.match("column 'a' has null values")

    for k in df.columns:
        df[k] = df[k].fillna(na)

    mat = df.as_gpu_matrix().copy_to_host()
    for i, k in enumerate(df.columns):
        np.testing.assert_array_equal(refvalues[k], mat[:, i])
Example #5
def test_dataframe_setitem_from_masked_object():
    ary = np.random.randn(100)
    mask = np.zeros(100, dtype=bool)
    mask[:20] = True
    np.random.shuffle(mask)
    ary[mask] = np.nan
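    # ary now holds exactly 20 NaNs; Series should surface them as nulls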

    test1 = Series(ary)
    assert test1.has_null_mask
    assert test1.null_count == 20

    test2 = DataFrame.from_pandas(pd.DataFrame({'a': ary}))
    assert test2['a'].has_null_mask
    assert test2['a'].null_count == 20

    gpu_ary = cuda.to_device(ary)
    test3 = Series(gpu_ary)
    assert test3.has_null_mask
    assert test3.null_count == 20

    test4 = DataFrame()
    lst = [1, 2, None, 4, 5, 6, None, 8, 9]
    test4['lst'] = lst
    assert test4['lst'].has_null_mask
    assert test4['lst'].null_count == 2
Example #6
def test_dataframe_loc():
    df = DataFrame()
    size = 123
    df['a'] = ha = np.random.randint(low=0, high=100,
                                     size=size).astype(np.int32)
    df['b'] = hb = np.random.random(size).astype(np.float32)
    df['c'] = hc = np.random.randint(low=0, high=100,
                                     size=size).astype(np.int64)
    df['d'] = hd = np.random.random(size).astype(np.float64)

    begin = 117
    end = 122
    fewer = df.loc[begin:end, ['c', 'd', 'a']]
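    # .loc slicing is label-based and inclusive of the end label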
    assert len(fewer) == end - begin + 1
    assert fewer.columns == tuple(['c', 'd', 'a'])
    np.testing.assert_equal(fewer['a'].to_array(), ha[begin:end + 1])
    np.testing.assert_equal(fewer['c'].to_array(), hc[begin:end + 1])
    np.testing.assert_equal(fewer['d'].to_array(), hd[begin:end + 1])
    del fewer

    # Make int64 index
    offset = 50
    df2 = df[offset:]
    begin = 117
    end = 122
    fewer = df2.loc[begin:end, ['c', 'd', 'a']]
    assert len(fewer) == end - begin + 1
    assert fewer.columns == tuple(['c', 'd', 'a'])
    np.testing.assert_equal(fewer['a'].to_array(), ha[begin:end + 1])
    np.testing.assert_equal(fewer['c'].to_array(), hc[begin:end + 1])
    np.testing.assert_equal(fewer['d'].to_array(), hd[begin:end + 1])
Example #7
def test_label_encode_drop_one():
    random.seed(0)
    np.random.seed(0)

    df = DataFrame()

    # initialize data frame
    df['cats'] = np.random.randint(7, size=10, dtype=np.int32)
    vals = list(df['cats'].unique())
    # drop 1 randomly
    del vals[random.randrange(len(vals))]

    lab = dict(zip(vals, list(range(len(vals)))))

    # label encode series
    ncol = df['cats'].label_encoding(cats=vals, dtype='float32')
    arr = ncol.to_array()

    # verify labels of new column
    for i in range(arr.size):
        # assuming -1 is used for missing value
        np.testing.assert_equal(arr[i], lab.get(df.cats[i], -1))

    # label encode data frame
    df2 = df.label_encoding(column='cats', prefix='cats', cats=vals,
                            dtype='float32')

    assert df2.columns[0] == 'cats'
    assert df2.columns[1] == 'cats_labels'
Example #8
def test_dataframe_join_suffix():
    np.random.seed(0)

    df = DataFrame()
    for k in 'abc':
        df[k] = np.random.randint(0, 5, 5)

    left = df.set_index('a')
    right = df.set_index('c')
    with pytest.raises(ValueError) as raises:
        left.join(right)
    raises.match("there are overlapping columns but lsuffix"
                 " and rsuffix are not defined")

    got = left.join(right, lsuffix='_left', rsuffix='_right', sort=True)
    # Get expected value
    pddf = df.to_pandas()
    expect = pddf.set_index('a').join(pddf.set_index('c'),
                                      lsuffix='_left',
                                      rsuffix='_right')
    # Check
    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for k in expect.columns:
        _check_series(expect[k], got[k])
Example #9
def test_dataframe_join_how(aa, bb, how, method):
    df = DataFrame()
    df['a'] = aa
    df['b'] = bb

    def work_pandas(df):
        ts = timer()
        df1 = df.set_index('a')
        df2 = df.set_index('b')
        joined = df1.join(df2, how=how, sort=True)
        te = timer()
        print('timing', type(df), te - ts)
        return joined

    def work_gdf(df):
        ts = timer()
        df1 = df.set_index('a')
        df2 = df.set_index('b')
        joined = df1.join(df2, how=how, sort=True, method=method)
        te = timer()
        print('timing', type(df), te - ts)
        return joined

    expect = work_pandas(df.to_pandas())
    got = work_gdf(df)
    expecto = expect.copy()
    goto = got.copy()

    # Convert both sides to float64 with NaN for nulls so the values compare equal
    expectb = expect.b
    expecta = expect.a
    gotb = got.b
    gota = got.a
    got.drop_column('b')
    got.add_column('b', gotb.astype(np.float64).fillna(np.nan))
    got.drop_column('a')
    got.add_column('a', gota.astype(np.float64).fillna(np.nan))
    expect = expect.drop(['b'], axis=1)
    expect['b'] = expectb.astype(np.float64).fillna(np.nan)
    expect = expect.drop(['a'], axis=1)
    expect['a'] = expecta.astype(np.float64).fillna(np.nan)

    # print(expect)
    # print(got.to_string(nrows=None))

    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    if how != 'outer':
        pd.util.testing.assert_frame_equal(
            got.to_pandas().sort_values(['b', 'a']).reset_index(drop=True),
            expect.sort_values(['b', 'a']).reset_index(drop=True))
        # if(how=='right'):
        #     _sorted_check_series(expect['a'], expect['b'],
        #                          got['a'], got['b'])
        # else:
        #     _sorted_check_series(expect['b'], expect['a'], got['b'],
        #                          got['a'])
    else:
        _check_series(expecto['b'], goto['b'])
        _check_series(expecto['a'], goto['a'])
Example #10
def test_assign():
    gdf = DataFrame({'x': [1, 2, 3]})
    gdf2 = gdf.assign(y=gdf.x + 1)
    assert list(gdf.columns) == ['x']
    assert list(gdf2.columns) == ['x', 'y']

    np.testing.assert_equal(gdf2.y.to_array(), [2, 3, 4])
Example #11
def test_pickle_dataframe_numeric():
    np.random.seed(0)
    df = DataFrame()
    nelem = 10
    df['keys'] = np.arange(nelem, dtype=np.float64)
    df['vals'] = np.random.random(nelem)

    check_serialization(df)
Example #12
def test_dataframe_astype():
    df = DataFrame()
    data = np.asarray(range(10), dtype=np.int32)
    df['a'] = data
    assert df['a'].dtype == np.dtype(np.int32)
    df['b'] = df['a'].astype(np.float32)
    assert df['b'].dtype == np.dtype(np.float32)
    np.testing.assert_equal(df['a'].to_array(), df['b'].to_array())
Example #13
def test_pickle_dataframe_categorical():
    np.random.seed(0)

    df = DataFrame()
    df['keys'] = pd.Categorical(list('aaabababac'))
    df['vals'] = np.random.random(len(df))

    check_serialization(df)
Example #14
def test_to_records_noindex():
    df = DataFrame()
    df['a'] = aa = np.arange(10, dtype=np.int32)
    df['b'] = bb = np.arange(10, 20, dtype=np.float64)

    rec = df.to_records(index=False)
    assert rec.dtype.names == ('a', 'b')
    np.testing.assert_array_equal(rec['a'], aa)
    np.testing.assert_array_equal(rec['b'], bb)
Example #15
def test_dataframe_empty_concat():
    gdf1 = DataFrame()
    gdf1['a'] = []
    gdf1['b'] = []

    gdf2 = gdf1.copy()

    gdf3 = gd.concat([gdf1, gdf2])
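    # concatenating empty frames should keep zero rows but preserve both columns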
    assert len(gdf3) == 0
    assert len(gdf3.columns) == 2
Example #16
def test_dataframe_append_to_empty():
    pdf = pd.DataFrame()
    pdf['a'] = []
    pdf['b'] = [1, 2, 3]

    gdf = DataFrame()
    gdf['a'] = []
    gdf['b'] = [1, 2, 3]

    pd.testing.assert_frame_equal(gdf.to_pandas(), pdf)
Example #17
def test_dataframe_join_cats():
    ldf = DataFrame()
    ldf['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    ldf['b'] = bb = np.arange(len(ldf))
    lhs = ldf.set_index('a')

    rdf = DataFrame()
    rdf['a'] = pd.Categorical(list('abcac'), categories=list('abc'))
    rdf['c'] = cc = np.arange(len(rdf))
    rhs = rdf.set_index('a')

    got = lhs.join(rhs)
    # Just do some rough checking here.
    # Note: pandas fails to join on categorical index.
    assert list(got.columns) == ['b', 'c']
    assert len(got) > 0
    assert set(got.index.values) & set('abc')
    assert set(got['b']) & set(bb)
    assert set(got['c']) & set(cc)
Example #18
def test_dataframe_multi_column_join():
    np.random.seed(0)

    # Make GDF
    df_left = DataFrame()
    nelem = 500
    df_left['key1'] = np.random.randint(0, 30, nelem)
    df_left['key2'] = np.random.randint(0, 50, nelem)
    df_left['val1'] = np.arange(nelem)

    df_right = DataFrame()
    nelem = 500
    df_right['key1'] = np.random.randint(0, 30, nelem)
    df_right['key2'] = np.random.randint(0, 50, nelem)
    df_right['val1'] = np.arange(nelem)

    # Make pandas DF
    pddf_left = df_left.to_pandas()
    pddf_right = df_right.to_pandas()
    # print(pddf_left)
    # print(pddf_right)

    # Expected result
    pddf_joined = pddf_left.merge(pddf_right, on=['key1', 'key2'], how='left',
                                  sort=True)
    # print(pddf_joined)

    # Test (doesn't check for ordering)
    join_result = df_left.merge(df_right, on=['key1', 'key2'], how='left')

    for col in list(pddf_joined.columns):
        if '_y' in col:
            join_result[col] = (join_result[col]
                                .astype(np.float64)
                                .fillna(np.nan))

    pd.util.testing.assert_frame_equal(
        join_result
        .to_pandas()
        .sort_values(list(pddf_joined.columns))
        .reset_index(drop=True),
        pddf_joined)
Example #19
def test_to_records_withindex():
    df = DataFrame()
    df['a'] = aa = np.arange(10, dtype=np.int32)
    df['b'] = bb = np.arange(10, 20, dtype=np.float64)

    rec_indexed = df.to_records(index=True)
    assert rec_indexed.size == len(aa)
    assert rec_indexed.dtype.names == ('index', 'a', 'b')
    np.testing.assert_array_equal(rec_indexed['a'], aa)
    np.testing.assert_array_equal(rec_indexed['b'], bb)
    np.testing.assert_array_equal(rec_indexed['index'], np.arange(10))
Example #20
def test_dataframe_sort_values(nelem, dtype):
    np.random.seed(0)
    df = DataFrame()
    df['a'] = aa = (100 * np.random.random(nelem)).astype(dtype)
    df['b'] = bb = (100 * np.random.random(nelem)).astype(dtype)
    sorted_df = df.sort_values(by='a')
    # Check
    sorted_index = np.argsort(aa, kind='mergesort')
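    # kind='mergesort' keeps the argsort stable, so tied values keep their
    # original row order, matching what the test expects from sort_values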
    np.testing.assert_array_equal(sorted_df.index.values, sorted_index)
    np.testing.assert_array_equal(sorted_df['a'], aa[sorted_index])
    np.testing.assert_array_equal(sorted_df['b'], bb[sorted_index])
Example #21
def test_reading_arrow_sparse_data():
    schema, darr = read_data()
    gar = GpuArrowReader(schema, darr)

    df = DataFrame(gar.to_dict().items())

    # preprocessing
    num_cols = set()
    cat_cols = set()
    response_set = set(['INCEARN '])
    feature_names = set(df.columns) - response_set

    # Determine cat and numeric columns
    uniques = {}
    for k in feature_names:
        try:
            uniquevals = df[k].unique()
            uniques[k] = uniquevals
        except ValueError:
            num_cols.add(k)
        else:
            nunique = len(uniquevals)
            if nunique < 2:
                del df[k]
            elif 1 < nunique < 1000:
                cat_cols.add(k)
            else:
                num_cols.add(k)

    # Fix numeric columns
    for k in (num_cols - response_set):
        df[k] = df[k].fillna(df[k].mean())
        assert df[k].null_count == 0
        std = df[k].std()
        # drop near constant columns
        if not np.isfinite(std) or std < 1e-4:
            del df[k]
            print('drop near constant', k)
        else:
            df[k] = df[k].scale()

    # Expand categorical columns
    for k in cat_cols:
        cats = uniques[k][1:]  # drop first
        df = df.one_hot_encoding(k, prefix=k, cats=cats)
        del df[k]

    # All remaining feature columns should be float64
    assert {df[k].dtype for k in df.columns} == {np.dtype('float64')}

    mat = df.as_matrix()

    assert mat.max() == 1
    assert mat.min() == 0
Example #22
def test_df_cat_sort_index():
    df = DataFrame()
    df['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    df['b'] = np.arange(len(df))

    got = df.set_index('a').sort_index()
    expect = df.to_pandas().set_index('a').sort_index()

    assert list(expect.columns) == list(got.columns)
    assert list(expect.index.values) == list(got.index.values)
    np.testing.assert_array_equal(expect.index.values, got.index.values)
    np.testing.assert_array_equal(expect['b'].values, got['b'].to_array())
Example #23
def test_dataframe_nsmallest(nelem, n):
    np.random.seed(0)
    df = DataFrame()
    df['a'] = aa = np.random.random(nelem)
    df['b'] = bb = np.random.random(nelem)
    res = df.nsmallest(n, 'a')

    # Check
    inds = np.argsort(-aa)
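    # aa[inds] is descending, so the last n entries reversed give the
    # n smallest values in ascending order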
    np.testing.assert_array_equal(res['a'].to_array(), aa[inds][-n:][::-1])
    np.testing.assert_array_equal(res['b'].to_array(), bb[inds][-n:][::-1])
    np.testing.assert_array_equal(res.index.values, inds[-n:][::-1])
Example #24
def test_query_env_changing():
    df = DataFrame()
    df['a'] = aa = np.arange(100)
    expr = 'a < @c'
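    # @c is resolved from the caller's environment each time query() runs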
    # first attempt
    c = 10
    got = df.query(expr)
    np.testing.assert_array_equal(aa[aa < c], got['a'].to_array())
    # change env
    c = 50
    got = df.query(expr)
    np.testing.assert_array_equal(aa[aa < c], got['a'].to_array())
Example #25
def test_df_set_index_from_series():
    df = DataFrame()
    df['a'] = list(range(10))
    df['b'] = list(range(0, 20, 2))

    # Check set_index(Series)
    df2 = df.set_index(df['b'])
    assert list(df2.columns) == ['a', 'b']
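    # the index comes from column b (0, 2, ..., 18), so loc[2:6]
    # selects the labels 2, 4 and 6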
    sliced_strided = df2.loc[2:6]
    print(sliced_strided)
    assert len(sliced_strided) == 3
    assert list(sliced_strided.index.values) == [2, 4, 6]
Example #26
def test_dataframe_column_add_drop():
    df = DataFrame()
    data = np.asarray(range(10))
    df['a'] = data
    df['b'] = data
    assert df.columns == ('a', 'b')
    del df['a']
    assert df.columns == ('b',)
    df['c'] = data
    assert df.columns == ('b', 'c')
    df['a'] = data
    assert df.columns == ('b', 'c', 'a')
Example #27
def test_dataframe_dir_and_getattr():
    df = DataFrame([('a', np.ones(10)), ('b', np.ones(10)),
                    ('not an id', np.ones(10)), ('oop$', np.ones(10))])
    o = dir(df)
    assert {'a', 'b'}.issubset(o)
    assert 'not an id' not in o
    assert 'oop$' not in o

    # Getattr works
    assert df.a is df['a']
    assert df.b is df['b']
    with pytest.raises(AttributeError):
        df.not_a_column
Example #28
def test_nonmatching_index_setitem(nrows):
    np.random.seed(0)

    gdf = DataFrame()
    gdf['a'] = np.random.randint(2147483647, size=nrows)
    gdf['b'] = np.random.randint(2147483647, size=nrows)
    gdf = gdf.set_index('b')

    test_values = np.random.randint(2147483647, size=nrows)
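    # assignment should align positionally and adopt gdf's existing index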
    gdf['c'] = test_values
    assert len(test_values) == len(gdf['c'])
    assert gdf['c'].to_pandas().equals(
        Series(test_values).set_index(gdf._index).to_pandas())
Example #29
def test_dataframe_basic():
    np.random.seed(0)
    df = DataFrame()

    # Populate with cuda memory
    df['keys'] = cuda.to_device(np.arange(10, dtype=np.float64))
    np.testing.assert_equal(df['keys'].to_array(), np.arange(10))
    assert len(df) == 10

    # Populate with numpy array
    rnd_vals = np.random.random(10)
    df['vals'] = rnd_vals
    np.testing.assert_equal(df['vals'].to_array(), rnd_vals)
    assert len(df) == 10
    assert df.columns == ('keys', 'vals')

    # Make another dataframe
    df2 = DataFrame()
    df2['keys'] = np.array([123], dtype=np.float64)
    df2['vals'] = np.array([321], dtype=np.float64)

    # Concat
    df = df.concat(df2)
    assert len(df) == 11

    hkeys = np.asarray(np.arange(10, dtype=np.float64).tolist() + [123])
    hvals = np.asarray(rnd_vals.tolist() + [321])

    np.testing.assert_equal(df['keys'].to_array(), hkeys)
    np.testing.assert_equal(df['vals'].to_array(), hvals)

    # As matrix
    mat = df.as_matrix()

    expect = np.vstack([hkeys, hvals]).T

    print(expect)
    print(mat)
    np.testing.assert_equal(mat, expect)
Example #30
def test_sizeof_dataframe():
    np.random.seed(0)
    df = DataFrame()
    nelem = 1000
    df['keys'] = hkeys = np.arange(nelem, dtype=np.float64)
    df['vals'] = hvals = np.random.random(nelem)

    nbytes = hkeys.nbytes + hvals.nbytes
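    # __sizeof__ should report at least the raw size of the column data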
    sizeof = sys.getsizeof(df)
    assert sizeof >= nbytes

    serialized_nbytes = len(pickle.dumps(df, protocol=pickle.HIGHEST_PROTOCOL))
    # Serialized size should be close to what __sizeof__ is giving
    np.testing.assert_approx_equal(sizeof, serialized_nbytes, significant=2)