Esempio n. 1
0
def test_dataframe_join_suffix():
    np.random.seed(0)

    df = DataFrame()
    for k in 'abc':
        df[k] = np.random.randint(0, 5, 5)

    left = df.set_index('a')
    right = df.set_index('c')
    with pytest.raises(ValueError) as raises:
        left.join(right)
    raises.match("there are overlapping columns but lsuffix"
                 " and rsuffix are not defined")

    got = left.join(right, lsuffix='_left', rsuffix='_right', sort=True)
    # Get expected value
    pddf = df.to_pandas()
    expect = pddf.set_index('a').join(pddf.set_index('c'),
                                      lsuffix='_left',
                                      rsuffix='_right')
    # Check
    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for k in expect.columns:
        _check_series(expect[k], got[k])
Esempio n. 2
0
def test_dataframe_join_mismatch_cats(how):
    pdf1 = pd.DataFrame({"join_col": ["a", "b", "c", "d", "e"],
                         "data_col_left": [10, 20, 30, 40, 50]})
    pdf2 = pd.DataFrame({"join_col": ["c", "e", "f"],
                         "data_col_right": [6, 7, 8]})

    pdf1["join_col"] = pdf1["join_col"].astype("category")
    pdf2["join_col"] = pdf2["join_col"].astype("category")

    gdf1 = DataFrame.from_pandas(pdf1)
    gdf2 = DataFrame.from_pandas(pdf2)

    gdf1 = gdf1.set_index("join_col")
    gdf2 = gdf2.set_index("join_col")

    pdf1 = pdf1.set_index('join_col')
    pdf2 = pdf2.set_index('join_col')
    join_gdf = gdf1.join(gdf2, how=how, sort=True)
    join_pdf = pdf1.join(pdf2, how=how)

    got = join_gdf.to_pandas()
    expect = join_pdf.fillna(-1)  # note: pygdf join doesn't mask NA

    expect.data_col_right = expect.data_col_right.astype(np.int64)
    expect.data_col_left = expect.data_col_left.astype(np.int64)
    pd.util.testing.assert_frame_equal(got, expect, check_names=False,
                                       check_index_type=False,
                                       # For inner joins, pandas returns
                                       # weird categories.
                                       check_categorical=how != 'inner')
    assert list(got.index) == list(expect.index)
Esempio n. 3
0
def test_assign():
    gdf = DataFrame({'x': [1, 2, 3]})
    gdf2 = gdf.assign(y=gdf.x + 1)
    assert list(gdf.columns) == ['x']
    assert list(gdf2.columns) == ['x', 'y']

    np.testing.assert_equal(gdf2.y.to_array(), [2, 3, 4])
Esempio n. 4
0
def test_dataframe_as_gpu_matrix_null_values():
    df = DataFrame()

    nelem = 123
    na = -10000

    refvalues = {}
    for k in 'abcd':
        df[k] = data = np.random.random(nelem)
        bitmask = utils.random_bitmask(nelem)
        df[k] = df[k].set_mask(bitmask)
        boolmask = np.asarray(utils.expand_bits_to_bytes(bitmask)[:nelem],
                              dtype=np.bool_)
        data[~boolmask] = na
        refvalues[k] = data

    # Check null value causes error
    with pytest.raises(ValueError) as raises:
        df.as_gpu_matrix()
    raises.match("column 'a' has null values")

    for k in df.columns:
        df[k] = df[k].fillna(na)

    mat = df.as_gpu_matrix().copy_to_host()
    for i, k in enumerate(df.columns):
        np.testing.assert_array_equal(refvalues[k], mat[:, i])
Esempio n. 5
0
def test_dataframe_setitem_from_masked_object():
    ary = np.random.randn(100)
    mask = np.zeros(100, dtype=bool)
    mask[:20] = True
    np.random.shuffle(mask)
    ary[mask] = np.nan

    test1 = Series(ary)
    assert (test1.has_null_mask)
    assert (test1.null_count == 20)

    test2 = DataFrame.from_pandas(pd.DataFrame({'a': ary}))
    assert (test2['a'].has_null_mask)
    assert (test2['a'].null_count == 20)

    gpu_ary = cuda.to_device(ary)
    test3 = Series(gpu_ary)
    assert (test3.has_null_mask)
    assert (test3.null_count == 20)

    test4 = DataFrame()
    lst = [1, 2, None, 4, 5, 6, None, 8, 9]
    test4['lst'] = lst
    assert (test4['lst'].has_null_mask)
    assert (test4['lst'].null_count == 2)
Esempio n. 6
0
def test_dataframe_hash_partition(nrows, nparts, nkeys):
    np.random.seed(123)
    gdf = DataFrame()
    keycols = []
    for i in range(nkeys):
        keyname = 'key{}'.format(i)
        gdf[keyname] = np.random.randint(0, 7 - i, nrows)
        keycols.append(keyname)
    gdf['val1'] = np.random.randint(0, nrows * 2, nrows)

    got = gdf.partition_by_hash(keycols, nparts=nparts)
    # Must return a list
    assert isinstance(got, list)
    # Must have correct number of partitions
    assert len(got) == nparts
    # All partitions must be DataFrame type
    assert all(isinstance(p, DataFrame) for p in got)
    # Check that all partitions have unique keys
    part_unique_keys = set()
    for p in got:
        if len(p):
            # Take rows of the keycolums and build a set of the key-values
            unique_keys = set(map(tuple, p.as_matrix(columns=keycols)))
            # Ensure that none of the key-values have occurred in other groups
            assert not (unique_keys & part_unique_keys)
            part_unique_keys |= unique_keys
    assert len(part_unique_keys)
Esempio n. 7
0
def test_label_encode_drop_one():
    random.seed(0)
    np.random.seed(0)

    df = DataFrame()

    # initialize data frame
    df['cats'] = np.random.randint(7, size=10, dtype=np.int32)
    vals = list(df['cats'].unique())
    # drop 1 randomly
    del vals[random.randrange(len(vals))]

    lab = dict(zip(vals, list(range(len(vals)))))

    # label encode series
    ncol = df['cats'].label_encoding(cats=vals, dtype='float32')
    arr = ncol.to_array()

    # verify labels of new column

    for i in range(arr.size):
        # assuming -1 is used for missing value
        np.testing.assert_equal(arr[i], lab.get(df.cats[i], -1))

    # label encode data frame
    df2 = df.label_encoding(column='cats', prefix='cats', cats=vals, dtype='float32')

    assert df2.columns[0] == 'cats'
    assert df2.columns[1] == 'cats_labels'
Esempio n. 8
0
def test_dataframe_join_how(aa, bb, how, method):
    df = DataFrame()
    df['a'] = aa
    df['b'] = bb

    def work_pandas(df):
        ts = timer()
        df1 = df.set_index('a')
        df2 = df.set_index('b')
        joined = df1.join(df2, how=how, sort=True)
        te = timer()
        print('timing', type(df), te - ts)
        return joined

    def work_gdf(df):
        ts = timer()
        df1 = df.set_index('a')
        df2 = df.set_index('b')
        joined = df1.join(df2, how=how, sort=True, method=method)
        te = timer()
        print('timing', type(df), te - ts)
        return joined

    expect = work_pandas(df.to_pandas())
    got = work_gdf(df)
    expecto = expect.copy()
    goto = got.copy()

    # Type conversion to handle NoneType
    expectb = expect.b
    expecta = expect.a
    gotb = got.b
    gota = got.a
    got.drop_column('b')
    got.add_column('b', gotb.astype(np.float64).fillna(np.nan))
    got.drop_column('a')
    got.add_column('a', gota.astype(np.float64).fillna(np.nan))
    expect.drop(['b'], axis=1)
    expect['b'] = expectb.astype(np.float64).fillna(np.nan)
    expect.drop(['a'], axis=1)
    expect['a'] = expecta.astype(np.float64).fillna(np.nan)

    # print(expect)
    # print(got.to_string(nrows=None))

    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    if(how != 'outer'):
        pd.util.testing.assert_frame_equal(
            got.to_pandas().sort_values(['b', 'a']).reset_index(drop=True),
            expect.sort_values(['b', 'a']).reset_index(drop=True))
        # if(how=='right'):
        #     _sorted_check_series(expect['a'], expect['b'],
        #                          got['a'], got['b'])
        # else:
        #     _sorted_check_series(expect['b'], expect['a'], got['b'],
        #                          got['a'])
    else:
        _check_series(expecto['b'], goto['b'])
        _check_series(expecto['a'], goto['a'])
Esempio n. 9
0
def test_to_records_noindex():
    df = DataFrame()
    df['a'] = aa = np.arange(10, dtype=np.int32)
    df['b'] = bb = np.arange(10, 20, dtype=np.float64)

    rec = df.to_records(index=False)
    assert rec.dtype.names == ('a', 'b')
    np.testing.assert_array_equal(rec['a'], aa)
    np.testing.assert_array_equal(rec['b'], bb)
Esempio n. 10
0
def test_dataframe_append_to_empty():
    pdf = pd.DataFrame()
    pdf['a'] = []
    pdf['b'] = [1, 2, 3]

    gdf = DataFrame()
    gdf['a'] = []
    gdf['b'] = [1, 2, 3]

    pd.testing.assert_frame_equal(gdf.to_pandas(), pdf)
Esempio n. 11
0
def test_dataframe_empty_concat():
    gdf1 = DataFrame()
    gdf1['a'] = []
    gdf1['b'] = []

    gdf2 = gdf1.copy()

    gdf3 = gd.concat([gdf1, gdf2])
    assert len(gdf3) == 0
    assert len(gdf3.columns) == 2
Esempio n. 12
0
def test_dataframe_join_cats():
    lhs = DataFrame()
    lhs['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    lhs['b'] = bb = np.arange(len(lhs))
    lhs = lhs.set_index('a')

    rhs = DataFrame()
    rhs['a'] = pd.Categorical(list('abcac'), categories=list('abc'))
    rhs['c'] = cc = np.arange(len(rhs))
    rhs = rhs.set_index('a')

    got = lhs.join(rhs)
    expect = lhs.to_pandas().join(rhs.to_pandas())

    # Note: pandas make a object Index after joining
    pd.util.testing.assert_frame_equal(
        got.sort_values(by='b')
        .to_pandas()
        .sort_index()
        .reset_index(drop=True),
        expect.reset_index(drop=True))

    # Just do some rough checking here.
    assert list(got.columns) == ['b', 'c']
    assert len(got) > 0
    assert set(got.index.values) & set('abc')
    assert set(got['b']) & set(bb)
    assert set(got['c']) & set(cc)
Esempio n. 13
0
def test_to_records_withindex():
    df = DataFrame()
    df['a'] = aa = np.arange(10, dtype=np.int32)
    df['b'] = bb = np.arange(10, 20, dtype=np.float64)

    rec_indexed = df.to_records(index=True)
    assert rec_indexed.size == len(aa)
    assert rec_indexed.dtype.names == ('index', 'a', 'b')
    np.testing.assert_array_equal(rec_indexed['a'], aa)
    np.testing.assert_array_equal(rec_indexed['b'], bb)
    np.testing.assert_array_equal(rec_indexed['index'], np.arange(10))
Esempio n. 14
0
def test_dataframe_sort_values(nelem, dtype):
    np.random.seed(0)
    df = DataFrame()
    df['a'] = aa = (100 * np.random.random(nelem)).astype(dtype)
    df['b'] = bb = (100 * np.random.random(nelem)).astype(dtype)
    sorted_df = df.sort_values(by='a')
    # Check
    sorted_index = np.argsort(aa, kind='mergesort')
    np.testing.assert_array_equal(sorted_df.index.values, sorted_index)
    np.testing.assert_array_equal(sorted_df['a'], aa[sorted_index])
    np.testing.assert_array_equal(sorted_df['b'], bb[sorted_index])
Esempio n. 15
0
def test_reading_arrow_sparse_data():
    schema, darr = read_data()
    gar = GpuArrowReader(schema, darr)

    df = DataFrame(gar.to_dict().items())

    # preprocessing
    num_cols = set()
    cat_cols = set()
    response_set = set(['INCEARN '])
    feature_names = set(df.columns) - response_set

    # Determine cat and numeric columns
    uniques = {}
    for k in feature_names:
        try:
            uniquevals = df[k].unique()
            uniques[k] = uniquevals
        except ValueError:
            num_cols.add(k)
        else:
            nunique = len(uniquevals)
            if nunique < 2:
                del df[k]
            elif 1 < nunique < 1000:
                cat_cols.add(k)
            else:
                num_cols.add(k)

    # Fix numeric columns
    for k in (num_cols - response_set):
        df[k] = df[k].fillna(df[k].mean())
        assert df[k].null_count == 0
        std = df[k].std()
        # drop near constant columns
        if not np.isfinite(std) or std < 1e-4:
            del df[k]
            print('drop near constant', k)
        else:
            df[k] = df[k].scale()

    # Expand categorical columns
    for k in cat_cols:
        cats = uniques[k][1:]  # drop first
        df = df.one_hot_encoding(k, prefix=k, cats=cats)
        del df[k]

    # Print dtypes
    assert {df[k].dtype for k in df.columns} == {np.dtype('float64')}

    mat = df.as_matrix()

    assert mat.max() == 1
    assert mat.min() == 0
Esempio n. 16
0
def test_reading_arrow_sparse_data():
    schema, darr = read_data()
    gar = GpuArrowReader(schema, darr)

    df = DataFrame(gar.to_dict().items())

    # preprocessing
    num_cols = set()
    cat_cols = set()
    response_set = set(['INCEARN '])
    feature_names = set(df.columns) - response_set

    # Determine cat and numeric columns
    uniques = {}
    for k in feature_names:
        try:
            uniquevals = df[k].unique_k(k=1000)
            uniques[k] = uniquevals
        except ValueError:
            num_cols.add(k)
        else:
            nunique = len(uniquevals)
            if nunique < 2:
                del df[k]
            elif 1 < nunique < 1000:
                cat_cols.add(k)
            else:
                num_cols.add(k)

    # Fix numeric columns
    for k in (num_cols - response_set):
        df[k] = df[k].fillna(df[k].mean())
        assert df[k].null_count == 0
        std = df[k].std()
        # drop near constant columns
        if not np.isfinite(std) or std < 1e-4:
            del df[k]
            print('drop near constant', k)
        else:
            df[k] = df[k].scale()

    # Expand categorical columns
    for k in cat_cols:
        cats = uniques[k][1:]  # drop first
        df = df.one_hot_encoding(k, prefix=k, cats=cats)
        del df[k]

    # Print dtypes
    assert {df[k].dtype for k in df.columns} == {np.dtype('float64')}

    mat = df.as_matrix()

    assert mat.max() == 1
    assert mat.min() == 0
Esempio n. 17
0
def test_df_cat_sort_index():
    df = DataFrame()
    df['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    df['b'] = np.arange(len(df))

    got = df.set_index('a').sort_index()
    expect = df.to_pandas().set_index('a').sort_index()

    assert list(expect.columns) == list(got.columns)
    assert list(expect.index.values) == list(got.index.values)
    np.testing.assert_array_equal(expect.index.values, got.index.values)
    np.testing.assert_array_equal(expect['b'].values, got['b'].to_array())
Esempio n. 18
0
def test_df_set_index_from_series():
    df = DataFrame()
    df['a'] = list(range(10))
    df['b'] = list(range(0, 20, 2))

    # Check set_index(Series)
    df2 = df.set_index(df['b'])
    assert list(df2.columns) == ['a', 'b']
    sliced_strided = df2.loc[2:6]
    print(sliced_strided)
    assert len(sliced_strided) == 3
    assert list(sliced_strided.index.values) == [2, 4, 6]
Esempio n. 19
0
def test_dataframe_nsmallest(nelem, n):
    np.random.seed(0)
    df = DataFrame()
    df['a'] = aa = np.random.random(nelem)
    df['b'] = bb = np.random.random(nelem)
    res = df.nsmallest(n, 'a')

    # Check
    inds = np.argsort(-aa)
    np.testing.assert_array_equal(res['a'].to_array(), aa[inds][-n:][::-1])
    np.testing.assert_array_equal(res['b'].to_array(), bb[inds][-n:][::-1])
    np.testing.assert_array_equal(res.index.values, inds[-n:][::-1])
Esempio n. 20
0
def test_query_env_changing():
    df = DataFrame()
    df['a'] = aa = np.arange(100)
    expr = 'a < @c'
    # first attempt
    c = 10
    got = df.query(expr)
    np.testing.assert_array_equal(aa[aa < c], got['a'].to_array())
    # change env
    c = 50
    got = df.query(expr)
    np.testing.assert_array_equal(aa[aa < c], got['a'].to_array())
Esempio n. 21
0
def test_query_env_changing():
    df = DataFrame()
    df['a'] = aa = np.arange(100)
    expr = 'a < @c'
    # first attempt
    c = 10
    got = df.query(expr)
    np.testing.assert_array_equal(aa[aa < c], got['a'].to_array())
    # change env
    c = 50
    got = df.query(expr)
    np.testing.assert_array_equal(aa[aa < c], got['a'].to_array())
Esempio n. 22
0
def test_nonmatching_index_setitem(nrows):
    np.random.seed(0)

    gdf = DataFrame()
    gdf['a'] = np.random.randint(2147483647, size=nrows)
    gdf['b'] = np.random.randint(2147483647, size=nrows)
    gdf = gdf.set_index('b')

    test_values = np.random.randint(2147483647, size=nrows)
    gdf['c'] = test_values
    assert (len(test_values) == len(gdf['c']))
    assert (gdf['c'].to_pandas().equals(
        Series(test_values).set_index(gdf._index).to_pandas()))
Esempio n. 23
0
def test_dataframe_masked_slicing(nelem, slice_start, slice_end):
    gdf = DataFrame()
    gdf['a'] = list(range(nelem))
    gdf['b'] = list(range(nelem, 2 * nelem))
    gdf['a'] = gdf['a'].set_mask(utils.random_bitmask(nelem))
    gdf['b'] = gdf['b'].set_mask(utils.random_bitmask(nelem))

    def do_slice(x):
        return x[slice_start:slice_end]

    expect = do_slice(gdf.to_pandas())
    got = do_slice(gdf).to_pandas()

    pd.testing.assert_frame_equal(expect, got)
Esempio n. 24
0
def test_to_pandas():
    df = DataFrame()
    df['a'] = np.arange(10, dtype=np.int32)
    df['b'] = np.arange(10, 20, dtype=np.float64)

    pdf = df.to_pandas()

    assert tuple(df.columns) == tuple(pdf.columns)

    assert df['a'].dtype == pdf['a'].dtype
    assert df['b'].dtype == pdf['b'].dtype

    assert len(df['a']) == len(pdf['a'])
    assert len(df['b']) == len(pdf['b'])
Esempio n. 25
0
def test_to_pandas():
    df = DataFrame()
    df['a'] = np.arange(10, dtype=np.int32)
    df['b'] = np.arange(10, 20, dtype=np.float64)

    pdf = df.to_pandas()

    assert tuple(df.columns) == tuple(pdf.columns)

    assert df['a'].dtype == pdf['a'].dtype
    assert df['b'].dtype == pdf['b'].dtype

    assert len(df['a']) == len(pdf['a'])
    assert len(df['b']) == len(pdf['b'])
Esempio n. 26
0
def test_df_set_index_from_name():
    df = DataFrame()
    df['a'] = list(range(10))
    df['b'] = list(range(0, 20, 2))

    # Check set_index(column_name)
    df2 = df.set_index('b')
    print(df2)
    # 1 less column because 'b' is used as index
    assert list(df2.columns) == ['a']
    sliced_strided = df2.loc[2:6]
    print(sliced_strided)
    assert len(sliced_strided) == 3
    assert list(sliced_strided.index.values) == [2, 4, 6]
Esempio n. 27
0
def test_onehot_random():
    df = DataFrame()
    low = 10
    high = 17
    size = 10
    df['src'] = src = np.random.randint(low=low, high=high, size=size)
    df2 = df.one_hot_encoding(column='src', prefix='out_',
                              cats=tuple(range(10, 17)))
    mat = df2.as_matrix(columns=df2.columns[1:])

    for val in range(low, high):
        colidx = val - low
        arr = mat[:, colidx]
        mask = src == val
        np.testing.assert_equal(arr, mask)
Esempio n. 28
0
def test_query(data, fn):
    # prepare
    nelem, seed = data
    expect_fn, query_expr = fn
    np.random.seed(seed)
    df = DataFrame()
    df['a'] = aa = np.arange(nelem)
    df['b'] = bb = np.random.random(nelem) * nelem
    # udt
    expect_mask = expect_fn(aa, bb)
    df2 = df.query(query_expr)
    # check
    assert len(df2) == np.count_nonzero(expect_mask)
    np.testing.assert_array_almost_equal(df2['a'].to_array(), aa[expect_mask])
    np.testing.assert_array_almost_equal(df2['b'].to_array(), bb[expect_mask])
Esempio n. 29
0
def test_query(data, fn):
    # prepare
    nelem, seed = data
    expect_fn, query_expr = fn
    np.random.seed(seed)
    df = DataFrame()
    df['a'] = aa = np.arange(nelem)
    df['b'] = bb = np.random.random(nelem) * nelem
    # udt
    expect_mask = expect_fn(aa, bb)
    df2 = df.query(query_expr)
    # check
    assert len(df2) == np.count_nonzero(expect_mask)
    np.testing.assert_array_almost_equal(df2['a'].to_array(), aa[expect_mask])
    np.testing.assert_array_almost_equal(df2['b'].to_array(), bb[expect_mask])
Esempio n. 30
0
def test_categorical_value_counts(num_elements):
    from string import ascii_letters, digits

    # create categorical series
    np.random.seed(12)
    pd_cat = pd.Categorical(
        pd.Series(
            np.random.choice(list(ascii_letters + digits), num_elements),
            dtype='category'
            )
        )

    # gdf
    gdf = DataFrame()
    gdf['a'] = Series.from_categorical(pd_cat)
    gdf_value_counts = gdf['a'].value_counts()

    # pandas
    pdf = pd.DataFrame()
    pdf['a'] = pd_cat
    pdf_value_counts = pdf['a'].value_counts()

    # verify
    pandas_dict = pdf_value_counts.to_dict()
    gdf_dict = gdf_value_counts.to_pandas().to_dict()

    assert pandas_dict == gdf_dict
Esempio n. 31
0
def test_dataframe_hash_partition_masked_value(nrows):
    gdf = DataFrame()
    gdf['key'] = np.arange(nrows)
    gdf['val'] = np.arange(nrows) + 100
    bitmask = utils.random_bitmask(nrows)
    bytemask = utils.expand_bits_to_bytes(bitmask)
    gdf['val'] = gdf['val'].set_mask(bitmask)
    parted = gdf.partition_by_hash(['key'], nparts=3)
    # Verify that the valid mask is correct
    for p in parted:
        df = p.to_pandas()
        for row in df.itertuples():
            valid = bool(bytemask[row.key])
            expected_value = row.key + 100 if valid else -1
            got_value = row.val
            assert expected_value == got_value
Esempio n. 32
0
def test_groupby_iterate_groups():
    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df['key1'] = np.random.randint(0, 3, nelem)
    df['key2'] = np.random.randint(0, 2, nelem)
    df['val1'] = np.random.random(nelem)
    df['val2'] = np.random.random(nelem)

    def assert_values_equal(arr):
        np.testing.assert_array_equal(arr[0], arr)

    for grp in df.groupby(['key1', 'key2']):
        pddf = grp.to_pandas()
        for k in 'key1,key2'.split(','):
            assert_values_equal(pddf[k].values)
Esempio n. 33
0
def test_dataframe_to_string_wide():
    # Test basic
    df = DataFrame()
    for i in range(100):
        df['a{}'.format(i)] = list(range(3))
    got = df.to_string(ncols=8)
    print(got)
    expect = '''
    a0   a1   a2   a3   a4   a5   a6 ...  a99
0    0    0    0    0    0    0    0 ...    0
1    1    1    1    1    1    1    1 ...    1
2    2    2    2    2    2    2    2 ...    2
[92 more columns]
'''
    # values should match despite whitespace difference
    assert got.split() == expect.split()
Esempio n. 34
0
def test_onehot_random():
    df = DataFrame()
    low = 10
    high = 17
    size = 10
    df['src'] = src = np.random.randint(low=low, high=high, size=size)
    df2 = df.one_hot_encoding(column='src',
                              prefix='out_',
                              cats=tuple(range(10, 17)))
    mat = df2.as_matrix(columns=df2.columns[1:])

    for val in range(low, high):
        colidx = val - low
        arr = mat[:, colidx]
        mask = src == val
        np.testing.assert_equal(arr, mask)
Esempio n. 35
0
def test_onehot_generic_index():
    np.random.seed(0)
    size = 33
    indices = np.random.randint(low=0, high=100, size=size)
    df = DataFrame()
    values = np.random.randint(low=0, high=4, size=size)
    df['fo'] = Series(values, index=GenericIndex(indices))
    out = df.one_hot_encoding('fo',
                              cats=df.fo.unique(),
                              prefix='fo',
                              dtype=np.int32)
    assert set(out.columns) == {'fo', 'fo_0', 'fo_1', 'fo_2', 'fo_3'}
    np.testing.assert_array_equal(values == 0, out.fo_0.to_array())
    np.testing.assert_array_equal(values == 1, out.fo_1.to_array())
    np.testing.assert_array_equal(values == 2, out.fo_2.to_array())
    np.testing.assert_array_equal(values == 3, out.fo_3.to_array())
Esempio n. 36
0
def test_dataframe_join_cats():
    ldf = DataFrame()
    ldf['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    ldf['b'] = bb = np.arange(len(ldf))
    lhs = ldf.set_index('a')

    rdf = DataFrame()
    rdf['a'] = pd.Categorical(list('abcac'), categories=list('abc'))
    rdf['c'] = cc = np.arange(len(rdf))
    rhs = rdf.set_index('a')

    got = lhs.join(rhs)
    # Just do some rough checking here.
    # Note: pandas fails to join on categorical index.
    assert list(got.columns) == ['b', 'c']
    assert len(got) > 0
    assert set(got.index.values) & set('abc')
    assert set(got['b']) & set(bb)
    assert set(got['c']) & set(cc)
Esempio n. 37
0
def test_from_pandas():
    pdf = pd.DataFrame()
    pdf['a'] = np.arange(10, dtype=np.int32)
    pdf['b'] = np.arange(10, 20, dtype=np.float64)

    df = DataFrame.from_pandas(pdf)

    assert tuple(df.columns) == tuple(pdf.columns)

    assert df['a'].dtype == pdf['a'].dtype
    assert df['b'].dtype == pdf['b'].dtype

    assert len(df['a']) == len(pdf['a'])
    assert len(df['b']) == len(pdf['b'])
Esempio n. 38
0
def test_from_pandas_ex1():
    pdf = pd.DataFrame({'a': [0, 1, 2, 3],
                        'b': [0.1, 0.2, None, 0.3]})
    print(pdf)
    df = DataFrame.from_pandas(pdf)
    print(df)

    assert tuple(df.columns) == tuple(pdf.columns)
    assert np.all(df['a'].to_array() == pdf['a'])
    matches = df['b'].to_array() == pdf['b']
    # the 3d element is False due to (nan == nan) == False
    assert np.all(matches == [True, True, False, True])
    assert np.isnan(df['b'].to_array()[2])
    assert np.isnan(pdf['b'][2])
Esempio n. 39
0
def test_onehot_simple():
    np.random.seed(0)
    df = DataFrame()
    # Populate with data [0, 10)
    df['vals'] = np.arange(10, dtype=np.int32)
    # One Hot (Series)
    for i, col in enumerate(df['vals'].one_hot_encoding(list(range(10)))):
        arr = col.to_array()
        # Verify 1 in the right position
        np.testing.assert_equal(arr[i], 1)
        # Every other slots are 0s
        np.testing.assert_equal(arr[:i], 0)
        np.testing.assert_equal(arr[i + 1:], 0)
    # One Hot (DataFrame)
    df2 = df.one_hot_encoding(column='vals',
                              prefix='vals',
                              cats=list(range(10)))
    assert df2.columns[0] == 'vals'
    for i in range(1, len(df2.columns)):
        assert df2.columns[i] == 'vals_%s' % (i - 1)
    got = df2.as_matrix(columns=df2.columns[1:])
    expect = np.identity(got.shape[0])
    np.testing.assert_equal(got, expect)
Esempio n. 40
0
def test_dataframe_join_how(aa, bb, how):
    df = DataFrame()
    df['a'] = aa
    df['b'] = bb

    def work(df):
        ts = timer()
        df1 = df.set_index('a')
        df2 = df.set_index('b')
        joined = df1.join(df2, how=how, sort=True)
        te = timer()
        print('timing', type(df), te - ts)
        return joined

    expect = work(df.to_pandas())
    got = work(df)

    # print(expect)
    # print(got.to_string(nrows=None))

    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    _check_series(expect['b'], got['b'])
    _check_series(expect['a'], got['a'])
Esempio n. 41
0
def test_dataframe_join_suffix():
    np.random.seed(0)

    df = DataFrame()
    for k in 'abc':
        df[k] = np.random.randint(0, 5, 5)

    left = df.set_index('a')
    right = df.set_index('c')
    with pytest.raises(ValueError) as raises:
        left.join(right)
    raises.match("there are overlapping columns but lsuffix"
                 " and rsuffix are not defined")

    got = left.join(right, lsuffix='_left', rsuffix='_right')
    # Get expected value
    pddf = df.to_pandas()
    expect = pddf.set_index('a').join(pddf.set_index('c'),
                                      lsuffix='_left', rsuffix='_right')
    # Check
    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for k in expect.columns:
        _check_series(expect[k], got[k])
Esempio n. 42
0
def test_from_records_noindex(columns):
    recdtype = np.dtype([
        ('a', np.int32),
        ('b', np.float64),
    ])
    rec = np.recarray(10, dtype=recdtype)
    rec.a = aa = np.arange(10, dtype=np.int32)
    rec.b = bb = np.arange(10, 20, dtype=np.float64)
    df = DataFrame.from_records(rec, columns=columns)

    if columns and 'a' in columns:
        np.testing.assert_array_equal(aa, df['a'])
    if columns and 'b' in columns:
        np.testing.assert_array_equal(bb, df['b'])
    np.testing.assert_array_equal(np.arange(10), df.index.values)