コード例 #1
0
ファイル: test_joining.py プロジェクト: xiaolin1990/cudf
def test_dataframe_join_cats():
    """Join two frames on a shared categorical index and compare to pandas."""
    left = DataFrame()
    left['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    left['b'] = left_vals = np.arange(len(left))
    left = left.set_index('a')

    right = DataFrame()
    right['a'] = pd.Categorical(list('abcac'), categories=list('abc'))
    right['c'] = right_vals = np.arange(len(right))
    right = right.set_index('a')

    got = left.join(right)
    expect = left.to_pandas().join(right.to_pandas())

    # Note: pandas make a object Index after joining, so compare with rows
    # brought into a common order and the index dropped.
    pd.util.testing.assert_frame_equal(
        got.sort_values(by='b').to_pandas().sort_index().reset_index(
            drop=True), expect.reset_index(drop=True))

    # Rough sanity checks on the joined result.
    assert list(got.columns) == ['b', 'c']
    assert len(got) > 0
    assert set(got.index.values) & set('abc')
    assert set(got['b']) & set(left_vals)
    assert set(got['c']) & set(right_vals)
コード例 #2
0
ファイル: test_string.py プロジェクト: zeichuan/cudf
def test_string_join_non_key_nulls(str_data_nulls):
    """Left-merge on an int key when the right frame's string values may
    contain nulls; cudf must agree with pandas."""
    str_data = ["a", "b", "c", "d", "e"]
    other_data = [1, 2, 3, 4, 5]

    # Keys for the right side, truncated to line up with the null-bearing data.
    other_data_nulls = [6, 7, 8, 9, 10][:len(str_data_nulls)]

    left_pd = pd.DataFrame()
    left_gd = DataFrame()
    left_pd["vals"] = pd.Series(str_data, dtype="str")
    left_gd["vals"] = Series(str_data, dtype="str")
    left_pd["key"] = other_data
    left_gd["key"] = other_data

    right_pd = pd.DataFrame()
    right_gd = DataFrame()
    right_pd["vals"] = pd.Series(str_data_nulls, dtype="str")
    right_gd["vals"] = Series(str_data_nulls, dtype="str")
    right_pd["key"] = pd.Series(other_data_nulls, dtype="int64")
    right_gd["key"] = Series(other_data_nulls, dtype="int64")

    expect = left_pd.merge(right_pd, on="key", how="left")
    got = left_gd.merge(right_gd, on="key", how="left")

    # Empty results need their index/column order normalised before comparing.
    if len(expect) == 0 and len(got) == 0:
        expect = expect.reset_index(drop=True)
        got = got[expect.columns]

    assert_eq(expect, got)
コード例 #3
0
def test_string_join_non_key_nulls(str_data_nulls):
    """Left-merge with null-bearing string values, driven by the
    str_data_nulls fixture; result must match pandas."""
    str_data = ['a', 'b', 'c', 'd', 'e']
    other_data = [1, 2, 3, 4, 5]
    other_data_nulls = [6, 7, 8, 9, 10][:len(str_data_nulls)]

    # Left side: fixed strings plus an integer key.
    pdf, gdf = pd.DataFrame(), DataFrame()
    pdf['vals'] = pd.Series(str_data, dtype='str')
    gdf['vals'] = Series(str_data, dtype='str')
    pdf['key'] = other_data
    gdf['key'] = other_data

    # Right side: the fixture data, which may hold nulls.
    pdf2, gdf2 = pd.DataFrame(), DataFrame()
    pdf2['vals'] = pd.Series(str_data_nulls, dtype='str')
    gdf2['vals'] = Series(str_data_nulls, dtype='str')
    pdf2['key'] = pd.Series(other_data_nulls, dtype='int64')
    gdf2['key'] = Series(other_data_nulls, dtype='int64')

    expect = pdf.merge(pdf2, on='key', how='left')
    got = gdf.merge(gdf2, on='key', how='left')

    if not len(expect) and not len(got):
        # Normalise trivially-empty frames before comparing.
        expect = expect.reset_index(drop=True)
        got = got[expect.columns]

    assert_eq(expect, got)
コード例 #4
0
ファイル: test_joining.py プロジェクト: zeichuan/cudf
def test_dataframe_empty_merge():
    """A left-merge of two empty frames yields an empty frame holding the
    union of their columns."""
    lhs = DataFrame([("a", []), ("b", [])])
    rhs = DataFrame([("a", []), ("c", [])])

    expect = DataFrame([("a", []), ("b", []), ("c", [])])
    got = lhs.merge(rhs, how="left", on=["a"])

    assert_eq(expect, got)
コード例 #5
0
def test_dataframe_empty_merge():
    """Merging empty frames keeps zero rows but combines the column sets."""
    first = DataFrame([('a', []), ('b', [])])
    second = DataFrame([('a', []), ('c', [])])

    # Expected: still empty, with columns a, b and c.
    expect = DataFrame([('a', []), ('b', []), ('c', [])])
    got = first.merge(second, how='left', on=['a'])

    assert_eq(expect, got)
コード例 #6
0
def test_dataframe_merge_on(on):
    """Left-merge two 500-row frames on `on` and compare against pandas,
    via both DataFrame.merge and the free function cudf.merge.

    Rows and columns are sorted before comparison because cudf does not
    guarantee output ordering.
    """
    np.random.seed(0)

    # Make cuDF
    df_left = DataFrame()
    nelem = 500
    df_left['key1'] = np.random.randint(0, 40, nelem)
    df_left['key2'] = np.random.randint(0, 50, nelem)
    df_left['left_val'] = np.arange(nelem)

    df_right = DataFrame()
    nelem = 500
    df_right['key1'] = np.random.randint(0, 30, nelem)
    df_right['key2'] = np.random.randint(0, 50, nelem)
    df_right['right_val'] = np.arange(nelem)

    # Make pandas DF
    pddf_left = df_left.to_pandas()
    pddf_right = df_right.to_pandas()

    # Expected result (from pandas)
    pddf_joined = pddf_left.merge(pddf_right, on=on, how='left')

    # Test (from cuDF; doesn't check for ordering)
    join_result = df_left.merge(df_right, on=on, how='left')
    join_result_cudf = cudf.merge(df_left, df_right, on=on, how='left')

    # Promote the right-side value column to float64 with NaN so dtypes
    # line up with pandas, which uses float NaN for unmatched left rows.
    join_result['right_val'] = (join_result['right_val'].astype(
        np.float64).fillna(np.nan))

    join_result_cudf['right_val'] = (join_result_cudf['right_val'].astype(
        np.float64).fillna(np.nan))

    # Apply the same promotion to any suffixed ('_y') overlap columns.
    for col in list(pddf_joined.columns):
        if (col.count('_y') > 0):
            join_result[col] = (join_result[col].astype(np.float64).fillna(
                np.nan))
            join_result_cudf[col] = (join_result_cudf[col].astype(
                np.float64).fillna(np.nan))

    # Test dataframe equality (ignore order of rows and columns)
    cdf_result = join_result.to_pandas() \
                            .sort_values(list(pddf_joined.columns)) \
                            .reset_index(drop=True)

    pdf_result = pddf_joined.sort_values(list(pddf_joined.columns)) \
                            .reset_index(drop=True)

    pd.util.testing.assert_frame_equal(cdf_result, pdf_result, check_like=True)

    # cudf.merge must agree with the DataFrame.merge result as well.
    merge_func_result_cdf = join_result_cudf.to_pandas() \
                                            .sort_values(
                                                list(pddf_joined.columns)) \
                                            .reset_index(drop=True)

    pd.util.testing.assert_frame_equal(merge_func_result_cdf,
                                       cdf_result,
                                       check_like=True)
コード例 #7
0
ファイル: test_onehot.py プロジェクト: yutiansut/cudf
def test_onehot_get_dummies_simple():
    """get_dummies must leave its input untouched and one-hot encode the
    single integer column."""
    df = DataFrame({'x': np.arange(10)})
    original = df.copy()
    encoded = get_dummies(df, prefix='test')

    # Encoding must not mutate the source frame.
    assert df == original
    dummy_cols = list(encoded.columns)[1:]
    expected = DataFrame(dict(zip(dummy_cols, np.eye(len(dummy_cols)))))
    assert (encoded.loc[:, dummy_cols] == expected).all().all()
コード例 #8
0
def test_dataframe_to_string():
    """Check DataFrame string rendering: row truncation, column truncation,
    and the display of null (masked) entries."""
    with set_options(formatting={'nrows': 5, 'ncols': 8}):
        # Test basic: 6 rows with nrows=5 means one row is elided.
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16])])
        string = str(df)
        print(string)
        assert string.splitlines()[-1] == '[1 more rows]'

        # Test skipped columns: 4 columns with ncols=3 means one is elided.
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16]),
                        ('c', [11, 12, 13, 14, 15, 16]),
                        ('d', [11, 12, 13, 14, 15, 16])])
        string = df.to_string(ncols=3)
        print(string)
        assert string.splitlines()[-2] == '[1 more rows]'
        assert string.splitlines()[-1] == '[1 more columns]'

        # Test masked: bitmask 0b00101101 marks rows 0, 2, 3, 5 as valid,
        # leaving rows 1 and 4 null.
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16])])

        data = np.arange(6)
        mask = np.zeros(1, dtype=np.uint8)
        mask[0] = 0b00101101

        masked = Series.from_masked_array(data, mask)
        assert masked.null_count == 2
        df['c'] = masked

        # check data
        values = list(masked)
        validids = [0, 2, 3, 5]
        densearray = masked.to_array()
        np.testing.assert_equal(data[validids], densearray)
        # valid position is correct
        for i in validids:
            assert data[i] == values[i]
        # null position is correct
        for i in range(len(values)):
            if i not in validids:
                assert values[i] is None

        # Nulls render as blanks in the formatted output below.
        got = df.to_string(nrows=None)
        print(got)
        expect = '''
  a b  c
0 1 11 0
1 2 12
2 3 13 2
3 4 14 3
4 5 15
5 6 16 5
'''
        # values should match despite whitespace difference
        assert got.split() == expect.split()
コード例 #9
0
ファイル: test_label_encode.py プロジェクト: zeichuan/cudf
def test_label_encode_drop_one():
    """Label-encode against a category list with one category removed;
    rows carrying the dropped category must map to -1."""
    random.seed(0)
    np.random.seed(0)

    df = DataFrame()
    df["cats"] = np.random.randint(7, size=10, dtype=np.int32)

    # Drop one category at random from the encoding vocabulary.
    vals = list(df["cats"].unique())
    del vals[random.randrange(len(vals))]

    lab = {cat: code for code, cat in enumerate(vals)}

    # Encode the series and verify each label; -1 marks a missing category.
    ncol = df["cats"].label_encoding(cats=vals, dtype="float32")
    arr = ncol.to_array()
    for i in range(arr.size):
        np.testing.assert_equal(arr[i], lab.get(df.cats[i], -1))

    # The frame-level API appends a "<prefix>_labels" column.
    df2 = df.label_encoding(column="cats",
                            prefix="cats",
                            cats=vals,
                            dtype="float32")
    assert df2.columns[0] == "cats"
    assert df2.columns[1] == "cats_labels"
コード例 #10
0
ファイル: test_joining.py プロジェクト: xiaolin1990/cudf
def test_dataframe_join_suffix():
    """Joining frames with overlapping column names requires suffixes;
    with suffixes supplied the result matches pandas."""
    np.random.seed(0)

    df = DataFrame()
    for name in 'abc':
        df[name] = np.random.randint(0, 5, 5)

    left = df.set_index('a')
    right = df.set_index('c')

    # Overlapping columns without suffixes must raise.
    with pytest.raises(ValueError) as raises:
        left.join(right)
    raises.match("there are overlapping columns but lsuffix"
                 " and rsuffix are not defined")

    got = left.join(right, lsuffix='_left', rsuffix='_right', sort=True)

    # Expected result from pandas on the same data.
    pddf = df.to_pandas()
    expect = pddf.set_index('a').join(pddf.set_index('c'),
                                      lsuffix='_left',
                                      rsuffix='_right')

    # Compare columns, index, and each series (nulls filled with -1).
    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for name in expect.columns:
        _check_series(expect[name].fillna(-1), got[name].fillna(-1))
コード例 #11
0
    def read_ipc_to_DF(self, source):
        '''
            description:
                Read arrow file from another dataframe already in the gpu
            input:
                source: file path (without the '.pickle' suffix)
            return:
                status message string ("data read successfully" or an
                error description)
        '''
        # SECURITY: pickle.load on an attacker-controlled file executes
        # arbitrary code, and eval() on its content compounds the risk.
        # Only use with trusted files. TODO(review): replace eval with a
        # safe deserialization.
        try:
            with open(source + '.pickle', 'rb') as handle:
                buffer = eval(pickle.load(handle))
            with open(source + '-col.pickle', 'rb') as handle:
                columns = list(pickle.load(handle))
            self.data_gpu = DataFrame()

            # Rebuild each column from its serialized IPC handle.
            for i, j in enumerate(buffer):
                temp_ipc_handler = pickle.loads(j)
                with temp_ipc_handler as temp_nd_array:
                    np_arr = np.zeros((temp_nd_array.size),
                                      dtype=temp_nd_array.dtype)
                    np_arr_gpu = cuda.to_device(np_arr)
                    np_arr_gpu.copy_to_device(temp_nd_array)
                    self.data_gpu[columns[i]] = cudf.Series(np_arr_gpu)

            self.back_up_dimension = self.data_gpu

        except Exception as e:
            # Bug fix: the attributes may not exist yet when the failure
            # happens early (e.g. file missing), so unconditional `del`
            # raised AttributeError and masked the original error.
            for attr in ('data_gpu', 'back_up_dimension'):
                if hasattr(self, attr):
                    delattr(self, attr)
            gc.collect()
            return "Exception *** in cudf read_ipc_to_DF():" + str(e)

        return "data read successfully"
コード例 #12
0
ファイル: test_replace.py プロジェクト: zeichuan/cudf
def test_dataframe_replace_with_nulls():
    """replace() with None targets followed by fillna must reproduce
    pandas' plain replace() results."""
    # Scalar replacement.
    base_pd = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, 3]})
    base_gd = DataFrame.from_pandas(base_pd)
    expect_scalar = base_pd.replace(0, 4)
    got_scalar = base_gd.replace(0, None).fillna(4)
    pd.testing.assert_frame_equal(got_scalar.to_pandas(), expect_scalar)

    # List inputs.
    expect_list = base_pd.replace([0, 1], [4, 5])
    got_list = base_gd.replace([0, 1], [4, None]).fillna(5)
    pd.testing.assert_frame_equal(got_list.to_pandas(), expect_list)

    expect_many = base_pd.replace([0, 1], 4)
    got_many = base_gd.replace([0, 1], None).fillna(4)
    pd.testing.assert_frame_equal(got_many.to_pandas(), expect_many)

    # Dict inputs.
    expect_dict = base_pd.replace({"a": 0, "b": 0}, {"a": 4, "b": 5})
    got_dict = base_gd.replace({"a": 0, "b": 0},
                               {"a": None, "b": 5}).fillna(4)
    pd.testing.assert_frame_equal(got_dict.to_pandas(), expect_dict)

    # A frame that already holds a null; result should equal the
    # list-replace expectation above.
    base_gd = DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, None]})
    got_null = base_gd.replace([0, 1], [4, 5]).fillna(3)
    pd.testing.assert_frame_equal(got_null.to_pandas(), expect_list)
コード例 #13
0
def test_dataframe_hash_partition(nrows, nparts, nkeys):
    """partition_by_hash must split rows into nparts frames whose key
    tuples never appear in more than one partition."""
    np.random.seed(123)
    gdf = DataFrame()
    keycols = []
    for i in range(nkeys):
        name = 'key{}'.format(i)
        gdf[name] = np.random.randint(0, 7 - i, nrows)
        keycols.append(name)
    gdf['val1'] = np.random.randint(0, nrows * 2, nrows)

    got = gdf.partition_by_hash(keycols, nparts=nparts)

    # A list with one DataFrame per partition is returned.
    assert isinstance(got, list)
    assert len(got) == nparts
    assert all(isinstance(p, DataFrame) for p in got)

    # Key-values must be disjoint across partitions.
    seen_keys = set()
    for part in got:
        if len(part):
            # Build the set of key-value tuples in this partition.
            keys = set(map(tuple, part.as_matrix(columns=keycols)))
            # None of them may have occurred in an earlier partition.
            assert not (keys & seen_keys)
            seen_keys |= keys
    assert len(seen_keys)
コード例 #14
0
def test_dataframe_as_gpu_matrix_null_values():
    """as_gpu_matrix must raise while nulls are present and succeed after
    the nulls are filled."""
    df = DataFrame()

    nelem = 123
    na = -10000

    refvalues = {}
    for name in 'abcd':
        df[name] = data = np.random.random(nelem)
        bitmask = utils.random_bitmask(nelem)
        df[name] = df[name].set_mask(bitmask)
        boolmask = np.asarray(utils.expand_bits_to_bytes(bitmask)[:nelem],
                              dtype=np.bool_)
        # Record what the column should hold once nulls are filled with `na`.
        data[~boolmask] = na
        refvalues[name] = data

    # Nulls present: conversion must fail and name the first bad column.
    with pytest.raises(ValueError) as raises:
        df.as_gpu_matrix()
    raises.match("column 'a' has null values")

    for name in df.columns:
        df[name] = df[name].fillna(na)

    # After fillna the matrix matches the recorded reference values.
    mat = df.as_gpu_matrix().copy_to_host()
    for i, name in enumerate(df.columns):
        np.testing.assert_array_equal(refvalues[name], mat[:, i])
コード例 #15
0
ファイル: test_string.py プロジェクト: zeichuan/cudf
def test_string_join_non_key(str_data, num_cols, how, how_raise):
    """Merge frames whose non-key columns are strings; some join types are
    expected to raise NotImplementedError."""
    other_data = [1, 2, 3, 4, 5][:len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for col in range(num_cols):
        pdf[col] = pd.Series(str_data, dtype="str")
        gdf[col] = Series(str_data, dtype="str")
    pdf["a"] = other_data
    gdf["a"] = other_data

    pdf2, gdf2 = pdf.copy(), gdf.copy()

    # how_raise selects whether this `how` should raise.
    expectation = raise_builder([how_raise], NotImplementedError)
    with expectation:
        expect = pdf.merge(pdf2, on=["a"], how=how)
        got = gdf.merge(gdf2, on=["a"], how=how)

        # Empty results need normalisation before comparing.
        if len(expect) == 0 and len(got) == 0:
            expect = expect.reset_index(drop=True)
            got = got[expect.columns]

        assert_eq(expect, got)
コード例 #16
0
def test_dataframe_setitem_from_masked_object():
    """NaNs and Nones must become a null mask no matter how the data enters
    cudf: host array, pandas round-trip, device array, or setitem list."""
    ary = np.random.randn(100)
    mask = np.zeros(100, dtype=bool)
    mask[:20] = True
    np.random.shuffle(mask)
    ary[mask] = np.nan

    # From a host ndarray.
    test1 = Series(ary)
    assert test1.has_null_mask
    assert test1.null_count == 20

    # Via a pandas round-trip.
    test2 = DataFrame.from_pandas(pd.DataFrame({'a': ary}))
    assert test2['a'].has_null_mask
    assert test2['a'].null_count == 20

    # From a device array.
    gpu_ary = rmm.to_device(ary)
    test3 = Series(gpu_ary)
    assert test3.has_null_mask
    assert test3.null_count == 20

    # From a plain list containing None values.
    test4 = DataFrame()
    lst = [1, 2, None, 4, 5, 6, None, 8, 9]
    test4['lst'] = lst
    assert test4['lst'].has_null_mask
    assert test4['lst'].null_count == 2
コード例 #17
0
def test_assign():
    """assign() returns a new frame with the extra column and leaves the
    original frame untouched."""
    gdf = DataFrame({'x': [1, 2, 3]})
    gdf2 = gdf.assign(y=gdf.x + 1)

    assert list(gdf.columns) == ['x']
    assert list(gdf2.columns) == ['x', 'y']
    np.testing.assert_equal(gdf2.y.to_array(), [2, 3, 4])
コード例 #18
0
def test_dataframe_empty_to_string():
    """An empty frame prints the pandas-style 'Empty DataFrame' text."""
    got = DataFrame().to_string()
    print(got)
    expect = "Empty DataFrame\nColumns: []\nIndex: []\n"
    # Token-by-token comparison to ignore whitespace differences.
    assert got.split() == expect.split()
コード例 #19
0
def test_dataframe_astype(nelem):
    """astype keeps column values intact while changing the dtype."""
    df = DataFrame()
    data = np.asarray(range(nelem), dtype=np.int32)
    df['a'] = data
    assert df['a'].dtype is np.dtype(np.int32)

    df['b'] = df['a'].astype(np.float32)
    assert df['b'].dtype is np.dtype(np.float32)
    np.testing.assert_equal(df['a'].to_array(), df['b'].to_array())
コード例 #20
0
ファイル: test_string.py プロジェクト: zeichuan/cudf
def test_string_slice():
    """str.slice on a cudf string column matches pandas and returns a
    cudf Series."""
    gdf = DataFrame({"a": ["hello", "world"]})
    pdf = pd.DataFrame({"a": ["hello", "world"]})

    a_slice_got = gdf.a.str.slice(0, 2)
    a_slice_expected = pdf.a.str.slice(0, 2)

    assert isinstance(a_slice_got, Series)
    assert_eq(a_slice_expected, a_slice_got)
コード例 #21
0
def test_query_local_dict():
    """query() resolves @names from local_dict, for both integer and
    datetime comparisons."""
    df = DataFrame()
    df['a'] = aa = np.arange(100)
    got = df.query("a < @val", local_dict={'val': 10})
    np.testing.assert_array_equal(aa[aa < 10], got['a'].to_array())

    # Datetime comparison through local_dict.
    df = DataFrame()
    data = np.array(['2018-10-07', '2018-10-08'], dtype='datetime64')
    df['datetimes'] = data
    search_date = datetime.datetime.strptime('2018-10-08', '%Y-%m-%d')
    got = df.query('datetimes==@search_date',
                   local_dict={'search_date': search_date})
    np.testing.assert_array_equal(data[1], got['datetimes'].to_array())
コード例 #22
0
def test_pickle_dataframe_numeric():
    """A frame of float columns must survive (de)serialization."""
    np.random.seed(0)
    nelem = 10
    df = DataFrame()
    df["keys"] = np.arange(nelem, dtype=np.float64)
    df["vals"] = np.random.random(nelem)

    check_serialization(df)
コード例 #23
0
def test_query_local_dict():
    """local_dict-driven queries, exercising an integer column first and
    then a datetime column."""
    frame = DataFrame()
    frame["a"] = values = np.arange(100)
    expr = "a < @val"
    result = frame.query(expr, local_dict={"val": 10})
    np.testing.assert_array_equal(values[values < 10],
                                  result["a"].to_array())

    # Same mechanism with a datetime column.
    frame = DataFrame()
    stamps = np.array(["2018-10-07", "2018-10-08"], dtype="datetime64")
    frame["datetimes"] = stamps
    search_date = datetime.datetime.strptime("2018-10-08", "%Y-%m-%d")
    expr = "datetimes==@search_date"
    result = frame.query(expr, local_dict={"search_date": search_date})
    np.testing.assert_array_equal(stamps[1],
                                  result["datetimes"].to_array())
コード例 #24
0
def test_pickle_dataframe_categorical():
    """A frame holding a categorical column must survive (de)serialization."""
    np.random.seed(0)

    df = DataFrame()
    df["keys"] = pd.Categorical("aaabababac")
    df["vals"] = np.random.random(len(df))

    check_serialization(df)
コード例 #25
0
ファイル: test_numpy_interop.py プロジェクト: zeichuan/cudf
def test_to_records_noindex():
    """to_records(index=False) yields a structured array without an index
    field and with the column data intact."""
    df = DataFrame()
    df["a"] = col_a = np.arange(10, dtype=np.int32)
    df["b"] = col_b = np.arange(10, 20, dtype=np.float64)

    rec = df.to_records(index=False)
    assert rec.dtype.names == ("a", "b")
    np.testing.assert_array_equal(rec["a"], col_a)
    np.testing.assert_array_equal(rec["b"], col_b)
コード例 #26
0
def test_groupby_apply_basic_agg_single_column():
    """Grouped single-column sum must match pandas."""
    gdf = DataFrame()
    gdf["key"] = [0, 0, 1, 1, 2, 2, 0]
    gdf["val"] = [0, 1, 2, 3, 4, 5, 6]
    gdf["mult"] = gdf["key"] * gdf["val"]
    pdf = gdf.to_pandas()

    got = gdf.groupby(["key", "val"]).mult.sum()
    expect = pdf.groupby(["key", "val"]).mult.sum()
    assert_eq(expect, got)
コード例 #27
0
def test_dataframe_empty_concat():
    """Concatenating two empty frames keeps zero rows and both columns."""
    gdf1 = DataFrame()
    gdf1['a'] = []
    gdf1['b'] = []

    combined = gd.concat([gdf1, gdf1.copy()])
    assert len(combined) == 0
    assert len(combined.columns) == 2
コード例 #28
0
def test_dataframe_emptycolumns_to_string():
    """A zero-row frame with columns prints their names but no rows."""
    df = DataFrame()
    df['a'] = []
    df['b'] = []

    got = df.to_string()
    print(got)
    expect = "Empty DataFrame\nColumns: ['a', 'b']\nIndex: []\n"
    # Whitespace-insensitive comparison.
    assert got.split() == expect.split()
コード例 #29
0
def test_groupby_apply_basic_agg_single_column():
    """Single-column sum aggregation after a two-key groupby matches pandas."""
    gdf = DataFrame()
    gdf['key'] = [0, 0, 1, 1, 2, 2, 0]
    gdf['val'] = [0, 1, 2, 3, 4, 5, 6]
    gdf['mult'] = gdf['key'] * gdf['val']
    pdf = gdf.to_pandas()

    cudf_result = gdf.groupby(['key', 'val']).mult.sum()
    pandas_result = pdf.groupby(['key', 'val']).mult.sum()
    assert_eq(pandas_result, cudf_result)
コード例 #30
0
def test_dataframe_append_to_empty():
    """Adding a sized column after an empty one must grow the cudf frame
    exactly as pandas does."""
    pdf = pd.DataFrame()
    pdf['a'] = []
    pdf['b'] = [1, 2, 3]

    gdf = DataFrame()
    gdf['a'] = []
    gdf['b'] = [1, 2, 3]

    # The cudf frame, converted back, must equal the pandas frame.
    pd.testing.assert_frame_equal(gdf.to_pandas(), pdf)