Example 1
def test_dataframe_replace_with_nulls():
    # numerical
    pdf1 = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, 3]})
    gdf1 = DataFrame.from_pandas(pdf1)
    pdf2 = pdf1.replace(0, 4)
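    # replacing 0 with None nulls those entries; fillna(4) then reproduces
    # pandas' replace(0, 4)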
    gdf2 = gdf1.replace(0, None).fillna(4)
    pd.testing.assert_frame_equal(gdf2.to_pandas(), pdf2)

    # list input
    pdf6 = pdf1.replace([0, 1], [4, 5])
    gdf6 = gdf1.replace([0, 1], [4, None]).fillna(5)
    pd.testing.assert_frame_equal(gdf6.to_pandas(), pdf6)

    pdf7 = pdf1.replace([0, 1], 4)
    gdf7 = gdf1.replace([0, 1], None).fillna(4)
    pd.testing.assert_frame_equal(gdf7.to_pandas(), pdf7)

    # dict input:
    pdf8 = pdf1.replace({"a": 0, "b": 0}, {"a": 4, "b": 5})
    gdf8 = gdf1.replace({"a": 0, "b": 0}, {"a": None, "b": 5}).fillna(4)
    pd.testing.assert_frame_equal(gdf8.to_pandas(), pdf8)

    gdf1 = DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, None]})
    gdf9 = gdf1.replace([0, 1], [4, 5]).fillna(3)
    pd.testing.assert_frame_equal(gdf9.to_pandas(), pdf6)
Example 2
    def read_ipc_to_DF(self, source):
        '''
            description:
                Read Arrow IPC handles shared by another dataframe that is
                already on the GPU
            input:
                source: file path (without the .pickle suffix)
            return:
                status message string
        '''

        try:
            with open(source + '.pickle', 'rb') as handle:
                # the pickle stores the repr of a list of serialized IPC handles
                buffer = eval(pickle.load(handle))
            with open(source + '-col.pickle', 'rb') as handle:
                columns = list(pickle.load(handle))
            self.data_gpu = DataFrame()

            for i, j in enumerate(buffer):
                temp_ipc_handler = pickle.loads(j)
                with temp_ipc_handler as temp_nd_array:
                    np_arr = np.zeros(temp_nd_array.size, dtype=temp_nd_array.dtype)
                    np_arr_gpu = cuda.to_device(np_arr)
                    np_arr_gpu.copy_to_device(temp_nd_array)
                    self.data_gpu[columns[i]] = cudf.Series(np_arr_gpu)

            self.back_up_dimension = self.data_gpu

        except Exception as e:
            del self.data_gpu
            del self.back_up_dimension
            gc.collect()
            return "Exception *** in cudf read_ipc_to_DF(): " + str(e)

        return "data read successfully"
Example 3
def test_string_join_non_key(str_data, num_cols, how, how_raise):
    other_data = [1, 2, 3, 4, 5][:len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for i in range(num_cols):
        pdf[i] = pd.Series(str_data, dtype='str')
        gdf[i] = Series(str_data, dtype='str')
    pdf['a'] = other_data
    gdf['a'] = other_data

    pdf2 = pdf.copy()
    gdf2 = gdf.copy()

    expectation = raise_builder([how_raise], NotImplementedError)

    with expectation:
        expect = pdf.merge(pdf2, on=['a'], how=how)
        got = gdf.merge(gdf2, on=['a'], how=how)

        if len(expect) == 0 and len(got) == 0:
            expect = expect.reset_index(drop=True)
            got = got[expect.columns]

        assert_eq(expect, got)
Example 4
def test_string_join_non_key_nulls(str_data_nulls):
    str_data = ['a', 'b', 'c', 'd', 'e']
    other_data = [1, 2, 3, 4, 5]

    other_data_nulls = [6, 7, 8, 9, 10][:len(str_data_nulls)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    pdf['vals'] = pd.Series(str_data, dtype='str')
    gdf['vals'] = Series(str_data, dtype='str')
    pdf['key'] = other_data
    gdf['key'] = other_data

    pdf2 = pd.DataFrame()
    gdf2 = DataFrame()
    pdf2['vals'] = pd.Series(str_data_nulls, dtype='str')
    gdf2['vals'] = Series(str_data_nulls, dtype='str')
    pdf2['key'] = pd.Series(other_data_nulls, dtype='int64')
    gdf2['key'] = Series(other_data_nulls, dtype='int64')

    expect = pdf.merge(pdf2, on='key', how='left')
    got = gdf.merge(gdf2, on='key', how='left')

    if len(expect) == 0 and len(got) == 0:
        expect = expect.reset_index(drop=True)
        got = got[expect.columns]

    assert_eq(expect, got)
Example 5
def test_dataframe_join_suffix():
    np.random.seed(0)

    df = DataFrame()
    for k in "abc":
        df[k] = np.random.randint(0, 5, 5)

    left = df.set_index("a")
    right = df.set_index("c")
    with pytest.raises(ValueError) as raises:
        left.join(right)
    raises.match("there are overlapping columns but lsuffix"
                 " and rsuffix are not defined")

    got = left.join(right, lsuffix="_left", rsuffix="_right", sort=True)
    # Get expected value
    pddf = df.to_pandas()
    expect = pddf.set_index("a").join(pddf.set_index("c"),
                                      lsuffix="_left",
                                      rsuffix="_right")
    # Check
    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for k in expect.columns:
        _check_series(expect[k].fillna(-1), got[k].fillna(-1))
Example 6
def test_dataframe_hash_partition(nrows, nparts, nkeys):
    np.random.seed(123)
    gdf = DataFrame()
    keycols = []
    for i in range(nkeys):
        keyname = 'key{}'.format(i)
        gdf[keyname] = np.random.randint(0, 7 - i, nrows)
        keycols.append(keyname)
    gdf['val1'] = np.random.randint(0, nrows * 2, nrows)

    got = gdf.partition_by_hash(keycols, nparts=nparts)
    # Must return a list
    assert isinstance(got, list)
    # Must have correct number of partitions
    assert len(got) == nparts
    # All partitions must be DataFrame type
    assert all(isinstance(p, DataFrame) for p in got)
    # Check that all partitions have unique keys
    part_unique_keys = set()
    for p in got:
        if len(p):
            # Take rows of the key columns and build a set of the key-values
            unique_keys = set(map(tuple, p.as_matrix(columns=keycols)))
            # Ensure that none of the key-values have occurred in other groups
            assert not (unique_keys & part_unique_keys)
            part_unique_keys |= unique_keys
    assert len(part_unique_keys)
Example 7
def test_dataframe_pairs_of_triples(pairs, max, rows, how):
    np.random.seed(0)

    pdf_left = pd.DataFrame()
    pdf_right = pd.DataFrame()
    for left_column in pairs[0]:
        pdf_left[left_column] = np.random.randint(0, max, rows)
    for right_column in pairs[1]:
        pdf_right[right_column] = np.random.randint(0, max, rows)
    gdf_left = DataFrame.from_pandas(pdf_left)
    gdf_right = DataFrame.from_pandas(pdf_right)
    if not set(pdf_left.columns).intersection(pdf_right.columns):
        with pytest.raises(pd.core.reshape.merge.MergeError) as raises:
            pdf_left.merge(pdf_right)
        raises.match("No common columns to perform merge on")
        with pytest.raises(ValueError) as raises:
            gdf_left.merge(gdf_right)
        raises.match("No common columns to perform merge on")
    elif not [value for value in pdf_left if value in pdf_right]:
        with pytest.raises(pd.core.reshape.merge.MergeError) as raises:
            pdf_left.merge(pdf_right)
        raises.match("No common columns to perform merge on")
        with pytest.raises(ValueError) as raises:
            gdf_left.merge(gdf_right)
        raises.match("No common columns to perform merge on")
    else:
        pdf_result = pdf_left.merge(pdf_right, how=how)
        gdf_result = gdf_left.merge(gdf_right, how=how)
        assert np.array_equal(gdf_result.columns, pdf_result.columns)
        for column in gdf_result:
            assert np.array_equal(gdf_result[column].fillna(-1).sort_values(),
                                  pdf_result[column].fillna(-1).sort_values())
Example 8
def test_assign():
    gdf = DataFrame({'x': [1, 2, 3]})
    gdf2 = gdf.assign(y=gdf.x + 1)
    assert list(gdf.columns) == ['x']
    assert list(gdf2.columns) == ['x', 'y']

    np.testing.assert_equal(gdf2.y.to_array(), [2, 3, 4])
Example 9
def test_dataframe_setitem_from_masked_object():
    ary = np.random.randn(100)
    mask = np.zeros(100, dtype=bool)
    mask[:20] = True
    np.random.shuffle(mask)
    ary[mask] = np.nan
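    # the NaNs injected above should surface as nulls in cudf containers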

    test1 = Series(ary)
    assert test1.has_null_mask
    assert test1.null_count == 20

    test2 = DataFrame.from_pandas(pd.DataFrame({'a': ary}))
    assert test2['a'].has_null_mask
    assert test2['a'].null_count == 20

    gpu_ary = rmm.to_device(ary)
    test3 = Series(gpu_ary)
    assert test3.has_null_mask
    assert test3.null_count == 20

    test4 = DataFrame()
    lst = [1, 2, None, 4, 5, 6, None, 8, 9]
    test4['lst'] = lst
    assert test4['lst'].has_null_mask
    assert test4['lst'].null_count == 2
Example 10
def test_dataframe_as_gpu_matrix_null_values():
    df = DataFrame()

    nelem = 123
    na = -10000

    refvalues = {}
    for k in 'abcd':
        df[k] = data = np.random.random(nelem)
        bitmask = utils.random_bitmask(nelem)
        df[k] = df[k].set_mask(bitmask)
        boolmask = np.asarray(utils.expand_bits_to_bytes(bitmask)[:nelem],
                              dtype=np.bool_)
        data[~boolmask] = na
        refvalues[k] = data

    # Check null value causes error
    with pytest.raises(ValueError) as raises:
        df.as_gpu_matrix()
    raises.match("column 'a' has null values")

    for k in df.columns:
        df[k] = df[k].fillna(na)

    mat = df.as_gpu_matrix().copy_to_host()
    for i, k in enumerate(df.columns):
        np.testing.assert_array_equal(refvalues[k], mat[:, i])
Example 11
def test_label_encode_drop_one():
    random.seed(0)
    np.random.seed(0)

    df = DataFrame()

    # initialize data frame
    df['cats'] = np.random.randint(7, size=10, dtype=np.int32)
    vals = list(df['cats'].unique())
    # drop 1 randomly
    del vals[random.randrange(len(vals))]

    lab = dict(zip(vals, list(range(len(vals)))))

    # label encode series
    ncol = df['cats'].label_encoding(cats=vals, dtype='float32')
    arr = ncol.to_array()

    # verify labels of new column

    for i in range(arr.size):
        # assuming -1 is used for missing value
        np.testing.assert_equal(arr[i], lab.get(df.cats[i], -1))

    # label encode data frame
    df2 = df.label_encoding(column='cats', prefix='cats', cats=vals,
                            dtype='float32')

    assert df2.columns[0] == 'cats'
    assert df2.columns[1] == 'cats_labels'
Example 12
def test_dataframe_empty_to_string():
    # Test for printing empty dataframe
    df = DataFrame()
    got = df.to_string()
    print(got)
    expect = "Empty DataFrame\nColumns: []\nIndex: []\n"
    # values should match despite whitespace difference
    assert got.split() == expect.split()
Example 13
def test_dataframe_empty_merge():
    gdf1 = DataFrame([("a", []), ("b", [])])
    gdf2 = DataFrame([("a", []), ("c", [])])

    expect = DataFrame([("a", []), ("b", []), ("c", [])])
    got = gdf1.merge(gdf2, how="left", on=["a"])

    assert_eq(expect, got)
Example 14
def test_merge_left_right_index_left_right_on_kwargs2(kwargs):
    left = pd.DataFrame({'x': [1, 2, 3]}, index=[10, 20, 30])
    right = pd.DataFrame({'y': [10, 20, 30]}, index=[1, 2, 30])
    gleft = DataFrame.from_pandas(left)
    gright = DataFrame.from_pandas(right)
    gd_merge = gleft.merge(gright, **kwargs)
    pd_merge = left.merge(right, **kwargs)
    if pd_merge.empty:
        assert gd_merge.empty
Example 15
def test_merge_left_right_index_left_right_on_kwargs(kwargs):
    left = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6]}, index=[1, 2, 3, 4, 5, 6])
    right = pd.DataFrame({'y': [10, 20, 30, 6, 5, 4]},
                         index=[1, 2, 3, 4, 5, 7])
    gleft = DataFrame.from_pandas(left)
    gright = DataFrame.from_pandas(right)
    pd_merge = left.merge(right, **kwargs)
    gd_merge = gleft.merge(gright, **kwargs)
    assert_eq(pd_merge, gd_merge)
Example 16
def test_onehot_get_dummies_simple():
    df = DataFrame({'x': np.arange(10)})
    original = df.copy()
    encoded = get_dummies(df, prefix='test')

    assert_eq(df, original)  # the original df should be unchanged
    cols = list(encoded.columns)[1:]
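    # each dummy column contains a single 1, so together they form an identity matrix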
    actual = DataFrame(dict(zip(cols, np.eye(len(cols)))))
    assert (encoded.loc[:, cols] == actual).all().all()
Example 17
def test_to_records_noindex():
    df = DataFrame()
    df["a"] = aa = np.arange(10, dtype=np.int32)
    df["b"] = bb = np.arange(10, 20, dtype=np.float64)

    rec = df.to_records(index=False)
    assert rec.dtype.names == ("a", "b")
    np.testing.assert_array_equal(rec["a"], aa)
    np.testing.assert_array_equal(rec["b"], bb)
Example 18
def test_dataframe_empty_concat():
    gdf1 = DataFrame()
    gdf1['a'] = []
    gdf1['b'] = []

    gdf2 = gdf1.copy()

    gdf3 = gd.concat([gdf1, gdf2])
    assert len(gdf3) == 0
    assert len(gdf3.columns) == 2
Example 19
def test_dataframe_append_to_empty():
    pdf = pd.DataFrame()
    pdf['a'] = []
    pdf['b'] = [1, 2, 3]

    gdf = DataFrame()
    gdf['a'] = []
    gdf['b'] = [1, 2, 3]

    pd.testing.assert_frame_equal(gdf.to_pandas(), pdf)
Example 20
def test_dataframe_emptycolumns_to_string():
    # Test for printing dataframe having empty columns
    df = DataFrame()
    df['a'] = []
    df['b'] = []
    got = df.to_string()
    print(got)
    expect = "Empty DataFrame\nColumns: ['a', 'b']\nIndex: []\n"
    # values should match despite whitespace difference
    assert got.split() == expect.split()
Example 21
def test_groupby_apply_basic_agg_single_column():
    gdf = DataFrame()
    gdf['key'] = [0, 0, 1, 1, 2, 2, 0]
    gdf['val'] = [0, 1, 2, 3, 4, 5, 6]
    gdf['mult'] = gdf['key'] * gdf['val']
    pdf = gdf.to_pandas()

    gdg = gdf.groupby(['key', 'val']).mult.sum()
    pdg = pdf.groupby(['key', 'val']).mult.sum()
    assert_eq(pdg, gdg)
Example 22
def test_merge_left_index_zero():
    left = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6]}, index=[0, 1, 2, 3, 4, 5])
    right = pd.DataFrame({'y': [10, 20, 30, 6, 5, 4]},
                         index=[0, 1, 2, 3, 4, 6])
    gleft = DataFrame.from_pandas(left)
    gright = DataFrame.from_pandas(right)
    pd_merge = left.merge(right, left_on="x", right_on='y')
    gd_merge = gleft.merge(gright, left_on="x", right_on='y')

    assert_eq(pd_merge, gd_merge)
Example 23
def test_dataframe_join_cats():
    lhs = DataFrame()
    lhs['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    lhs['b'] = bb = np.arange(len(lhs))
    lhs = lhs.set_index('a')

    rhs = DataFrame()
    rhs['a'] = pd.Categorical(list('abcac'), categories=list('abc'))
    rhs['c'] = cc = np.arange(len(rhs))
    rhs = rhs.set_index('a')

    got = lhs.join(rhs)
    expect = lhs.to_pandas().join(rhs.to_pandas())

    # Note: pandas makes an object Index after joining
    pd.util.testing.assert_frame_equal(
        got.sort_values(by='b').to_pandas().sort_index().reset_index(
            drop=True), expect.reset_index(drop=True))

    # Just do some rough checking here.
    assert list(got.columns) == ['b', 'c']
    assert len(got) > 0
    assert set(got.index.values) & set('abc')
    assert set(got['b']) & set(bb)
    assert set(got['c']) & set(cc)
Example 24
def test_dataframe_sort_values(nelem, dtype):
    np.random.seed(0)
    df = DataFrame()
    df["a"] = aa = (100 * np.random.random(nelem)).astype(dtype)
    df["b"] = bb = (100 * np.random.random(nelem)).astype(dtype)
    sorted_df = df.sort_values(by="a")
    # Check
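    # stable mergesort makes the expected order deterministic when values tie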
    sorted_index = np.argsort(aa, kind="mergesort")
    np.testing.assert_array_equal(sorted_df.index.values, sorted_index)
    np.testing.assert_array_equal(sorted_df["a"], aa[sorted_index])
    np.testing.assert_array_equal(sorted_df["b"], bb[sorted_index])
Example 25
def test_to_records_withindex():
    df = DataFrame()
    df['a'] = aa = np.arange(10, dtype=np.int32)
    df['b'] = bb = np.arange(10, 20, dtype=np.float64)

    rec_indexed = df.to_records(index=True)
    assert rec_indexed.size == len(aa)
    assert rec_indexed.dtype.names == ('index', 'a', 'b')
    np.testing.assert_array_equal(rec_indexed['a'], aa)
    np.testing.assert_array_equal(rec_indexed['b'], bb)
    np.testing.assert_array_equal(rec_indexed['index'], np.arange(10))
Example 26
def test_reading_arrow_sparse_data():
    schema, darr = read_data()
    gar = GpuArrowReader(schema, darr)

    df = DataFrame(gar.to_dict().items())

    # preprocessing
    num_cols = set()
    cat_cols = set()
    response_set = set(['INCEARN '])
    feature_names = set(df.columns) - response_set

    # Determine cat and numeric columns
    uniques = {}
    for k in feature_names:
        try:
            uniquevals = df[k].unique()
            uniques[k] = uniquevals
        except ValueError:
            num_cols.add(k)
        else:
            nunique = len(uniquevals)
            if nunique < 2:
                del df[k]
            elif 1 < nunique < 1000:
                cat_cols.add(k)
            else:
                num_cols.add(k)

    # Fix numeric columns
    for k in (num_cols - response_set):
        df[k] = df[k].fillna(df[k].mean())
        assert df[k].null_count == 0
        std = df[k].std()
        # drop near constant columns
        if not np.isfinite(std) or std < 1e-4:
            del df[k]
            print('drop near constant', k)
        else:
            df[k] = df[k].scale()

    # Expand categorical columns
    for k in cat_cols:
        cats = uniques[k][1:]  # drop first
        df = df.one_hot_encoding(k, prefix=k, cats=cats)
        del df[k]

    # All remaining columns should be float64
    assert {df[k].dtype for k in df.columns} == {np.dtype('float64')}

    mat = df.as_matrix()

    assert mat.max() == 1
    assert mat.min() == 0
Example 27
def test_df_cat_sort_index():
    df = DataFrame()
    df['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    df['b'] = np.arange(len(df))

    got = df.set_index('a').sort_index()
    expect = df.to_pandas().set_index('a').sort_index()

    assert list(expect.columns) == list(got.columns)
    assert list(expect.index.values) == list(got.index.values)
    np.testing.assert_array_equal(expect.index.values, got.index.values)
    np.testing.assert_array_equal(expect['b'].values, got['b'].to_array())