コード例 #1
0
ファイル: test_one_hot_encoder.py プロジェクト: vinaydes/cuml
def test_onehot_drop_idx_first(client):
    """Check ``drop='first'`` on a dask_cudf frame against scikit-learn.

    The dense encoding must equal the scikit-learn result for the same rows,
    and ``inverse_transform`` must give back the original frame.
    """
    rows = [['c', 2, 'a'], ['b', 2, 'b']]
    X = DataFrame({'chars': ['c', 'b'], 'int': [2, 2], 'letters': ['a', 'b']})
    distributed = dask_cudf.from_cudf(X, npartitions=2)

    gpu_encoder = OneHotEncoder(sparse=False, drop='first')
    cpu_encoder = SkOneHotEncoder(sparse=False, drop='first')

    encoded = gpu_encoder.fit_transform(distributed)
    expected = cpu_encoder.fit_transform(rows)
    cp.testing.assert_array_equal(encoded.compute(), expected)

    # Round trip: decoding the encoded matrix must reproduce the input.
    decoded = gpu_encoder.inverse_transform(encoded)
    assert_frame_equal(decoded.compute().to_pandas(), X.to_pandas())
コード例 #2
0
 def transform(self, columns, gdf: cudf.DataFrame) -> cudf.DataFrame:
     """Merge ``gdf`` with ``self._ext`` and return rows in ``gdf``'s order.

     A temporary row-number column is added to ``gdf`` before the merge so
     the merged result can be sorted back into the original row order. The
     temporary column is removed from both frames before returning.

     Parameters
     ----------
     columns
         Not used by this method.
     gdf : cudf.DataFrame
         Left-hand frame of the merge. It is modified only for the duration
         of the call (the temporary column is added and then dropped).

     Returns
     -------
     cudf.DataFrame
         The merged frame, sorted by original row position, with a fresh
         index.
     """
     tmp = "__tmp__"  # Temporary column for sorting
     gdf[tmp] = cupy.arange(len(gdf), dtype="int32")
     try:
         new_gdf = gdf.merge(self._ext,
                             left_on=self.on,
                             right_on=self.on_ext,
                             how=self.how)
     finally:
         # Always strip the helper column from the caller's frame, even when
         # the merge raises, so a failure does not leave `gdf` polluted.
         gdf.drop(columns=[tmp], inplace=True)
     new_gdf = new_gdf.sort_values(tmp)
     new_gdf.drop(columns=[tmp], inplace=True)
     new_gdf.reset_index(drop=True, inplace=True)
     return new_gdf
コード例 #3
0
def test_query_env_changing():
    """Run the same ``@c`` query twice with different values of local ``c``.

    Each call is expected to filter with the value ``c`` holds at that moment.
    """
    host_values = np.arange(100)
    df = DataFrame()
    df["a"] = host_values
    query_text = "a < @c"

    # The local must stay named `c`: the query expression refers to it.
    # First pass.
    c = 10
    filtered = df.query(query_text)
    np.testing.assert_array_equal(
        host_values[host_values < c], filtered["a"].to_array()
    )

    # Second pass after rebinding `c`.
    c = 50
    filtered = df.query(query_text)
    np.testing.assert_array_equal(
        host_values[host_values < c], filtered["a"].to_array()
    )
コード例 #4
0
ファイル: test_one_hot_encoder.py プロジェクト: vinaydes/cuml
def test_onehot_inverse_transform_handle_unknown(as_array):
    """Decode a one-hot matrix containing an all-zero category group.

    With ``handle_unknown='ignore'`` the first row has no active ``chars``
    column, and the expected decoded value for it is ``None``.
    """
    X = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    Y_ohe = cp.array([[0., 0., 1., 0.],
                      [0., 1., 0., 1.]])
    if as_array:
        # Array input: the expected frame uses positional column labels and
        # the character code of 'b'.
        X = _from_df_to_cupy(X)
        expected = DataFrame({0: [None, ord('b')], 1: [0, 2]})
    else:
        expected = DataFrame({'chars': [None, 'b'], 'int': [0, 2]})

    encoder = OneHotEncoder(handle_unknown='ignore').fit(X)
    decoded = encoder.inverse_transform(Y_ohe)
    assert_inverse_equal(decoded, expected)
コード例 #5
0
ファイル: test_one_hot_encoder.py プロジェクト: teju85/cuml
def test_onehot_fit_handle_unknown(cluster):
    """Fit with user-supplied categories that miss a value present in ``X``.

    ``handle_unknown='error'`` is expected to raise ``KeyError`` during
    ``fit``; ``handle_unknown='ignore'`` is expected to fit without raising.
    """
    client = Client(cluster)
    try:
        X = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
        Y = DataFrame({'chars': ['c', 'b'], 'int': [0, 2]})
        X = dask_cudf.from_cudf(X, npartitions=2)

        enc = OneHotEncoder(handle_unknown='error', categories=Y)
        with pytest.raises(KeyError):
            enc.fit(X)

        enc = OneHotEncoder(handle_unknown='ignore', categories=Y)
        enc.fit(X)
    finally:
        # Close the client even when the test body fails, so a failing test
        # does not leave the connection open.
        client.close()
コード例 #6
0
ファイル: shapefile.py プロジェクト: fagan2888/cuspatial
def read_polygon_shapefile(filename):
    """Read a polygon shapefile into two cudf DataFrames.

    Returns a tuple ``(offsets, points)`` where ``offsets`` has the columns
    ``f_pos`` and ``r_pos`` and ``points`` has the columns ``x`` and ``y``,
    taken from the first four items of the native reader's result.
    """
    parsed = cpp_read_polygon_shapefile(filename)
    offsets = DataFrame({"f_pos": parsed[0], "r_pos": parsed[1]})
    points = DataFrame({"x": parsed[2], "y": parsed[3]})
    return offsets, points
コード例 #7
0
ファイル: test_one_hot_encoder.py プロジェクト: teju85/cuml
def test_onehot_inverse_transform_handle_unknown(cluster):
    """Decode a dask one-hot array containing an all-zero category group.

    With ``handle_unknown='ignore'`` the first row has no active ``chars``
    column, and the expected decoded value for it is ``None``.
    """
    client = Client(cluster)
    try:
        X = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
        X = dask_cudf.from_cudf(X, npartitions=2)
        Y_ohe = cp.array([[0., 0., 1., 0.], [0., 1., 0., 1.]])
        Y_ohe = da.from_array(Y_ohe)

        enc = OneHotEncoder(handle_unknown='ignore')
        enc = enc.fit(X)
        df = enc.inverse_transform(Y_ohe)
        ref = DataFrame({'chars': [None, 'b'], 'int': [0, 2]})
        assert_frame_equal(df.compute().to_pandas(), ref.to_pandas())
    finally:
        # Close the client even when an assertion fails, so a failing test
        # does not leave the connection open.
        client.close()
コード例 #8
0
ファイル: multiindex.py プロジェクト: jimmytuc/cudf
    def _popn(self, n):
        """Build a new MultiIndex that omits the first ``n`` levels.

        The levels, code columns and names from position ``n`` onward are
        carried over into the new index; this index is not modified.
        """
        from cudf import DataFrame

        kept_codes = DataFrame()
        for column_name in self.codes.columns[n:]:
            kept_codes.add_column(column_name, self.codes[column_name])

        trimmed = MultiIndex(self.levels[n:], kept_codes)
        trimmed.names = self.names[n:]
        return trimmed
コード例 #9
0
def test_onehot_fit_handle_unknown(as_array):
    """Fit with user-supplied categories that miss a value present in ``X``.

    ``handle_unknown='error'`` is expected to raise ``KeyError`` during
    ``fit``; ``handle_unknown='ignore'`` is expected to fit without raising.
    """
    X = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    Y = DataFrame({'chars': ['c', 'b'], 'int': [0, 2]})
    if as_array:
        X, Y = _from_df_to_cupy(X), _from_df_to_cupy(Y)

    strict_encoder = OneHotEncoder(handle_unknown='error', categories=Y)
    with pytest.raises(KeyError):
        strict_encoder.fit(X)

    lenient_encoder = OneHotEncoder(handle_unknown='ignore', categories=Y)
    lenient_encoder.fit(X)
コード例 #10
0
def test_categorical_basic(data):
    """Compare categorical behaviour of dask-cudf with pandas and dask.

    The first half checks a categorical Series (codes, dtype, ``ordered``,
    categories, printed form). The second half checks a string column cast to
    ``category`` on both dask-cudf and dask frames, before and after
    ``categorize()``.
    """
    from cudf.tests.utils import assert_eq

    categorical = data.copy()
    pandas_series = pd.Series(categorical)
    dask_series = dgd.from_cudf(Series(categorical), npartitions=2)
    computed = dask_series.compute()
    np.testing.assert_array_equal(categorical.codes, computed.to_array())

    assert dask_series.dtype.to_pandas() == pandas_series.dtype
    # Accessor attributes must match pandas.
    assert pandas_series.cat.ordered == dask_series.cat.ordered

    assert tuple(pandas_series.cat.categories) == tuple(
        dask_series.cat.categories
    )

    np.testing.assert_array_equal(
        pandas_series.cat.codes.data, computed.to_array()
    )
    np.testing.assert_array_equal(
        pandas_series.cat.codes.dtype, dask_series.cat.codes.dtype
    )

    expected_repr = """
0 a
1 a
2 b
3 c
4 a
"""
    # Compare whitespace-separated tokens pairwise; zip stops at the shorter
    # of the two token lists.
    token_pairs = zip(str(computed).split(), expected_repr.split())
    for actual_token, expected_token in token_pairs:
        assert actual_token == expected_token

    df = DataFrame()
    df["a"] = ["xyz", "abc", "def"] * 10

    pdf = df.to_pandas()
    cddf = dgd.from_cudf(df, 1)
    cddf["b"] = cddf["a"].astype("category")

    ddf = dd.from_pandas(pdf, 1)
    ddf["b"] = ddf["a"].astype("category")

    assert_eq(ddf._meta_nonempty["b"], cddf._meta_nonempty["b"])

    # Before categorize(), reading the categories is expected to raise on
    # both backends.
    with pytest.raises(NotImplementedError):
        cddf["b"].cat.categories

    with pytest.raises(NotImplementedError):
        ddf["b"].cat.categories

    cddf = cddf.categorize()
    ddf = ddf.categorize()

    assert_eq(ddf["b"].cat.categories, cddf["b"].cat.categories)
    assert_eq(ddf["b"].cat.ordered, cddf["b"].cat.ordered)
コード例 #11
0
def test_dataframe_sort_values_ignore_index(index, ignore_index):
    """Sort by every column and compare cudf with pandas for ``ignore_index``."""
    columns = {
        "a": [1, 3, 5, 2, 4],
        "b": [1, 1, 2, 2, 3],
        "c": [9, 7, 7, 7, 1],
    }
    gdf = DataFrame(columns).set_index(index)
    pdf = gdf.to_pandas()

    expected = pdf.sort_values(list(pdf.columns), ignore_index=ignore_index)
    actual = gdf.sort_values((gdf.columns), ignore_index=ignore_index)

    assert_eq(expected, actual)
コード例 #12
0
 def _to_frame(self):
     """Return the codes frame with each code column replaced by level values.

     For every code column a two-column lookup table (position -> level
     value) is built and merged with the codes; the merged ``level`` column
     then replaces the original code column.
     """
     from cudf import DataFrame

     frame = self.codes.copy(deep=False)
     for position, name in enumerate(frame.columns):
         level_values = self.levels[position]
         # Positions 0..len(level)-1, typed like the code column so the merge
         # keys have matching dtypes.
         positions = Series(
             cudautils.arange(len(level_values), dtype=frame[name].dtype)
         )
         lookup = DataFrame({'idx': positions, 'level': level_values})
         codes = DataFrame({'idx': frame[name]})
         # The merge acts as a "replace code by level value" operation.
         frame[name] = codes.merge(lookup).level
     return frame
コード例 #13
0
ファイル: test_indexing.py プロジェクト: wenxiang-Li/cudf
def test_dataframe_masked_slicing(nelem, slice_start, slice_end):
    """Slice a frame whose columns carry random masks, before and after
    converting to pandas, and compare the two results (dtype not checked).
    """
    gdf = DataFrame()
    for name, first in (("a", 0), ("b", nelem)):
        gdf[name] = list(range(first, first + nelem))
    for name in ("a", "b"):
        gdf[name] = gdf[name].set_mask(utils.random_bitmask(nelem))

    window = slice(slice_start, slice_end)

    expected = gdf.to_pandas()[window]
    actual = gdf[window].to_pandas()

    assert_eq(expected, actual, check_dtype=False)
コード例 #14
0
ファイル: multiindex.py プロジェクト: jimmytuc/cudf
 def _get_row_major(self, df, row_tuple):
     """Select the rows of ``df`` matching ``row_tuple`` and rebuild its index.

     The rows are chosen with ``self._compute_validity_mask`` and gathered
     with ``df.take``. The index of the result then depends on how many index
     levels are left after the ``len(row_tuple)`` leading levels:

     * exactly one level left: the index becomes a ``StringIndex`` built from
       the last level's values;
     * more than one level left: the leading ``len(row_tuple)`` levels are
       removed with ``_popn``;
     * no level left: the index of the gathered rows is kept as it is.

     Parameters
     ----------
     df
         Frame being indexed; its ``index`` must expose ``levels``, ``codes``
         and ``names``.
     row_tuple
         Key whose length gives the number of leading levels consumed.

     Returns
     -------
     The gathered rows with the rebuilt index.
     """
     valid_indices = self._compute_validity_mask(df, row_tuple)
     from cudf import Series
     result = df.take(Series(valid_indices))
     # Build new index - INDEX based MultiIndex
     # ---------------
     from cudf import DataFrame
     # This frame is only inspected for its number of columns below.
     out_index = DataFrame()
     # Select the last n-k columns where n is the number of source
     # levels and k is the length of the indexing tuple
     for k in range(len(row_tuple), len(df.index.levels)):
         out_index.add_column(df.index.names[k],
                              df.index.codes[df.index.codes.columns[k]])
     # If there's only one column remaining in the output index, convert
     # it into a StringIndex and name the final index values according
     # to the proper codes.
     if len(out_index.columns) == 1:
         # `out_index` is reused as a plain list of level values, one per
         # result row, looked up from the last code column.
         out_index = []
         for val in result.index.codes[result.index.codes.columns[len(result.index.codes.columns)-1]]:  # noqa: E501
             out_index.append(result.index.levels[
                     len(result.index.codes.columns)-1][val])
         # TODO: Warning! The final index column could be arbitrarily
         # ordered integers, not Strings, so we need to check for that
         # dtype and produce a GenericIndex instead of a StringIndex
         out_index = StringIndex(out_index)
         out_index.name = result.index.names[len(result.index.names)-1]
         result.index = out_index
     else:
         # Otherwise pop the leftmost levels, names, and codes from the
         # source index until it has the correct number of columns (n-k)
         if(len(out_index.columns)) > 0:
             # NOTE(review): the return value of reset_index is discarded,
             # so this call appears to have no effect on `result` -- confirm
             # before removing or changing it (the next line relies on
             # `result.index` still providing `_popn`).
             result.reset_index(drop=True)
             result.index = result.index._popn(len(row_tuple))
     return result
コード例 #15
0
ファイル: test_accessor.py プロジェクト: sperlingxx/cudf
def test_str_slice():
    """Compare ``str.split(",", expand=True)`` between dask-cudf and pandas
    for split limits of 1 and 2.
    """
    df = DataFrame({"a": ["abc,def,123", "xyz,hi,bye"]})

    ddf = dgd.from_cudf(df, 1)
    pdf = df.to_pandas()

    for limit in (1, 2):
        dd.assert_eq(
            pdf.a.str.split(",", expand=True, n=limit),
            ddf.a.str.split(",", expand=True, n=limit),
        )
コード例 #16
0
ファイル: test_dns_extractor.py プロジェクト: gbatmaz/clx
def test_get_hostname_split_df():
    """Check that hostnames are split into one column per dot-separated part.

    The expected frame lists columns 4 down to 0, with empty strings where a
    hostname has fewer parts.
    """
    hostnames = DataFrame({
        "hostname": [
            "forums.news.cnn.com.ac",
            "forums.news.cnn.ac",
            "b.cnn.com",
        ]
    })

    # Column order (4 -> 0) is part of the expectation; keep it as written.
    expected = DataFrame({
        4: ["ac", "", ""],
        3: ["com", "ac", ""],
        2: ["cnn", "cnn", "com"],
        1: ["news", "news", "cnn"],
        0: ["forums", "forums", "b"],
    })

    actual = dns.get_hostname_split_df(hostnames["hostname"])
    assert actual.equals(expected)
コード例 #17
0
ファイル: test_one_hot_encoder.py プロジェクト: teju85/cuml
def test_onehot_drop_one_of_each(cluster):
    """Drop one named category per feature and compare with scikit-learn.

    The dense encoding must equal the scikit-learn result for the same rows,
    and ``inverse_transform`` must give back the original frame.
    """
    client = Client(cluster)
    try:
        X_ary = [['c', 2, 'a'], ['b', 2, 'b']]
        X = DataFrame(
            {'chars': ['c', 'b'], 'int': [2, 2], 'letters': ['a', 'b']}
        )
        ddf = dask_cudf.from_cudf(X, npartitions=2)

        drop = {'chars': 'b', 'int': 2, 'letters': 'b'}
        enc = OneHotEncoder(sparse=False, drop=drop)
        sk_enc = SkOneHotEncoder(sparse=False, drop=['b', 2, 'b'])
        ohe = enc.fit_transform(ddf)
        ref = sk_enc.fit_transform(X_ary)
        cp.testing.assert_array_equal(ohe.compute(), ref)
        inv = enc.inverse_transform(ohe)
        assert_frame_equal(inv.compute().to_pandas(), X.to_pandas())
    finally:
        # Close the client even when an assertion fails, so a failing test
        # does not leave the connection open.
        client.close()
コード例 #18
0
def test_onehot_generic_index():
    """One-hot encode a column whose Series has a random integer index."""
    np.random.seed(0)
    size = 33
    # Draw order matters for reproducibility: index labels first, then values.
    index_labels = np.random.randint(low=0, high=100, size=size)
    df = DataFrame()
    values = np.random.randint(low=0, high=4, size=size)
    df["fo"] = Series(values, index=Index(index_labels))

    encoded = df.one_hot_encoding(
        "fo", cats=df.fo.unique(), prefix="fo", dtype=np.int32
    )

    categories = range(4)
    expected_columns = {"fo"} | {f"fo_{cat}" for cat in categories}
    assert set(encoded.columns) == expected_columns

    for cat in categories:
        indicator = getattr(encoded, f"fo_{cat}")
        np.testing.assert_array_equal(values == cat, indicator.to_array())
コード例 #19
0
def test_memory_usage_dataframe():
    """Bound ``memory_usage`` from below by the host buffers and from above
    by the size of the pickled frame.
    """
    np.random.seed(0)
    df = DataFrame()
    nelem = 1000

    host_keys = np.arange(nelem, dtype=np.float64)
    df["keys"] = host_keys
    host_vals = np.random.random(nelem)
    df["vals"] = host_vals

    host_bytes = host_keys.nbytes + host_vals.nbytes
    reported = df.memory_usage().sum()
    assert reported >= host_bytes

    payload = pickle.dumps(df, protocol=pickle.HIGHEST_PROTOCOL)

    # At least `reported` bytes must have been serialized.
    assert len(payload) >= reported
コード例 #20
0
ファイル: test_accessor.py プロジェクト: sperlingxx/cudf
def test_categorical_categories():
    """Compare ``cat.categories`` of a dask-cudf column with dask (index
    not checked).
    """
    letters = ["a", "b", "c", "d", "e", "e", "a", "d"]
    df = DataFrame({"a": letters, "b": range(8)})
    df["a"] = df["a"].astype("category")
    pdf = df.to_pandas(nullable_pd_dtype=False)

    ddf = dgd.from_cudf(df, 2)
    dpdf = dd.from_pandas(pdf, 2)

    gpu_categories = ddf.a.cat.categories.to_series().to_pandas(
        nullable_pd_dtype=False
    )
    cpu_categories = dpdf.a.cat.categories.to_series()
    dd.assert_eq(gpu_categories, cpu_categories, check_index=False)
コード例 #21
0
ファイル: avro.py プロジェクト: vyasr/cudf
def read_avro(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    skiprows=None,
    num_rows=None,
    **kwargs,
):
    """{docstring}"""
    # NOTE(review): the docstring above is a format template and is left
    # untouched; presumably it is filled in elsewhere -- verify.
    #
    # Flow: reject multi-file input, resolve the source via ioutils, reject
    # sources that report a compression, then read with the "cudf" engine.
    # Raises NotImplementedError for multiple files or any other engine, and
    # ValueError when the resolved source reports a compression.

    from cudf import DataFrame

    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer,
        **kwargs,
    )
    if not is_single_filepath_or_buffer:
        raise NotImplementedError(
            "`read_avro` does not yet support reading multiple files")

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs)
    if compression is not None:
        # The exception was previously constructed but never raised, so
        # unsupported input silently fell through to the reader.
        raise ValueError(
            "URL content-encoding decompression is not supported")

    if engine == "cudf":
        return DataFrame._from_table(
            libcudf.avro.read_avro(filepath_or_buffer, columns, skiprows,
                                   num_rows))
    else:
        raise NotImplementedError("read_avro currently only supports cudf")
コード例 #22
0
ファイル: test_indexing.py プロジェクト: trevorsm7/cudf
def test_dataframe_iloc(nelem):
    """Compare ``iloc`` between cudf and pandas for a set of slices and
    scalar positions, including empty and out-of-range slices.
    """
    gdf = DataFrame()

    host_a = np.random.randint(low=0, high=100, size=nelem).astype(np.int32)
    gdf["a"] = host_a
    host_b = np.random.random(nelem).astype(np.float32)
    gdf["b"] = host_b

    pdf = pd.DataFrame()
    pdf["a"] = host_a
    pdf["b"] = host_b

    windows = (
        slice(-1, 1),
        slice(nelem - 1, -1),
        slice(0, nelem - 1),
        slice(0, nelem),
        slice(1, 1),
        slice(1, 2),
        slice(nelem - 1, nelem + 1),
        slice(nelem, nelem * 2),
    )
    for window in windows:
        assert_eq(gdf.iloc[window], pdf.iloc[window])

    for position in (-1 * nelem, -1, 0, 1, nelem - 1):
        assert_eq(gdf.iloc[position], pdf.iloc[position])
コード例 #23
0
def _to_frame(this_index, index=True, name=None):
    """Wrap the values of an Index in a single-column cudf DataFrame.

    Parameters
    ----------
    index : boolean, default True
        When truthy, the returned DataFrame is indexed by ``this_index``;
        otherwise no index is passed to the DataFrame constructor.
    name : str, default None
        Label for the column. When omitted, the Index's own name is used,
        or ``0`` if the Index has no name.

    Returns
    -------
    DataFrame
        cudf DataFrame
    """

    from cudf import DataFrame

    column_label = name
    if column_label is None:
        # Fall back to the Index's name, and to 0 for an unnamed Index.
        own_name = this_index.name
        column_label = 0 if own_name is None else own_name

    frame_index = this_index if index else None
    return DataFrame({column_label: this_index._values}, index=frame_index)
コード例 #24
0
ファイル: shapefile.py プロジェクト: LeviBarnes/cuspatial
def read_polygon_shapefile(filename):
    """
    Load the polygon geometry of an ESRI shapefile into GPU memory.

    Parameters
    ----------
    filename : str, pathlike
        Path of the ESRI Shapefile (usually ends in ``.shp``)

    Returns
    -------
    result  : tuple (cudf.Series, cudf.Series, cudf.DataFrame)
    poly_offsets   : cudf.Series(dtype=np.int32)
        Offsets of the first ring in each polygon (Series name ``f_pos``)
    ring_offsets   : cudf.Series(dtype=np.int32)
        Offsets of the first point in each ring (Series name ``r_pos``)
    points  : cudf.DataFrame
        DataFrame of all points in the shapefile
            x : cudf.Series(dtype=np.float64)
                x-components of each polygon's points
            y : cudf.Series(dtype=np.float64)
                y-components of each polygon's points
    """
    parsed = cpp_read_polygon_shapefile(filename)
    poly_offsets = Series(parsed[0], name="f_pos")
    ring_offsets = Series(parsed[1], name="r_pos")
    points = DataFrame({"x": parsed[2], "y": parsed[3]})
    return (poly_offsets, ring_offsets, points)
コード例 #25
0
def test_factorize_index_obj(ncats, nelem):
    """Factorize an integer index and check codes and unique values against
    a mapping built by hand.
    """
    df = DataFrame()
    np.random.seed(0)

    # Ten random 0/1 values become the index of the frame.
    host_values = np.random.randint(2, size=10, dtype=np.int32)
    df["cats"] = host_values
    df = df.set_index("cats")

    codes, uniques = df.index.factorize()
    np.testing.assert_array_equal(
        uniques.values.get(), sorted(set(host_values))
    )
    assert isinstance(codes, cp.ndarray)
    assert isinstance(uniques, Index)

    # Map each unique value to its position, then encode the data by hand.
    position_of = {uniques[pos]: pos for pos in range(len(uniques))}
    expected_codes = [position_of[value] for value in host_values]
    np.testing.assert_array_equal(codes.get(), expected_codes)
コード例 #26
0
def _cubic_spline_coefficients(x, y, ids, prefix_sums):
    """Call ``cubicspline_coefficients`` on the ``_column`` of each argument
    and wrap the unpacked result with ``DataFrame._from_data``.
    """
    columns = (x._column, y._column, ids._column, prefix_sums._column)
    coefficients = cubicspline_coefficients(*columns)
    return DataFrame._from_data(*coefficients)
コード例 #27
0
ファイル: test_indexing.py プロジェクト: wenxiang-Li/cudf
def test_dataframe_take(ntake):
    """Gather ``ntake`` random rows and compare cudf with pandas; the
    gathered columns must contain no nulls.
    """
    np.random.seed(0)
    df = DataFrame()

    nelem = 123
    df["ii"] = np.random.randint(0, 20, nelem)
    df["ff"] = np.random.random(nelem)

    positions = np.random.randint(0, len(df), ntake)

    gathered = df.take(positions)
    reference = df.to_pandas().take(positions)

    for column in ("ii", "ff"):
        assert getattr(gathered, column).null_count == 0
    assert_eq(gathered, reference)
コード例 #28
0
def test_onehot_transform_handle_unknown(client):
    """Transform data containing a category that was not seen during fit.

    ``handle_unknown='error'`` is expected to raise ``KeyError`` when the
    transform is computed; ``handle_unknown='ignore'`` is expected to encode
    the unseen value as all zeros within its feature.
    """
    X = dask_cudf.from_cudf(
        DataFrame({'chars': ['a', 'b'], 'int': [0, 2]}), npartitions=2
    )
    Y = dask_cudf.from_cudf(
        DataFrame({'chars': ['c', 'b'], 'int': [0, 2]}), npartitions=2
    )

    strict_encoder = OneHotEncoder(handle_unknown='error', sparse=False).fit(X)
    with pytest.raises(KeyError):
        strict_encoder.transform(Y).compute()

    lenient_encoder = OneHotEncoder(
        handle_unknown='ignore', sparse=False
    ).fit(X)
    encoded = lenient_encoder.transform(Y)
    expected = cp.array([[0., 0., 1., 0.], [0., 1., 0., 1.]])
    cp.testing.assert_array_equal(encoded.compute(), expected)
コード例 #29
0
ファイル: multiindex.py プロジェクト: williamBlazing/cudf
    def _get_column_major(self, df, row_tuple):
        """Select the columns of ``df`` matching ``row_tuple`` and rebuild
        the column index of the result.

        Parameters
        ----------
        df
            Frame whose ``columns`` are indexed by ``row_tuple``.
        row_tuple
            Key to look up; a bare number or slice is wrapped in a list.

        Returns
        -------
        The selected columns as a frame, or a single column object when the
        key has one entry per level and exactly one column matched.
        """
        from cudf import Series
        from cudf import DataFrame

        valid_indices = self._get_valid_indices_by_tuple(
            df.columns, row_tuple, len(df._cols))
        result = df._take_columns(valid_indices)
        # Normalize a scalar/slice key so len() and indexing work below.
        if isinstance(row_tuple, (numbers.Number, slice)):
            row_tuple = [row_tuple]
        if len(result) == 0 and len(result.columns) == 0:
            # Empty result: keep the column index of `df` but give it empty
            # codes, one empty Series per level name.
            result_columns = df.columns.copy(deep=False)
            clear_codes = DataFrame()
            for name in df.columns.names:
                clear_codes[name] = Series([])
            result_columns._codes = clear_codes
            result_columns._source_data = clear_codes
            result.columns = result_columns
        elif len(row_tuple) < len(
                self.levels) and (not slice(None) in row_tuple
                                  and not isinstance(row_tuple[0], slice)):
            # Partial key without a full slice and not starting with a
            # slice: drop the leading len(row_tuple) levels.
            columns = self._popn(len(row_tuple))
            result.columns = columns.take(valid_indices)
        else:
            # Otherwise keep every level for the selected columns.
            result.columns = self.take(valid_indices)
        if len(result.columns.levels) == 1:
            # One level left: replace the column index by the level values
            # looked up from the first code column, via `as_index`.
            columns = []
            for code in result.columns.codes[result.columns.codes.columns[0]]:
                columns.append(result.columns.levels[0][code])
            name = result.columns.names[0]
            result.columns = as_index(columns, name=name)
        if len(row_tuple) == len(self.levels) and len(result.columns) == 1:
            # Full-length key that matched one column: return the first
            # entry of `_cols` instead of a frame.
            result = list(result._cols.values())[0]
        return result
コード例 #30
0
ファイル: test_sorting.py プロジェクト: TravisHester/cudf
def test_dataframe_multi_column_nulls(
    num_cols, num_rows, dtype, nulls, ascending, na_position
):
    """Sort by the first ``num_cols`` columns of a three-column frame that
    may contain NaNs, and compare cudf with pandas on the sort keys.
    """
    np.random.seed(0)
    by = list(string.ascii_lowercase[:num_cols])
    pdf = pd.DataFrame()

    # Three columns named 'a', 'b', 'c' are always generated.
    for colname in string.ascii_lowercase[:3]:
        column = np.random.randint(0, 26, num_rows).astype(dtype)
        if nulls == "some":
            # Blank out a quarter of the rows (none when the column is empty).
            if num_rows > 0:
                null_positions = np.random.choice(
                    num_rows, size=int(num_rows / 4), replace=False
                )
            else:
                null_positions = np.array([], dtype="int64")
            column[null_positions] = np.nan
        elif nulls == "all":
            column[:] = np.nan
        pdf[colname] = column

    gdf = DataFrame.from_pandas(pdf)

    sort_options = {"ascending": ascending, "na_position": na_position}
    actual = gdf.sort_values(by, **sort_options)
    reference = pdf.sort_values(by, **sort_options)

    assert_eq(
        actual[by].reset_index(drop=True),
        reference[by].reset_index(drop=True),
    )