Example No. 1
def test_applymap_round(nelem, masked):
    # Generate data
    np.random.seed(0)
    data = np.random.random(nelem) * 100

    if masked:
        # Make mask
        bitmask = utils.random_bitmask(nelem)
        boolmask = np.asarray(utils.expand_bits_to_bytes(bitmask),
                              dtype=np.bool_)[:nelem]
        data[~boolmask] = np.nan

    sr = Series(data)

    if masked:
        # Mask the Series
        sr = sr.set_mask(bitmask)

    # Call applymap
    out = sr.applymap(lambda x: (floor(x) + 1
                                 if x - floor(x) >= 0.5 else floor(x)))

    if masked:
        # Fill masked values
        out = out.fillna(np.nan)

    # Check
    expect = np.round(data)
    got = out.to_array()
    np.testing.assert_array_almost_equal(expect, got)
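A note on the rounding rule exercised above: the lambda implements round-half-up, while np.round rounds halves to the nearest even integer ("banker's rounding"). The test still agrees because continuous random data essentially never lands exactly on a .5 boundary. A minimal standalone sketch of the difference:

from math import floor

import numpy as np

def round_half_up(x):
    # same rule as the applymap lambda above
    return floor(x) + 1 if x - floor(x) >= 0.5 else floor(x)

assert round_half_up(2.5) == 3
assert np.round(2.5) == 2  # NumPy rounds ties to the nearest even integer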
Example No. 2
    def _compute_drop_idx(self):
        """Helper to compute, for each feature, the index of the category to drop."""
        if self.drop is None:
            return None
        elif isinstance(self.drop, str) and self.drop == 'first':
            return {feature: 0 for feature in self._encoders.keys()}
        elif isinstance(self.drop, (dict, list)):
            if isinstance(self.drop, list):
                self.drop = dict(zip(range(len(self.drop)), self.drop))
            if len(self.drop.keys()) != len(self._encoders):
                msg = ("`drop` should have as many columns as the number "
                       "of features ({}), got {}")
                raise ValueError(msg.format(len(self._encoders),
                                            len(self.drop.keys())))
            drop_idx = dict()
            for feature in self.drop.keys():
                self.drop[feature] = Series(self.drop[feature])
                if len(self.drop[feature]) != 1:
                    msg = ("Trying to drop multiple values for feature {}, "
                           "this is not supported.").format(feature)
                    raise ValueError(msg)
                cats = self._encoders[feature].classes_
                if not self.drop[feature].isin(cats).all():
                    msg = ("Some categories for feature {} were supposed "
                           "to be dropped, but were not found in the encoder "
                           "categories.".format(feature))
                    raise ValueError(msg)
                cats = Series(cats)
                idx = cats.isin(self.drop[feature])
                drop_idx[feature] = cp.asarray(cats[idx].index)
            return drop_idx
        else:
            msg = ("Wrong input for parameter `drop`. Expected "
                   "'first', None, a dict, or a list; got {}")
            raise ValueError(msg.format(type(self.drop)))
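A minimal sketch of what the list-to-dict normalization and the isin-based index lookup above do, using pandas stand-ins for the cudf/cupy pieces (the category values are made up):

import pandas as pd

drop = ["b"]                              # positional form: drop "b" for feature 0
drop = dict(zip(range(len(drop)), drop))  # normalized to {0: "b"}

classes = pd.Series(["a", "b", "c"])      # stand-in for an encoder's classes_
mask = classes.isin([drop[0]])            # marks the category to drop
drop_idx = classes[mask].index            # Index([1]) -> drop column 1 for feature 0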
Example No. 3
def read_polygon_shapefile(filename):
    """
    Reads polygon geometry from an ESRI shapefile into GPU memory.

    Parameters
    ----------
    filename : str, pathlike
        ESRI Shapefile file path (usually ends in ``.shp``)

    Returns
    -------
    result  : tuple (cudf.Series, cudf.Series, cudf.DataFrame)
    poly_offsets   : cudf.Series(dtype=np.int32)
        Offsets of the first ring in each polygon
    ring_offsets   : cudf.Series(dtype=np.int32)
        Offsets of the first point in each ring
    points  : cudf.DataFrame
        DataFrame of all points in the shapefile
            x : cudf.Series(dtype=np.float64)
                x-components of each polygon's points
            y : cudf.Series(dtype=np.float64)
                y-components of each polygon's points
    """
    result = cpp_read_polygon_shapefile(filename)
    f_pos = Series(result[0], name="f_pos")
    r_pos = Series(result[1], name="r_pos")
    return (f_pos, r_pos, DataFrame({"x": result[2], "y": result[3]}))
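A usage sketch (the file path is hypothetical; the return layout follows the docstring above):

# "polygons.shp" is a placeholder path to an ESRI shapefile
poly_offsets, ring_offsets, points = read_polygon_shapefile("polygons.shp")
print(poly_offsets.head())  # offset of the first ring of each polygon
print(ring_offsets.head())  # offset of the first point of each ring
print(points.head())        # x/y coordinates of every point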
Example No. 4
def test_categorical_compare_ordered(data):
    cat1 = data[0]
    cat2 = data[1]
    pdsr1 = pd.Series(cat1)
    pdsr2 = pd.Series(cat2)
    sr1 = Series(cat1)
    sr2 = Series(cat2)
    dsr1 = dgd.from_cudf(sr1, npartitions=2)
    dsr2 = dgd.from_cudf(sr2, npartitions=2)

    # Test equality
    out = dsr1 == dsr1
    assert out.dtype == np.bool_
    assert np.all(out.compute().to_array())
    assert np.all(pdsr1 == pdsr1)

    # Test inequality
    out = dsr1 != dsr1
    assert not np.any(out.compute().to_array())
    assert not np.any(pdsr1 != pdsr1)

    assert dsr1.cat.ordered
    assert pdsr1.cat.ordered

    # Test ordered operators
    np.testing.assert_array_equal(
        pdsr1 < pdsr2, (dsr1 < dsr2).compute().to_array()
    )
    np.testing.assert_array_equal(
        pdsr1 > pdsr2, (dsr1 > dsr2).compute().to_array()
    )
Example No. 5
def test_generic_ptx(dtype):

    size = 500

    lhs_arr = np.random.random(size).astype(dtype)
    lhs_col = Series(lhs_arr)._column

    rhs_arr = np.random.random(size).astype(dtype)
    rhs_col = Series(rhs_arr)._column

    def generic_function(a, b):
        return a ** 3 + b

    nb_type = numpy_support.from_dtype(cudf.dtype(dtype))
    type_signature = (nb_type, nb_type)

    ptx_code, output_type = compile_ptx(
        generic_function, type_signature, device=True
    )

    dtype = numpy_support.as_dtype(output_type).type

    out_col = libcudf.binaryop.binaryop_udf(lhs_col, rhs_col, ptx_code, dtype)

    result = lhs_arr ** 3 + rhs_arr

    np.testing.assert_almost_equal(result, out_col.to_array())
Example No. 6
def test_series_sort_values_ignore_index(ignore_index):
    gsr = Series([1, 3, 5, 2, 4])
    psr = gsr.to_pandas()

    expect = psr.sort_values(ignore_index=ignore_index)
    got = gsr.sort_values(ignore_index=ignore_index)
    assert_eq(expect, got)
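For context, ignore_index=True resets the result's index to a default RangeIndex instead of keeping the original labels; a pandas-only sketch of the behavior being compared:

import pandas as pd

s = pd.Series([1, 3, 5, 2, 4])
print(s.sort_values())                   # keeps the original labels: 0, 3, 1, 4, 2
print(s.sort_values(ignore_index=True))  # relabeled 0..4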
Example No. 7
def test_get(data, index, expectation):
    with expectation:
        expect = Series(data).list.get(index)

    if expectation == does_not_raise():
        ds = dgd.from_cudf(Series(data), 5)
        assert_eq(expect, ds.list.get(index).compute())
Example No. 8
def test_take(data, list_indices, expectation):
    with expectation:
        expect = Series(data).list.take(list_indices)

    if expectation == does_not_raise():
        ds = dgd.from_cudf(Series(data), 5)
        assert_eq(expect, ds.list.take(list_indices).compute())
Example No. 9
def test_series_nlargest_nelem(nelem):
    np.random.seed(0)
    elems = np.random.random(nelem)
    gds = Series(elems).nlargest(nelem)
    pds = pd.Series(elems).nlargest(nelem)

    assert (pds == gds.to_pandas()).all()
Example No. 10
def test_vectorizer_empty_token_case():
    """
    We ignore empty tokens right now but sklearn treats them as a character
    we might want to look into this more but
    this should not be a concern for most piplines
    """
    corpus = [
        "a b ",
    ]

    # We have an extra null token here; we diverge slightly from sklearn
    # by not treating it as a token.
    res = CountVectorizer(preprocessor=lambda s: s).fit_transform(
        Series(corpus)
    )
    ref = SkCountVect(
        preprocessor=lambda s: s, tokenizer=lambda s: s.split(" ")
    ).fit_transform(corpus)
    cp.testing.assert_array_equal(res.todense(), ref.toarray())

    res = HashingVectorizer(preprocessor=lambda s: s).fit_transform(
        Series(corpus)
    )
    ref = SkHashVect(
        preprocessor=lambda s: s, tokenizer=lambda s: s.split(" ")
    ).fit_transform(corpus)
    assert_almost_equal_hash_matrices(res.todense().get(), ref.toarray())
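For reference, the trailing space in the corpus is what produces the empty token:

tokens = "a b ".split(" ")
print(tokens)  # ['a', 'b', ''] -- the trailing empty string is the "null token"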
Example No. 11
def test_series_floor():
    arr = np.random.random(100) * 100
    sr = Series(arr)
    with pytest.warns(
        FutureWarning, match="Series.floor and DataFrame.floor are deprecated"
    ):
        sr = sr.floor()
    np.testing.assert_equal(sr.to_numpy(), np.floor(arr))
Example No. 12
def test_typecast_from_datetime_to_datetime(data, from_dtype, to_dtype):
    np_data = data.astype(from_dtype)
    gdf_col = Series(np_data)._column

    np_casted = np_data.astype(to_dtype)
    gdf_casted = gdf_col.astype(to_dtype)

    np.testing.assert_equal(np_casted, gdf_casted.to_array())
Example No. 13
def test_max(dtype, nelem):
    dtype = cudf.dtype(dtype).type
    data = gen_rand(dtype, nelem)
    sr = Series(data)

    got = sr.max()
    expect = dtype(data.max())

    assert expect == got
Example No. 14
def test_series_sort_index(nelem, asc):
    np.random.seed(0)
    sr = Series((100 * np.random.random(nelem)))
    orig = sr.to_array()
    got = sr.sort_values().sort_index(ascending=asc).to_array()
    if not asc:
        # Reverse the array for descending sort
        got = got[::-1]
    np.testing.assert_array_equal(orig, got)
Example No. 15
def test_series_sort_index(nelem, asc):
    np.random.seed(0)
    sr = Series(100 * np.random.random(nelem))
    psr = sr.to_pandas()

    expected = psr.sort_index(ascending=asc)
    got = sr.sort_index(ascending=asc)

    assert_eq(expected, got)
Example No. 16
def test_sum(dtype, nelem):
    dtype = cudf.dtype(dtype).type
    data = gen_rand(dtype, nelem)
    sr = Series(data)

    got = sr.sum()
    expect = data.sum()
    significant = 4 if dtype == np.float32 else 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
Example No. 17
def test_typecast_from_datetime_to_int64_to_datetime(data, dtype):
    pd_data = pd.Series(data.copy())
    np_data = np.array(pd_data)
    gdf_data = Series(pd_data)

    np_casted = np_data.astype(np.int64).astype(dtype)
    gdf_casted = gdf_data.astype(np.int64).astype(dtype)

    np.testing.assert_equal(np_casted, gdf_casted.to_array())
Example No. 18
def test_pickle_categorical_column(slices):
    sr = Series(["a", "b", None, "a", "c", "b"]).astype("category")
    sliced_sr = sr.iloc[slices]
    input_col = sliced_sr._column

    pickled = pickle.dumps(input_col)
    out = pickle.loads(pickled)

    assert_eq(Series(out), Series(input_col))
Example No. 19
def test_pickle_string_column(slices):
    sr = Series(["a", "b", None, "a", "c", "b"])
    sliced_sr = sr.iloc[slices]
    input_col = sliced_sr._column

    pickled = pickle.dumps(input_col)
    out = pickle.loads(pickled)

    assert_eq(Series(out), Series(input_col))
Example No. 20
def test_fillna():
    _, schema, darr = read_data()
    gar = GpuArrowReader(schema, darr)
    masked_col = gar[8]
    sr = Series(data=masked_col.data)
    dense = sr.nans_to_nulls().fillna(123)
    np.testing.assert_equal(123, dense.to_array())
    assert len(dense) == len(sr)
    assert dense.null_count == 0
Example No. 21
def test_series_argsort(nelem, dtype, asc):
    np.random.seed(0)
    sr = Series((100 * np.random.random(nelem)).astype(dtype))
    res = sr.argsort(ascending=asc)

    if asc:
        expected = np.argsort(sr.to_array(), kind="mergesort")
    else:
        expected = np.argsort(sr.to_array() * -1, kind="mergesort")
    np.testing.assert_array_equal(expected, res.to_array())
Example No. 22
def test_pickle_series(named):
    np.random.seed(0)
    if named:
        ser = Series(np.random.random(10), name="a")
    else:
        ser = Series(np.random.random(10))

    pickled = pickle.dumps(ser)
    out = pickle.loads(pickled)
    assert (ser == out).all()
Example No. 23
def test_sorting(data, ascending, na_position, ignore_index):
    expect = Series(data).list.sort_values(ascending=ascending,
                                           na_position=na_position,
                                           ignore_index=ignore_index)
    got = (
        dgd.from_cudf(Series(data), 5)
        .list.sort_values(
            ascending=ascending,
            na_position=na_position,
            ignore_index=ignore_index,
        )
        .compute()
        .reset_index(drop=True)
    )
    assert_eq(expect, got)
Example No. 24
def test_applymap_python_lambda(dtype, udf, testfunc):

    size = 500

    lhs_arr = np.random.random(size).astype(dtype)
    lhs_ser = Series(lhs_arr)

    out_ser = lhs_ser.applymap(udf)
    result = testfunc(lhs_arr)
    np.testing.assert_almost_equal(result, out_ser.to_array())
Example No. 25
    def _categories_equal(self, new_categories, **kwargs):
        cur_categories = self._column.categories
        if len(new_categories) != len(cur_categories):
            return False
        # if order doesn't matter, sort before the equals call below
        if not kwargs.get("ordered", self.ordered):
            from cudf.core.series import Series

            cur_categories = Series(cur_categories).sort_values()
            new_categories = Series(new_categories).sort_values()
        return cur_categories.equals(new_categories)
Example No. 26
    def can_cast_safely(self, to_dtype):
        """
        Returns true if all the values in self can be
        safely cast to dtype
        """
        if self.dtype.kind == to_dtype.kind:
            if self.dtype <= to_dtype:
                return True
            else:
                # Kinds are the same but to_dtype is smaller
                if "float" in to_dtype.name:
                    info = np.finfo(to_dtype)
                elif "int" in to_dtype.name:
                    info = np.iinfo(to_dtype)
                min_, max_ = info.min, info.max

                if (self.min() > min_) and (self.max() < max_):
                    return True
                else:
                    return False

        # want to cast int to float
        elif to_dtype.kind == "f" and self.dtype.kind == "i":
            info = np.finfo(to_dtype)
            biggest_exact_int = 2 ** (info.nmant + 1)
            if (self.min() >= -biggest_exact_int) and (
                self.max() <= biggest_exact_int
            ):
                return True
            else:
                from cudf import Series

                if (
                    Series(self).astype(to_dtype).astype(self.dtype)
                    == Series(self)
                ).all():
                    return True
                else:
                    return False

        # want to cast float to int:
        elif to_dtype.kind == "i" and self.dtype.kind == "f":
            info = np.iinfo(to_dtype)
            min_, max_ = info.min, info.max
            # catch out-of-range values here, before the modulo comparison below
            if (self.min() >= min_) and (self.max() <= max_):
                from cudf import Series

                if (Series(self) % 1 == 0).all():
                    return True
                else:
                    return False
            else:
                return False
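The biggest_exact_int bound in the int-to-float branch follows from the significand width: a float with nmant mantissa bits represents every integer up to 2 ** (nmant + 1) exactly, and nothing consecutive beyond it. A quick NumPy check of that boundary for float32:

import numpy as np

info = np.finfo(np.float32)
biggest_exact_int = 2 ** (info.nmant + 1)  # 2**24 == 16_777_216 for float32

# the boundary itself is exact ...
assert int(np.float32(biggest_exact_int)) == biggest_exact_int
# ... but the next integer is not: it rounds back down to 2**24
assert np.float32(biggest_exact_int + 1) == np.float32(biggest_exact_int)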
Example No. 27
def test_multiindex_take(pdf, gdf, pdfIndex):
    gdfIndex = cudf.from_pandas(pdfIndex)
    pdf.index = pdfIndex
    gdf.index = gdfIndex
    assert_eq(pdf.index.take([0]), gdf.index.take([0]))
    assert_eq(pdf.index.take(np.array([0])), gdf.index.take(np.array([0])))
    from cudf import Series
    assert_eq(pdf.index.take(Series([0])), gdf.index.take(Series([0])))
    assert_eq(pdf.index.take([0, 1]), gdf.index.take([0, 1]))
    assert_eq(pdf.index.take(np.array([0, 1])),
              gdf.index.take(np.array([0, 1])))
    assert_eq(pdf.index.take(Series([0, 1])), gdf.index.take(Series([0, 1])))
Example No. 28
    def codes(self):
        from cudf import Series

        data = self._parent.data
        if self._parent.has_null_mask:
            mask = self._parent.mask
            null_count = self._parent.null_count
            return Series.from_masked_array(
                data=data.mem, mask=mask.mem, null_count=null_count
            )
        else:
            return Series(data, name=self._parent.name)
Example No. 29
    def remove_categories(self, removals, **kwargs):
        from cudf import Series

        cats = self.categories.to_series()
        removals = Series(removals, dtype=cats.dtype)
        removals_mask = removals.isin(cats)
        # ensure all the removals are in the current categories
        # list. If not, raise an error to match Pandas behavior
        if not removals_mask.all():
            vals = removals[~removals_mask].to_array()
            msg = "removals must all be in old categories: {}".format(vals)
            raise ValueError(msg)
        return self.set_categories(cats[~cats.isin(removals)], **kwargs)
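A usage sketch of the matching public accessor (assuming cudf's pandas-like .cat API; the values are made up):

import cudf

sr = cudf.Series(["a", "b", "c", "a"], dtype="category")
sr = sr.cat.remove_categories(["c"])  # rows that held "c" become null
print(sr.cat.categories)              # remaining categories: ['a', 'b']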
Example No. 30
def test_applymap_change_out_dtype():
    # Test for changing the out_dtype using applymap

    data = list(range(10))

    sr = Series(data)

    out = sr.applymap(lambda x: float(x), out_dtype=float)

    # Check
    expect = np.array(data, dtype=float)
    got = out.to_array()
    np.testing.assert_array_equal(expect, got)