コード例 #1
0
    def to_pandas(self, index: pd.Index = None, **kwargs) -> pd.Series:
        """Convert this categorical column into a pandas Series.

        Parameters
        ----------
        index : pd.Index, optional
            Index handed straight to the resulting ``pd.Series``.
        **kwargs
            Accepted but not used by this method.

        Returns
        -------
        pd.Series
            Series wrapping a ``pd.Categorical`` built from this column's
            codes and categories.
        """
        source = self
        if self.categories.dtype.kind == "f":
            # Float categories: rebuild the column with a mask derived from
            # ``self.notnull()``.
            # NOTE(review): presumably this makes NaN entries count as
            # nulls -- confirm against ``notnull``.
            null_mask = bools_to_mask(self.notnull())
            source = column.build_categorical_column(
                categories=self.categories,
                codes=column.as_column(self.codes, dtype=self.codes.dtype),
                mask=null_mask,
                ordered=self.dtype.ordered,
                size=self.codes.size,
            )

        # Cast the codes to the signed dtype picked from the category count,
        # then fill null codes with -1, the value pandas reads as "missing".
        code_dtype = min_signed_type(len(source.categories))
        signed_codes = source.cat().codes.astype(code_dtype)
        host_codes = signed_codes.fillna(-1).to_array()

        host_categories = source.categories.dropna(drop_nan=True).to_pandas()

        categorical = pd.Categorical.from_codes(
            host_codes,
            categories=host_categories,
            ordered=source.ordered,
        )
        return pd.Series(categorical, index=index)
コード例 #2
0
ファイル: categorical.py プロジェクト: wenxiang-Li/cudf
def pandas_categorical_as_column(categorical, codes=None):
    """Build a categorical column from a ``pandas.Categorical``.

    Parameters
    ----------
    categorical : pandas.Categorical
        Supplies the categories, the ``ordered`` flag and, by default,
        the codes.
    codes : optional
        When given, used in place of ``categorical.codes``.

    Returns
    -------
    The column produced by ``column.build_categorical_column``.
    """
    if codes is None:
        codes = categorical.codes
    codes = column.as_column(codes)

    # A code equal to -1 is treated as a missing entry; a mask is only
    # built when at least one such code is present.
    is_valid = codes != codes.dtype.type(-1)
    mask = None if is_valid.all() else bools_to_mask(is_valid)

    categories = categorical.categories
    code_column = column.as_column(codes.base_data, dtype=codes.dtype)
    return column.build_categorical_column(
        categories=categories,
        codes=code_column,
        size=codes.size,
        mask=mask,
        ordered=categorical.ordered,
    )
コード例 #3
0
ファイル: datasets.py プロジェクト: rongou/cudf
def timeseries(
    start="2000-01-01",
    end="2000-01-31",
    freq="1s",
    dtypes=None,
    nulls_frequency=0,
    seed=None,
):
    """Create a timeseries dataframe with random data.

    Parameters
    ----------
    start : datetime (or datetime-like string)
        Start of time series
    end : datetime (or datetime-like string)
        End of time series
    freq : string
        String like '2s' or '1H' or '12W' for the time series frequency
    dtypes : dict
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}.
        If none is provided, this defaults to
        ``{"name": "category", "id": int, "x": float, "y": float}``
    nulls_frequency : float
        Fill the series with the specified proportion of nulls. Default is 0.
    seed : int (optional)
        Seed for the ``numpy.random.RandomState`` that drives both the
        generated data and the null masks

    Returns
    -------
    The cudf dataframe obtained from ``cudf.from_pandas``, indexed by
    ``timestamp``, with a randomly drawn null mask applied to every column.

    Examples
    --------
    >>> import cudf as gd
    >>> gdf = gd.datasets.timeseries()
    >>> gdf.head()  # doctest: +SKIP
              timestamp    id     name         x         y
    2000-01-01 00:00:00   967    Jerry -0.031348 -0.040633
    2000-01-01 00:00:01  1066  Michael -0.262136  0.307107
    2000-01-01 00:00:02   988    Wendy -0.526331  0.128641
    2000-01-01 00:00:03  1016   Yvonne  0.620456  0.767270
    2000-01-01 00:00:04   998   Ursula  0.684902 -0.463278
    """
    if dtypes is None:
        dtypes = {"name": "category", "id": int, "x": float, "y": float}

    index = pd.DatetimeIndex(
        pd.date_range(start, end, freq=freq, name="timestamp"))
    state = np.random.RandomState(seed)
    columns = {k: make[dt](len(index), state) for k, dt in dtypes.items()}
    df = pd.DataFrame(columns, index=index, columns=sorted(columns))
    # Drop the final row when it lands exactly on ``end``.
    if df.index[-1] == end:
        df = df.iloc[:-1]

    gdf = cudf.from_pandas(df)
    # The trailing row may have been dropped above, so the masks must be sized
    # from the frame being masked, not from the original index; otherwise each
    # mask would be one element longer than its column.
    nrows = len(df)
    for col in gdf:
        # True marks a valid row; False is drawn with probability
        # ``nulls_frequency``.
        mask = state.choice(
            [True, False],
            size=nrows,
            p=[1 - nulls_frequency, nulls_frequency],
        )
        mask_buf = bools_to_mask(cudf.core.column.as_column(mask))
        masked_col = gdf[col]._column.set_mask(mask_buf)
        gdf[col] = cudf.Series._from_data(ColumnAccessor({None: masked_col}),
                                          index=gdf.index)

    return gdf