Example #1
def test_reflected_ops_cudf_scalar(funcs, dtype, obj_class):
    cpu_func, gpu_func = funcs

    # create random series
    np.random.seed(12)
    random_series = utils.gen_rand(dtype, 100, low=10)

    # gpu series
    gs = Series(random_series)

    # class typing
    if obj_class == "Index":
        gs = as_index(gs)

    gs_result = gpu_func(gs)

    # class typing
    if obj_class == "Index":
        gs = Series(gs)

    # pandas
    ps_result = cpu_func(random_series)

    # verify
    np.testing.assert_allclose(ps_result, gs_result.to_array())
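The test above receives its arguments via pytest parametrization. Below is a hedged sketch of what that wiring might look like; the `_reflected_ops` list and the exact decorators are assumptions for illustration, not the real fixtures from the cudf test suite.

# A minimal parametrization sketch (assumed fixtures, not the exact cudf
# test suite): each "funcs" entry pairs a CPU callable with its GPU twin.
import pytest

_reflected_ops = [
    (lambda x: 1 + x, lambda x: 1 + x),
    (lambda x: 2 * x, lambda x: 2 * x),
    (lambda x: 2 - x, lambda x: 2 - x),
]


@pytest.mark.parametrize("funcs", _reflected_ops)
@pytest.mark.parametrize("dtype", ["int64", "float64"])
@pytest.mark.parametrize("obj_class", ["Series", "Index"])
def test_reflected_ops_cudf_scalar(funcs, dtype, obj_class):
    ...  # body as in Example #1 above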
Example #2
    def _get_column_major(self, df, row_tuple):
        from cudf import Series
        from cudf import DataFrame

        valid_indices = self._get_valid_indices_by_tuple(
            df.columns, row_tuple, len(df._cols))
        result = df._take_columns(valid_indices)
        if isinstance(row_tuple, (numbers.Number, slice)):
            row_tuple = [row_tuple]
        if len(result) == 0 and len(result.columns) == 0:
            result_columns = df.columns.copy(deep=False)
            clear_codes = DataFrame()
            for name in df.columns.names:
                clear_codes[name] = Series([])
            result_columns._codes = clear_codes
            result_columns._source_data = clear_codes
            result.columns = result_columns
        elif len(row_tuple) < len(self.levels) and (
            slice(None) not in row_tuple
            and not isinstance(row_tuple[0], slice)
        ):
            columns = self._popn(len(row_tuple))
            result.columns = columns.take(valid_indices)
        else:
            result.columns = self.take(valid_indices)
        if len(result.columns.levels) == 1:
            columns = []
            for code in result.columns.codes[result.columns.codes.columns[0]]:
                columns.append(result.columns.levels[0][code])
            name = result.columns.names[0]
            result.columns = as_index(columns, name=name)
        if len(row_tuple) == len(self.levels) and len(result.columns) == 1:
            result = list(result._cols.values())[0]
        return result
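`_get_column_major` is the internal hook behind label-based column selection when a DataFrame carries MultiIndex columns; the `loc` implementations in later examples call it as `self._df.columns._get_column_major(self._df, arg[1])`. A hedged sketch of the user-facing behavior it implements (exact results vary by cudf version):

# Hedged usage sketch; column selection on MultiIndex columns routes through
# _get_column_major, which pops the consumed level(s) from the result.
import cudf

df = cudf.DataFrame({"ax": [1, 2], "ay": [3, 4], "bx": [5, 6]})
df.columns = cudf.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "x")])

sub = df.loc[:, "a"]  # keeps ("a", "x") and ("a", "y"); columns become ["x", "y"]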
Example #3
    def to_pandas(self, **kwargs):
        if hasattr(self, "_source_data"):
            result = self._source_data.to_pandas()
            result.columns = self.names
            return pd.MultiIndex.from_frame(result)

        pandas_codes = []
        for code in self.codes.columns:
            pandas_codes.append(self.codes[code].to_array())

        # We do two things here to mimic Pandas behavior:
        # 1. as_index() on each level, so DatetimeColumn becomes DatetimeIndex
        # 2. convert levels to numpy array so empty levels become Float64Index
        levels = np.array(
            [as_index(level).to_pandas() for level in self.levels]
        )

        # Backwards compatibility:
        # Construct a dummy MultiIndex and check for the codes attr.
        # This indicates that it is pandas >= 0.24
        # If no codes attr is present it is pandas <= 0.23
        if hasattr(pd.MultiIndex([[]], [[]]), "codes"):
            pandas_mi = pd.MultiIndex(levels=levels, codes=pandas_codes)
        else:
            pandas_mi = pd.MultiIndex(levels=levels, labels=pandas_codes)
        if self.names is not None:
            pandas_mi.names = self.names
        return pandas_mi
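A hedged round-trip sketch of what `to_pandas` enables; `from_pandas` is assumed as the inverse entry point:

# Hedged round-trip sketch: cudf MultiIndex -> pandas MultiIndex and back.
import cudf
import pandas as pd

pmi = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["k1", "k2"])
gmi = cudf.MultiIndex.from_pandas(pmi)

back = gmi.to_pandas()
assert back.equals(pmi)
assert list(back.names) == ["k1", "k2"]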
Example #4
    def categories(self):
        """
        The categories of this categorical.
        """
        from cudf.core.index import as_index

        return as_index(self._column.categories)
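A hedged usage sketch of the accessor property above: on a categorical Series, `.cat.categories` surfaces the categories as a cudf Index.

# Hedged usage sketch of the .cat.categories property shown above.
import cudf
import pandas as pd

sr = cudf.Series(pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"]))
cats = sr.cat.categories  # a cudf Index holding ["a", "b", "c"]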
Example #5
    def get_level_values(self, level):
        """
        Return the values at the requested level

        Parameters
        ----------
        level : int or label

        Returns
        -------
        An Index containing the values at the requested level.
        """
        colnames = list(self._source_data.columns)
        if level not in colnames:
            if isinstance(level, int):
                if level < 0:
                    level = level + len(colnames)
                if level < 0 or level >= len(colnames):
                    raise IndexError(f"Invalid level number: '{level}'")
                level_idx = level
                level = colnames[level_idx]
            elif level in self.names:
                level_idx = list(self.names).index(level)
                level = colnames[level_idx]
            else:
                raise KeyError(f"Level not found: '{level}'")
        else:
            level_idx = colnames.index(level)
        level_values = as_index(
            self._source_data._data[level], name=self.names[level_idx]
        )
        return level_values
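A hedged usage sketch: per the lookup logic above, the level can be given as a positional number (negative values count from the end) or as a level name.

# Hedged usage sketch of get_level_values with both lookup styles.
import cudf

gmi = cudf.MultiIndex.from_tuples(
    [("a", 1), ("a", 2), ("b", 1)], names=["letter", "number"]
)

by_name = gmi.get_level_values("letter")  # Index(["a", "a", "b"], name="letter")
by_pos = gmi.get_level_values(0)          # same values, looked up by position
by_neg = gmi.get_level_values(-1)         # the "number" level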
Example #6
def test_categorical_basic():
    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
    cudf_cat = as_index(cat)

    pdsr = pd.Series(cat, index=["p", "q", "r", "s", "t"])
    sr = Series(cat, index=["p", "q", "r", "s", "t"])
    assert_eq(pdsr.cat.codes, sr.cat.codes)

    # Test attributes
    assert_eq(pdsr.cat.categories, sr.cat.categories)
    assert pdsr.cat.ordered == sr.cat.ordered

    np.testing.assert_array_equal(pdsr.cat.codes.values,
                                  sr.cat.codes.to_array())

    string = str(sr)
    expect_str = """
p a
q a
r b
s c
t a
"""
    assert all(x == y for x, y in zip(string.split(), expect_str.split()))
    assert_eq(cat.codes, cudf_cat.codes.to_array())
Example #7
def test_categorical_basic():
    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
    cudf_cat = as_index(cat)

    pdsr = pd.Series(cat)
    sr = Series(cat)
    np.testing.assert_array_equal(cat.codes, sr.to_array())

    # Test attributes
    assert tuple(pdsr.cat.categories) == tuple(sr.cat.categories)
    assert pdsr.cat.ordered == sr.cat.ordered

    np.testing.assert_array_equal(
        pdsr.cat.codes.values, sr.cat.codes.to_array()
    )
    np.testing.assert_array_equal(pdsr.cat.codes.dtype, sr.cat.codes.dtype)

    string = str(sr)
    expect_str = """
0 a
1 a
2 b
3 c
4 a
"""
    assert all(x == y for x, y in zip(string.split(), expect_str.split()))
    assert_eq(cat.codes, cudf_cat.codes.to_array())
Example #8
    def _getitem_tuple_arg(self, arg):
        from cudf.core.dataframe import DataFrame
        from cudf.core.column import column
        from cudf.core.index import as_index
        from cudf.utils.cudautils import arange
        from cudf import MultiIndex

        # Step 1: Gather columns
        if isinstance(self._df.columns, MultiIndex):
            columns_df = self._df.columns._get_column_major(self._df, arg[1])
        else:
            columns = self._get_column_selection(arg[1])
            columns_df = DataFrame()
            for col in columns:
                columns_df.add_column(name=col, data=self._df[col])
        # Step 2: Gather rows
        if isinstance(columns_df.index, MultiIndex):
            return columns_df.index._get_row_major(columns_df, arg[0])
        else:
            if isinstance(self._df.columns, MultiIndex):
                if isinstance(arg[0], slice):
                    start, stop, step = arg[0].indices(len(columns_df))
                    indices = arange(start, stop, step)
                    df = columns_df.take(indices)
                else:
                    df = columns_df.take(arg[0])
            else:
                df = DataFrame()
                for col in columns_df.columns:
                    df[col] = columns_df[col].loc[arg[0]]
        # Step 3: Gather index
        if df.shape[0] == 1:  # we have a single row
            if isinstance(arg[0], slice):
                start = arg[0].start
                if start is None:
                    start = self._df.index[0]
                df.index = as_index(start)
            else:
                row_selection = column.as_column(arg[0])
                if pd.api.types.is_bool_dtype(row_selection.dtype):
                    df.index = self._df.index.take(row_selection)
                else:
                    df.index = as_index(row_selection)
        # Step 4: Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
Example #9
    def _index_and_downcast(self, result, index, index_key):

        if isinstance(index_key, (numbers.Number, slice)):
            index_key = [index_key]
        if (
            len(index_key) > 0 and not isinstance(index_key, tuple)
        ) or isinstance(index_key[0], slice):
            index_key = index_key[0]

        slice_access = False
        if isinstance(index_key, slice):
            slice_access = True
        out_index = cudf.DataFrame()
        # Select the last n-k columns where n is the number of _source_data
        # columns and k is the length of the indexing tuple
        size = 0
        if not isinstance(index_key, (numbers.Number, slice)):
            size = len(index_key)
        for k in range(size, len(index._source_data.columns)):
            if index.names is None:
                name = k
            else:
                name = index.names[k]
            out_index.insert(
                len(out_index.columns),
                name,
                index._source_data[index._source_data.columns[k]],
            )

        if len(result) == 1 and size == 0 and slice_access is False:
            # If the final result is one row and it was not reached by a
            # direct mapping, return a Series with a tuple as its name.
            result = result.T
            result = result[result._data.names[0]]
        elif len(result) == 0 and slice_access is False:
            # Pandas returns an empty Series with a tuple as its name,
            # matching the one expected result column
            series_name = []
            for idx, code in enumerate(index._source_data.columns):
                series_name.append(index._source_data[code][0])
            result = cudf.Series([])
            result.name = tuple(series_name)
        elif len(out_index.columns) == 1:
            # If there's only one column remaining in the output index, convert
            # it into an Index and name the final index values according
            # to the _source_data column names
            last_column = index._source_data.columns[-1]
            out_index = index._source_data[last_column]
            out_index = as_index(out_index)
            out_index.name = index.names[len(index.names) - 1]
            index = out_index
        elif len(out_index.columns) > 1:
            # Otherwise pop the leftmost levels, names, and codes from the
            # source index until it has the correct number of columns (n-k)
            result = result.reset_index(drop=True)
            index = index._popn(size)
        if isinstance(index_key, tuple):
            result = result.set_index(index)
        return result
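A hedged sketch of the user-visible downcasting these branches implement; exact output types depend on the cudf version:

# Hedged sketch: a full-tuple key on a 2-level MultiIndex downcasts to a
# Series (the first branch above), while a partial key keeps a frame whose
# index is the leftover level(s) (the _popn branch).
import cudf

df = cudf.DataFrame({"v": [10, 20, 30]})
df.index = cudf.MultiIndex.from_tuples(
    [("a", 1), ("a", 2), ("b", 1)], names=["k1", "k2"]
)

full = df.loc[("a", 1)]  # one row, not slice-accessed: downcast applies
partial = df.loc["a"]    # "k1" consumed; result indexed by the "k2" level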
Example #10
def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype):
    arr1 = (np.random.random(100) * 100).astype(lhs_dtype)
    sr1 = Series(arr1)

    arr2 = (np.random.random(100) * 100).astype(rhs_dtype)
    sr2 = Series(arr2)

    if obj_class == "Index":
        sr1 = as_index(sr1)
        sr2 = as_index(sr2)

    result = binop(sr1, sr2)

    if obj_class == "Index":
        result = Series(result)

    np.testing.assert_almost_equal(result.to_array(), binop(arr1, arr2))
Example #11
    def _getitem_tuple_arg(self, arg):
        from cudf import MultiIndex
        from cudf.core.column import column
        from cudf.core.index import as_index

        # Iloc Step 1:
        # Gather the columns specified by the second tuple arg
        columns_df = self._get_column_selection(arg[1])
        columns_df._index = self._df._index

        # Iloc Step 2:
        # Gather the rows specified by the first tuple arg
        if isinstance(columns_df.index, MultiIndex):
            if isinstance(arg[0], slice):
                df = columns_df[arg[0]]
            else:
                df = columns_df.index._get_row_major(columns_df, arg[0])
            if (len(df) == 1 and len(columns_df) >= 1) and not (
                isinstance(arg[0], slice) or isinstance(arg[1], slice)
            ):
                # Pandas returns a numpy scalar in this case
                return df.iloc[0]
            if self._can_downcast_to_series(df, arg):
                return self._downcast_to_series(df, arg)
            return df
        else:
            if isinstance(arg[0], slice):
                df = columns_df._slice(arg[0])
            elif is_scalar(arg[0]):
                index = arg[0]
                if index < 0:
                    index += len(columns_df)
                df = columns_df._slice(slice(index, index + 1, 1))
            else:
                arg = (column.as_column(arg[0]), arg[1])
                if pd.api.types.is_bool_dtype(arg[0]):
                    df = columns_df._apply_boolean_mask(arg[0])
                else:
                    df = columns_df._gather(arg[0])

        # Iloc Step 3:
        # Reindex
        if df.shape[0] == 1:  # we have a single row without an index
            df.index = as_index(self._df.index[arg[0]])

        # Iloc Step 4:
        # Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)

        if df.shape[0] == 0 and df.shape[1] == 0 and isinstance(arg[0], slice):
            from cudf.core.index import RangeIndex

            slice_len = len(self._df)
            start, stop, step = arg[0].indices(slice_len)
            df._index = RangeIndex(start, stop)
        return df
Example #12
def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class):
    nelem = 5
    lhs = (np.random.random(nelem) * nelem).astype(lhs_dtype)
    rhs = (np.random.random(nelem) * nelem).astype(rhs_dtype)

    sr1 = Series(lhs)
    sr2 = Series(rhs)

    if obj_class == "Index":
        sr1 = as_index(sr1)
        sr2 = as_index(sr2)

    result = cmpop(Series(sr1), Series(sr2))

    if obj_class == "Index":
        result = Series(result)

    np.testing.assert_array_equal(result.to_array(), cmpop(lhs, rhs))
Example #13
    def sort_values(self, return_indexer=False, ascending=True, key=None):
        if key is not None:
            raise NotImplementedError("key parameter is not yet implemented.")

        indices = self._source_data.argsort(ascending=ascending)
        index_sorted = as_index(self.take(indices), name=self.names)

        if return_indexer:
            return index_sorted, cupy.asarray(indices)
        else:
            return index_sorted
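A hedged usage sketch; per the code above, `return_indexer=True` additionally yields the sorting permutation as a CuPy array.

# Hedged usage sketch of sort_values with and without the indexer.
import cudf

gmi = cudf.MultiIndex.from_tuples([("b", 2), ("a", 1), ("a", 3)])

sorted_only = gmi.sort_values()
sorted_idx, indexer = gmi.sort_values(return_indexer=True)
# indexer holds the positions that sort the original index, so
# gmi.take(indexer) reproduces sorted_idx.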
Example #14
def test_index_rename():
    pds = pd.Index([1, 2, 3], name="asdf")
    gds = as_index(pds)

    expect = pds.rename("new_name")
    got = gds.rename("new_name")

    assert_eq(expect, got)
    """
    From here on we test recursive creation
    and whether the name is handled in recursive creation.
    """
    pds = pd.Index(expect)
    gds = as_index(got)

    assert_eq(pds, gds)

    pds = pd.Index(pds, name="abc")
    gds = as_index(gds, name="abc")
    assert_eq(pds, gds)
Example #15
def test_series_binop(binop, obj_class):
    nelem = 1000
    arr1 = utils.gen_rand("float64", nelem) * 10000
    # Keeping values low because CUDA 'pow' has up to 2 ulp of error over its full range
    arr2 = utils.gen_rand("float64", nelem) * 10

    sr1 = Series(arr1)
    sr2 = Series(arr2)

    if obj_class == "Index":
        sr1 = as_index(sr1)
        sr2 = as_index(sr2)

    result = binop(sr1, sr2)
    expect = binop(pd.Series(arr1), pd.Series(arr2))

    if obj_class == "Index":
        result = Series(result)

    utils.assert_eq(result, expect)
Example #16
def test_series_compare(cmpop, obj_class, dtype):
    arr1 = np.random.randint(0, 100, 100).astype(dtype)
    arr2 = np.random.randint(0, 100, 100).astype(dtype)
    sr1 = Series(arr1)
    sr2 = Series(arr2)

    if obj_class == "Index":
        sr1 = as_index(sr1)
        sr2 = as_index(sr2)

    result1 = cmpop(sr1, sr1)
    result2 = cmpop(sr2, sr2)
    result3 = cmpop(sr1, sr2)

    if obj_class == "Index":
        result1 = Series(result1)
        result2 = Series(result2)
        result3 = Series(result3)

    np.testing.assert_equal(result1.to_array(), cmpop(arr1, arr1))
    np.testing.assert_equal(result2.to_array(), cmpop(arr2, arr2))
    np.testing.assert_equal(result3.to_array(), cmpop(arr1, arr2))
Example #17
def test_series_binop_scalar(nelem, binop, obj_class):
    arr = np.random.random(nelem)
    rhs = random.choice(arr).item()
    sr = Series(arr)
    if obj_class == "Index":
        sr = as_index(sr)

    result = binop(sr, rhs)

    if obj_class == "Index":
        result = Series(result)

    np.testing.assert_almost_equal(result.to_array(), binop(arr, rhs))
Example #18
def test_index_rename_inplace():
    pds = pd.Index([1, 2, 3], name="asdf")
    gds = as_index(pds)

    # inplace=False should yield a deep copy
    gds_renamed_deep = gds.rename("new_name", inplace=False)

    assert gds_renamed_deep._values.data_ptr != gds._values.data_ptr

    # inplace=True returns None
    expected_ptr = gds._values.data_ptr
    gds.rename("new_name", inplace=True)

    assert expected_ptr == gds._values.data_ptr
Example #19
def test_index_rename_preserves_arg():
    idx1 = GenericIndex([1, 2, 3], name="orig_name")

    # this should be an entirely new object
    idx2 = idx1.rename("new_name", inplace=False)

    assert idx2.name == "new_name"
    assert idx1.name == "orig_name"

    # a new object but referencing the same data
    idx3 = as_index(idx1, name="last_name")

    assert idx3.name == "last_name"
    assert idx1.name == "orig_name"
Example #20
def test_pandas_as_index():
    # Define Pandas Indexes
    pdf_int_index = pd.Int64Index([1, 2, 3, 4, 5])
    pdf_uint_index = pd.UInt64Index([1, 2, 3, 4, 5])
    pdf_float_index = pd.Float64Index([1.0, 2.0, 3.0, 4.0, 5.0])
    pdf_datetime_index = pd.DatetimeIndex(
        [1000000, 2000000, 3000000, 4000000, 5000000]
    )
    pdf_category_index = pd.CategoricalIndex(["a", "b", "c", "b", "a"])

    # Define cudf Indexes
    gdf_int_index = as_index(pdf_int_index)
    gdf_uint_index = as_index(pdf_uint_index)
    gdf_float_index = as_index(pdf_float_index)
    gdf_datetime_index = as_index(pdf_datetime_index)
    gdf_category_index = as_index(pdf_category_index)

    # Check instance types
    assert isinstance(gdf_int_index, GenericIndex)
    assert isinstance(gdf_uint_index, GenericIndex)
    assert isinstance(gdf_float_index, GenericIndex)
    assert isinstance(gdf_datetime_index, DatetimeIndex)
    assert isinstance(gdf_category_index, CategoricalIndex)

    # Check equality
    assert_eq(pdf_int_index, gdf_int_index)
    assert_eq(pdf_uint_index, gdf_uint_index)
    assert_eq(pdf_float_index, gdf_float_index)
    assert_eq(pdf_datetime_index, gdf_datetime_index)
    assert_eq(pdf_category_index, gdf_category_index)

    assert_eq(
        pdf_category_index.codes,
        gdf_category_index.codes.astype(
            pdf_category_index.codes.dtype
        ).to_array(),
    )
Example #21
    def _getitem_tuple_arg(self, arg):
        from cudf.core.dataframe import Series, DataFrame
        from cudf.core.column import column
        from cudf.core.index import as_index
        from cudf import MultiIndex

        # Step 1: Gather columns
        columns_df = self._get_column_selection(arg[1])
        columns_df._index = self._df._index

        # Step 2: Gather rows
        if isinstance(columns_df.index, MultiIndex):
            return columns_df.index._get_row_major(columns_df, arg[0])
        else:
            df = DataFrame()
            for col in columns_df.columns:
                # need Series() in case a scalar is returned
                df[col] = Series(columns_df[col].loc[arg[0]])
            df.columns = columns_df.columns

        # Step 3: Gather index
        if df.shape[0] == 1:  # we have a single row
            if isinstance(arg[0], slice):
                start = arg[0].start
                if start is None:
                    start = self._df.index[0]
                df.index = as_index(start)
            else:
                row_selection = column.as_column(arg[0])
                if pd.api.types.is_bool_dtype(row_selection.dtype):
                    df.index = self._df.index.take(row_selection)
                else:
                    df.index = as_index(row_selection)
        # Step 4: Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
Example #22
def test_index_rename_inplace():
    pds = pd.Index([1, 2, 3], name="asdf")
    gds = as_index(pds)

    # inplace=False should yield a deep copy
    gds_renamed_deep = gds.rename("new_name", inplace=False)
    gds._values.data.mem = GenericIndex([2, 3, 4])._values.data.mem

    assert (gds_renamed_deep.values == [1, 2, 3]).all()

    # inplace=True returns None
    gds_to_rename = gds
    gds.rename("new_name", inplace=True)
    gds._values.data.mem = GenericIndex([3, 4, 5])._values.data.mem

    assert (gds_to_rename.values == [3, 4, 5]).all()
Example #23
def test_series_compare_scalar(nelem, cmpop, obj_class, dtype):
    arr1 = np.random.randint(0, 100, 100).astype(dtype)
    sr1 = Series(arr1)
    rhs = random.choice(arr1).item()

    if obj_class == "Index":
        sr1 = as_index(sr1)

    result1 = cmpop(sr1, rhs)
    result2 = cmpop(rhs, sr1)

    if obj_class == "Index":
        result1 = Series(result1)
        result2 = Series(result2)

    np.testing.assert_equal(result1.to_array(), cmpop(arr1, rhs))
    np.testing.assert_equal(result2.to_array(), cmpop(rhs, arr1))
Example #24
def test_string_index():
    from cudf.core.index import StringIndex

    pdf = pd.DataFrame(np.random.rand(5, 5))
    gdf = cudf.from_pandas(pdf)
    stringIndex = ["a", "b", "c", "d", "e"]
    pdf.index = stringIndex
    gdf.index = stringIndex
    assert_eq(pdf, gdf)
    stringIndex = np.array(["a", "b", "c", "d", "e"])
    pdf.index = stringIndex
    gdf.index = stringIndex
    assert_eq(pdf, gdf)
    stringIndex = StringIndex(["a", "b", "c", "d", "e"], name="name")
    pdf.index = stringIndex.to_pandas()
    gdf.index = stringIndex
    assert_eq(pdf, gdf)
    stringIndex = as_index(as_column(["a", "b", "c", "d", "e"]), name="name")
    pdf.index = stringIndex.to_pandas()
    gdf.index = stringIndex
    assert_eq(pdf, gdf)
Example #25
    def _getitem_tuple_arg(self, arg):
        from uuid import uuid4

        from cudf import MultiIndex
        from cudf.core.column import column
        from cudf.core.dataframe import DataFrame
        from cudf.core.index import as_index

        # Step 1: Gather columns
        if isinstance(arg, tuple):
            columns_df = self._get_column_selection(arg[1])
            columns_df._index = self._df._index
        else:
            columns_df = self._df

        # Step 2: Gather rows
        if isinstance(columns_df.index, MultiIndex):
            if isinstance(arg, (MultiIndex, pd.MultiIndex)):
                if isinstance(arg, pd.MultiIndex):
                    arg = MultiIndex.from_pandas(arg)

                indices = indices_from_labels(columns_df, arg)
                return columns_df.take(indices)

            else:
                if isinstance(arg, tuple):
                    return columns_df.index._get_row_major(columns_df, arg[0])
                else:
                    return columns_df.index._get_row_major(columns_df, arg)
        else:
            if isinstance(arg[0], slice):
                out = get_label_range_or_mask(
                    columns_df.index, arg[0].start, arg[0].stop, arg[0].step
                )
                if isinstance(out, slice):
                    df = columns_df._slice(out)
                else:
                    df = columns_df._apply_boolean_mask(out)
            else:
                tmp_arg = arg
                if is_scalar(arg[0]):
                    # If a scalar, there is a possibility of duplicates;
                    # the join would return all of them, so convert it to
                    # an array-like.
                    tmp_arg = ([tmp_arg[0]], tmp_arg[1])
                if len(tmp_arg[0]) == 0:
                    return columns_df._empty_like(keep_index=True)
                tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1])

                if pd.api.types.is_bool_dtype(tmp_arg[0]):
                    df = columns_df._apply_boolean_mask(tmp_arg[0])
                else:
                    tmp_col_name = str(uuid4())
                    other_df = DataFrame(
                        {tmp_col_name: column.arange(len(tmp_arg[0]))},
                        index=as_index(tmp_arg[0]),
                    )
                    df = other_df.join(columns_df, how="inner")
                    # since the join does not assign a name to the index,
                    # update it here
                    df.index.name = columns_df.index.name
                    df = df.sort_values(tmp_col_name)
                    df.drop(columns=[tmp_col_name], inplace=True)
                    # There were no indices found
                    if len(df) == 0:
                        raise KeyError(arg)

        # Step 3: Gather index
        if df.shape[0] == 1:  # we have a single row
            if isinstance(arg[0], slice):
                start = arg[0].start
                if start is None:
                    start = self._df.index[0]
                df.index = as_index(start)
            else:
                row_selection = column.as_column(arg[0])
                if pd.api.types.is_bool_dtype(row_selection.dtype):
                    df.index = self._df.index.take(row_selection)
                else:
                    df.index = as_index(row_selection)
        # Step 4: Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
Example #26
    def categories(self):
        from cudf.core.index import as_index

        return as_index(self._parent.categories)
Example #27
    def categories(self):
        from cudf.core.index import as_index

        return as_index(self._column.categories)
Example #28
    def __init__(self,
                 levels=None,
                 codes=None,
                 labels=None,
                 names=None,
                 **kwargs):
        from cudf.core.series import Series

        self.name = None
        self.names = names
        self._source_data = None
        column_names = []
        if labels:
            warnings.warn(
                "the 'labels' keyword is deprecated, use 'codes' "
                "instead",
                FutureWarning,
            )
        if labels and not codes:
            codes = labels

        # early termination enables lazy evaluation of codes
        if "source_data" in kwargs:
            self._source_data = kwargs["source_data"].reset_index(drop=True)
            self._codes = codes
            self._levels = levels
            return

        # name setup
        if isinstance(
                names,
            (
                Sequence,
                pd.core.indexes.frozen.FrozenNDArray,
                pd.core.indexes.frozen.FrozenList,
            ),
        ):
            if sum(x is None for x in names) > 1:
                column_names = list(range(len(codes)))
            else:
                column_names = names
        elif names is None:
            column_names = list(range(len(codes)))
        else:
            column_names = names

        if len(levels) == 0:
            raise ValueError("Must pass non-zero number of levels/codes")

        from cudf import DataFrame

        if not isinstance(codes, DataFrame) and not isinstance(
                codes[0], (Sequence, pd.core.indexes.frozen.FrozenNDArray)):
            raise TypeError("Codes is not a Sequence of sequences")

        if isinstance(codes, DataFrame):
            self._codes = codes
        elif len(levels) == len(codes):
            self._codes = DataFrame()
            for i, codes in enumerate(codes):
                name = column_names[i] or i
                codes = column.as_column(codes)
                self._codes[name] = codes.astype(np.int64)
        else:
            raise ValueError("MultiIndex has unequal number of levels and "
                             "codes and is inconsistent!")

        self._levels = [Series(level) for level in levels]
        self._validate_levels_and_codes(self._levels, self._codes)

        self._source_data = DataFrame()
        for i, name in enumerate(self._codes.columns):
            codes = as_index(self._codes[name]._column)
            if -1 in self._codes[name].values:
                # Must account for null(s) in _source_data column
                level = DataFrame(
                    {name: [None] + list(self._levels[i])},
                    index=range(-1, len(self._levels[i])),
                )
            else:
                level = DataFrame({name: self._levels[i]})
            level = DataFrame(index=codes).join(level)
            self._source_data[name] = level[name].reset_index(drop=True)

        self.names = [None] * len(self._levels) if names is None else names
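A hedged construction sketch matching the signature above: `levels` and `codes` are parallel sequences, and a code of -1 becomes a null in `_source_data`, as handled at the end of the constructor.

# Hedged construction sketch for the constructor above.
import cudf

mi = cudf.MultiIndex(
    levels=[["a", "b"], [1, 2]],
    codes=[[0, 0, 1], [0, 1, -1]],  # the -1 code maps to a null value
    names=["letter", "number"],
)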
Example #29
def to_datetime(
    arg,
    errors="raise",
    dayfirst=False,
    yearfirst=False,
    utc=None,
    format=None,
    exact=True,
    unit="ns",
    infer_datetime_format=False,
    origin="unix",
    cache=True,
):
    """
    Convert argument to datetime.

    Parameters
    ----------
    arg : int, float, str, datetime, list, tuple, 1-d array,
        Series, or DataFrame/dict-like
        The object to convert to a datetime.
    errors : {'ignore', 'raise', 'coerce', 'warn'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception.
        - If 'coerce', then invalid parsing will be set as NaT.
        - If 'warn', then prints the last exception as a warning and
            returns the input.
        - If 'ignore', then invalid parsing will return the input.
    dayfirst : bool, default False
        Specify a date parse order if `arg` is str or its list-likes.
        If True, parses dates with the day first, eg 10/11/12 is parsed as
        2012-11-10.
        Warning: dayfirst=True is not strict, but will prefer to parse
        with day first (this is a known bug, based on dateutil behavior).
    format : str, default None
        The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
        all the way up to nanoseconds.
        See strftime documentation for more information on choices:
        https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.
    unit : str, default 'ns'
        The unit of the arg (D, s, ms, us, ns), used when arg is an
        integer or float number. This will be relative to the
        origin (unix epoch start).
        Example, with unit='ms' and origin='unix' (the default), this
        would calculate the number of milliseconds to the unix epoch start.
    infer_datetime_format : bool, default False
        If True and no `format` is given, attempt to infer the format of the
        datetime strings, and if it can be inferred, switch to a faster
        method of parsing them. In some cases this can increase the parsing
        speed by ~5-10x.

    Returns
    -------
    datetime
        If parsing succeeded.
        Return type depends on input:
        - list-like: DatetimeIndex
        - Series: Series of datetime64 dtype
        - scalar: Timestamp

    Examples
    --------
    Assembling a datetime from multiple columns of a DataFrame. The keys can be
    common abbreviations like ['year', 'month', 'day', 'minute', 'second',
    'ms', 'us', 'ns'] or plurals of the same.

    >>> import cudf
    >>> df = cudf.DataFrame({'year': [2015, 2016],
    ...                    'month': [2, 3],
    ...                    'day': [4, 5]})
    >>> cudf.to_datetime(df)
    0   2015-02-04
    1   2016-03-05
    dtype: datetime64[ns]
    >>> cudf.to_datetime(1490195805, unit='s')
    numpy.datetime64('2017-03-22T15:16:45.000000000')
    >>> cudf.to_datetime(1490195805433502912, unit='ns')
    numpy.datetime64('1780-11-20T01:02:30.494253056')
    """
    if arg is None:
        return None

    if exact is False:
        raise NotImplementedError("exact support is not yet implemented")

    if origin != "unix":
        raise NotImplementedError("origin support is not yet implemented")

    if yearfirst:
        raise NotImplementedError("yearfirst support is not yet implemented")

    try:
        if isinstance(arg, cudf.DataFrame):
            # we require at least Ymd
            required = ["year", "month", "day"]
            req = list(set(required) - set(arg._data.names))
            if len(req):
                req = ",".join(req)
                raise ValueError(
                    f"to assemble mappings requires at least that "
                    f"[year, month, day] be specified: [{req}] "
                    f"is missing"
                )

            # replace passed column name with values in _unit_map
            unit = {k: get_units(k) for k in arg._data.names}
            unit_rev = {v: k for k, v in unit.items()}

            # keys we don't recognize
            excess = set(unit_rev.keys()) - set(_unit_map.values())
            if len(excess):
                excess = ",".join(excess)
                raise ValueError(
                    f"extra keys have been passed to the "
                    f"datetime assemblage: [{excess}]"
                )

            new_series = (
                arg[unit_rev["year"]].astype("str")
                + "-"
                + arg[unit_rev["month"]].astype("str").str.zfill(2)
                + "-"
                + arg[unit_rev["day"]].astype("str").str.zfill(2)
            )
            format = "%Y-%m-%d"
            col = new_series._column.as_datetime_column(
                "datetime64[s]", format=format
            )

            for u in ["h", "m", "s", "ms", "us", "ns"]:
                value = unit_rev.get(u)
                if value is not None and value in arg:
                    arg_col = arg._data[value]
                    if arg_col.dtype.kind in ("f"):
                        col = new_series._column.as_datetime_column(
                            "datetime64[ns]", format=format
                        )
                        break
                    elif arg_col.dtype.kind in ("O"):
                        if not cpp_is_integer(arg_col).all():
                            col = new_series._column.as_datetime_column(
                                "datetime64[ns]", format=format
                            )
                            break

            times_column = None
            for u in ["h", "m", "s", "ms", "us", "ns"]:
                value = unit_rev.get(u)
                if value is not None and value in arg:
                    current_col = arg._data[value]
                    # If the arg[value] is of int or
                    # float dtype we don't want to type-cast
                    if current_col.dtype.kind in ("O"):
                        try:
                            current_col = current_col.astype(dtype="int64")
                        except ValueError:
                            current_col = current_col.astype(dtype="float64")

                    factor = as_device_scalar(
                        column.datetime._numpy_to_pandas_conversion[u]
                        / (
                            column.datetime._numpy_to_pandas_conversion["s"]
                            if np.datetime_data(col.dtype)[0] == "s"
                            else 1
                        )
                    )

                    if times_column is None:
                        times_column = current_col * factor
                    else:
                        times_column = times_column + (current_col * factor)
            if times_column is not None:
                col = (col.astype(dtype="int64") + times_column).astype(
                    dtype=col.dtype
                )
            return cudf.Series(col, index=arg.index)
        elif isinstance(arg, cudf.Index):
            col = arg._values
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
            return as_index(col, name=arg.name)
        elif isinstance(arg, cudf.Series):
            col = arg._column
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
            return cudf.Series(col, index=arg.index, name=arg.name)
        else:
            col = column.as_column(arg)
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )

            if is_scalar(arg):
                return col[0]
            else:
                return as_index(col)
    except Exception as e:
        if errors == "raise":
            raise e
        elif errors == "warn":
            import traceback

            tb = traceback.format_exc()
            warnings.warn(tb)
        elif errors == "ignore":
            pass
        elif errors == "coerce":
            return np.datetime64("nat", "ns" if unit is None else unit)
        return arg
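A hedged sketch of the `errors=` branches in the except-clause above; note that on failure 'coerce' returns a single NaT scalar for the whole call rather than element-wise NaTs.

# Hedged sketch of the error-handling modes implemented above.
import cudf

ok = cudf.to_datetime(["2019-01-01", "2019-06-15"])  # parses fine
bad = ["2019-01-01", "not-a-date"]

same = cudf.to_datetime(bad, errors="ignore")  # input returned unchanged
nat = cudf.to_datetime(bad, errors="coerce")   # np.datetime64("nat", ...)
# errors="warn" emits the traceback as a warning and returns the input;
# errors="raise" (the default) re-raises the parsing exception.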
Example #30
    def __new__(
        cls,
        levels=None,
        codes=None,
        sortorder=None,
        labels=None,
        names=None,
        dtype=None,
        copy=False,
        name=None,
        **kwargs,
    ) -> "MultiIndex":

        if sortorder is not None:
            raise NotImplementedError("sortorder is not yet supported")

        if name is not None:
            raise NotImplementedError(
                "Use `names`, `name` is not yet supported"
            )

        out = Frame.__new__(cls)
        super(Index, out).__init__()

        if copy:
            if isinstance(codes, cudf.DataFrame):
                codes = codes.copy()
            if len(levels) > 0 and isinstance(levels[0], cudf.Series):
                levels = [level.copy() for level in levels]

        out._name = None

        column_names = []
        if labels:
            warnings.warn(
                "the 'labels' keyword is deprecated, use 'codes' " "instead",
                FutureWarning,
            )
        if labels and not codes:
            codes = labels

        # early termination enables lazy evaluation of codes
        if "source_data" in kwargs:
            source_data = kwargs["source_data"].copy(deep=False)
            source_data.reset_index(drop=True, inplace=True)

            if isinstance(source_data, pd.DataFrame):
                nan_as_null = kwargs.get("nan_as_null", None)
                source_data = cudf.DataFrame.from_pandas(
                    source_data, nan_as_null=nan_as_null
                )
            names = names if names is not None else source_data._data.names
            # if names are unique
            # try using those as the source_data column names:
            if len(dict.fromkeys(names)) == len(names):
                source_data.columns = names
            out._data = source_data._data
            out.names = names
            out._codes = codes
            out._levels = levels
            return out

        # name setup
        if isinstance(names, (Sequence, pd.core.indexes.frozen.FrozenList,),):
            if sum(x is None for x in names) > 1:
                column_names = list(range(len(codes)))
            else:
                column_names = names
        elif names is None:
            column_names = list(range(len(codes)))
        else:
            column_names = names

        if len(levels) == 0:
            raise ValueError("Must pass non-zero number of levels/codes")

        if not isinstance(codes, cudf.DataFrame) and not isinstance(
            codes[0], (Sequence, np.ndarray)
        ):
            raise TypeError("Codes is not a Sequence of sequences")

        if isinstance(codes, cudf.DataFrame):
            out._codes = codes
        elif len(levels) == len(codes):
            out._codes = cudf.DataFrame()
            for i, codes in enumerate(codes):
                name = column_names[i] or i
                codes = column.as_column(codes)
                out._codes[name] = codes.astype(np.int64)
        else:
            raise ValueError(
                "MultiIndex has unequal number of levels and "
                "codes and is inconsistent!"
            )

        out._levels = [cudf.Series(level) for level in levels]
        out._validate_levels_and_codes(out._levels, out._codes)

        source_data = cudf.DataFrame()
        for i, name in enumerate(out._codes.columns):
            codes = as_index(out._codes[name]._column)
            if -1 in out._codes[name].values:
                # Must account for null(s) in _source_data column
                level = cudf.DataFrame(
                    {name: [None] + list(out._levels[i])},
                    index=range(-1, len(out._levels[i])),
                )
            else:
                level = cudf.DataFrame({name: out._levels[i]})

            source_data[name] = libcudf.copying.gather(
                level, codes._data.columns[0]
            )._data[name]

        out._data = source_data._data
        out.names = names

        return out