def test_pandas_as_index():
    """Check ``as_index`` on int, float, datetime and categorical indexes.

    Every pandas index is converted with ``as_index``; the result must be an
    instance of the expected cudf index class and compare equal to its pandas
    source. The categorical codes are compared as well.
    """
    # (pandas source index, cudf class expected from as_index)
    cases = [
        (pd.Int64Index([1, 2, 3, 4, 5]), GenericIndex),
        (pd.Float64Index([1.0, 2.0, 3.0, 4.0, 5.0]), GenericIndex),
        (
            pd.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]),
            DatetimeIndex,
        ),
        (pd.CategoricalIndex(["a", "b", "c", "b", "a"]), CategoricalIndex),
    ]
    pandas_indexes = [source for source, _ in cases]
    expected_classes = [klass for _, klass in cases]

    # Convert everything first, then run the checks group by group.
    cudf_indexes = [as_index(source) for source in pandas_indexes]

    # Check instance types
    for converted, klass in zip(cudf_indexes, expected_classes):
        assert isinstance(converted, klass)

    # Check equality
    for source, converted in zip(pandas_indexes, cudf_indexes):
        assert_eq(source, converted)

    # The categorical case is last in the table.
    pdf_category_index = pandas_indexes[-1]
    gdf_category_index = cudf_indexes[-1]
    assert_eq(pdf_category_index.codes, gdf_category_index.codes.to_array())
def test_pandas_as_index():
    """Check that ``as_index`` maps pandas indexes onto cudf index classes.

    Integer and float indexes are expected to become ``GenericIndex``;
    datetime and categorical indexes are expected to keep a dedicated class.
    Each converted index must also compare equal to its pandas source.
    """
    # Define Pandas Indexes, keyed by a short label
    pandas_by_kind = {
        'int': pd.Int64Index([1, 2, 3, 4, 5]),
        'float': pd.Float64Index([1., 2., 3., 4., 5.]),
        'datetime': pd.DatetimeIndex(
            [1000000, 2000000, 3000000, 4000000, 5000000]),
        'category': pd.CategoricalIndex(['a', 'b', 'c', 'b', 'a']),
    }

    # Define cudf Indexes
    cudf_by_kind = {
        kind: as_index(pandas_index)
        for kind, pandas_index in pandas_by_kind.items()
    }

    # Check instance types
    assert isinstance(cudf_by_kind['int'], GenericIndex)
    assert isinstance(cudf_by_kind['float'], GenericIndex)
    assert isinstance(cudf_by_kind['datetime'], DatetimeIndex)
    assert isinstance(cudf_by_kind['category'], CategoricalIndex)

    # Check equality
    for kind in ('int', 'float', 'datetime', 'category'):
        assert_eq(pandas_by_kind[kind], cudf_by_kind[kind])
def _apply_basic_agg(self, agg_type, sort_results=False):
    """
    Run one aggregation through ``self._apply_agg`` and shape the result.

    Parameters
    ----------
    agg_type : str
        The aggregation function to run.
    sort_results : bool, default False
        Forwarded to ``self._apply_agg`` as its ``sort_result`` argument.

    Returns
    -------
    Series or DataFrame
        A Series when ``self._val_columns`` is a single ``str``/``Number``
        and ``self._as_index`` is set; otherwise the aggregated DataFrame,
        re-indexed by the first ``self._by`` column when ``self._as_index``
        is set.
    """
    result = DataFrame()
    add_col_values = True

    # Aggregation context handed to _apply_agg.
    ctx = ffi.new('gdf_context*')
    ctx.flag_sorted = 0
    ctx.flag_method = self._method
    ctx.flag_distinct = 0

    # Input and output value columns are the same selection here.
    val_columns = self._val_columns
    val_columns_out = self._val_columns

    result = self._apply_agg(agg_type, result, add_col_values, ctx,
                             val_columns, val_columns_out,
                             sort_result=sort_results)

    # If a Groupby has one index column and one value column
    # and as_index is set, return a Series instead of a df
    if isinstance(val_columns, (str, Number)) and self._as_index:
        result_series = result[val_columns]
        idx = index.as_index(result[self._by[0]])
        # Level-0 groupbys restore the original index name.
        if self.level == 0:
            idx.name = self._original_index_name
        else:
            idx.name = self._by[0]
        result_series = result_series.set_index(idx)
        # NOTE(review): this early return skips the nvtx_range_pop() call
        # below; the matching push is not visible in this function, so
        # confirm whether the range is left open on this path.
        return result_series

    # TODO: Do MultiIndex here
    if (self._as_index):
        idx = index.as_index(result[self._by[0]])
        # Name the index after the key column first so that name can be
        # used to drop the key column from the result.
        idx.name = self._by[0]
        result.drop_column(idx.name)
        if self.level == 0:
            idx.name = self._original_index_name
        else:
            idx.name = self._by[0]
        result = result.set_index(idx)

    nvtx_range_pop()

    return result
def _getitem_tuple_arg(self, arg):
    """Select rows and columns for a ``(row_key, column_key)`` tuple.

    Parameters
    ----------
    arg : tuple
        ``arg[0]`` is the row selector applied through each column's
        ``.loc``; ``arg[1]`` is the column selector resolved by
        ``self._get_column_selection``.

    Returns
    -------
    DataFrame
        The selected columns restricted to the selected rows. When exactly
        one row is selected, the index is rebuilt from the row selector.
    """
    from cudf.dataframe.dataframe import DataFrame
    from cudf.dataframe.index import as_index

    columns = self._get_column_selection(arg[1])
    df = DataFrame()
    for col in columns:
        df.add_column(name=col, data=self._df[col].loc[arg[0]])

    if df.shape[0] == 1:  # we have a single row
        if isinstance(arg[0], slice):
            start = arg[0].start
            if start is None:
                # An open-start slice has no label to build the index
                # from; use the first label of the source index instead
                # of handing None to as_index.
                start = self._df.index[0]
            df.index = as_index(start)
        else:
            df.index = as_index(arg[0])
    return df
def quantile(self, q, interpolation='midpoint', exact=True, quant_index=True):
    """
    Return values at the given quantile.

    Parameters
    ----------
    q : float or array-like
        0 <= q <= 1, the quantile(s) to compute
    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
        This optional parameter specifies the interpolation method to use,
        when the desired quantile lies between two data points i and j.
    exact : boolean
        Whether to use approximate or exact quantile algorithm.
    quant_index : boolean
        Whether to use the list of quantiles as index.

    Returns
    -------
    Series
    """
    quantile_values = self._column.quantile(q, interpolation, exact)
    if quant_index:
        # Label each result with the quantile it was computed for.
        return Series(quantile_values, index=as_index(np.asarray(q)))
    return Series(quantile_values)
def test_reflected_ops_scalar(func, dtype, obj_class):
    """Compare a reflected scalar operation on cudf against pandas.

    Parameters
    ----------
    func : callable
        Operation applied to both the cudf object and the pandas Series.
    dtype : dtype-like
        dtype of the randomly generated input data.
    obj_class : str
        ``'Index'`` runs the operation on an index built with ``as_index``;
        any other value runs it on the Series directly.
    """
    import pandas as pd

    # create random series
    np.random.seed(12)
    random_series = pd.Series(np.random.sample(100) + 10, dtype=dtype)

    # gpu series
    gs = Series(random_series)

    # class typing
    if obj_class == 'Index':
        gs = as_index(gs)

    gs_result = func(gs)

    # class typing: wrap the *result* back into a Series so it is compared
    # the same way as in the Series case (previously the input was
    # re-wrapped instead, which had no effect on the comparison).
    if obj_class == 'Index':
        gs_result = Series(gs_result)

    # pandas
    ps_result = func(random_series)

    # verify
    np.testing.assert_allclose(ps_result, gs_result)
def test_categorical_basic():
    """Check a categorical cudf Series and index against pandas.

    Compares codes, dtype, the ``cat`` accessor attributes and the string
    rendering of a Series built from a pandas Categorical, and compares the
    codes of the index built by ``as_index`` from the same categorical.
    """
    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
    cudf_cat = as_index(cat)

    pandas_series = pd.Series(cat)
    cudf_series = Series(cat)
    np.testing.assert_array_equal(cat.codes, cudf_series.to_array())
    assert cudf_series.dtype == pandas_series.dtype

    # Test attributes
    pandas_accessor = pandas_series.cat
    assert tuple(pandas_accessor.categories) == tuple(cudf_series.cat.categories)
    assert pandas_accessor.ordered == cudf_series.cat.ordered
    np.testing.assert_array_equal(
        pandas_series.cat.codes.values, cudf_series.cat.codes.to_array()
    )
    np.testing.assert_array_equal(
        pandas_series.cat.codes.dtype, cudf_series.cat.codes.dtype
    )

    # Compare the rendering token by token so exact spacing is irrelevant.
    string = str(cudf_series)
    expect_str = """ 0 a 1 a 2 b 3 c 4 a """
    rendered_tokens = string.split()
    expected_tokens = expect_str.split()
    assert all(x == y for x, y in zip(rendered_tokens, expected_tokens))

    assert_eq(cat.codes, cudf_cat.codes.to_array())
def reverse(self):
    """Reverse the Series.

    Both the values and the index are passed through
    ``cudautils.reverse_array``; the result is built with
    ``self._copy_construct``.
    """
    reversed_values = cudautils.reverse_array(self.to_gpu_array())
    reversed_index = as_index(cudautils.reverse_array(self.index.gpu_values))
    reversed_column = self._column.replace(data=Buffer(reversed_values))
    return self._copy_construct(data=reversed_column, index=reversed_index)
def _get_column_major(self, df, row_tuple):
    """Select columns of ``df`` by ``row_tuple`` and rebuild its columns.

    Parameters
    ----------
    df : DataFrame
        Frame whose columns are selected.
    row_tuple : number, slice or sequence
        Column key; a bare number or slice is wrapped in a list after the
        valid column positions have been computed.

    Returns
    -------
    DataFrame
        The selected columns. ``result.columns`` is rebuilt from this index,
        and flattened with ``as_index`` when a single level remains.
    """
    from cudf import Series
    from cudf import DataFrame

    valid_indices = self._get_valid_indices_by_tuple(
        df.columns, row_tuple, len(df._cols)
    )
    result = df._take_columns(valid_indices)

    if isinstance(row_tuple, (numbers.Number, slice)):
        row_tuple = [row_tuple]

    nothing_selected = len(result) == 0 and len(result.columns) == 0
    if nothing_selected:
        # Keep the column names but give every level an empty code column.
        empty_columns = df.columns.copy(deep=False)
        empty_codes = DataFrame()
        for level_name in df.columns.names:
            empty_codes[level_name] = Series([])
        empty_columns._codes = empty_codes
        empty_columns._source_data = empty_codes
        result.columns = empty_columns
    elif (
        len(row_tuple) < len(self.levels)
        and slice(None) not in row_tuple
        and not isinstance(row_tuple[0], slice)
    ):
        # Partial key without slices: build the columns from
        # self._popn(len(row_tuple)) instead of from this index.
        remaining_levels = self._popn(len(row_tuple))
        result.columns = remaining_levels.take(valid_indices)
    else:
        result.columns = self.take(valid_indices)

    if len(result.columns.levels) == 1:
        # One level left: replace the MultiIndex with a flat index of labels.
        selected = result.columns
        level_codes = selected.codes[selected.codes.columns[0]]
        labels = [selected.levels[0][code] for code in level_codes]
        result.columns = as_index(labels, name=selected.names[0])
    return result
def test_index_rename():
    """Check that renaming a cudf index matches renaming the pandas index."""
    pandas_index = pd.Index([1, 2, 3], name='asdf')
    cudf_index = as_index(pandas_index)

    assert_eq(pandas_index.rename('new_name'), cudf_index.rename('new_name'))
def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype):
    """Compare a bitwise binary op on cudf objects against NumPy.

    Operands are random values scaled by 100 and cast to ``lhs_dtype`` and
    ``rhs_dtype``. With ``obj_class == 'Index'`` the op runs on indexes and
    the result is wrapped back into a Series before comparing.
    """
    host_lhs = (np.random.random(100) * 100).astype(lhs_dtype)
    gpu_lhs = Series(host_lhs)

    host_rhs = (np.random.random(100) * 100).astype(rhs_dtype)
    gpu_rhs = Series(host_rhs)

    use_index = obj_class == 'Index'
    if use_index:
        gpu_lhs, gpu_rhs = as_index(gpu_lhs), as_index(gpu_rhs)

    result = binop(gpu_lhs, gpu_rhs)
    if use_index:
        result = Series(result)

    got = result.to_array()
    expected = binop(host_lhs, host_rhs)
    np.testing.assert_almost_equal(got, expected)
def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class):
    """Compare a comparison op on mixed-dtype cudf operands against NumPy.

    In the ``'Index'`` case the operands are converted with ``as_index``,
    but they are wrapped in ``Series`` again before ``cmpop`` is applied.
    """
    nelem = 5
    host_lhs = (np.random.random(nelem) * nelem).astype(lhs_dtype)
    host_rhs = (np.random.random(nelem) * nelem).astype(rhs_dtype)

    gpu_lhs = Series(host_lhs)
    gpu_rhs = Series(host_rhs)

    use_index = obj_class == 'Index'
    if use_index:
        gpu_lhs, gpu_rhs = as_index(gpu_lhs), as_index(gpu_rhs)

    result = cmpop(Series(gpu_lhs), Series(gpu_rhs))
    if use_index:
        result = Series(result)

    got = result.to_array()
    expected = cmpop(host_lhs, host_rhs)
    np.testing.assert_array_equal(got, expected)
def test_series_binop_mixed_dtype(binop, lhs_dtype, rhs_dtype, obj_class):
    """Compare a binary op on mixed-dtype cudf operands against NumPy.

    In the ``"Index"`` case the operands are converted with ``as_index``,
    but they are wrapped in ``Series`` again before ``binop`` is applied.
    """
    nelem = 10
    host_lhs = (np.random.random(nelem) * nelem).astype(lhs_dtype)
    host_rhs = (np.random.random(nelem) * nelem).astype(rhs_dtype)

    gpu_lhs = Series(host_lhs)
    gpu_rhs = Series(host_rhs)

    use_index = obj_class == "Index"
    if use_index:
        gpu_lhs, gpu_rhs = as_index(gpu_lhs), as_index(gpu_rhs)

    result = binop(Series(gpu_lhs), Series(gpu_rhs))
    if use_index:
        result = Series(result)

    got = result.to_array()
    expected = binop(host_lhs, host_rhs)
    np.testing.assert_almost_equal(got, expected)
def set_index(self, index):
    """Returns a new Series with a different index.

    Parameters
    ----------
    index : Index, Series-convertible
        the new index or values for the new index; anything that is not
        already an ``Index`` is converted with ``as_index``
    """
    if not isinstance(index, Index):
        index = as_index(index)
    return self._copy_construct(index=index)
def _getitem_tuple_arg(self, arg):
    """Select rows and columns for a ``(row_key, column_key)`` tuple.

    Parameters
    ----------
    arg : tuple
        ``arg[0]`` selects rows and ``arg[1]`` selects columns.

    Returns
    -------
    object
        Whatever ``_get_row_major`` returns when the row index is a
        MultiIndex; otherwise the selected DataFrame, or the result of
        ``self._downcast_to_series`` when ``self._can_downcast_to_series``
        allows it.
    """
    from cudf.dataframe.dataframe import DataFrame
    from cudf.dataframe.index import as_index
    from cudf.utils.cudautils import arange
    from cudf import MultiIndex

    # Step 1: Gather columns
    if isinstance(self._df.columns, MultiIndex):
        columns_df = self._df.columns._get_column_major(self._df, arg[1])
    else:
        columns = self._get_column_selection(arg[1])
        columns_df = DataFrame()
        for col in columns:
            columns_df.add_column(name=col, data=self._df[col])

    # Step 2: Gather rows
    if isinstance(columns_df.index, MultiIndex):
        # MultiIndex rows are resolved entirely by the index itself.
        return columns_df.index._get_row_major(columns_df, arg[0])
    else:
        if isinstance(self._df.columns, MultiIndex):
            # MultiIndex columns: rows are gathered with take() rather
            # than through each column's .loc.
            if isinstance(arg[0], slice):
                start, stop, step = arg[0].indices(len(columns_df))
                indices = arange(start, stop, step)
                df = columns_df.take(indices)
            else:
                df = columns_df.take(arg[0])
        else:
            df = DataFrame()
            for col in columns_df.columns:
                df[col] = columns_df[col].loc[arg[0]]

    # Step 3: Gather index
    if df.shape[0] == 1:  # we have a single row
        if isinstance(arg[0], slice):
            start = arg[0].start
            # An open-start slice falls back to the first source label.
            if start is None:
                start = self._df.index[0]
            df.index = as_index(start)
        else:
            df.index = as_index(arg[0])

    # Step 4: Downcast
    if self._can_downcast_to_series(df, arg):
        return self._downcast_to_series(df, arg)
    return df
def test_series_binop(binop, obj_class):
    """Compare a binary op on cudf Series/Index against pandas Series.

    With ``obj_class == 'Index'`` the op runs on indexes built with
    ``as_index`` and the result is wrapped back into a Series before the
    comparison.
    """
    nelem = 1000
    host_lhs = utils.gen_rand('float64', nelem) * 10000
    # Keeping a low value because CUDA 'pow' has 2 full range error
    host_rhs = utils.gen_rand('float64', nelem) * 10

    gpu_lhs = Series(host_lhs)
    gpu_rhs = Series(host_rhs)

    use_index = obj_class == 'Index'
    if use_index:
        gpu_lhs, gpu_rhs = as_index(gpu_lhs), as_index(gpu_rhs)

    result = binop(gpu_lhs, gpu_rhs)
    expect = binop(pd.Series(host_lhs), pd.Series(host_rhs))

    if use_index:
        result = Series(result)

    utils.assert_eq(result, expect)
def test_index_rename():
    """Check index renaming and name handling when re-wrapping an index."""
    pandas_index = pd.Index([1, 2, 3], name="asdf")
    cudf_index = as_index(pandas_index)

    renamed_pandas = pandas_index.rename("new_name")
    renamed_cudf = cudf_index.rename("new_name")
    assert_eq(renamed_pandas, renamed_cudf)

    # From here on testing recursive creation and if name is being
    # handled in recursive creation.
    rewrapped_pandas = pd.Index(renamed_pandas)
    rewrapped_cudf = as_index(renamed_cudf)
    assert_eq(rewrapped_pandas, rewrapped_cudf)

    named_pandas = pd.Index(rewrapped_pandas, name="abc")
    named_cudf = as_index(rewrapped_cudf, name="abc")
    assert_eq(named_pandas, named_cudf)
def test_series_binop(binop, obj_class):
    """Compare a binary op applied to a cudf object and itself with NumPy.

    With ``obj_class == 'Index'`` the operand is an index built with
    ``as_index`` and the result is wrapped back into a Series.
    """
    host_values = np.random.random(100)
    operand = Series(host_values)

    use_index = obj_class == 'Index'
    if use_index:
        operand = as_index(operand)

    result = binop(operand, operand)
    if use_index:
        result = Series(result)

    got = result.to_array()
    expected = binop(host_values, host_values)
    np.testing.assert_almost_equal(got, expected)
def test_series_compare(cmpop, obj_class, dtype):
    """Compare a comparison op on cudf Series/Index against NumPy.

    Three pairings are checked: the first operand with itself, the second
    with itself, and the first with the second.
    """
    arr1 = np.random.randint(0, 100, 100).astype(dtype)
    arr2 = np.random.randint(0, 100, 100).astype(dtype)
    sr1 = Series(arr1)
    sr2 = Series(arr2)

    use_index = obj_class == 'Index'
    if use_index:
        sr1, sr2 = as_index(sr1), as_index(sr2)

    # Run all three comparisons before converting or asserting anything.
    results = [cmpop(sr1, sr1), cmpop(sr2, sr2), cmpop(sr1, sr2)]
    if use_index:
        results = [Series(result) for result in results]

    host_pairs = [(arr1, arr1), (arr2, arr2), (arr1, arr2)]
    for result, (host_lhs, host_rhs) in zip(results, host_pairs):
        np.testing.assert_equal(result.to_array(), cmpop(host_lhs, host_rhs))
def test_series_binop_scalar(nelem, binop, obj_class):
    """Compare a binary op between a cudf object and a scalar with NumPy.

    Parameters
    ----------
    nelem : int
        Number of random elements to generate.
    binop : callable
        Binary operation under test.
    obj_class : str
        ``'Index'`` runs the op on an index built with ``as_index`` and
        wraps the result back into a Series; any other value runs it on
        the Series directly.
    """
    arr = np.random.random(nelem)
    # ``.item()`` converts the NumPy scalar to a Python scalar; it replaces
    # ``np.asscalar``, which is deprecated and removed in newer NumPy.
    rhs = random.choice(arr).item()
    sr = Series(arr)
    if obj_class == 'Index':
        sr = as_index(sr)
    result = binop(sr, rhs)
    if obj_class == 'Index':
        result = Series(result)
    np.testing.assert_almost_equal(result.to_array(), binop(arr, rhs))
def value_counts(self, method='sort', sort=True):
    """Return the count of each value of this Series, indexed by value.

    Parameters
    ----------
    method : str, default 'sort'
        Only ``'sort'`` is supported; anything else raises
        ``NotImplementedError``.
    sort : bool, default True
        When true, the counts are returned sorted in descending order.

    Returns
    -------
    Series
        Counts indexed by value; an empty int64 Series when every element
        is null.
    """
    if method != 'sort':
        raise NotImplementedError(
            'non sort based value_count() not implemented yet'
        )

    if self.null_count == len(self):
        return Series(np.array([], dtype=np.int64))

    values, counts = self._column.value_counts(method=method)
    counted = Series(counts, index=as_index(values))
    return counted.sort_values(ascending=False) if sort else counted
def __init__(self, data=None, index=None, name=None, nan_as_null=True, dtype=None):
    """Build a Series from ``data``.

    Parameters
    ----------
    data : pandas.Series, Series, column or column-convertible, optional
        A pandas Series supplies both name and index, replacing the
        ``name``/``index`` arguments. A cudf Series supplies the name, and
        its index only when ``index`` is None. ``None`` is treated as ``{}``.
    index : Index or index-convertible, optional
        Anything that is not an ``Index`` is converted with ``as_index``.
        Defaults to a ``RangeIndex`` over the data.
    name : object, optional
        Name of the Series.
    nan_as_null : bool, default True
        Forwarded to ``columnops.as_column``.
    dtype : optional
        Forwarded to ``columnops.as_column``.
    """
    if isinstance(data, pd.Series):
        name, index = data.name, as_index(data.index)

    if isinstance(data, Series):
        if index is None:
            index = data._index
        name = data.name
        data = data._column

    if data is None:
        data = {}

    if not isinstance(data, columnops.TypedColumnBase):
        data = columnops.as_column(data, nan_as_null=nan_as_null, dtype=dtype)

    if not (index is None or isinstance(index, Index)):
        index = as_index(index)

    assert isinstance(data, columnops.TypedColumnBase)
    self._column = data
    self._index = index if index is not None else RangeIndex(len(data))
    self.name = name
def test_series_compare_scalar(nelem, cmpop, obj_class, dtype):
    """Compare a comparison op between a cudf object and a scalar with NumPy.

    The scalar is used on both sides: ``cmpop(obj, rhs)`` and
    ``cmpop(rhs, obj)``.

    Parameters
    ----------
    nelem : int
        Not used by the body; the data always has 100 elements.
    cmpop : callable
        Comparison operation under test.
    obj_class : str
        ``'Index'`` runs the op on an index built with ``as_index`` and
        wraps the results back into Series.
    dtype : dtype-like
        dtype the random integers are cast to.
    """
    arr1 = np.random.randint(0, 100, 100).astype(dtype)
    sr1 = Series(arr1)
    # ``.item()`` converts the NumPy scalar to a Python scalar; it replaces
    # ``np.asscalar``, which is deprecated and removed in newer NumPy.
    rhs = random.choice(arr1).item()

    if obj_class == 'Index':
        sr1 = as_index(sr1)

    result1 = cmpop(sr1, rhs)
    result2 = cmpop(rhs, sr1)

    if obj_class == 'Index':
        result1 = Series(result1)
        result2 = Series(result2)

    np.testing.assert_equal(result1.to_array(), cmpop(arr1, rhs))
    np.testing.assert_equal(result2.to_array(), cmpop(rhs, arr1))
def apply_multiindex_or_single_index(self, result):
    """Move the group-by key columns of ``result`` into its index.

    Parameters
    ----------
    result : DataFrame
        Aggregation output that still contains the ``self._by`` key columns.

    Returns
    -------
    DataFrame or Series
        ``result`` indexed by the single key column, or by a MultiIndex
        built from all key columns. A Series is returned instead when only
        one value column remains and ``self`` has a ``_gotattr`` attribute.
    """
    if len(result) == 0:
        # Empty result: keep only the value columns and attach an empty
        # index of the appropriate kind.
        final_result = DataFrame()
        for col in result.columns:
            if col not in self._by:
                final_result[col] = result[col]
        if len(self._by) == 1 or len(final_result.columns) == 0:
            dtype = 'float64' if len(self._by) == 1 else 'object'
            name = self._by[0] if len(self._by) == 1 else None
            from cudf.dataframe.index import GenericIndex
            # ``index`` is a local name throughout this function because of
            # the ``from cudf.dataframe import index`` import further down.
            index = GenericIndex(Series([], dtype=dtype))
            index.name = name
            final_result.index = index
        else:
            mi = MultiIndex(source_data=result[self._by])
            mi.names = self._by
            final_result.index = mi
        if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
            final_series = Series([], name=final_result.columns[0])
            final_series.index = final_result.index
            return final_series
        return final_result
    if len(self._by) == 1:
        # Single key: the key column becomes a plain index.
        from cudf.dataframe import index
        idx = index.as_index(result[self._by[0]])
        idx.name = self._by[0]
        result = result.drop(idx.name)
        # The level-0 placeholder name is swapped for the original name.
        if idx.name == self._LEVEL_0_INDEX_NAME:
            idx.name = self._original_index_name
        result = result.set_index(idx)
        return result
    else:
        # Several keys: build a MultiIndex and keep only the value columns.
        multi_index = MultiIndex(source_data=result[self._by])
        final_result = DataFrame()
        for col in result.columns:
            if col not in self._by:
                final_result[col] = result[col]
        if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
            final_series = Series(final_result[final_result.columns[0]])
            final_series.name = final_result.columns[0]
            final_series.index = multi_index
            return final_series
        return final_result.set_index(multi_index)
def __getitem__(self, arg):
    """Positional selection on the wrapped Series ``self._sr``.

    Parameters
    ----------
    arg : int, slice or tuple
        Position(s) to select; negative positions count from the end.

    Returns
    -------
    scalar or Series
        The single element for an ``int`` argument, otherwise a Series
        indexed by the selected (non-negative) positions.

    Raises
    ------
    TypeError
        If ``arg`` is not an int, slice or tuple.
    IndexError
        If any position is out of bounds.
    """
    len_idx = len(self._sr)

    if isinstance(arg, tuple):
        rows = list(arg)
    elif isinstance(arg, int):
        rows = [arg]
    elif isinstance(arg, slice):
        start, stop, step, sln = utils.standard_python_slice(len_idx, arg)
        rows = list(range(start, stop, step)) if sln > 0 else []
    else:
        raise TypeError(type(arg))

    # To check whether all the indices are valid: -len <= position < len.
    if any(abs(pos) > len_idx or pos == len_idx for pos in rows):
        raise IndexError("positional indexers are out-of-bounds")

    # Map negative positions onto their non-negative equivalents.
    rows = [len_idx + pos if pos < 0 else pos for pos in rows]

    # returns the single elem similar to pandas
    if isinstance(arg, int) and len(rows) == 1:
        return self._sr[rows[0]]

    ret_list = [self._sr[pos] for pos in rows]
    col_data = columnops.as_column(
        ret_list, dtype=self._sr.dtype, nan_as_null=True
    )
    return Series(col_data, index=as_index(np.asarray(rows)))
def __init__(self, data=None, index=None, name=None, nan_as_null=True):
    """Build a Series from ``data``.

    Parameters
    ----------
    data : pandas.Series, Series, column or column-convertible, optional
        A pandas Series supplies both name and index, replacing the
        ``name``/``index`` arguments. A cudf Series supplies the name, and
        its index only when ``index`` is None. ``None`` is treated as ``{}``.
    index : Index, optional
        Defaults to a ``RangeIndex`` over the data.
    name : object, optional
        Name of the Series.
    nan_as_null : bool, default True
        Forwarded to ``columnops.as_column``.

    Raises
    ------
    TypeError
        If ``index`` is given and is not an ``Index``.
    """
    if isinstance(data, pd.Series):
        name, index = data.name, as_index(data.index)

    if isinstance(data, Series):
        if index is None:
            index = data._index
        name = data.name
        data = data._column

    if data is None:
        data = {}

    if not isinstance(data, columnops.TypedColumnBase):
        data = columnops.as_column(data, nan_as_null=nan_as_null)

    if not (index is None or isinstance(index, Index)):
        raise TypeError('index not a Index type: got {!r}'.format(index))

    assert isinstance(data, columnops.TypedColumnBase)
    self._column = data
    self._index = index if index is not None else RangeIndex(len(data))
    self.name = name
def to_pandas(self):
    """Convert this MultiIndex to a ``pandas.MultiIndex``.

    Returns
    -------
    pandas.MultiIndex
        Built from this index's levels and codes; ``names`` is copied over
        when it is not None.
    """
    pandas_codes = [self.codes[code].to_array() for code in self.codes.columns]

    # We do two things here to mimic Pandas behavior:
    # 1. as_index() on each level, so DatetimeColumn becomes DatetimeIndex
    # 2. convert levels to numpy array so empty levels become Float64Index
    levels = np.array([as_index(level).to_pandas() for level in self.levels])

    # Backwards compatibility:
    # Construct a dummy MultiIndex and check for the codes attr.
    # This indicates that it is pandas >= 0.24
    # If no codes attr is present it is pandas <= 0.23
    probe = pd.MultiIndex([[]], [[]])
    codes_keyword = "codes" if hasattr(probe, "codes") else "labels"
    pandas_mi = pd.MultiIndex(levels=levels, **{codes_keyword: pandas_codes})

    if self.names is not None:
        pandas_mi.names = self.names
    return pandas_mi
def test_reflected_ops_scalar(func, dtype, obj_class):
    """Compare a reflected scalar operation on cudf against the host data.

    Parameters
    ----------
    func : callable
        Operation applied to both the cudf object and the host data.
    dtype : dtype-like
        dtype passed to ``utils.gen_rand`` for the input data.
    obj_class : str
        ``'Index'`` runs the operation on an index built with ``as_index``;
        any other value runs it on the Series directly.
    """
    # create random series
    np.random.seed(12)
    random_series = utils.gen_rand(dtype, 100, low=10)

    # gpu series
    gs = Series(random_series)

    # class typing
    if obj_class == 'Index':
        gs = as_index(gs)

    gs_result = func(gs)

    # class typing: wrap the *result* back into a Series so it is compared
    # the same way as in the Series case (previously the input was
    # re-wrapped instead, which had no effect on the comparison).
    if obj_class == 'Index':
        gs_result = Series(gs_result)

    # pandas
    ps_result = func(random_series)

    # verify
    np.testing.assert_allclose(ps_result, gs_result)
def _getitem_tuple_arg(self, arg):
    """Positional selection for a ``(row_key, column_key)`` tuple.

    Parameters
    ----------
    arg : tuple
        ``arg[0]`` selects rows and ``arg[1]`` selects columns.

    Returns
    -------
    object
        A DataFrame, a Series, or a single element, depending on which of
        the branches below applies.
    """
    from cudf import MultiIndex
    from cudf.dataframe.dataframe import DataFrame
    from cudf.dataframe.dataframe import Series
    from cudf.dataframe.index import as_index

    # Iloc Step 1:
    # Gather the columns specified by the second tuple arg
    columns = self._get_column_selection(arg[1])
    if isinstance(self._df.columns, MultiIndex):
        columns_df = self._df.columns._get_column_major(self._df, arg[1])
        if (
            len(columns_df) == 0
            and len(columns_df.columns) == 0
            and not isinstance(arg[0], slice)
        ):
            # Nothing selected and a non-slice row key: return an empty
            # Series named after the row key.
            result = Series([], name=arg[0])
            result._index = columns_df.columns.copy(deep=False)
            return result
    else:
        if isinstance(arg[0], slice):
            columns_df = DataFrame()
            for col in columns:
                columns_df.add_column(name=col, data=self._df[col])
            columns_df._index = self._df._index
        else:
            columns_df = self._df._columns_view(columns)

    # Iloc Step 2:
    # Gather the rows specified by the first tuple arg
    if isinstance(columns_df.index, MultiIndex):
        df = columns_df.index._get_row_major(columns_df, arg[0])
        if (len(df) == 1 and len(columns_df) >= 1) and not (
            isinstance(arg[0], slice) or isinstance(arg[1], slice)
        ):
            # Pandas returns a numpy scalar in this case
            return df[0]
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
    else:
        df = DataFrame()
        for key, col in columns_df._cols.items():
            df[key] = col.iloc[arg[0]]
        df.columns = columns_df.columns

    # Iloc Step 3:
    # Reindex
    if df.shape[0] == 1:  # we have a single row without an index
        if isinstance(arg[0], slice):
            start = arg[0].start
            # An open-start slice begins at position 0.
            if start is None:
                start = 0
            df.index = as_index(self._df.index[start])
        else:
            df.index = as_index(self._df.index[arg[0]])

    # Iloc Step 4:
    # Downcast
    if self._can_downcast_to_series(df, arg):
        if isinstance(df.columns, MultiIndex):
            if len(df) > 0 and not (
                isinstance(arg[0], slice) or isinstance(arg[1], slice)
            ):
                # Neither key is a slice: return the first element of the
                # first column.
                return list(df._cols.values())[0][0]
            elif df.shape[1] > 1:
                result = self._downcast_to_series(df, arg)
                result.index = df.columns
                return result
            elif not isinstance(arg[0], slice):
                # Return the first column, indexed by the column labels
                # and named after the row key.
                result_series = list(df._cols.values())[0]
                result_series.index = df.columns
                result_series.name = arg[0]
                return result_series
            else:
                return list(df._cols.values())[0]
        return self._downcast_to_series(df, arg)
    if df.shape[0] == 0 and df.shape[1] == 0:
        from cudf.dataframe.index import RangeIndex

        # NOTE(review): this branch reads arg[0].stop / .indices, so it
        # presumably is only reached with a slice row key -- confirm.
        slice_len = arg[0].stop or len(self._df)
        start, stop, step = arg[0].indices(slice_len)
        df._index = RangeIndex(start, stop)
    return df
def __getitem__(self, arg):
    """Select from ``self.idx`` through its Series ``.loc`` accessor.

    The selection is converted back to an index with ``as_index``.
    """
    from cudf.dataframe.index import as_index

    selected = self.idx.to_series().loc[arg]
    return as_index(selected)