def _loc_to_iloc(self, arg):
    """Translate a label-based ``.loc`` key into a positional key.

    Parameters
    ----------
    arg : list, ndarray, pandas Series, range, Index, DeviceNDArray,
          Series, scalar or slice
        The label key. Array-likes are first wrapped in a ``Series``
        (an empty one becomes an empty ``int32`` Series).

    Returns
    -------
    Series, int or slice
        * a boolean ``Series`` is returned unchanged (it is a mask);
        * any other ``Series`` is resolved through
          ``indices_from_labels``;
        * a scalar resolves to the first element of
          ``index.find_label_range(arg, None)``;
        * a slice resolves to a slice of positions with the same step.

    Raises
    ------
    NotImplementedError
        If ``arg`` is of none of the supported kinds.
    """
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index

    if isinstance(
        arg, (list, np.ndarray, pd.Series, range, Index, DeviceNDArray)
    ):
        if len(arg) == 0:
            arg = Series(np.array([], dtype="int32"))
        else:
            arg = Series(arg)

    if isinstance(arg, Series):
        # ``np.bool`` was only ever an alias of the builtin ``bool`` and
        # was removed in NumPy 1.24; use the builtin so this check does
        # not raise AttributeError on those releases.
        if arg.dtype in (bool, np.bool_):
            return arg
        else:
            return indices_from_labels(self._sr, arg)
    elif is_scalar(arg):
        found_index = self._sr.index.find_label_range(arg, None)[0]
        return found_index
    elif isinstance(arg, slice):
        start_index, stop_index = self._sr.index.find_label_range(
            arg.start, arg.stop
        )
        return slice(start_index, stop_index, arg.step)
    else:
        raise NotImplementedError(
            ".loc not implemented for label type {}".format(
                type(arg).__name__
            )
        )
def __getitem__(self, arg):
    """Label-based lookup on the wrapped series ``self._sr``.

    Parameters
    ----------
    arg : list, ndarray, pandas Series, range, Index, DeviceNDArray,
          Series, single value or slice
        The label key. Array-likes are first wrapped in a ``Series``
        (an empty one becomes an empty ``int32`` Series).

    Returns
    -------
    The selection from ``self._sr``:
        * boolean ``Series`` key: ``self._sr.iloc[arg]``;
        * other ``Series`` key: the concatenation of
          ``self._sr.loc[s:s]`` for every label ``s`` in the key;
        * single value: the element at the first position reported by
          ``index.find_label_range(arg, None)``;
        * slice: the positional slice between the resolved bounds.

    Raises
    ------
    NotImplementedError
        If ``arg`` is of none of the supported kinds.
    """
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index

    if isinstance(
            arg, (list, np.ndarray, pd.Series, range, Index, DeviceNDArray)):
        if len(arg) == 0:
            arg = Series(np.array([], dtype='int32'))
        else:
            arg = Series(arg)

    if isinstance(arg, Series):
        # ``np.bool`` was only ever an alias of the builtin ``bool`` and
        # was removed in NumPy 1.24; use the builtin so this check does
        # not raise AttributeError on those releases.
        if arg.dtype in (bool, np.bool_):
            return self._sr.iloc[arg]
        # To do this efficiently we need a solution to
        # https://github.com/rapidsai/cudf/issues/1087
        out = Series([], dtype=self._sr.dtype,
                     index=self._sr.index.__class__([]))
        for s in arg:
            out = out.append(self._sr.loc[s:s], ignore_index=False)
        return out
    elif is_single_value(arg):
        found_index = self._sr.index.find_label_range(arg, None)[0]
        return self._sr.iloc[found_index]
    elif isinstance(arg, slice):
        start_index, stop_index = self._sr.index.find_label_range(
            arg.start, arg.stop)
        return self._sr.iloc[start_index:stop_index:arg.step]
    else:
        raise NotImplementedError(
            ".loc not implemented for label type {}".format(
                type(arg).__name__))
def _compute_validity_mask(self, index, row_tuple, max_length):
    """Find the row positions of ``index`` that match ``row_tuple``.

    A lookup frame is built from every entry of ``row_tuple`` that is
    not ``slice(None)`` and merged against ``index._source_data``
    extended with a running ``"idx"`` column; the merged ``"idx"``
    column is the answer.

    Raises
    ------
    KeyError
        When the merge is empty and some non-wildcard entry is absent
        from the matching level of ``index``.
    """
    from cudf import DataFrame
    from cudf import Series
    from cudf import concat
    from cudf.utils.cudautils import arange

    wildcard = slice(None)

    lookup = DataFrame()
    for position, key in enumerate(row_tuple):
        if key == wildcard:
            continue
        lookup[index._source_data.columns[position]] = Series(key)

    row_numbers = DataFrame(
        {"idx": Series(arange(len(index._source_data)))}
    )
    data_table = concat([index._source_data, row_numbers], axis=1)
    matches = lookup.merge(data_table)["idx"]

    # Levels are only consulted when nothing matched, because that is
    # the one case in which a KeyError may have to be raised.
    if len(matches) == 0:
        for position, key in enumerate(row_tuple):
            if key == wildcard:
                continue
            if key not in index.levels[position]:
                raise KeyError(key)
    return matches
def normalize_chunks(self, size, chunks):
    """Return chunk leading offsets as a device array.

    An integer ``chunks`` is taken as the chunk size and expanded to
    ``arange(0, size, chunks)``; anything else is taken as an array of
    chunk leading offsets and moved to the GPU through a ``Series``.
    """
    if not isinstance(chunks, six.integer_types):
        # *chunks* is already an array of chunk leading offsets
        offsets = Series(chunks)
        return offsets.to_gpu_array()
    # *chunks* is the chunk size
    return cudautils.arange(0, size, chunks)
def codes(self):
    """Return the parent column's data as a ``Series``.

    When the parent column has a null mask, the mask and null count
    are carried over via ``Series.from_masked_array``.
    """
    from cudf.dataframe.series import Series

    parent = self._parent
    data = parent.data
    if not parent.has_null_mask:
        return Series(data)

    mask = parent.mask
    return Series.from_masked_array(
        data=data.mem, mask=mask.mem, null_count=parent.null_count
    )
def _get_column_major(self, df, row_tuple):
    """Select columns of ``df`` by ``row_tuple`` and rebuild its columns.

    The matching column positions come from
    ``_get_valid_indices_by_tuple``; the new column index is then either
    an emptied copy of the original, this index with its leading levels
    popped, or this index taken at the matching positions. A result
    whose column index has a single level is flattened with
    ``as_index``.
    """
    from cudf import Series
    from cudf import DataFrame

    valid_indices = self._get_valid_indices_by_tuple(
        df.columns, row_tuple, len(df._cols)
    )
    result = df._take_columns(valid_indices)

    if isinstance(row_tuple, (numbers.Number, slice)):
        row_tuple = [row_tuple]

    if len(result) == 0 and len(result.columns) == 0:
        # Nothing was selected: keep the column names, drop the codes.
        emptied_columns = df.columns.copy(deep=False)
        empty_codes = DataFrame()
        for level_name in df.columns.names:
            empty_codes[level_name] = Series([])
        emptied_columns._codes = empty_codes
        emptied_columns._source_data = empty_codes
        result.columns = emptied_columns
    elif len(row_tuple) < len(self.levels) and not (
        slice(None) in row_tuple or isinstance(row_tuple[0], slice)
    ):
        popped = self._popn(len(row_tuple))
        result.columns = popped.take(valid_indices)
    else:
        result.columns = self.take(valid_indices)

    if len(result.columns.levels) == 1:
        first_code_column = result.columns.codes.columns[0]
        labels = [
            result.columns.levels[0][code]
            for code in result.columns.codes[first_code_column]
        ]
        name = result.columns.names[0]
        result.columns = as_index(labels, name=name)
    return result
def concat(objs, ignore_index=False):
    """Concatenate DataFrames, Series, or Indices row-wise.

    Parameters
    ----------
    objs : list of DataFrame, Series, or Index
        All entries must share one type.
    ignore_index : bool
        Set True to ignore the index of the *objs* and provide a
        default range index instead. Only forwarded for DataFrames.

    Returns
    -------
    A new object of like type with rows from each object in ``objs``.
    A single-element ``objs`` returns that element itself.

    Raises
    ------
    ValueError
        If ``objs`` is empty, mixes types, or holds an unknown type.
    """
    if not objs:
        raise ValueError("Need at least one object to concatenate")

    # A lone object is handed back untouched.
    if len(objs) == 1:
        return objs[0]

    kinds = {type(obj) for obj in objs}
    if len(kinds) > 1:
        kind_names = [kind.__name__ for kind in kinds]
        raise ValueError("`concat` expects all objects to be of the same "
                         "type. Got mix of %r." % kind_names)

    kind = next(iter(kinds))
    if kind is DataFrame:
        return DataFrame._concat(objs, ignore_index=ignore_index)
    if kind is Series:
        return Series._concat(objs)
    if issubclass(kind, Index):
        return Index._concat(objs)
    raise ValueError("Unknown type %r" % kind)
def _get_row_major(self, df, row_tuple):
    """Select rows of ``df`` by ``row_tuple`` and rebuild the row index.

    After taking the matching rows, the index is reduced to the levels
    not consumed by ``row_tuple``: a single remaining level becomes a
    named ``StringIndex``; several remaining levels are obtained by
    popping the leading ``len(row_tuple)`` levels.
    """
    from cudf import Series
    from cudf import DataFrame

    valid_indices = self._compute_validity_mask(df, row_tuple)
    result = df.take(Series(valid_indices))

    # Collect the last n-k code columns, where n is the number of source
    # levels and k is the length of the indexing tuple.
    depth = len(row_tuple)
    remaining = DataFrame()
    for level_pos in range(depth, len(df.index.levels)):
        remaining.add_column(
            df.index.names[level_pos],
            df.index.codes[df.index.codes.columns[level_pos]],
        )

    n_remaining = len(remaining.columns)
    if n_remaining == 1:
        # One level left: map its codes to level values and build a
        # flat, named index from them.
        last = len(result.index.codes.columns) - 1
        last_codes = result.index.codes[result.index.codes.columns[last]]
        labels = [result.index.levels[last][val] for val in last_codes]
        # TODO: Warning! The final index column could be arbitrarily
        # ordered integers, not Strings, so we need to check for that
        # dtype and produce a GenericIndex instead of a StringIndex
        flat_index = StringIndex(labels)
        flat_index.name = result.index.names[len(result.index.names) - 1]
        result.index = flat_index
    elif n_remaining > 0:
        # Otherwise pop the leftmost levels, names, and codes from the
        # source index until it has the correct number of columns (n-k).
        result.reset_index(drop=True)
        result.index = result.index._popn(depth)
    return result
def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):
    """Resolve ``row_tuple`` to positions along ``index``.

    * A number is returned as-is.
    * A slice with a numeric bound, or the full slice, becomes
      ``arange(start, stop, step)`` with an open stop replaced by
      ``max_length``.
    * Any other slice spans from the lowest position matching its start
      to the highest position matching its stop, inclusive.
    * Everything else is delegated to ``_compute_validity_mask``.
    """
    from cudf.utils.cudautils import arange
    from cudf import Series

    if isinstance(row_tuple, numbers.Number):
        return row_tuple
    if not isinstance(row_tuple, slice):
        return self._compute_validity_mask(index, row_tuple, max_length)

    lower, upper = row_tuple.start, row_tuple.stop
    if (
        isinstance(lower, numbers.Number)
        or isinstance(upper, numbers.Number)
        or row_tuple == slice(None)
    ):
        length = upper or max_length
        begin, end, stride = row_tuple.indices(length)
        return arange(begin, end, stride)

    # Label-bounded slice: cover lowest start match to highest stop match.
    start_values = self._compute_validity_mask(index, lower, max_length)
    stop_values = self._compute_validity_mask(index, upper, max_length)
    return Series(arange(start_values.min(), stop_values.max() + 1))
def _apply_op(self, fn, other=None):
    """Run the Series method named ``fn`` on this index.

    The index is wrapped in a ``Series``, the method is called with
    ``other`` when one is given (without arguments otherwise), and the
    outcome is converted back with ``as_index``.
    """
    from cudf.dataframe.series import Series

    bound = getattr(Series(self), fn)
    outcome = bound() if other is None else bound(other)
    return as_index(outcome)
def _sortjoin(self, other, how='left', return_indexers=False):
    """Join with another column.

    When the column is a index, set *return_indexers* to obtain
    the indices for shuffling the remaining columns.

    Parameters
    ----------
    other : column
        Must pass ``self.is_type_equivalent(other)``.
    how : str
        Join kind, forwarded to ``_gdf.apply_join``.
    return_indexers : bool
        When true, also return a pair of indexer Series (left, right).

    Returns
    -------
    joined_index, or ``(joined_index, (left_indexer, right_indexer))``
    when *return_indexers* is true.

    Raises
    ------
    TypeError
        If *other* is not type-equivalent to this column.
    """
    from cudf.dataframe.series import Series

    if not self.is_type_equivalent(other):
        raise TypeError('*other* is not compatible')

    # Both sides are sorted first; the argsort results are kept so the
    # indexers can be expressed against the original row order.
    lkey, largsort = self.sort_by_values(True)
    rkey, rargsort = other.sort_by_values(True)
    # Everything below uses lidx/ridx and therefore stays inside the
    # context manager that provides them.
    with _gdf.apply_join(
            [lkey], [rkey], how=how, method='sort') as (lidx, ridx):
        if lidx.size > 0:
            raw_index = cudautils.gather_joined_index(
                    lkey.to_gpu_array(),
                    rkey.to_gpu_array(),
                    lidx,
                    ridx,
                    )
            buf_index = Buffer(raw_index)
        else:
            # Empty join result: build an empty buffer of this dtype.
            buf_index = Buffer.null(dtype=self.dtype)

        joined_index = lkey.replace(data=buf_index)

        if return_indexers:
            def gather(idxrange, idx):
                # Positions equal to -1 are masked out and then filled
                # back with -1 (presumably marking "no match" -- confirm
                # against apply_join).
                mask = (Series(idx) != -1).as_mask()
                return idxrange.take(idx).set_mask(mask).fillna(-1)

            if len(joined_index) > 0:
                indexers = (
                        gather(Series(largsort), lidx),
                        gather(Series(rargsort), ridx),
                        )
            else:
                indexers = (
                        Series(Buffer.null(dtype=np.intp)),
                        Series(Buffer.null(dtype=np.intp))
                        )
            return joined_index, indexers
        else:
            return joined_index
def wrapper(*args, **kwargs):
    """Call the wrapped attribute and box ``nvstrings`` results.

    All arguments are forwarded to the attribute named ``attr`` on
    ``self._parent._data`` (both names come from the enclosing scope).
    An ``nvstrings.nvstrings`` result is converted to a ``Series``
    carrying ``self._index`` and the parent's name; any other result is
    returned untouched.
    """
    target = getattr(self._parent._data, attr)
    outcome = target(*args, **kwargs)
    if not isinstance(outcome, nvstrings.nvstrings):
        return outcome
    return Series(
        columnops.as_column(outcome),
        index=self._index,
        name=self._parent.name,
    )
def concat(objs, axis=0, ignore_index=False, sort=None):
    """Concatenate DataFrames, Series, or Indices.

    Parameters
    ----------
    objs : list of DataFrame, Series, or Index
    axis : concatenation axis, 0 - index, 1 - columns
        With ``axis=1`` only Series and DataFrames are accepted and they
        may be mixed; the result is always a DataFrame.
    ignore_index : bool
        Set True to ignore the index of the *objs* and provide a
        default range index instead. Only forwarded for DataFrames.
    sort : None or False
        Any other value is not supported.

    Returns
    -------
    A new object of like type with rows from each object in ``objs``
    (or, for ``axis=1``, a DataFrame holding every input column).
    A single-element ``objs`` returns that element itself.

    Raises
    ------
    NotImplementedError
        If ``sort`` is neither ``None`` nor ``False``.
    ValueError
        If ``objs`` is empty, mixes types (``axis != 1``), or holds an
        unknown type.
    """
    if sort not in (None, False):
        raise NotImplementedError("sort parameter is not yet supported")

    if not objs:
        raise ValueError("Need at least one object to concatenate")

    # no-op for single object
    if len(objs) == 1:
        return objs[0]

    typs = set(type(o) for o in objs)
    allowed_typs = {Series, DataFrame}
    # when axis is 1 (column) we can concat with Series and Dataframes
    if axis == 1:
        assert typs.issubset(allowed_typs)
        df = DataFrame()
        # pandas labels unnamed Series 0, 1, 2, ... in order of
        # appearance, independent of their position in ``objs``. The
        # previous ``idx - 1`` produced -1 for a leading unnamed Series
        # and skipped numbers after DataFrames.
        unnamed_count = 0
        for o in objs:
            if isinstance(o, Series):
                name = o.name
                if name is None:
                    name = unnamed_count
                    unnamed_count += 1
                df[name] = o
            else:
                for col in o.columns:
                    df[col] = o[col]
        return df

    if len(typs) > 1:
        raise ValueError("`concat` expects all objects to be of the same "
                         "type. Got mix of %r." % [t.__name__ for t in typs])
    typ = list(typs)[0]

    if typ is DataFrame:
        return DataFrame._concat(objs, axis=axis, ignore_index=ignore_index)
    elif typ is Series:
        return Series._concat(objs, axis=axis)
    elif issubclass(typ, Index):
        return Index._concat(objs)
    else:
        raise ValueError("Unknown type %r" % typ)
def _get_row_major(self, df, row_tuple):
    """Select rows of ``df`` by ``row_tuple``.

    Matching positions are resolved against ``df.index``, the rows are
    taken, and the result is handed to ``_index_and_downcast`` together
    with its own index and the original key.
    """
    from cudf import Series

    positions = self._get_valid_indices_by_tuple(
        df.index, row_tuple, len(df.index)
    )
    selected = df.take(Series(positions))
    return self._index_and_downcast(selected, selected.index, row_tuple)
def _group_dataframe(self, df, levels):
    """Group dataframe.

    The output dataframe has the same number of rows as the input
    dataframe.  The rows are shuffled so that the groups are moved
    together in ascending order based on the multi-level index.

    Parameters
    ----------
    df : DataFrame
    levels : list[str]
        Column names for the multi-level index.

    Returns
    -------
    (df, segs) : namedtuple
        * df : DataFrame
            The grouped dataframe.
        * segs : Series.
            Group starting index. For an empty input this is an empty
            ``Buffer`` instead.
    """
    if len(df) == 0:
        # Groupby on empty dataframe
        return _dfsegs_pack(df=df, segs=Buffer(np.asarray([])))
    # Prepare dataframe
    orig_df = df.copy()
    df = df.loc[:, levels].reset_index(drop=True)
    # A single selected column may come back as a Series; normalise it.
    df = df.to_frame() if isinstance(df, Series) else df
    # Carry the original row position along so the final shuffle order
    # can be recovered after sorting.
    rowid_column = '__cudf.groupby.rowid'
    df[rowid_column] = df.index.as_column()

    col_order = list(levels)

    # Perform grouping
    df, segs, markers = self._group_first_level(col_order[0],
                                                rowid_column, df)
    rowidcol = df[rowid_column]
    # After the first-level grouping the key lives in the frame's index.
    sorted_keys = [Series(df.index.as_column())]
    del df

    more_keys, reordering_indices, segs = self._group_inner_levels(
        col_order[1:], rowidcol, segs, markers=markers)
    sorted_keys.extend(more_keys)

    valcols = [k for k in orig_df.columns if k not in levels]
    # Prepare output
    # All key columns are already sorted
    out_df = DataFrame()
    for k, sr in zip(levels, sorted_keys):
        out_df[k] = sr
    # Shuffle the value columns
    self._group_shuffle(orig_df.loc[:, valcols],
                        reordering_indices, out_df)
    return _dfsegs_pack(df=out_df, segs=segs)
def __init__(self, levels=None, codes=None, labels=None, names=None,
             **kwargs):
    """Build the index from levels/codes, or lazily from source data.

    Parameters
    ----------
    levels : sequence of sequences
        Values of each level. Must be non-empty unless ``source_data``
        is given.
    codes : DataFrame or sequence of sequences
        Per-level codes. A non-DataFrame is converted column by column.
    labels : deprecated alias of ``codes``
        Emits a ``FutureWarning``; used only when ``codes`` is falsy.
    names : sequence or None
        Level names. They are also used as code column names, except
        that positional names ``0..n-1`` are used when ``names`` is
        ``None`` or contains more than one ``None``.
    **kwargs
        ``source_data``: when present, it is stored, ``names`` is taken
        from its columns, and the constructor returns early without
        validating ``levels``/``codes``.

    Raises
    ------
    ValueError
        If ``levels`` is empty.
    TypeError
        If ``codes`` is neither a DataFrame nor a sequence of sequences.
    """
    self.names = names
    column_names = []
    if labels:
        warnings.warn("the 'labels' keyword is deprecated, use 'codes' "
                      "instead", FutureWarning)
    if labels and not codes:
        codes = labels

    # early termination enables lazy evaluation of codes
    if 'source_data' in kwargs:
        self._source_data = kwargs['source_data']
        self._codes = codes
        self._levels = levels
        self.names = self._source_data.columns
        return

    # name setup
    if isinstance(names, (Sequence,
                          pd.core.indexes.frozen.FrozenNDArray,
                          pd.core.indexes.frozen.FrozenList)):
        # More than one None would give duplicate column names, so fall
        # back to positional names in that case.
        if sum(x is None for x in names) > 1:
            column_names = list(range(len(codes)))
        else:
            column_names = names
    elif names is None:
        column_names = list(range(len(codes)))
    else:
        column_names = names

    if len(levels) == 0:
        raise ValueError('Must pass non-zero number of levels/codes')

    import cudf
    if not isinstance(codes, cudf.dataframe.dataframe.DataFrame) and\
            not isinstance(codes[0], (Sequence,
                           pd.core.indexes.frozen.FrozenNDArray)):
        raise TypeError('Codes is not a Sequence of sequences')
    if not isinstance(codes, cudf.dataframe.dataframe.DataFrame):
        self._codes = cudf.dataframe.dataframe.DataFrame()
        for idx, code in enumerate(codes):
            code = np.array(code)
            self._codes.add_column(column_names[idx],
                                   columnops.as_column(code))
    else:
        self._codes = codes

    # converting levels to numpy array will produce a Float64Index
    # (on empty levels)for levels mimicking the behavior of Pandas
    self._levels = np.array([Series(level).to_array() for level in levels])
    self._validate_levels_and_codes(self._levels, self._codes)
    self.name = None
    self.names = names
def lower(self):
    """
    Convert strings in the Series/Index to lowercase.

    Returns
    -------
    Series/Index of str dtype
        A copy of the object with all strings converted to lowercase.
        The result is built from the parent data's own ``lower()`` and
        carries this accessor's index.
    """
    from cudf.dataframe import Series

    lowered = self._parent.data.lower()
    return Series(lowered, index=self._index)
def searchsorted(self, value, side="left"):
    """Find indices where elements should be inserted to maintain order

    Parameters
    ----------
    value : Column
        Column of values to search for
    side : str {'left', 'right'} optional
        If 'left', the index of the first suitable location found is
        given. If 'right', return the last such index

    Returns
    -------
    An index of insertion points with the same shape as value. The
    search itself is delegated to ``Series.searchsorted`` and the result
    is converted with ``as_index``.
    """
    from cudf.dataframe.series import Series

    as_series = Series(self, name=self.name)
    return as_index(as_series.searchsorted(value, side))
def _hashjoin(self, other, how='left', return_indexers=False):
    """Hash-join this column with *other*.

    Parameters
    ----------
    other : column
        Must pass ``self.is_type_equivalent(other)``.
    how : str
        Join kind, forwarded to ``_gdf.apply_join``.
    return_indexers : bool
        When true, also return a pair of indexer Series (left, right).

    Returns
    -------
    joined_index, or ``(joined_index, (left_indexer, right_indexer))``
    when *return_indexers* is true.

    Raises
    ------
    TypeError
        If *other* is not type-equivalent to this column.
    """
    from cudf.dataframe.series import Series

    if not self.is_type_equivalent(other):
        raise TypeError('*other* is not compatible')

    # Everything below uses lidx/ridx and therefore stays inside the
    # context manager that provides them.
    with _gdf.apply_join(
            [self], [other], how=how, method='hash') as (lidx, ridx):
        if lidx.size > 0:
            raw_index = cudautils.gather_joined_index(
                    self.to_gpu_array(),
                    other.to_gpu_array(),
                    lidx,
                    ridx,
                    )
            buf_index = Buffer(raw_index)
        else:
            # Empty join result: build an empty buffer of this dtype.
            buf_index = Buffer.null(dtype=self.dtype)

        joined_index = self.replace(data=buf_index)

        if return_indexers:
            def gather(idxrange, idx):
                # Positions equal to -1 are masked out and then filled
                # back with -1 (presumably marking "no match" -- confirm
                # against apply_join).
                mask = (Series(idx) != -1).as_mask()
                return idxrange.take(idx).set_mask(mask).fillna(-1)

            if len(joined_index) > 0:
                # No sorting happened here, so plain row numbers serve
                # as the source positions on both sides.
                indexers = (
                        gather(Series(range(0, len(self))), lidx),
                        gather(Series(range(0, len(other))), ridx),
                        )
            else:
                indexers = (
                        Series(Buffer.null(dtype=np.intp)),
                        Series(Buffer.null(dtype=np.intp))
                        )
            return joined_index, indexers
        else:
            return joined_index
def _group_inner_levels(self, columns, rowidcol, segs, markers):
    """Group the second and onwards level.

    Parameters
    ----------
    columns : sequence[str]
        Group keys.  The order is important.
    rowidcol : Series
        The original row ids in their current (first-level grouped)
        order.  It's internally used to determine the shuffling order.
    segs : Series
        First level group begin offsets.
    markers :
        Passed to and updated by ``cudautils.find_segments`` on every
        level.  NOTE(review): exact type not visible here -- confirm.

    Returns
    -------
    (sorted_keys, reordering_indices, segments)
        - sorted_keys : list[Series]
            List of sorted key columns.
            Column order is same as arg *columns*.
        - reordering_indices : device array
            The indices to gather on to shuffle the dataframe
            into the grouped sequence.
        - segments : Series
            Group begin offsets.
    """
    dsegs = segs.astype(dtype=np.int32).data.mem
    sorted_keys = []
    # Segmented-sort plans are reused across key columns that share
    # length and dtypes.
    plan_cache = {}
    for col in columns:
        # Shuffle the key column according to the previous groups
        srkeys = self._df[col].take(rowidcol.to_gpu_array(),
                                    ignore_index=True)
        # Segmented sort on the key
        shuf = Column(Buffer(cudautils.arange(len(srkeys))))

        cache_key = (len(srkeys), srkeys.dtype, shuf.dtype)
        plan = plan_cache.get(cache_key)
        plan = apply_segsort(srkeys._column, shuf, dsegs, plan=plan)
        plan_cache[cache_key] = plan

        sorted_keys.append(srkeys)   # keep sorted key cols
        # Determine segments
        dsegs, markers = cudautils.find_segments(srkeys.to_gpu_array(),
                                                 dsegs, markers=markers)
        # Shuffle
        rowidcol = rowidcol.take(shuf.to_gpu_array(), ignore_index=True)

    reordering_indices = rowidcol.to_gpu_array()
    return sorted_keys, reordering_indices, Series(dsegs)
def _to_frame(self):
    """Return a frame in which every code is replaced by its level value.

    Starts from a shallow copy of ``self.codes`` and, column by column,
    merges the codes against an ``idx -> level`` table built from
    ``self.levels``.
    """
    from cudf import DataFrame

    frame = self.codes.copy(deep=False)
    for position, column in enumerate(frame.columns):
        level_values = self.levels[position]
        code_values = frame[column]
        # merge acts as the replace function here
        positions = Series(
            cudautils.arange(len(level_values), dtype=code_values.dtype)
        )
        lookup = DataFrame({'idx': positions, 'level': level_values})
        keys = DataFrame({'idx': code_values})
        frame[column] = keys.merge(lookup).level
    return frame
def apply(self, method):
    """Run the EWM kernel selected by ``method`` over ``self.gpu_in``.

    The output is allocated like the input, the kernel from
    ``get_ewm_kernel`` is launched with this object's block/thread
    counts and ``shared_buffer_size * 8`` as the fourth launch
    parameter, and the output is returned as a ``Series``.
    """
    gpu_out = numba.cuda.device_array_like(self.gpu_in)
    kernel = get_ewm_kernel(method)

    # (blocks, threads, stream, fourth launch parameter); the factor 8 is
    # presumably bytes per element -- TODO confirm.
    launch_config = (
        (self.number_of_blocks,),
        (self.number_of_threads,),
        0,
        self.shared_buffer_size * 8,
    )
    configured = kernel[launch_config]
    configured(
        self.gpu_in,
        gpu_out,
        self.window,
        self.span,
        self.array_len,
        self.thread_tile,
        self.min_periods,
    )
    return Series(gpu_out)
def _compute_levels_and_codes(self):
    """Derive ``_levels``, ``_codes`` and ``names`` from ``_source_data``.

    For each source column the level is its unique values and the codes
    are the column with each value replaced by its position in that
    level. Empty columns get empty levels and codes.
    """
    from cudf import DataFrame

    levels = []
    codes = DataFrame()
    names = []

    # Note: This is an O(N^2) solution using gpu masking
    # to compute new codes for the MultiIndex. There may be
    # a faster solution that could be executed on gpu at the same
    # time the groupby is calculated.
    for by in self._source_data.columns:
        column = self._source_data[by]
        if len(column) > 0:
            level = column.unique()
            replaced = column.replace(level, Series(range(len(level))))
        else:
            level = np.array([])
            replaced = np.array([])
        levels.append(level)
        codes[by] = Series(replaced, dtype="int32")
        names.append(by)

    self._levels = levels
    self._codes = codes
    self.names = names
def cat(self, others=None, sep=None, na_rep=None):
    """
    Concatenate strings in the Series/Index with given separator.

    If *others* is specified, this function concatenates the
    Series/Index and elements of others element-wise. If others is not
    passed, then all values in the Series/Index are concatenated into a
    single string with a given sep.

    Parameters
    ----------
    others : Series or List of str
        Strings to be appended.
        The number of strings must match size() of this instance.
        This must be either a Series of string dtype or a Python list
        of strings.
    sep : str
        If specified, this separator will be appended to each string
        before appending the others.
    na_rep : str
        This character will take the place of any null strings
        (not empty strings) in either list.

        - If `na_rep` is None, and `others` is None, missing values in
          the Series/Index are omitted from the result.
        - If `na_rep` is None, and `others` is not None, a row
          containing a missing value in any of the columns (before
          concatenation) will have a missing value in the result.

    Returns
    -------
    concat : str or Series/Index of str dtype
        If `others` is None, `str` is returned, otherwise a
        `Series/Index` (same type as caller) of str dtype is returned.
    """
    from cudf.dataframe import Series, Index

    if isinstance(others, (Series, Index)):
        assert others.dtype == np.dtype('object')
        # hand the underlying string data to the backend
        others = others.data

    joined = self._parent.data.cat(others=others, sep=sep, na_rep=na_rep)
    result = Series(joined, index=self._index)
    if len(result) == 1 and others is None:
        # a full reduction yields a single element; unwrap it
        return result[0]
    return result
def __getattr__(self, attr, *args, **kwargs):
    """Forward unknown attributes to the parent's underlying data.

    Parameters
    ----------
    attr : str
        Name looked up on ``self._parent._data``.

    Returns
    -------
    The attribute itself when it is not callable. For a callable, a
    wrapper that forwards its arguments and converts an
    ``nvstrings.nvstrings`` result into a ``Series`` carrying
    ``self._index``; any other result is returned unchanged.

    Raises
    ------
    AttributeError
        If ``self._parent._data`` has no attribute ``attr``.
    """
    from cudf.dataframe.series import Series

    if not hasattr(self._parent._data, attr):
        raise AttributeError(attr)

    passed_attr = getattr(self._parent._data, attr)
    if not callable(passed_attr):
        return passed_attr

    def wrapper(*args, **kwargs):
        ret = getattr(self._parent._data, attr)(*args, **kwargs)
        # The type check has to look at the call's result. It used to
        # test the wrapper function itself, which is never an nvstrings
        # instance, so results were never boxed into a Series.
        if isinstance(ret, nvstrings.nvstrings):
            ret = Series(columnops.as_column(ret), index=self._index)
        return ret

    return wrapper
def apply_multiindex_or_single_index(self, result):
    """Move the group-by key columns of ``result`` into its index.

    Parameters
    ----------
    result : DataFrame
        Frame holding both the key columns (``self._by``) and the value
        columns.

    Returns
    -------
    DataFrame or Series
        ``result`` without its key columns, indexed by them: a plain
        index for a single key, a ``MultiIndex`` otherwise. When exactly
        one value column remains and this object has a ``_gotattr``
        attribute, a ``Series`` is returned instead of a frame (empty
        and multi-key paths only).
    """
    if len(result) == 0:
        # Empty result: build an empty frame/series with a matching
        # empty index.
        final_result = DataFrame()
        for col in result.columns:
            if col not in self._by:
                final_result[col] = result[col]
        if len(self._by) == 1 or len(final_result.columns) == 0:
            dtype = 'float64' if len(self._by) == 1 else 'object'
            name = self._by[0] if len(self._by) == 1 else None
            from cudf.dataframe.index import GenericIndex
            index = GenericIndex(Series([], dtype=dtype))
            index.name = name
            final_result.index = index
        else:
            mi = MultiIndex(source_data=result[self._by])
            mi.names = self._by
            final_result.index = mi
        if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
            final_series = Series([], name=final_result.columns[0])
            final_series.index = final_result.index
            return final_series
        return final_result
    if len(self._by) == 1:
        from cudf.dataframe import index
        idx = index.as_index(result[self._by[0]])
        idx.name = self._by[0]
        result = result.drop(idx.name)
        # A key that carries the placeholder level-0 name gets the
        # original index name back.
        if idx.name == self._LEVEL_0_INDEX_NAME:
            idx.name = self._original_index_name
        result = result.set_index(idx)
        return result
    else:
        multi_index = MultiIndex(source_data=result[self._by])
        final_result = DataFrame()
        for col in result.columns:
            if col not in self._by:
                final_result[col] = result[col]
        if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
            final_series = Series(final_result[final_result.columns[0]])
            final_series.name = final_result.columns[0]
            final_series.index = multi_index
            return final_series
        return final_result.set_index(multi_index)
def from_dlpack(pycapsule_obj):
    """Converts from a DLPack tensor to a cuDF object.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function takes a PyCapsule object which contains a pointer to
    a DLPack tensor as input, and returns a cuDF object. This function
    deep copies the data in the DLPack tensor into a cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        Input DLPack tensor pointer which is encapsulated in a PyCapsule
        object.

    Returns
    -------
    A cuDF Series when the conversion yields a single column, otherwise
    a DataFrame whose columns are labelled by position.

    Raises
    ------
    ValueError
        If the underlying conversion reports an empty dataset.
    """
    try:
        res, valids = cpp_dlpack.from_dlpack(pycapsule_obj)
    except GDFError as err:
        if str(err) != "b'GDF_DATASET_EMPTY'":
            raise err
        raise ValueError(
            "Cannot create a cuDF Object from a DLPack tensor of 0 size")

    cols = []
    for position, valid in enumerate(valids):
        # a falsy validity entry means the column carries no mask
        mask = Buffer(valid) if valid else None
        payload = res[position]
        cols.append(
            columnops.build_column(Buffer(payload), dtype=payload.dtype,
                                   mask=mask))

    if len(cols) == 1:
        return Series(cols[0])

    frame = DataFrame()
    for position, col in enumerate(cols):
        frame[position] = col
    return frame
def extract(self, pat, flags=0, expand=True):
    """
    Extract capture groups in the regex `pat` as columns in a DataFrame.

    For each subject string in the Series, extract groups from the first
    match of regular expression `pat`.

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    flags : int, default 0
        Not yet supported; anything but the default raises.
    expand : bool, default True
        If True, return DataFrame with one column per capture group.
        If False, return a Series/Index if there is one capture group
        or DataFrame if there are multiple capture groups.

    Returns
    -------
    DataFrame or Series/Index
        A DataFrame with one row for each subject string, and one column
        for each group. If `expand=False` and `pat` has only one capture
        group, then return a Series/Index.

    Raises
    ------
    NotImplementedError
        If `flags` is not 0.
    """
    if flags != 0:
        raise NotImplementedError("`flags` parameter is not yet supported")

    from cudf.dataframe import DataFrame, Series

    groups = self._parent.data.extract(pat)
    if len(groups) == 1 and expand is False:
        return Series(groups[0], index=self._index)

    frame = DataFrame(index=self._index)
    for position, group in enumerate(groups):
        frame[position] = group
    return frame
def len(self):
    """
    Computes the length of each element in the Series/Index.

    Returns
    -------
    Series or Index of int: A Series or Index of integer values
        indicating the length of each element in the Series or Index.
        The parent's mask is reused when the parent contains nulls.
    """
    from cudf.dataframe.series import Series

    parent = self._parent
    lengths = rmm.device_array(len(parent), dtype='int32')
    # the backend writes the lengths straight into the device array
    parent.data.len(get_ctype_ptr(lengths))

    mask = parent.mask if parent.null_count > 0 else None
    column = columnops.build_column(Buffer(lengths), np.dtype('int32'),
                                    mask=mask)
    return Series(column, index=self._index)
def _group_first_level(self, col, rowid_column, df):
    """Group first level *col* of *df*

    Parameters
    ----------
    col : str
        Name of the first group key column.
    rowid_column : str
        Name of the row-id column kept alongside the key.
    df : DataFrame
        The dataframe being grouped.

    Returns
    -------
    (df, segs, markers)
        - df : DataFrame
            Only *col* and *rowid_column*, indexed and sorted by *col*
        - segs : Series
            Group begin offsets
        - markers
            Second value returned by the index's ``_find_segments``
    """
    narrowed = df.loc[:, [col, rowid_column]]
    ordered = narrowed.set_index(col).sort_index()
    offsets, markers = ordered.index._find_segments()
    return ordered, Series(offsets), markers