def binop(lhs, rhs, op, out_dtype):
    """Apply the binary operator *op* to *lhs* and *rhs*.

    Parameters
    ----------
    lhs, rhs : column
        Operands. The output is allocated with
        ``columnops.column_empty_like(lhs, ...)``.
    op : object
        Operator identifier forwarded unchanged to
        ``cpp_binops.apply_op``.
    out_dtype : dtype
        dtype requested for the output column.

    Returns
    -------
    column
        The output column with its ``null_count`` replaced by the value
        returned from ``cpp_binops.apply_op``.
    """
    nvtx_range_push("CUDF_BINARY_OP", "orange")
    try:
        # The output carries a mask as soon as either operand has one.
        masked = lhs.has_null_mask or rhs.has_null_mask
        out = columnops.column_empty_like(
            lhs, dtype=out_dtype, masked=masked
        )
        # apply_op fills `out` and reports the resulting null count.
        null_count = cpp_binops.apply_op(lhs, rhs, out, op)
        return out.replace(null_count=null_count)
    finally:
        # Pop even when allocation or the kernel call raises, so every
        # push is matched by exactly one pop.
        nvtx_range_pop()
def string_column_binop(lhs, rhs, op):
    """Apply the binary operator *op* to *lhs* and *rhs*, yielding bools.

    Parameters
    ----------
    lhs, rhs : column
        Operands. The output is allocated with
        ``columnops.column_empty_like(lhs, dtype='bool', ...)``.
    op : object
        Operator identifier forwarded unchanged to
        ``cpp_binops.apply_op``.

    Returns
    -------
    column
        The ``'bool'`` output column with its ``null_count`` replaced by
        the value returned from ``cpp_binops.apply_op``.
    """
    nvtx_range_push("CUDF_BINARY_OP", "orange")
    try:
        # Allocate output; it is masked if either operand has a null mask.
        masked = lhs.has_null_mask or rhs.has_null_mask
        out = columnops.column_empty_like(lhs, dtype='bool', masked=masked)
        # Call and fix null_count
        null_count = cpp_binops.apply_op(lhs=lhs, rhs=rhs, out=out, op=op)
        return out.replace(null_count=null_count)
    finally:
        # Pop even when allocation or the kernel call raises, so every
        # push is matched by exactly one pop.
        nvtx_range_pop()
def numeric_column_binop(lhs, rhs, op, out_dtype):
    """Apply the binary operator *op* to two columns.

    Parameters
    ----------
    lhs, rhs : column
        Operands. The output is allocated with
        ``columnops.column_empty_like(lhs, ...)``.
    op : object
        Operator identifier forwarded unchanged to
        ``cpp_binops.apply_op``.
    out_dtype : dtype
        dtype requested for the output column and for the final view.

    Returns
    -------
    NumericalColumn
        The output viewed as ``NumericalColumn`` with dtype *out_dtype*,
        after its ``null_count`` has been replaced by the value returned
        from ``cpp_binops.apply_op``.
    """
    nvtx_range_push("CUDF_BINARY_OP", "orange")
    try:
        # Allocate output; it is masked if either operand has a null mask.
        masked = lhs.has_null_mask or rhs.has_null_mask
        out = columnops.column_empty_like(
            lhs, dtype=out_dtype, masked=masked
        )
        # Call and fix null_count
        null_count = cpp_binops.apply_op(lhs, rhs, out, op)
        out = out.replace(null_count=null_count)
        return out.view(NumericalColumn, dtype=out_dtype)
    finally:
        # Pop even when allocation or the kernel call raises, so every
        # push is matched by exactly one pop.
        nvtx_range_pop()
def take(self, indices, ignore_index=False):
    """Gather the values found at *indices* into a new column.

    Parameters
    ----------
    indices : array-like
        Positions to gather. Wrapped in a ``Buffer`` and converted with
        ``to_gpu_array()`` before use.
    ignore_index : bool, default False
        Not used by this implementation.

    Returns
    -------
    column
        For empty *indices*, ``column_empty_like(self, newsize=0)``.
        Otherwise the result of ``apply_gather``, with its ``name`` set
        to ``self.name``.
    """
    import cudf.bindings.copying as cpp_copying
    from cudf.dataframe.columnops import column_empty_like

    gather_map = Buffer(indices).to_gpu_array()

    if gather_map.size != 0:
        # apply_gather hands back a new column; carry the name across.
        gathered = cpp_copying.apply_gather(self, gather_map)
        gathered.name = self.name
        return gathered

    # Nothing to gather: return a zero-length column modelled on self.
    return column_empty_like(self, newsize=0)
def numeric_column_binop(lhs, rhs, op, out_dtype):
    """Apply the binary operator *op* to two columns.

    The operation goes through ``cpp_binops.apply_op`` when the operand
    dtypes differ or *op* has no entry in ``_binary_impl``; otherwise it
    goes through ``_gdf.apply_binaryop`` with ``_binary_impl[op]``.

    Parameters
    ----------
    lhs, rhs : column
        Operands. The output is allocated with
        ``columnops.column_empty_like(lhs, ...)``.
    op : object
        Operator identifier; also the lookup key into ``_binary_impl``.
    out_dtype : dtype
        dtype requested for the output column and for the final view.

    Returns
    -------
    NumericalColumn
        The output viewed as ``NumericalColumn`` with dtype *out_dtype*,
        after its ``null_count`` has been replaced by the value returned
        from whichever implementation ran.
    """
    nvtx_range_push("CUDF_BINARY_OP", "orange")
    try:
        # Allocate output; it is masked if either operand has a null mask.
        masked = lhs.has_null_mask or rhs.has_null_mask
        out = columnops.column_empty_like(
            lhs, dtype=out_dtype, masked=masked
        )
        # Call and fix null_count
        if lhs.dtype != rhs.dtype or op not in _binary_impl:
            # Use JIT implementation
            null_count = cpp_binops.apply_op(
                lhs=lhs, rhs=rhs, out=out, op=op
            )
        else:
            # Use compiled implementation
            null_count = _gdf.apply_binaryop(
                _binary_impl[op], lhs, rhs, out
            )
        out = out.replace(null_count=null_count)
        return out.view(NumericalColumn, dtype=out_dtype)
    finally:
        # Pop even when allocation or the kernel call raises, so every
        # push is matched by exactly one pop.
        nvtx_range_pop()
def _concat(cls, objs, dtype=None):
    """Concatenate a sequence of columns into a single column.

    Parameters
    ----------
    objs : sequence of columns
        Columns to concatenate. The sequence is copied first, so the
        caller's container is never modified.
    dtype : dtype, optional
        Only consulted when *objs* is empty, to choose the kind of empty
        column to return.

    Returns
    -------
    column
        An empty column for empty input, a ``StringColumn`` when every
        input is a ``StringColumn``, otherwise a column built from the
        first input via ``replace`` and filled by ``_column_concat``.

    Raises
    ------
    ValueError
        If, after all-null columns have been cast and categories
        unified, some column is still not type-equivalent to the first.
    """
    from cudf.dataframe.string import StringColumn
    from cudf.dataframe.categorical import CategoricalColumn

    # Private copy: the all-null cast below assigns into the list and
    # must not rewrite the caller's sequence. It also allows tuples.
    objs = list(objs)

    if not objs:
        if pd.api.types.is_categorical_dtype(dtype):
            return CategoricalColumn(
                data=Column(Buffer.null(np.dtype('int8'))),
                null_count=0,
                ordered=False,
            )
        elif dtype == np.dtype('object'):
            return StringColumn(data=nvstrings.to_device([]), null_count=0)
        else:
            dtype = np.dtype(dtype)
            return Column(Buffer.null(dtype))

    # Find the first column that is not entirely null. It is the type
    # reference. Fall back to the first column if all are all-null.
    head = next(
        (o for o in objs if len(o) != o.null_count),
        objs[0],
    )

    for i, obj in enumerate(objs):
        # An all-null column of a different type is replaced by an empty
        # masked column of the reference type with the same length.
        if not obj.is_type_equivalent(head) and len(obj) == obj.null_count:
            from cudf.dataframe.columnops import column_empty_like
            objs[i] = column_empty_like(
                head, dtype=head.dtype, masked=True, newsize=len(obj)
            )

    # Handle categories for categoricals: give every input the union of
    # all categories.
    if all(isinstance(o, CategoricalColumn) for o in objs):
        new_cats = tuple(
            {val for o in objs for val in o.cat().categories}
        )
        objs = [o.cat()._set_categories(new_cats) for o in objs]

    head = objs[0]
    for obj in objs:
        if not obj.is_type_equivalent(head):
            raise ValueError("All series must be of same type")

    # Handle strings separately
    if all(isinstance(o, StringColumn) for o in objs):
        objs = [o._data for o in objs]
        return StringColumn(data=nvstrings.from_strings(*objs))

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]
    nulls = sum(o.null_count for o in objs)
    newsize = sum(map(len, objs))
    mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
    data = Buffer.from_empty(mem, size=newsize)

    # Allocate output mask only if there's nulls in the input objects
    mask = None
    if nulls:
        mask = Buffer(utils.make_mask(newsize))

    col = head.replace(data=data, mask=mask, null_count=nulls)

    # Perform the actual concatenation
    if newsize > 0:
        col = _column_concat(objs, col)
    return col
def _concat(cls, objs, dtype=None):
    """Concatenate a sequence of columns into a single column.

    Parameters
    ----------
    objs : sequence of columns
        Columns to concatenate. The sequence is copied first, so the
        caller's container is never modified.
    dtype : dtype, optional
        Only consulted when *objs* is empty, to choose the kind of empty
        column to return.

    Returns
    -------
    column
        An empty column for empty input, a ``StringColumn`` when every
        input is a ``StringColumn``, otherwise a column built from the
        first input via ``replace`` and filled by ``_column_concat``.

    Raises
    ------
    ValueError
        If, after dtype unification, the all-null cast and category
        unification, some column is still not type-equivalent to the
        first.
    """
    from cudf.dataframe.series import Series
    from cudf.dataframe.string import StringColumn
    from cudf.dataframe.categorical import CategoricalColumn
    from cudf.dataframe.numerical import NumericalColumn

    # Private copy: the casts below replace entries of the list and must
    # not rewrite the caller's sequence. It also allows tuples.
    objs = list(objs)

    if not objs:
        dtype = pd.api.types.pandas_dtype(dtype)
        if dtype.type in (np.object_, np.str_):
            return StringColumn(data=nvstrings.to_device([]), null_count=0)
        elif is_categorical_dtype(dtype):
            return CategoricalColumn(
                data=Column(Buffer.null(np.dtype("int8"))),
                null_count=0,
                ordered=False,
            )
        else:
            return Column(Buffer.null(dtype))

    # If all columns are `NumericalColumn` with different dtypes,
    # we cast them to a common dtype.
    # Notice, we can always cast pure null columns
    not_null_cols = [o for o in objs if len(o) != o.null_count]
    if not_null_cols and all(
        isinstance(o, NumericalColumn)
        and not np.issubdtype(o.dtype, np.datetime64)
        for o in not_null_cols
    ):
        col_dtypes = [o.dtype for o in not_null_cols]
        # Use NumPy to find a common dtype
        # NOTE(review): np.find_common_type is deprecated since NumPy
        # 1.25 and removed in 2.0; np.result_type is the documented
        # replacement. Confirm promotion parity before switching.
        common_dtype = np.find_common_type(col_dtypes, [])
        # Cast all columns (including the all-null ones) to it
        objs = [o.astype(common_dtype) for o in objs]

    # Find the first column that is not entirely null. It is the type
    # reference. Fall back to the first column if all are all-null.
    head = next(
        (o for o in objs if len(o) != o.null_count),
        objs[0],
    )

    for i, obj in enumerate(objs):
        # An all-null column of a different type is replaced by an empty
        # masked column of the reference type with the same length.
        if not obj.is_type_equivalent(head) and len(obj) == obj.null_count:
            from cudf.dataframe.columnops import column_empty_like
            objs[i] = column_empty_like(
                head, dtype=head.dtype, masked=True, newsize=len(obj)
            )

    # Handle categories for categoricals: concatenate every input's
    # categories, drop duplicates, and apply the result to all inputs.
    if all(isinstance(o, CategoricalColumn) for o in objs):
        all_cats = Column._concat([o.categories for o in objs])
        cats = Series(all_cats).drop_duplicates()._column
        objs = [
            o.cat()._set_categories(cats, is_unique=True) for o in objs
        ]

    head = objs[0]
    for obj in objs:
        if not obj.is_type_equivalent(head):
            raise ValueError("All series must be of same type")

    # Handle strings separately
    if all(isinstance(o, StringColumn) for o in objs):
        objs = [o._data for o in objs]
        return StringColumn(data=nvstrings.from_strings(*objs))

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]
    nulls = sum(o.null_count for o in objs)
    newsize = sum(map(len, objs))
    mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
    data = Buffer.from_empty(mem, size=newsize)

    # Allocate output mask only if there's nulls in the input objects
    mask = None
    if nulls:
        mask = Buffer(utils.make_mask(newsize))

    col = head.replace(data=data, mask=mask, null_count=nulls)

    # Perform the actual concatenation
    if newsize > 0:
        col = _column_concat(objs, col)
    return col