import numpy as np
import pandas as pd

import cudf
from cudf.core.column import as_column, build_column


def _encode_list_column(original, encoded, dtype=None):
    """Convert `encoded` to be a list column with the same offsets as `original`"""
    if isinstance(original, pd.Series):
        # Pandas version (not very efficient)
        offset = 0
        new_data = []
        for val in original.values:
            size = len(val)
            new_data.append(np.array(encoded[offset : offset + size], dtype=dtype))
            offset += size
        return pd.Series(new_data)
    else:
        # CuDF version
        encoded = as_column(encoded)
        if dtype:
            encoded = encoded.astype(dtype, copy=False)
        list_dtype = cudf.core.dtypes.ListDtype(encoded.dtype if dtype is None else dtype)
        return build_column(
            None,
            dtype=list_dtype,
            size=original.size,
            children=(original._column.offsets, encoded),
        )
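# A minimal usage sketch for the pandas branch above, with hypothetical data:
# the re-encoded flat values are split back into rows using the per-row
# lengths of `original`, so the list structure is preserved.
original = pd.Series([np.array([1, 2]), np.array([3, 4, 5])])
encoded = np.array([10, 20, 30, 40, 50])
result = _encode_list_column(original, encoded, dtype="int64")
# result:
# 0        [10, 20]
# 1    [30, 40, 50]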
import cudf
from cudf.core.column import as_column, build_column


def _encode_list_column(original, encoded):
    # Rebuild `encoded` as a list column that reuses the offsets of `original`
    encoded = as_column(encoded)
    return build_column(
        None,
        dtype=cudf.core.dtypes.ListDtype(encoded.dtype),
        size=original.size,
        children=(original._column.offsets, encoded),
    )
def as_timedelta_column(self, dtype, **kwargs):
    # Reinterpret the column's underlying int64 data as a timedelta column
    return build_column(
        data=self.astype("int64").base_data,
        dtype=dtype,
        mask=self.base_mask,
        offset=self.offset,
        size=self.size,
    )
import cudf
from cudf.core.column import as_column, build_column

# HAS_GPU is expected to be defined by the surrounding module


def _build_cudf_list_column(new_elements, new_offsets):
    # Assemble a cuDF list column from flattened elements and row offsets
    if not HAS_GPU:
        return []
    return build_column(
        None,
        dtype=cudf.core.dtypes.ListDtype(new_elements.dtype),
        size=new_offsets.size - 1,
        children=(as_column(new_offsets), as_column(new_elements)),
    )
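# A minimal usage sketch with hypothetical data, assuming a GPU is available
# and a cuDF version matching the snippet above: five flattened elements are
# split into rows of lengths 2 and 3 by the offsets array.
import cupy as cp

new_elements = cp.asarray([1.0, 2.0, 3.0, 4.0, 5.0])
new_offsets = cp.asarray([0, 2, 5], dtype="int32")
series = cudf.Series(_build_cudf_list_column(new_elements, new_offsets))
# series:
# 0         [1.0, 2.0]
# 1    [3.0, 4.0, 5.0]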
def as_numerical(self):
    from cudf.core.column import build_column

    # View the column's underlying data as int64
    return build_column(
        data=self.base_data,
        dtype=np.int64,
        mask=self.base_mask,
        offset=self.offset,
        size=self.size,
    )
def as_datetime_column(self, dtype, **kwargs):
    from cudf.core.column import build_column

    # Reinterpret the column's underlying int64 data as a datetime column
    return build_column(
        data=self.astype("int64").base_data,
        dtype=dtype,
        mask=self.base_mask,
        offset=self.offset,
        size=self.size,
    )
def as_timedelta_column(self, dtype: Dtype, **kwargs) -> "cudf.core.column.TimeDeltaColumn":
    return cast(
        "cudf.core.column.TimeDeltaColumn",
        build_column(
            data=self.astype("int64").base_data,
            dtype=dtype,
            mask=self.base_mask,
            offset=self.offset,
            size=self.size,
        ),
    )
def execute(self, requests: List[InferenceRequest]) -> List[InferenceResponse]:
    """Transforms the input batches by running them through an NVTabular
    Workflow's transform function.
    """
    responses = []
    for request in requests:
        # create a cudf DataFrame from the triton request
        input_df = cudf.DataFrame(
            {
                name: _convert_tensor(get_input_tensor_by_name(request, name))
                for name in self.input_dtypes
            }
        )

        # rebuild multihot (list) columns from their paired __values/__nnzs tensors
        for name, dtype in self.input_multihots.items():
            values = as_column(
                _convert_tensor(get_input_tensor_by_name(request, name + "__values"))
            )
            nnzs = as_column(
                _convert_tensor(get_input_tensor_by_name(request, name + "__nnzs"))
            )
            input_df[name] = build_column(
                None, dtype=dtype, size=nnzs.size - 1, children=(nnzs, values)
            )

        # use our NVTabular workflow to transform the dataframe
        output_df = nvtabular.workflow._transform_partition(
            input_df, [self.workflow.column_group]
        )

        # convert back to a triton response
        output_tensors = []
        for name in output_df.columns:
            col = output_df[name]
            if is_list_dtype(col.dtype):
                # convert list values to match TF dataloader
                values = col.list.leaves.values_host.astype(
                    self.output_dtypes[name + "__values"]
                )
                values = values.reshape(len(values), 1)
                output_tensors.append(Tensor(name + "__values", values))

                offsets = col._column.offsets.values_host.astype(
                    self.output_dtypes[name + "__nnzs"]
                )
                nnzs = offsets[1:] - offsets[:-1]
                nnzs = nnzs.reshape(len(nnzs), 1)
                output_tensors.append(Tensor(name + "__nnzs", nnzs))
            else:
                d = col.values_host.astype(self.output_dtypes[name])
                d = d.reshape(len(d), 1)
                output_tensors.append(Tensor(name, d))

        responses.append(InferenceResponse(output_tensors))

    return responses
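# Illustration (hypothetical tensors) of the paired-tensor layout the
# execute() above consumes for a multihot column named "genres": the
# "__values" tensor carries all list elements flattened to one column,
# while "__nnzs" is passed as the offsets child of the list column
# (size = nnzs.size - 1), so here it marks row boundaries [0:2] and [2:5].
# Note that the output path instead emits per-row counts under "__nnzs".
import numpy as np

genres__values = np.array([3, 7, 7, 1, 9], dtype="int64")
genres__nnzs = np.array([0, 2, 5], dtype="int32")
# decoded rows: [3, 7] and [7, 1, 9]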
def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
    if isinstance(dtype, CategoricalDtype):
        return column.build_categorical_column(
            categories=dtype.categories._values,
            codes=build_column(self.base_data, dtype=self.dtype),
            mask=self.base_mask,
            ordered=dtype.ordered,
            size=self.size,
            offset=self.offset,
            null_count=self.null_count,
        )
    return self
def _fillna_natwise(col):
    # If the value we are filling with is np.datetime64("NaT"), replace the
    # nulls with NaT and rebuild the column without a null mask, so positions
    # that previously read as "<NA>" now read as NaT.
    nat = cudf._lib.scalar._create_proxy_nat_scalar(col.dtype)
    result = cudf._lib.replace.replace_nulls(col, nat)
    return column.build_column(
        data=result.base_data,
        dtype=result.dtype,
        size=result.size,
        offset=result.offset,
        children=result.base_children,
    )
def create_multihot_col(self, offsets, data):
    """
    offsets = cudf series with offset values for list data
    data = cudf series with the list data flattened to 1-d
    """
    offs = as_column(offsets, dtype="int32")
    encoded = as_column(data)
    col = build_column(
        None,
        size=offs.size - 1,
        dtype=cudf.core.dtypes.ListDtype(encoded.dtype),
        children=(offs, encoded),
    )
    return cudf.Series(col)
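# A minimal usage sketch with hypothetical data; `loader` stands in for
# whatever dataloader-style object defines create_multihot_col above.
# Three offsets delimit two list rows in the flattened data.
import cudf

offsets = cudf.Series([0, 2, 5])
data = cudf.Series([10, 11, 20, 21, 22])
# col = loader.create_multihot_col(offsets, data)
# col:
# 0        [10, 11]
# 1    [20, 21, 22]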
def transform(self, columns: ColumnNames, df: DataFrameType) -> DataFrameType:
    on_cpu = isinstance(df, pd.DataFrame)
    ret = pd.DataFrame() if on_cpu else cudf.DataFrame()
    for col in columns:
        # handle CPU via normal python slicing (not very efficient)
        if on_cpu:
            ret[col] = [row[self.start : self.end] for row in df[col]]
        else:
            # figure out the size of each row from the list offsets
            c = df[col]._column
            offsets = c.offsets.values
            elements = c.elements.values

            # figure out the size of each row after slicing start/end
            new_offsets = cp.zeros(offsets.size, dtype=offsets.dtype)
            threads = 32
            blocks = (offsets.size + threads - 1) // threads

            # calculate new row offsets after slicing
            _calculate_row_sizes[blocks, threads](self.start, self.end, offsets, new_offsets)
            new_offsets = cp.cumsum(new_offsets).astype(offsets.dtype)

            # create a new array for the sliced elements
            new_elements = cp.zeros(new_offsets[-1].item(), dtype=elements.dtype)
            if new_elements.size:
                _slice_rows[blocks, threads](
                    self.start, offsets, elements, new_offsets, new_elements
                )

            # build up a list column with the sliced values
            ret[col] = build_column(
                None,
                dtype=cudf.core.dtypes.ListDtype(new_elements.dtype),
                size=new_offsets.size - 1,
                children=(as_column(new_offsets), as_column(new_elements)),
            )

    return ret
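# A minimal sketch of the CPU path above, with hypothetical data and a
# hypothetical operator instance `op` whose start/end attributes are 0 and 2:
# every list row is truncated to its first two elements via Python slicing.
import pandas as pd

df = pd.DataFrame({"genres": [[1, 2, 3], [4], [5, 6]]})
# sliced = op.transform(["genres"], df)
# sliced["genres"]: [[1, 2], [4], [5, 6]]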
def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
    buf, dtype = buffer_and_dtype
    device_id = cp.asarray(cudfcol.data).device.id
    assert buf.__dlpack_device__() == (2, device_id)
    col_from_buf = build_column(
        Buffer(buf.ptr, buf.bufsize), protocol_dtype_to_cupy_dtype(dtype)
    )
    # check that non-null values are equal, since nulls are represented
    # by sentinel values in the buffer.
    # FIXME: In gh-10202 some minimal fixes were added to unblock CI. But
    # currently only non-null values are compared; null positions are
    # unchecked.
    non_null_idxs = ~cudf.Series(cudfcol).isna()
    assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs])

    if dtype[0] != _DtypeKind.BOOL:
        array_from_dlpack = cp.fromDlpack(buf.__dlpack__()).get()
        col_array = cp.asarray(cudfcol.data_array_view).get()
        assert_eq(
            array_from_dlpack[non_null_idxs.to_numpy()].flatten(),
            col_array[non_null_idxs.to_numpy()].flatten(),
        )
    else:
        pytest.raises(TypeError, buf.__dlpack__)
def as_datetime_column(self, dtype, **kwargs):
    from cudf.core.column import build_column

    return build_column(data=self.astype("int64").data, dtype=dtype, mask=self.mask)
def as_numerical(self):
    from cudf.core.column import build_column

    return build_column(data=self.data, dtype=np.int64, mask=self.mask)