def kernel_masked(values, masks, results):
    """Apply ``core`` to each unmasked element of *values*.

    Each thread handles the single element at its ``cuda.grid(1)``
    position.  ``results[idx]`` is written only when
    ``utils.mask_get(masks, idx)`` is truthy; every other slot of
    *results* is left untouched.

    NOTE(review): ``core`` is resolved from the surrounding scope and is
    presumably the user-supplied per-element function -- confirm against
    the enclosing code.
    """
    idx = cuda.grid(1)
    # Threads positioned past the end of the input have nothing to do.
    if idx >= values.size:
        return
    # Entries whose mask lookup is falsy are skipped entirely.
    if not utils.mask_get(masks, idx):
        return
    results[idx] = core(values[idx])
def gpu_expand_mask_bits(bits, out):
    """Expand each bit of the bitmask *bits* into one element of *out*.

    The kernel may be launched with any number of blocks and threads:
    each thread starts at its ``cuda.grid(1)`` position and advances by
    ``cuda.gridsize(1)`` until the end of the work range.

    Only positions below both ``out.size`` and
    ``bits.size * mask_bitsize`` are written; any remaining elements of
    *out* are left untouched.
    """
    # Never index past the output, nor past the bits actually held in *bits*.
    stop = min(out.size, bits.size * mask_bitsize)
    for pos in range(cuda.grid(1), stop, cuda.gridsize(1)):
        out[pos] = mask_get(bits, pos)
def gpu_notna(validity, out):
    """Write a boolean flag into *out* for every position of the mask.

    ``out[pos]`` becomes ``True`` when ``mask_get(validity, pos)`` is
    truthy and ``False`` otherwise.  Each thread handles the single
    position given by ``cuda.grid(1)``; threads at or beyond ``out.size``
    do nothing.
    """
    pos = cuda.grid(1)
    if pos >= out.size:
        return
    # Normalise whatever mask_get returns to a plain True/False.
    out[pos] = True if mask_get(validity, pos) else False
def gpu_equal_constant_masked(arr, mask, val, out):
    """Compare each element of *arr* with *val*, honouring *mask*.

    For the position given by ``cuda.grid(1)`` (when below ``out.size``),
    ``out[pos]`` receives ``arr[pos] == val`` if ``mask_get(mask, pos)``
    is truthy, and ``False`` otherwise.
    """
    pos = cuda.grid(1)
    if pos >= out.size:
        return
    if mask_get(mask, pos):
        out[pos] = arr[pos] == val
    else:
        # Masked-out entries never compare equal; arr[pos] is not read.
        out[pos] = False
def gpu_fill_masked(value, validity, out):
    """Store *value* into every slot of *out* whose mask lookup is falsy.

    Each thread handles the single position given by ``cuda.grid(1)``.
    Slots for which ``mask_get(validity, pos)`` is truthy, and positions
    at or beyond ``out.size``, are left unchanged.
    """
    pos = cuda.grid(1)
    # Short-circuit keeps mask_get from being called for out-of-range threads.
    if pos < out.size and not mask_get(validity, pos):
        out[pos] = value
def gpu_copy_to_dense(data, mask, slots, out):
    """Copy the unmasked elements of *data* into *out* at given slots.

    For the position given by ``cuda.grid(1)`` (when below ``data.size``),
    if ``mask_get(mask, pos)`` is truthy then ``data[pos]`` is written to
    ``out[slots[pos]]``.  Other positions cause no write.

    NOTE(review): *slots* presumably holds a precomputed destination
    index for each source position -- confirm against the caller.
    """
    pos = cuda.grid(1)
    if pos >= data.size:
        return
    if mask_get(mask, pos):
        out[slots[pos]] = data[pos]