def _assign(self, value):

    if isinstance(value, (int, float)):
        # if we have a c or f contiguous array, then use the speedy driver kernel
        if self.flags.forc and float(value) >= 0:
            drv.memset_d16(self.gpudata,
                           Flexpt.flex_from_native(value, self.iwl),
                           self.size)
        # otherwise use our copy kernel
        else:
            OpTreeNode.build("copy", value, None, out=self)

    elif isinstance(value, FlexptArray):
        if self.flags.forc and value.flags.forc and self.iwl == value.iwl:
            drv.memcpy_dtod(self.gpudata, value.gpudata, self.nbytes)
        else:
            OpTreeNode.build("copy", value, None, out=self)

    elif isinstance(value, OpTreeNode):
        value.execute(out=self)

    else:
        raise TypeError("Invalid type for assignment: %s" % type(value))

    return self
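# The FlexptArray branch above takes a device-to-device fast path when both
# arrays are contiguous and share the same iwl: a single memcpy_dtod replaces
# the element-wise copy kernel.  A minimal standalone sketch of that fast path
# in plain PyCUDA; the buffer contents and sizes here are illustrative only:
import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
import pycuda.driver as drv

host = np.arange(16, dtype=np.uint16)
src = drv.mem_alloc(host.nbytes)
dst = drv.mem_alloc(host.nbytes)
drv.memcpy_htod(src, host)                  # stage the source on the device
drv.memcpy_dtod(dst, src, host.nbytes)      # contiguous device-to-device copy
out = np.empty_like(host)
drv.memcpy_dtoh(out, dst)
assert np.array_equal(out, host)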
def ones(self, shape, iwl, allocator=drv.mem_alloc, order="C"): """ Returns an array of the given shape and dtype filled with 0's. """ result = FlexptArray(self, shape, iwl, allocator, order=order) drv.memset_d16(result.gpudata, self.flex_from_native(1,iwl), result.size) return result
def _assign(self, value):

    if isinstance(value, (int, float)):
        # if we have a contiguous array, then use the speedy driver kernel
        if self.is_contiguous:
            value = self.dtype.type(value)
            if self.dtype.itemsize == 1:
                drv.memset_d8(self.gpudata,
                              unpack_from('B', value)[0],
                              self.size)
            elif self.dtype.itemsize == 2:
                drv.memset_d16(self.gpudata,
                               unpack_from('H', value)[0],
                               self.size)
            else:
                drv.memset_d32(self.gpudata,
                               unpack_from('I', value)[0],
                               self.size)
        # otherwise use our copy kernel
        else:
            OpTreeNode.build("assign", self, value)

    elif isinstance(value, GPUTensor):
        # TODO: add an is_binary_compat like function
        if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
            drv.memcpy_dtod(self.gpudata, value.gpudata, self.nbytes)
        else:
            OpTreeNode.build("assign", self, value)

    # collapse and execute an op tree as a kernel
    elif isinstance(value, OpTreeNode):
        OpTreeNode.build("assign", self, value)

    # assign from a numpy array (same as set())
    elif isinstance(value, np.ndarray):
        self.set(value)

    else:
        raise TypeError("Invalid type for assignment: %s" % type(value))

    return self
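# The unpack_from calls above reinterpret the scalar's raw bytes as the
# unsigned word that memset_d8/d16/d32 expects.  A host-side sketch of that
# reinterpretation (the value 0.5 and the float16 dtype are illustrative; the
# original code unpacks the numpy scalar directly through its buffer):
import numpy as np
from struct import unpack_from

value = np.float16(0.5)
pattern = unpack_from('H', value.tobytes())[0]
assert pattern == 0x3800                       # 0.5 in IEEE half precision
assert pattern == int(value.view(np.uint16))   # same bits, viewed via numpy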
def _autotune_kernel(n, k, type_args, tune_args):

    load_cnt = 0
    array_ids = set()
    kernel_args = list()
    tune_args = list(tune_args)

    # Setup some fake data to autotune on.
    # Perhaps tune on the real data, but we'd need a custom memoize for that,
    # and we wouldn't be able to use n,k abstracted sizing.
    data = drv.mem_alloc(n * k * 2)
    drv.memset_d16(data, 0, n * k)
    kernel_args.extend((data, k, 1, k))

    for arg in type_args:
        arg_type, arg_idx = arg[0:2]

        # Array operands
        if arg_type is flexpt_array.FlexptArray:
            if arg_idx not in array_ids:
                array_ids.add(arg_idx)
                flags = tune_args.pop()
                size = 1
                if flags & 1 == 0:
                    size *= n
                if flags & 2 == 0:
                    size *= k
                data = drv.mem_alloc(size * 2)
                drv.memset_d16(data, 0, size)
                kernel_args.extend((data, k, 1, 15, flags))
                load_cnt += 1

        # Constant operands
        elif arg_type is int:
            kernel_args.extend((0, 15))

        # Operations
        elif arg[2]:  # postop_convert_to_flex
            kernel_args.append(15)

    repeat = min(500, max(1, 8192**2 // (n * k)))

    # warm up the gpu clocks so our timings are accurate
    cur_ctx = drv.Context.get_current()
    if cur_ctx not in _context_warmup_set:
        _context_warmup_set.add(cur_ctx)
        kernel = _get_compound_ew_kernel(0, type_args)
        for r in range(repeat * 10):
            kernel.prepared_call((n, 1, 1), (32, 1, 1), *kernel_args)

    start = drv.Event()
    end = drv.Event()

    min_msecs = 99999999.9
    min_blocks = 1
    min_threads = 32
    min_factor = 0
    max_factor = 3 if load_cnt < 4 else 2

    for unroll_factor in range(max_factor):
        kernel = _get_compound_ew_kernel(unroll_factor, type_args)
        unroll = 1 << unroll_factor

        for threads in (32, 64, 128, 256):
            for blocks in (1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024):

                loads = blocks * threads * unroll
                if loads > k and min_msecs != 99999999.9:
                    #print "skipping %d loads for %d" % (loads, k)
                    continue

                loops = k // loads
                if (loops > 8 or loops < 1) and min_msecs != 99999999.9:
                    print "skipping %d loops %d // %d" % (loops, k, loads)
                    continue

                start.record()
                for r in range(repeat):
                    kernel.prepared_call((n, blocks, 1), (threads, 1, 1), *kernel_args)
                end.record()
                end.synchronize()
                msecs = end.time_since(start)

                if msecs < min_msecs:
                    min_msecs = msecs
                    min_factor = unroll_factor
                    min_blocks = blocks
                    min_threads = threads

                #print "%4d %d %4d %3d %4d (%4d, %4d) %4d %.5f" % \
                #    (repeat, unroll, blocks, threads, loads, n, k, loops, msecs)

    #print "Final: %d %4d %3d %.5f" % (1 << min_factor, min_blocks, min_threads, min_msecs)

    return (min_factor, min_blocks, min_threads)
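# The search loop above times each (unroll, blocks, threads) candidate with
# CUDA events after a clock warm-up pass.  A minimal sketch of that event
# timing pattern in PyCUDA; the launch callable and the repeat default are
# assumptions standing in for kernel.prepared_call(...):
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
import pycuda.driver as drv

def time_launches(launch, repeat=100):
    start, end = drv.Event(), drv.Event()
    start.record()
    for _ in range(repeat):
        launch()
    end.record()
    end.synchronize()                       # wait for the whole batch to finish
    return end.time_since(start) / repeat   # average milliseconds per launch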