def start_define_computation(self, computation_decl):
    """Begin emitting the generated class for one computation.

    Emits ``class <name>(HetrLocals, ConvLocals):`` with an ``__init__``
    that allocates every exop's storage, then opens (but does not close)
    the ``__call__`` method body so subsequent codegen can append op code.
    Records the code length at that point in ``self.codegen_define_length``
    (presumably so later passes can tell whether ``__call__`` stayed empty
    — TODO confirm against the matching finish/define step).
    """
    # Format-style append: '{}' is filled with the computation's name.
    self.exop_codegen.append("class {}(HetrLocals, ConvLocals):",
                             computation_decl.computation_op.name)
    with indenting(self.exop_codegen):
        self.exop_codegen.append("def __init__(self, **kwargs):")
        with indenting(self.exop_codegen):
            if is_tracing_enabled():
                # Profiling hooks only present when tracing is on.
                self.exop_codegen.append("""
self.__profiler_start__ = list()
self.__profiler_stop__ = list()
""")
            self.exop_codegen.append('super({}, self).__init__(**kwargs)',
                                     computation_decl.computation_op.name)
            for exop in computation_decl.exop_block:
                # TODO better way to deal with multiple values
                output_decl = exop.output_decls[0] if len(
                    exop.output_decls) > 0 else None
                # NOTE(review): exop is stashed on the codegen object,
                # presumably so allocate_op can consult it — confirm.
                self.exop_codegen.exop = exop
                self.exop_codegen.allocate_op(exop.op, output_decl,
                                              *exop.input_decls)
    self.exop_codegen.endl()
    # Manually re-enter class scope (indent 1) and open the __call__
    # body (indent 2); the matching dedent happens elsewhere.
    self.exop_codegen.indent(1)
    self.exop_codegen.append("def __call__(self):")
    self.exop_codegen.indent(1)
    self.codegen_define_length = self.exop_codegen.code_length
def transform_ordered_ops(self, computation, ordered_ops, name):
    """Generate a class for *computation* executing *ordered_ops*.

    Emits ``class <name>(HetrLocals, ConvLocals):`` whose ``__init__``
    allocates tensors and whose ``__call__`` runs the ops in order.

    :param computation: the computation being transformed; also recorded
        on ``self.current_computation``.
    :param ordered_ops: ops in execution order.
    :param name: class name to use; when None an auto name ``C_<n>`` is
        generated and the counter advanced.
    :return: the (possibly generated) class name.
    """
    self.current_computation = computation
    if name is None:
        name = "C_" + str(self.n_computations)
        self.n_computations += 1
    self.compute_code.append("class {}(HetrLocals, ConvLocals):", name)
    with indenting(self.compute_code):
        self.compute_code.append("def __init__(self, **kwargs):")
        with indenting(self.compute_code):
            self.compute_code.append('super({}, self).__init__(**kwargs)',
                                     name)
            self.transform_allocate_ops(ordered_ops)
        self.compute_code.endl()
        self.compute_code.append("def __call__(self):")
        # Snapshot length so we can detect an empty __call__ body below.
        code_length = self.compute_code.code_length

        def tensor_description_value(x):
            # Map TensorDescriptions to their runtime tensor views;
            # pass every other argument through untouched.
            if isinstance(x, TensorDescription):
                return self.get_tensor_description_tensor_view(x)
            return x

        with indenting(self.compute_code):
            for op in ordered_ops:
                out = tensor_description_value(
                    op.forwarded.tensor_description())
                call_info = (tensor_description_value(_)
                             for _ in op.call_info())
                self.compute_code.generate_op(op, out, *call_info)
            if code_length == self.compute_code.code_length:
                # No op emitted any code: avoid a syntactically empty body.
                self.compute_code.append("pass")
    self.compute_code.endl()
    self.name = name
    return name
def finish_transform(self):
    """Assemble the accumulated code sections into one Model class,
    compile it, and bind each computation's executor to the resulting
    generated method.

    Idempotent: returns immediately once ``self.model`` exists.
    """
    if self.model is not None:
        return

    self.code.append(" class Model(object):")
    with indenting(self.code):
        # A buffer-less transform leaves init/allocate sections empty;
        # pad with 'pass' so the generated source stays syntactically valid.
        if len(self.device_buffers) == 0:
            self.init_code.append("pass")
        self.code.append(self.init_code.code)
        self.code.endl()
        self.code.append(NumPyConvEngine.all_conv_code())
        self.code.endl()
        self.code.append(self.allocate_storage_code.code)
        self.code.endl()
        if len(self.device_buffers) == 0:
            self.allocate_code.append("pass")
        self.code.append(self.allocate_code.code)
        self.code.endl(2)
        self.code.append(self.compute_code.code)
        # print(self.code.code)
        # print(self.code.filename)

    # Compile the generated module and instantiate its Model class.
    r = self.code.compile("op", globals())
    self.model = r['Model']()

    # Hand the conv/pool parameter tables over to the live model; the
    # generated op code looks these up at call time.
    self.model.conv_params = self.compute_code.conv_params
    self.model.pool_params = self.compute_code.pool_params
    self.model.conv_slices = self.compute_code.conv_slices
    self.model.pool_slices = self.compute_code.pool_slices

    # Each computation's executor is the generated method of that name.
    for computation in self.computations:
        executor = getattr(self.model, computation.name)
        computation.executor = executor
def transform_allocate(self):
    """Emit allocation/update functions for this buffer into the
    transformer's generated source.

    Generates ``def <alloc_name>():`` which allocates the backing array
    (via MLSL shared memory for float32/float64 when MLSL is enabled,
    plain ``np.empty`` otherwise) and passes it to
    ``def <update_name>(buffer):``, which stores it in the module-global
    ``ref_str`` and refreshes dependent views. Finally schedules the
    allocator call in the transformer's allocate section.
    """
    self.transformer.init_code.append("{} = None", self.ref_str)
    self.transformer.allocate_storage_code.append("def {}():",
                                                  self.alloc_name)
    with indenting(self.transformer.allocate_storage_code):
        # Element count: total byte size divided by per-element size.
        elts = self.bytes // self.dtype.itemsize
        # MLSL allocation only supports the two float ctypes below.
        if self.dtype.name == 'float32':
            c_type_name = 'c_float'
        elif self.dtype.name == 'float64':
            c_type_name = 'c_double'
        else:
            c_type_name = None
        if c_type_name is not None and self.transformer.use_mlsl:
            # Template placeholders: {0}=update fn, {1}=element count,
            # {2}=dtype name, {3}=ctypes type name.
            # BUGFIX: the fallback branch used Python-2-only
            # `print str(error)`, which makes the generated module a
            # SyntaxError under Python 3; `print(str(error))` is valid
            # in both Python 2 and 3 for a single argument.
            self.transformer.allocate_storage_code.append(
                """try:
    type_size = ctypes.sizeof(ctypes.{3}(1))
    mlsl_buf_{0} = mlsl_obj.alloc({1} * type_size, 64)
    array_{0} = ctypes.cast(mlsl_buf_{0}, ctypes.POINTER(ctypes.{3} * {1}))
    np_array_{0} = np.frombuffer(array_{0}.contents, dtype=np.dtype('{2}'))
    {0}(np_array_{0})
except NameError as error:
    print(str(error))
    {0}(np.empty({1}, dtype=np.dtype('{2}')))""",
                self.update_name, elts, self.dtype.name, c_type_name)
        else:
            # Plain NumPy allocation for non-float dtypes or no MLSL.
            self.transformer.allocate_storage_code.append(
                "{}(np.empty({}, dtype=np.dtype('{}')))",
                self.update_name, elts, self.dtype.name)
        self.transformer.allocate_storage_code.endl()

    self.transformer.allocate_storage_code.append("def {}(buffer):",
                                                  self.update_name)
    with indenting(self.transformer.allocate_storage_code):
        # The updater rebinds the module-global reference and refreshes
        # all tensor views layered on top of this buffer.
        self.transformer.allocate_storage_code.append(
            "global {}", self.ref_str)
        self.transformer.allocate_storage_code.append(
            "{} = buffer", self.ref_str)
        self.transform_allocate_views()
    self.transformer.allocate_storage_code.endl()

    self.transformer.allocate_code.append("{}()", self.alloc_name)
def transform_allocate(self):
    """Emit the allocate/update function pair for this buffer.

    The generated ``<alloc_name>()`` builds an ``np.empty`` array of the
    right size and hands it to ``<update_name>(buffer)``, which rebinds
    the module-global reference and refreshes the dependent views. The
    allocator call itself is queued in the transformer's allocate section.
    """
    storage = self.transformer.allocate_storage_code

    # Declare the global slot that will hold the buffer.
    self.transformer.init_code.append("{} = None", self.ref_str)

    # --- allocator: def <alloc_name>(): <update>(np.empty(...)) ---
    storage.append("def {}():", self.alloc_name)
    with indenting(storage):
        element_count = self.bytes // self.dtype.itemsize
        storage.append("{}(np.empty({}, dtype=np.dtype('{}')))",
                       self.update_name, element_count, self.dtype.name)
    storage.endl()

    # --- updater: def <update_name>(buffer): rebind global + views ---
    storage.append("def {}(buffer):", self.update_name)
    with indenting(storage):
        storage.append("global {}", self.ref_str)
        storage.append("{} = buffer", self.ref_str)
        self.transform_allocate_views()
    storage.endl()

    # Schedule the allocator to run during the allocate phase.
    self.transformer.allocate_code.append("{}()", self.alloc_name)
def transform_ordered_ops(self, ordered_ops, name):
    """Emit a ``class <name>(HetrLocals, ConvLocals)`` whose ``__call__``
    executes *ordered_ops* in sequence.

    :param ordered_ops: ops in execution order.
    :param name: class name; None auto-generates ``C_<n>`` and bumps
        the computation counter.
    :return: the class name actually used.
    """
    if name is None:
        name = "C_" + str(self.n_computations)
        self.n_computations += 1

    self.compute_code.append("class {}(HetrLocals, ConvLocals):", name)
    with indenting(self.compute_code):
        self.compute_code.append("def __call__(self):")
        # Remember where the body starts so an empty one can be detected.
        initial_length = self.compute_code.code_length

        def unwrap(item):
            # TensorDescriptions become their live values; anything
            # else passes through unchanged.
            return item.value if isinstance(item, TensorDescription) else item

        with indenting(self.compute_code):
            for op in ordered_ops:
                result = unwrap(op.tensor_description())
                arguments = [unwrap(info) for info in op.call_info()]
                self.compute_code.generate_op(op, result, *arguments)
            if initial_length == self.compute_code.code_length:
                # Nothing was emitted; keep the generated body valid.
                self.compute_code.append("pass")
    self.compute_code.endl()

    self.name = name
    return name
def transform_ordered_ops(self, ordered_ops, name):
    """Emit a method ``def <name>(self):`` that runs *ordered_ops*.

    :param ordered_ops: ops in execution order.
    :param name: method name; None auto-generates ``c_<n>`` and bumps
        the computation counter.
    :return: the method name actually used.
    """
    if name is None:
        name = "c_" + str(self.n_computations)
        self.n_computations += 1
    self.compute_code.append("def {}(self):", name)
    code = self.compute_code.code

    def tensor_description_value(x):
        # TensorDescriptions become their live values; anything else
        # passes through unchanged.
        if isinstance(x, TensorDescription):
            return x.value
        return x

    with indenting(self.compute_code):
        for op in ordered_ops:
            out = tensor_description_value(op.tensor_description())
            call_info = (tensor_description_value(_) for _ in op.call_info())
            self.compute_code.generate_op(op, out, *call_info)
        # NOTE(review): this relies on `.code` returning the *same object*
        # until something is appended; sibling variants compare
        # `code_length` instead. If `.code` builds a fresh string per
        # access, this test is always False and a trailing (harmless)
        # 'pass' would never be added for empty bodies — confirm the
        # semantics of `.code`.
        if code is self.compute_code.code:
            self.compute_code.append("pass")
    self.compute_code.endl()
    return name
def finish_transform(self):
    """Assemble and compile the generated Model, then attach the
    queue-based communication helpers (send/recv, gather, scatter) used
    for inter-process tensor exchange, and bind computation executors.

    Idempotent: returns immediately once ``self.model`` exists.
    """
    if self.model is not None:
        return

    self.code.append(" class Model(object):")
    with indenting(self.code):
        # Empty init/allocate sections would be syntax errors in the
        # generated source; pad with 'pass' when no buffers exist.
        if len(self.device_buffers) == 0:
            self.init_code.append("pass")
        self.code.append(self.init_code.code)
        self.code.endl()
        self.code.append(NumPyConvEngine.all_conv_code())
        self.code.append(NumPyCodeEngine.lut_code())
        self.code.endl()
        self.code.append(self.allocate_storage_code.code)
        self.code.endl()
        if len(self.device_buffers) == 0:
            self.allocate_code.append("pass")
        self.code.append(self.allocate_code.code)
        self.code.endl(2)
        self.code.append(self.compute_code.code)
        # Debugging aid: dump the generated module to disk.
        # with open("code_{}.py".format(self.name), "w") as f:
        #     f.write(self.code.code)
        # print(self.code.filename)

    r = self.code.compile("op", globals())
    # Keep the *class* for now: the helpers below are attached as class
    # attributes before instantiation so they become bound methods.
    self.model = r['Model']

    def send(self, send_id):
        """Push the tensor behind a Send op onto its shared queue."""
        send_op = self.send_nodes[send_id]
        q = send_op.shared_q
        # TODO
        # below converts DeviceTensor to numpy array
        # should we instead serialize DeviceTensor?
        x_devicetensor = send_op.args[0].value
        x_nparr = x_devicetensor.get(None)
        q.put(x_nparr)

    def recv(self, recv_id):
        """Block until the matching send delivers an array."""
        recv_op = self.recv_nodes[recv_id]
        q = recv_op.shared_q
        x = q.get()
        return x

    def gather_send(self, gather_send_id):
        """Push this worker's contribution onto the gather queue."""
        gather_send_op = self.gather_send_nodes[gather_send_id]
        q = gather_send_op.shared_queue
        # TODO
        # below converts DeviceTensor to numpy array
        # should we instead serialize DeviceTensor?
        x_devicetensor = gather_send_op.args[0].value
        x_nparr = x_devicetensor.get(None)
        q.put(x_nparr)

    def gather_recv(self, gather_recv_id):
        """Collect one slice from every sender and assemble the result.

        Blocks on each sender's queue in turn; slice i of the output is
        filled from sender i.
        """
        gather_recv_op = self.gather_recv_nodes[gather_recv_id]
        x_devicetensor = gather_recv_op.value
        x_nparr = x_devicetensor.get(None)
        for i in range(len(gather_recv_op.from_id)):
            q = gather_recv_op.shared_queue_list[i]
            x = q.get()
            x_nparr[gather_recv_op.slices[i]] = x
        return x_nparr

    def scatter_send(self, scatter_send_id):
        """Split the source array and push one slice per receiver."""
        scatter_send_op = self.scatter_send_nodes[scatter_send_id]
        # TODO
        # below converts DeviceTensor to numpy array
        # should we instead serialize DeviceTensor?
        x_devicetensor = scatter_send_op.args[0].value
        x_nparr = x_devicetensor.get(None)
        for i in range(len(scatter_send_op.to_id)):
            q = scatter_send_op.shared_queue_list[i]
            q.put(x_nparr[scatter_send_op.slices[i]])

    def scatter_recv(self, scatter_recv_id):
        """Block until this worker's scatter slice arrives."""
        scatter_recv_op = self.scatter_recv_nodes[scatter_recv_id]
        q = scatter_recv_op.shared_queue
        x = q.get()
        return x

    # Attach the helpers to the class, then instantiate; they become
    # bound methods of the model instance.
    self.model.recv_from_send = recv
    self.model.send = send
    self.model.gather_recv_from_gather_send = gather_recv
    self.model.gather_send = gather_send
    self.model.scatter_recv_from_scatter_send = scatter_recv
    self.model.scatter_send = scatter_send
    self.model = self.model()

    # Hand over the lookup tables the helpers and generated op code
    # consult at call time.
    self.model.send_nodes = self.compute_code.send_nodes
    self.model.recv_nodes = self.compute_code.recv_nodes
    self.model.gather_send_nodes = self.compute_code.gather_send_nodes
    self.model.gather_recv_nodes = self.compute_code.gather_recv_nodes
    self.model.scatter_send_nodes = self.compute_code.scatter_send_nodes
    self.model.scatter_recv_nodes = self.compute_code.scatter_recv_nodes
    self.model.conv_params = self.compute_code.conv_params
    self.model.pool_params = self.compute_code.pool_params
    self.model.conv_slices = self.compute_code.conv_slices
    self.model.pool_slices = self.compute_code.pool_slices

    # Each computation's executor is the generated method of that name.
    for computation in self.computations:
        executor = getattr(self.model, computation.name)
        computation.executor = executor