def init_resource_arg_max(self):
    """Prepare attributes, tensor descriptors and the device output
    buffer for the ArgMaxD single-operator execution.

    Leaves the created handles on ``self`` (``op_attr``,
    ``input_desc_arg_max_d``, ``output_desc_arg_max_d``,
    ``dev_buffer_arg_max_d``, ``output_buffer_arg_max_d``) for the
    caller that actually executes the operator.
    """
    self.op_type_name = "ArgMaxD"
    self.op_attr = acl.op.create_attr()
    # ArgMaxD reduces along axis 0 of the (flat) input.
    ret = acl.op.set_attr_int(self.op_attr, "dimension", 0)
    check_ret("acl.op.set_attr_int", ret)

    # settings of arg_max operator: float16 in, int32 indices out,
    # both 1-D (ND format) of self.input_shape / self.output_shape elements.
    self.input_desc_arg_max_d = acl.create_tensor_desc(
        ACL_FLOAT16, [self.input_shape], ACL_FORMAT_ND)
    self.output_desc_arg_max_d = acl.create_tensor_desc(
        ACL_INT32, [self.output_shape], ACL_FORMAT_ND)

    # Device buffer sized from the output descriptor.
    self.tensor_size_arg_max_d = acl.get_tensor_desc_size(
        self.output_desc_arg_max_d)
    self.dev_buffer_arg_max_d, ret = acl.rt.malloc(
        self.tensor_size_arg_max_d, ACL_MEM_MALLOC_NORMAL_ONLY)
    check_ret("acl.rt.malloc", ret)
    self.output_buffer_arg_max_d = acl.create_data_buffer(
        self.dev_buffer_arg_max_d, self.tensor_size_arg_max_d)
def forward_op_batch_matmul(data, stream):
    """Execute one BatchMatMul op on the device and return the result.

    ``data`` is a two-element sequence: data[0] is the x1 operand
    (1x1x1024x1024 float32) and data[1] is the x2 operand
    (1x1x1024x27648 float32); ``stream`` is the ACL stream to run on.
    Returns whatever ``get_forward_result`` produces from the device
    output buffer.

    NOTE(review): the tensor descriptors and data buffers created here
    are not destroyed in this function — presumably freed elsewhere
    (e.g. inside get_forward_result or at teardown); confirm there is
    no leak when this is called repeatedly.
    """
    ret = acl.op.set_model_dir(MODEL_MATMUL_PATH)
    check_ret("acl.op.set_model_dir", ret)

    # Neither operand is transposed.
    attr = acl.op.create_attr()
    for flag in ("adj_x1", "adj_x2"):
        ret = acl.op.set_attr_bool(attr, flag, False)
        check_ret("acl.op.set_attr_bool", ret)

    # Fixed shapes: (1024x1024) @ (1024x27648) -> (1024x27648), float32.
    desc_x1 = acl.create_tensor_desc(
        ACL_FLOAT, [1, 1, 1024, 1024], ACL_FORMAT_NCHW)
    desc_x2 = acl.create_tensor_desc(
        ACL_FLOAT, [1, 1, 1024, 27648], ACL_FORMAT_NCHW)
    desc_y = acl.create_tensor_desc(
        ACL_FLOAT, [1, 1, 1024, 27648], ACL_FORMAT_NCHW)
    size_x1 = acl.get_tensor_desc_size(desc_x1)
    size_x2 = acl.get_tensor_desc_size(desc_x2)
    size_y = acl.get_tensor_desc_size(desc_y)

    # Stage the two host operands on the device.
    buf_x1 = create_input(data[0], size_x1)
    buf_x2 = create_input(data[1], size_x2)

    # Device memory for the result.
    dev_y, ret = acl.rt.malloc(size_y, ACL_MEM_MALLOC_NORMAL_ONLY)
    check_ret("acl.rt.malloc", ret)
    buf_y = acl.create_data_buffer(dev_y, size_y)

    ret = acl.op.execute_v2(
        OP_TYPE,
        [desc_x1, desc_x2],
        [buf_x1, buf_x2],
        [desc_y],
        [buf_y],
        attr,
        stream)
    check_ret("acl.op.execute_v2", ret)

    # execute_v2 is asynchronous; wait for the stream before reading back.
    ret = acl.rt.synchronize_stream(stream)
    check_ret("acl.rt.synchronize_stream", ret)
    print("[SingleOp] batch_matmul run success")
    return get_forward_result(dev_y, size_y)
def init_resource_cast(self):
    """Prepare descriptors and the device output buffer for the Cast
    operator (float32 input -> float16 output, 1-D ND tensors of
    ``self.input_shape`` elements).

    Stores the handles on ``self`` (``_input_desc``, ``_output_desc``,
    ``dev_buffer_cast``, ``output_buffer_cast``) for later execution.
    """
    # settings of cast operator
    self._input_desc = acl.create_tensor_desc(
        ACL_FLOAT, [self.input_shape], ACL_FORMAT_ND)
    self._output_desc = acl.create_tensor_desc(
        ACL_FLOAT16, [self.input_shape], ACL_FORMAT_ND)

    # Output buffer is sized from the (smaller, fp16) output descriptor.
    out_size = acl.get_tensor_desc_size(self._output_desc)
    self.dev_buffer_cast, ret = acl.rt.malloc(
        out_size, ACL_MEM_MALLOC_NORMAL_ONLY)
    check_ret("acl.rt.malloc", ret)
    self.output_buffer_cast = acl.create_data_buffer(
        self.dev_buffer_cast, out_size)
def _gen_output_tensor(self):
    """Create the operator's output descriptor plus matching device and
    host buffers.

    Appends one entry each to ``self.device_outputs``,
    ``self.device_buffer_outputs``, ``self.host_outputs`` and
    ``self.output_desc``; also stores the descriptor on
    ``self.operator_output``.
    """
    print("gen output data stage:")
    self.operator_output = acl.create_tensor_desc(
        acl_dtype[self.data_type], self.shape, self.format_type)

    # Single output — the original's one-element for-loop is flattened.
    out_size = acl.get_tensor_desc_size(self.operator_output)
    out_device, ret = acl.rt.malloc(out_size, ACL_MEM_MALLOC_NORMAL_ONLY)
    check_ret("acl.rt.malloc", ret)
    self.device_outputs.append(out_device)
    self.device_buffer_outputs.append(
        acl.create_data_buffer(out_device, out_size))

    # Bug fix: the original used acl.rt.malloc_host(size)[0], silently
    # discarding the return code; check it like every other ACL call.
    host_ptr, ret = acl.rt.malloc_host(out_size)
    check_ret("acl.rt.malloc_host", ret)
    self.host_outputs.append(host_ptr)

    self.output_desc.append(self.operator_output)
    print("gen output data success")
def _gen_input_tensor(self):
    """Upload the two input factors (``self.factor_a``/``self.factor_b``)
    to device memory.

    For each factor: build a tensor descriptor, allocate device memory,
    copy the host data over, and record the device pointer, data buffer
    and descriptor in ``self._inputs_device``,
    ``self._inputs_device_buffer`` and ``self._inputs_desc``.
    """
    print("gen input data stage:")
    for factor in (self.factor_a, self.factor_b):
        desc = acl.create_tensor_desc(
            acl_dtype[self.data_type], self.shape, self.format_type)
        nbytes = acl.get_tensor_desc_size(desc)

        dev_ptr, ret = acl.rt.malloc(nbytes, ACL_MEM_MALLOC_NORMAL_ONLY)
        check_ret("acl.rt.malloc", ret)

        # Newer pyACL builds expose bytes_to_ptr; older ones only have
        # numpy_to_ptr. Keep the bytes object bound to a local so the
        # host memory behind the pointer stays alive through memcpy.
        if "bytes_to_ptr" in dir(acl.util):
            raw = factor.tobytes()
            host_ptr = acl.util.bytes_to_ptr(raw)
        else:
            host_ptr = acl.util.numpy_to_ptr(factor)

        ret = acl.rt.memcpy(dev_ptr, nbytes, host_ptr, nbytes,
                            ACL_MEMCPY_HOST_TO_DEVICE)
        check_ret("acl.rt.memcpy", ret)

        self._inputs_device.append(dev_ptr)
        self._inputs_device_buffer.append(
            acl.create_data_buffer(dev_ptr, nbytes))
        self._inputs_desc.append(desc)
    print("gen input data success")