def wrap_all_outputs(result, training_mode_flag):
    def extract_context(result):
        # Search for context among all outputs.
        ctx = None
        for arg in result:
            if not isinstance(arg, torch.Tensor) or not hasattr(arg, 'grad_fn'):
                continue
            # Use the first context we see because all output tensors
            # share the same one.
            ctx = arg.grad_fn
            break
        if training_mode_flag:
            # Must extract one valid context from result tensors.
            assert ctx is not None
        else:
            # Context must not be present in non-training mode.
            assert ctx is None
        return ctx

    if isinstance(result, torch.Tensor):
        ctx = extract_context([result])
        return [ctx, to_dlpack(result)]
    elif isinstance(result, (tuple, list)):
        ctx = extract_context(result)
        wrapped = [ctx]
        wrapped.extend(to_dlpack(value) for value in result)
        # Inside the returned list, the first element is the context and the
        # rest are DLPack tensors.
        return wrapped
    else:
        raise TypeError('Unsupported returned type: ', type(result))

def wrap_all_outputs(result):
    if isinstance(result, torch.Tensor):
        return [to_dlpack(result)]
    elif isinstance(result, (tuple, list)):
        return [to_dlpack(value) for value in result if value is not None]
    else:
        raise Exception('Unsupported returned type: ', type(result))

def _test_gpu_bls_add_sub(self, is_input0_gpu, is_input1_gpu):
    input0 = torch.rand(16)
    input1 = torch.rand(16)
    if is_input0_gpu:
        input0 = input0.to('cuda')
    if is_input1_gpu:
        input1 = input1.to('cuda')
    input0_pb = pb_utils.Tensor.from_dlpack('INPUT0', to_dlpack(input0))
    input1_pb = pb_utils.Tensor.from_dlpack('INPUT1', to_dlpack(input1))

    gpu_bls_return = self._get_gpu_bls_outputs(input0_pb, input1_pb)
    if gpu_bls_return:
        output0_dlpack, output1_dlpack = gpu_bls_return
    else:
        return False

    expected_output_0 = from_dlpack(input0_pb.to_dlpack()).to('cpu') + \
        from_dlpack(input1_pb.to_dlpack()).to('cpu')
    expected_output_1 = from_dlpack(input0_pb.to_dlpack()).to('cpu') - \
        from_dlpack(input1_pb.to_dlpack()).to('cpu')

    output0_matches = torch.all(
        expected_output_0 == from_dlpack(output0_dlpack).to('cpu'))
    output1_matches = torch.all(
        expected_output_1 == from_dlpack(output1_dlpack).to('cpu'))
    if not output0_matches or not output1_matches:
        return False

    return True

def inference_torch(self, *args, **kw):
    if self.input_names is None:
        assert len(args) == 0 and len(kw) != 0
        for k, v in kw.items():
            if isinstance(v, torch.Tensor):
                self.tvm_context.set_input(
                    k, tvm.nd.from_dlpack(dlpack.to_dlpack(v)))
            else:
                self.tvm_context.set_input(k, tvm.nd.array(v, ctx=self.ctx))
    else:
        assert len(args) == len(self.input_names)
        for k, v in zip(self.input_names, args):
            if isinstance(v, torch.Tensor):
                self.tvm_context.set_input(
                    k, tvm.nd.from_dlpack(dlpack.to_dlpack(v)))
            else:
                self.tvm_context.set_input(k, tvm.nd.array(v, ctx=self.ctx))

    self.tvm_context.run()

    num_outputs = self.tvm_context.get_num_outputs()
    outputs = []
    for i in range(num_outputs):
        out = self.tvm_context.get_output(i).to_dlpack()
        outputs.append(dlpack.from_dlpack(out))
    if len(outputs) == 1:
        return outputs[0]
    return tuple(outputs)

def execute(self, requests):
    responses = []
    for request in requests:
        input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
        print('ISCPU', input0.is_cpu())
        gpu_output = pb_utils.get_input_tensor_by_name(
            request, "GPU_OUTPUT").as_numpy()

        if input0.is_cpu():
            if not gpu_output[0]:
                output0 = pb_utils.Tensor.from_dlpack(
                    "OUTPUT0", input0.to_dlpack())
            else:
                output0_pytorch = from_dlpack(input0.to_dlpack()).cuda()
                output0 = pb_utils.Tensor.from_dlpack(
                    "OUTPUT0", to_dlpack(output0_pytorch))
        else:
            if gpu_output[0]:
                output0 = pb_utils.Tensor.from_dlpack(
                    "OUTPUT0", input0.to_dlpack())
            else:
                output0_pytorch = from_dlpack(input0.to_dlpack()).cpu()
                output0 = pb_utils.Tensor.from_dlpack(
                    "OUTPUT0", to_dlpack(output0_pytorch))

        next_gpu_output = pb_utils.Tensor("NEXT_GPU_OUTPUT", gpu_output[1:])
        responses.append(
            pb_utils.InferenceResponse([output0, next_gpu_output]))

    return responses

def _test_gpu_bls_add_sub(self, is_input0_gpu, is_input1_gpu):
    input0 = torch.rand(16)
    input1 = torch.rand(16)
    if is_input0_gpu:
        input0 = input0.to('cuda')
    if is_input1_gpu:
        input1 = input1.to('cuda')
    input0_pb = pb_utils.Tensor.from_dlpack('INPUT0', to_dlpack(input0))
    input1_pb = pb_utils.Tensor.from_dlpack('INPUT1', to_dlpack(input1))
    output0_dlpack, output1_dlpack = self._get_gpu_bls_outputs(
        input0_pb, input1_pb)

    expected_output_0 = from_dlpack(input0_pb.to_dlpack()).to('cpu') + \
        from_dlpack(input1_pb.to_dlpack()).to('cpu')
    expected_output_1 = from_dlpack(input0_pb.to_dlpack()).to('cpu') - \
        from_dlpack(input1_pb.to_dlpack()).to('cpu')

    self.assertTrue(
        torch.all(expected_output_0 == from_dlpack(output0_dlpack).to('cpu')))
    self.assertTrue(
        torch.all(expected_output_1 == from_dlpack(output1_dlpack).to('cpu')))

def execute(self, requests):
    output0_dtype = self.output0_dtype
    output1_dtype = self.output1_dtype

    responses = []
    for request in requests:
        in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
        in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1")

        # If both of the tensors are on CPU, use NumPy.
        if in_0.is_cpu() and in_1.is_cpu():
            if in_0.as_numpy().dtype.type is np.bytes_ or \
                    in_0.as_numpy().dtype == np.object_:
                out_0, out_1 = (
                    in_0.as_numpy().astype(np.int32) -
                    in_1.as_numpy().astype(np.int32),
                    in_0.as_numpy().astype(np.int32) +
                    in_1.as_numpy().astype(np.int32))
                out_tensor_0 = pb_utils.Tensor("OUTPUT0",
                                               out_0.astype(output0_dtype))
                out_tensor_1 = pb_utils.Tensor("OUTPUT1",
                                               out_1.astype(output1_dtype))
            else:
                in_0_pytorch, in_1_pytorch = from_dlpack(
                    in_0.to_dlpack()), from_dlpack(in_1.to_dlpack())
                out_0, out_1 = (in_0_pytorch - in_1_pytorch,
                                in_0_pytorch + in_1_pytorch)

                if self.output0_dtype == np.object_:
                    out_tensor_0 = pb_utils.Tensor(
                        "OUTPUT0", out_0.numpy().astype(output0_dtype))
                else:
                    out_0 = out_0.type(
                        self.numpy_to_pytorch_dtype[output0_dtype])
                    out_tensor_0 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT0", to_dlpack(out_0))

                if self.output1_dtype == np.object_:
                    out_tensor_1 = pb_utils.Tensor(
                        "OUTPUT1", out_1.numpy().astype(output1_dtype))
                else:
                    out_1 = out_1.type(
                        self.numpy_to_pytorch_dtype[output1_dtype])
                    out_tensor_1 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT1", to_dlpack(out_1))
        else:
            in_0_pytorch, in_1_pytorch = from_dlpack(
                in_0.to_dlpack()).cuda(), from_dlpack(in_1.to_dlpack()).cuda()
            out_0, out_1 = (in_0_pytorch - in_1_pytorch,
                            in_0_pytorch + in_1_pytorch)
            out_tensor_0 = pb_utils.Tensor.from_dlpack("OUTPUT0",
                                                       to_dlpack(out_0))
            out_tensor_1 = pb_utils.Tensor.from_dlpack("OUTPUT1",
                                                       to_dlpack(out_1))

        responses.append(
            pb_utils.InferenceResponse([out_tensor_0, out_tensor_1]))

    return responses

def test_pytorch_cpu_tensor_to_submatrix(self):
    '''
    Note that we have two ways to convert a PyTorch CPU tensor
    to kaldi's FloatSubMatrix:

    Method 1:
        v = kaldi.SubMatrixFromDLPack(to_dlpack(tensor))

    Method 2:
        v = kaldi.DLPackFloatSubMatrix.from_dlpack(to_dlpack(tensor))
    '''
    tensor = torch.arange(6).reshape(2, 3).float()

    m = kaldi.SubMatrixFromDLPack(to_dlpack(tensor))
    self.assertIsInstance(m, kaldi.FloatSubMatrix)

    m[0, 0] = 100  # also changes tensor, since memory is shared
    self.assertEqual(tensor[0, 0], 100)

    del m

    # memory is shared between `m` and `tensor`
    m = kaldi.DLPackFloatSubMatrix.from_dlpack(to_dlpack(tensor))
    m[0, 1] = 200
    self.assertEqual(tensor[0, 1], 200)

def generate(dim):
    shape = np.random.randint(1, 10, [dim]).tolist()
    if self.as_tensor:
        data = to_dlpack(torch.rand(size=[self.batch_size] + shape,
                                    device=self.device))
    else:
        data = [to_dlpack(torch.rand(shape, device=self.device))
                for _ in range(self.batch_size)]
    return data

def __init__(self, indexes: torch.Tensor, data: torch.Tensor):
    assert indexes.dtype == torch.int32
    assert data.dtype == torch.float32
    assert data.shape[1] == 2

    self.indexes = indexes
    self.data = data
    super().__init__(to_dlpack(self.indexes), to_dlpack(self.data))

def response_thread(self, response_sender, input0, gpu_output):
    # Sleep 5 seconds to make sure the main thread has exited.
    time.sleep(5)

    if input0.is_cpu():
        if not gpu_output[0]:
            output0 = pb_utils.Tensor.from_dlpack("OUTPUT0",
                                                  input0.to_dlpack())
        else:
            output0_pytorch = from_dlpack(input0.to_dlpack()).cuda()
            output0 = pb_utils.Tensor.from_dlpack(
                "OUTPUT0", to_dlpack(output0_pytorch))
    else:
        if gpu_output[0]:
            output0 = pb_utils.Tensor.from_dlpack("OUTPUT0",
                                                  input0.to_dlpack())
        else:
            output0_pytorch = from_dlpack(input0.to_dlpack()).cpu()
            output0 = pb_utils.Tensor.from_dlpack(
                "OUTPUT0", to_dlpack(output0_pytorch))

    next_gpu_output = pb_utils.Tensor("NEXT_GPU_OUTPUT", gpu_output[1:])
    infer_response = pb_utils.InferenceResponse([output0, next_gpu_output])

    # Number of times to repeat the response
    response_repeat = 2
    for _ in range(response_repeat):
        response_sender.send(infer_response)

    response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

    with self.inflight_thread_count_lck:
        self.inflight_thread_count -= 1

def to_cugraph(edge_index: Tensor, edge_weight: Optional[Tensor] = None,
               relabel_nodes: bool = True):
    r"""Converts a graph given by :obj:`edge_index` and optional
    :obj:`edge_weight` into a :obj:`cugraph` graph object.

    Args:
        edge_index (LongTensor): The edge indices of the graph.
        edge_weight (Tensor, optional): The edge weights of the graph.
            (default: :obj:`None`)
        relabel_nodes (bool, optional): If set to :obj:`True`,
            :obj:`cugraph` will remove any isolated nodes, leading to a
            relabeling of nodes. (default: :obj:`True`)
    """
    import cudf
    import cugraph

    df = cudf.from_dlpack(to_dlpack(edge_index.t()))

    if edge_weight is not None:
        assert edge_weight.dim() == 1
        df[2] = cudf.from_dlpack(to_dlpack(edge_weight))

    return cugraph.from_cudf_edgelist(
        df, source=0, destination=1,
        edge_attr=2 if edge_weight is not None else None,
        renumber=relabel_nodes)

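
# A minimal usage sketch for the to_cugraph() helper above. It assumes a CUDA
# build of PyTorch plus the RAPIDS cudf/cugraph packages; the 3-edge cycle
# graph is made up purely for illustration.
import torch

edge_index = torch.tensor([[0, 1, 2],
                           [1, 2, 0]], device='cuda')
edge_weight = torch.rand(edge_index.size(1), device='cuda')

graph = to_cugraph(edge_index, edge_weight)
print(graph.number_of_vertices(), graph.number_of_edges())
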
def test_get_cfsa_vec_size_multiple(self):
    s1 = r'''
    0 1 1
    0 2 2
    1 3 3
    2 3 3
    3 16 -1
    16
    '''
    fsa1 = k2.string_to_fsa(s1)
    cfsa1 = k2.Cfsa(fsa1)

    s2 = r'''
    0 1 1
    0 2 2
    1 3 3
    3 10 -1
    10
    '''
    fsa2 = k2.string_to_fsa(s2)
    cfsa2 = k2.Cfsa(fsa2)

    cfsa_std_vec = k2.CfsaStdVec()
    cfsa_std_vec.push_back(cfsa1)
    cfsa_std_vec.push_back(cfsa2)
    self.assertEqual(len(cfsa_std_vec), 2)

    num_bytes = k2.get_cfsa_vec_size(cfsa_std_vec)
    # the value is taken from the corresponding fsa_test.cc
    self.assertEqual(num_bytes, 360)

    # now test from dlpack
    if SKIP_DLPACK:
        print('skip dlpack test')
        return
    else:
        print('Do dlpack testing')

    num_int32 = num_bytes // 4
    tensor = torch.empty((num_int32, ), dtype=torch.int32)
    dlpack = to_dlpack(tensor)
    cfsa_vec = k2.create_cfsa_vec(dlpack, cfsa_std_vec)
    self.assertEqual(cfsa_vec.num_fsas(), 2)
    self.assertEqual(cfsa_vec[0], cfsa1)
    self.assertEqual(cfsa_vec[1], cfsa2)

    self.assertEqual(tensor[0], 1)  # version
    self.assertEqual(tensor[1], 2)  # num_fsas
    self.assertEqual(tensor[2], 64 // 4)  # state_offsets_start

    # construct a CfsaVec from a `torch::Tensor` which has already been filled
    dlpack = to_dlpack(tensor.clone())
    cfsa_vec = k2.create_cfsa_vec(dlpack)
    self.assertEqual(cfsa_vec.num_fsas(), 2)
    self.assertEqual(cfsa_vec[0], cfsa1)
    self.assertEqual(cfsa_vec[1], cfsa2)

def wrap_all_outputs(result):
    if isinstance(result, torch.Tensor):
        return [to_dlpack(result)]
    elif isinstance(result, (tuple, list)):
        return [to_dlpack(value) if value is not None else None
                for value in result]
    else:
        raise wrap_exception(
            ORTModuleIOError,
            TypeError(f'ORTModule does not support the following model output type {type(result)}.'))

def wrap_all_outputs(result, training_mode_flag):
    # This is mainly to hold grad_fn references by registering them into our
    # PyNodeSharedPointerPool.
    def register_context(result):
        # Search for context among all outputs.
        ctx = None
        # All forward outputs of torch.autograd.Function share the same gradient
        # function pointer, so here we just get the first tensor that has a
        # grad_fn attribute.
        # (https://github.com/pytorch/pytorch/blob/15532595209d2daf34d35e10f8d3d3b64966aea2/torch/csrc/autograd/custom_function.cpp#L267)
        first_tensor_output = None
        for arg in result:
            if not isinstance(arg, torch.Tensor) or not hasattr(arg, 'grad_fn'):
                continue
            # Use the first context we see because all outputs share the same one.
            ctx = arg.grad_fn
            first_tensor_output = arg
            break
        if training_mode_flag:
            # Must extract one valid context from result tensors.
            assert ctx is not None

            # FORWARD                                             BACKWARD FUNCTION CONNECTIONS
            # input_1 (leaf, constructed by from_dlpack) <----reference---- AccumulateGrad gradient function
            #        ↓                                                        ↑
            # autograd.Function apply()                   ------------>  autograd.Function backward()
            #        ↓                                    |                   ↑
            # output_1, output_2 --- shared_ptr<PyNode> ---                   ↑
            #        ↓                                          previous gradient function
            #
            # We remove the edges between the current autograd.Function's gradient
            # function and its input's gradient function (e.g. the AccumulateGrad
            # gradient function); the AccumulateGrad gradient function is then
            # destroyed, releasing the reference to input_1.
            # (https://github.com/pytorch/pytorch/blob/15532595209d2daf34d35e10f8d3d3b64966aea2/torch/csrc/autograd/functions/accumulate_grad.cpp#L21)
            # The next edges are stored in Node, with which we can get the next
            # gradient function.
            # https://github.com/pytorch/pytorch/blob/15532595209d2daf34d35e10f8d3d3b64966aea2/torch/csrc/autograd/function.h#L527

            # Filter out the None entries in saved_tensors.
            saved_tensors = [t for t in ctx.saved_tensors if t is not None]
            torch_interop_utils.clear_grad_fns_for_next_edges(
                first_tensor_output, saved_tensors)
            torch_interop_utils.register_grad_fn(id(ctx), first_tensor_output)
        else:
            # Context must not be present in non-training mode.
            assert ctx is None
        return ctx

    if isinstance(result, torch.Tensor):
        ctx = register_context([result])
        return [ctx, to_dlpack(result)]
    elif isinstance(result, (tuple, list)):
        ctx = register_context(result)
        wrapped = [ctx]
        wrapped.extend(to_dlpack(value) if value is not None else None
                       for value in result)
        # Inside the returned list, the first element is the context and the
        # rest are DLPack tensors.
        return wrapped
    else:
        raise wrap_exception(
            ORTModuleIOError,
            TypeError(f'ORTModule does not support the following model output type {type(result)}.'))

def pytorch_adapter(fun, in1, in2):
    with torch.cuda.stream(torch_stream):
        tin1 = [torch_dlpack.from_dlpack(dltensor) for dltensor in in1]
        tin2 = [torch_dlpack.from_dlpack(dltensor) for dltensor in in2]
        tout1, tout2 = fun(tin1, tin2)
        out1, out2 = [torch_dlpack.to_dlpack(tout) for tout in tout1], \
                     [torch_dlpack.to_dlpack(tout) for tout in tout2]
    torch_stream.synchronize()
    return out1, out2

def predict(self, input_data, max_seq_len=128, batch_size=32, threshold=0.5):
    """
    Predict the class with the trained model

    :param input_data: input text data for prediction
    :type input_data: cudf.Series
    :param max_seq_len: Limits the length of the sequence returned by the
        tokenizer. If the tokenized sentence is shorter than max_seq_len, the
        output will be padded with 0s. If the tokenized sentence is longer
        than max_seq_len it will be truncated to max_seq_len.
    :type max_seq_len: int
    :param batch_size: batch size
    :type batch_size: int
    :param threshold: results with probabilities higher than this will be
        labeled as positive
    :type threshold: float
    :return: predictions, probabilities: predictions are labels (0 or 1)
        based on minimum threshold
    :rtype: cudf.Series, cudf.Series

    Examples
    --------
    >>> from cuml.preprocessing.model_selection import train_test_split
    >>> emails_train, emails_test, labels_train, labels_test = train_test_split(train_emails_df, 'label', train_size=0.8)
    >>> sc.train_model(emails_train, labels_train)
    >>> predictions = sc.predict(emails_test, threshold=0.8)
    """
    predict_inputs, predict_masks = self._bert_uncased_tokenize(
        input_data, max_seq_len)
    predict_inputs = predict_inputs.type(torch.LongTensor).to(self._device)
    predict_masks = predict_masks.to(self._device)

    predict_data = TensorDataset(predict_inputs, predict_masks)
    predict_sampler = SequentialSampler(predict_data)
    predict_dataloader = DataLoader(predict_data,
                                    sampler=predict_sampler,
                                    batch_size=batch_size)

    preds = cudf.Series()
    probs = cudf.Series()
    for batch in predict_dataloader:
        batch = tuple(t.to(self._device) for t in batch)
        b_input_ids, b_input_masks = batch
        with torch.no_grad():
            logits = self._model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_masks)[0]
            b_probs = torch.sigmoid(logits[:, 1])
            b_preds = b_probs.ge(threshold)

        b_probs = cudf.io.from_dlpack(to_dlpack(b_probs))
        b_preds = cudf.io.from_dlpack(to_dlpack(b_preds))
        preds = preds.append(b_preds)
        probs = probs.append(b_probs)

    return preds, probs

def evaluate_model(self, test_data, labels, max_seq_len=128, batch_size=32):
    """
    Evaluate trained model

    :param test_data: test data to evaluate model
    :type test_data: cudf.Series
    :param labels: labels for each element in test_data
    :type labels: cudf.Series
    :param max_seq_len: Limits the length of the sequence returned by the
        tokenizer. If the tokenized sentence is shorter than max_seq_len, the
        output will be padded with 0s. If the tokenized sentence is longer
        than max_seq_len it will be truncated to max_seq_len.
    :type max_seq_len: int
    :param batch_size: batch size
    :type batch_size: int

    Examples
    --------
    >>> from cuml.preprocessing.model_selection import train_test_split
    >>> emails_train, emails_test, labels_train, labels_test = train_test_split(train_emails_df, 'label', train_size=0.8)
    >>> sc.evaluate_model(emails_test, labels_test)
    """
    self._model.eval()
    test_gdf = cudf.DataFrame()
    test_gdf["text"] = test_data
    test_gdf["label"] = labels

    test_dataset = Dataset(test_gdf)
    test_dataloader = DataLoader(test_dataset, batchsize=batch_size)

    eval_accuracy = 0
    nb_eval_steps = 0
    for df in test_dataloader.get_chunks():
        b_input_ids, b_input_mask = self._bert_uncased_tokenize(
            df["text"], max_seq_len)
        b_labels = torch.tensor(df["label"].to_numpy())
        with torch.no_grad():
            logits = self._model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask)[0]

        logits = logits.type(torch.DoubleTensor).to(self._device)
        logits = cupy.fromDlpack(to_dlpack(logits))
        label_ids = b_labels.type(torch.IntTensor).to(self._device)
        label_ids = cupy.fromDlpack(to_dlpack(label_ids))

        temp_eval_accuracy = self._flatten_accuracy(logits, label_ids)
        eval_accuracy += temp_eval_accuracy
        nb_eval_steps += 1

    accuracy = eval_accuracy / nb_eval_steps
    return float(accuracy)

def _train(self, train_dataloader, validation_dataloader, model, epochs):
    model.train()  # Enable training mode
    for _ in trange(epochs, desc="Epoch"):
        # Tracking variables
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(self._device) for t in batch)  # Add batch to GPU
            b_input_ids, b_input_mask, b_labels = batch  # Unpack the inputs from dataloader
            self._optimizer.zero_grad()  # Clear out the gradients
            loss = model(b_input_ids,
                         token_type_ids=None,
                         attention_mask=b_input_mask,
                         labels=b_labels)[0]  # Forward pass
            loss.sum().backward()
            self._optimizer.step()  # Update parameters
            tr_loss += loss.sum().item()  # Get a numeric value
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

        print("Train loss: {}".format(tr_loss / nb_tr_steps))

        # Put model in evaluation mode to evaluate loss on the validation set
        model.eval()
        eval_accuracy = 0
        nb_eval_steps = 0
        for batch in validation_dataloader:
            batch = tuple(t.to(self._device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            # Telling the model not to compute or store gradients,
            # saving memory and speeding up validation
            with torch.no_grad():
                # Forward pass, calculate logit predictions
                logits = model(b_input_ids,
                               token_type_ids=None,
                               attention_mask=b_input_mask)[0]

            logits = cupy.fromDlpack(to_dlpack(logits))
            label_ids = cupy.fromDlpack(to_dlpack(b_labels))

            temp_eval_accuracy = self._flatten_accuracy(logits, label_ids)
            eval_accuracy += temp_eval_accuracy
            nb_eval_steps += 1

        print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))

    return model

def forward(ctx, input, proj):
    batchsize = input.shape[0]
    ctx.save_for_backward(input)
    if input.shape[1] != 1:
        raise NotImplementedError

    sp = Config.getScanParam()
    out = input.clone()
    for i in range(batchsize):
        cupy_input = fromDlpack(to_dlpack(out[i, 0, :, :]))
        cupy_proj = fromDlpack(to_dlpack(proj[i, 0, :, :]))
        CTOperator.SART2D(cupy_proj, sp, sp['order'], cupy_input)
    return out

def __call__(self, loc, fg_score, anchor, img_size, scale=1.):
    """
    Args:
        - loc: (N, 4)
        - fg_score: (N,)
        - anchor: (9, 4)
        - img_size: (2,)
    """
    if self.parent_model.training:
        n_pre_nms = self.n_train_pre_nms
        n_post_nms = self.n_train_post_nms
    else:
        n_pre_nms = self.n_test_pre_nms
        n_post_nms = self.n_test_post_nms

    loc = cp.fromDlpack(to_dlpack(loc))
    fg_score = cp.fromDlpack(to_dlpack(fg_score))
    anchor = cp.asarray(anchor)
    roi = loc2bbox(anchor, loc)

    # clip to image boundaries
    roi[:, slice(0, 4, 2)] = cp.clip(roi[:, slice(0, 4, 2)], 0, img_size[1])
    roi[:, slice(1, 4, 2)] = cp.clip(roi[:, slice(1, 4, 2)], 0, img_size[0])

    # remove boxes smaller than the threshold
    min_size = self.min_size * scale
    hs = roi[:, 3] - roi[:, 1]
    ws = roi[:, 2] - roi[:, 0]
    keep = cp.where((hs > min_size) & (ws > min_size))[0]
    roi = roi[keep, :]
    fg_score = fg_score[keep]

    # sort by score
    order = cp.argsort(fg_score.ravel())[::-1]
    if n_pre_nms > 0:
        order = order[0:n_pre_nms]
    roi = roi[order, :]

    keep = non_maximum_suppression(cp.ascontiguousarray(cp.asarray(roi)),
                                   thresh=self.nms_thresh)
    if n_post_nms > 0:
        keep = keep[:n_post_nms]
    roi = roi[keep]
    return roi

def _to_o3d(tensor, min_ndim=0, max_len=None):
    """Convert TensorFlow, PyTorch and NumPy tensors to Open3D tensors without
    copying. Python lists and tuples are also accepted, but will be copied.

    If max_len is specified, a tuple of tensors is returned, with the input
    split into tuple elements along the first dimension. This allows
    converting a batch of variable size data.

    Args:
        tensor: Input tensor to be converted.
        min_ndim (int): Tensor shape will be padded with ones on the left to
            reach this minimum number of dimensions.
        max_len (int): The max size along the first dimension. Other data
            will be discarded.
    """
    if max_len is not None:
        return tuple(
            _to_o3d(tensor[k], min_ndim - 1)
            for k in range(min(max_len, len(tensor))))

    if isinstance(tensor, o3d.core.Tensor):
        pass
    elif tf is not None and isinstance(tensor, tf.Tensor):
        tensor = o3d.core.Tensor.from_dlpack(tf_dlpack.to_dlpack(tensor))
    elif torch is not None and isinstance(tensor, torch.Tensor):
        tensor = o3d.core.Tensor.from_dlpack(torch_dlpack.to_dlpack(tensor))
    else:
        tensor = o3d.core.Tensor.from_numpy(np.asarray(tensor))

    exp_shape = tensor.shape
    for _ in range(tensor.ndim, min_ndim):
        exp_shape.insert(0, 1)
    return tensor.reshape(exp_shape)

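
# A minimal usage sketch for the _to_o3d() helper above, assuming it lives in
# a module where numpy (np), torch, torch_dlpack and o3d are already imported
# as in the function body. It only exercises the NumPy and PyTorch branches;
# the point cloud data is illustrative.
import numpy as np
import torch

pts_np = np.zeros((5, 3), dtype=np.float32)
pts_torch = torch.rand(5, 3)

o3d_from_np = _to_o3d(pts_np)                    # NumPy branch (from_numpy)
o3d_from_torch = _to_o3d(pts_torch, min_ndim=3)  # DLPack branch, padded to (1, 5, 3)
print(o3d_from_np.shape, o3d_from_torch.shape)
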
def test_dlpack_tensor_list_cpu_direct_creation():
    arr = torch.rand(size=[3, 5, 6], device="cpu")
    tensor_list = TensorListCPU(to_dlpack(arr), "NHWC")
    dali_torch_tensor = convert_to_torch(tensor_list,
                                         device=arr.device,
                                         dtype=arr.dtype)
    assert torch.all(arr.eq(dali_torch_tensor))

def test_dlpack_tensor_list_gpu_to_cpu():
    arr = torch.rand(size=[3, 5, 6], device="cuda")
    tensor_list = TensorListGPU(to_dlpack(arr), "NHWC")
    dali_torch_tensor = convert_to_torch(tensor_list,
                                         device=arr.device,
                                         dtype=arr.dtype)
    assert torch.all(arr.cpu().eq(dali_torch_tensor.cpu()))

def test_dlpack_tensor_gpu_direct_creation():
    arr = torch.rand(size=[3, 5, 6], device="cuda")
    tensor = TensorGPU(to_dlpack(arr))
    dali_torch_tensor = convert_to_torch(tensor,
                                         device=arr.device,
                                         dtype=arr.dtype)
    assert torch.all(arr.eq(dali_torch_tensor))

def update_W(W, a_last, z, u, rho):
    size = dist.get_world_size()
    rank = dist.get_rank()

    # convert to pytorch data
    # update W
    temp1 = z + u / rho
    temp1 = from_dlpack(toDlpack(temp1))
    a_last = from_dlpack(toDlpack(a_last))
    data1 = torch.mm(temp1, torch.t(a_last))
    data2 = torch.mm(a_last, torch.t(a_last))
    data = torch.cat((data1, data2), 0)

    # data = comm.reduce(data, op=MPI.SUM, root=0)
    dist.reduce(data, dst=0, op=dist.ReduceOp.SUM)
    if rank == 0:
        middle_pos = data1.shape[0]
        data1 = data[0:middle_pos]
        data2 = data[middle_pos:]
        inverse_data = torch.pinverse(data2)
        W = torch.mm(data1, inverse_data)
    else:
        W = from_dlpack(toDlpack(W))
        # W = None
    dist.broadcast(W, src=0)

    # convert to cupy data
    W = fromDlpack(to_dlpack(W))
    return W

def execution_session_run_forward(execution_session, onnx_model, device, *inputs):
    """Runs the forward graph on execution_session with given model inputs and device"""

    # Assert that the input and model device match
    _utils._check_same_device(device, "Input argument to forward", *inputs)

    # TODO: Try to reuse the output buffers as some of the output tensors are same sizes,
    #       especially the backward graph outputs.
    # REVIEW(codemzs): Consolidate Training Agent with InferenceAgent on C++ side to not
    #                  have the need for passing IOBinding.
    state = C.PartialGraphExecutionState()
    forward_inputs = C.OrtValueVector()
    forward_inputs.reserve(len(inputs))
    for input in inputs:
        forward_inputs.push_back(to_dlpack(input), input.dtype == torch.bool)

    forward_outputs = C.OrtValueVector()
    # Run and return module outputs.
    execution_session.run_forward(forward_inputs, forward_outputs, state)
    user_outputs = tuple(_utils._ortvalue_to_torch_tensor(forward_output)
                         for forward_output in forward_outputs)

    output_info = [(output.shape, output.device, output.dtype)
                   for output in user_outputs]
    run_info = RunStateInfo(state, output_info)
    # Return user outputs and forward run information
    return user_outputs, run_info

def th2nd():
    ans = np.array([[1., 1., 1., 1.],
                    [0., 0., 0., 0.],
                    [0., 0., 0., 0.]])
    x = th.zeros((3, 4))
    dl = dlpack.to_dlpack(x)
    y = nd.from_dlpack(dl)
    x[0] = 1
    assert np.allclose(y.asnumpy(), ans)

def test_dlpack_gpu_tensors(self):
    # Test different dtypes
    pytorch_dtypes = [
        torch.float16, torch.float32, torch.float64, torch.int8,
        torch.int16, torch.int32, torch.int64, torch.uint8, torch.bool
    ]

    for pytorch_dtype in pytorch_dtypes:
        pytorch_tensor = torch.rand([100], dtype=torch.float16,
                                    device='cuda') * 100
        pytorch_tensor = pytorch_tensor.type(pytorch_dtype)
        dlpack_tensor = to_dlpack(pytorch_tensor)
        pb_tensor = pb_utils.Tensor.from_dlpack('test_tensor', dlpack_tensor)

        # Convert the tensor back to DLPack and ensure that both tensors are
        # the same
        pytorch_tensor_dlpack = from_dlpack(pb_tensor.to_dlpack())
        self.assertTrue(torch.all(pytorch_tensor_dlpack == pytorch_tensor))

        # DLPack does not properly support bool type:
        # https://github.com/google/jax/issues/4719
        if pytorch_dtype != torch.bool:
            self.assertTrue(
                pytorch_tensor.type() == pytorch_tensor_dlpack.type())
        else:
            self.assertFalse(
                pytorch_tensor.type() == pytorch_tensor_dlpack.type())

def torch2cupy(tensor):
    """
    :param tensor: PyTorch CUDA tensor.
    :return: CuPy tensor.
    """
    dx = to_dlpack(tensor)
    return cupy.fromDlpack(dx)
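

# A minimal round-trip sketch for the torch2cupy() helper above. It assumes a
# CUDA-enabled PyTorch and CuPy install; the cupy2torch() counterpart shown
# here is illustrative and not part of the original snippet.
import cupy
import torch
from torch.utils.dlpack import from_dlpack, to_dlpack


def cupy2torch(array):
    """Convert a CuPy array to a PyTorch CUDA tensor (memory is shared)."""
    return from_dlpack(array.toDlpack())


t = torch.rand(4, device='cuda')
c = torch2cupy(t)    # CuPy view of the same GPU buffer
t2 = cupy2torch(c)   # back to PyTorch without copying
assert torch.equal(t, t2)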