Code Example #1
def _convert_df_to_output_type(df, input_type):
    """
    Given a cudf.DataFrame df, convert it to a new type appropriate for the
    graph algos in this module, based on input_type.
    """
    if input_type in [Graph, DiGraph]:
        return df

    elif (nx is not None) and (input_type in [nx.Graph, nx.DiGraph]):
        return df.to_pandas()

    elif (cp is not None) and \
         (input_type in [cp_coo_matrix, cp_csr_matrix, cp_csc_matrix]):
        # A CuPy/SciPy input means the return value will be a 2-tuple of:
        #   distance: cupy.ndarray
        #   predecessor: cupy.ndarray
        sorted_df = df.sort_values("vertex")
        distances = cp.fromDlpack(sorted_df["distance"].to_dlpack())
        preds = cp.fromDlpack(sorted_df["predecessor"].to_dlpack())
        if "sp_counter" in df.columns:
            return (distances, preds,
                    cp.fromDlpack(sorted_df["sp_counter"].to_dlpack()))
        else:
            return (distances, preds)

    else:
        raise TypeError(f"input type {input_type} is not a supported type.")
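A hedged usage sketch for the CuPy branch of the converter above. It assumes this module's own imports are in scope (`cp_coo_matrix` as the module's alias for cupyx.scipy.sparse.coo_matrix); the DataFrame mimics an SSSP-style result and is illustrative only:

import cudf

df = cudf.DataFrame({"vertex": [1, 2, 0],
                     "distance": [1.0, 2.0, 0.0],
                     "predecessor": [0, 1, -1]})
distances, preds = _convert_df_to_output_type(df, cp_coo_matrix)
# both are cupy.ndarrays, ordered by vertex id thanks to the sort_values call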
Code Example #2
File: vizb.py Project: davidenoma/GPU-GWAS
def show_manhattan_plot(df, group_by, x_axis, y_axis):
    chroms = df[group_by].unique().to_array()

    manhattan_fig = figure()

    start_position = -0.5
    for chrom in chroms:
        query = '%s == %s' % (group_by, chrom)
        cdf = df.query(query)

        x_array = cupy.fromDlpack(cdf[x_axis].to_dlpack()) + start_position
        y_array = cupy.fromDlpack(cdf[y_axis].to_dlpack())

        manhattan_fig.circle(x_array.get(),
                             y_array.get(),
                             size=2,
                             color='orange' if
                             (start_position - 0.5) % 2 == 0 else 'gray',
                             alpha=0.5)

        start_position += 1

    manhattan_handle = show(manhattan_fig, notebook_handle=True)
    push_notebook(handle=manhattan_handle)
    return manhattan_fig
Code Example #3
    def test_multiple_consumption_error(self):
        # Prevent segfault, see #3611
        array = cupy.empty(10)
        tensor = array.toDlpack()
        array2 = cupy.fromDlpack(tensor)  # noqa
        with pytest.raises(ValueError) as e:
            array3 = cupy.fromDlpack(tensor)  # noqa
        assert 'consumed multiple times' in str(e.value)
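The test pins down that a DLPack capsule is single-use: once consumed, it cannot be imported again. A minimal sketch of the safe pattern, assuming only that the producer can export a fresh capsule per consumer:

import cupy

array = cupy.empty(10)
# export a new capsule for each consumer instead of reusing one
view_a = cupy.fromDlpack(array.toDlpack())
view_b = cupy.fromDlpack(array.toDlpack())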
Code Example #4
def cupy_adapter_sync(fun, in1, in2):
    with cupy_stream:
        tin1 = [cupy.fromDlpack(dltensor) for dltensor in in1]
        tin2 = [cupy.fromDlpack(dltensor) for dltensor in in2]
        tout1, tout2 = fun(tin1, tin2)
        out1, out2 = [tout.toDlpack() for tout in tout1], \
                     [tout.toDlpack() for tout in tout2]
    cupy_stream.synchronize()
    return out1, out2
Code Example #5
    def getConnectionMatrix(self) -> csr_matrix:
        distances = cupy.ravel(cupy.fromDlpack(self.D.to_dlpack()))
        indices = cupy.ravel(cupy.fromDlpack(self.I.to_dlpack()))
        n_samples = indices.shape[0]
        n_nonzero = n_samples * self.nneighbors
        rowptr = cupy.arange(0, n_nonzero + 1, self.nneighbors)
        knn_graph = cupyx.scipy.sparse.csr_matrix((distances, indices, rowptr),
                                                  shape=(n_samples, n_samples))
        print(f"Completed KNN, sparse graph shape = {knn_graph.shape}")
        return knn_graph
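The rowptr construction works because every row of a KNN graph holds exactly nneighbors entries, so the CSR offsets advance in fixed steps. A toy illustration (the names here are hypothetical, not from the project):

import cupy

nneighbors, n_rows = 2, 3
rowptr = cupy.arange(0, n_rows * nneighbors + 1, nneighbors)  # [0 2 4 6]
# row i's neighbors occupy indices rowptr[i]:rowptr[i + 1] of the flat arrays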
Code Example #6
File: sequence_classifier.py Project: zeblok/clx
    def _train(self, train_dataloader, validation_dataloader, model, epochs):
        model.train()  # Enable training mode
        for _ in trange(epochs, desc="Epoch"):
            tr_loss = 0  # Tracking variables
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(self._device)
                              for t in batch)  # Add batch to GPU
                b_input_ids, b_input_mask, b_labels = batch  # Unpack the inputs from dataloader
                self._optimizer.zero_grad()  # Clear out the gradients
                loss = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)[0]  # forward pass

                loss.sum().backward()
                self._optimizer.step()  # update parameters
                tr_loss += loss.sum().item()  # get a numeric value
                nb_tr_examples += b_input_ids.size(0)
                nb_tr_steps += 1

            print("Train loss: {}".format(tr_loss / nb_tr_steps))

            model.eval()  # Put model in evaluation mode to evaluate loss on the validation set

            eval_accuracy = 0
            nb_eval_steps = 0

            for batch in validation_dataloader:
                batch = tuple(t.to(self._device) for t in batch)

                b_input_ids, b_input_mask, b_labels = batch

                with torch.no_grad():  # don't compute or store gradients: saves memory and speeds up validation
                    logits = model(
                        b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)[
                            0]  # Forward pass, calculate logit predictions
                logits = cupy.fromDlpack(to_dlpack(logits))
                label_ids = cupy.fromDlpack(to_dlpack(b_labels))
                # logits = logits.detach().cpu().numpy()
                # label_ids = b_labels.to('cpu').numpy()
                temp_eval_accuracy = self._flatten_accuracy(logits, label_ids)

                eval_accuracy += temp_eval_accuracy
                nb_eval_steps += 1

            print("Validation Accuracy: {}".format(eval_accuracy /
                                                   nb_eval_steps))

        return model
Code Example #7
File: sequence_classifier.py Project: rapidsai/clx
    def evaluate_model(self,
                       test_data,
                       labels,
                       max_seq_len=128,
                       batch_size=32):
        """
        Evaluate trained model

        :param test_data: test data to evaluate model
        :type test_data: cudf.Series
        :param labels: labels for each element in test_data
        :type labels: cudf.Series
        :param max_seq_len: Limits the length of the sequence returned by the
            tokenizer. If the tokenized sentence is shorter than max_seq_len,
            the output is padded with 0s; if it is longer, it is truncated to
            max_seq_len.
        :type max_seq_len: int
        :param batch_size: batch size
        :type batch_size: int

        Examples
        --------
        >>> from cuml.preprocessing.model_selection import train_test_split
        >>> emails_train, emails_test, labels_train, labels_test = train_test_split(train_emails_df, 'label', train_size=0.8)
        >>> sc.evaluate_model(emails_test, labels_test)
        """
        self._model.eval()
        test_gdf = cudf.DataFrame()
        test_gdf["text"] = test_data
        test_gdf["label"] = labels

        test_dataset = Dataset(test_gdf)
        test_dataloader = DataLoader(test_dataset, batchsize=batch_size)

        eval_accuracy = 0
        nb_eval_steps = 0
        for df in test_dataloader.get_chunks():
            b_input_ids, b_input_mask = self._bert_uncased_tokenize(
                df["text"], max_seq_len)
            b_labels = torch.tensor(df["label"].to_numpy())
            with torch.no_grad():
                logits = self._model(b_input_ids,
                                     token_type_ids=None,
                                     attention_mask=b_input_mask)[0]

            logits = logits.type(torch.DoubleTensor).to(self._device)
            logits = cupy.fromDlpack(to_dlpack(logits))
            label_ids = b_labels.type(torch.IntTensor).to(self._device)
            label_ids = cupy.fromDlpack(to_dlpack(label_ids))
            temp_eval_accuracy = self._flatten_accuracy(logits, label_ids)

            eval_accuracy += temp_eval_accuracy
            nb_eval_steps += 1

        accuracy = eval_accuracy / nb_eval_steps

        return float(accuracy)
Code Example #8
    def __call__(self, loc, fg_score, anchor, img_size, scale=1.):
        """
        Args:
         - loc: (N,4)
         - fg_score: (N,)
         - anchor: (9, 4)
         - img_size: (2)
        """

        if self.parent_model.training:
            n_pre_nms = self.n_train_pre_nms
            n_post_nms = self.n_train_post_nms
        else:
            n_pre_nms = self.n_test_pre_nms
            n_post_nms = self.n_test_post_nms

        loc = cp.fromDlpack(to_dlpack(loc))
        fg_score = cp.fromDlpack(to_dlpack(fg_score))
        anchor = cp.asarray(anchor)
        roi = loc2bbox(anchor, loc)

        # clip
        roi[:, slice(0, 4, 2)] = cp.clip(roi[:, slice(0, 4, 2)], 0,
                                         img_size[1])
        roi[:, slice(1, 4, 2)] = cp.clip(roi[:, slice(1, 4, 2)], 0,
                                         img_size[0])

        # remove small box less than threshold
        min_size = self.min_size * scale
        hs = roi[:, 3] - roi[:, 1]
        ws = roi[:, 2] - roi[:, 0]
        keep = cp.where((hs > min_size) & (ws > min_size))[0]
        roi = roi[keep, :]
        fg_score = fg_score[keep]

        # sort the score
        order = cp.argsort(fg_score.ravel())[::-1]
        if n_pre_nms > 0:
            order = order[0:n_pre_nms]
        roi = roi[order, :]

        keep = non_maximum_suppression(cp.ascontiguousarray(cp.asarray(roi)),
                                       thresh=self.nms_thresh)

        if n_post_nms > 0:
            keep = keep[:n_post_nms]
        roi = roi[keep]
        return roi
Code Example #9
File: para_func.py Project: Proxitrone/admm_for_dnn
def update_W(W, a_last, z, u, rho):
    size = dist.get_world_size()
    rank = dist.get_rank()
    # convert to pytorch data

    # update W
    temp1 = z + u/rho

    temp1 = from_dlpack(ndar.toDlpack(temp1))
    a_last = from_dlpack(ndar.toDlpack(a_last))

    data1 = torch.mm(temp1, torch.t(a_last))
    data2 = torch.mm(a_last, torch.t(a_last))
    data = torch.cat((data1, data2), 0)
    # data = comm.reduce(data, op=MPI.SUM, root=0)
    dist.reduce(data, dst=0, op=dist.ReduceOp.SUM)

    if rank == 0:
        middle_pos = data1.shape[0]
        data1 = data[0: middle_pos]
        data2 = data[middle_pos:]
        inverse_data = torch.pinverse(data2)
        W = torch.mm(data1, inverse_data)
    else:
        W = from_dlpack(ndar.toDlpack(W))
        # W = None
    dist.broadcast(W, src=0)

    # convert to cupy data
    W = fromDlpack(to_dlpack(W))
    return W
Code Example #10
    def calculate(self, **kwargs):
        smiles_dataset = kwargs['smiles_dataset']
        fingerprint_dataset = kwargs['fingerprint_dataset']
        properties = kwargs['properties']
        estimator = kwargs['estimator']
        param_dict = kwargs['param_dict']

        embeddings = self.sample_many(smiles_dataset,
                                      zero_padded_vals=False,
                                      average_tokens=True)
        embeddings = cupy.asarray(embeddings, dtype=cupy.float32)

        fingerprints = cupy.fromDlpack(fingerprint_dataset.data.to_dlpack())
        fingerprints = cupy.asarray(fingerprints,
                                    order='C',
                                    dtype=cupy.float32)

        metric, fingerprint_errors, embedding_errors = self._calculate_metric(
            embeddings, fingerprints, properties, estimator, param_dict)
        logger.info(
            f'{type(metric)}  {type(fingerprint_errors)} {type(embedding_errors)}'
        )
        metric = cupy.nanmean(metric)
        fingerprint_errors = cupy.nanmean(fingerprint_errors)
        embedding_errors = cupy.nanmean(embedding_errors)

        return pd.Series({
            'name': self.name,
            'value': metric,
            'fingerprint_error': fingerprint_errors,
            'embedding_error': embedding_errors
        })
Code Example #11
def convert_to_cupy_array(input_data):
    """Convert Tensor data to a CuPy Array.

    This method converts input tensor data to a cupy array.

    Parameters
    ----------
    input_data : torch.Tensor
        Input Tensor to be converted

    Returns
    -------
    cupy.ndarray
        The tensor data as a CuPy array

    Raises
    ------
    ImportError
        If Torch package not found

    """
    if not import_torch:
        raise ImportError(
            'Required version of Torch package not found; ' +
            'see documentation for details: https://cea-cosmic.' +
            'github.io/ModOpt/#optional-packages', )

    if input_data.is_cuda:
        return cp.fromDlpack(torch_to_dlpack(input_data))

    return input_data.detach().numpy()
Code Example #12
File: nccl_util.py Project: zivzone/ray
def copy_tensor(dst_tensor, src_tensor):
    """Copy the content from src_tensor to dst_tensor.

    Args:
        dst_tensor: the tensor to copy to.
        src_tensor: the tensor to copy from.

    Returns:
        None
    """
    copied = True
    if isinstance(dst_tensor, cupy.ndarray) \
            and isinstance(src_tensor, cupy.ndarray):
        cupy.copyto(dst_tensor, src_tensor)
    elif torch_available():
        if isinstance(dst_tensor, torch.Tensor) and isinstance(
                src_tensor, torch.Tensor):
            dst_tensor.copy_(src_tensor)
        elif isinstance(dst_tensor, torch.Tensor) and isinstance(
                src_tensor, cupy.ndarray):
            t = torch.utils.dlpack.from_dlpack(src_tensor.toDlpack())
            dst_tensor.copy_(t)
        elif isinstance(dst_tensor, cupy.ndarray) and isinstance(
                src_tensor, torch.Tensor):
            t = cupy.fromDlpack(torch.utils.dlpack.to_dlpack(src_tensor))
            cupy.copyto(dst_tensor, t)
        else:
            copied = False
    else:
        copied = False
    if not copied:
        raise ValueError(
            "Unsupported tensor type. Got: {} and {}. Supported "
            "GPU tensor types are: torch.Tensor, cupy.ndarray.".format(
                type(dst_tensor), type(src_tensor)))
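A hedged usage sketch of the dispatch above (assumes a CUDA device is available; the values are illustrative):

import cupy
import torch

dst = cupy.zeros(4, dtype=cupy.float32)
src = torch.arange(4, dtype=torch.float32, device="cuda")
copy_tensor(dst, src)  # torch -> cupy path: DLPack export, then cupy.copyto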
Code Example #13
File: vizb.py Project: davidenoma/GPU-GWAS
def show_qq_plot(df, x_axis, y_axis):

    x_values = cupy.fromDlpack(df[x_axis].to_dlpack())
    y_values = cupy.fromDlpack(df[y_axis].to_dlpack())

    x_max = cupy.max(x_values).tolist()
    y_max = cupy.max(y_values).tolist()

    qq_fig = figure(x_range=(0, x_max), y_range=(0, y_max))
    qq_fig.circle(-cupy.log10(x_values + 1e-10).get(),
                  -cupy.log10(y_values).get())
    qq_fig.line([0, x_max], [0, y_max])

    qq_handle = show(qq_fig, notebook_handle=True)
    push_notebook(handle=qq_handle)
    return qq_fig
Code Example #14
import cupy
from torch.utils.dlpack import to_dlpack


def torch2cupy(tensor):
    """
    :param tensor: PyTorch CUDA tensor.
    :return: CuPy tensor.
    """
    dx = to_dlpack(tensor)
    return cupy.fromDlpack(dx)
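The reverse direction is symmetric; a minimal sketch (cupy2torch is a hypothetical name, not from the same project):

from torch.utils.dlpack import from_dlpack


def cupy2torch(array):
    """
    :param array: CuPy array on the GPU.
    :return: PyTorch CUDA tensor sharing the same memory (zero-copy).
    """
    return from_dlpack(array.toDlpack())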
Code Example #15
    def get_embedding(self, text, show_tokens=False):
        bert_tokens = self.tokenizer.tokenize(text)
        ids = self.tokenizer.convert_tokens_to_ids(
            ['[CLS]'] + bert_tokens[:(self.window_size - 2)] + ['[SEP]'])
        tokens_tensor = torch.tensor(ids).reshape(1, -1).to(self.DEVICE)
        self.model.eval()

        with torch.no_grad():
            all_encoder_layers, _ = self.model(tokens_tensor)
            embedding = all_encoder_layers[0]

        if available:
            xp_embedding = xp.fromDlpack(to_dlpack(embedding))

        else:
            xp_embedding = embedding.numpy()

        if xp_embedding.shape[0] < self.window_size:
            xp_embedding = xp.concatenate([
                xp_embedding,
                xp.zeros(((self.window_size - xp_embedding.shape[0]), 768))
            ], 0)

        if show_tokens:
            return (xp_embedding, ['[CLS]'] +
                    bert_tokens[:(self.window_size - 2)] + ['[SEP]'])
        else:
            return xp_embedding
Code Example #16
    def forward(ctx, x, forw, adj, pylops, device):
        ctx.forw = forw
        ctx.adj = adj
        ctx.pylops = pylops
        ctx.device = device

        # prepare input
        if ctx.pylops:
            if ctx.device == 'cpu':
                # bring x to cpu and numpy
                x = x.cpu().detach().numpy()
            else:
                # pass x to cupy using DLPack
                x = cp.fromDlpack(to_dlpack(x))

        # apply forward operator
        y = ctx.forw(x)

        # prepare output
        if ctx.pylops:
            if ctx.device == 'cpu':
                # move y back to a torch tensor on the CPU
                y = torch.from_numpy(y)
            else:
                # move y back to torch via DLPack, staying on the GPU
                y = from_dlpack(y.toDlpack())
        return y
Code Example #17
File: CPUCupyPinned.py Project: y-hann/SpeedTorch
    def afterOptimizerStep(self,
                           retrievedPosIndexes,
                           retrievedNegIndexes=None):
        reshapedRetrieval = self._getReshapedRetrieval(retrievedPosIndexes,
                                                       retrievedNegIndexes)

        self.CUPYmemmap[reshapedRetrieval] = (cupy.fromDlpack(
            to_dlpack(self.model_variable.weight.data)))
Code Example #18
def correlate(ip,
              weight,
              padding,
              stride,
              kernel_size,
              op_size=None,
              flip=False,
              dim_switch=False):
    if flip:
        w_transform = torch.flip(weight, torch.arange(weight.dim()).tolist())

    if dim_switch:
        ip = torch.transpose(ip, 1, 0)
        # calculate padding
        tmp = ((op_size - 1) * stride + kernel_size - ip.size(3))
        padding = math.floor(max(tmp, 0) / 2)

    if flip or dim_switch:
        weight = torch.transpose(weight, 1, 0)

    o_channels = weight.size(0)
    ip_unfold = torch.nn.functional.unfold(ip, (kernel_size, kernel_size),
                                           padding=padding,
                                           stride=stride)
    w_unfold = weight.contiguous().view(o_channels, -1)

    # print("op_size :", op_size)
    # print("stride : ", stride)
    # print("padding :", padding)
    # print("kernel_size :", kernel_size)
    # print("o_channels :", o_channels)
    # print("input :", ip.size())
    # print("input_unfold :", ip_unfold.size())
    # print("weight :", weight.size())
    # print("weight_unfold :", w_unfold.size())
    # print()

    # cupy conversion
    ip_cupy = cp.fromDlpack(to_dlpack(ip_unfold)).astype('int8')
    w_cupy = cp.fromDlpack(to_dlpack(w_unfold)).astype('int8')

    # multiplication in cupy
    op = cp.matmul(cp.transpose(ip_cupy, (0, 2, 1)), cp.transpose(w_cupy))
    op = cp.transpose(op, (0, 2, 1))

    return op
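The unfold-plus-matmul pattern above is the classic im2col reduction of convolution to a single GEMM. A small self-check of that identity in plain PyTorch (CPU floats, no CuPy needed; the shapes are illustrative):

import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 8, 8)
w = torch.randn(4, 3, 3, 3)
cols = F.unfold(x, (3, 3), padding=1, stride=1)    # (1, 27, 64) im2col matrix
out = w.view(4, -1).matmul(cols).view(1, 4, 8, 8)  # convolution as one GEMM
assert torch.allclose(out, F.conv2d(x, w, padding=1), atol=1e-5)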
Code Example #19
File: CUPYLive.py Project: huber1386/SpeedTorch
    def afterOptimizerStep(self, retrievedPosIndexes, retrievedNegIndexes=None):
        torch.cuda.synchronize()
        cupy.cuda.Device().synchronize()

        reshapedRetrieval = self._getReshapedRetrieval(retrievedPosIndexes,
                                                       retrievedNegIndexes)

        self.CUPYcorpus[reshapedRetrieval] = (
            cupy.fromDlpack(to_dlpack(self.model_variable.weight.data)))
Code Example #20
File: NCEAverage.py Project: sforsyth6/MUFL
    def generate_z(self, memory, x, T, outputSize):
        batchSize = x.size(0)
        z_size = int(outputSize // 4) + 1
        z = torch.zeros((batchSize, z_size), dtype=torch.float32).cuda()

        x1 = to_dlpack(x)
        z1 = to_dlpack(z)
        memory1 = to_dlpack(memory)

        c_x = cp.fromDlpack(x1)  # .astype(cp.float32)
        c_z = cp.fromDlpack(z1)
        c_mem = cp.fromDlpack(memory1)  # .astype(cp.float32)

        kernel = self.cuda_kernel_UFL()
        kernel((int(z_size // 64) + 1, (batchSize // 16) + 1), (64, 16),
               (c_x, c_mem, T, c_z, int(z_size), int(batchSize), int(outputSize)))

        return z.sum(dim=-1)  # .double()
Code Example #21
def cuda_correlate(ip, weight, padding, stride, kernel_size,
                   corner_case=False, op_channels=None, dilation=1):
    if corner_case:
        w_transform = torch.flip(weight, torch.arange(weight.dim()).tolist())
    o_channels = weight.size(0)
    ip_unfold = torch.nn.functional.unfold(ip, (kernel_size, kernel_size),
                                           padding=padding, stride=stride)
    w_unfold = weight.view(o_channels, -1)

    # set up output
    batch_size, channels, height, width = ip.size()

    kernel_h, kernel_w = weight.size()[2:]
    output_h = int((height + 2 * padding - (dilation * (kernel_h - 1) + 1)) / stride + 1)
    output_w = int((width + 2 * padding - (dilation * (kernel_w - 1) + 1)) / stride + 1)
    output = ip.new(batch_size, weight.size(0), output_h, output_w)
    op_unfold = output.view(ip.size(0), weight.size(0), output_h * output_w).byte()

    # cupy conversion
    ip_cupy = cp.fromDlpack(to_dlpack(ip_unfold)).astype('int8')
    w_cupy = cp.fromDlpack(to_dlpack(w_unfold)).astype('int8')
    op_cupy = cp.fromDlpack(to_dlpack(op_unfold)).astype('int8')

    # need to pretend like these are transposed, so swapping the values for m, n and k
    # should be: m = ip(1), n = ip(2), k = w(1)
    m = ip_unfold.size(2)
    n = ip_unfold.size(1)
    k = w_unfold.size(0)

    # launch configuration for the custom kernel
    blockSize = 16
    batchNum = ip_unfold.size(0)
    dimBlock = (blockSize, blockSize, 1)
    dimGrid = (int((k + blockSize - 1) / blockSize),
               int((m + blockSize - 1) / blockSize), 1)
    # print("batchNum: ", batchNum, "dimBlock:", dimBlock, "dimGrid:", dimGrid)

    # the custom CUDA GEMM kernel writes its result into op_cupy ...
    f = load_kernel('gpu_matrix_mult', matmul_kernel)
    f(block=dimBlock,
      grid=dimGrid,
      args=[ip_cupy.data.ptr, w_cupy.data.ptr, op_cupy.data.ptr, m, n, k],
      stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

    # ... but the value actually returned is recomputed with cupy's matmul
    op = cp.matmul(cp.transpose(ip_cupy, (0, 2, 1)), cp.transpose(w_cupy))
    op = cp.transpose(op, (0, 2, 1))

    return op
Code Example #22
File: CUPYLive.py Project: huber1386/SpeedTorch
    def afterOptimizerStep(self, retrievedPosIndexes, retrievedNegIndexes=None):
        torch.cuda.synchronize()
        cupy.cuda.Device().synchronize()

        reshapedRetrieval = self._getReshapedRetrieval(retrievedPosIndexes,
                                                       retrievedNegIndexes)

        for idx, optVar in enumerate(self.optVarList):
            self.CUPYcorpi[idx][reshapedRetrieval] = (
                cupy.fromDlpack(to_dlpack(
                    self.given_optimizer.state_dict()['state'][self.optimizerKey][optVar])))
Code Example #23
    def decompress(self, tensor_compressed, shape):
        tensor_compressed, = tensor_compressed
        cupy_tensor = cupy.fromDlpack(to_dlpack(tensor_compressed))
        sign = cupy_tensor > 127
        exps = cupy.bitwise_and(cupy_tensor, 0b01111111)
        floats = cupy.left_shift((exps + 18).astype(cupy.int32), 23).view(cupy.float32)
        tensor_decompressed = cupy.where(sign, -floats, floats)
        tensor_decompressed = cupy.multiply((exps >= 1).astype(cupy.float32), tensor_decompressed)
        return from_dlpack(tensor_decompressed.toDlpack()).view(shape)
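The snippet only shows the decoder: each byte carries a sign bit plus a 7-bit shifted exponent, so every value decodes to a signed power of two (or zero). A hedged sketch of a matching encoder, written here to invert the decoder above; `compress`, its return convention, and the float32 CUDA input are assumptions, not the project's actual code:

import cupy
from torch.utils.dlpack import from_dlpack, to_dlpack


def compress(tensor):
    # round each float32 magnitude down to a power of two and keep
    # only a sign bit plus a 7-bit exponent offset by 18
    t = cupy.fromDlpack(to_dlpack(tensor))
    bits = cupy.abs(t).view(cupy.int32)
    exps = cupy.right_shift(bits, 23) - 18             # undo the decoder's +18
    exps = cupy.clip(exps, 0, 127).astype(cupy.uint8)  # exponent 0 decodes to zero
    sign = (t < 0).astype(cupy.uint8) * 128            # sign goes in the top bit
    compressed = cupy.bitwise_or(exps, sign)
    return (from_dlpack(compressed.toDlpack()),), t.shape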
Code Example #24
def com_reduce(buffer_m: torch.Tensor, rank, world_size, comm):

    tensor_size = torch.numel(buffer_m)
    chunk_size = (tensor_size + world_size - 1) // world_size
    last_chunk_size = tensor_size - chunk_size * (world_size - 1)
    my_chunk_size = last_chunk_size if rank == world_size - 1 else chunk_size

    flatten_buffer_m = buffer_m.flatten()
    flatten_buffer_m_cupy = cupy.fromDlpack(to_dlpack(flatten_buffer_m))

    # First round of communication
    recvbuf = cupy.zeros([world_size, my_chunk_size],
                         dtype=flatten_buffer_m_cupy.dtype)

    requests = []
    for idx in range(world_size):
        start = idx * chunk_size
        length = last_chunk_size if idx == world_size - 1 else chunk_size

        req_sign = comm.Igather(flatten_buffer_m_cupy[start:start + length],
                                recvbuf,
                                root=idx)

        requests.append(req_sign)

    MPI.Request.Waitall(requests)

    # Second round of communication
    recvbuf_flatten = recvbuf.flatten()
    local_reduced_chunk = cupy.zeros(my_chunk_size,
                                     dtype=flatten_buffer_m_cupy.dtype)
    _avg_chunks(recvbuf_flatten, my_chunk_size, world_size,
                local_reduced_chunk)

    # one distinct buffer per peer; [arr] * n would alias a single array
    recvbuf_server = [
        cupy.zeros(chunk_size, dtype=flatten_buffer_m_cupy.dtype)
        for _ in range(world_size - 1)
    ]
    recvbuf_server.append(
        cupy.zeros(last_chunk_size, dtype=flatten_buffer_m_cupy.dtype))
    recvbuf_server[rank] = local_reduced_chunk

    server_requests = []
    for idx in range(world_size):
        if idx != rank:
            req_server_send = comm.Isend(local_reduced_chunk, idx)
            req_server_recv = comm.Irecv(recvbuf_server[idx], idx)

            server_requests.append(req_server_send)
            server_requests.append(req_server_recv)

    MPI.Request.Waitall(server_requests)

    recvbuf_server_flatten = cupy.concatenate(recvbuf_server)
    aggregated_m_tensor = from_dlpack(recvbuf_server_flatten.toDlpack())

    buffer_m.set_(aggregated_m_tensor.type(buffer_m.dtype).view_as(buffer_m))
Code Example #25
def torch_to_xp(input: torch.Tensor) -> np.ndarray:
    # torch Tensor to numpy/cupy ndarray
    if not torch.is_tensor(input):
        raise RuntimeError(f'torch_to_xp expects torch.Tensor as input, but got {type(input)}')

    if IS_CUPY_AVAILABLE and input.is_cuda:
        return cupy.fromDlpack(to_dlpack(input))
    else:
        return input.numpy()
Code Example #26
File: CPUCupyPinned.py Project: y-hann/SpeedTorch
    def afterOptimizerStep(self,
                           retrievedPosIndexes,
                           retrievedNegIndexes=None):
        reshapedRetrieval = self._getReshapedRetrieval(retrievedPosIndexes,
                                                       retrievedNegIndexes)

        for idx, optVar in enumerate(self.optVarList):
            self.CUPYmemmap[idx][reshapedRetrieval] = (cupy.fromDlpack(
                to_dlpack(self.given_optimizer.state_dict()['state'][
                    self.optimizerKey][optVar])))
Code Example #27
def test_to_dlpack_mixed_dtypes():
    df = cudf.DataFrame({"a": [1, 2, 3, 4], "b": [10.32, 0.4, -0.2, -1000.32]})

    cudf_host_array = df.to_numpy()
    dlt = df.to_dlpack()

    cupy_array = cupy.fromDlpack(dlt)
    cupy_host_array = cupy_array.get()

    assert_eq(cudf_host_array, cupy_host_array)
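Going the other way also works through DLPack; a minimal sketch, assuming a 1-D CuPy array (cudf.from_dlpack returns a Series for a 1-D capsule):

import cudf
import cupy

cupy_array = cupy.arange(4, dtype="float64")
series_back = cudf.from_dlpack(cupy_array.toDlpack())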
Code Example #28
def test_to_dlpack_cupy_1d(data_1d):
    expectation = data_size_expectation_builder(data_1d, False)
    with expectation:
        gs = cudf.Series(data_1d, nan_as_null=False)
        cudf_host_array = gs.to_numpy(na_value=np.nan)
        dlt = gs.to_dlpack()

        cupy_array = cupy.fromDlpack(dlt)
        cupy_host_array = cupy_array.get()

        assert_eq(cudf_host_array, cupy_host_array)
Code Example #29
File: test_dlpack.py Project: TravisHester/cudf
def test_to_dlpack_cupy_1d(data_1d):
    expectation = data_size_expectation_builder(data_1d, False)
    with expectation:
        gs = cudf.Series(data_1d, nan_as_null=False)
        cudf_host_array = gs.to_array(fillna="pandas")
        dlt = gs._column.to_dlpack()

        cupy_array = cupy.fromDlpack(dlt)
        cupy_host_array = cupy_array.get()

        assert_eq(cudf_host_array, cupy_host_array)
Code Example #30
    def content(self, input, gan_out):
        input = cp.fromDlpack(to_dlpack(input))
        gan_out = cp.fromDlpack(to_dlpack(gan_out))
        output = []
        for x_it, gan_out_it in zip(input, gan_out):
            ch, w, h = x_it.shape
            gan_out_it = gan_out_it.reshape((ch, -1))
            x_it = x_it.reshape((ch, -1))
            x_it = cp.concatenate((x_it, cp.ones((1, x_it.shape[1]))), axis=0)
            gan_out_it = cp.asarray(gan_out_it)
            x_it_inv = cp.linalg.pinv(x_it)
            weight = cp.dot(gan_out_it, x_it_inv)
            if (abs(weight[:, 3]).mean() > self.max_bias
                    or abs(weight[:3, :3]).mean() < self.min_weight
                ) and self.last_weight is not None:
                print(abs(weight[:, 3]).mean(), abs(weight[:3, :3]).mean())
                weight = self.last_weight.copy()
            else:
                self.last_weight = weight.copy()
            output.append(cp.dot(weight, x_it).reshape((ch, w, h)))
        return from_dlpack(cp.stack(output).toDlpack()).float()
Code Example #31
File: wrappers.py Project: spacy-io/thinc
def torch2xp(torch_tensor):
    if torch_tensor.is_cuda:
        return cupy.fromDlpack(torch.utils.dlpack.to_dlpack(torch_tensor))
    else:
        return torch_tensor.detach().numpy()
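thinc pairs this with an inverse that moves arrays back into torch. A minimal sketch of such an inverse, assuming the same DLPack route (not copied from thinc's source):

import cupy
import torch
import torch.utils.dlpack


def xp2torch(xp_array):
    # CuPy arrays go through DLPack (zero-copy); NumPy arrays through from_numpy
    if isinstance(xp_array, cupy.ndarray):
        return torch.utils.dlpack.from_dlpack(xp_array.toDlpack())
    return torch.from_numpy(xp_array)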