Beispiel #1
0
 def forward(self, x, init_states=None):
     """Run the LSTM recurrence over a batch-first input.

     Args:
         x: input tensor, unpacked as (batch, sequence, feature).
         init_states: optional ``(h_0, c_0)`` pair. When omitted, both
             states start as zeros of shape (batch, hidden_size) moved to
             ``x.device``.

     Returns:
         ``(hidden_seq, (h_t, c_t))``: the per-step hidden states
         concatenated along dim 1, and the states after the last step.
     """
     batch, steps, _ = x.size()
     hs = self.hidden_size
     if init_states is not None:
         h_t, c_t = init_states
     else:
         h_t = flow.zeros((batch, hs)).to(x.device)
         c_t = flow.zeros((batch, hs)).to(x.device)

     outputs = []
     for step in range(steps):
         frame = x[:, step, :].reshape(x.shape[0], x.shape[2])
         # A single fused projection holds the four gates side by side.
         fused = flow.matmul(frame, self.W) + flow.matmul(h_t, self.U) + self.bias
         in_gate = flow.sigmoid(fused[:, :hs])
         forget_gate = flow.sigmoid(fused[:, hs : hs * 2])
         candidate = flow.tanh(fused[:, hs * 2 : hs * 3])
         out_gate = flow.sigmoid(fused[:, hs * 3 :])
         c_t = forget_gate * c_t + in_gate * candidate
         h_t = out_gate * flow.tanh(c_t)
         outputs.append(h_t.unsqueeze(1))
     return flow.cat(outputs, dim=1), (h_t, c_t)
Beispiel #2
0
    def forward(self, x, init_states=None):
        """Run the LSTM recurrence over a sequence-first input.

        Args:
            x: input tensor, unpacked as (sequence, batch, feature).
            init_states: optional ``(h_0, c_0)`` pair. When omitted, both
                states start as zeros of shape (batch, hidden_size) placed
                on ``x.device``.

        Returns:
            ``(hidden_seq, (h_t, c_t))``: the per-step hidden states
            concatenated along dim 0, and the states after the last step.
        """
        seq_sz, bs, _ = x.size()
        hidden_seq = []
        if init_states is None:
            # Follow the input's device rather than assuming "cuda", so the
            # layer also runs on CPU inputs and on non-default GPUs.
            h_t, c_t = (
                flow.zeros((bs, self.hidden_size)).to(x.device),
                flow.zeros((bs, self.hidden_size)).to(x.device),
            )
        else:
            h_t, c_t = init_states

        HS = self.hidden_size
        for t in range(seq_sz):
            # NOTE(Xu Zhiqiu): flow does not support view now, use reshape instead
            x_t = x[t, :, :].reshape(x.shape[1], x.shape[2])
            # batch the computations into a single matrix multiplication
            gates = flow.matmul(x_t, self.W) + flow.matmul(h_t,
                                                           self.U) + self.bias
            i_t, f_t, g_t, o_t = (
                flow.sigmoid(gates[:, :HS]),
                flow.sigmoid(gates[:, HS:HS * 2]),
                flow.tanh(gates[:, HS * 2:HS * 3]),
                flow.sigmoid(gates[:, HS * 3:]),
            )
            c_t = f_t * c_t + i_t * g_t
            h_t = o_t * flow.tanh(c_t)
            hidden_seq.append(h_t.unsqueeze(0))
        hidden_seq = flow.cat(hidden_seq, dim=0)
        return hidden_seq, (h_t, c_t)
Beispiel #3
0
    def forward(self, x, init_states=None):
        """Run the LSTM recurrence over a sequence-first input.

        Args:
            x: input tensor, unpacked as (sequence, batch, feature).
            init_states: optional ``(h_0, c_0)`` pair. When omitted, both
                states start as zeros of shape (batch, hidden_size) placed
                on ``x.device``.

        Returns:
            ``(hidden_seq, (h_t, c_t))``: the per-step hidden states
            concatenated along dim 0, and the states after the last step.
        """
        seq_sz, bs, _ = x.size()
        hidden_seq = []
        if init_states is None:
            # Follow the input's device rather than assuming "cuda", so the
            # layer also runs on CPU inputs and on non-default GPUs.
            h_t, c_t = (
                flow.zeros((bs, self.hidden_size)).to(x.device),
                flow.zeros((bs, self.hidden_size)).to(x.device),
            )
        else:
            h_t, c_t = init_states

        HS = self.hidden_size
        for t in range(seq_sz):
            x_t = x[t, :, :]
            x_t = x_t.reshape(x.shape[1], x.shape[2])
            # All four gates come out of one fused projection.
            gates = flow.matmul(x_t, self.W) + flow.matmul(h_t,
                                                           self.U) + self.bias
            i_t, f_t, g_t, o_t = (
                flow.sigmoid(gates[:, :HS]),
                flow.sigmoid(gates[:, HS:HS * 2]),
                flow.tanh(gates[:, HS * 2:HS * 3]),
                flow.sigmoid(gates[:, HS * 3:]),
            )
            c_t = f_t * c_t + i_t * g_t
            h_t = o_t * flow.tanh(c_t)
            hidden_seq.append(h_t.unsqueeze(0))
        hidden_seq = flow.cat(hidden_seq, dim=0)
        return hidden_seq, (h_t, c_t)
Beispiel #4
0
 def _create_parameters(self, weight_shape, weight_bound, bias_shape,
                        bias_bound):
     """Create ``self.weight`` and, when requested, ``self.bias``.

     Each parameter starts as a zero tensor that is filled in place with
     values drawn uniformly from ``[-bound, bound]``.

     Args:
         weight_shape: shape of the weight tensor.
         weight_bound: half-width of the uniform range for the weight.
         bias_shape: shape of the bias tensor, or ``None`` for no bias
             (``self.bias`` is then set to ``None``).
         bias_bound: half-width of the uniform range for the bias.
     """
     weight_init = flow.zeros(weight_shape).uniform_(-weight_bound,
                                                     weight_bound)
     self.weight = flow.nn.Parameter(weight_init, requires_grad=True)

     if bias_shape is None:
         self.bias = None
         return

     bias_init = flow.zeros(bias_shape).uniform_(-bias_bound, bias_bound)
     self.bias = flow.nn.Parameter(bias_init, requires_grad=True)
Beispiel #5
0
def get_mean_and_std(dataset):
    """Compute the mean and std value of a dataset, channel by channel.

    Samples are loaded one at a time. For each of the first three channels
    the per-sample mean and std are accumulated, then divided by
    ``len(dataset)``.

    Returns:
        ``(mean, std)``, two tensors of length 3.
    """
    loader = flow.utils.data.DataLoader(
        dataset, batch_size=1, shuffle=True, num_workers=2
    )
    channel_mean = flow.zeros(3)
    channel_std = flow.zeros(3)
    print('==> Computing mean and std..')
    for images, _ in loader:
        for ch in range(3):
            plane = images[:, ch, :, :]
            channel_mean[ch] += plane.mean()
            channel_std[ch] += plane.std()
    n_samples = len(dataset)
    channel_mean.div_(n_samples)
    channel_std.div_(n_samples)
    return channel_mean, channel_std
Beispiel #6
0
 def __init__(self, pred):
     """Create ``self.param`` as a zero parameter shaped like ``pred``.

     The parameter copies ``pred``'s shape and dtype. A global ``pred``
     also passes on its placement and sbp; otherwise its device is used.
     """
     super().__init__()
     if pred.is_global:
         location = dict(placement=pred.placement, sbp=pred.sbp)
     else:
         location = dict(device=pred.device)
     self.param = flow.nn.Parameter(
         flow.zeros(*pred.shape, dtype=pred.dtype, **location)
     )
Beispiel #7
0
 def __init__(self, features, eps=1e-6):
     """Set up the layer-norm scale and shift parameters.

     Args:
         features: size passed to ``flow.ones`` / ``flow.zeros`` for the
             parameter tensors.
         eps: value stored on ``self.eps``.
     """
     super(LayerNorm, self).__init__()
     self.eps = eps
     # Scale starts at one and shift at zero, both float32.
     scale_init = flow.ones(features, dtype=flow.float32)
     shift_init = flow.zeros(features, dtype=flow.float32)
     self.weight = nn.Parameter(flow.Tensor(scale_init))
     self.bias = nn.Parameter(flow.Tensor(shift_init))
Beispiel #8
0
    def forward(self, inputs, targets):
        """Compute a batch-hard triplet ranking loss.

        For every anchor row the largest distance to a same-label row and
        the smallest distance to a different-label row are selected and
        passed to ``self.ranking_loss``.

        Args:
            inputs (flow.Tensor): feature matrix with shape
                (batch_size, feat_dim).
            targets (flow.Tensor): ground truth labels, one per row of
                ``inputs`` (expanded to (batch_size, batch_size) below).

        Returns:
            The value of ``self.ranking_loss(dist_an, dist_ap, y)`` with
            ``y`` all ones.
        """
        n = inputs.size(0)
        # Keep helper tensors on the input's device instead of assuming "cuda".
        device = inputs.device

        # Compute pairwise distance, replace by the official when merged
        dist = flow.pow(inputs, 2).sum(dim=1).expand(n, n)
        dist = dist + flow.transpose(dist, dim0=1, dim1=0)
        temp1 = -2 * flow.matmul(inputs, flow.transpose(inputs, dim0=1,
                                                        dim1=0))
        dist = flow.add(dist, temp1)
        dist = flow.sqrt(flow.clamp(dist, min=1e-12))
        # For each anchor, find the hardest positive and negative
        mask = targets.expand(n, n).eq(
            flow.transpose(targets.expand(n, n), dim0=1, dim1=0))
        # Loop-invariant, so build the negative mask once.
        mask_rev = 1 - mask
        dist_ap, dist_an = [], []
        # Fill values for masked-out entries: 0 for the max over positives,
        # a huge value for the min over negatives.
        y1 = flow.zeros((1, n), dtype=flow.float32).to(device)
        y2 = flow.Tensor(np.exp(100 * np.ones((1, n)))).to(device)

        for i in range(n):
            row = [(i, i + 1, 1)]
            temp_dist = flow.slice(dist, row)
            temp_mask = flow.slice(mask, row)
            temp_mask_rev = flow.slice(mask_rev, row)
            dist_ap.append(temp_mask.where(temp_dist, y1).max().unsqueeze(0))
            dist_an.append(
                temp_mask_rev.where(temp_dist, y2).min().unsqueeze(0))
        dist_ap = flow.cat(dist_ap)
        dist_an = flow.cat(dist_an)

        # Compute ranking hinge loss
        y = flow.ones_like(dist_an)
        return self.ranking_loss(dist_an, dist_ap, y)
Beispiel #9
0
def _setitem(self, key, value):
    """Assign ``value`` into ``self[key]`` and return ``self``.

    ``value`` is first converted to match ``self``:

    * int/float scalars become a one-element constant tensor with
      ``self``'s dtype (broadcast sbp on ``self.placement`` for a
      consistent tensor, ``self.device`` otherwise);
    * tensors are moved to ``self.placement`` with broadcast sbp for a
      consistent ``self``, or to ``self.device`` otherwise.
    """
    is_scalar = isinstance(value, (int, float))
    if self.is_consistent:
        if is_scalar:
            value = flow._C.consistent_constant(
                [1],
                value,
                dtype=self.dtype,
                placement=self.placement,
                sbp=flow.sbp.broadcast,
            )
        elif value.is_consistent:
            value = value.to_consistent(sbp=flow.sbp.broadcast)
            # TODO: remove these lines after asymmetric boxing is ready
            shard = value.to_local()
            if shard.nelement() == 0:
                # An empty local shard is replaced by zeros of the full shape.
                shard = flow.zeros(*value.shape)
            value = shard.to_consistent(self.placement,
                                        sbp=flow.sbp.broadcast)
        else:
            value = value.to_consistent(self.placement,
                                        sbp=flow.sbp.broadcast)
    elif is_scalar:
        value = flow._C.constant([1],
                                 value,
                                 dtype=self.dtype,
                                 device=self.device)
    else:
        value = value.to(device=self.device)

    flow._C.tensor_setitem(self, key, value)
    return self
Beispiel #10
0
    def __init__(
        self,
        num_features: int,
        eps: float = 1e-05,
        momentum: float = 0.1,
        affine: bool = True,
        track_running_stats: bool = True,
    ) -> None:
        """Store the configuration and create parameters and buffers.

        Args:
            num_features: length of the weight/bias parameters and of the
                running-statistics buffers.
            eps: stored on ``self.eps``.
            momentum: stored on ``self.momentum``.
            affine: when true, ``weight`` and ``bias`` are created as
                parameters; otherwise both are registered as ``None``.
            track_running_stats: when true, ``running_mean`` (zeros),
                ``running_var`` (ones) and ``num_batches_tracked`` (0) are
                registered as buffers; otherwise all three are ``None``.
        """
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        self.track_running_stats = track_running_stats

        if not self.affine:
            for name in ("weight", "bias"):
                self.register_parameter(name, None)
        else:
            self.weight = flow.nn.Parameter(flow.Tensor(num_features))
            self.bias = flow.nn.Parameter(flow.Tensor(num_features))

        if not self.track_running_stats:
            for name in ("running_mean", "running_var",
                         "num_batches_tracked"):
                self.register_buffer(name, None)
        else:
            self.register_buffer("running_mean", flow.zeros(num_features))
            self.register_buffer("running_var", flow.ones(num_features))
            batches_seen = flow.tensor(0, dtype=flow.long)
            self.register_buffer("num_batches_tracked", batches_seen)

        self.reset_parameters()
Beispiel #11
0
    def forward(
        self,
        input_ids: flow.Tensor,
        token_type_ids: Optional[flow.Tensor] = None,
        position_ids: Optional[flow.Tensor] = None,
    ) -> flow.Tensor:
        """Embed token, position and token-type ids and combine them.

        Args:
            input_ids: token ids; dimension 1 is read as the sequence length.
            token_type_ids: optional; defaults to zeros shaped like
                ``input_ids``.
            position_ids: optional; defaults to ``0..seq_length-1``
                expanded to the shape of ``input_ids``.

        Returns:
            The summed embeddings after layer norm and dropout.
        """
        shape = input_ids.size()
        seq_length = shape[1]
        device = input_ids.device

        if token_type_ids is None:
            token_type_ids = flow.zeros(shape, dtype=flow.long, device=device)
        if position_ids is None:
            steps = flow.arange(seq_length, dtype=flow.long, device=device)
            position_ids = steps.unsqueeze(0).expand(shape)

        token_part = self.token_embeddings(input_ids)
        type_part = self.token_type_embeddings(token_type_ids)
        position_part = self.position_embeddings(position_ids)

        # Summation order: token + position + token-type.
        combined = token_part + position_part + type_part
        normalized = self.layer_norm(combined)
        return self.dropout(normalized)
Beispiel #12
0
    def forward(self, x, hidden=None):
        """Run the GRU recurrence over a batch-first input.

        Args:
            x: input tensor, unpacked as (batch, sequence, feature).
            hidden: optional initial hidden state. When omitted it starts
                as zeros of shape (batch, hidden_size) on ``x.device``.

        Returns:
            ``(hidden_seq, h_t)``: the per-step hidden states concatenated
            along dim 1, and the hidden state after the last step.
        """
        batch_size, seq_len, _ = x.size()
        H_S = self.hidden_size
        hidden_seq = []

        if hidden is None:
            # Allocate on the input's device; a default-device tensor would
            # not match a non-CPU input in the matmul below.
            h_t = flow.zeros((batch_size, self.hidden_size)).to(x.device)
        else:
            h_t = hidden

        for t in range(seq_len):
            x_t = x[:, t, :]
            # Input and hidden projections each hold three gate slices.
            gates_1 = flow.matmul(x_t, self.inp_W) + self.inp_b
            gates_2 = flow.matmul(h_t, self.hid_W) + self.hid_b

            r_gate = flow.sigmoid(gates_1[:, :H_S] + gates_2[:, :H_S])
            z_gate = flow.sigmoid(gates_1[:, H_S:H_S * 2] +
                                  gates_2[:, H_S:H_S * 2])
            # The reset gate scales only the hidden contribution.
            h_t_ = flow.tanh(gates_1[:, H_S * 2:H_S * 3] +
                             r_gate * gates_2[:, H_S * 2:H_S * 3])
            h_t = (1 - z_gate) * h_t_ + z_gate * h_t

            hidden_seq.append(h_t.unsqueeze(1))

        hidden_seq = flow.cat(hidden_seq, dim=1)
        return hidden_seq, h_t
Beispiel #13
0
 def test_cuda_manual_seed(test_case):
     """Check that the gathered CUDA random samples agree after seeding."""
     flow.cuda.manual_seed(30)
     local_device = flow.device("cuda", flow.cuda.current_device())
     sample = flow.randn(2, 4, device=local_device)
     gathered = [flow.zeros((2, 4), dtype=flow.int32) for _ in range(2)]
     flow.comm.all_gather(gathered, sample)
     first = gathered[0].numpy()
     second = gathered[1].numpy()
     test_case.assertTrue(np.allclose(first, second))
Beispiel #14
0
    def noisy_top_k_gating(self, x, train, noise_epsilon=1e-2):
        """Noisy top-k gating.

        See paper: https://arxiv.org/abs/1701.06538.

        Args:
            x: input Tensor with shape [batch_size, input_size]
            train: a boolean - we only add noise at training time.
            noise_epsilon: a float

        Returns:
            gates: a Tensor with shape [batch_size, num_experts]
            load: a Tensor with shape [num_experts]
        """
        clean_logits = oneflow.matmul(x, self.w_gate)

        if not self.noisy_gating:
            logits = clean_logits
        else:
            raw_noise_stddev = oneflow.matmul(x, self.w_noise)
            # Multiplying by `train` scales the noise by that flag.
            noise_stddev = (self.softplus(raw_noise_stddev) + noise_epsilon) * train
            # TODO, fix this after torch randn argument fixed
            n_rows = clean_logits.size()[0]
            n_cols = clean_logits.size()[1]
            noise = flow.randn(n_rows, n_cols, device=clean_logits.device)
            noisy_logits = clean_logits + (noise * noise_stddev)
            logits = noisy_logits

        # calculate topk + 1 that will be needed for the noisy gates
        n_keep = min(self.k + 1, self.num_experts)
        top_logits, top_indices = logits.topk(n_keep, dim=1)
        top_k_logits = top_logits[:, : self.k]
        top_k_indices = top_indices[:, : self.k]
        top_k_gates = self.softmax(top_k_logits)

        target_device = logits.device
        top_k_logits = top_k_logits.to(target_device)
        top_indices = top_indices.to(target_device)
        top_logits = top_logits.to(target_device)

        # Scatter the top-k gate values into a dense zero tensor.
        dense = flow.zeros(
            logits.shape, dtype=logits.dtype, requires_grad=True, device=logits.device
        )
        gates = oneflow.scatter(dense, 1, top_k_indices, top_k_gates)

        if self.noisy_gating and self.k < self.num_experts:
            in_top_k = self._prob_in_top_k(
                clean_logits, noisy_logits, noise_stddev, top_logits
            )
            load = in_top_k.sum(0)
        else:
            load = self._gates_to_load(gates)
        return gates, load
Beispiel #15
0
 def forward(self, cosine: flow.Tensor, label):
     """Add the margin ``self.m`` to the target angles and rescale by ``self.s``.

     ``cosine`` is modified in place and returned. Rows whose label is -1
     get no margin.
     """
     labelled_rows = flow.where(label != -1)[0]
     n_rows = labelled_rows.size()[0]
     n_classes = cosine.size()[1]
     margin = flow.zeros(n_rows, n_classes, device=cosine.device)
     # Put the margin at each labelled row's target-class column.
     margin.scatter_(1, label[labelled_rows, None], self.m)
     # Work in angle space: acos, add margin, cos again, then scale.
     cosine.acos_()
     cosine[labelled_rows] += margin
     cosine.cos_().mul_(self.s)
     return cosine
Beispiel #16
0
def masked_select_op(input, mask):
    """
    Returns a new 1-D tensor which indexes the input tensor according to the
    boolean mask `mask` which is a BoolTensor (in OneFlow, BoolTensor is
    replaced by Int8Tensor).

    The shapes of the mask tensor and the input tensor don't need to match,
    but they must have the same number of dimensions and be broadcastable.

    Elements are chosen by the mask alone, so a selected element whose value
    is 0 is kept in the result.

    Args:
        input (Tensor): the input tensor.
        mask (Tensor): the tensor containing the binary mask to index with

    Returns:
        A flattened tensor holding the elements of ``input`` at the
        positions where ``mask`` is non-zero.

    Raises:
        AssertionError: if ``input`` and ``mask`` differ in number of
            dimensions.

    For example:

    .. code-block:: python

        >>> import oneflow as flow
        >>> import numpy as np

        >>> input = flow.tensor(np.array([[-0.4620, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]), dtype=flow.float32)
        >>> mask = input.gt(0.05)
        >>> out = flow.masked_select(input, mask)
        >>> out
        tensor([0.3139, 0.3898], dtype=oneflow.float32)
    """

    assert len(input.shape) == len(
        mask.shape
    ), "The dim of masked_select module's inputs can not match, please check!"
    # Work out the common broadcast shape and which axes of each operand
    # have to be expanded to reach it.
    broadcast_like_shape = []
    broadcast_x_axes = []
    broadcast_mask_axes = []
    for i in range(len(input.shape)):
        max_dim = max(input.shape[i], mask.shape[i])
        broadcast_like_shape.append(max_dim)
        if max_dim != input.shape[i]:
            broadcast_x_axes.append(i)
        if max_dim != mask.shape[i]:
            broadcast_mask_axes.append(i)
    broadcast_like_tensor = flow.zeros(tuple(broadcast_like_shape),
                                       dtype=flow.float32,
                                       device=input.device)
    broadcast_like_tensor.requires_grad = input.requires_grad or mask.requires_grad
    if len(broadcast_x_axes) != 0:
        input = flow.broadcast_like(input,
                                    broadcast_like_tensor,
                                    broadcast_axes=tuple(broadcast_x_axes))
    if len(broadcast_mask_axes) != 0:
        mask = flow.broadcast_like(mask,
                                   broadcast_like_tensor,
                                   broadcast_axes=tuple(broadcast_mask_axes))
    mask = mask.to(dtype=input.dtype)
    # Take the indices from the mask itself. Using the non-zero entries of
    # input * mask would drop selected elements equal to 0 and would let
    # inf/NaN inputs through where the mask is false.
    indices = flow.argwhere(mask)
    gather_res = flow._C.gather_nd(input, indices)
    return gather_res.flatten()
Beispiel #17
0
def evaluate(encoder,
             decoder,
             sentence,
             input_lang,
             output_lang,
             max_length=MAX_LENGTH):
    """Greedily decode ``sentence`` with the encoder/decoder pair.

    The sentence is encoded one token at a time and the encoder outputs are
    padded with zero rows up to ``max_length``. The decoder then runs for at
    most ``max_length`` steps, feeding back its top-1 prediction, and stops
    early on ``EOS_token``.

    Returns:
        ``(decoded_words, attentions)``: the decoded words (ending with
        "<EOS>" when it was produced) and the attention rows for the steps
        that were actually run.
    """
    with flow.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        n_tokens = input_tensor.size()[0]
        hidden = encoder.init_Hidden().to(device)

        step_outputs = []
        for pos in range(n_tokens):
            step_out, hidden = encoder(input_tensor[pos], hidden)
            step_outputs.append(step_out[0])

        # Pad to max_length; a non-positive gap adds nothing.
        gap = max_length - len(step_outputs)
        step_outputs.extend(
            flow.zeros((1, 256)).to(device) for _ in range(gap))
        encoder_outputs = flow.cat(step_outputs, dim=0)

        decoder_input = flow.tensor([[SOS_token]]).to(device)
        decoder_hidden = hidden
        decoded_words = []
        decoder_attentions = flow.zeros((max_length, max_length)).to(device)

        for di in range(max_length):
            decoder_output, decoder_hidden, attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = attention.squeeze(0).data

            _, topi = decoder_output.data.topk(1)
            token = topi.squeeze().numpy()
            if token == EOS_token:
                decoded_words.append("<EOS>")
                break
            decoded_words.append(output_lang.index2word[int(token)])
            decoder_input = topi.detach()

        return decoded_words, decoder_attentions[:di + 1]
Beispiel #18
0
    def forward(self, cosine, label):
        """Subtract the margin ``self.m`` at the target class and scale by ``self.s``.

        Only rows whose label is not -1 are kept in the result.
        """
        labelled_rows = flow.where(label != -1)[0]
        n_rows = labelled_rows.size()[0]
        n_classes = cosine.size()[1]
        blank = flow.zeros(n_rows, n_classes, device=cosine.device)

        # Margin matrix: self.m at each row's target column, zero elsewhere.
        margin = flow.scatter(blank, 1, label[labelled_rows, None], self.m)
        return (cosine[labelled_rows] - margin) * self.s
Beispiel #19
0
 def __init__(self, dim, eps=1e-05, elementwise_affine=True):
     """Set up global channel layer normalization.

     Args:
         dim: stored on ``self.normalized_dim``; the affine parameters
             have shape (dim, 1).
         eps: stored on ``self.eps``.
         elementwise_affine: when true, ``beta`` (zeros) and ``gamma``
             (ones) are created as parameters; otherwise they are
             registered as ``None``.
     """
     super(GlobalChannelLayerNorm, self).__init__()
     self.eps = eps
     self.normalized_dim = dim
     self.elementwise_affine = elementwise_affine
     if elementwise_affine:
         self.beta = nn.Parameter(flow.zeros(dim, 1))
         self.gamma = nn.Parameter(flow.ones(dim, 1))
     else:
         # The affine parameters are called beta/gamma, so register those
         # names as None to make the attributes exist in both modes.
         self.register_parameter("beta", None)
         self.register_parameter("gamma", None)
         # Kept for backward compatibility with code reading weight/bias.
         self.register_parameter("weight", None)
         self.register_parameter("bias", None)
Beispiel #20
0
    def forward(self, preds, labels):
        """Return the top-1 accuracy over paired predictions and labels.

        For each ``(pred, label)`` pair, the argmax over the last dim of
        ``pred`` is compared with ``label``. Matches are summed and divided
        by the total number of label elements.
        """
        hits = flow.zeros(1, dtype=flow.float32)
        total = 0
        for pred, label in zip(preds, labels):
            predicted = pred.argmax(dim=-1).to(flow.int32)
            n_match = (predicted == label).sum()
            hits += n_match.to(device=hits.device, dtype=hits.dtype)
            total += np.prod(label.shape).item()

        return hits / total
Beispiel #21
0
    def test_normal_out_tensor_data_type_error(test_case):
        """normal() must reject an ``out`` tensor whose dtype differs from ``dtype``."""
        expected = (
            "data type oneflow.float32 does not match data type of out parameter oneflow.float64"
        )
        with test_case.assertRaises(RuntimeError) as ctx:
            out = flow.zeros((3, 3), dtype=flow.float64)
            flow._C.normal(mean=0.0,
                           std=1.0,
                           size=(3, 3),
                           dtype=flow.float32,
                           out=out)

        test_case.assertTrue(expected in str(ctx.exception))
Beispiel #22
0
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        """Precompute the sinusoidal positional-encoding table.

        Args:
            d_model: width of the encoding; odd values are supported.
            dropout: probability passed to ``nn.Dropout``.
            max_len: number of positions in the table.

        The table is stored on ``self.pe`` as a non-trainable parameter of
        shape (max_len, 1, d_model).
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = flow.zeros((max_len, d_model))
        position = flow.arange(0, max_len, dtype=flow.float).unsqueeze(1)
        div_term = flow.exp(
            flow.arange(0, d_model, 2).to(flow.float) * (-math.log(10000.0) / d_model)
        ).unsqueeze(0)
        angles = position * div_term
        pe[:, 0::2] = flow.sin(angles)
        # With an odd d_model there is one fewer odd column than even
        # columns, so trim the cosine table to fit (no-op for even d_model).
        pe[:, 1::2] = flow.cos(angles)[:, : d_model // 2]
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.pe = flow.nn.Parameter(pe, requires_grad=False)
Beispiel #23
0
 def __init__(self, d_model, max_len=5000):
     """Precompute the sinusoidal positional-encoding table.

     Args:
         d_model: width of the encoding; odd values are supported.
         max_len: number of positions in the table.

     The table is registered as the buffer ``pe`` with shape
     (1, max_len, d_model).
     """
     super(PositionalEncoding, self).__init__()
     # Compute the positional encodings once in log space.
     pe = flow.zeros(max_len, d_model, requires_grad=False)
     position = flow.arange(0, max_len).unsqueeze(1).to(dtype=flow.float32)
     div_term = flow.exp(
         flow.arange(0, d_model, 2).to(dtype=flow.float32)
         * -(math.log(10000.0) / d_model)
     )
     angles = position * div_term
     pe[:, 0::2] = flow.sin(angles)
     # With an odd d_model there is one fewer odd column than even columns,
     # so trim the cosine table to fit (no-op for even d_model).
     pe[:, 1::2] = flow.cos(angles)[:, : d_model // 2]
     pe = pe.unsqueeze(0)
     self.register_buffer("pe", pe)
Beispiel #24
0
    def test(iter, model, loss_fn):
        """Evaluate ``model`` on every batch of ``iter`` and print the results.

        A batch smaller than ``config.batch_size`` is padded with zero
        samples and zero labels up to that size before the forward pass.
        Only the real samples of each batch count towards accuracy.

        Args:
            iter: data loader yielding ``(x, y)`` batches; must expose
                ``.dataset`` and ``len()``.
            model: module called as ``model(x)``.
            loss_fn: callable invoked as ``loss_fn(pred, y)``.

        Returns:
            ``(test_loss, accuracy_percent)``, with the loss averaged over
            the number of batches.
        """
        size = len(iter.dataset)
        num_batches = len(iter)
        model.eval()
        test_loss, correct = 0, 0
        with flow.no_grad():
            for x, y in iter:
                # Real samples in this batch, recorded before padding so the
                # accuracy below never counts padded rows.
                n_valid = x.shape[0]
                if n_valid != config.batch_size:
                    n_pad = config.batch_size - n_valid
                    x_comp = flow.zeros((n_pad, x.shape[1]))
                    # Pad labels by the same count as the inputs.
                    y_comp = flow.zeros(n_pad)
                    x = flow.tensor(np.vstack((x.numpy(), x_comp.numpy())))
                    y = flow.tensor(np.hstack((y.numpy(), y_comp.numpy())))

                x = x.reshape(1, x.shape[0], x.shape[1])
                x = flow.tensor(x, dtype=flow.float32, device="cuda")
                y = flow.tensor(y, dtype=flow.int32, device="cuda")

                pred = model(x)

                test_loss += loss_fn(pred, y)
                predicted = np.argmax(pred.numpy(), 1)
                bool_value = predicted[:n_valid] == y.numpy()[:n_valid]

                correct += float(bool_value.sum())
        test_loss /= num_batches
        print("test_loss", test_loss, "num_batches ", num_batches)
        correct /= size
        print(
            f"Test Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f}"
        )

        return test_loss, 100 * correct
Beispiel #25
0
    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        kdim=None,
        vdim=None,
        batch_first=False,
    ) -> None:
        """Create the projection parameters for multi-head attention.

        Args:
            embed_dim: model width; must be divisible by ``num_heads``.
            num_heads: number of attention heads.
            dropout: stored on ``self.dropout``.
            bias: whether to create ``in_proj_bias`` and a bias on
                ``out_proj``.
            add_bias_kv: whether to create ``bias_k`` / ``bias_v``.
            add_zero_attn: stored on ``self.add_zero_attn``.
            kdim: key width, defaults to ``embed_dim``.
            vdim: value width, defaults to ``embed_dim``.
            batch_first: stored on ``self.batch_first``.

        When key/value widths differ from ``embed_dim``, separate q/k/v
        projection weights are created; otherwise a single packed
        ``in_proj_weight`` is used. Unused weights are registered as
        ``None``.
        """
        super(MultiheadAttention, self).__init__()

        self.embed_dim = embed_dim
        self.kdim = embed_dim if kdim is None else kdim
        self.vdim = embed_dim if vdim is None else vdim
        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.batch_first = batch_first
        self.head_dim = embed_dim // num_heads
        assert (self.head_dim * num_heads == self.embed_dim
                ), "embed_dim must be divisible by num_heads"

        if self._qkv_same_embed_dim is False:
            q_shape = (embed_dim, embed_dim)
            k_shape = (embed_dim, self.kdim)
            v_shape = (embed_dim, self.vdim)
            self.q_proj_weight = Parameter(flow.zeros(q_shape))
            self.k_proj_weight = Parameter(flow.zeros(k_shape))
            self.v_proj_weight = Parameter(flow.zeros(v_shape))
            self.register_parameter("in_proj_weight", None)
        else:
            packed_shape = (3 * embed_dim, embed_dim)
            self.in_proj_weight = Parameter(flow.zeros(packed_shape))
            for unused in ("q_proj_weight", "k_proj_weight",
                           "v_proj_weight"):
                self.register_parameter(unused, None)

        if not bias:
            self.register_parameter("in_proj_bias", None)
        else:
            self.in_proj_bias = Parameter(flow.zeros(3 * embed_dim))
        self.out_proj = Linear(embed_dim, embed_dim, bias=bias)

        if not add_bias_kv:
            self.bias_k = None
            self.bias_v = None
        else:
            self.bias_k = Parameter(flow.zeros((1, 1, embed_dim)))
            self.bias_v = Parameter(flow.zeros((1, 1, embed_dim)))

        self.add_zero_attn = add_zero_attn

        self._reset_parameters()
Beispiel #26
0
    def __init__(self, hidden_size, vocab_size, hidden_act=nn.GELU()):
        """Build the prediction head: a transform followed by a vocab decoder.

        Args:
            hidden_size: input width of the transform and the decoder.
            vocab_size: output width of the decoder and length of the bias.
            hidden_act: activation handed to ``BertPredictionHeadTransform``.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.transform = BertPredictionHeadTransform(hidden_size, hidden_act)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)
        output_bias = nn.Parameter(flow.zeros(vocab_size))
        self.output_bias = output_bias

        # Share the same parameter object with the decoder so that the bias is
        # correctly resized with `resize_token_embeddings`.
        self.decoder.bias = output_bias
Beispiel #27
0
    def test_normal_out_tensor_device_type_error(test_case):
        """normal() must reject an ``out`` tensor on a different device type."""
        expected = "does not match device type of out parameter"
        with test_case.assertRaises(RuntimeError) as ctx:
            out = flow.zeros((3, 3), dtype=flow.float32, device="cuda")
            flow._C.normal(
                mean=0.0,
                std=1.0,
                size=(3, 3),
                dtype=flow.float32,
                out=out,
                device="cpu",
            )

        test_case.assertTrue(expected in str(ctx.exception))
Beispiel #28
0
 def test_all_gather_1n2d(test_case):
     """all_gather must return rank 0's and rank 1's tensors in rank order."""
     rank0_data = np.array([[2, 3], [4, 5]])
     rank1_data = np.array([[1, 2], [3, 4]])

     rank = flow.env.get_rank()
     if rank == 0:
         np_arr = rank0_data
     elif rank == 1:
         np_arr = rank1_data
     local = flow.tensor(np_arr, device="cuda", dtype=flow.int32)
     gathered = [
         flow.zeros(np_arr.shape, dtype=flow.int32) for _ in range(2)
     ]
     flow.comm.all_gather(gathered, local)
     test_case.assertTrue(np.allclose(gathered[0].numpy(), rank0_data))
     test_case.assertTrue(np.allclose(gathered[1].numpy(), rank1_data))
    def test_copy(test_case):
        """Check ``Tensor.copy_`` for local, global and numpy sources."""
        # Local tensor copied into a local tensor.
        dst = flow.zeros(2, 3)
        src = flow.ones(2, 3)
        dst.copy_(src)
        test_case.assertTrue(np.array_equal(dst.numpy(), src.numpy()))

        broadcast = flow.sbp.broadcast
        # Each case: (destination sbp, source device type, source device ids).
        # The destination is always placed on cuda devices [0, 1]; the source
        # always uses broadcast sbp.
        global_cases = [
            (broadcast, "cpu", [0]),
            (broadcast, "cuda", [0]),
            (flow.sbp.split(0), "cuda", [0, 1]),
            (broadcast, "cuda", [0, 1]),
        ]
        for dst_sbp, src_device, src_ids in global_cases:
            dst = flow.zeros(4,
                             6,
                             placement=flow.placement("cuda", [0, 1]),
                             sbp=dst_sbp)
            src = flow.ones(4,
                            6,
                            placement=flow.placement(src_device, src_ids),
                            sbp=broadcast)
            dst.copy_(src)
            test_case.assertTrue(np.array_equal(dst.numpy(), src.numpy()))

        # numpy array copied into a global tensor.
        dst = flow.zeros(4,
                         6,
                         placement=flow.placement("cuda", [0, 1]),
                         sbp=broadcast)
        src = np.ones((4, 6), dtype=np.float32)
        dst.copy_(src)
        test_case.assertTrue(np.array_equal(dst.numpy(), src))
Beispiel #30
0
    def test_lazy_1d_to_2d_sbp(test_case):
        """Round-trip a global tensor between a 1-D and a 2-D placement via nn.Graph.

        A broadcast tensor is moved from the 1-D placement to the 2-D one
        inside a graph, then back again inside a second graph. The sbp and
        placement are asserted after each hop.
        """
        # The same four devices, arranged as a flat (4,) hierarchy ...
        P_1d = flow.placement(
            device_type="cuda", device_ids={0: range(4)}, hierarchy=(4,)
        )
        # ... and as a (2, 2) hierarchy.
        P_2d = flow.placement(
            device_type="cuda", device_ids={0: range(4)}, hierarchy=(2, 2)
        )
        B = flow.sbp.broadcast

        class Test1dTo2dModule(flow.nn.Module):
            # Moves its input onto the 2-D placement with broadcast sbp.
            def forward(self, x):
                return x.to_global(placement=P_2d, sbp=[B, B])

        class Test1dTo2dGraph(flow.nn.Graph):
            # Graph wrapper that calls the wrapped model in build().
            def __init__(self, model):
                super().__init__()
                self.model = model

            def build(self, x):
                return self.model(x)

        class Test2dTo1dModule(flow.nn.Module):
            # Moves its input onto the 1-D placement with broadcast sbp.
            def forward(self, x):
                return x.to_global(placement=P_1d, sbp=[B])

        class Test2dTo1dGraph(flow.nn.Graph):
            # Graph wrapper that calls the wrapped model in build().
            def __init__(self, model):
                super().__init__()
                self.model = model

            def build(self, x):
                return self.model(x)

        model_1d_to_2d = Test1dTo2dModule()
        graph_1d_to_2d = Test1dTo2dGraph(model_1d_to_2d)

        # Create the input on the 2-D placement, then convert it eagerly to
        # the 1-D placement before handing it to the first graph.
        x = flow.zeros(4, 4, 4, 4, sbp=[B, B], placement=P_2d)
        x = x.to_global(placement=P_1d, sbp=[B])
        test_case.assertTrue(x.sbp == (B,))
        test_case.assertTrue(x.placement == P_1d)
        # 1-D -> 2-D inside the graph.
        y = graph_1d_to_2d(x)
        test_case.assertTrue(y.sbp == (B, B))
        test_case.assertTrue(y.placement == P_2d)

        # 2-D -> 1-D inside a second graph; the result must match the
        # original 1-D tensor's sbp and placement.
        model_2d_to_1d = Test2dTo1dModule()
        graph_2d_to_1d = Test2dTo1dGraph(model_2d_to_1d)
        z = graph_2d_to_1d(y)
        test_case.assertTrue(z.sbp == x.sbp)
        test_case.assertTrue(z.placement == x.placement)