Example #1
    def build(self, inputs, targets):
        """
        Args:
            inputs (torch.Tensor): feature matrix with shape (batch_size, feat_dim).
            targets (torch.LongTensor): ground truth labels with shape (batch_size).
        """
        n = inputs.shape[0]
        dist = math.reduce_sum(math.pow(
            inputs, flow.constant_like(inputs, 2, dtype=flow.float32)),
                               axis=1)
        shape_tensor = flow.constant(value=0.0,
                                     dtype=flow.float32,
                                     shape=(n, n))
        dist = flow.broadcast_like(dist, like=shape_tensor, broadcast_axes=[1])
        dist = math.add(
            dist, flow.transpose(dist, perm=(1, 0),
                                 batch_axis_non_change=True))
        temp1 = math.multiply(
            -2,
            flow.matmul(
                inputs,
                flow.transpose(inputs, perm=(1, 0),
                               batch_axis_non_change=True)))
        dist = math.add(dist, temp1)
        dist = math.sqrt(flow.clamp(dist, min_value=1e-12))
        mask = math.equal(
            flow.broadcast_like(targets, like=shape_tensor,
                                broadcast_axes=[1]),
            flow.transpose(flow.broadcast_like(targets,
                                               like=shape_tensor,
                                               broadcast_axes=[1]),
                           perm=(1, 0),
                           batch_axis_non_change=True))
        mask_rev = math.not_equal(
            flow.broadcast_like(targets, like=shape_tensor,
                                broadcast_axes=[1]),
            flow.transpose(flow.broadcast_like(targets,
                                               like=shape_tensor,
                                               broadcast_axes=[1]),
                           perm=(1, 0),
                           batch_axis_non_change=True))
        dist_ap, dist_an = [], []
        for i in range(n):
            temp_dist = flow.slice_v2(dist, [(i, i + 1, 1)])
            temp_mask = flow.slice_v2(mask, [(i, i + 1, 1)])
            temp_mask_rev = flow.slice_v2(mask_rev, [(i, i + 1, 1)])
            dist_ap.append(
                math.reduce_max(
                    flow.gather_nd(temp_dist, flow.where(temp_mask))))
            dist_an.append(
                math.reduce_min(
                    flow.gather_nd(temp_dist, flow.where(temp_mask_rev))))
        dist_ap = flow.concat(dist_ap, 0)
        dist_an = flow.concat(dist_an, 0)
        y = flow.ones_like(dist_an)
        # return dist_an, dist_ap, y

        return self._MarginRankingLoss(dist_an, dist_ap, y)
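The loop above is batch-hard triplet mining: for each anchor row it keeps the largest same-label (positive) distance and the smallest different-label (negative) distance. A minimal NumPy sketch of that selection, for illustration only (names are hypothetical):

    import numpy as np

    def batch_hard(dist, labels):
        # dist: (n, n) pairwise distances, labels: (n,) integer ids
        same = labels[:, None] == labels[None, :]            # positive mask
        dist_ap = np.where(same, dist, -np.inf).max(axis=1)  # hardest positive
        dist_an = np.where(~same, dist, np.inf).min(axis=1)  # hardest negative
        return dist_ap, dist_an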
Example #2
    def test_job(
            x: oft.Numpy.Placeholder(input_shape, dtype=flow.float32),
            addend: oft.Numpy.Placeholder(input_shape, dtype=flow.float32),
    ):
        v = flow.get_variable(
            name="v",
            shape=(1, ),
            dtype=flow.float32,
            initializer=flow.zeros_initializer(),
        )

        x = x + v
        addend = addend + v

        x1 = flow.identity(x)
        x2 = flow.identity(x)

        addend1 = flow.identity(addend)
        addend2 = flow.identity(addend)

        flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
        flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))

        flow.watch_diff(addend1, test_global_storage.Setter("addend1_diff"))
        flow.watch_diff(addend2, test_global_storage.Setter("addend2_diff"))

        x1 = flow.cast(x1, data_type)
        x2 = flow.cast(x2, data_type)

        addend1 = flow.cast(addend1, data_type)
        addend2 = flow.cast(addend2, data_type)

        y1 = flow.layers.batch_normalization_add_relu(x1,
                                                      addend=addend1,
                                                      axis=axis,
                                                      name="BN1")
        y2 = flow.math.relu(
            flow.layers.batch_normalization(x2, axis=axis, name="BN2") +
            addend2)

        y1 = flow.cast(y1, flow.float32)
        y2 = flow.cast(y2, flow.float32)

        flow.watch(y1, test_global_storage.Setter("y1"))
        flow.watch(y2, test_global_storage.Setter("y2"))

        y1 = flow.where(flow.math.greater(y2, v), y1, v)
        y2 = flow.where(flow.math.greater(y1, v), y2, v)

        loss = y1 + y2
        flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler([],
                                                                     [0.001]),
                           momentum=0).minimize(flow.math.reduce_sum(loss))

        return loss
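For reference, the job builds the same computation twice on purpose: `y1` goes through the fused `batch_normalization_add_relu`, while `y2` composes the unfused `relu(batch_normalization(x2) + addend2)`; the `watch`/`watch_diff` hooks record both outputs and the gradients of all four inputs so the surrounding test can compare the fused op against the unfused reference.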
Example #3
        def do_where(condition, x, y):
            with flow.scope.placement(device_type, "0:0"):
                x_var = flow.get_variable(
                    "x",
                    shape=x.shape,
                    dtype=flow.float,
                    initializer=flow.constant_initializer(0),
                )
                x_var = flow.cast_to_current_logical_view(x_var)
                x_var = x_var + x
                y_var = flow.get_variable(
                    "y",
                    shape=y.shape,
                    dtype=flow.float,
                    initializer=flow.constant_initializer(0),
                )
                y_var = flow.cast_to_current_logical_view(y_var)
                y_var = y_var + y

            z = flow.where(condition, x_var, y_var)

            with flow.scope.placement(device_type, "0:0"):
                flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler(
                    [], [1e-3]),
                                   momentum=0).minimize(z)

            flow.watch_diff(x_var, dz_dx_watcher)
            flow.watch_diff(y_var, dz_dy_watcher)
            return z
Example #4
        def do_where(condition, x, y):
            with flow.scope.placement(device_type, "0:0"):
                x_var = flow.get_variable(
                    "x",
                    shape=x.shape,
                    dtype=flow.float,
                    initializer=flow.constant_initializer(0),
                )
                x_var = flow.cast_to_current_logical_view(x_var)
                x_var = x_var + x
                y_var = flow.get_variable(
                    "y",
                    shape=y.shape,
                    dtype=flow.float,
                    initializer=flow.constant_initializer(0),
                )
                y_var = flow.cast_to_current_logical_view(y_var)
                y_var = y_var + y

            z = flow.where(condition, x_var, y_var)

            with flow.scope.placement(device_type, "0:0"):
                flow.losses.add_loss(z)

            flow.watch_diff(x_var, dz_dx_watcher)
            flow.watch_diff(y_var, dz_dy_watcher)
            return z
Example #5
def _test_where_scalar(test_case, device):
    x = 0.5
    y = 2.0
    condition = flow.tensor(np.array([1]), dtype=flow.int32)
    of_out = flow.where(condition, x, y)
    np_out = np.array([0.5])
    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
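When `x` and `y` are Python scalars, `where` broadcasts them against the condition, matching NumPy's behavior:

    import numpy as np

    cond = np.array([1], dtype=np.int32)
    out = np.where(cond != 0, 0.5, 2.0)  # array([0.5])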
Example #6
def _test_where_backward(test_case, device):
    x = flow.tensor(
        np.array([[-0.462, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]),
        dtype=flow.float32,
        device=flow.device(device),
        requires_grad=True,
    )
    y = flow.tensor(
        np.ones(shape=(3, 2)),
        dtype=flow.float32,
        device=flow.device(device),
        requires_grad=True,
    )
    condition = flow.tensor(np.array([[0, 1], [1, 0], [1, 0]]),
                            dtype=flow.int32,
                            device=flow.device(device))
    of_out = flow.where(condition, x, y)
    of_out = of_out.sum()
    of_out.backward()
    test_case.assertTrue(
        np.allclose(x.grad.numpy(),
                    condition.numpy() == 1, 1e-05, 1e-05))
    test_case.assertTrue(
        np.allclose(y.grad.numpy(),
                    condition.numpy() == 0, 1e-05, 1e-05))
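`where` routes gradients only through the branch it selected: for `out = where(c, x, y)` followed by a sum, the gradient w.r.t. `x` is 1 exactly where `c != 0` and the gradient w.r.t. `y` is 1 where `c == 0`, which is what the two assertions compare against `condition.numpy() == 1` and `== 0`.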
Example #7
    def forward(self, cosine: flow.Tensor, label):
        index = flow.where(label != -1)[0]
        m_hot = flow.zeros(index.size()[0],
                           cosine.size()[1],
                           device=cosine.device)
        m_hot.scatter_(1, label[index, None], self.m)
        cosine.acos_()
        cosine[index] += m_hot
        cosine.cos_().mul_(self.s)
        return cosine
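Arithmetic note: with `cosine` holding cos(theta), the in-place `acos_`/`cos_` round-trip turns the rows picked by `index` into s * cos(theta_y + m), the ArcFace-style additive angular margin.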
Example #8
    def forward(self, cosine, label):
        index = flow.where(label != -1)[0]
        m_hot = flow.zeros(index.size()[0],
                           cosine.size()[1],
                           device=cosine.device)

        m_hot = flow.scatter(m_hot, 1, label[index, None], self.m)
        cosine = cosine[index] - m_hot

        ret = cosine * self.s
        return ret
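This variant skips the acos/cos round-trip and subtracts the margin from the cosine directly, giving s * (cos(theta_y) - m) for the selected rows, the CosFace/AM-Softmax-style margin.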
Example #9
def _test_where_x_y_none(test_case, device):
    condition = flow.tensor(
        np.array([[[-0.462, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]]),
        dtype=flow.float32,
        device=flow.device(device),
        requires_grad=True,
    )
    of_out = flow.where(condition)
    of_nonzero = flow.nonzero(condition, as_tuple=True)
    for i in range(len(of_out)):
        test_case.assertTrue(
            np.allclose(of_out[i].numpy(), of_nonzero[i].numpy(), 1e-05,
                        1e-05))
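With `x` and `y` omitted, `where(condition)` reduces to `nonzero(condition, as_tuple=True)`, which is the identity the loop checks. The same identity holds in NumPy:

    import numpy as np

    a = np.random.randn(3, 2)
    assert all((w == n).all() for w, n in zip(np.where(a), np.nonzero(a)))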
Example #10
def masked_fill(
    x: remote_blob_util.BlobDef,
    mask: remote_blob_util.BlobDef,
    value: Union[float, int],
    name: Optional[str] = None,
) -> remote_blob_util.BlobDef:
    r"""Fill a blob with a given value according to the given mask.

    Args:
        x (remote_blob_util.BlobDef): Input Blob.
        mask (remote_blob_util.BlobDef): A mask composed of 0s and 1s; the input blob `x`
            will be filled with the given value where the mask is 1.
        value (Union[float, int]): The value to use for filling the input blob.
        name (Optional[str], optional): The name for the operation. Defaults to None.
    Attention:
        x and mask must be broadcastable to each other.
        mask must be int type (int8/int32/int64).

    Returns:
        remote_blob_util.BlobDef: The value-filled Blob
    
    For example:

    .. code-block:: python

        import oneflow as flow
        import numpy as np
        import oneflow.typing as tp

        @flow.global_function()
        def masked_fill_Job(x: tp.Numpy.Placeholder((4, )),
                            mask: tp.Numpy.Placeholder((4, ), dtype=flow.int8)) -> tp.Numpy:
            return flow.masked_fill(x, mask, value=5)

        x = np.array([1, 2, 3, 4], dtype=np.float32)
        mask = np.array([1, 0, 0, 1], dtype=np.int8)

        out = masked_fill_Job(x, mask)
        
        # output [5 2 3 5]

    """
    if name is None:
        name = id_util.UniqueStr("MaskedFill_")
    value_like_x = flow.constant_like(like=x,
                                      value=value,
                                      name=name + "_ConstantLike")
    return flow.where(condition=mask,
                      x=value_like_x,
                      y=x,
                      name=name + "_Where")
Example #11
def _test_where(test_case, device):
    x = flow.tensor(
        np.array([[-0.462, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]),
        dtype=flow.float32,
        device=flow.device(device),
    )
    y = flow.tensor(np.ones(shape=(3, 2)),
                    dtype=flow.float32,
                    device=flow.device(device))
    condition = flow.tensor(np.array([[0, 1], [1, 0], [1, 0]]),
                            dtype=flow.int32,
                            device=flow.device(device))
    of_out = flow.where(condition, x, y)
    np_out = np.array([[1.0, 0.3139], [0.3898, 1.0], [0.0478, 1.0]])
    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
Example #12
def att_distill(args, student_atts, teacher_atts):
    att_loss = 0.
    teacher_layer_num = len(teacher_atts)
    student_layer_num = len(student_atts)

    assert teacher_layer_num % student_layer_num == 0
    layers_per_block = int(teacher_layer_num / student_layer_num)
    new_teacher_atts = [
        teacher_atts[i * layers_per_block + layers_per_block - 1]
        for i in range(student_layer_num)
    ]

    for student_att, teacher_att in zip(student_atts, new_teacher_atts):
        student_att = flow.where(
            student_att <= flow.constant(-1e2, dtype=flow.float),
            flow.zeros_like(student_att), student_att)
        teacher_att = flow.where(
            teacher_att <= flow.constant(-1e2, dtype=flow.float),
            flow.zeros_like(teacher_att), teacher_att)

        tmp_loss = mseloss(student_att, teacher_att)
        att_loss += tmp_loss

    return att_loss
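The two `where` calls zero out attention logits at or below -1e2 (the additive-mask fill value) so fully masked positions do not dominate the student-teacher MSE. NumPy equivalent for one tensor:

    import numpy as np

    att = np.array([-1e4, 0.3, -50.0], dtype=np.float32)
    att = np.where(att <= -1e2, np.zeros_like(att), att)  # [0.0, 0.3, -50.0]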
Example #13
    def _attn(self, query, key, value):
        attn_weights = flow.matmul(query, key.transpose(-2, -1))

        if self.scale_attn_weights:
            attn_weights = attn_weights / (float(value.size(-1))**0.5)

        query_length, key_length = query.size(-2), key.size(-2)
        causal_mask = self.bias[:, :, key_length -
                                query_length:key_length, :key_length]
        attn_weights = flow.where(causal_mask, attn_weights,
                                  self.masked_bias.to(attn_weights.dtype))

        attn_weights = nn.Softmax(dim=-1)(attn_weights)
        attn_weights = self.attn_dropout(attn_weights)

        attn_output = flow.matmul(attn_weights, value)
        return attn_output, attn_weights
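`causal_mask` is a boolean lower-triangular slice, so the `where` keeps each query's scores for keys at or before its own position and fills the rest with `masked_bias` (a large negative constant) ahead of the softmax. A NumPy sketch of that masking step (names hypothetical):

    import numpy as np

    def causal_masked_scores(scores, masked_bias=-1e4):
        q_len, k_len = scores.shape[-2], scores.shape[-1]
        # the offset allows cached keys: query i may attend to keys <= i + (k_len - q_len)
        mask = np.tril(np.ones((q_len, k_len), dtype=bool), k_len - q_len)
        return np.where(mask, scores, masked_bias)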
Example #14
    def _prob_in_top_k(
        self, clean_values, noisy_values, noise_stddev, noisy_top_values
    ):
        """Helper function to NoisyTopKGating.
        Computes the probability that value is in top k, given different random noise.
        This gives us a way of backpropagating from a loss that balances the number
        of times each expert is in the top k experts per example.
        In the case of no noise, pass in None for noise_stddev, and the result will
        not be differentiable.
        Args:
        clean_values: a `Tensor` of shape [batch, n].
        noisy_values: a `Tensor` of shape [batch, n].  Equal to clean values plus
          normally distributed noise with standard deviation noise_stddev.
        noise_stddev: a `Tensor` of shape [batch, n], or None
        noisy_top_values: a `Tensor` of shape [batch, m].
           "values" Output of tf.top_k(noisy_top_values, m).  m >= k+1
        Returns:
        a `Tensor` of shape [batch, n].
        """

        batch = clean_values.size(0)
        m = noisy_top_values.size(1)
        top_values_flat = noisy_top_values.flatten()

        threshold_positions_if_in = (
            flow.arange(batch, device=noisy_values.device) * m + self.k
        )

        threshold_if_in = flow.unsqueeze(
            flow.gather(top_values_flat, 0, threshold_positions_if_in), 1
        )
        is_in = flow.gt(noisy_values, threshold_if_in)

        threshold_positions_if_out = threshold_positions_if_in - 1
        threshold_if_out = flow.unsqueeze(
            flow.gather(top_values_flat, 0, threshold_positions_if_out), 1
        )

        # is each value currently in the top k.
        prob_if_in = cdf((clean_values - threshold_if_in) / noise_stddev)
        prob_if_out = cdf((clean_values - threshold_if_out) / noise_stddev)

        prob = flow.where(is_in, prob_if_in, prob_if_out)
        return prob
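The two CDF terms form a smoothed top-k indicator: a value already in the top k stays in as long as its noisy version exceeds the (k+1)-th largest (`threshold_if_in`), otherwise it has to beat the k-th (`threshold_if_out`); assuming `cdf` is the standard normal CDF, each probability is Phi((clean - threshold) / noise_stddev), and `where` stitches the two cases together.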
Example #15
    def __call__(self, x, padding=None):
        # Retrieve dynamically known shapes
        batch_size = x.shape[0]
        length = x.shape[1]

        if padding is not None:
            with flow.scope.namespace("remove_padding"):
                # Flatten padding to [batch_size*length]
                pad_mask = flow.reshape(padding, [-1])

                nonpad_ids = flow.cast(flow.where(pad_mask < 1e-9), dtype=flow.int32)
                # nonpad_ids = tf.to_int32(tf.where(pad_mask < 1e-9))

                # Reshape x to [batch_size*length, hidden_size] to remove padding
                x = flow.reshape(x, [-1, self.hidden_size])
                x = flow.gather_nd(x, indices=nonpad_ids)

                # Reshape x from 2 dimensions to 3 dimensions.

                # TODO: there may be a batch-axis error here
                x = flow.expand_dims(x, axis=0)

        output = self._build_dense(x, self.filter_size, name="filter_layer")
        if self.train:
            # In TensorFlow this parameter is `keep_prob` (callers pass `1 - dropout`);
            # here it is a drop rate, so `relu_dropout` is passed directly.
            output = flow.nn.dropout(output, self.relu_dropout)
        if padding is not None:
            with flow.scope.namespace("re_add_padding"):
                output = flow.squeeze(output, axis=[0, ])
                output = flow.scatter_nd(
                    indices=nonpad_ids,
                    updates=output,
                    shape=[batch_size * length, self.hidden_size]
                )
                output = flow.reshape(output, [batch_size, length, self.hidden_size])
        return output
Example #16
def _where(self, x=None, y=None):
    return flow.where(self, x, y)
Example #17
def ctc_loss(
    log_probs: oneflow_api.BlobDesc,
    targets: oneflow_api.BlobDesc,
    input_lengths: oneflow_api.BlobDesc,
    target_lengths: oneflow_api.BlobDesc,
    blank: int = 0,
    reduction: str = "mean",
    zero_infinity: bool = False,
    name: Optional[str] = None,
) -> oneflow_api.BlobDesc:
    r"""Computes the CTC(Connectionist Temporal Classification) loss.
    This operator implements the CTC loss as presented in (Graves et al., 2006).


    Args:
        log_probs (oneflow_api.BlobDesc): A Blob of shape [input_length, batch_size, num_labels]. The logarithmized probabilities of the outputs (e.g. obtained with flow.nn.logsoftmax()).
        targets (oneflow_api.BlobDesc): A Blob of shape [batch_size, max_target_length]. It represents the target sequences. Each element in the target sequence is a class index, and it cannot be the blank index (default=0).
        input_lengths (oneflow_api.BlobDesc): A Blob of shape [batch_size]. It represents the lengths of the inputs. The lengths are specified for each sequence to achieve masking under the assumption that sequences are padded to equal lengths.
        target_lengths (oneflow_api.BlobDesc): A Blob of shape [batch_size]. It represents the lengths of the targets. Lengths are specified for each sequence to achieve masking under the assumption that sequences are padded to equal lengths.
        blank (int, optional): Blank label. Defaults to 0.
        reduction (str, optional): The reduce type, it can be the one of "none", "mean", "sum". "none": no reduction will be applied, "mean": the output losses will be divided by the target lengths and then the mean over the batch is taken, "sum": the output will be summed. Defaults to "mean".
        zero_infinity (bool, optional):  Whether to zero infinite losses and the associated gradients. Infinite losses mainly occur when the inputs are too short to be aligned to the targets. Defaults to False.
        name (Optional[str], optional): The name for the operation. Defaults to None.

    Returns:
        oneflow_api.BlobDesc: The result Blob.

    For example: 

    .. code-block:: python 

        import oneflow as flow
        import oneflow.typing as tp
        import numpy as np


        @flow.global_function()
        def ctc_loss_job(
            log_probs: tp.Numpy.Placeholder(shape=(5, 2, 3)),
            targets: tp.Numpy.Placeholder(shape=(2, 3), dtype=flow.int32),
            input_lengths: tp.Numpy.Placeholder(shape=(2,), dtype=flow.int32),
            target_lengths: tp.Numpy.Placeholder(shape=(2,), dtype=flow.int32),
        ) -> tp.Numpy:
            loss = flow.ctc_loss(
                log_probs, targets, input_lengths, target_lengths, blank=0, reduction="none"
            )
            return loss


        log_probs = np.array(
            [
                [[-1.1031, -0.7998, -1.5200], [-0.9808, -1.1363, -1.1908]],
                [[-1.2258, -1.0665, -1.0153], [-1.1135, -1.2331, -0.9671]],
                [[-1.3348, -0.6611, -1.5118], [-0.9823, -1.2355, -1.0941]],
                [[-1.3850, -1.3273, -0.7247], [-0.8235, -1.4783, -1.0994]],
                [[-0.9049, -0.8867, -1.6962], [-1.4938, -1.3630, -0.6547]],
            ]
        ).astype(np.float32)
        targets = np.array([[1, 2, 2], [1, 2, 2]]).astype("int32")
        input_lengths = np.array([5, 5]).astype("int32")
        target_lengths = np.array([3, 3]).astype("int32")
        loss = ctc_loss_job(log_probs, targets, input_lengths, target_lengths)

        # loss [3.918017 2.907672]

    """
    name = name if name is not None else id_util.UniqueStr("CTCLoss_")
    loss, _ = (
        flow.user_op_builder(name)
        .Op("ctc_loss")
        .Input("log_probs", [log_probs])
        .Input("targets", [targets])
        .Input("input_lengths", [input_lengths])
        .Input("target_lengths", [target_lengths])
        .Output("loss")
        .Output("alpha")
        .Attr("blank", int(blank))
        .Attr("zero_infinity", zero_infinity)
        .Build()
        .InferAndTryRun()
        .RemoteBlobList()
    )

    if zero_infinity:
        cond = flow.math.equal(
            loss,
            flow.constant(
                float("inf"),
                dtype=loss.dtype,
                shape=loss.shape,
                name=name + "_constant",
            ),
            name=name + "_equal",
        )
        loss = flow.where(
            cond,
            flow.zeros(dtype=loss.dtype, shape=loss.shape, name=name + "_zeros"),
            loss,
            name=name + "_where",
        )

    if reduction == "mean":
        return flow.math.reduce_mean(
            flow.math.xdivy(
                loss,
                flow.cast(
                    flow.math.clip_by_value(
                        target_lengths, min_value=1, name=name + "_clip_by_value"
                    ),
                    dtype=log_probs.dtype,
                    name=name + "_cast",
                ),
                name=name + "_xdivy",
            ),
            name=name + "_reduce_mean",
        )
    elif reduction == "sum":
        return flow.math.reduce_sum(loss, name=name + "_reduce_sum")
    else:
        return loss
Example #18
def _TransformerModel(input_blob,
                      attention_mask_blob,
                      seq_length,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=_Gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False,
                      replace_prob=0.0,
                      compress_ratio=1):

  # print('| transformer num hidden layers: ', num_hidden_layers)
  assert hidden_size % num_attention_heads == 0
  attention_head_size = int(hidden_size / num_attention_heads)
  input_width = hidden_size
  prev_output_blob = flow.reshape(input_blob, (-1, input_width))
  # all_layer_output_blobs = []

  per_add_teacher_layers = compress_ratio
  per_add_student_layers = 1
  teacher_layer_idx = student_layer_idx = 0

  def add_teacher_layer(base_teacher_layer_idx, sub_teacher_output_blob):
    for add_teacher_layer_idx in range(per_add_teacher_layers):
      sub_teacher_output_blob = addOnelayer(
          layer_idx=base_teacher_layer_idx+add_teacher_layer_idx,
          prev_output_blob=sub_teacher_output_blob,
          attention_mask_blob=attention_mask_blob,
          num_attention_heads=num_attention_heads,
          attention_head_size=attention_head_size,
          attention_probs_dropout_prob=attention_probs_dropout_prob,
          initializer_range=initializer_range, seq_length=seq_length, hidden_size=hidden_size,
          hidden_dropout_prob=hidden_dropout_prob,
          intermediate_act_fn=intermediate_act_fn,
          intermediate_size=intermediate_size, namescope_prefix='', is_train=False)
    return sub_teacher_output_blob

  def add_student_layer(base_student_layer_idx, sub_student_output_blob):
    # with flow.scope.namespace("student"):
      sub_student_output_blob = addOnelayer(
        base_student_layer_idx, sub_student_output_blob, attention_mask_blob,
        num_attention_heads, attention_head_size,
        attention_probs_dropout_prob, initializer_range, seq_length, hidden_size, hidden_dropout_prob,
        intermediate_act_fn, intermediate_size, namescope_prefix='student-', is_train=True)
      return sub_student_output_blob

  while teacher_layer_idx < num_hidden_layers:
    with flow.scope.placement("cpu", "0:0"):
      sample = flow.random.coin_flip(name='layer{}_replacing_prob'.format(teacher_layer_idx), probability=replace_prob)
      sample = sample.with_distribute(flow.distribute.broadcast())

    prev_output_blob = flow.where(
      sample,
      x=add_student_layer(student_layer_idx, prev_output_blob),
      y=add_teacher_layer(teacher_layer_idx, prev_output_blob),
      name='where_layer{}'.format(teacher_layer_idx)
    )

    teacher_layer_idx += per_add_teacher_layers
    student_layer_idx += per_add_student_layers
    # print('| current teacher_layer: ', teacher_layer_idx)
    # print('| current student_layer: ', student_layer_idx)
    # print('| num_hidden_layers: ', num_hidden_layers)

  input_shape = (-1, seq_length, hidden_size)
  final_output_blob = flow.reshape(prev_output_blob, input_shape)
  return [final_output_blob]
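Each iteration flips a broadcast coin with `probability=replace_prob` and lets `flow.where` choose between the frozen teacher block (`is_train=False`) and the trainable student block, a stochastic module-replacement scheme reminiscent of BERT-of-Theseus compression; `compress_ratio` sets how many teacher layers one student layer stands in for.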
Example #19
    def where_fn(input_def: oft.ListNumpy.Placeholder(input_shape,
                                                      dtype=flow.float)):
        return flow.where(input_def)
Example #20
    def forward(self, x):
        return flow.where(
            x * self.beta > self.threshold,
            x,
            1 / self.beta * flow.log(1.0 + flow.exp(self.beta * x)),
        )
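This is the numerically stable Softplus: f(x) = (1/beta) * log(1 + exp(beta * x)) for moderate inputs, switching to the identity once beta * x exceeds `threshold`, where exp(beta * x) would overflow and the function is already linear to machine precision.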
Example #21
def insightface_train_job():
    if args.use_synthetic_data:
        (labels, images) = ofrecord_util.load_synthetic(args)
    else:
        labels, images = ofrecord_util.load_train_dataset(args)
    print("train batch data: ", images.shape)
    embedding = insightface(images)

    def _get_initializer():
        return flow.random_normal_initializer(mean=0.0, stddev=0.01)

    trainable = True
    if args.loss_type == "arc_loss":
        s = args.margin_s
        m = args.margin
        fc1 = flow.math.l2_normalize(input=embedding, axis=1, epsilon=1e-10)
        fc1 = flow.math.multiply(fc1, s)
        fc7 = flow.get_variable(
            name="fc7-weight",
            shape=(args.class_num, fc1.shape[1]),
            dtype=fc1.dtype,
            initializer=_get_initializer(),
            trainable=trainable,
            model_name="weight",
        )
        fc7 = flow.math.l2_normalize(input=fc7, axis=1, epsilon=1e-10)
        matmul = flow.matmul(a=fc1, b=fc7, transpose_b=True)
        labels_expand = flow.reshape(labels, (labels.shape[0], 1))
        zy = flow.gather(matmul, labels_expand, batch_dims=1)
        cos_t = flow.math.multiply(zy, 1 / s)
        cos_m = math.cos(m)
        sin_m = math.sin(m)
        mm = math.sin(math.pi - m) * m
        threshold = math.cos(math.pi - m)
        if args.easy_margin:
            cond = flow.math.relu(cos_t)
        else:
            cond_v = cos_t - threshold
            cond = flow.math.relu(cond_v)
        body = flow.math.square(cos_t)
        body = flow.math.multiply(body, -1.0)
        body = flow.math.add(1, body)
        sin_t = flow.math.sqrt(body)

        new_zy = flow.math.multiply(cos_t, cos_m)
        b = flow.math.multiply(sin_t, sin_m)
        b = flow.math.multiply(b, -1.0)
        new_zy = flow.math.add(new_zy, b)
        new_zy = flow.math.multiply(new_zy, s)
        if args.easy_margin:
            zy_keep = zy
        else:
            zy_keep = flow.math.add(zy, -s * mm)
        cond = flow.cast(cond, dtype=flow.int32)
        new_zy = flow.where(cond, new_zy, zy_keep)
        zy = flow.math.multiply(zy, -1.0)
        diff = flow.math.add(new_zy, zy)

        gt_one_hot = flow.one_hot(
            labels, depth=args.class_num, dtype=flow.float
        )
        body = flow.math.multiply(gt_one_hot, diff)
        fc7 = flow.math.add(matmul, body)
    elif args.loss_type == "margin_softmax":
        fc7_weight = flow.get_variable(
            name="fc7-weight",
            shape=(args.class_num, embedding.shape[1]),
            dtype=embedding.dtype,
            initializer=_get_initializer(),
            trainable=trainable,
            model_name="weight",
        )
        s = args.margin_s
        fc7_weight = flow.math.l2_normalize(
            input=fc7_weight, axis=1, epsilon=1e-10
        )
        fc1 = (
            flow.math.l2_normalize(input=embedding, axis=1, epsilon=1e-10) * s
        )
        fc7 = flow.matmul(a=fc1, b=fc7_weight, transpose_b=True)
        if args.loss_m1 != 1.0 or args.loss_m2 != 0.0 or args.loss_m3 != 0.0:
            if args.loss_m1 == 1.0 and args.loss_m2 == 0.0:
                s_m = s * args.loss_m3
                gt_one_hot = flow.one_hot(
                    labels,
                    depth=args.class_num,
                    on_value=s_m,
                    off_value=0.0,
                    dtype=flow.float,
                )
                fc7 = fc7 - gt_one_hot
            else:
                labels_expand = flow.reshape(labels, (labels.shape[0], 1))
                zy = flow.gather(fc7, labels_expand, batch_dims=1)
                cos_t = zy * (1 / s)
                t = flow.math.acos(cos_t)
                if args.loss_m1 != 1.0:
                    t = t * args.loss_m1
                if args.loss_m2 > 0.0:
                    t = t + args.loss_m2
                body = flow.math.cos(t)
                if args.loss_m3 > 0.0:
                    body = body - args.loss_m3
                new_zy = body * s
                diff = new_zy - zy
                gt_one_hot = flow.one_hot(
                    labels,
                    depth=args.class_num,
                    on_value=1.0,
                    off_value=0.0,
                    dtype=flow.float,
                )
                body = gt_one_hot * diff
                fc7 = fc7 + body
    elif args.loss_type == "softmax":
        if args.model_parallel:
            labels = labels.with_distribute(flow.distribute.broadcast())
            fc1_distribute = flow.distribute.broadcast()
            fc7_data_distribute = flow.distribute.split(1)
            fc7_model_distribute = flow.distribute.split(0)
        else:
            fc1_distribute = flow.distribute.split(0)
            fc7_data_distribute = flow.distribute.split(0)
            fc7_model_distribute = flow.distribute.broadcast()
        print("loss 0")
        fc7 = flow.layers.dense(
            inputs=embedding.with_distribute(fc1_distribute),
            units=args.class_num,
            activation=None,
            use_bias=False,
            kernel_initializer=_get_initializer(),
            bias_initializer=None,
            trainable=trainable,
            name=args.models_name,
            model_distribute=fc7_model_distribute,
        )
        fc7 = fc7.with_distribute(fc7_data_distribute)
    elif args.loss_type == "arc_loss_ms":
        labels = labels.with_distribute(flow.distribute.broadcast())
        fc7_model_distribute = flow.distribute.split(0)
        fc7_data_distribute = flow.distribute.split(1)
        fc7_weight = flow.get_variable(
            name="fc7-weight",
            shape=(args.class_num, embedding.shape[1]),
            dtype=embedding.dtype,
            initializer=_get_initializer(),
            trainable=trainable,
            model_name="weight",
            distribute=fc7_model_distribute,
        )
        s = args.margin_s
        fc7_weight = flow.math.l2_normalize(
            input=fc7_weight, axis=1, epsilon=1e-10
        )
        fc1 = (
            flow.math.l2_normalize(input=embedding, axis=1, epsilon=1e-10)
        )
        fc1 = flow.parallel_cast(fc1, distribute=flow.distribute.broadcast())
        fc7 = flow.matmul(a=fc1, b=fc7_weight, transpose_b=True) #s1
        fc7 = flow.arc_loss(fc7, labels, margin=args.loss_m2)*60
        fc7 = fc7.with_distribute(fc7_data_distribute)
    else:
        raise NotImplementedError

    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
        labels, fc7, name="softmax_loss"
    )
    
    lr_scheduler = flow.optimizer.PiecewiseScalingScheduler(args.base_lr, [100000, 140000, 160000], [0.1, 0.01, 0.001])
    flow.optimizer.SGD(lr_scheduler, momentum=0.9).minimize(loss)
    return loss
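In the `arc_loss` branch, the `new_zy` arithmetic is the angle-addition identity cos(t + m) = cos(t)cos(m) - sin(t)sin(m), scaled by s; `flow.where(cond, new_zy, zy_keep)` then falls back to the linear penalty zy - s * mm (with mm = sin(pi - m) * m) once cos(t) drops below cos(pi - m), following the original InsightFace recipe.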
Example #22
    def forward(self, inputs, targets):
        n = inputs.shape[0]
        # Compute pairwise distance, replace by the official when merged
        tempname = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S.%f')
        shape_tensor = flow.constant(value=0.0,
                                     dtype=flow.float32,
                                     shape=(n, n))
        if self.distance == 'euclidean':
            blob_2 = flow.get_variable(
                "blob_2_" + tempname,
                shape=inputs.shape,
                initializer=flow.constant_initializer(2),
                dtype=inputs.dtype)
            dist = flow.math.pow(inputs, blob_2)

            dist = flow.math.reduce_sum(dist, axis=1, keepdims=True)
            dist = flow.broadcast_like(dist, shape_tensor)
            tempdist = flow.transpose(dist)
            dist = dist + tempdist
            inputs_t = flow.transpose(inputs)
            dist = addmm(dist, inputs, inputs_t, beta=1, alpha=-2)
            dist = flow.clamp(dist, min_value=1e-12)
            dist = flow.math.sqrt(dist)
        elif self.distance == 'cosine':
            #fnorm=flow.math.l2_normalize(inputs, axis=1)
            fnorm = flow.math.reduce_mean(flow.math.divide(
                inputs, flow.math.l2_normalize(inputs, axis=1)),
                                          axis=1,
                                          keepdims=True)

            expand_fnorm = flow.broadcast_like(fnorm,
                                               like=inputs,
                                               broadcast_axes=[1])
            l2norm = flow.math.divide(inputs, expand_fnorm)
            l2norm_t = flow.transpose(l2norm, perm=(1, 0))
            dist = flow.math.negative(flow.matmul(l2norm, l2norm_t))
        # For each anchor, find the hardest positive and negative
        mask = math.equal(
            flow.broadcast_like(targets, like=shape_tensor,
                                broadcast_axes=[1]),
            flow.transpose(flow.broadcast_like(targets,
                                               like=shape_tensor,
                                               broadcast_axes=[1]),
                           perm=(1, 0),
                           batch_axis_non_change=True))
        mask_rev = math.not_equal(
            flow.broadcast_like(targets, like=shape_tensor,
                                broadcast_axes=[1]),
            flow.transpose(flow.broadcast_like(targets,
                                               like=shape_tensor,
                                               broadcast_axes=[1]),
                           perm=(1, 0),
                           batch_axis_non_change=True))
        dist_ap, dist_an = [], []
        for i in range(n):
            temp_dist = flow.slice_v2(dist, [(i, i + 1, 1)])
            temp_mask = flow.slice_v2(mask, [(i, i + 1, 1)])
            temp_mask_rev = flow.slice_v2(mask_rev, [(i, i + 1, 1)])
            temp_dist_ap = flow.expand_dims(
                math.reduce_max(
                    flow.gather_nd(temp_dist, flow.where(temp_mask))), 0)
            temp_dist_an = flow.expand_dims(
                math.reduce_min(
                    flow.gather_nd(temp_dist, flow.where(temp_mask_rev))), 0)
            dist_ap.append(temp_dist_ap)
            dist_an.append(temp_dist_an)
        dist_ap = flow.concat(dist_ap, 0)
        dist_an = flow.concat(dist_an, 0)
        y = flow.ones_like(dist_an)
        return self._MarginRankingLoss(dist_an, dist_ap, y)
Example #23
    def loss_layer(self,
                   feature_map,
                   pred,
                   label,
                   bboxes,
                   stride,
                   prefix='loss_layer'):
        '''

        :param feature_map: [N, H, W, 3*(5+class_num)]
        :param pred: [N, H, W, 3, 4+1+class_num]
        :param label:  [N, H, W, 3, 4+1+class_num]
        :param bboxes:  [N, V, 4]
        :param stride:
        :param anchor_per_scale:
        :return:
            giou_loss:
            conf_loss:
            prob_loss:
        '''
        feature_map = flow.reshape(
            feature_map,
            shape=(feature_map.shape[0], feature_map.shape[1],
                   feature_map.shape[2], self.anchor_per_scale, -1))
        # shape: [N, H, W, 3, 1]
        raw_conf = flow.slice(feature_map,
                              begin=[None, None, None, None, 4],
                              size=[None, None, None, None, 1])
        # shape: [N, H, W, 3, class_num]
        raw_prob = flow.slice(
            feature_map,
            begin=[None, None, None, None, 5],
            size=[None, None, None, None, feature_map.shape[-1] - 5])

        #  [N, H, W, 3, 4]
        pred_xywh = flow.slice(pred,
                               begin=[None, None, None, None, 0],
                               size=[None, None, None, None, 4])
        pred_conf = flow.slice(pred,
                               begin=[None, None, None, None, 4],
                               size=[None, None, None, None, 1])

        #flow.slice(label, begin=[None, None, None, None, 0], size=[None, None, None, None, 4])
        label_xywh = flow.slice(label,
                                begin=[None, None, None, None, 0],
                                size=[None, None, None, None, 4])
        respond_bbox = flow.slice(label,
                                  begin=[None, None, None, None, 4],
                                  size=[None, None, None, None, 1])
        label_prob = flow.slice(
            label,
            begin=[None, None, None, None, 5],
            size=[None, None, None, None, label.shape[-1] - 5])
        # [N, H, W, 3, 1]
        giou = self.bbox_giou(pred_xywh, label_xywh)
        # label_w = flow.slice(label, begin=[None, None, None, None, 2], size=[None, None, None, None, 1])
        # label_h = flow.slice(label, begin=[None, None, None, None, 3], size=[None, None, None, None, 1])
        # bbox_loss_scale = 2.0 - 1.0 * label_w * label_h / ((stride * feature_map.shape[1]) ** 2)  #???
        # [N, H, W, 3, 1]
        # giou_loss = respond_bbox * bbox_loss_scale * (1 - giou)
        giou_loss = respond_bbox * (1 - giou)

        # [N, 1, 1, 1, V, 4]
        bboxes_ = flow.expand_dims(bboxes, axis=1)
        bboxes_ = flow.expand_dims(bboxes_, axis=1)
        bboxes_ = flow.expand_dims(bboxes_, axis=1)
        # [N, H, W, 3, V]
        iou = self.bbox_iou(flow.expand_dims(pred_xywh, axis=-2), bboxes_)
        iou = flow.squeeze(iou, axis=[-1])
        # [N, H, W, 3, 1]
        max_iou = flow.math.reduce_max(iou, axis=-1, keepdims=True)
        # respond_bgd = (1.0 - respond_bbox) * (max_iou < self.iou_loss_thresh)
        tmp = flow.math.less(
            max_iou,
            flow.constant_like(like=max_iou,
                               value=self.iou_loss_thresh,
                               dtype=flow.float32))
        # respond_bgd = (1.0 - respond_bbox) * tmp
        respond_bgd = flow.where(
            tmp, 1.0 - respond_bbox,
            flow.zeros_like(respond_bbox, dtype=flow.float32))
        # [N, H, W, 3, 1]
        # ce = flow.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=raw_conf)
        # alpha_t = respond_bbox*self.focus_loss_alpha+(1.0-respond_bbox)*(1.0-self.focus_loss_alpha)
        # conf_loss = alpha_t*flow.math.pow(1.0-flow.math.exp(flow.math.negative(ce)), self.focus_loss_gamma)*ce
        # conf_loss = (respond_bbox+respond_bgd)*conf_loss
        conf_focal = self.focal(respond_bbox, pred_conf)
        conf_loss = conf_focal * (
            respond_bbox * flow.nn.sigmoid_cross_entropy_with_logits(
                labels=respond_bbox, logits=raw_conf) +
            respond_bgd * flow.nn.sigmoid_cross_entropy_with_logits(
                labels=respond_bbox, logits=raw_conf))
        # [N, H, W, 3, 1]
        prob_loss = respond_bbox * flow.nn.sigmoid_cross_entropy_with_logits(
            labels=label_prob, logits=raw_prob)

        #??
        # label_w = flow.slice(label, begin=[None, None, None, None, 2], size=[None, None, None, None, 1])
        # label_h = flow.slice(label, begin=[None, None, None, None, 3], size=[None, None, None, None, 1])
        # bbox_loss_scale = 2.0 - 1.0 * label_w * label_h / ((stride * feature_map.shape[1]) * (stride * feature_map.shape[2]))  #???
        # # [N, H, W, 3, 1]
        # giou_loss = respond_bbox * bbox_loss_scale * flow.smooth_l1_loss(prediction=pred_xywh, label=label_xywh)

        giou_loss = flow.math.reduce_mean(
            flow.math.reduce_sum(giou_loss, axis=[1, 2, 3, 4]))
        conf_loss = flow.math.reduce_mean(
            flow.math.reduce_sum(conf_loss, axis=[1, 2, 3, 4]))
        prob_loss = flow.math.reduce_mean(
            flow.math.reduce_sum(prob_loss, axis=[1, 2, 3, 4]))

        return giou_loss, conf_loss, prob_loss
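`respond_bgd` equals (1 - respond_bbox) wherever the best IoU against every ground-truth box stays below `iou_loss_thresh`, and 0 elsewhere, so anchors that overlap an object without being responsible for it are kept out of the background half of the focal confidence loss.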
Example #24
    def do_where(condition, x, y):
        return flow.where(condition, x, y)