Example #1
    def record(self, loss):
        if isinstance(loss, flow.Tensor):
            self.numel += loss.shape.numel()
            loss = loss.sum()
            if self.loss_sum is None:
                self.loss_sum = flow.zeros_like(loss)
            self.loss_sum += loss
        elif isinstance(loss, np.ndarray):
            self.numel += loss.size
            loss = loss.sum()
            if self.loss_sum is None:
                # after .sum() the loss is a NumPy scalar, so build the
                # accumulator with np.zeros_like; flow.zeros_like expects
                # a flow.Tensor
                self.loss_sum = np.zeros_like(loss)
            self.loss_sum += loss
        elif isinstance(loss, float):
            self.numel += 1
            if self.loss_sum is None:
                self.loss_sum = 0.0
            self.loss_sum += loss
        elif isinstance(loss, int):
            self.numel += 1
            if self.loss_sum is None:
                self.loss_sum = 0
            self.loss_sum += loss
        else:
            raise TypeError(f"invalid loss type: {type(loss)}")
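
A hedged sketch of how the accumulated state might be consumed; the mean helper below is an assumption and not part of the original:

    def mean(self):
        # Hypothetical helper: average of everything recorded so far.
        # Works for tensor, ndarray, float and int accumulators alike.
        if self.numel == 0:
            return 0.0
        return self.loss_sum / self.numel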
Example #2
    def step(self, closure: Callable = None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        with flow.no_grad():
            loss = None
            if closure is not None:
                loss = closure()

            for param_group in self.param_groups:
                if param_group["do_bias_correction"]:
                    param_group["bias_correction1"] = 1.0 - math.pow(
                        param_group["betas"][0], self._state["step"] + 1
                    )
                    param_group["bias_correction2"] = 1.0 - math.pow(
                        param_group["betas"][1], self._state["step"] + 1
                    )

                kwargs = {
                    "learning_rate": param_group["lr"],
                    "bias_correction1": param_group["bias_correction1"],
                    "bias_correction2": param_group["bias_correction2"],
                    "l2": param_group["weight_decay"],
                    "beta1": param_group["betas"][0],
                    "beta2": param_group["betas"][1],
                    "epsilon": param_group["eps"],
                    "do_bias_correction": param_group["do_bias_correction"],
                    "amsgrad": param_group["amsgrad"],
                }
                for param in param_group.parameters:
                    if param.grad is None:
                        continue
                    if "exp_avg" not in self._state[param]:
                        self._state[param]["exp_avg"] = flow.zeros_like(param)
                    if "exp_avg_sq" not in self._state[param]:
                        self._state[param]["exp_avg_sq"] = flow.zeros_like(param)
                    if "max_exp_avg_sq" not in self._state[param]:
                        self._state[param]["max_exp_avg_sq"] = flow.zeros_like(param)
                    m_tensor = self._state[param]["exp_avg"]
                    v_tensor = self._state[param]["exp_avg_sq"]
                    max_v_tensor = self._state[param]["max_exp_avg_sq"]
                    flow._C.dispatch_adam_update(
                        self._op,
                        (param, param.grad, m_tensor, v_tensor, max_v_tensor),
                        **kwargs,
                    )

            self._state["step"] += 1

            return loss
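
The closure argument follows the familiar optimizer API; a minimal usage sketch, with model, inputs, targets, loss_fn and optimizer as placeholder names:

    def closure():
        # hedged sketch; all names here are placeholders
        optimizer.zero_grad()
        loss = loss_fn(model(inputs), targets)
        loss.backward()
        return loss

    loss = optimizer.step(closure)  # re-evaluates the model, then updates the params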
Example #3
    def _test_send_recv_without_sending_meta(test_case, x0, src, dst):
        rank = flow.env.get_rank()
        if rank == src:
            x1 = x0
            flow.comm.send(x1, dst, send_meta=False)

            x2 = x0
            flow.comm.send(x2, dst, send_meta=False)
        elif rank == dst:
            x1 = flow.comm.recv(src,
                                shape=x0.shape,
                                dtype=x0.dtype,
                                device=x0.device)
            test_case.assertTrue(np.array_equal(x1.numpy(), x0.numpy()))

            x2 = flow.zeros_like(x0)
            flow.comm.recv(src,
                           shape=x0.shape,
                           dtype=x0.dtype,
                           device=x0.device,
                           out=x2)
            test_case.assertTrue(np.array_equal(x2.numpy(), x0.numpy()))
        else:
            # do nothing
            pass
Example #4
    def step(self, closure: Callable = None):
        with flow.no_grad():
            loss = None
            if closure is not None:
                loss = closure()
            for param_group in self.param_groups:
                lr = param_group["lr"]
                l2 = param_group["weight_decay"]
                for param in param_group.parameters:
                    if param.grad is None:
                        continue
                    if param_group["momentum"] == 0.0:
                        flow._C.dispatch_sgd_update(self._sgd,
                                                    (param, param.grad),
                                                    learning_rate=lr,
                                                    l2=l2)
                    else:
                        if "momentum_buf" not in self._state[param]:
                            self._state[param][
                                "momentum_buf"] = flow.zeros_like(param)
                        momentum_buf = self._state[param]["momentum_buf"]
                        beta = param_group["momentum"]
                        flow._C.dispatch_momentum_update(
                            self._momentum_sgd,
                            (param, param.grad, momentum_buf),
                            learning_rate=lr,
                            l2=l2,
                            beta=beta,
                        )
            self._state["step"] = self._state["step"] + 1
            return loss
Example #5
        def test_discriminator(
            z: oft.Numpy.Placeholder((self.batch_size, 100)),
            images: oft.Numpy.Placeholder((self.batch_size, 1, 28, 28)),
            label1: oft.Numpy.Placeholder((self.batch_size, 1)),
            label0: oft.Numpy.Placeholder((self.batch_size, 1)),
        ):
            g_out = self.generator(z, trainable=False, const_init=True)
            g_logits = self.discriminator(g_out, trainable=True, const_init=True)
            d_loss_fake = flow.nn.sigmoid_cross_entropy_with_logits(
                flow.zeros_like(g_logits),
                g_logits,
                name="Dloss_fake_sigmoid_cross_entropy_with_logits",
            )

            d_logits = self.discriminator(
                images, trainable=True, reuse=True, const_init=True
            )
            d_loss_real = flow.nn.sigmoid_cross_entropy_with_logits(
                flow.ones_like(d_logits),
                d_logits,
                name="Dloss_real_sigmoid_cross_entropy_with_logits",
            )
            d_loss = d_loss_fake + d_loss_real
            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [self.lr]), momentum=0
            ).minimize(d_loss)

            return d_loss
Example #6
    def training_step(self, batch, optimizer_idx):
        if optimizer_idx == 0:
            # generator
            (z,) = batch
            g_out = self._generator(z, trainable=True, const_init=True)
            g_logits = self._discriminator(g_out, trainable=False, const_init=True)
            g_loss = flow.nn.sigmoid_cross_entropy_with_logits(
                flow.ones_like(g_logits),
                g_logits,
                name="Gloss_sigmoid_cross_entropy_with_logits",
            )
            return (g_loss, g_out)
        elif optimizer_idx == 1:
            # discriminator
            z, images = batch
            g_out = self._generator(z, trainable=False, const_init=True)
            g_logits = self._discriminator(g_out, trainable=True, const_init=True)
            d_loss_fake = flow.nn.sigmoid_cross_entropy_with_logits(
                flow.zeros_like(g_logits),
                g_logits,
                name="Dloss_fake_sigmoid_cross_entropy_with_logits",
            )

            d_logits = self._discriminator(
                images, trainable=True, reuse=True, const_init=True
            )
            d_loss_real = flow.nn.sigmoid_cross_entropy_with_logits(
                flow.ones_like(d_logits),
                d_logits,
                name="Dloss_real_sigmoid_cross_entropy_with_logits",
            )
            d_loss = d_loss_fake + d_loss_real
            return d_loss
Example #7
    def train_discriminator(self, images, label1, label0):
        z = self.generate_noise()
        z = flow.zeros_like(z)
        g_out = self.generator(z)

        cat = flow.cat((images, g_out), dim=0)

        result = self.discriminator(cat)
        d_logits = result[: images.shape[0]]
        g_logits = result[images.shape[0] :]

        d_loss_real = self.of_cross_entropy(d_logits, label1)

        d_loss_fake = self.of_cross_entropy(g_logits, label0)

        d_loss = d_loss_fake + d_loss_real

        d_loss.backward()
        self.optimizerD.step()
        self.optimizerD.zero_grad()

        return (
            to_numpy(d_loss),
            to_numpy(d_loss_fake),
            to_numpy(d_loss_real),
            to_numpy(d_logits),
            to_numpy(g_logits),
        )
Example #8
        def test_discriminator(
                z=flow.FixedTensorDef((self.batch_size, 100)),
                images=flow.FixedTensorDef((self.batch_size, 1, 28, 28)),
                label1=flow.FixedTensorDef((self.batch_size, 1)),
                label0=flow.FixedTensorDef((self.batch_size, 1)),
        ):
            g_out = self.generator(z, trainable=False, const_init=True)
            g_logits = self.discriminator(g_out,
                                          trainable=True,
                                          const_init=True)
            d_loss_fake = flow.nn.sigmoid_cross_entropy_with_logits(
                flow.zeros_like(g_logits),
                g_logits,
                name="Dloss_fake_sigmoid_cross_entropy_with_logits",
            )

            d_logits = self.discriminator(images,
                                          trainable=True,
                                          reuse=True,
                                          const_init=True)
            d_loss_real = flow.nn.sigmoid_cross_entropy_with_logits(
                flow.ones_like(d_logits),
                d_logits,
                name="Dloss_real_sigmoid_cross_entropy_with_logits",
            )
            d_loss = d_loss_fake + d_loss_real
            flow.losses.add_loss(d_loss)

            return d_loss
Example #9
def reduce_variance(
    input_tensor: remote_blob_util.BlobDef,
    axis: Optional[Union[int, Sequence[int]]] = None,
    keepdims: bool = False,
    name: Optional[str] = None,
) -> remote_blob_util.BlobDef:
    name = _gen_unique_name_if_need(name, "ReduceVariance_")
    axis = _check_axis(axis, input_tensor.shape)
    if isinstance(axis, list) and len(axis) == 0:
        return flow.zeros_like(input_tensor,
                               dtype=input_tensor.dtype,
                               name=name + "_zeros_like")
    return flow.math.subtract(
        flow.math.reduce_mean(
            flow.math.square(input_tensor, name + "_square_minuend"),
            axis,
            keepdims,
            name + "_reduce_mean_minuend",
        ),
        flow.math.square(
            flow.math.reduce_mean(input_tensor, axis, keepdims,
                                  name + "_reduce_mean_subtrahend"),
            name + "_square_subtrahend",
        ),
        name + "_subtract",
    )
Example #10
def reduce_std(
    input_tensor: oneflow._oneflow_internal.BlobDesc,
    axis: Optional[Union[int, Sequence[int]]] = None,
    keepdims: bool = False,
    name: Optional[str] = None,
) -> oneflow._oneflow_internal.BlobDesc:
    r"""This operator computes the standard deviation of input Blob along the specified axis

    The equation is:

    .. math::

        out=\sqrt{\frac{1}{n}*\sum_{i=1}^{n}(x_i-mean)^2}

    Args:
        input_tensor (oneflow._oneflow_internal.BlobDesc): A Blob
        axis (Optional[Union[int, Sequence[int]]], optional): The dimension along which the standard deviation is computed. Defaults to None.
        keepdims (bool, optional): Whether to keep the reduced dimension in the output Blob. Defaults to False.
        name (Optional[str], optional): The name for the operation. Defaults to None.

    Returns:
        oneflow._oneflow_internal.BlobDesc: The standard deviation along the specified axis of the input Blob

    For example:

    .. code-block:: python

        import oneflow as flow
        import numpy as np
        import oneflow.typing as tp


        @flow.global_function()
        def reduce_std_Job(x: tp.Numpy.Placeholder((3, 3))
        ) -> tp.Numpy:
            return flow.math.reduce_std(x, axis=1, keepdims=True)


        x = np.array([[0, 5, 10], [5, 5, 5], [12, 3, 0]]).astype(np.float32)
        out = reduce_std_Job(x)

        # out [[4.0824833]
        #      [0.       ]
        #      [5.0990195]]

    """
    name = _gen_unique_name_if_need(name, "ReduceStd_")
    axis = _check_axis(axis, input_tensor.shape)
    if isinstance(axis, list) and len(axis) == 0:
        return flow.zeros_like(
            input_tensor, dtype=input_tensor.dtype, name=name + "_zeros_like"
        )
    return flow.math.sqrt(
        flow.math.reduce_variance(
            input_tensor, axis, keepdims, name + "_reduce_variance"
        ),
        name + "_sqrt",
    )
Example #11
    def step(self, closure: Callable = None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        with flow.no_grad():
            loss = None
            if closure is not None:
                loss = closure()
            for param_group in self.param_groups:
                kwargs = {
                    "learning_rate": param_group["lr"],
                    "epsilon": param_group["eps"],
                    "decay_rate": param_group["alpha"],
                    "l2": param_group["weight_decay"],
                }
                for param in param_group.parameters:
                    if param.grad is None:
                        continue

                    if "square_avg" not in self._state[param]:
                        self._state[param]["square_avg"] = flow.zeros_like(
                            param)
                    ms_tensor = self._state[param]["square_avg"]

                    if param_group["centered"]:
                        if "grad_avg" not in self._state[param]:
                            self._state[param]["grad_avg"] = flow.zeros_like(
                                param)
                        mg_tensor = self._state[param]["grad_avg"]
                        flow._C.dispatch_rmsprop_update(
                            self._centered_rmsprop,
                            (param, param.grad, ms_tensor, mg_tensor),
                            centered=True,
                            **kwargs,
                        )
                    else:
                        flow._C.dispatch_rmsprop_update(
                            self._rmsprop, (param, param.grad, ms_tensor),
                            **kwargs)
            self._state["step"] = self._state["step"] + 1
            return loss
Example #12
    def train_generator(self, label1):
        z = self.generate_noise()
        z = flow.zeros_like(z)
        g_out = self.generator(z)
        g_logits = self.discriminator(g_out)
        g_loss = self.of_cross_entropy(g_logits, label1)
        g_loss.backward()
        self.optimizerG.step()
        self.optimizerG.zero_grad()

        return (to_numpy(g_loss), to_numpy(g_out, False), to_numpy(g_logits))
Example #13
def invert_permutation(permutation: Optional[Tensor]) -> Optional[Tensor]:
    if permutation is None:
        return None
    return flow.scatter(
        flow.zeros_like(permutation),
        0,
        permutation,
        flow.arange(0,
                    permutation.numel(),
                    device=permutation.device,
                    dtype=flow.int32),
    )
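
A quick check of what the scatter computes (illustrative only, assuming eager execution):

    p = flow.tensor([2, 0, 1])   # p maps position i to p[i]
    inv = invert_permutation(p)  # scatters arange(3) back through p
    print(inv.numpy())           # [1 2 0]; inv[p[i]] == i for every i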
Example #14
def _zeros_by_val(val):
    ret = 0
    if isinstance(val, flow.Tensor):
        ret = flow.zeros_like(val)
    elif isinstance(val, np.ndarray):
        ret = np.zeros_like(val)
    elif isinstance(val, int):
        ret = 0
    elif isinstance(val, float):
        ret = 0.0
    else:
        raise ValueError(f"unsupported value type: {type(val)}")
    return ret
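
Usage is straightforward; an illustrative sketch (eager mode assumed):

    _zeros_by_val(flow.ones(2, 3))   # flow tensor of zeros, shape (2, 3)
    _zeros_by_val(np.ones((2, 3)))   # numpy array of zeros, shape (2, 3)
    _zeros_by_val(3)                 # 0
    _zeros_by_val(3.5)               # 0.0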
Example #15
    def step(self, closure: Callable = None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        with flow.no_grad():
            loss = None
            if closure is not None:
                loss = closure()
            for param_group in self.param_groups:
                kwargs = {
                    "learning_rate_val": param_group["lr"],
                    "bias_correction1_val": param_group["bias_correction1"],
                    "bias_correction2_val": param_group["bias_correction2"],
                    "decay_rate": param_group["alpha"],
                    "l2": param_group["weight_decay"],
                    "beta1": param_group["betas"][0],
                    "beta2": param_group["betas"][1],
                    "epsilon": param_group["eps"],
                    "do_bias_correction": param_group["do_bias_correction"],
                }
                for param in param_group.parameters:
                    if param.grad is None:
                        continue
                    if "exp_avg" not in self._state[param]:
                        self._state[param]["exp_avg"] = flow.zeros_like(param)
                    if "exp_avg_sq" not in self._state[param]:
                        self._state[param]["exp_avg_sq"] = flow.zeros_like(
                            param)
                    m_tensor = self._state[param]["exp_avg"]
                    v_tensor = self._state[param]["exp_avg_sq"]

                    self._lamb_op(param, param.grad, m_tensor, v_tensor)
            self._state["step"] = self._state["step"] + 1
            return loss
Example #16
def att_distill(args, student_atts, teacher_atts):
    att_loss = 0.
    teacher_layer_num = len(teacher_atts)
    student_layer_num = len(student_atts)

    assert teacher_layer_num % student_layer_num == 0
    layers_per_block = int(teacher_layer_num / student_layer_num)
    new_teacher_atts = [
        teacher_atts[i * layers_per_block + layers_per_block - 1]
        for i in range(student_layer_num)
    ]

    for student_att, teacher_att in zip(student_atts, new_teacher_atts):
        student_att = flow.where(
            student_att <= flow.constant(-1e2, dtype=flow.float),
            flow.zeros_like(student_att), student_att)
        teacher_att = flow.where(
            teacher_att <= flow.constant(-1e2, dtype=flow.float),
            flow.zeros_like(teacher_att), teacher_att)

        tmp_loss = mseloss(student_att, teacher_att)
        att_loss += tmp_loss

    return att_loss
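
mseloss is referenced but not defined in this snippet; a plausible stand-in, written against the same lazy-mode API and stated as an assumption:

    def mseloss(a, b):
        # Assumed helper: mean squared error between two equally-shaped blobs.
        return flow.math.reduce_mean(flow.math.square(a - b))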
Example #17
    def get_target_tensor(self, prediction, target_is_real):
        """Create label tensors with the same size as the input.

        Parameters:
            prediction (tensor) - - typically the prediction from a discriminator
            target_is_real (bool) - - whether the ground truth label is for real images or fake images

        Returns:
            A label tensor filled with the ground-truth label, with the same size as the input
        """

        if target_is_real:
            target_tensor = flow.ones_like(prediction)
        else:
            target_tensor = flow.zeros_like(prediction)
        return target_tensor
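
An illustrative call (discriminator, fake_images and gan_loss are placeholder names):

    # placeholder objects, shown only to illustrate the call pattern
    prediction = discriminator(fake_images)                       # any logits tensor
    real_target = gan_loss.get_target_tensor(prediction, True)    # all ones
    fake_target = gan_loss.get_target_tensor(prediction, False)   # all zeros
    loss = flow.nn.BCEWithLogitsLoss()(prediction, fake_target)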
Example #18
    def _test_send_recv(test_case, x0, src, dst):
        rank = flow.env.get_rank()
        if rank == src:
            x1 = x0
            flow.comm.send(x1, dst)

            x2 = x0
            flow.comm.send(x2, dst)
        elif rank == dst:
            x1 = flow.comm.recv(src)
            test_case.assertTrue(np.array_equal(x1.numpy(), x0.numpy()))
            test_case.assertEqual(x1.device, x0.device)

            x2 = flow.zeros_like(x0)
            flow.comm.recv(src, out=x2)
            test_case.assertTrue(np.array_equal(x2.numpy(), x0.numpy()))
            test_case.assertEqual(x2.device, x0.device)
        else:
            # do nothing
            pass
Example #19
    def __init__(
        self,
        params: Union[Iterator[Parameter], List[Dict]],
        lr: float = 0.001,
        lr_decay: float = 0.0,
        weight_decay: float = 0,
        initial_accumulator_value: float = 0.0,
        eps: float = 1e-10,
    ):
        assert lr >= 0.0, f"Invalid learning rate: {lr}"
        assert weight_decay >= 0.0, f"Invalid weight_decay value: {weight_decay}"
        assert (
            initial_accumulator_value >= 0.0
        ), f"Invalid initial_accumulator_value value: {initial_accumulator_value}"
        assert eps >= 0.0, f"Invalid epsilon value: {eps}"

        options = dict()
        options["lr"] = lr
        options["initial_accumulator_value"] = initial_accumulator_value
        options["lr_decay"] = lr_decay
        options["weight_decay"] = weight_decay
        options["eps"] = eps
        super().__init__(params, options)

        for param_group in self.param_groups:
            for param in param_group.parameters:
                assert param.is_leaf, "parameters must be leaf tensor"
                self._state[param] = dict()
                self._state[param]["sum"] = flow.zeros_like(param).fill_(
                    initial_accumulator_value
                )

        self._op = (
            flow.stateful_op("adagrad_update")
            .Input("model")
            .Input("model_diff")
            .Input("sum")
            .Build()
        )
Example #20
    def step(self, closure: Callable = None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        with flow.no_grad():
            loss = None
            if closure is not None:
                loss = closure()
            for param_group in self.param_groups:
                lr = param_group["lr"]
                l2 = param_group["weight_decay"]
                for param in param_group.parameters:
                    if param.grad is None:
                        continue
                    if param_group["momentum"] == 0.0:
                        flow._C.dispatch_sgd_update(self._sgd,
                                                    (param, param.grad),
                                                    learning_rate=lr,
                                                    l2=l2)
                    else:
                        if "momentum_buf" not in self._state[param]:
                            self._state[param][
                                "momentum_buf"] = flow.zeros_like(param)
                        momentum_buf = self._state[param]["momentum_buf"]
                        beta = param_group["momentum"]
                        flow._C.dispatch_momentum_update(
                            self._momentum_sgd,
                            (param, param.grad, momentum_buf),
                            learning_rate=lr,
                            l2=l2,
                            beta=beta,
                        )
            self._state["step"] = self._state["step"] + 1
            return loss
Example #21
def mask_finished_scores(score, flag):
    """
    If a sequence is finished, we only allow one alive branch. This function gives that branch a zero score
    and all other branches a score of -inf.
    Args:
        score: A real value array with shape [batch_size * beam_size, beam_size].
        flag: A bool array with shape [batch_size * beam_size, 1].
    Returns:
        A real value array with shape [batch_size * beam_size, beam_size].
    """
    beam_width = score.size(-1)
    zero_mask = flow.zeros_like(flag).to(dtype=flow.uint8)
    if beam_width > 1:
        unfinished = flow.cat(
            [zero_mask, flag.repeat([1, beam_width - 1])], dim=1)
        finished = flow.cat(
            (flag.to(dtype=flow.uint8), zero_mask.repeat([1, beam_width - 1])),
            dim=1)
    else:
        unfinished = zero_mask
        finished = flag.to(dtype=flow.uint8)
    score = flow.masked_fill(score, unfinished == 1, -float("inf"))
    score = flow.masked_fill(score, finished == 1, 0)
    return score
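
A small worked case (hypothetical values; batch_size * beam_size = 2 rows, beam_size = 2; row 0 finished, row 1 still alive):

    # hypothetical inputs
    score = flow.tensor([[0.1, 0.2],
                         [0.3, 0.4]])
    flag = flow.tensor([[1], [0]], dtype=flow.uint8)
    print(mask_finished_scores(score, flag).numpy())
    # [[ 0. , -inf],   <- finished row: one branch kept at 0, the rest -inf
    #  [ 0.3,  0.4]]   <- alive row: unchanged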
Example #22
def shift_tokens_right(input_ids: flow.Tensor, pad_token_id: int,
                       decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = flow.zeros_like(input_ids)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()

    # shifted_input_ids[:, 0] = decoder_start_token_id
    # tensor assignment in oneflow:
    shifted_input_ids[:, 0] = flow.tensor(
        decoder_start_token_id,
        dtype=shifted_input_ids.dtype,
        device=shifted_input_ids.device,
    )

    assert pad_token_id is not None, "self.model.pad_token_id has to be defined."
    # replace possible -100 values in labels by `pad_token_id`
    # masked
    shifted_input_ids = (shifted_input_ids.to(flow.float).masked_fill(
        shifted_input_ids.eq(-100).to(flow.int32),
        pad_token_id).to(flow.int32))

    return shifted_input_ids
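
For instance, with hypothetical ids where 0 is the pad token and 2 the decoder start token:

    # hypothetical ids
    input_ids = flow.tensor([[5, 6, 7, -100]], dtype=flow.int32)
    out = shift_tokens_right(input_ids, pad_token_id=0, decoder_start_token_id=2)
    print(out.numpy())  # [[2 5 6 7]]; any -100 that survived the shift
                        # would have been rewritten to the pad id 0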
Example #23
    def loss_layer(self,
                   feature_map,
                   pred,
                   label,
                   bboxes,
                   stride,
                   prefix='loss_layer'):
        '''
        :param feature_map: [N, H, W, 3*(5+class_num)]
        :param pred: [N, H, W, 3, 4+1+class_num]
        :param label: [N, H, W, 3, 4+1+class_num]
        :param bboxes: [N, V, 4]
        :param stride: down-sampling stride of this detection scale
        :return:
            giou_loss:
            conf_loss:
            prob_loss:
        '''
        feature_map = flow.reshape(
            feature_map,
            shape=(feature_map.shape[0], feature_map.shape[1],
                   feature_map.shape[2], self.anchor_per_scale, -1))
        # shape: [N, H, W, 3, 1]
        raw_conf = flow.slice(feature_map,
                              begin=[None, None, None, None, 4],
                              size=[None, None, None, None, 1])
        # shape: [N, H, W, 3, class_num]
        raw_prob = flow.slice(
            feature_map,
            begin=[None, None, None, None, 5],
            size=[None, None, None, None, feature_map.shape[-1] - 5])

        #  [N, H, W, 3, 4]
        pred_xywh = flow.slice(pred,
                               begin=[None, None, None, None, 0],
                               size=[None, None, None, None, 4])
        pred_conf = flow.slice(pred,
                               begin=[None, None, None, None, 4],
                               size=[None, None, None, None, 1])

        label_xywh = flow.slice(label,
                                begin=[None, None, None, None, 0],
                                size=[None, None, None, None, 4])
        respond_bbox = flow.slice(label,
                                  begin=[None, None, None, None, 4],
                                  size=[None, None, None, None, 1])
        label_prob = flow.slice(
            label,
            begin=[None, None, None, None, 5],
            size=[None, None, None, None, label.shape[-1] - 5])
        # [N, H, W, 3, 1]
        giou = self.bbox_giou(pred_xywh, label_xywh)
        # label_w = flow.slice(label, begin=[None, None, None, None, 2], size=[None, None, None, None, 1])
        # label_h = flow.slice(label, begin=[None, None, None, None, 3], size=[None, None, None, None, 1])
        # bbox_loss_scale = 2.0 - 1.0 * label_w * label_h / ((stride * feature_map.shape[1]) ** 2)  #???
        # [N, H, W, 3, 1]
        # giou_loss = respond_bbox * bbox_loss_scale * (1 - giou)
        giou_loss = respond_bbox * (1 - giou)

        # [N, 1, 1, 1, V, 4]
        bboxes_ = flow.expand_dims(bboxes, axis=1)
        bboxes_ = flow.expand_dims(bboxes_, axis=1)
        bboxes_ = flow.expand_dims(bboxes_, axis=1)
        # [N, H, W, 3, V]
        iou = self.bbox_iou(flow.expand_dims(pred_xywh, axis=-2), bboxes_)
        iou = flow.squeeze(iou, axis=[
            -1,
        ])
        # [N, H, W, 3, 1]
        max_iou = flow.math.reduce_max(iou, axis=-1, keepdims=True)
        # respond_bgd = (1.0 - respond_bbox) * (max_iou < self.iou_loss_thresh)
        tmp = flow.math.less(
            max_iou,
            flow.constant_like(like=max_iou,
                               value=self.iou_loss_thresh,
                               dtype=flow.float32))
        # respond_bgd = (1.0 - respond_bbox) * tmp
        respond_bgd = flow.where(
            tmp, 1.0 - respond_bbox,
            flow.zeros_like(respond_bbox, dtype=flow.float32))
        # [N, H, W, 3, 1]
        # ce = flow.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=raw_conf)
        # alpha_t = respond_bbox*self.focus_loss_alpha+(1.0-respond_bbox)*(1.0-self.focus_loss_alpha)
        # conf_loss = alpha_t*flow.math.pow(1.0-flow.math.exp(flow.math.negative(ce)), self.focus_loss_gamma)*ce
        # conf_loss = (respond_bbox+respond_bgd)*conf_loss
        conf_focal = self.focal(respond_bbox, pred_conf)
        conf_loss = conf_focal * (
            respond_bbox * flow.nn.sigmoid_cross_entropy_with_logits(
                labels=respond_bbox, logits=raw_conf) +
            respond_bgd * flow.nn.sigmoid_cross_entropy_with_logits(
                labels=respond_bbox, logits=raw_conf))
        # [N, H, W, 3, 1]
        prob_loss = respond_bbox * flow.nn.sigmoid_cross_entropy_with_logits(
            labels=label_prob, logits=raw_prob)

        #??
        # label_w = flow.slice(label, begin=[None, None, None, None, 2], size=[None, None, None, None, 1])
        # label_h = flow.slice(label, begin=[None, None, None, None, 3], size=[None, None, None, None, 1])
        # bbox_loss_scale = 2.0 - 1.0 * label_w * label_h / ((stride * feature_map.shape[1]) * (stride * feature_map.shape[2]))  #???
        # # [N, H, W, 3, 1]
        # giou_loss = respond_bbox * bbox_loss_scale * flow.smooth_l1_loss(prediction=pred_xywh, label=label_xywh)

        giou_loss = flow.math.reduce_mean(
            flow.math.reduce_sum(giou_loss, axis=[1, 2, 3, 4]))
        conf_loss = flow.math.reduce_mean(
            flow.math.reduce_sum(conf_loss, axis=[1, 2, 3, 4]))
        prob_loss = flow.math.reduce_mean(
            flow.math.reduce_sum(prob_loss, axis=[1, 2, 3, 4]))

        return giou_loss, conf_loss, prob_loss
Example #24
def inference(args):
    start_t = time.time()
    bert_module = BertForPreTraining(
        args.vocab_size,
        args.seq_length,
        args.hidden_size,
        args.num_hidden_layers,
        args.num_attention_heads,
        args.intermediate_size,
        nn.GELU(),
        args.hidden_dropout_prob,
        args.attention_probs_dropout_prob,
        args.max_position_embeddings,
        args.type_vocab_size,
        args.vocab_size,
    )
    end_t = time.time()
    print("Initialize model using time: {:.3f}s".format(end_t - start_t))

    start_t = time.time()
    if args.use_lazy_model:
        from utils.compare_lazy_outputs import load_params_from_lazy

        load_params_from_lazy(
            bert_module.state_dict(),
            args.model_path,
        )
    else:
        bert_module.load_state_dict(flow.load(args.model_path))
    end_t = time.time()
    print("Loading parameters using time: {:.3f}s".format(end_t - start_t))

    bert_module.eval()
    bert_module.to(args.device)

    class BertEvalGraph(nn.Graph):
        def __init__(self):
            super().__init__()
            self.bert = bert_module

        def build(self, input_ids, input_masks, segment_ids):
            input_ids = input_ids.to(device=args.device)
            input_masks = input_masks.to(device=args.device)
            segment_ids = segment_ids.to(device=args.device)

            with flow.no_grad():
                # 1. forward the next_sentence_prediction and masked_lm model
                _, seq_relationship_scores = self.bert(input_ids, input_masks,
                                                       segment_ids)

            return seq_relationship_scores

    bert_eval_graph = BertEvalGraph()

    start_t = time.time()
    inputs = [np.random.randint(0, 20, size=args.seq_length)]
    inputs = flow.Tensor(inputs,
                         dtype=flow.int64,
                         device=flow.device(args.device))
    mask = flow.cast(inputs > 0, dtype=flow.int64)

    segment_info = flow.zeros_like(inputs)
    prediction = bert_eval_graph(inputs, mask, segment_info)
    print(prediction.numpy())
    end_t = time.time()
    print("Inference using time: {:.3f}".format(end_t - start_t))
Example #25
def reduce_variance(
    input_tensor: remote_blob_util.BlobDef,
    axis: Optional[Union[int, Sequence[int]]] = None,
    keepdims: bool = False,
    name: Optional[str] = None,
) -> remote_blob_util.BlobDef:
    r"""This operator computes the variance of input Blob along the specified axis

    The equation is: 

    .. math:: 

        out=\frac{1}{n}*\sum_{i=1}^{n}(x_i-mean)^2

    Args:
        input_tensor (remote_blob_util.BlobDef): A Blob
        axis (Optional[Union[int, Sequence[int]]], optional): The dimension along which the variance is computed. Defaults to None.
        keepdims (bool, optional): Whether to keep the reduced dimension in the output Blob. Defaults to False.
        name (Optional[str], optional): The name for the operation. Defaults to None.

    Returns:
        remote_blob_util.BlobDef: The variance along the specified axis of the input Blob
    
    For example: 

    .. code-block:: python 

        import oneflow as flow
        import numpy as np
        import oneflow.typing as tp


        @flow.global_function()
        def reduce_variance_Job(x: tp.Numpy.Placeholder((3, 3))
        ) -> tp.Numpy:
            return flow.math.reduce_variance(x, axis=1, keepdims=True)


        x = np.array([[0, 5, 10], [5, 5, 5], [12, 3, 0]]).astype(np.float32)
        out = reduce_variance_Job(x)

        # output [[16.666668]
        #         [ 0.      ]
        #         [26.      ]]

    """
    name = _gen_unique_name_if_need(name, "ReduceVariance_")
    axis = _check_axis(axis, input_tensor.shape)
    if isinstance(axis, list) and len(axis) == 0:
        return flow.zeros_like(input_tensor,
                               dtype=input_tensor.dtype,
                               name=name + "_zeros_like")
    return flow.math.subtract(
        flow.math.reduce_mean(
            flow.math.square(input_tensor, name + "_square_minuend"),
            axis,
            keepdims,
            name + "_reduce_mean_minuend",
        ),
        flow.math.square(
            flow.math.reduce_mean(input_tensor, axis, keepdims,
                                  name + "_reduce_mean_subtrahend"),
            name + "_square_subtrahend",
        ),
        name + "_subtract",
    )
Example #26
def multi_head_attention_forward(
    query: Tensor,
    key: Tensor,
    value: Tensor,
    embed_dim_to_check: int,
    num_heads: int,
    in_proj_weight: Tensor,
    in_proj_bias: Optional[Tensor],
    bias_k: Optional[Tensor],
    bias_v: Optional[Tensor],
    add_zero_attn: bool,
    dropout_p: float,
    out_proj_weight: Tensor,
    out_proj_bias: Optional[Tensor],
    training: bool = True,
    key_padding_mask: Optional[Tensor] = None,
    need_weights: bool = True,
    attn_mask: Optional[Tensor] = None,
    use_separate_proj_weight: bool = False,
    q_proj_weight: Optional[Tensor] = None,
    k_proj_weight: Optional[Tensor] = None,
    v_proj_weight: Optional[Tensor] = None,
    static_k: Optional[Tensor] = None,
    static_v: Optional[Tensor] = None,
) -> Tuple[Tensor, Optional[Tensor]]:
    # set up shape vars
    tgt_len, bsz, embed_dim = query.shape
    src_len, _, _ = key.shape
    assert (
        embed_dim == embed_dim_to_check
    ), f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
    if isinstance(embed_dim, Tensor):
        # embed_dim can be a tensor when JIT tracing
        head_dim = embed_dim.div(num_heads)
    else:
        head_dim = embed_dim // num_heads
    assert (head_dim * num_heads == embed_dim
            ), f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
    if use_separate_proj_weight:
        # allow MHA to have different embedding dimensions when separate projection weights are used
        assert (
            key.shape[:2] == value.shape[:2]
        ), f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
    else:
        assert (
            key.shape == value.shape
        ), f"key shape {key.shape} does not match value shape {value.shape}"

    #
    # compute in-projection
    #
    if not use_separate_proj_weight:
        q, k, v = _in_projection_packed(query, key, value, in_proj_weight,
                                        in_proj_bias)
    else:
        assert (q_proj_weight is not None
                ), "use_separate_proj_weight is True but q_proj_weight is None"
        assert (k_proj_weight is not None
                ), "use_separate_proj_weight is True but k_proj_weight is None"
        assert (v_proj_weight is not None
                ), "use_separate_proj_weight is True but v_proj_weight is None"
        if in_proj_bias is None:
            b_q = b_k = b_v = None
        else:
            b_q, b_k, b_v = in_proj_bias.chunk(3, dim=0)
        q, k, v = _in_projection(
            query,
            key,
            value,
            q_proj_weight,
            k_proj_weight,
            v_proj_weight,
            b_q,
            b_k,
            b_v,
        )

    # prep attention mask
    if attn_mask is not None:
        assert (
            attn_mask.dtype.is_floating_point == False
        ), f"Only integer type are supported for attn_mask, not {attn_mask.dtype}"
        # ensure attn_mask's dim is 3
        if attn_mask.dim() == 2:
            correct_2d_size = (tgt_len, src_len)
            if attn_mask.shape != correct_2d_size:
                raise RuntimeError(
                    f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}."
                )
            attn_mask = attn_mask.unsqueeze(0)
        elif attn_mask.dim() == 3:
            correct_3d_size = (bsz * num_heads, tgt_len, src_len)
            if attn_mask.shape != correct_3d_size:
                raise RuntimeError(
                    f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}."
                )
        else:
            raise RuntimeError(
                f"attn_mask's dimension {attn_mask.dim()} is not supported")

    # add bias along batch dimension (currently second)
    if bias_k is not None and bias_v is not None:
        assert static_k is None, "bias cannot be added to static key."
        assert static_v is None, "bias cannot be added to static value."
        k = flow.cat([k, bias_k.repeat((1, bsz, 1))])
        v = flow.cat([v, bias_v.repeat((1, bsz, 1))])
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1, 0, 0))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1, 0, 0))
    else:
        assert bias_k is None
        assert bias_v is None

    #
    # reshape q, k, v for multihead attention and make them batch-first
    #
    # replace torch.contiguous with reshape
    q = q.reshape(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
    if static_k is None:
        k = k.reshape(-1, bsz * num_heads, head_dim).transpose(0, 1)
    else:
        assert (
            static_k.size(0) == bsz * num_heads
        ), f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"
        assert (
            static_k.size(2) == head_dim
        ), f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
        k = static_k
    if static_v is None:
        v = v.reshape(-1, bsz * num_heads, head_dim).transpose(0, 1)
    else:
        assert (
            static_v.size(0) == bsz * num_heads
        ), f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}"
        assert (
            static_v.size(2) == head_dim
        ), f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}"
        v = static_v

    # add zero attention along batch dimension (now first)
    if add_zero_attn:
        zero_attn_shape = (bsz * num_heads, 1, head_dim)
        k = flow.cat(
            [k, flow.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)],
            dim=1)
        v = flow.cat(
            [v, flow.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)],
            dim=1)
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1, 0, 0))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1, 0, 0))

    # update source sequence length after adjustments
    src_len = k.size(1)

    # merge key padding and attention masks
    if key_padding_mask is not None:
        assert key_padding_mask.shape == (
            bsz,
            src_len,
        ), f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}"
        key_padding_mask = (key_padding_mask.reshape(
            bsz, 1, 1, src_len).expand(-1, num_heads, tgt_len,
                                       -1).reshape(bsz * num_heads, tgt_len,
                                                   src_len))
        if attn_mask is not None:
            attn_mask = attn_mask.expand(bsz * num_heads, -1, -1)
        if attn_mask is None:
            attn_mask = key_padding_mask
        else:
            attn_mask = flow.logical_or(attn_mask, key_padding_mask)

    # convert mask to float
    if attn_mask is not None and attn_mask.dtype.is_floating_point == False:
        new_attn_mask = flow.zeros_like(attn_mask).to(flow.float)
        new_attn_mask = new_attn_mask.masked_fill(attn_mask, float("-inf"))
        attn_mask = new_attn_mask

    # adjust dropout probability
    if not training:
        dropout_p = 0.0

    #
    # (deep breath) calculate attention and out projection
    #
    attn_output, attn_output_weights = _scaled_dot_product_attention(
        q, k, v, attn_mask, dropout_p)
    attn_output = attn_output.transpose(0, 1).reshape(tgt_len, bsz, embed_dim)
    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)

    if need_weights:
        # average attention weights over heads
        attn_output_weights = attn_output_weights.reshape(
            bsz, num_heads, tgt_len, src_len)
        return attn_output, attn_output_weights.sum(dim=1) / num_heads
    else:
        return attn_output, None
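
The flow.zeros_like call near the end above is the standard boolean-to-additive mask conversion; in isolation, a minimal sketch:

    blocked = flow.tensor([[False, True]])  # True marks a position to hide
    additive = flow.zeros_like(blocked).to(flow.float)
    additive = additive.masked_fill(blocked, float("-inf"))
    # additive == [[0., -inf]]; adding it to the attention logits removes
    # the blocked position after softmax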
Example #27
    def train(self):
        # Learning rate cache for decaying.
        g_lr = self.g_lr
        d_lr = self.d_lr
        c_lr = self.c_lr

        start_iters = 0
        if self.resume_iters:
            pass

        norm = Normalizer()
        data_iter = iter(self.data_loader)

        print("Start training......")
        start_time = datetime.now()

        for i in range(start_iters, self.num_iters):
            # Preprocess input data
            # Fetch real images and labels.
            try:
                x_real, speaker_idx_org, label_org = next(data_iter)
            except StopIteration:
                # the loader is exhausted; restart it and fetch again
                data_iter = iter(self.data_loader)
                x_real, speaker_idx_org, label_org = next(data_iter)

            # Generate target domain labels randomly.
            rand_idx = flow.randperm(label_org.size(0))
            label_trg = label_org[rand_idx]
            speaker_idx_trg = speaker_idx_org[rand_idx]

            x_real = x_real.to(self.device)
            # Original domain one-hot labels.
            label_org = label_org.to(self.device)
            # Target domain one-hot labels.
            label_trg = label_trg.to(self.device)
            speaker_idx_org = speaker_idx_org.to(self.device)
            speaker_idx_trg = speaker_idx_trg.to(self.device)

            # Train the discriminator
            # Compute loss with real audio frame.
            CELoss = nn.CrossEntropyLoss()
            cls_real = self.C(x_real)
            cls_loss_real = CELoss(input=cls_real, target=speaker_idx_org)

            self.reset_grad()
            cls_loss_real.backward()
            self.c_optimizer.step()
            # Logging.
            loss = {}
            loss["C/C_loss"] = cls_loss_real.item()

            out_r = self.D(x_real, label_org)
            # Compute loss with fake audio frame.
            x_fake = self.G(x_real, label_trg)
            out_f = self.D(x_fake.detach(), label_trg)
            d_loss_t = nn.BCEWithLogitsLoss()(
                input=out_f, target=flow.zeros_like(
                    out_f).float()) + nn.BCEWithLogitsLoss()(
                        input=out_r, target=flow.ones_like(out_r).float())

            out_cls = self.C(x_fake)
            d_loss_cls = CELoss(input=out_cls, target=speaker_idx_trg)

            # Compute loss for gradient penalty.
            alpha = flow.rand(x_real.size(0), 1, 1, 1).to(self.device)
            x_hat = ((alpha * x_real +
                      (1 - alpha) * x_fake).detach().requires_grad_(True))
            out_src = self.D(x_hat, label_trg)

            # TODO: Second-order derivation is not currently supported in oneflow, so gradient penalty cannot be used temporarily.
            if self.use_gradient_penalty:
                d_loss_gp = self.gradient_penalty(out_src, x_hat)
                d_loss = d_loss_t + self.lambda_cls * d_loss_cls + 5 * d_loss_gp
            else:
                d_loss = d_loss_t + self.lambda_cls * d_loss_cls

            self.reset_grad()
            d_loss.backward()
            self.d_optimizer.step()

            loss["D/D_loss"] = d_loss.item()

            # Train the generator
            if (i + 1) % self.n_critic == 0:
                # Original-to-target domain.
                x_fake = self.G(x_real, label_trg)
                g_out_src = self.D(x_fake, label_trg)
                g_loss_fake = nn.BCEWithLogitsLoss()(
                    input=g_out_src, target=flow.ones_like(g_out_src).float())

                out_cls = self.C(x_real)
                g_loss_cls = CELoss(input=out_cls, target=speaker_idx_org)

                # Target-to-original domain.
                x_reconst = self.G(x_fake, label_org)
                g_loss_rec = nn.L1Loss()(x_reconst, x_real)

                # Original-to-Original domain(identity).
                x_fake_iden = self.G(x_real, label_org)
                id_loss = nn.L1Loss()(x_fake_iden, x_real)

                # Backward and optimize.
                g_loss = (g_loss_fake + self.lambda_cycle * g_loss_rec +
                          self.lambda_cls * g_loss_cls +
                          self.lambda_identity * id_loss)

                self.reset_grad()
                g_loss.backward()
                self.g_optimizer.step()

                # Logging.
                loss["G/loss_fake"] = g_loss_fake.item()
                loss["G/loss_rec"] = g_loss_rec.item()
                loss["G/loss_cls"] = g_loss_cls.item()
                loss["G/loss_id"] = id_loss.item()
                loss["G/g_loss"] = g_loss.item()

            # Miscellaneous
            # Print out training information.
            if (i + 1) % self.log_step == 0:
                et = datetime.now() - start_time
                et = str(et)[:-7]
                log = "Elapsed [{}], Iteration [{}/{}]".format(
                    et, i + 1, self.num_iters)
                for tag, value in loss.items():
                    log += ", {}: {:.4f}".format(tag, value)
                print(log)

            # Translate fixed images for debugging.
            if (i + 1) % self.sample_step == 0:
                with flow.no_grad():
                    d, speaker = TestSet(self.test_dir).test_data()
                    target = random.choice(
                        [x for x in speakers if x != speaker])
                    label_t = self.spk_enc.transform([target])[0]
                    label_t = np.asarray([label_t])

                    for filename, content in d.items():
                        f0 = content["f0"]
                        ap = content["ap"]
                        sp_norm_pad = self.pad_coded_sp(
                            content["coded_sp_norm"])

                        convert_result = []
                        for start_idx in range(
                                0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES):
                            one_seg = sp_norm_pad[:,
                                                  start_idx:start_idx + FRAMES]

                            one_seg = flow.Tensor(one_seg).to(self.device)
                            one_seg = one_seg.view(1, 1, one_seg.size(0),
                                                   one_seg.size(1))
                            l = flow.Tensor(label_t)
                            one_seg = one_seg.to(self.device)
                            l = l.to(self.device)
                            one_set_return = self.G(one_seg,
                                                    l).detach().cpu().numpy()
                            one_set_return = np.squeeze(one_set_return)
                            one_set_return = norm.backward_process(
                                one_set_return, target)
                            convert_result.append(one_set_return)

                        convert_con = np.concatenate(convert_result, axis=1)
                        convert_con = convert_con[:,
                                                  0:content["coded_sp_norm"].
                                                  shape[1]]
                        contigu = np.ascontiguousarray(convert_con.T,
                                                       dtype=np.float64)
                        decoded_sp = decode_spectral_envelope(contigu,
                                                              SAMPLE_RATE,
                                                              fft_size=FFTSIZE)
                        f0_converted = norm.pitch_conversion(
                            f0, speaker, target)
                        wav = synthesize(f0_converted, decoded_sp, ap,
                                         SAMPLE_RATE)

                        name = f"{speaker}-{target}_iter{i+1}_{filename}"
                        path = os.path.join(self.sample_dir, name)
                        print(f"[save]:{path}")
                        sf.write(path, wav, SAMPLE_RATE)

            # Save model checkpoints.
            if (i + 1) % self.model_save_step == 0:
                G_path = os.path.join(self.model_save_dir,
                                      "{}-G".format(i + 1))
                D_path = os.path.join(self.model_save_dir,
                                      "{}-D".format(i + 1))
                C_path = os.path.join(self.model_save_dir,
                                      "{}-C".format(i + 1))
                flow.save(self.G.state_dict(), G_path)
                flow.save(self.D.state_dict(), D_path)
                flow.save(self.C.state_dict(), C_path)
                print("Saved model checkpoints into {}...".format(
                    self.model_save_dir))

            # Decay learning rates.
            if (i + 1) % self.lr_update_step == 0 and (i + 1) > (
                    self.num_iters - self.num_iters_decay):
                g_lr -= self.g_lr / float(self.num_iters_decay)
                d_lr -= self.d_lr / float(self.num_iters_decay)
                c_lr -= self.c_lr / float(self.num_iters_decay)
                self.update_lr(g_lr, d_lr, c_lr)
                print("Decayed learning rates, g_lr: {}, d_lr: {}.".format(
                    g_lr, d_lr))
Example #28
    def recognize(self, inputs, inputs_mask):

        cache = {"fronend": None, "encoder": None, "decoder": None, "lm": None}

        self.attn_weights = {}
        memory, memory_mask, _, enc_attn_weights = self.encode(
            inputs, inputs_mask)

        self.attn_weights["encoder"] = enc_attn_weights
        self.attn_weights["decoder"] = []

        b, t, v = memory.size()

        beam_memory = (memory.unsqueeze(1).repeat(
            [1, self.beam_width, 1, 1]).view(b * self.beam_width, t, v))
        beam_memory_mask = (memory_mask.unsqueeze(1).repeat(
            [1, self.beam_width, 1]).view(b * self.beam_width, t))

        preds = (flow.ones(
            [b * self.beam_width, 1], dtype=flow.int64, device=memory.device) *
                 BOS)

        scores = flow.tensor([0.0] + [-float("inf")] * (self.beam_width - 1),
                             dtype=flow.float32)
        scores = scores.to(memory.device).repeat([b]).unsqueeze(1)
        ending_flag = flow.zeros_like(scores).to(dtype=flow.uint8)

        with flow.no_grad():
            for _ in range(1, self.max_len + 1):
                preds, cache, scores, ending_flag = self.decode_step(
                    preds, beam_memory, beam_memory_mask, cache, scores,
                    ending_flag)

                # whether stop or not
                if ending_flag.sum() == b * self.beam_width:
                    break

            scores = scores.view(b, self.beam_width)
            preds = preds.view(b, self.beam_width, -1)

            lengths = flow.sum(flow.ne(preds, EOS).float(), dim=-1)

            # length penalty
            if self.penalty:
                lp = flow.pow((self.lamda + lengths) / (self.lamda + 1),
                              self.penalty)
                scores /= lp

            sorted_scores, offset_indices = flow.sort(scores,
                                                      dim=-1,
                                                      descending=True)

            base_indices = (flow.arange(
                b, dtype=flow.int64, device=offset_indices.device) *
                            self.beam_width)
            base_indices = (base_indices.unsqueeze(1).repeat(
                [1, self.beam_width]).view(-1))
            preds = preds.view(b * self.beam_width, -1)
            indices = offset_indices.view(-1) + base_indices

            # remove BOS
            sorted_preds = preds[indices].view(b, self.beam_width, -1)
            nbest_preds = sorted_preds[:, :min(self.beam_width, self.nbest),
                                       1:]
            nbest_scores = sorted_scores[:, :min(self.beam_width, self.nbest)]

        return self.nbest_translate(nbest_preds), nbest_scores
Example #29
    def reset(self):
        self.val = flow.zeros_like(self.val)
        self.sum = flow.zeros_like(self.sum)
        self.count = flow.zeros_like(self.count)
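
Only reset appears in the source; a hedged sketch of the usual companion update, assuming the standard running-meter pattern:

    def update(self, val, n=1):
        # Hypothetical counterpart: fold in a value observed n times.
        self.val = val
        self.sum += val * n
        self.count += n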