Exemple #1
0
    def train_forward(self):
        entity_embedding, relation_embedding, transfer_matrix = self.create_share_variables(
        )
        pos_head = self.lookup_table(self.train_pos_input[:, 0],
                                     entity_embedding)
        pos_tail = self.lookup_table(self.train_pos_input[:, 2],
                                     entity_embedding)
        pos_rel = self.lookup_table(self.train_pos_input[:, 1],
                                    relation_embedding)
        neg_head = self.lookup_table(self.train_neg_input[:, 0],
                                     entity_embedding)
        neg_tail = self.lookup_table(self.train_neg_input[:, 2],
                                     entity_embedding)
        neg_rel = self.lookup_table(self.train_neg_input[:, 1],
                                    relation_embedding)

        rel_matrix = layers.reshape(
            self.lookup_table(self.train_pos_input[:, 1], transfer_matrix),
            [-1, self.hidden_size, self.hidden_size])
        pos_head_trans = self.matmul_with_expend_dims(pos_head, rel_matrix)
        pos_tail_trans = self.matmul_with_expend_dims(pos_tail, rel_matrix)

        rel_matrix_neg = layers.reshape(
            self.lookup_table(self.train_neg_input[:, 1], transfer_matrix),
            [-1, self.hidden_size, self.hidden_size])
        neg_head_trans = self.matmul_with_expend_dims(neg_head, rel_matrix_neg)
        neg_tail_trans = self.matmul_with_expend_dims(neg_tail, rel_matrix_neg)

        pos_score = self._algorithm(pos_head_trans, pos_rel, pos_tail_trans)
        neg_score = self._algorithm(neg_head_trans, neg_rel, neg_tail_trans)
        pos = layers.reduce_sum(layers.abs(pos_score), -1, keep_dim=False)
        neg = layers.reduce_sum(layers.abs(neg_score), -1, keep_dim=False)
        neg = layers.reshape(neg, shape=[-1, 1], inplace=True)
        loss = layers.reduce_mean(layers.relu(pos - neg + self.margin))
        return [loss]
Exemple #2
0
	def test_forward(self):
		entity_embedding, relation_embedding = self.create_share_variables()
		entity = layers.l2_normalize(entity_embedding, axis=-1)
		relation = layers.l2_normalize(relation_embedding, axis=-1)
		head_vec = self.lookup_table(self.test_input[0], entity)
		rel_vec = self.lookup_table(self.test_input[1], relation)
		tail_vec = self.lookup_table(self.test_input[2], entity)
		id_replace_head = layers.reduce_sum(layers.abs(entity + rel_vec - tail_vec), dim=1)
		id_replace_tail = layers.reduce_sum(layers.abs(entity - rel_vec - head_vec), dim=1)
		return [id_replace_head, id_replace_tail]
Exemple #3
0
 def spec_loss(self, decoded, input, num_frames=None):
     if num_frames is None:
         l1_loss = F.reduce_mean(F.abs(decoded - input))
     else:
         # mask the <pad> part of the decoder
         num_channels = decoded.shape[-1]
         l1_loss = F.abs(decoded - input)
         mask = F.sequence_mask(num_frames, dtype="float32")
         l1_loss *= F.unsqueeze(mask, axes=[-1])
         l1_loss = F.reduce_sum(l1_loss) / F.scale(F.reduce_sum(mask), num_channels)
     return l1_loss
Exemple #4
0
	def train_forward(self):
		entity_embedding, relation_embedding = self.create_share_variables()
		pos_head = self.lookup_table(self.train_pos_input[:, 0], entity_embedding)
		pos_tail = self.lookup_table(self.train_pos_input[:, 2], entity_embedding)
		pos_rel = self.lookup_table(self.train_pos_input[:, 1], relation_embedding)
		neg_head = self.lookup_table(self.train_neg_input[:, 0], entity_embedding)
		neg_tail = self.lookup_table(self.train_neg_input[:, 2], entity_embedding)
		neg_rel = self.lookup_table(self.train_neg_input[:, 1], relation_embedding)
		pos_score = self._algorithm(pos_head, pos_rel, pos_tail)
		neg_score = self._algorithm(neg_head, neg_rel, neg_tail)
		pos = layers.reduce_sum(layers.abs(pos_score), 1, keep_dim=False)
		neg = layers.reduce_sum(layers.abs(neg_score), 1, keep_dim=False)
		neg = layers.reshape(neg, shape=[-1, 1], inplace=True)
		loss = layers.reduce_mean(layers.relu(pos - neg + self.margin))
		return [loss]
Exemple #5
0
    def compute_mask_loss(self, occ_mask, warped_image, tgt_image):
        """
        Compute losses on the generated occlusion mask.

        Args:
            occ_mask (tensor): Generated occlusion masks.
            warped_image (tensor): Warped image using the flow map.
            tgt_image (tensor): Target image for the warped image.
        Returns:
            (tensor): Loss for the mask.
        """
        loss_mask = dg.to_variable(np.zeros((1, )).astype("float32"))
        if occ_mask is not None:
            dummy0 = L.zeros_like(occ_mask)
            dummy1 = L.ones_like(occ_mask)

            # Compute the confidence map based L1 distance between warped and GT image.
            img_diff = L.reduce_sum(L.abs(warped_image - tgt_image),
                                    1,
                                    keep_dim=True)

            conf = L.clip(1 - img_diff, 0, 1)

            # Force mask value to be small if warped image is similar to GT, and vice versa.
            loss_mask = self.criterionMasked(occ_mask, dummy0, conf)
            loss_mask += self.criterionMasked(occ_mask, dummy1, 1 - conf)

        return loss_mask
Exemple #6
0
 def test_abs(self):
     program = Program()
     with program_guard(program):
         input = layers.data(name="input", shape=[16], dtype="float32")
         out = layers.abs(input, name='abs')
         self.assertIsNotNone(out)
     print(str(program))
Exemple #7
0
    def compute_flow_losses(self, flow, warped_images, tgt_image, flow_gt,
                            flow_conf_gt, fg_mask, tgt_label, ref_label):
        """
        Compute losses on the generated flow maps.

        Args:
            flow (tensor or list of tensors): Generated flow maps.
            warped_image (tensor or list of tensors): Warped images using the flow maps.
            tgt_image (tensor): Target image for the warped image.
            flow_gt (tensor or list of tensors): Ground truth flow maps.
            flow_conf_gt (tensor or list of tensors): Configence for the ground truth flow maps.
            fg_mask (tensor): Foreground mask for the target image.
            tgt_label (tensor): Target label map.
            ref_label (tensor): Reference label map.
        Returns:
            (dict):
                - loss_flow_L1 (tensor): L1 loss compared to ground truth flow.
                - loss_flow_warp (tensor): L1 loss between the warped image and the target image when using the flow to warp.
                - body_mask_diff (tensor): Difference between warped body part map and target body part map. Used for pose dataset only.
        """
        loss_flow_L1 = dg.to_variable(np.zeros((1, )))
        loss_flow_warp = dg.to_variable(np.zeros((1, )))
        if isinstance(flow, list):
            # Compute flow losses for both warping reference -> target and previous -> target.
            for i in range(len(flow)):
                loss_flow_L1_i, loss_flow_warp_i = self.compute_flow_loss(
                    flow[i], warped_images[i], tgt_image, flow_gt[i],
                    flow_conf_gt[i], fg_mask)
                loss_flow_L1 += loss_flow_L1_i
                loss_flow_warp += loss_flow_warp_i
        else:
            # Compute loss for warping either reference or previous images.
            loss_flow_L1, loss_flow_warp = self.compute_flow_loss(
                flow, warped_images, tgt_image, flow_gt[-1], flow_conf_gt[-1],
                fg_mask)

        # For pose dataset only.
        body_mask_diff = None
        if self.warp_ref:
            if self.for_pose_dataset:
                # Warped reference body part map should similar to target body part map.
                body_mask = get_part_mask(tgt_label[:, 2])
                ref_body_mask = get_part_mask(ref_label[:, 2])
                warped_ref_body_mask = resample(ref_body_mask, flow[0])
                loss_flow_warp += self.criterion(warped_ref_body_mask,
                                                 body_mask)
                body_mask_diff = L.reduce_sum(L.abs(warped_ref_body_mask -
                                                    body_mask),
                                              1,
                                              keep_dim=True)

            if self.has_fg:
                # Warped reference foreground map should be similar to target foreground map.
                fg_mask, ref_fg_mask = get_fg_mask([tgt_label, ref_label],
                                                   True)
                warped_ref_fg_mask = resample(ref_fg_mask, flow[0])
                loss_flow_warp += self.criterion(warped_ref_fg_mask, fg_mask)

        return loss_flow_L1, loss_flow_warp, body_mask_diff
Exemple #8
0
def get_enc_bias(source_inputs):
    """
        get_enc_bias
    """
    source_inputs = layers.cast(source_inputs, 'float32')
    emb_sum = layers.reduce_sum(layers.abs(source_inputs), dim=-1)
    zero = layers.fill_constant([1], 'float32', value=0) 
    bias = layers.cast(layers.equal(emb_sum, zero), 'float32') * -1e9
    return layers.unsqueeze(layers.unsqueeze(bias, axes=[1]), axes=[1])
Exemple #9
0
def binary_op(u_embed, v_embed, binary_op_type):
    if binary_op_type == "Average":
        edge_embed = (u_embed + v_embed) / 2
    elif binary_op_type == "Hadamard":
        edge_embed = u_embed * v_embed
    elif binary_op_type == "Weighted-L1":
        edge_embed = l.abs(u_embed - v_embed)
    elif binary_op_type == "Weighted-L2":
        edge_embed = (u_embed - v_embed) * (u_embed - v_embed)
    else:
        raise ValueError(binary_op_type + " binary_op_type doesn't exists")
    return edge_embed
Exemple #10
0
def compute_neuron_head_importance(args, model, dev_ds, place, model_cfg):
    n_layers, n_heads = model_cfg['num_hidden_layers'], model_cfg[
        'num_attention_heads']
    head_importance = L.zeros(shape=[n_layers, n_heads], dtype='float32')
    head_mask = L.ones(shape=[n_layers, n_heads], dtype='float32')
    head_mask.stop_gradient = False

    intermediate_weight = []
    intermediate_bias = []
    output_weight = []

    for name, w in model.named_parameters():
        if 'ffn.i' in name:
            if len(w.shape) > 1:
                intermediate_weight.append(w)
            else:
                intermediate_bias.append(w)

        if 'ffn.o' in name:
            if len(w.shape) > 1:
                output_weight.append(w)

    neuron_importance = []
    for w in intermediate_weight:
        neuron_importance.append(np.zeros(shape=[w.shape[1]], dtype='float32'))

    eval_task_names = ('mnli',
                       'mnli-mm') if args.task == 'mnli' else (args.task, )

    for eval_task in eval_task_names:
        for batch in dev_ds.start(place):
            ids, sids, label = batch
            out = model(ids,
                        sids,
                        labels=label,
                        head_mask=head_mask,
                        num_layers=model_cfg['num_hidden_layers'])
            loss = out[0]
            loss.backward()
            head_importance += L.abs(FD.to_variable(head_mask.gradient()))

            for w1, b1, w2, current_importance in zip(intermediate_weight,
                                                      intermediate_bias,
                                                      output_weight,
                                                      neuron_importance):
                current_importance += np.abs(
                    (np.sum(w1.numpy() * w1.gradient(), axis=0) +
                     b1.numpy() * b1.gradient()))
                current_importance += np.abs(
                    np.sum(w2.numpy() * w2.gradient(), axis=1))

    return head_importance, neuron_importance
Exemple #11
0
    def get_feature(self, im: np.ndarray):
        """Get the feature. Generally, call this function.
        args:
            im: image patch
        """

        # Return empty tensor if it should not be used
        is_color = im.shape[1] == 3
        if is_color and not self.use_for_color or not is_color and not self.use_for_gray:
            return np.array([])

        feat_list = self.extract(im)

        output_sz = [None] * len(
            feat_list) if self.output_size is None else self.output_size

        # Pool/downsample
        with fluid.dygraph.guard():
            feat_list = [n2p(f) for f in feat_list]

            for i, (sz, s) in enumerate(zip(output_sz, self.pool_stride)):
                if sz is not None:
                    feat_list[i] = layers.adaptive_pool2d(feat_list[i],
                                                          sz,
                                                          pool_type='avg')
                elif s != 1:
                    feat_list[i] = layers.pool2d(feat_list[i],
                                                 s,
                                                 pool_stride=s,
                                                 pool_type='avg')

            # Normalize
            if self.normalize_power is not None:
                new_feat_list = []
                for feat in feat_list:
                    norm = (layers.reduce_sum(layers.reshape(
                        layers.abs(feat), [feat.shape[0], 1, 1, -1])**
                                              self.normalize_power,
                                              dim=3,
                                              keep_dim=True) /
                            (feat.shape[1] * feat.shape[2] * feat.shape[3]) +
                            1e-10)**(1 / self.normalize_power)
                    feat = broadcast_op(feat, norm, 'div')
                    new_feat_list.append(feat)
                feat_list = new_feat_list

            # To numpy
            feat_list = TensorList([f.numpy() for f in feat_list])
        return feat_list
Exemple #12
0
    def test_forward(self):
        entity_embedding, relation_embedding, transfer_matrix = self.create_share_variables(
        )

        rel_matrix = layers.reshape(
            self.lookup_table(self.test_input[1], transfer_matrix),
            [self.hidden_size, self.hidden_size])
        entity_embedding_trans = layers.matmul(entity_embedding, rel_matrix,
                                               False, False)
        rel_vec = self.lookup_table(self.test_input[1], relation_embedding)
        entity_embedding_trans = layers.l2_normalize(entity_embedding_trans,
                                                     axis=-1)
        rel_vec = layers.l2_normalize(rel_vec, axis=-1)
        head_vec = self.lookup_table(self.test_input[0],
                                     entity_embedding_trans)
        tail_vec = self.lookup_table(self.test_input[2],
                                     entity_embedding_trans)
        id_replace_head = layers.reduce_sum(layers.abs(entity_embedding_trans +
                                                       rel_vec - tail_vec),
                                            dim=1)
        id_replace_tail = layers.reduce_sum(layers.abs(entity_embedding_trans -
                                                       rel_vec - head_vec),
                                            dim=1)
        return [id_replace_head, id_replace_tail]
Exemple #13
0
 def forward(self, img, label, mask=None, return_loss=True):
     outs = self.backbone(img)
     outs = self.neck(outs)
     if return_loss:
         s = img + outs[-1]
         cls_out = self.avgpool(self.head(s)[-1])
         cls_out = L.squeeze(cls_out, axes=[2, 3])
         if self.dropout:
             cls_out = L.dropout(cls_out, dropout_prob=self.dropout)
         cls_out = self.fc(cls_out)
         losses = self.get_losses(outs, cls_out, mask, label)
         return losses
     else:
         cue = L.abs(outs[-1]).numpy()
         return cue
Exemple #14
0
    def func(self, place):
        # the shape of input variable should be clearly specified, not inlcude -1.
        shape = [2, 3, 7, 9]
        eps = 1e-6
        dtype = np.float64

        x = layers.data('x', shape, False, dtype)
        x.persistable = True
        y = layers.abs(x)
        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
        # Because we set delta = 0.005 in calculating numeric gradient,
        # if x is too small, the numeric gradient is inaccurate.
        # we should avoid this
        x_arr[np.abs(x_arr) < 0.005] = 0.02

        gradient_checker.double_grad_check(
            [x], y, x_init=x_arr, place=place, eps=eps)
Exemple #15
0
 def get_losses(self, out, cls_out, mask, gt_labels):
     loss_cls = L.mean(L.cross_entropy(cls_out,
                                       gt_labels)) * self.train_cfg['w_cls']
     cue = out[-1] if self.train_cfg['with_mask'] else L.elementwise_mul(
         out[-1], L.cast(gt_labels, 'float32'), axis=0)
     num_reg = L.cast(
         L.reduce_sum(gt_labels) * cue.shape[1] * cue.shape[2] *
         cue.shape[3], 'float32')
     loss_reg = L.reduce_sum(
         L.abs(mask - cue)) / (num_reg + 1e-8) * self.train_cfg['w_reg']
     loss_tir = 0
     for feat in out[:-1]:
         feat = L.squeeze(self.avgpool(feat), axes=[2, 3])
         loss_tir += self.triple_loss(feat,
                                      gt_labels) * self.train_cfg['w_tri']
     loss = loss_cls + loss_reg + loss_tir
     return dict(loss_cls=loss_cls,
                 loss_reg=loss_reg,
                 loss_tir=loss_tir,
                 loss=loss)
Exemple #16
0
    def get_feature(self, im: np.ndarray):
        """Get the feature. Generally, call this function.
        args:
            im: image patch
        """

        # Return empty tensor if it should not be used
        is_color = im.shape[1] == 3
        if is_color and not self.use_for_color or not is_color and not self.use_for_gray:
            return np.array([])

        # Extract feature
        feat = self.extract(im)

        # Pool/downsample
        with fluid.dygraph.guard():
            feat = n2p(feat)

            if self.output_size is not None:
                feat = layers.adaptive_pool2d(feat, self.output_size, 'avg')
            elif self.pool_stride != 1:
                feat = layers.pool2d(
                    feat,
                    self.pool_stride,
                    pool_stride=self.pool_stride,
                    pool_type='avg')

            # Normalize
            if self.normalize_power is not None:
                feat /= (
                    layers.reduce_sum(
                        layers.reshape(
                            layers.abs(feat), [feat.shape[0], 1, 1, -1])**
                        self.normalize_power,
                        dim=3,
                        keep_dim=True) /
                    (feat.shape[1] * feat.shape[2] * feat.shape[3]) + 1e-10)**(
                        1 / self.normalize_power)

            feat = feat.numpy()
        return feat
Exemple #17
0
    def l1_loss(self, prediction, target, mask, priority_bin=None):
        """L1 loss for spectrogram.

        Args:
            prediction (Variable): shape(B, T, C), dtype float32, predicted spectrogram.
            target (Variable): shape(B, T, C), dtype float32, target spectrogram.
            mask (Variable): shape(B, T), mask.
            priority_bin (int, optional): frequency bands for linear spectrogram loss to be prioritized. Defaults to None.

        Returns:
            Variable: shape(1,), dtype float32, l1 loss(with mask and possibly priority bin applied.)
        """
        abs_diff = F.abs(prediction - target)

        # basic mask-weighted l1 loss
        w = self.masked_weight
        if w > 0 and mask is not None:
            base_l1_loss = w * masked_mean(abs_diff, mask) \
                         + (1 - w) * F.reduce_mean(abs_diff)
        else:
            base_l1_loss = F.reduce_mean(abs_diff)

        if self.priority_weight > 0 and priority_bin is not None:
            # mask-weighted priority channels' l1-loss
            priority_abs_diff = abs_diff[:, :, :priority_bin]
            if w > 0 and mask is not None:
                priority_loss = w * masked_mean(priority_abs_diff, mask) \
                              + (1 - w) * F.reduce_mean(priority_abs_diff)
            else:
                priority_loss = F.reduce_mean(priority_abs_diff)

            # priority weighted sum
            p = self.priority_weight
            loss = p * priority_loss + (1 - p) * base_l1_loss
        else:
            loss = base_l1_loss
        return loss
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output,
                                        'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])

    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(
            cfg['train']['grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     optimizer=optimizer,
                                     checkpoint_dir=os.path.join(
                                         args.output, 'checkpoints'),
                                     iteration=args.iteration,
                                     checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(cfg['audio'],
                            place,
                            args.data,
                            cfg['train']['batch_size'],
                            nranks,
                            local_rank,
                            shuffle=True).reader()

    for epoch in range(cfg['train']['max_epochs']):
        pbar = tqdm(reader)
        for i, data in enumerate(pbar):
            pbar.set_description('Processing at epoch %d' % epoch)
            character, mel, mel_input, pos_text, pos_mel = data

            global_step += 1

            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                character, mel_input, pos_text, pos_mel)

            mel_loss = layers.mean(
                layers.abs(layers.elementwise_sub(mel_pred, mel)))
            post_mel_loss = layers.mean(
                layers.abs(layers.elementwise_sub(postnet_pred, mel)))
            loss = mel_loss + post_mel_loss

            # Note: When used stop token loss the learning did not work.
            if cfg['network']['stop_token']:
                label = (pos_mel == 0).astype(np.float32)
                stop_loss = cross_entropy(stop_preds, label)
                loss = loss + stop_loss

            if local_rank == 0:
                writer.add_scalars(
                    'training_loss', {
                        'mel_loss': mel_loss.numpy(),
                        'post_mel_loss': post_mel_loss.numpy()
                    }, global_step)

                if cfg['network']['stop_token']:
                    writer.add_scalar('stop_loss', stop_loss.numpy(),
                                      global_step)

                if parallel:
                    writer.add_scalars(
                        'alphas', {
                            'encoder_alpha':
                            model._layers.encoder.alpha.numpy(),
                            'decoder_alpha':
                            model._layers.decoder.alpha.numpy(),
                        }, global_step)
                else:
                    writer.add_scalars(
                        'alphas', {
                            'encoder_alpha': model.encoder.alpha.numpy(),
                            'decoder_alpha': model.decoder.alpha.numpy(),
                        }, global_step)

                writer.add_scalar('learning_rate',
                                  optimizer._learning_rate.step().numpy(),
                                  global_step)

                if global_step % cfg['train']['image_interval'] == 1:
                    for i, prob in enumerate(attn_probs):
                        for j in range(cfg['network']['decoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image('Attention_%d_0' % global_step,
                                             x,
                                             i * 4 + j,
                                             dataformats="HWC")

                    for i, prob in enumerate(attn_enc):
                        for j in range(cfg['network']['encoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image('Attention_enc_%d_0' %
                                             global_step,
                                             x,
                                             i * 4 + j,
                                             dataformats="HWC")

                    for i, prob in enumerate(attn_dec):
                        for j in range(cfg['network']['decoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image('Attention_dec_%d_0' %
                                             global_step,
                                             x,
                                             i * 4 + j,
                                             dataformats="HWC")

            if parallel:
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            # save checkpoint
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(os.path.join(args.output, 'checkpoints'),
                                   global_step, model, optimizer)

    if local_rank == 0:
        writer.close()
Exemple #19
0
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    model = Vocoder(cfg['train']['batch_size'], cfg['vocoder']['hidden_size'],
                    cfg['audio']['num_mels'], cfg['audio']['n_fft'])

    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(
            cfg['train']['grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     optimizer=optimizer,
                                     checkpoint_dir=os.path.join(
                                         args.output, 'checkpoints'),
                                     iteration=args.iteration,
                                     checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(cfg['audio'],
                            place,
                            args.data,
                            cfg['train']['batch_size'],
                            nranks,
                            local_rank,
                            is_vocoder=True).reader()

    for epoch in range(cfg['train']['max_iteration']):
        pbar = tqdm(reader)
        for i, data in enumerate(pbar):
            pbar.set_description('Processing at epoch %d' % epoch)
            mel, mag = data
            mag = dg.to_variable(mag.numpy())
            mel = dg.to_variable(mel.numpy())
            global_step += 1

            mag_pred = model(mel)
            loss = layers.mean(
                layers.abs(layers.elementwise_sub(mag_pred, mag)))

            if parallel:
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            if local_rank == 0:
                writer.add_scalar('training_loss/loss', loss.numpy(),
                                  global_step)

            # save checkpoint
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(os.path.join(args.output, 'checkpoints'),
                                   global_step, model, optimizer)

    if local_rank == 0:
        writer.close()
Exemple #20
0
def position_id(x, r=0):
    pid = layers.arange(0, x.shape[1], dtype="int32")
    pid = layers.unsqueeze(pid, 0)
    r = layers.cast(layers.ones_like(x), dtype="int32") * r
    return layers.cast(layers.abs(layers.elementwise_sub(pid, r)), dtype='int64')
Exemple #21
0
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])

    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] *
                                        (cfg['train']['learning_rate']**2)),
                                   cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader

    iterator = iter(tqdm(reader))

    global_step += 1

    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        character, mel, mel_input, pos_text, pos_mel, stop_tokens = batch

        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            character, mel_input, pos_text, pos_mel)

        mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(mel_pred, mel)))
        post_mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(postnet_pred, mel)))
        loss = mel_loss + post_mel_loss

        stop_loss = cross_entropy(
            stop_preds, stop_tokens, weight=cfg['network']['stop_loss_weight'])
        loss = loss + stop_loss

        if local_rank == 0:
            writer.add_scalar('training_loss/mel_loss',
                              mel_loss.numpy(),
                              global_step)
            writer.add_scalar('training_loss/post_mel_loss',
                              post_mel_loss.numpy(),
                              global_step)
            writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)

            if parallel:
                writer.add_scalar('alphas/encoder_alpha',
                                   model._layers.encoder.alpha.numpy(),
                                   global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                   model._layers.decoder.alpha.numpy(),
                                   global_step)
            else:
                writer.add_scalar('alphas/encoder_alpha',
                                   model.encoder.alpha.numpy(),
                                   global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                   model.decoder.alpha.numpy(),
                                   global_step)

            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

            if global_step % cfg['train']['image_interval'] == 1:
                for i, prob in enumerate(attn_probs):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_%d_0' % global_step,
                            x,
                            i * 4 + j)

                for i, prob in enumerate(attn_enc):
                    for j in range(cfg['network']['encoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_enc_%d_0' % global_step,
                            x,
                            i * 4 + j)

                for i, prob in enumerate(attn_dec):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_dec_%d_0' % global_step,
                            x,
                            i * 4 + j)

        if parallel:
            loss = model.scale_loss(loss)
            loss.backward()
            model.apply_collective_grads()
        else:
            loss.backward()
        optimizer.minimize(loss)
        model.clear_gradients()

        # save checkpoint
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)
        global_step += 1

    if local_rank == 0:
        writer.close()
Exemple #22
0
 def _abs(x):
     if isinstance(x, PTensor):
         return layers.abs(x)
     else:
         return np.abs(x)
Exemple #23
0
def main(args):
    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if args.use_data_parallel else fluid.CUDAPlace(0)
             if args.use_gpu else fluid.CPUPlace())

    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'fastspeech')

    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        with fluid.unique_name.guard():
            transformer_tts = TransformerTTS(cfg)
            model_dict, _ = load_checkpoint(
                str(args.transformer_step),
                os.path.join(args.transtts_path, "transformer"))
            transformer_tts.set_dict(model_dict)
            transformer_tts.eval()

        model = FastSpeech(cfg)
        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(1 / (
                cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
            parameter_list=model.parameters())
        reader = LJSpeechLoader(
            cfg, args, nranks, local_rank, shuffle=True).reader()

        if args.checkpoint_path is not None:
            model_dict, opti_dict = load_checkpoint(
                str(args.fastspeech_step),
                os.path.join(args.checkpoint_path, "fastspeech"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.fastspeech_step
            print("load checkpoint!!!")

        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        for epoch in range(args.epochs):
            pbar = tqdm(reader)

            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d' % epoch)
                (character, mel, mel_input, pos_text, pos_mel, text_length,
                 mel_lens, enc_slf_mask, enc_query_mask, dec_slf_mask,
                 enc_dec_mask, dec_query_slf_mask, dec_query_mask) = data

                _, _, attn_probs, _, _, _ = transformer_tts(
                    character,
                    mel_input,
                    pos_text,
                    pos_mel,
                    dec_slf_mask=dec_slf_mask,
                    enc_slf_mask=enc_slf_mask,
                    enc_query_mask=enc_query_mask,
                    enc_dec_mask=enc_dec_mask,
                    dec_query_slf_mask=dec_query_slf_mask,
                    dec_query_mask=dec_query_mask)
                alignment, max_attn = get_alignment(attn_probs, mel_lens,
                                                    cfg['transformer_head'])
                alignment = dg.to_variable(alignment).astype(np.float32)

                if local_rank == 0 and global_step % 5 == 1:
                    x = np.uint8(
                        cm.viridis(max_attn[8, :mel_lens.numpy()[8]]) * 255)
                    writer.add_image(
                        'Attention_%d_0' % global_step,
                        x,
                        0,
                        dataformats="HWC")

                global_step += 1

                #Forward
                result = model(
                    character,
                    pos_text,
                    mel_pos=pos_mel,
                    length_target=alignment,
                    enc_non_pad_mask=enc_query_mask,
                    enc_slf_attn_mask=enc_slf_mask,
                    dec_non_pad_mask=dec_query_slf_mask,
                    dec_slf_attn_mask=dec_slf_mask)
                mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
                mel_loss = layers.mse_loss(mel_output, mel)
                mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
                duration_loss = layers.mean(
                    layers.abs(
                        layers.elementwise_sub(duration_predictor_output,
                                               alignment)))
                total_loss = mel_loss + mel_postnet_loss + duration_loss

                if local_rank == 0:
                    writer.add_scalar('mel_loss',
                                      mel_loss.numpy(), global_step)
                    writer.add_scalar('post_mel_loss',
                                      mel_postnet_loss.numpy(), global_step)
                    writer.add_scalar('duration_loss',
                                      duration_loss.numpy(), global_step)
                    writer.add_scalar('learning_rate',
                                      optimizer._learning_rate.step().numpy(),
                                      global_step)

                if args.use_data_parallel:
                    total_loss = model.scale_loss(total_loss)
                    total_loss.backward()
                    model.apply_collective_grads()
                else:
                    total_loss.backward()
                optimizer.minimize(
                    total_loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
                        'grad_clip_thresh']))
                model.clear_gradients()

                # save checkpoint
                if local_rank == 0 and global_step % args.save_step == 0:
                    if not os.path.exists(args.save_path):
                        os.mkdir(args.save_path)
                    save_path = os.path.join(args.save_path,
                                             'fastspeech/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)
        if local_rank == 0:
            writer.close()
def main(args):
    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if args.use_data_parallel else
             fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())

    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'transformer')

    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        model = TransformerTTS(cfg)

        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(
                1 / (cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
            parameter_list=model.parameters())

        if args.checkpoint_path is not None:
            model_dict, opti_dict = load_checkpoint(
                str(args.transformer_step),
                os.path.join(args.checkpoint_path, "transformer"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.transformer_step
            print("load checkpoint!!!")

        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        reader = LJSpeechLoader(cfg, args, nranks, local_rank,
                                shuffle=True).reader()

        for epoch in range(args.epochs):
            pbar = tqdm(reader)
            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d' % epoch)
                character, mel, mel_input, pos_text, pos_mel, text_length, _, enc_slf_mask, enc_query_mask, dec_slf_mask, enc_dec_mask, dec_query_slf_mask, dec_query_mask = data

                global_step += 1

                mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                    character,
                    mel_input,
                    pos_text,
                    pos_mel,
                    dec_slf_mask=dec_slf_mask,
                    enc_slf_mask=enc_slf_mask,
                    enc_query_mask=enc_query_mask,
                    enc_dec_mask=enc_dec_mask,
                    dec_query_slf_mask=dec_query_slf_mask,
                    dec_query_mask=dec_query_mask)

                mel_loss = layers.mean(
                    layers.abs(layers.elementwise_sub(mel_pred, mel)))
                post_mel_loss = layers.mean(
                    layers.abs(layers.elementwise_sub(postnet_pred, mel)))
                loss = mel_loss + post_mel_loss

                # Note: When used stop token loss the learning did not work.
                if args.stop_token:
                    label = (pos_mel == 0).astype(np.float32)
                    stop_loss = cross_entropy(stop_preds, label)
                    loss = loss + stop_loss

                if local_rank == 0:
                    writer.add_scalars(
                        'training_loss', {
                            'mel_loss': mel_loss.numpy(),
                            'post_mel_loss': post_mel_loss.numpy()
                        }, global_step)

                    if args.stop_token:
                        writer.add_scalar('stop_loss', stop_loss.numpy(),
                                          global_step)

                    if args.use_data_parallel:
                        writer.add_scalars(
                            'alphas', {
                                'encoder_alpha':
                                model._layers.encoder.alpha.numpy(),
                                'decoder_alpha':
                                model._layers.decoder.alpha.numpy(),
                            }, global_step)
                    else:
                        writer.add_scalars(
                            'alphas', {
                                'encoder_alpha': model.encoder.alpha.numpy(),
                                'decoder_alpha': model.decoder.alpha.numpy(),
                            }, global_step)

                    writer.add_scalar('learning_rate',
                                      optimizer._learning_rate.step().numpy(),
                                      global_step)

                    if global_step % args.image_step == 1:
                        for i, prob in enumerate(attn_probs):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(prob.numpy()[j * args.batch_size
                                                            // 2]) * 255)
                                writer.add_image('Attention_%d_0' %
                                                 global_step,
                                                 x,
                                                 i * 4 + j,
                                                 dataformats="HWC")

                        for i, prob in enumerate(attn_enc):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(prob.numpy()[j * args.batch_size
                                                            // 2]) * 255)
                                writer.add_image('Attention_enc_%d_0' %
                                                 global_step,
                                                 x,
                                                 i * 4 + j,
                                                 dataformats="HWC")

                        for i, prob in enumerate(attn_dec):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(prob.numpy()[j * args.batch_size
                                                            // 2]) * 255)
                                writer.add_image('Attention_dec_%d_0' %
                                                 global_step,
                                                 x,
                                                 i * 4 + j,
                                                 dataformats="HWC")

                if args.use_data_parallel:
                    loss = model.scale_loss(loss)
                    loss.backward()
                    model.apply_collective_grads()
                else:
                    loss.backward()
                optimizer.minimize(
                    loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(
                        cfg['grad_clip_thresh']))
                model.clear_gradients()

                # save checkpoint
                if local_rank == 0 and global_step % args.save_step == 0:
                    if not os.path.exists(args.save_path):
                        os.mkdir(args.save_path)
                    save_path = os.path.join(args.save_path,
                                             'transformer/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)
        if local_rank == 0:
            writer.close()
Exemple #25
0
def main(args):

    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if args.use_data_parallel else
             fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())

    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'vocoder')

    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        model = Vocoder(cfg, args.batch_size)

        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(
                1 / (cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
            parameter_list=model.parameters())

        if args.checkpoint_path is not None:
            model_dict, opti_dict = load_checkpoint(
                str(args.vocoder_step),
                os.path.join(args.checkpoint_path, "vocoder"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.vocoder_step
            print("load checkpoint!!!")

        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        reader = LJSpeechLoader(cfg, args, nranks, local_rank,
                                is_vocoder=True).reader()

        for epoch in range(args.epochs):
            pbar = tqdm(reader)
            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d' % epoch)
                mel, mag = data
                mag = dg.to_variable(mag.numpy())
                mel = dg.to_variable(mel.numpy())
                global_step += 1

                mag_pred = model(mel)
                loss = layers.mean(
                    layers.abs(layers.elementwise_sub(mag_pred, mag)))

                if args.use_data_parallel:
                    loss = model.scale_loss(loss)
                    loss.backward()
                    model.apply_collective_grads()
                else:
                    loss.backward()
                optimizer.minimize(
                    loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(
                        cfg['grad_clip_thresh']))
                model.clear_gradients()

                if local_rank == 0:
                    writer.add_scalars('training_loss', {
                        'loss': loss.numpy(),
                    }, global_step)

                    if global_step % args.save_step == 0:
                        if not os.path.exists(args.save_path):
                            os.mkdir(args.save_path)
                        save_path = os.path.join(args.save_path,
                                                 'vocoder/%d' % global_step)
                        dg.save_dygraph(model.state_dict(), save_path)
                        dg.save_dygraph(optimizer.state_dict(), save_path)

        if local_rank == 0:
            writer.close()
Exemple #26
0
 def forward(self, x, y):
     return L.mean(L.abs(x - y))
Exemple #27
0
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(dg.parallel.Env()
                            .dev_id) if args.use_gpu else fluid.CPUPlace()
    fluid.enable_dygraph(place)

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output,
                                        'log')) if local_rank == 0 else None

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] *
                                        (cfg['train']['learning_rate']**2)),
                                   cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))
    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        args.alignments_path,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader
    iterator = iter(tqdm(reader))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        (character, mel, pos_text, pos_mel, alignment) = batch

        global_step += 1

        #Forward
        result = model(
            character, pos_text, mel_pos=pos_mel, length_target=alignment)
        mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
        mel_loss = layers.mse_loss(mel_output, mel)
        mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
        duration_loss = layers.mean(
            layers.abs(
                layers.elementwise_sub(duration_predictor_output, alignment)))
        total_loss = mel_loss + mel_postnet_loss + duration_loss

        if local_rank == 0:
            writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
            writer.add_scalar('post_mel_loss',
                              mel_postnet_loss.numpy(), global_step)
            writer.add_scalar('duration_loss',
                              duration_loss.numpy(), global_step)
            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

        if parallel:
            total_loss = model.scale_loss(total_loss)
            total_loss.backward()
            model.apply_collective_grads()
        else:
            total_loss.backward()
        optimizer.minimize(total_loss)
        model.clear_gradients()

        # save checkpoint
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)

    if local_rank == 0:
        writer.close()