def train_forward(self):
    """Build the TransR training graph: project head/tail entities through
    relation-specific matrices, score positive and negative triples, and
    return the margin ranking loss."""
    entity_embedding, relation_embedding, transfer_matrix = self.create_share_variables(
    )
    # Triple batches store (head, relation, tail) in columns 0, 1, 2.
    pos_head = self.lookup_table(self.train_pos_input[:, 0], entity_embedding)
    pos_tail = self.lookup_table(self.train_pos_input[:, 2], entity_embedding)
    pos_rel = self.lookup_table(self.train_pos_input[:, 1], relation_embedding)
    neg_head = self.lookup_table(self.train_neg_input[:, 0], entity_embedding)
    neg_tail = self.lookup_table(self.train_neg_input[:, 2], entity_embedding)
    neg_rel = self.lookup_table(self.train_neg_input[:, 1], relation_embedding)
    # One (hidden, hidden) projection matrix per triple, keyed by relation id.
    rel_matrix = layers.reshape(
        self.lookup_table(self.train_pos_input[:, 1], transfer_matrix),
        [-1, self.hidden_size, self.hidden_size])
    pos_head_trans = self.matmul_with_expend_dims(pos_head, rel_matrix)
    pos_tail_trans = self.matmul_with_expend_dims(pos_tail, rel_matrix)
    rel_matrix_neg = layers.reshape(
        self.lookup_table(self.train_neg_input[:, 1], transfer_matrix),
        [-1, self.hidden_size, self.hidden_size])
    neg_head_trans = self.matmul_with_expend_dims(neg_head, rel_matrix_neg)
    neg_tail_trans = self.matmul_with_expend_dims(neg_tail, rel_matrix_neg)
    pos_score = self._algorithm(pos_head_trans, pos_rel, pos_tail_trans)
    neg_score = self._algorithm(neg_head_trans, neg_rel, neg_tail_trans)
    # L1 norm over the last axis collapses each score vector to a scalar.
    pos = layers.reduce_sum(layers.abs(pos_score), -1, keep_dim=False)
    neg = layers.reduce_sum(layers.abs(neg_score), -1, keep_dim=False)
    neg = layers.reshape(neg, shape=[-1, 1], inplace=True)
    # Margin ranking loss: positive triples should score lower than negatives.
    loss = layers.reduce_mean(layers.relu(pos - neg + self.margin))
    return [loss]
def test_forward(self):
    """Score every entity as a candidate head/tail replacement for the test
    triple and return both L1-distance score vectors (TransE evaluation)."""
    ent_table, rel_table = self.create_share_variables()
    # Evaluation scores are computed on L2-normalized embeddings.
    ent_norm = layers.l2_normalize(ent_table, axis=-1)
    rel_norm = layers.l2_normalize(rel_table, axis=-1)
    h = self.lookup_table(self.test_input[0], ent_norm)
    r = self.lookup_table(self.test_input[1], rel_norm)
    t = self.lookup_table(self.test_input[2], ent_norm)
    # Distance of each entity e substituted as head (e + r ~ t) and as
    # tail (h + r ~ e, written via abs symmetry).
    head_scores = layers.reduce_sum(layers.abs(ent_norm + r - t), dim=1)
    tail_scores = layers.reduce_sum(layers.abs(ent_norm - r - h), dim=1)
    return [head_scores, tail_scores]
def spec_loss(self, decoded, input, num_frames=None):
    """Masked L1 loss between a decoded spectrogram and its target.

    When `num_frames` is given, the <pad> frames of the decoder output are
    excluded from the average; otherwise a plain mean absolute error is used.
    """
    diff = F.abs(decoded - input)
    if num_frames is None:
        return F.reduce_mean(diff)
    # Zero out the <pad> part of the decoder before averaging.
    num_channels = decoded.shape[-1]
    mask = F.sequence_mask(num_frames, dtype="float32")
    masked_diff = diff * F.unsqueeze(mask, axes=[-1])
    # Average over valid elements only: mask sum times the channel count.
    return F.reduce_sum(masked_diff) / F.scale(
        F.reduce_sum(mask), num_channels)
def train_forward(self):
    """Build the TransE training graph and return the margin ranking loss."""
    entity_embedding, relation_embedding = self.create_share_variables()

    def embed_triples(triples):
        # Triple columns are (head, relation, tail).
        h = self.lookup_table(triples[:, 0], entity_embedding)
        r = self.lookup_table(triples[:, 1], relation_embedding)
        t = self.lookup_table(triples[:, 2], entity_embedding)
        return h, r, t

    pos_h, pos_r, pos_t = embed_triples(self.train_pos_input)
    neg_h, neg_r, neg_t = embed_triples(self.train_neg_input)
    # L1 norm of each score vector gives one scalar per triple.
    pos = layers.reduce_sum(
        layers.abs(self._algorithm(pos_h, pos_r, pos_t)), 1, keep_dim=False)
    neg = layers.reduce_sum(
        layers.abs(self._algorithm(neg_h, neg_r, neg_t)), 1, keep_dim=False)
    neg = layers.reshape(neg, shape=[-1, 1], inplace=True)
    # Positives should score at least `margin` below negatives.
    loss = layers.reduce_mean(layers.relu(pos - neg + self.margin))
    return [loss]
def compute_mask_loss(self, occ_mask, warped_image, tgt_image):
    """
    Compute losses on the generated occlusion mask.

    Args:
        occ_mask (tensor): Generated occlusion masks.
        warped_image (tensor): Warped image using the flow map.
        tgt_image (tensor): Target image for the warped image.
    Returns:
        (tensor): Loss for the mask.
    """
    loss_mask = dg.to_variable(np.zeros((1, )).astype("float32"))
    if occ_mask is None:
        return loss_mask
    all_zero = L.zeros_like(occ_mask)
    all_one = L.ones_like(occ_mask)
    # Confidence map from the L1 distance between warped and GT image:
    # near 1 where the warp matches the target, near 0 where it differs.
    img_diff = L.reduce_sum(L.abs(warped_image - tgt_image), 1, keep_dim=True)
    conf = L.clip(1 - img_diff, 0, 1)
    # Pull the mask towards 0 where the warp is good and towards 1 elsewhere.
    loss_mask = self.criterionMasked(occ_mask, all_zero, conf)
    loss_mask += self.criterionMasked(occ_mask, all_one, 1 - conf)
    return loss_mask
def test_abs(self):
    """Smoke test: layers.abs builds a valid op on a static-graph input."""
    program = Program()
    with program_guard(program):
        data_in = layers.data(name="input", shape=[16], dtype="float32")
        result = layers.abs(data_in, name='abs')
        self.assertIsNotNone(result)
    print(str(program))
def compute_flow_losses(self, flow, warped_images, tgt_image, flow_gt,
                        flow_conf_gt, fg_mask, tgt_label, ref_label):
    """
    Compute losses on the generated flow maps.

    Args:
        flow (tensor or list of tensors): Generated flow maps.
        warped_images (tensor or list of tensors): Warped images using the flow maps.
        tgt_image (tensor): Target image for the warped image.
        flow_gt (tensor or list of tensors): Ground truth flow maps.
        flow_conf_gt (tensor or list of tensors): Confidence for the ground truth flow maps.
        fg_mask (tensor): Foreground mask for the target image.
        tgt_label (tensor): Target label map.
        ref_label (tensor): Reference label map.
    Returns:
        (tuple):
          - loss_flow_L1 (tensor): L1 loss compared to ground truth flow.
          - loss_flow_warp (tensor): L1 loss between the warped image and the
            target image when using the flow to warp.
          - body_mask_diff (tensor): Difference between warped body part map
            and target body part map. Used for pose dataset only.
    """
    # Accumulators must be float32: np.zeros defaults to float64, which would
    # mismatch the float32 losses added below (see compute_mask_loss, which
    # uses the same float32 idiom).
    loss_flow_L1 = dg.to_variable(np.zeros((1, )).astype("float32"))
    loss_flow_warp = dg.to_variable(np.zeros((1, )).astype("float32"))
    if isinstance(flow, list):
        # Compute flow losses for both warping reference -> target and
        # previous -> target.
        for i in range(len(flow)):
            loss_flow_L1_i, loss_flow_warp_i = self.compute_flow_loss(
                flow[i], warped_images[i], tgt_image, flow_gt[i],
                flow_conf_gt[i], fg_mask)
            loss_flow_L1 += loss_flow_L1_i
            loss_flow_warp += loss_flow_warp_i
    else:
        # Compute loss for warping either reference or previous images.
        loss_flow_L1, loss_flow_warp = self.compute_flow_loss(
            flow, warped_images, tgt_image, flow_gt[-1], flow_conf_gt[-1],
            fg_mask)

    # For pose dataset only.
    body_mask_diff = None
    if self.warp_ref:
        if self.for_pose_dataset:
            # Warped reference body part map should be similar to the target
            # body part map.
            body_mask = get_part_mask(tgt_label[:, 2])
            ref_body_mask = get_part_mask(ref_label[:, 2])
            warped_ref_body_mask = resample(ref_body_mask, flow[0])
            loss_flow_warp += self.criterion(warped_ref_body_mask, body_mask)
            body_mask_diff = L.reduce_sum(
                L.abs(warped_ref_body_mask - body_mask), 1, keep_dim=True)
        if self.has_fg:
            # Warped reference foreground map should be similar to the target
            # foreground map.
            fg_mask, ref_fg_mask = get_fg_mask([tgt_label, ref_label], True)
            warped_ref_fg_mask = resample(ref_fg_mask, flow[0])
            loss_flow_warp += self.criterion(warped_ref_fg_mask, fg_mask)

    return loss_flow_L1, loss_flow_warp, body_mask_diff
def get_enc_bias(source_inputs):
    """Build an attention bias that masks all-zero (padding) positions.

    Positions whose embedding is entirely zero receive a large negative
    bias (-1e9) so softmax attention effectively ignores them.
    """
    inputs_f32 = layers.cast(source_inputs, 'float32')
    # A position is padding iff the L1 norm of its embedding is exactly zero.
    abs_sum = layers.reduce_sum(layers.abs(inputs_f32), dim=-1)
    zero = layers.fill_constant([1], 'float32', value=0)
    is_pad = layers.cast(layers.equal(abs_sum, zero), 'float32')
    bias = is_pad * -1e9
    # Two singleton axes let the bias broadcast over heads and query steps.
    return layers.unsqueeze(layers.unsqueeze(bias, axes=[1]), axes=[1])
def binary_op(u_embed, v_embed, binary_op_type):
    """Combine two node embeddings into a single edge embedding.

    Args:
        u_embed: embedding of the edge's first endpoint.
        v_embed: embedding of the edge's second endpoint.
        binary_op_type (str): one of "Average", "Hadamard", "Weighted-L1"
            or "Weighted-L2".

    Returns:
        The combined edge embedding.

    Raises:
        ValueError: if `binary_op_type` is not a supported operator.
    """
    if binary_op_type == "Average":
        edge_embed = (u_embed + v_embed) / 2
    elif binary_op_type == "Hadamard":
        edge_embed = u_embed * v_embed
    elif binary_op_type == "Weighted-L1":
        edge_embed = l.abs(u_embed - v_embed)
    elif binary_op_type == "Weighted-L2":
        edge_embed = (u_embed - v_embed) * (u_embed - v_embed)
    else:
        # Fixed the error-message grammar ("doesn't exists").
        raise ValueError(binary_op_type + " binary_op_type doesn't exist")
    return edge_embed
def compute_neuron_head_importance(args, model, dev_ds, place, model_cfg):
    """Estimate attention-head and FFN-neuron importance scores for pruning.

    Runs the dev set through `model` with a differentiable all-ones head mask
    and accumulates absolute gradients: a large gradient magnitude means the
    head/neuron has a large effect on the task loss.

    Returns:
        (head_importance, neuron_importance): a [n_layers, n_heads] tensor of
        head scores and a list (one numpy array per layer) of neuron scores.
    """
    n_layers, n_heads = model_cfg['num_hidden_layers'], model_cfg[
        'num_attention_heads']
    head_importance = L.zeros(shape=[n_layers, n_heads], dtype='float32')
    # An all-ones mask leaves the forward pass unchanged, but its gradient
    # measures each head's influence on the loss.
    head_mask = L.ones(shape=[n_layers, n_heads], dtype='float32')
    head_mask.stop_gradient = False

    # Collect FFN intermediate ('ffn.i') and output ('ffn.o') parameters so
    # neuron importance can be computed from their gradients.
    intermediate_weight = []
    intermediate_bias = []
    output_weight = []

    for name, w in model.named_parameters():
        if 'ffn.i' in name:
            if len(w.shape) > 1:
                intermediate_weight.append(w)
            else:
                intermediate_bias.append(w)

        if 'ffn.o' in name:
            if len(w.shape) > 1:
                output_weight.append(w)

    neuron_importance = []
    for w in intermediate_weight:
        neuron_importance.append(np.zeros(shape=[w.shape[1]], dtype='float32'))

    # MNLI is evaluated on both matched and mismatched dev sets.
    eval_task_names = ('mnli', 'mnli-mm') if args.task == 'mnli' else (
        args.task, )

    for eval_task in eval_task_names:
        for batch in dev_ds.start(place):
            ids, sids, label = batch
            out = model(ids,
                        sids,
                        labels=label,
                        head_mask=head_mask,
                        num_layers=model_cfg['num_hidden_layers'])
            loss = out[0]
            loss.backward()
            # |dL/d(mask)| accumulated over batches scores each head.
            head_importance += L.abs(FD.to_variable(head_mask.gradient()))

            # Neuron importance: |w * dL/dw| summed over the matching axis of
            # the intermediate weights (plus bias) and output weights.
            for w1, b1, w2, current_importance in zip(intermediate_weight,
                                                      intermediate_bias,
                                                      output_weight,
                                                      neuron_importance):
                current_importance += np.abs(
                    (np.sum(w1.numpy() * w1.gradient(), axis=0) +
                     b1.numpy() * b1.gradient()))
                current_importance += np.abs(
                    np.sum(w2.numpy() * w2.gradient(), axis=1))

    return head_importance, neuron_importance
def get_feature(self, im: np.ndarray): """Get the feature. Generally, call this function. args: im: image patch """ # Return empty tensor if it should not be used is_color = im.shape[1] == 3 if is_color and not self.use_for_color or not is_color and not self.use_for_gray: return np.array([]) feat_list = self.extract(im) output_sz = [None] * len( feat_list) if self.output_size is None else self.output_size # Pool/downsample with fluid.dygraph.guard(): feat_list = [n2p(f) for f in feat_list] for i, (sz, s) in enumerate(zip(output_sz, self.pool_stride)): if sz is not None: feat_list[i] = layers.adaptive_pool2d(feat_list[i], sz, pool_type='avg') elif s != 1: feat_list[i] = layers.pool2d(feat_list[i], s, pool_stride=s, pool_type='avg') # Normalize if self.normalize_power is not None: new_feat_list = [] for feat in feat_list: norm = (layers.reduce_sum(layers.reshape( layers.abs(feat), [feat.shape[0], 1, 1, -1])** self.normalize_power, dim=3, keep_dim=True) / (feat.shape[1] * feat.shape[2] * feat.shape[3]) + 1e-10)**(1 / self.normalize_power) feat = broadcast_op(feat, norm, 'div') new_feat_list.append(feat) feat_list = new_feat_list # To numpy feat_list = TensorList([f.numpy() for f in feat_list]) return feat_list
def test_forward(self):
    """Score every entity as a head/tail replacement for a single test triple
    under the TransR relation-specific projection."""
    entity_embedding, relation_embedding, transfer_matrix = self.create_share_variables(
    )
    # Projection matrix for the test triple's relation; reshaping to
    # (hidden, hidden) assumes test_input holds one triple — TODO confirm.
    rel_matrix = layers.reshape(
        self.lookup_table(self.test_input[1], transfer_matrix),
        [self.hidden_size, self.hidden_size])
    # Project every entity into the relation space, then L2-normalize.
    entity_embedding_trans = layers.matmul(entity_embedding, rel_matrix, False,
                                           False)
    rel_vec = self.lookup_table(self.test_input[1], relation_embedding)
    entity_embedding_trans = layers.l2_normalize(entity_embedding_trans,
                                                 axis=-1)
    rel_vec = layers.l2_normalize(rel_vec, axis=-1)
    head_vec = self.lookup_table(self.test_input[0], entity_embedding_trans)
    tail_vec = self.lookup_table(self.test_input[2], entity_embedding_trans)
    # L1 distance of every projected entity substituted as head (e + r - t)
    # or as tail (h + r - e, written via abs symmetry).
    id_replace_head = layers.reduce_sum(
        layers.abs(entity_embedding_trans + rel_vec - tail_vec), dim=1)
    id_replace_tail = layers.reduce_sum(
        layers.abs(entity_embedding_trans - rel_vec - head_vec), dim=1)
    return [id_replace_head, id_replace_tail]
def forward(self, img, label, mask=None, return_loss=True):
    """Run backbone + neck; return training losses, or — in inference
    mode — the absolute cue map as a numpy array."""
    feats = self.neck(self.backbone(img))
    if not return_loss:
        # Inference: the magnitude of the last feature map is the cue.
        return L.abs(feats[-1]).numpy()
    # Residual-style input: add the predicted cue back onto the image.
    enhanced = img + feats[-1]
    pooled = self.avgpool(self.head(enhanced)[-1])
    logits = L.squeeze(pooled, axes=[2, 3])
    if self.dropout:
        logits = L.dropout(logits, dropout_prob=self.dropout)
    logits = self.fc(logits)
    return self.get_losses(feats, logits, mask, label)
def func(self, place):
    """Run a double-gradient (second-order) check for layers.abs."""
    # The shape must be fully specified (no -1) for the gradient checker.
    data_shape = [2, 3, 7, 9]
    dtype = np.float64
    x = layers.data('x', data_shape, False, dtype)
    x.persistable = True
    y = layers.abs(x)
    x_init = np.random.uniform(-1, 1, data_shape).astype(dtype)
    # Numeric gradients use delta = 0.005 and abs has a kink at 0, so move
    # near-zero samples away from the kink to keep the check accurate.
    x_init[np.abs(x_init) < 0.005] = 0.02
    gradient_checker.double_grad_check(
        [x], y, x_init=x_init, place=place, eps=1e-6)
def get_losses(self, out, cls_out, mask, gt_labels):
    """Assemble classification, cue-regression and triplet losses into a dict."""
    cfg = self.train_cfg
    # Weighted softmax classification loss.
    loss_cls = L.mean(L.cross_entropy(cls_out, gt_labels)) * cfg['w_cls']

    # Without mask supervision, zero the cue for negative (label 0) samples.
    if cfg['with_mask']:
        cue = out[-1]
    else:
        cue = L.elementwise_mul(out[-1], L.cast(gt_labels, 'float32'), axis=0)
    # Normalize by the number of supervised cue elements; 1e-8 avoids /0.
    num_reg = L.cast(
        L.reduce_sum(gt_labels) * cue.shape[1] * cue.shape[2] * cue.shape[3],
        'float32')
    loss_reg = L.reduce_sum(L.abs(mask - cue)) / (num_reg +
                                                  1e-8) * cfg['w_reg']

    # Triplet loss over globally pooled intermediate features.
    loss_tir = 0
    for feat in out[:-1]:
        pooled = L.squeeze(self.avgpool(feat), axes=[2, 3])
        loss_tir += self.triple_loss(pooled, gt_labels) * cfg['w_tri']

    total = loss_cls + loss_reg + loss_tir
    return dict(loss_cls=loss_cls,
                loss_reg=loss_reg,
                loss_tir=loss_tir,
                loss=total)
def get_feature(self, im: np.ndarray): """Get the feature. Generally, call this function. args: im: image patch """ # Return empty tensor if it should not be used is_color = im.shape[1] == 3 if is_color and not self.use_for_color or not is_color and not self.use_for_gray: return np.array([]) # Extract feature feat = self.extract(im) # Pool/downsample with fluid.dygraph.guard(): feat = n2p(feat) if self.output_size is not None: feat = layers.adaptive_pool2d(feat, self.output_size, 'avg') elif self.pool_stride != 1: feat = layers.pool2d( feat, self.pool_stride, pool_stride=self.pool_stride, pool_type='avg') # Normalize if self.normalize_power is not None: feat /= ( layers.reduce_sum( layers.reshape( layers.abs(feat), [feat.shape[0], 1, 1, -1])** self.normalize_power, dim=3, keep_dim=True) / (feat.shape[1] * feat.shape[2] * feat.shape[3]) + 1e-10)**( 1 / self.normalize_power) feat = feat.numpy() return feat
def l1_loss(self, prediction, target, mask, priority_bin=None):
    """L1 loss for spectrogram.

    Args:
        prediction (Variable): shape(B, T, C), dtype float32, predicted
            spectrogram.
        target (Variable): shape(B, T, C), dtype float32, target spectrogram.
        mask (Variable): shape(B, T), mask.
        priority_bin (int, optional): frequency bands for linear spectrogram
            loss to be prioritized. Defaults to None.

    Returns:
        Variable: shape(1,), dtype float32, l1 loss (with mask and possibly
        priority bin applied).
    """
    w = self.masked_weight

    def weighted_mean(diff):
        # Blend masked and unmasked means by the configured mask weight.
        if w > 0 and mask is not None:
            return w * masked_mean(diff, mask) + (1 - w) * F.reduce_mean(diff)
        return F.reduce_mean(diff)

    abs_diff = F.abs(prediction - target)
    base_l1_loss = weighted_mean(abs_diff)

    if self.priority_weight > 0 and priority_bin is not None:
        # Emphasize the lowest `priority_bin` frequency channels.
        priority_loss = weighted_mean(abs_diff[:, :, :priority_bin])
        p = self.priority_weight
        return p * priority_loss + (1 - p) * base_l1_loss
    return base_l1_loss
def main(args):
    """Train TransformerTTS on LJSpeech by epochs; supports multi-GPU
    data-parallel training, checkpoint resume, and TensorBoard logging."""
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    # Only rank 0 writes logs and checkpoints.
    writer = SummaryWriter(os.path.join(args.output,
                                        'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])

    model.train()
    # Noam LR schedule with global-norm gradient clipping.
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(
            cfg['train']['grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     optimizer=optimizer,
                                     checkpoint_dir=os.path.join(
                                         args.output, 'checkpoints'),
                                     iteration=args.iteration,
                                     checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(cfg['audio'],
                            place,
                            args.data,
                            cfg['train']['batch_size'],
                            nranks,
                            local_rank,
                            shuffle=True).reader()

    for epoch in range(cfg['train']['max_epochs']):
        pbar = tqdm(reader)
        for i, data in enumerate(pbar):
            pbar.set_description('Processing at epoch %d' % epoch)
            character, mel, mel_input, pos_text, pos_mel = data

            global_step += 1

            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                character, mel_input, pos_text, pos_mel)

            # L1 losses on the decoder output and the postnet-refined output.
            mel_loss = layers.mean(
                layers.abs(layers.elementwise_sub(mel_pred, mel)))
            post_mel_loss = layers.mean(
                layers.abs(layers.elementwise_sub(postnet_pred, mel)))
            loss = mel_loss + post_mel_loss

            # Note: When used stop token loss the learning did not work.
            if cfg['network']['stop_token']:
                label = (pos_mel == 0).astype(np.float32)
                stop_loss = cross_entropy(stop_preds, label)
                loss = loss + stop_loss

            if local_rank == 0:
                writer.add_scalars(
                    'training_loss', {
                        'mel_loss': mel_loss.numpy(),
                        'post_mel_loss': post_mel_loss.numpy()
                    }, global_step)

                if cfg['network']['stop_token']:
                    writer.add_scalar('stop_loss', stop_loss.numpy(),
                                      global_step)

                # DataParallel wraps the model, so alphas live on ._layers.
                if parallel:
                    writer.add_scalars(
                        'alphas', {
                            'encoder_alpha':
                            model._layers.encoder.alpha.numpy(),
                            'decoder_alpha':
                            model._layers.decoder.alpha.numpy(),
                        }, global_step)
                else:
                    writer.add_scalars(
                        'alphas', {
                            'encoder_alpha': model.encoder.alpha.numpy(),
                            'decoder_alpha': model.decoder.alpha.numpy(),
                        }, global_step)

                writer.add_scalar('learning_rate',
                                  optimizer._learning_rate.step().numpy(),
                                  global_step)

                # Periodically dump attention maps as images.
                if global_step % cfg['train']['image_interval'] == 1:
                    for i, prob in enumerate(attn_probs):
                        for j in range(cfg['network']['decoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image('Attention_%d_0' % global_step,
                                             x,
                                             i * 4 + j,
                                             dataformats="HWC")

                    for i, prob in enumerate(attn_enc):
                        for j in range(cfg['network']['encoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image('Attention_enc_%d_0' %
                                             global_step,
                                             x,
                                             i * 4 + j,
                                             dataformats="HWC")

                    for i, prob in enumerate(attn_dec):
                        for j in range(cfg['network']['decoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image('Attention_dec_%d_0' %
                                             global_step,
                                             x,
                                             i * 4 + j,
                                             dataformats="HWC")

            if parallel:
                # Scale the loss and all-reduce gradients across ranks.
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            # save checkpoint
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(os.path.join(args.output, 'checkpoints'),
                                   global_step, model, optimizer)

    if local_rank == 0:
        writer.close()
def main(args):
    """Train the vocoder (mel -> linear magnitude spectrogram) on LJSpeech;
    supports multi-GPU data-parallel training and checkpoint resume."""
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    # Only rank 0 writes logs and checkpoints.
    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    model = Vocoder(cfg['train']['batch_size'], cfg['vocoder']['hidden_size'],
                    cfg['audio']['num_mels'], cfg['audio']['n_fft'])

    model.train()
    # Noam LR schedule with global-norm gradient clipping.
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(
            cfg['train']['grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     optimizer=optimizer,
                                     checkpoint_dir=os.path.join(
                                         args.output, 'checkpoints'),
                                     iteration=args.iteration,
                                     checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(cfg['audio'],
                            place,
                            args.data,
                            cfg['train']['batch_size'],
                            nranks,
                            local_rank,
                            is_vocoder=True).reader()

    for epoch in range(cfg['train']['max_iteration']):
        pbar = tqdm(reader)
        for i, data in enumerate(pbar):
            pbar.set_description('Processing at epoch %d' % epoch)
            mel, mag = data
            mag = dg.to_variable(mag.numpy())
            mel = dg.to_variable(mel.numpy())
            global_step += 1

            mag_pred = model(mel)
            # L1 loss between predicted and target magnitude spectrograms.
            loss = layers.mean(
                layers.abs(layers.elementwise_sub(mag_pred, mag)))

            if parallel:
                # Scale the loss and all-reduce gradients across ranks.
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            if local_rank == 0:
                writer.add_scalar('training_loss/loss', loss.numpy(),
                                  global_step)

            # save checkpoint
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(os.path.join(args.output, 'checkpoints'),
                                   global_step, model, optimizer)

    if local_rank == 0:
        writer.close()
def position_id(x, r=0):
    """Return int64 position ids for sequence `x`: the absolute distance of
    each position index from the reference position `r`."""
    positions = layers.unsqueeze(
        layers.arange(0, x.shape[1], dtype="int32"), 0)
    # Broadcast the scalar reference to x's shape so the subtraction aligns.
    ref = layers.cast(layers.ones_like(x), dtype="int32") * r
    distance = layers.abs(layers.elementwise_sub(positions, ref))
    return layers.cast(distance, dtype='int64')
def main(args):
    """Train TransformerTTS by iteration count (resumable); supports
    multi-GPU data-parallel training and VisualDL logging."""
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    # Only rank 0 writes logs and checkpoints.
    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])

    model.train()
    # Noam LR schedule with global-norm gradient clipping.
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] *
                                        (cfg['train']['learning_rate']**2)),
                                   cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    # NOTE(review): `.reader` (no call) — presumably an attribute/property on
    # the loader, unlike the sibling scripts that call `.reader()`; confirm.
    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader

    iterator = iter(tqdm(reader))

    global_step += 1

    # Iteration-driven loop; the reader is re-wound on StopIteration.
    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        character, mel, mel_input, pos_text, pos_mel, stop_tokens = batch

        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            character, mel_input, pos_text, pos_mel)

        # L1 losses on the decoder output and the postnet-refined output.
        mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(mel_pred, mel)))
        post_mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(postnet_pred, mel)))
        loss = mel_loss + post_mel_loss

        stop_loss = cross_entropy(
            stop_preds, stop_tokens, weight=cfg['network']['stop_loss_weight'])
        loss = loss + stop_loss

        if local_rank == 0:
            writer.add_scalar('training_loss/mel_loss',
                              mel_loss.numpy(), global_step)
            writer.add_scalar('training_loss/post_mel_loss',
                              post_mel_loss.numpy(), global_step)
            writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)

            # DataParallel wraps the model, so alphas live on ._layers.
            if parallel:
                writer.add_scalar('alphas/encoder_alpha',
                                  model._layers.encoder.alpha.numpy(),
                                  global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                  model._layers.decoder.alpha.numpy(),
                                  global_step)
            else:
                writer.add_scalar('alphas/encoder_alpha',
                                  model.encoder.alpha.numpy(), global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                  model.decoder.alpha.numpy(), global_step)

            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

            # Periodically dump attention maps as images.
            if global_step % cfg['train']['image_interval'] == 1:
                for i, prob in enumerate(attn_probs):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_%d_0' % global_step, x, i * 4 + j)

                for i, prob in enumerate(attn_enc):
                    for j in range(cfg['network']['encoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_enc_%d_0' % global_step, x, i * 4 + j)

                for i, prob in enumerate(attn_dec):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_dec_%d_0' % global_step, x, i * 4 + j)

        if parallel:
            # Scale the loss and all-reduce gradients across ranks.
            loss = model.scale_loss(loss)
            loss.backward()
            model.apply_collective_grads()
        else:
            loss.backward()
        optimizer.minimize(loss)
        model.clear_gradients()

        # save checkpoint
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)
        global_step += 1

    if local_rank == 0:
        writer.close()
def _abs(x):
    """Elementwise absolute value for either a paddle tensor or a numpy array."""
    is_tensor = isinstance(x, PTensor)
    return layers.abs(x) if is_tensor else np.abs(x)
def main(args):
    """Train FastSpeech, distilling phoneme durations from the attention maps
    of a frozen, pretrained TransformerTTS teacher."""
    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if args.use_data_parallel else fluid.CUDAPlace(0)
             if args.use_gpu else fluid.CPUPlace())

    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'fastspeech')

    # Only rank 0 writes logs and checkpoints.
    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        with fluid.unique_name.guard():
            # Frozen teacher; its attention maps supply duration targets.
            transformer_tts = TransformerTTS(cfg)
            model_dict, _ = load_checkpoint(
                str(args.transformer_step),
                os.path.join(args.transtts_path, "transformer"))
            transformer_tts.set_dict(model_dict)
            transformer_tts.eval()

        model = FastSpeech(cfg)
        model.train()
        # Noam LR schedule.
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(1 / (
                cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
            parameter_list=model.parameters())
        reader = LJSpeechLoader(
            cfg, args, nranks, local_rank, shuffle=True).reader()

        if args.checkpoint_path is not None:
            model_dict, opti_dict = load_checkpoint(
                str(args.fastspeech_step),
                os.path.join(args.checkpoint_path, "fastspeech"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.fastspeech_step
            print("load checkpoint!!!")

        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        for epoch in range(args.epochs):
            pbar = tqdm(reader)

            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d' % epoch)
                (character, mel, mel_input, pos_text, pos_mel, text_length,
                 mel_lens, enc_slf_mask, enc_query_mask, dec_slf_mask,
                 enc_dec_mask, dec_query_slf_mask, dec_query_mask) = data

                # Teacher forward pass, run only to obtain attention maps.
                _, _, attn_probs, _, _, _ = transformer_tts(
                    character,
                    mel_input,
                    pos_text,
                    pos_mel,
                    dec_slf_mask=dec_slf_mask,
                    enc_slf_mask=enc_slf_mask,
                    enc_query_mask=enc_query_mask,
                    enc_dec_mask=enc_dec_mask,
                    dec_query_slf_mask=dec_query_slf_mask,
                    dec_query_mask=dec_query_mask)
                # Phoneme durations distilled from the teacher's attention.
                alignment, max_attn = get_alignment(attn_probs, mel_lens,
                                                    cfg['transformer_head'])
                alignment = dg.to_variable(alignment).astype(np.float32)

                if local_rank == 0 and global_step % 5 == 1:
                    # Visualize the alignment of sample 8 in the batch.
                    x = np.uint8(
                        cm.viridis(max_attn[8, :mel_lens.numpy()[8]]) * 255)
                    writer.add_image(
                        'Attention_%d_0' % global_step,
                        x,
                        0,
                        dataformats="HWC")

                global_step += 1

                #Forward
                result = model(
                    character,
                    pos_text,
                    mel_pos=pos_mel,
                    length_target=alignment,
                    enc_non_pad_mask=enc_query_mask,
                    enc_slf_attn_mask=enc_slf_mask,
                    dec_non_pad_mask=dec_query_slf_mask,
                    dec_slf_attn_mask=dec_slf_mask)
                mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
                mel_loss = layers.mse_loss(mel_output, mel)
                mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
                # L1 loss between predicted and distilled durations.
                duration_loss = layers.mean(
                    layers.abs(
                        layers.elementwise_sub(duration_predictor_output,
                                               alignment)))
                total_loss = mel_loss + mel_postnet_loss + duration_loss

                if local_rank == 0:
                    writer.add_scalar('mel_loss',
                                      mel_loss.numpy(), global_step)
                    writer.add_scalar('post_mel_loss',
                                      mel_postnet_loss.numpy(), global_step)
                    writer.add_scalar('duration_loss',
                                      duration_loss.numpy(), global_step)
                    writer.add_scalar('learning_rate',
                                      optimizer._learning_rate.step().numpy(),
                                      global_step)

                if args.use_data_parallel:
                    # Scale the loss and all-reduce gradients across ranks.
                    total_loss = model.scale_loss(total_loss)
                    total_loss.backward()
                    model.apply_collective_grads()
                else:
                    total_loss.backward()
                optimizer.minimize(
                    total_loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
                        'grad_clip_thresh']))
                model.clear_gradients()

                # save checkpoint
                if local_rank == 0 and global_step % args.save_step == 0:
                    if not os.path.exists(args.save_path):
                        os.mkdir(args.save_path)
                    save_path = os.path.join(args.save_path,
                                             'fastspeech/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)

        if local_rank == 0:
            writer.close()
def main(args):
    """Train TransformerTTS (older dygraph API with explicit attention masks)
    by epochs, with optional multi-GPU data-parallel training."""
    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if args.use_data_parallel else fluid.CUDAPlace(0)
             if args.use_gpu else fluid.CPUPlace())

    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'transformer')

    # Only rank 0 writes logs and checkpoints.
    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        model = TransformerTTS(cfg)

        model.train()
        # Noam LR schedule.
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(
                1 / (cfg['warm_up_step'] * (args.lr**2)),
                cfg['warm_up_step']),
            parameter_list=model.parameters())

        if args.checkpoint_path is not None:
            model_dict, opti_dict = load_checkpoint(
                str(args.transformer_step),
                os.path.join(args.checkpoint_path, "transformer"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.transformer_step
            print("load checkpoint!!!")

        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        reader = LJSpeechLoader(cfg, args, nranks, local_rank,
                                shuffle=True).reader()

        for epoch in range(args.epochs):
            pbar = tqdm(reader)
            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d' % epoch)
                character, mel, mel_input, pos_text, pos_mel, text_length, _, enc_slf_mask, enc_query_mask, dec_slf_mask, enc_dec_mask, dec_query_slf_mask, dec_query_mask = data

                global_step += 1

                mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                    character,
                    mel_input,
                    pos_text,
                    pos_mel,
                    dec_slf_mask=dec_slf_mask,
                    enc_slf_mask=enc_slf_mask,
                    enc_query_mask=enc_query_mask,
                    enc_dec_mask=enc_dec_mask,
                    dec_query_slf_mask=dec_query_slf_mask,
                    dec_query_mask=dec_query_mask)

                # L1 losses on decoder output and postnet-refined output.
                mel_loss = layers.mean(
                    layers.abs(layers.elementwise_sub(mel_pred, mel)))
                post_mel_loss = layers.mean(
                    layers.abs(layers.elementwise_sub(postnet_pred, mel)))
                loss = mel_loss + post_mel_loss

                # Note: When used stop token loss the learning did not work.
                if args.stop_token:
                    label = (pos_mel == 0).astype(np.float32)
                    stop_loss = cross_entropy(stop_preds, label)
                    loss = loss + stop_loss

                if local_rank == 0:
                    writer.add_scalars(
                        'training_loss', {
                            'mel_loss': mel_loss.numpy(),
                            'post_mel_loss': post_mel_loss.numpy()
                        }, global_step)

                    if args.stop_token:
                        writer.add_scalar('stop_loss', stop_loss.numpy(),
                                          global_step)

                    # DataParallel wraps the model; alphas live on ._layers.
                    if args.use_data_parallel:
                        writer.add_scalars(
                            'alphas', {
                                'encoder_alpha':
                                model._layers.encoder.alpha.numpy(),
                                'decoder_alpha':
                                model._layers.decoder.alpha.numpy(),
                            }, global_step)
                    else:
                        writer.add_scalars(
                            'alphas', {
                                'encoder_alpha': model.encoder.alpha.numpy(),
                                'decoder_alpha': model.decoder.alpha.numpy(),
                            }, global_step)

                    writer.add_scalar('learning_rate',
                                      optimizer._learning_rate.step().numpy(),
                                      global_step)

                    # Periodically dump attention maps as images.
                    if global_step % args.image_step == 1:
                        for i, prob in enumerate(attn_probs):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(
                                        prob.numpy()[j * args.batch_size // 2])
                                    * 255)
                                writer.add_image('Attention_%d_0' %
                                                 global_step,
                                                 x,
                                                 i * 4 + j,
                                                 dataformats="HWC")

                        for i, prob in enumerate(attn_enc):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(
                                        prob.numpy()[j * args.batch_size // 2])
                                    * 255)
                                writer.add_image('Attention_enc_%d_0' %
                                                 global_step,
                                                 x,
                                                 i * 4 + j,
                                                 dataformats="HWC")

                        for i, prob in enumerate(attn_dec):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(
                                        prob.numpy()[j * args.batch_size // 2])
                                    * 255)
                                writer.add_image('Attention_dec_%d_0' %
                                                 global_step,
                                                 x,
                                                 i * 4 + j,
                                                 dataformats="HWC")

                if args.use_data_parallel:
                    # Scale the loss and all-reduce gradients across ranks.
                    loss = model.scale_loss(loss)
                    loss.backward()
                    model.apply_collective_grads()
                else:
                    loss.backward()
                optimizer.minimize(
                    loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(
                        cfg['grad_clip_thresh']))
                model.clear_gradients()

                # save checkpoint
                if local_rank == 0 and global_step % args.save_step == 0:
                    if not os.path.exists(args.save_path):
                        os.mkdir(args.save_path)
                    save_path = os.path.join(args.save_path,
                                             'transformer/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)

        if local_rank == 0:
            writer.close()
def main(args):
    """Train the vocoder (mel-spectrogram -> magnitude-spectrogram) model.

    Sets up the (optionally data-parallel) dygraph environment, builds the
    model/optimizer, optionally restores a checkpoint, then runs the epoch
    loop: forward pass, L1 loss, backward, clipped optimizer step, scalar
    logging and periodic checkpointing (rank 0 only).
    """
    # Rank/world-size are only meaningful under data parallelism; fall back
    # to a single-process view (rank 0 of 1) otherwise.
    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    # Device selection: per-rank GPU when data-parallel, GPU 0 when
    # args.use_gpu, otherwise CPU.
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if args.use_data_parallel else fluid.CUDAPlace(0)
             if args.use_gpu else fluid.CPUPlace())

    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'vocoder')

    # Only rank 0 writes TensorBoard logs; other ranks keep writer=None and
    # must never touch it.
    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        model = Vocoder(cfg, args.batch_size)
        model.train()
        # Noam (warm-up then inverse-sqrt decay) learning-rate schedule.
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(
                1 / (cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
            parameter_list=model.parameters())

        # Optionally resume model + optimizer state from a saved step.
        if args.checkpoint_path is not None:
            model_dict, opti_dict = load_checkpoint(
                str(args.vocoder_step),
                os.path.join(args.checkpoint_path, "vocoder"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.vocoder_step
            print("load checkpoint!!!")

        # Wrap the model AFTER checkpoint restore so state dict keys match.
        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        reader = LJSpeechLoader(
            cfg, args, nranks, local_rank, is_vocoder=True).reader()

        for epoch in range(args.epochs):
            pbar = tqdm(reader)
            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d' % epoch)
                mel, mag = data
                # Re-wrap as dygraph variables on the chosen place.
                mag = dg.to_variable(mag.numpy())
                mel = dg.to_variable(mel.numpy())
                global_step += 1

                mag_pred = model(mel)
                # L1 reconstruction loss on the predicted magnitude spectrogram.
                loss = layers.mean(
                    layers.abs(layers.elementwise_sub(mag_pred, mag)))

                if args.use_data_parallel:
                    # Scale loss / all-reduce grads for multi-GPU training.
                    loss = model.scale_loss(loss)
                    loss.backward()
                    model.apply_collective_grads()
                else:
                    loss.backward()
                # Global-norm gradient clipping applied at the update step.
                optimizer.minimize(
                    loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(
                        cfg['grad_clip_thresh']))
                model.clear_gradients()

                if local_rank == 0:
                    writer.add_scalars('training_loss', {
                        'loss': loss.numpy(),
                    }, global_step)

                    # Periodic checkpoint (rank 0 only).
                    if global_step % args.save_step == 0:
                        if not os.path.exists(args.save_path):
                            os.mkdir(args.save_path)
                        save_path = os.path.join(args.save_path,
                                                 'vocoder/%d' % global_step)
                        dg.save_dygraph(model.state_dict(), save_path)
                        dg.save_dygraph(optimizer.state_dict(), save_path)

        if local_rank == 0:
            writer.close()
def forward(self, x, y): return L.mean(L.abs(x - y))
def main(args):
    """Train the FastSpeech acoustic model.

    Builds the model, optimizer (Noam schedule + global-norm grad clipping),
    and data reader, restores parameters if available, then runs an
    iteration-bounded training loop with per-step scalar logging and
    periodic checkpointing (rank 0 only).
    """
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(dg.parallel.Env()
                            .dev_id) if args.use_gpu else fluid.CPUPlace()
    fluid.enable_dygraph(place)

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    # Only rank 0 logs to TensorBoard; other ranks keep writer=None.
    writer = SummaryWriter(os.path.join(args.output,
                                        'log')) if local_rank == 0 else None

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    model.train()
    # Noam warm-up schedule; gradient clipping configured on the optimizer
    # (applied automatically at each minimize()).
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] *
                                        (cfg['train']['learning_rate']**2)),
                                   cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))
    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        args.alignments_path,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader
    iterator = iter(tqdm(reader))

    # Load parameters; returns the iteration the checkpoint was saved at
    # (presumably 0 when nothing is restored — verify against io module).
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    # Wrap AFTER parameter loading so state-dict keys line up.
    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            # Reader exhausted: restart it and keep training until the
            # iteration budget is spent.
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        (character, mel, pos_text, pos_mel, alignment) = batch

        global_step += 1

        # Forward pass: teacher-forced with ground-truth durations.
        result = model(
            character, pos_text, mel_pos=pos_mel, length_target=alignment)
        mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
        # MSE on pre- and post-net mel outputs plus L1 duration loss.
        mel_loss = layers.mse_loss(mel_output, mel)
        mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
        duration_loss = layers.mean(
            layers.abs(
                layers.elementwise_sub(duration_predictor_output, alignment)))
        total_loss = mel_loss + mel_postnet_loss + duration_loss

        if local_rank == 0:
            writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
            writer.add_scalar('post_mel_loss',
                              mel_postnet_loss.numpy(), global_step)
            writer.add_scalar('duration_loss',
                              duration_loss.numpy(), global_step)
            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

        if parallel:
            # Scale loss / all-reduce grads for multi-GPU training.
            total_loss = model.scale_loss(total_loss)
            total_loss.backward()
            model.apply_collective_grads()
        else:
            total_loss.backward()
        optimizer.minimize(total_loss)
        model.clear_gradients()

        # Save checkpoint (rank 0 only, every checkpoint_interval steps).
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)

    if local_rank == 0:
        writer.close()