def layer_norm(x, begin_norm_axis=1, epsilon=1e-12, param_attr=None, bias_attr=None):
    """Replace the built-in layer_norm op with this function."""
    helper = LayerHelper('layer_norm', **locals())
    mean = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
    shift_x = layers.elementwise_sub(x=x, y=mean, axis=0)
    variance = layers.reduce_mean(
        layers.square(shift_x), dim=begin_norm_axis, keep_dim=True)
    r_stdev = layers.rsqrt(variance + epsilon)
    norm_x = layers.elementwise_mul(x=shift_x, y=r_stdev, axis=0)

    param_shape = [reduce(lambda x, y: x * y, norm_x.shape[begin_norm_axis:])]
    param_dtype = norm_x.dtype
    scale = helper.create_parameter(
        attr=param_attr,
        shape=param_shape,
        dtype=param_dtype,
        default_initializer=fluid.initializer.Constant(1.))
    bias = helper.create_parameter(
        attr=bias_attr,
        shape=param_shape,
        dtype=param_dtype,
        is_bias=True,
        default_initializer=fluid.initializer.Constant(0.))

    out = layers.elementwise_mul(x=norm_x, y=scale, axis=-1)
    out = layers.elementwise_add(x=out, y=bias, axis=-1)
    return out
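# A minimal NumPy sketch of the same computation, for reference only. It
# assumes a 2-D input normalized over its last axis (begin_norm_axis=1);
# `layer_norm_np`, `gamma`, and `beta` are hypothetical stand-ins for the
# function above and its learned scale/bias parameters.
import numpy as np

def layer_norm_np(x, gamma, beta, epsilon=1e-12):
    # normalize over the trailing axis, then apply scale and shift
    mean = x.mean(axis=-1, keepdims=True)
    var = ((x - mean) ** 2).mean(axis=-1, keepdims=True)
    norm = (x - mean) / np.sqrt(var + epsilon)
    return norm * gamma + beta

x = np.random.rand(2, 4).astype('float32')
out = layer_norm_np(x, np.ones(4, 'float32'), np.zeros(4, 'float32'))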
def log_prob(self, value):
    """Log probability density/mass function.

    Args:
        value (Tensor): The input tensor.

    Returns:
        Tensor: The log probability. Its data type is the same as `value`.
    """
    value = self._check_values_dtype_in_probs(self.low, value)
    if _non_static_mode():
        # ensure value is in (low, high)
        lb_bool = self.low < value
        ub_bool = value < self.high
        lb = _C_ops.cast(lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype',
                         value.dtype)
        ub = _C_ops.cast(ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype',
                         value.dtype)
        return nn.log(lb * ub) - nn.log(self.high - self.low)

    name = self.name + '_log_prob'
    lb_bool = self.low < value
    ub_bool = value < self.high
    lb = tensor.cast(lb_bool, dtype=value.dtype)
    ub = tensor.cast(ub_bool, dtype=value.dtype)
    return elementwise_sub(
        nn.log(lb * ub), nn.log(self.high - self.low), name=name)
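# A small NumPy reference for the closed form above (sketch only;
# `uniform_log_prob_np` is a hypothetical helper, not part of the class):
# the density of Uniform(low, high) is 1 / (high - low) inside the support
# and 0 outside, so the log-probability is -log(high - low) inside and -inf
# outside, which is what log(lb * ub) - log(high - low) evaluates to.
import numpy as np

def uniform_log_prob_np(value, low, high):
    inside = ((low < value) & (value < high)).astype(value.dtype)
    with np.errstate(divide='ignore'):
        # log(0) = -inf marks points outside the support
        return np.log(inside) - np.log(high - low)

print(uniform_log_prob_np(np.array([0.5, 2.0]), 0.0, 1.0))  # [ 0. -inf]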
def batch_scatter(ref, indices, updates, in_place=False, overwrite=False):
    """Scatter `updates` into `ref`, according to the corresponding index in
    `indices` for each batch. Currently, only 2-D Tensors are supported.

    Args:
        ref (Variable): with shape [batch_size, ...]
        indices (Variable): with shape [batch_size, 1]
        updates (Variable): with shape [batch_size]
        in_place (bool): if True, the scatter result is assigned back to `ref`;
            otherwise, a new Tensor is returned. Default is False.
        overwrite (bool): if True, scatter overwrites the corresponding
            elements instead of accumulating into them. Default is False.

    Returns:
        Variable: `ref` if `in_place` is True, otherwise the scattered result.

    Raises:
        NULL

    Examples:
        ref
            [[1, 1, 1],
             [1, 1, 1]]
        indices
            [[2], [1]]
        updates
            [2, 3]
        return
            [[1, 1, 2],
             [1, 3, 1]]
    """
    ref_dtype = ref.dtype
    if ref_dtype not in PaddleVarType.floats:
        ref_in = layers.cast(ref, dtype='float32')
    else:
        ref_in = ref

    if updates.dtype != ref_in.dtype:
        updates = layers.cast(updates, dtype=ref_in.dtype)

    batch_size = layers.cast(layers.shape(ref_in)[0], dtype=indices.dtype)
    zero = layers.fill_constant(shape=[1], dtype=indices.dtype, value=0)
    one = layers.fill_constant(shape=[1], dtype=indices.dtype, value=1)
    batch_indices = layers.unsqueeze(
        layers.range(zero, batch_size, one, dtype=indices.dtype), [1])
    coord = layers.concat([batch_indices, indices], axis=1)
    if overwrite:
        # zero out the target elements first, so the subsequent add overwrites
        mask = layers.gather_nd(ref_in, coord)
        mask = layers.elementwise_sub(layers.zeros_like(mask), mask)
        ref_in = layers.scatter_nd_add(ref_in, coord, mask)

    output = layers.scatter_nd_add(ref_in, coord, updates)

    if ref_dtype not in PaddleVarType.floats:
        output = layers.cast(output, dtype=ref_dtype)
    if in_place:
        layers.assign(output, ref)
        return ref
    else:
        return output
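# A NumPy sketch of the behaviour in the docstring example above (reference
# only, overwrite=True semantics; `batch_scatter_np` is a hypothetical helper):
# each batch row writes its update at the position given by `indices`.
import numpy as np

def batch_scatter_np(ref, indices, updates):
    out = ref.copy()
    out[np.arange(ref.shape[0]), indices[:, 0]] = updates
    return out

ref = np.ones((2, 3), dtype='int64')
print(batch_scatter_np(ref, np.array([[2], [1]]), np.array([2, 3])))
# [[1 1 2]
#  [1 3 1]]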
def forward(self, x):
    """
    Forward process of LayerNorm.
    """
    mean = layers.reduce_mean(
        x,
        dim=list(range(self._begin_norm_axis, len(x.shape))),
        keep_dim=True)
    shift_x = layers.elementwise_sub(x=x, y=mean, axis=0)
    variance = layers.reduce_mean(
        layers.square(shift_x),
        dim=list(range(self._begin_norm_axis, len(x.shape))),
        keep_dim=True)
    r_stdev = layers.rsqrt(variance + self._epsilon)
    norm_x = layers.elementwise_mul(x=shift_x, y=r_stdev, axis=0)
    out = layers.elementwise_mul(x=norm_x, y=self._scale_w, axis=-1)
    out = layers.elementwise_add(x=out, y=self._bias_w, axis=-1)
    return out
def pop(cls, stack_data, mask=True, in_place=True):
    """Pop data from stack_data.

    Args:
        stack_data (StackData): (data, pos) with shape
            ([batch_size, stack_len], [batch_size, 1])
        mask (bool): whether to mask the values returned for empty stacks.
            Default is True.
        in_place (bool): Default is True.

    Returns:
        (Variable1, Variable2, StackData)
        Variable1: the popped values.
            dtype=stack_data.data.dtype, shape=[-1]
        Variable2: whether each popped value is valid. For stacks that were
            already empty on entry, this is False.
            dtype=bool, shape=[-1]
        StackData: `stack_data` itself when `in_place` is True, otherwise a
            new StackData(new_data, new_pos).

    Raises:
        NULL
    """
    data = stack_data.data
    pos = stack_data.pos

    # only non-empty stacks can be popped (i.e. produce a valid value)
    valid_pos = layers.logical_not(cls.empty(stack_data))
    new_pos_delta = layers.cast(valid_pos, dtype=pos.dtype)
    new_pos = layers.elementwise_sub(pos, new_pos_delta)

    # shape = [batch_size]
    output = nn_utils.batch_gather(data, new_pos)

    # mask the values returned for empty stacks
    if mask:
        # shape = [batch_size, 1]
        mask_tag = layers.cast(
            new_pos_delta,
            dtype=data.dtype) if data.dtype != pos.dtype else new_pos_delta
        mask_tag = layers.squeeze(mask_tag, [1])
        output = layers.elementwise_mul(output, mask_tag)

    # after popping, set the original position to 0
    updates = layers.zeros_like(output)
    new_data = nn_utils.batch_scatter(
        data, new_pos, updates, overwrite=True, in_place=in_place)

    if in_place:
        layers.assign(new_pos, pos)
        return output, valid_pos, stack_data
    else:
        return output, valid_pos, StackData(new_data, new_pos)
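# A NumPy sketch of the pop semantics above (reference only; `pop_np` is a
# hypothetical helper): `data` holds the per-batch stack contents and `pos`
# the number of elements on each stack; a pop reads the top element, zeroes
# its slot, and decrements `pos` for non-empty stacks, returning 0 for stacks
# that are already empty.
import numpy as np

def pop_np(data, pos):
    valid = pos[:, 0] > 0
    new_pos = pos[:, 0] - valid.astype(pos.dtype)
    out = data[np.arange(data.shape[0]), new_pos] * valid
    new_data = data.copy()
    new_data[np.arange(data.shape[0]), new_pos] = 0
    return out, valid, (new_data, new_pos[:, None])

out, valid, _ = pop_np(np.array([[7, 8, 0], [0, 0, 0]]), np.array([[2], [0]]))
# out = [8, 0], valid = [True, False]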
def func(self, place):
    # the shape of the input variable should be specified explicitly and must
    # not include -1.
    shape = [2, 3, 4, 5]
    eps = 0.005
    dtype = np.float64

    x = layers.data('x', shape, False, dtype)
    y = layers.data('y', shape[:-1], False, dtype)
    x.persistable = True
    y.persistable = True
    out = layers.elementwise_sub(x, y, axis=0)
    x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
    y_arr = np.random.uniform(-1, 1, shape[:-1]).astype(dtype)

    gradient_checker.double_grad_check(
        [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
def log_prob(self, value):
    """Log probability density/mass function.

    Args:
        value (Tensor): The input tensor.

    Returns:
        Tensor: The log probability. Its data type is the same as `value`.
    """
    name = self.name + '_log_prob'
    value = self._check_values_dtype_in_probs(self.loc, value)

    var = self.scale * self.scale
    log_scale = nn.log(self.scale)
    return elementwise_sub(
        -1. * ((value - self.loc) * (value - self.loc)) / (2. * var),
        log_scale + math.log(math.sqrt(2. * math.pi)),
        name=name)
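# A NumPy reference for the closed form above (sketch only; `normal_log_prob_np`
# is a hypothetical helper): the log-density of Normal(loc, scale) is
# -(x - loc)^2 / (2 * scale^2) - log(scale) - 0.5 * log(2 * pi).
import math
import numpy as np

def normal_log_prob_np(value, loc, scale):
    var = scale * scale
    return (-((value - loc) ** 2) / (2. * var)
            - math.log(scale) - 0.5 * math.log(2. * math.pi))

print(normal_log_prob_np(np.array([0.0, 1.0]), loc=0.0, scale=1.0))
# [-0.9189... -1.4189...]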
def forward(self, output1, output2, label):
    """
    :param output1: [n, 128]
    :param output2: [n, 128]
    :param label: [n, 1]
    :return: (contrastive loss, Euclidean distance, label)
    """
    distance = layers.elementwise_sub(output1, output2)
    distance = layers.square(distance)
    euclidean_distance = layers.reduce_sum(distance, dim=1, keep_dim=True)
    euclidean_distance = layers.sqrt(euclidean_distance)
    loss_contrastive = layers.elementwise_mul(
        1 - label, layers.square(euclidean_distance),
        axis=0) + layers.elementwise_mul(
            label,
            layers.square(
                layers.clamp(self.margin - euclidean_distance, min=0.0)),
            axis=0)
    return loss_contrastive, euclidean_distance.numpy(), label.numpy()
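# A NumPy sketch of the contrastive loss as written above (reference only;
# `contrastive_loss_np` is a hypothetical helper): pairs with label == 0 are
# penalized by d^2, pairs with label == 1 by max(margin - d, 0)^2, where d is
# the Euclidean distance between the two embeddings.
import numpy as np

def contrastive_loss_np(out1, out2, label, margin=1.0):
    d = np.sqrt(((out1 - out2) ** 2).sum(axis=1, keepdims=True))
    return (1 - label) * d ** 2 + label * np.maximum(margin - d, 0.0) ** 2

out1 = np.random.rand(4, 128).astype('float32')
out2 = np.random.rand(4, 128).astype('float32')
label = np.array([[0.], [1.], [0.], [1.]], dtype='float32')
loss = contrastive_loss_np(out1, out2, label)  # shape [4, 1]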
def _push_to_stack(gmr_desc, gmr_pos, gmr_lens, gmr_stack_info):
    """Push the grammar ids in gmr_desc, from gmr_pos to gmr_lens, onto
    gmr_stack, and update step_gmr_pos.

    Args:
        gmr_desc (TYPE): NULL
        gmr_pos (TYPE): NULL
        gmr_lens (TYPE): NULL
        gmr_stack_info (tuple): [in/out] (gmr_stack, gmr_stack_pos)

    Returns:
        tuple (gmr_stack, gmr_stack_pos)

    Raises:
        NULL
    """
    gmr_stack, gmr_stack_pos = gmr_stack_info
    mv_step = layers.cast(
        layers.greater_than(gmr_lens, layers.zeros_like(gmr_lens)),
        dtype=gmr_lens.dtype)
    gmr_mv_pos = layers.elementwise_sub(gmr_lens, mv_step)

    cond = layers.reduce_any(layers.greater_than(gmr_mv_pos, gmr_pos))
    while_op = layers.While(cond)
    with while_op.block():
        gmr_ids = nn_utils.batch_gather(gmr_desc, gmr_mv_pos)
        gmr_stack_tmp, gmr_stack_pos_tmp = data_structure.Stack.push(
            gmr_stack_info, gmr_ids, in_place=False)

        mv_cond = layers.greater_than(gmr_mv_pos, gmr_pos)
        gmr_mv_pos_tmp = fluider.elementwise_sub(gmr_mv_pos, mv_cond, force=True)
        new_gmr_stack, new_gmr_stack_pos = nn_utils.ifelse(
            mv_cond, [gmr_stack_tmp, gmr_stack_pos_tmp],
            [gmr_stack, gmr_stack_pos])

        layers.utils.map_structure(layers.assign,
                                   [new_gmr_stack, new_gmr_stack_pos],
                                   [gmr_stack, gmr_stack_pos])
        layers.assign(gmr_mv_pos_tmp, gmr_mv_pos)
        layers.assign(
            layers.reduce_any(layers.greater_than(gmr_mv_pos, gmr_pos)), cond)
    return gmr_stack, gmr_stack_pos
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output,
                                        'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])

    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader()

    for epoch in range(cfg['train']['max_epochs']):
        pbar = tqdm(reader)
        for i, data in enumerate(pbar):
            pbar.set_description('Processing at epoch %d' % epoch)
            character, mel, mel_input, pos_text, pos_mel = data

            global_step += 1

            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                character, mel_input, pos_text, pos_mel)

            mel_loss = layers.mean(
                layers.abs(layers.elementwise_sub(mel_pred, mel)))
            post_mel_loss = layers.mean(
                layers.abs(layers.elementwise_sub(postnet_pred, mel)))
            loss = mel_loss + post_mel_loss

            # Note: when the stop token loss was used, the learning did not work.
            if cfg['network']['stop_token']:
                label = (pos_mel == 0).astype(np.float32)
                stop_loss = cross_entropy(stop_preds, label)
                loss = loss + stop_loss

            if local_rank == 0:
                writer.add_scalars('training_loss', {
                    'mel_loss': mel_loss.numpy(),
                    'post_mel_loss': post_mel_loss.numpy()
                }, global_step)

                if cfg['network']['stop_token']:
                    writer.add_scalar('stop_loss',
                                      stop_loss.numpy(), global_step)

                if parallel:
                    writer.add_scalars('alphas', {
                        'encoder_alpha': model._layers.encoder.alpha.numpy(),
                        'decoder_alpha': model._layers.decoder.alpha.numpy(),
                    }, global_step)
                else:
                    writer.add_scalars('alphas', {
                        'encoder_alpha': model.encoder.alpha.numpy(),
                        'decoder_alpha': model.decoder.alpha.numpy(),
                    }, global_step)

                writer.add_scalar('learning_rate',
                                  optimizer._learning_rate.step().numpy(),
                                  global_step)

                if global_step % cfg['train']['image_interval'] == 1:
                    for i, prob in enumerate(attn_probs):
                        for j in range(cfg['network']['decoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) * 255)
                            writer.add_image(
                                'Attention_%d_0' % global_step,
                                x,
                                i * 4 + j,
                                dataformats="HWC")

                    for i, prob in enumerate(attn_enc):
                        for j in range(cfg['network']['encoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) * 255)
                            writer.add_image(
                                'Attention_enc_%d_0' % global_step,
                                x,
                                i * 4 + j,
                                dataformats="HWC")

                    for i, prob in enumerate(attn_dec):
                        for j in range(cfg['network']['decoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) * 255)
                            writer.add_image(
                                'Attention_dec_%d_0' % global_step,
                                x,
                                i * 4 + j,
                                dataformats="HWC")

            if parallel:
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            # save checkpoint
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(
                    os.path.join(args.output, 'checkpoints'), global_step,
                    model, optimizer)

    if local_rank == 0:
        writer.close()
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    model = Vocoder(cfg['train']['batch_size'], cfg['vocoder']['hidden_size'],
                    cfg['audio']['num_mels'], cfg['audio']['n_fft'])

    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        is_vocoder=True).reader()

    for epoch in range(cfg['train']['max_iteration']):
        pbar = tqdm(reader)

        for i, data in enumerate(pbar):
            pbar.set_description('Processing at epoch %d' % epoch)
            mel, mag = data
            mag = dg.to_variable(mag.numpy())
            mel = dg.to_variable(mel.numpy())
            global_step += 1

            mag_pred = model(mel)
            loss = layers.mean(
                layers.abs(layers.elementwise_sub(mag_pred, mag)))

            if parallel:
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            if local_rank == 0:
                writer.add_scalar('training_loss/loss',
                                  loss.numpy(), global_step)

            # save checkpoint
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(
                    os.path.join(args.output, 'checkpoints'), global_step,
                    model, optimizer)

    if local_rank == 0:
        writer.close()
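# The training loss above, layers.mean(layers.abs(layers.elementwise_sub(a, b))),
# is just the mean absolute error between prediction and target; a NumPy sketch
# for reference (`mae_np` is a hypothetical helper):
import numpy as np

def mae_np(pred, target):
    return np.abs(pred - target).mean()

pred = np.random.rand(4, 80).astype('float32')
target = np.random.rand(4, 80).astype('float32')
print(mae_np(pred, target))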
def log_softmax(x):
    """log softmax"""
    # log_softmax(x) = x - log(sum(exp(x), axis=-1)); the reduced sum is
    # broadcast back over the rows via axis=0.
    t1 = layers.exp(x)
    t1 = layers.reduce_sum(t1, dim=-1)
    t1 = layers.log(t1)
    return layers.elementwise_sub(x, t1, axis=0)
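# A NumPy equivalent of log_softmax above for a 2-D input (sketch only;
# like the original, it is not stabilized with a max-subtraction):
import numpy as np

def log_softmax_np(x):
    return x - np.log(np.exp(x).sum(axis=-1, keepdims=True))

x = np.array([[1.0, 2.0, 3.0]])
print(log_softmax_np(x))  # [[-2.4076 -1.4076 -0.4076]]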
def position_id(x, r=0):
    pid = layers.arange(0, x.shape[1], dtype="int32")
    pid = layers.unsqueeze(pid, 0)
    r = layers.cast(layers.ones_like(x), dtype="int32") * r
    return layers.cast(
        layers.abs(layers.elementwise_sub(pid, r)), dtype='int64')
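# A NumPy sketch of what position_id computes (reference only;
# `position_id_np` is a hypothetical helper): the absolute distance between
# each position index in [0, seq_len) and the reference offset r.
import numpy as np

def position_id_np(seq_len, r=0):
    return np.abs(np.arange(seq_len, dtype='int32') - r).astype('int64')

print(position_id_np(5, r=2))  # [2 1 0 1 2]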
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(dg.parallel.Env()
                            .dev_id) if args.use_gpu else fluid.CPUPlace()
    fluid.enable_dygraph(place)

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output,
                                        'log')) if local_rank == 0 else None

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] *
                                        (cfg['train']['learning_rate']**2)),
                                   cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))
    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        args.alignments_path,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader
    iterator = iter(tqdm(reader))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        (character, mel, pos_text, pos_mel, alignment) = batch

        global_step += 1

        # Forward
        result = model(
            character, pos_text, mel_pos=pos_mel, length_target=alignment)
        mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
        mel_loss = layers.mse_loss(mel_output, mel)
        mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
        duration_loss = layers.mean(
            layers.abs(
                layers.elementwise_sub(duration_predictor_output, alignment)))
        total_loss = mel_loss + mel_postnet_loss + duration_loss

        if local_rank == 0:
            writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
            writer.add_scalar('post_mel_loss',
                              mel_postnet_loss.numpy(), global_step)
            writer.add_scalar('duration_loss',
                              duration_loss.numpy(), global_step)
            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

        if parallel:
            total_loss = model.scale_loss(total_loss)
            total_loss.backward()
            model.apply_collective_grads()
        else:
            total_loss.backward()
        optimizer.minimize(total_loss)
        model.clear_gradients()

        # save checkpoint
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)

    if local_rank == 0:
        writer.close()
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])

    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] *
                                        (cfg['train']['learning_rate']**2)),
                                   cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader

    iterator = iter(tqdm(reader))

    global_step += 1

    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        character, mel, mel_input, pos_text, pos_mel, stop_tokens = batch

        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            character, mel_input, pos_text, pos_mel)

        mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(mel_pred, mel)))
        post_mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(postnet_pred, mel)))
        loss = mel_loss + post_mel_loss

        stop_loss = cross_entropy(
            stop_preds, stop_tokens, weight=cfg['network']['stop_loss_weight'])
        loss = loss + stop_loss

        if local_rank == 0:
            writer.add_scalar('training_loss/mel_loss',
                              mel_loss.numpy(), global_step)
            writer.add_scalar('training_loss/post_mel_loss',
                              post_mel_loss.numpy(), global_step)
            writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)

            if parallel:
                writer.add_scalar('alphas/encoder_alpha',
                                  model._layers.encoder.alpha.numpy(),
                                  global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                  model._layers.decoder.alpha.numpy(),
                                  global_step)
            else:
                writer.add_scalar('alphas/encoder_alpha',
                                  model.encoder.alpha.numpy(), global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                  model.decoder.alpha.numpy(), global_step)

            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

            if global_step % cfg['train']['image_interval'] == 1:
                for i, prob in enumerate(attn_probs):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image('Attention_%d_0' % global_step, x,
                                         i * 4 + j)

                for i, prob in enumerate(attn_enc):
                    for j in range(cfg['network']['encoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image('Attention_enc_%d_0' % global_step,
                                         x, i * 4 + j)

                for i, prob in enumerate(attn_dec):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image('Attention_dec_%d_0' % global_step,
                                         x, i * 4 + j)

        if parallel:
            loss = model.scale_loss(loss)
            loss.backward()
            model.apply_collective_grads()
        else:
            loss.backward()
        optimizer.minimize(loss)
        model.clear_gradients()

        # save checkpoint
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)
        global_step += 1

    if local_rank == 0:
        writer.close()
def main(args):
    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if args.use_data_parallel else fluid.CUDAPlace(0)
             if args.use_gpu else fluid.CPUPlace())

    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'vocoder')

    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        model = Vocoder(cfg, args.batch_size)

        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(
                1 / (cfg['warm_up_step'] * (args.lr**2)),
                cfg['warm_up_step']),
            parameter_list=model.parameters())

        if args.checkpoint_path is not None:
            model_dict, opti_dict = load_checkpoint(
                str(args.vocoder_step),
                os.path.join(args.checkpoint_path, "vocoder"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.vocoder_step
            print("load checkpoint!!!")

        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        reader = LJSpeechLoader(
            cfg, args, nranks, local_rank, is_vocoder=True).reader()

        for epoch in range(args.epochs):
            pbar = tqdm(reader)

            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d' % epoch)
                mel, mag = data
                mag = dg.to_variable(mag.numpy())
                mel = dg.to_variable(mel.numpy())
                global_step += 1

                mag_pred = model(mel)
                loss = layers.mean(
                    layers.abs(layers.elementwise_sub(mag_pred, mag)))

                if args.use_data_parallel:
                    loss = model.scale_loss(loss)
                    loss.backward()
                    model.apply_collective_grads()
                else:
                    loss.backward()
                optimizer.minimize(
                    loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(
                        cfg['grad_clip_thresh']))
                model.clear_gradients()

                if local_rank == 0:
                    writer.add_scalars('training_loss', {
                        'loss': loss.numpy(),
                    }, global_step)

                    if global_step % args.save_step == 0:
                        if not os.path.exists(args.save_path):
                            os.mkdir(args.save_path)
                        save_path = os.path.join(args.save_path,
                                                 'vocoder/%d' % global_step)
                        dg.save_dygraph(model.state_dict(), save_path)
                        dg.save_dygraph(optimizer.state_dict(), save_path)

        if local_rank == 0:
            writer.close()
def less_than_branch(i, a):
    return layers.cond(i >= 3.0, lambda: layers.elementwise_add(a, a),
                       lambda: layers.elementwise_sub(a, a))
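# A plain-Python sketch of the branch semantics above (reference only;
# `less_than_branch_py` is a hypothetical helper): layers.cond evaluates the
# first callable when the predicate holds and the second otherwise, so the
# result is a + a when i >= 3.0 and a - a (all zeros) otherwise.
def less_than_branch_py(i, a):
    return a + a if i >= 3.0 else a - a

print(less_than_branch_py(5.0, 2.0))  # 4.0
print(less_than_branch_py(1.0, 2.0))  # 0.0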
def main(args):
    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if args.use_data_parallel else fluid.CUDAPlace(0)
             if args.use_gpu else fluid.CPUPlace())

    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'fastspeech')

    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        with fluid.unique_name.guard():
            transformer_tts = TransformerTTS(cfg)
            model_dict, _ = load_checkpoint(
                str(args.transformer_step),
                os.path.join(args.transtts_path, "transformer"))
            transformer_tts.set_dict(model_dict)
            transformer_tts.eval()

        model = FastSpeech(cfg)
        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(
                1 / (cfg['warm_up_step'] * (args.lr**2)),
                cfg['warm_up_step']),
            parameter_list=model.parameters())
        reader = LJSpeechLoader(
            cfg, args, nranks, local_rank, shuffle=True).reader()

        if args.checkpoint_path is not None:
            model_dict, opti_dict = load_checkpoint(
                str(args.fastspeech_step),
                os.path.join(args.checkpoint_path, "fastspeech"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.fastspeech_step
            print("load checkpoint!!!")

        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        for epoch in range(args.epochs):
            pbar = tqdm(reader)

            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d' % epoch)
                (character, mel, mel_input, pos_text, pos_mel, text_length,
                 mel_lens, enc_slf_mask, enc_query_mask, dec_slf_mask,
                 enc_dec_mask, dec_query_slf_mask, dec_query_mask) = data

                _, _, attn_probs, _, _, _ = transformer_tts(
                    character,
                    mel_input,
                    pos_text,
                    pos_mel,
                    dec_slf_mask=dec_slf_mask,
                    enc_slf_mask=enc_slf_mask,
                    enc_query_mask=enc_query_mask,
                    enc_dec_mask=enc_dec_mask,
                    dec_query_slf_mask=dec_query_slf_mask,
                    dec_query_mask=dec_query_mask)
                alignment, max_attn = get_alignment(attn_probs, mel_lens,
                                                    cfg['transformer_head'])
                alignment = dg.to_variable(alignment).astype(np.float32)

                if local_rank == 0 and global_step % 5 == 1:
                    x = np.uint8(
                        cm.viridis(max_attn[8, :mel_lens.numpy()[8]]) * 255)
                    writer.add_image(
                        'Attention_%d_0' % global_step,
                        x,
                        0,
                        dataformats="HWC")

                global_step += 1

                # Forward
                result = model(
                    character,
                    pos_text,
                    mel_pos=pos_mel,
                    length_target=alignment,
                    enc_non_pad_mask=enc_query_mask,
                    enc_slf_attn_mask=enc_slf_mask,
                    dec_non_pad_mask=dec_query_slf_mask,
                    dec_slf_attn_mask=dec_slf_mask)
                mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
                mel_loss = layers.mse_loss(mel_output, mel)
                mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
                duration_loss = layers.mean(
                    layers.abs(
                        layers.elementwise_sub(duration_predictor_output,
                                               alignment)))
                total_loss = mel_loss + mel_postnet_loss + duration_loss

                if local_rank == 0:
                    writer.add_scalar('mel_loss',
                                      mel_loss.numpy(), global_step)
                    writer.add_scalar('post_mel_loss',
                                      mel_postnet_loss.numpy(), global_step)
                    writer.add_scalar('duration_loss',
                                      duration_loss.numpy(), global_step)
                    writer.add_scalar('learning_rate',
                                      optimizer._learning_rate.step().numpy(),
                                      global_step)

                if args.use_data_parallel:
                    total_loss = model.scale_loss(total_loss)
                    total_loss.backward()
                    model.apply_collective_grads()
                else:
                    total_loss.backward()
                optimizer.minimize(
                    total_loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(
                        cfg['grad_clip_thresh']))
                model.clear_gradients()

                # save checkpoint
                if local_rank == 0 and global_step % args.save_step == 0:
                    if not os.path.exists(args.save_path):
                        os.mkdir(args.save_path)
                    save_path = os.path.join(args.save_path,
                                             'fastspeech/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)

        if local_rank == 0:
            writer.close()
def main(args):
    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if args.use_data_parallel else fluid.CUDAPlace(0)
             if args.use_gpu else fluid.CPUPlace())

    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'transformer')

    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        model = TransformerTTS(cfg)

        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(
                1 / (cfg['warm_up_step'] * (args.lr**2)),
                cfg['warm_up_step']),
            parameter_list=model.parameters())

        if args.checkpoint_path is not None:
            model_dict, opti_dict = load_checkpoint(
                str(args.transformer_step),
                os.path.join(args.checkpoint_path, "transformer"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.transformer_step
            print("load checkpoint!!!")

        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        reader = LJSpeechLoader(
            cfg, args, nranks, local_rank, shuffle=True).reader()

        for epoch in range(args.epochs):
            pbar = tqdm(reader)

            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d' % epoch)
                (character, mel, mel_input, pos_text, pos_mel, text_length, _,
                 enc_slf_mask, enc_query_mask, dec_slf_mask, enc_dec_mask,
                 dec_query_slf_mask, dec_query_mask) = data

                global_step += 1

                mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                    character,
                    mel_input,
                    pos_text,
                    pos_mel,
                    dec_slf_mask=dec_slf_mask,
                    enc_slf_mask=enc_slf_mask,
                    enc_query_mask=enc_query_mask,
                    enc_dec_mask=enc_dec_mask,
                    dec_query_slf_mask=dec_query_slf_mask,
                    dec_query_mask=dec_query_mask)

                mel_loss = layers.mean(
                    layers.abs(layers.elementwise_sub(mel_pred, mel)))
                post_mel_loss = layers.mean(
                    layers.abs(layers.elementwise_sub(postnet_pred, mel)))
                loss = mel_loss + post_mel_loss

                # Note: when the stop token loss was used, the learning did not work.
                if args.stop_token:
                    label = (pos_mel == 0).astype(np.float32)
                    stop_loss = cross_entropy(stop_preds, label)
                    loss = loss + stop_loss

                if local_rank == 0:
                    writer.add_scalars('training_loss', {
                        'mel_loss': mel_loss.numpy(),
                        'post_mel_loss': post_mel_loss.numpy()
                    }, global_step)

                    if args.stop_token:
                        writer.add_scalar('stop_loss',
                                          stop_loss.numpy(), global_step)

                    if args.use_data_parallel:
                        writer.add_scalars('alphas', {
                            'encoder_alpha':
                            model._layers.encoder.alpha.numpy(),
                            'decoder_alpha':
                            model._layers.decoder.alpha.numpy(),
                        }, global_step)
                    else:
                        writer.add_scalars('alphas', {
                            'encoder_alpha': model.encoder.alpha.numpy(),
                            'decoder_alpha': model.decoder.alpha.numpy(),
                        }, global_step)

                    writer.add_scalar('learning_rate',
                                      optimizer._learning_rate.step().numpy(),
                                      global_step)

                    if global_step % args.image_step == 1:
                        for i, prob in enumerate(attn_probs):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(prob.numpy()[
                                        j * args.batch_size // 2]) * 255)
                                writer.add_image(
                                    'Attention_%d_0' % global_step,
                                    x,
                                    i * 4 + j,
                                    dataformats="HWC")

                        for i, prob in enumerate(attn_enc):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(prob.numpy()[
                                        j * args.batch_size // 2]) * 255)
                                writer.add_image(
                                    'Attention_enc_%d_0' % global_step,
                                    x,
                                    i * 4 + j,
                                    dataformats="HWC")

                        for i, prob in enumerate(attn_dec):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(prob.numpy()[
                                        j * args.batch_size // 2]) * 255)
                                writer.add_image(
                                    'Attention_dec_%d_0' % global_step,
                                    x,
                                    i * 4 + j,
                                    dataformats="HWC")

                if args.use_data_parallel:
                    loss = model.scale_loss(loss)
                    loss.backward()
                    model.apply_collective_grads()
                else:
                    loss.backward()
                optimizer.minimize(
                    loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(
                        cfg['grad_clip_thresh']))
                model.clear_gradients()

                # save checkpoint
                if local_rank == 0 and global_step % args.save_step == 0:
                    if not os.path.exists(args.save_path):
                        os.mkdir(args.save_path)
                    save_path = os.path.join(args.save_path,
                                             'transformer/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)

        if local_rank == 0:
            writer.close()