def initialize_training(data_root, meta_text, checkpoint_dir=None, model_name=None):
    """Build the data loader, Tacotron model and Adam optimizer for training.

    Args:
        data_root: forwarded to ``Dataloader`` as its first argument.
        meta_text: forwarded to ``Dataloader`` as its second argument.
        checkpoint_dir: directory handed to ``warm_from_ckpt``; only used
            when ``model_name`` is given.
        model_name: checkpoint name handed to ``warm_from_ckpt``. ``None``
            keeps the freshly constructed model and optimizer.

    Returns:
        Tuple ``(model, optimizer, dataloader)``.
    """
    dataloader = Dataloader(data_root, meta_text)

    model = Tacotron(n_vocab=len(symbols),
                     embedding_dim=config.embedding_dim,
                     mel_dim=config.num_mels,
                     linear_dim=config.num_freq,
                     r=config.outputs_per_step,
                     padding_idx=config.padding_idx,
                     attention=config.attention,
                     use_mask=config.use_mask)

    optimizer = optim.Adam(model.parameters(),
                           lr=config.initial_learning_rate,
                           betas=(config.adam_beta1, config.adam_beta2),
                           weight_decay=config.weight_decay)

    # Load checkpoint. Identity test instead of ``!= None`` so an object with
    # a custom ``__ne__`` cannot change the outcome.
    if model_name is not None:
        model, optimizer = warm_from_ckpt(checkpoint_dir, model_name, model, optimizer)

    return model, optimizer, dataloader
def main():
    """Parse the command line, prepare output folders and launch training.

    The derived locations (``in_dir``, ``out_dir``, ``log_dir``, ``meta_dir``
    and ``sample_dir``) are attached to the parsed ``args`` namespace, the
    three output sub-directories are created, hyperparameter overrides are
    parsed, and ``Tacotron(hparams).train(args)`` is run.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    add('--input_dir',
        help='Absolute path to base preprocessed data directory')
    add('--output_dir',
        help='Absolute path to the base output directory')
    add('--dataset_name',
        help='The given dataset has to exist in the given input directory')
    add('--model_name',
        default='main',
        help='name of model to be trained on, defaults to main. This is just used for file-keeping.')
    add('--hparams',
        default='',
        help='Hyperparameter overrides as a comma-separated list of name=value pairs')
    add('--restore_step', type=int,
        help='Global step to restore from checkpoint.')
    add('--summary_interval', type=int, default=100,
        help='Steps between running summary ops.')
    add('--checkpoint_interval', type=int, default=1000,
        help='Steps between writing checkpoints.')
    add('--msg_interval', type=int, default=100,
        help='Interval of general training messages')
    # used for broadcasting training updates to slack.
    add('--slack_url',
        help='Slack webhook URL to get periodic reports.')
    add('--tf_log_level', type=int, default=1,
        help='Tensorflow C++ log level.')

    args = parser.parse_args()
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level)

    # Resolve every location first, then create the output sub-directories.
    args.in_dir = os.path.join(args.input_dir, args.dataset_name)
    args.out_dir = os.path.join(args.output_dir, args.model_name)
    for attribute, leaf in (('log_dir', 'logs'),
                            ('meta_dir', 'meta'),
                            ('sample_dir', 'samples')):
        setattr(args, attribute, os.path.join(args.out_dir, leaf))

    # create the output directories if needed
    for directory in (args.log_dir, args.meta_dir, args.sample_dir):
        os.makedirs(directory, exist_ok=True)

    hparams.parse(args.hparams)
    Tacotron(hparams).train(args)
class Synthesizer():
    """
    Restores a Tacotron checkpoint and renders text as encoded audio bytes
    """

    def init(self, checkpoint_path):
        """
        Build the inference graph and restore its variables

        @type   checkpoint_path    str

        @param  checkpoint_path    path to checkpoint to be restored
        """
        v1 = tf.compat.v1

        print('Constructing Tacotron Model ...')

        # Graph inputs for a single utterance.
        text_ph = v1.placeholder(tf.int32, [1, None], 'inputs')
        length_ph = v1.placeholder(tf.int32, [1], 'input_lengths')

        with v1.variable_scope('model'):
            self.model = Tacotron()
            self.model.init(text_ph, length_ph)
            first_linear = self.model.linear_outputs[0]
            self.wav_output = audio.spectrogram_to_wav_tf(first_linear)

        print('Loading checkpoint: %s' % checkpoint_path)
        self.session = v1.Session()
        # Initialise everything, then overwrite with the checkpoint values.
        self.session.run(v1.global_variables_initializer())
        v1.train.Saver().restore(self.session, checkpoint_path)

    def synthesize(self, text):
        """
        Convert the text into synthesized speech

        @type   text    str

        @param  text    text to be synthesized

        @rtype          object
        @return         content of the in-memory buffer written by
                        audio.save_audio
        """
        sequence = text_to_sequence(text)
        token_ids = np.asarray(sequence, dtype=np.int32)
        token_count = np.asarray([len(sequence)], dtype=np.int32)

        waveform = self.session.run(
            self.wav_output,
            feed_dict={
                self.model.inputs: [token_ids],
                self.model.input_lengths: token_count,
            })
        waveform = audio.inv_preemphasis(waveform)
        # Keep only the samples before the endpoint reported by the helper.
        end = audio.find_endpoint(waveform)
        waveform = waveform[:end]

        buffer = io.BytesIO()
        audio.save_audio(waveform, buffer)
        return buffer.getvalue()
def create_model(hparams):
    """Instantiate the Tacotron variant selected by ``hparams`` and its loss.

    Args:
        hparams: object providing ``tacotron_config`` (path to a JSON file),
            ``tacotron_version`` (``"1"`` or ``"2"``) and the size/decoding
            attributes read below.

    Returns:
        Tuple ``(model, criterion)``.

    Raises:
        ValueError: if ``hparams.tacotron_version`` is neither ``"1"`` nor
            ``"2"``.
    """
    # Model config
    with open(hparams.tacotron_config, 'r') as cfg_file:
        model_cfg = json.load(cfg_file)

    # Reject unknown versions before touching any other hyperparameter.
    if hparams.tacotron_version not in ("1", "2"):
        raise ValueError("Unsupported Tacotron version: {} ".format(
            hparams.tacotron_version))

    if hparams.tacotron_version == "1":
        # Tacotron model
        model = Tacotron(n_vocab=hparams.num_symbols,
                         embed_dim=hparams.symbols_embed_dim,
                         mel_dim=hparams.mel_dim,
                         linear_dim=hparams.mel_dim,
                         max_decoder_steps=hparams.max_decoder_steps,
                         stop_threshold=hparams.stop_threshold,
                         r=hparams.r,
                         model_cfg=model_cfg)
        # Loss criterion
        criterion = TacotronLoss()
        return model, criterion

    # Tacotron2 model
    model = Tacotron2(n_vocab=hparams.num_symbols,
                      embed_dim=hparams.symbols_embed_dim,
                      mel_dim=hparams.mel_dim,
                      max_decoder_steps=hparams.max_decoder_steps,
                      stop_threshold=hparams.stop_threshold,
                      r=hparams.r,
                      model_cfg=model_cfg)
    # Loss criterion
    criterion = Tacotron2Loss()
    return model, criterion
class Synthesizer:
    """Loads a Tacotron checkpoint and writes synthesized sentences to disk."""

    def load(self, checkpoint_dir, restore_step, model_name='tacotron'):
        """Build the inference graph and restore a checkpoint into a session.

        Args:
            checkpoint_dir: checkpoint path prefix; the restored path is
                ``'<checkpoint_dir>-<restore_step>'``.
            restore_step: global step of the checkpoint to restore.
            model_name: only used in the progress message.
        """
        print('Constructing model: %s' % model_name)

        inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
        input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')

        # create a batch with a single input and no spectrograms
        b = Batch((inputs, input_lengths, None, None), prep=False)

        with tf.variable_scope('model'):
            self.model = Tacotron(hparams=hparams)
            self.model.initialize(b)
            self.wav_output = audio.spectrogram_tensorflow_inv(
                self.model.linear_outputs[0])

        print('Loading checkpoint: %s' % checkpoint_dir)
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())

        # Restore from a checkpoint if the user requested it.
        restore_dir = '%s-%d' % (checkpoint_dir, restore_step)
        saver = tf.train.Saver()
        saver.restore(self.session, restore_dir)

    def synthesize(self, text, synth_dir):
        """Synthesize ``text`` and store the audio, the text and an index entry.

        Files are written below ``synth_dir``: ``wavs/synth-NNN.wav``,
        ``text/text-NNN.txt`` and one appended line in ``index.txt`` of the
        form ``<txt_path>|<wav_path>|<text>``. ``NNN`` is the number of lines
        already in ``index.txt`` plus one.

        Args:
            text: sentence to synthesize.
            synth_dir: base output directory; sub-folders are created on
                demand.
        """
        seq = text_to_onehot(text, 'basic_cleaners')
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
        }
        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        wav = audio.pre_emphasis_inv(wav)
        wav = wav[:audio.find_endpoint(wav)]

        # create subfolders if not existing
        os.makedirs(os.path.join(synth_dir, 'wavs'), exist_ok=True)
        os.makedirs(os.path.join(synth_dir, 'text'), exist_ok=True)

        index_path = os.path.join(synth_dir, 'index.txt')
        # 'a+' creates the index on first use. Every handle is now closed
        # deterministically so the index entry and the text are flushed to
        # disk instead of depending on garbage collection.
        with open(index_path, 'a+') as index_file:
            with open(index_path) as existing:
                index = sum(1 for _ in existing) + 1

            wav_path = os.path.join(synth_dir, 'wavs', 'synth-%03d.wav' % index)
            txt_path = os.path.join(synth_dir, 'text', 'text-%03d.txt' % index)
            index_file.write(txt_path + '|' + wav_path + '|' + text + '\n')

        with open(txt_path, 'w') as txt_file:
            txt_file.write(text)

        audio.save_wav(wav, wav_path)
        print('Sentence has been synthesized and is available at: ', wav_path)
def initialize_training(checkpoint_path):
    """Create the data loader, Tacotron model and optimizer, optionally resuming.

    Args:
        checkpoint_path: path given to ``torch.load``. ``None`` skips
            restoring and keeps the freshly constructed model and optimizer.

    Returns:
        Tuple ``(model, optimizer, data_loader)``.

    Note:
        When the checkpoint has no ``"global_step"`` or ``"global_epoch"``
        entry, a warning is printed and the process exits with status 0.
    """
    # Input dataset definitions
    X = FileSourceDataset(TextDataSource())
    Mel = FileSourceDataset(MelSpecDataSource())
    Y = FileSourceDataset(LinearSpecDataSource())

    # Dataset and Dataloader setup
    dataset = PyTorchDataset(X, Mel, Y)
    data_loader = data.DataLoader(dataset,
                                  batch_size=config.batch_size,
                                  num_workers=config.num_workers,
                                  shuffle=True,
                                  collate_fn=collate_fn,
                                  pin_memory=config.pin_memory)

    # Model
    model = Tacotron(n_vocab=len(symbols),
                     embedding_dim=config.embedding_dim,
                     mel_dim=config.num_mels,
                     linear_dim=config.num_freq,
                     r=config.outputs_per_step,
                     padding_idx=config.padding_idx,
                     use_memory_mask=config.use_memory_mask)

    optimizer = optim.Adam(model.parameters(),
                           lr=config.initial_learning_rate,
                           betas=(config.adam_beta1, config.adam_beta2),
                           weight_decay=config.weight_decay)

    # Load checkpoint
    if checkpoint_path is not None:
        print("Load checkpoint from: {}".format(checkpoint_path))
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        try:
            # NOTE(review): these names are function locals here, so the
            # restored counters are discarded on return. Confirm whether a
            # ``global`` declaration was intended.
            global_step = checkpoint["global_step"]
            global_epoch = checkpoint["global_epoch"]
        except KeyError:
            # Only a missing entry is expected here; a bare ``except`` would
            # also have swallowed KeyboardInterrupt and SystemExit.
            print('Warning: global step and global epoch unable to restore!')
            sys.exit(0)

    return model, optimizer, data_loader
def load(self, checkpoint_dir, restore_step, model_name='tacotron'):
    """Construct the single-utterance inference graph and restore weights.

    Args:
        checkpoint_dir: checkpoint path prefix; ``-<restore_step>`` is
            appended to form the path that is restored.
        restore_step: global step identifying the checkpoint.
        model_name: label shown in the progress message only.
    """
    print('Constructing model: %s' % model_name)

    text_ids = tf.placeholder(tf.int32, [1, None], 'inputs')
    text_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')

    # A batch holding one input and no target spectrograms.
    single_batch = Batch((text_ids, text_lengths, None, None), prep=False)

    with tf.variable_scope('model'):
        self.model = Tacotron(hparams=hparams)
        self.model.initialize(single_batch)
        first_linear = self.model.linear_outputs[0]
        self.wav_output = audio.spectrogram_tensorflow_inv(first_linear)

    print('Loading checkpoint: %s' % checkpoint_dir)
    self.session = tf.Session()
    init_op = tf.global_variables_initializer()
    self.session.run(init_op)

    # Restore the variables stored for the requested step.
    restore_dir = '%s-%d' % (checkpoint_dir, restore_step)
    tf.train.Saver().restore(self.session, restore_dir)
def __init__(self, checkpoint_path, device="cuda"):
    """Build the Tacotron and MelSTFT modules and restore the checkpoint.

    Args:
        checkpoint_path: path of the checkpoint to restore; it must exist.
        device: device specifier passed to ``torch.device``.
    """
    self.checkpoint_path = checkpoint_path
    assert exists(checkpoint_path)
    self.device = torch.device(device)

    print('\nInitialising Tacotron Model...\n')

    # Instantiate Tacotron Model
    tacotron_kwargs = dict(
        embed_dims=hp.embed_dims,
        num_chars=len(symbols),
        encoder_dims=hp.encoder_dims,
        decoder_dims=hp.decoder_dims,
        n_mels=hp.n_mels,
        fft_bins=hp.fft_bins,
        postnet_dims=hp.postnet_dims,
        encoder_K=hp.encoder_K,
        lstm_dims=hp.lstm_dims,
        postnet_K=hp.postnet_K,
        num_highways=hp.num_highways,
        dropout=hp.dropout,
        speaker_latent_dims=hp.speaker_latent_dims,
        speaker_encoder_dims=hp.speaker_encoder_dims,
        n_speakers=hp.n_speakers,
        noise_latent_dims=hp.noise_latent_dims,
        noise_encoder_dims=hp.noise_encoder_dims,
    )
    tts_model = Tacotron(**tacotron_kwargs).to(device=self.device)
    self.tacotron = tts_model

    print("\nInitializing STFT Model...\n")
    stft_module = MelSTFT(filter_length=hp.n_fft,
                          hop_length=hp.hop_length,
                          win_length=hp.win_length,
                          n_mel_channels=hp.n_mels,
                          sampling_rate=hp.sampling_rate,
                          mel_fmin=hp.min_f,
                          mel_fmax=hp.max_f)
    self.stft = stft_module.to(device=self.device)

    tts_model.restore(self.checkpoint_path)
    tts_model.eval()

    # print some information
    self.tts_k = tts_model.get_step() // 1000
    r = tts_model.get_r()
    summary_rows = [
        (f'Tacotron(r={r})', str(self.tts_k) + 'k'),
        ("Sample Rate", hp.sampling_rate),
        ("NFFT", hp.n_fft),
        ("NMel", hp.n_mels),
        ("Speakers", hp.n_speakers),
        ("SPKD", hp.speaker_latent_dims),
        ("NOID", hp.noise_latent_dims),
    ]
    simple_table(summary_rows)
def init(self, checkpoint_path):
    """
    Build the inference graph and restore its variables

    @type   checkpoint_path    str

    @param  checkpoint_path    path to checkpoint to be restored
    """
    v1 = tf.compat.v1

    print('Constructing Tacotron Model ...')

    # Graph inputs for a single utterance.
    text_ph = v1.placeholder(tf.int32, [1, None], 'inputs')
    length_ph = v1.placeholder(tf.int32, [1], 'input_lengths')

    with v1.variable_scope('model'):
        self.model = Tacotron()
        self.model.init(text_ph, length_ph)
        first_linear = self.model.linear_outputs[0]
        self.wav_output = audio.spectrogram_to_wav_tf(first_linear)

    print('Loading checkpoint: %s' % checkpoint_path)
    self.session = v1.Session()
    # Initialise everything, then overwrite with the checkpoint values.
    self.session.run(v1.global_variables_initializer())
    v1.train.Saver().restore(self.session, checkpoint_path)
def create_model(is_training=True):
    """Assemble a Tacotron model from its sub-modules and set its mode.

    Args:
        is_training: when truthy the model is put in training mode via
            ``model.train()``, otherwise ``model.eval()`` is called.

    Returns:
        The constructed ``Tacotron`` instance.
    """
    model = Tacotron(encoder=Encoder(),
                     decoder=Decoder(),
                     postnet=PostNet(),
                     post_cbhg=PostCBHG())

    if not is_training:
        model.eval()
    else:
        model.train()

    return model
def __init__(self,
             batch_size: int = 32,
             num_epoch: int = 100,
             train_split: float = 0.9,
             log_interval: int = 1000,
             log_audio_factor: int = 5,
             lr: float = 0.001,
             num_data: int = None,
             log_root: str = './tb_logs',
             save_root: str = './checkpoints',
             num_workers: int = 4,
             version: int = None,
             num_test_samples: int = 5):
    """
    Initialize tacotron trainer

    Args:
        batch_size: batch size
        num_epoch: total number of epochs to train
        train_split: train ratio of train-val split
        log_interval: number of training steps between two rounds of
            scalar/sample logging and checkpoint saving
        log_audio_factor: audio is logged every
            ``log_interval * log_audio_factor`` steps, since it requires
            quite a lot of overhead
        num_data: number of datapoints to load in the dataset
        log_root: root directory for the tensorboard logging
        save_root: root directory for saving model
        num_workers: number of workers for dataloader
        version: version of training; when ``None`` the next free number
            under ``log_root`` is used. An existing log directory of the
            chosen version is deleted.
        num_test_samples: number of test samples to generate for each
            logging
    """
    # Function-scope import: only needed to clear a stale log directory.
    import shutil

    os.makedirs(log_root, exist_ok=True)
    os.makedirs(save_root, exist_ok=True)

    is_cuda = torch.cuda.is_available()
    self.device = torch.device('cuda' if is_cuda else 'cpu')
    self.train_split = train_split
    self.epoch_num = num_epoch

    self.splitted_dataset = self.__split_dataset(
        TorchLJSpeechDataset(num_data=num_data))
    self.dataloaders = self.__get_dataloaders(
        batch_size, num_workers=num_workers)

    self.tacotron = Tacotron()
    self.tacotron.to(self.device)
    self.loss = TacotronLoss()
    self.optimizer = Adam(self.tacotron.parameters(), lr=lr)
    self.lr_scheduler = StepLR(
        optimizer=self.optimizer, step_size=10000, gamma=0.9)

    if version is None:
        # Parse the whole numeric suffix (not only the last character) so
        # versions >= 10 are ordered correctly, and ignore unrelated entries.
        prefix = self.VERSION_FORMAT.format('')
        taken = [int(name[len(prefix):])
                 for name in os.listdir(log_root)
                 if name.startswith(prefix) and name[len(prefix):].isdigit()]
        self.version = max(taken) + 1 if taken else 0
    else:
        # Previously ``self.version`` stayed unset on this path.
        self.version = version

    log_dir = os.path.join(
        log_root, self.VERSION_FORMAT.format(self.version))
    # ``os.remove`` cannot delete a directory, which is what the summary
    # writer creates; remove the whole tree in that case.
    if os.path.isdir(log_dir):
        shutil.rmtree(log_dir)
    elif os.path.exists(log_dir):
        os.remove(log_dir)

    self.logger = SummaryWriter(log_dir)
    self.save_root = save_root
    self.log_interval = log_interval
    self.log_audio_factor = log_audio_factor
    self.global_step = 0
    self.running_count = {self.TRAIN_STAGE: 0, self.VAL_STAGE: 0}
    self.running_loss = {self.TRAIN_STAGE: 0, self.VAL_STAGE: 0}
    self.sample_indices = list(range(num_test_samples))
class TacotronTrainer:
    """Trains a Tacotron model and logs losses and samples to TensorBoard.

    The dataset built from ``TorchLJSpeechDataset`` is split into a training
    and a validation subset. Every epoch runs both stages; during training,
    sample plots/audio are logged and the model state is saved every
    ``log_interval`` steps.
    """

    TRAIN_STAGE = 'train'
    VAL_STAGE = 'val'
    VERSION_FORMAT = 'VERSION_{}'
    MODEL_SAVE_FORMAT = 'version_{version:03}_model_{step:010}.pth'

    def __init__(self,
                 batch_size: int = 32,
                 num_epoch: int = 100,
                 train_split: float = 0.9,
                 log_interval: int = 1000,
                 log_audio_factor: int = 5,
                 lr: float = 0.001,
                 num_data: int = None,
                 log_root: str = './tb_logs',
                 save_root: str = './checkpoints',
                 num_workers: int = 4,
                 version: int = None,
                 num_test_samples: int = 5):
        """
        Initialize tacotron trainer

        Args:
            batch_size: batch size
            num_epoch: total number of epochs to train
            train_split: train ratio of train-val split
            log_interval: number of training steps between two rounds of
                scalar/sample logging and checkpoint saving
            log_audio_factor: audio is logged every
                ``log_interval * log_audio_factor`` steps, since it requires
                quite a lot of overhead
            num_data: number of datapoints to load in the dataset
            log_root: root directory for the tensorboard logging
            save_root: root directory for saving model
            num_workers: number of workers for dataloader
            version: version of training; when ``None`` the next free
                number under ``log_root`` is used. An existing log directory
                of the chosen version is deleted.
            num_test_samples: number of test samples to generate for each
                logging
        """
        # Function-scope import: only needed to clear a stale log directory.
        import shutil

        os.makedirs(log_root, exist_ok=True)
        os.makedirs(save_root, exist_ok=True)

        is_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda' if is_cuda else 'cpu')
        self.train_split = train_split
        self.epoch_num = num_epoch

        self.splitted_dataset = self.__split_dataset(
            TorchLJSpeechDataset(num_data=num_data))
        self.dataloaders = self.__get_dataloaders(
            batch_size, num_workers=num_workers)

        self.tacotron = Tacotron()
        self.tacotron.to(self.device)
        self.loss = TacotronLoss()
        self.optimizer = Adam(self.tacotron.parameters(), lr=lr)
        self.lr_scheduler = StepLR(
            optimizer=self.optimizer, step_size=10000, gamma=0.9)

        if version is None:
            # Parse the whole numeric suffix (not only the last character)
            # so versions >= 10 are ordered correctly, and ignore unrelated
            # entries in the log root.
            prefix = self.VERSION_FORMAT.format('')
            taken = [int(name[len(prefix):])
                     for name in os.listdir(log_root)
                     if name.startswith(prefix)
                     and name[len(prefix):].isdigit()]
            self.version = max(taken) + 1 if taken else 0
        else:
            # Previously ``self.version`` stayed unset on this path.
            self.version = version

        log_dir = os.path.join(
            log_root, self.VERSION_FORMAT.format(self.version))
        # ``os.remove`` cannot delete a directory, which is what the summary
        # writer creates; remove the whole tree in that case.
        if os.path.isdir(log_dir):
            shutil.rmtree(log_dir)
        elif os.path.exists(log_dir):
            os.remove(log_dir)

        self.logger = SummaryWriter(log_dir)
        self.save_root = save_root
        self.log_interval = log_interval
        self.log_audio_factor = log_audio_factor
        self.global_step = 0
        self.running_count = {self.TRAIN_STAGE: 0, self.VAL_STAGE: 0}
        self.running_loss = {self.TRAIN_STAGE: 0, self.VAL_STAGE: 0}
        self.sample_indices = list(range(num_test_samples))

    def fit_from_checkpoint(self, checkpoint_file: str):
        """Load model weights from ``checkpoint_file`` and start training."""
        self.tacotron.load(checkpoint_file, self.device)
        self.fit()

    def fit(self):
        """Run ``epoch_num`` epochs, each covering every stage."""
        for epoch in tqdm.tqdm(range(self.epoch_num),
                               total=self.epoch_num, desc='Epoch'):
            self.__run_epoch(epoch)

    def __run_epoch(self, epoch: int):
        """Run all stages once and log the per-stage mean loss.

        Args:
            epoch: index of the current epoch, used as logging step
        """
        # reset running loss and count after each epoch
        self.__reset_loss()
        self.__reset_count()

        for stage, dataloader in self.dataloaders.items():
            prog_bar = tqdm.tqdm(dataloader,
                                 desc=f'{stage.capitalize()} in progress',
                                 total=len(dataloader))
            # The bar is advanced manually inside __run_step.
            for batch in dataloader:
                self.__run_step(batch, stage, prog_bar)
            # Release the bar so finished bars do not pile up.
            prog_bar.close()

        # epoch vs global step
        self.logger.add_scalar('epoch', epoch, global_step=self.global_step)

        # add loss to logger
        loss_dict = {stage: self.__calculate_mean_loss(stage)
                     for stage in self.running_loss}
        self.logger.add_scalars('loss', loss_dict, global_step=epoch)

    def __run_step(self,
                   batch: TorchLJSpeechBatch,
                   stage: str,
                   prog_bar: tqdm.tqdm):
        """Process one batch for the given stage.

        In the training stage this also performs the optimizer and scheduler
        step and, every ``log_interval`` steps, logs samples and saves the
        model state.

        Args:
            batch: batch to process
            stage: train/val
            prog_bar: progress bar to advance and annotate
        """
        if stage == self.TRAIN_STAGE:
            self.tacotron.train()
            self.optimizer.zero_grad()
        else:
            self.tacotron.eval()

        batch = batch.to(self.device)
        output = self.tacotron.forward_train(batch)
        loss_val = self.loss(batch.mel_spec, output.pred_mel_spec,
                             batch.lin_spec, output.pred_lin_spec)

        # Accumulate the loss weighted by the batch size.
        self.running_loss[stage] += loss_val.item() * batch.mel_spec.size(0)
        self.running_count[stage] += batch.mel_spec.size(0)

        if stage == self.TRAIN_STAGE:
            loss_val.backward()
            self.optimizer.step()
            self.lr_scheduler.step()

            if self.global_step % self.log_interval == 0:
                self.logger.add_scalar(
                    'training_loss', self.__calculate_mean_loss(stage),
                    global_step=self.global_step)

                log_audio = False
                if self.global_step % (
                        self.log_interval * self.log_audio_factor) == 0:
                    log_audio = True

                sample_results = self.__get_sample_results()
                for sample_result in sample_results:
                    self.__log_sample_results(
                        self.global_step, sample_result, log_audio=log_audio)
                # Sampling switched the model to eval mode; switch back.
                self.tacotron.train()

                save_file = os.path.join(
                    self.save_root,
                    self.MODEL_SAVE_FORMAT.format(
                        version=self.version, step=self.global_step)
                )
                torch.save(self.tacotron.state_dict(), save_file)

            self.global_step += 1

        prog_bar.update()
        prog_bar.set_postfix(
            {'Running Loss': f'{self.__calculate_mean_loss(stage):.3f}'})

    def __log_sample_results(self,
                             steps: int,
                             sample_result: SampleResult,
                             log_mel: bool = True,
                             log_spec: bool = True,
                             log_attention: bool = True,
                             log_audio: bool = True) -> None:
        """
        Log the sample results into tensorboard

        Args:
            steps: current step
            sample_result: sample result to log
            log_mel: if True, log mel spectrogram
            log_spec: if True, log spectrogram
            log_attention: if True, log attention
            log_audio: if True, log audio
        """
        if log_mel:
            title = f'Log Mel Spectrogram, Step:{steps}, ' \
                    f'Uid: {sample_result.uid}'
            fig = self.__get_spec_plot(
                pred_spec=sample_result.pred_mel_spec,
                truth_spec=sample_result.truth_mel_spec,
                suptitle=title,
                ylabel='Mel')
            img_tensor = self.__get_plot_tensor(fig)
            tag = f'mel_spec/{sample_result.uid}'
            self.logger.add_image(tag, img_tensor, global_step=steps)

        if log_spec:
            title = f'Log Spectrogram, Step:{steps}, ' \
                    f'Uid: {sample_result.uid}'
            fig = self.__get_spec_plot(
                pred_spec=sample_result.pred_lin_spec,
                truth_spec=sample_result.truth_lin_spec,
                suptitle=title,
                ylabel='DFT bins')
            img_tensor = self.__get_plot_tensor(fig)
            tag = f'lin_spec/{sample_result.uid}'
            self.logger.add_image(tag, img_tensor, global_step=steps)

        if log_attention:
            title = f'Attention Weight, Epoch :{steps}, ' \
                    f'Uid: {sample_result.uid}'
            fig = self.__get_attention_plot(
                title=title,
                attention_weight=sample_result.attention_weight)
            img_tensor = self.__get_plot_tensor(fig)
            tag = f'attention/{sample_result.uid}'
            self.logger.add_image(tag, img_tensor, global_step=steps)

        if log_audio:
            pred_tag = f'audio/{sample_result.uid}_predicted'
            truth_tag = f'audio/{sample_result.uid}_truth'
            self.logger.add_audio(
                tag=pred_tag,
                snd_tensor=torch.from_numpy(
                    sample_result.pred_audio).unsqueeze(1),  # add channel dim
                global_step=steps,
                sample_rate=AudioProcessParam.sr
            )
            self.logger.add_audio(
                tag=truth_tag,
                snd_tensor=torch.from_numpy(
                    sample_result.truth_audio).unsqueeze(1),  # add channel dim
                global_step=steps,
                sample_rate=AudioProcessParam.sr
            )

    def __get_sample_results(self) -> List[SampleResult]:
        """
        Get sample results to show in tensorboard, including
            1. Predicted and ground truth spectrogram pairs
            2. Predicted and ground truth mel spectrogram pairs
            3. Predicted and ground truth audio pairs
            4. Attention weight

        The model is left in eval mode; callers that keep training must
        switch it back.

        Returns:
            list of sample results
        """
        val_dataset = self.splitted_dataset[self.VAL_STAGE]
        self.tacotron.eval()
        test_insts = []

        with torch.no_grad():
            for subset_i in self.sample_indices:
                datapoint: TorchLJSpeechData = val_dataset[subset_i]
                datapoint: TorchLJSpeechBatch = datapoint.add_batch_dim()
                datapoint = datapoint.to(self.device)

                # Map the subset position back to the underlying dataset.
                ds_idx = val_dataset.indices[subset_i]
                uid = val_dataset.dataset.uids[ds_idx]

                # Transcription
                transcription = val_dataset.dataset.uid_to_transcription[uid]

                wav_filepath = os.path.join(
                    val_dataset.dataset.wav_save_dir, f'{uid}.wav')
                truth_audio = AudioProcessingHelper.load_audio(wav_filepath)

                taco_output = self.tacotron.forward_train(datapoint)
                spec = taco_output.pred_lin_spec.squeeze(0).cpu().numpy().T
                pred_audio = AudioProcessingHelper.spec2audio(spec)

                test_insts.append(
                    SampleResult(
                        uid=uid,
                        transcription=transcription,
                        truth_lin_spec=datapoint.lin_spec.squeeze(0).cpu().numpy().T,
                        pred_lin_spec=taco_output.pred_lin_spec.squeeze(0).cpu().numpy().T,
                        truth_mel_spec=datapoint.mel_spec.squeeze(0).cpu().numpy().T,
                        pred_mel_spec=taco_output.pred_mel_spec.squeeze(0).cpu().numpy().T,
                        attention_weight=taco_output.attention_weight.squeeze(0).cpu().numpy(),
                        truth_audio=truth_audio,
                        pred_audio=pred_audio
                    )
                )

        return test_insts

    @staticmethod
    def __get_attention_plot(
            title: str, attention_weight: np.ndarray) -> plt.Figure:
        """
        Get figure handle for attention plot

        Args:
            title: title of the plot
            attention_weight: attention weight to plot

        Returns:
            figure object
        """
        fig = plt.figure(figsize=(6, 5), dpi=80)
        plt.title(title)
        plt.imshow(attention_weight, aspect='auto')
        plt.colorbar()
        plt.xlabel('Encoder seq')
        plt.ylabel('Decoder seq')
        plt.gca().invert_yaxis()  # Let the x, y axis start from the left-bottom corner
        plt.close(fig)
        return fig

    @staticmethod
    def __get_spec_plot(pred_spec: np.ndarray,
                        truth_spec: np.ndarray,
                        suptitle: str,
                        ylabel: str) -> plt.Figure:
        """
        Get a juxtaposition two spectrograms with appropriate title

        Args:
            pred_spec: predicted spectrogram
            truth_spec: ground truth spectrogram
            suptitle: title of the plot
            ylabel: unit of frequency axis of the spectrograms

        Returns:
            figure object
        """
        # Shared colour range so both panels are directly comparable.
        vmin = min(np.min(truth_spec), np.min(pred_spec))
        vmax = max(np.max(truth_spec), np.max(pred_spec))

        fig = plt.figure(figsize=(11, 5), dpi=80)
        plt.suptitle(suptitle)

        ax1 = plt.subplot(121)
        plt.title('Ground Truth')
        plt.xlabel('Frame')
        plt.ylabel(ylabel)
        plt.imshow(truth_spec, vmin=vmin, vmax=vmax, aspect='auto')
        plt.gca().invert_yaxis()  # let the x, y axis start from the left-bottom corner

        ax2 = plt.subplot(122)
        plt.title('Predicted')
        plt.xlabel('Frame')
        im = plt.imshow(pred_spec, vmin=vmin, vmax=vmax, aspect='auto')
        plt.gca().invert_yaxis()  # let the x, y axis start from the left-bottom corner

        fig.tight_layout()
        fig.colorbar(im, ax=[ax1, ax2])
        plt.close(fig)
        return fig

    @staticmethod
    def __get_plot_tensor(fig) -> torch.Tensor:
        """
        Get tensor for the given figure object

        Args:
            fig: the figure object to convert into tensor

        Returns:
            tensor of the figure
        """
        # Render to an in-memory JPEG and read it back as an image tensor.
        buf = io.BytesIO()
        fig.savefig(buf, format='jpeg')
        buf.seek(0)
        image = PIL.Image.open(buf)
        image = ToTensor()(image)
        return image

    def __calculate_mean_loss(self, stage: str) -> float:
        """
        Calculate mean loss for given stage (train/val)

        Args:
            stage: train/val

        Returns:
            mean loss, or NaN when no sample of that stage has been
            processed yet (e.g. an empty validation split)
        """
        count = self.running_count[stage]
        if not count:
            # Avoid a ZeroDivisionError that would abort training.
            return float('nan')
        return self.running_loss[stage] / count

    def __reset_loss(self) -> None:
        """Reset the accumulated loss of every stage to zero."""
        self.running_loss = {self.TRAIN_STAGE: 0, self.VAL_STAGE: 0}

    def __reset_count(self) -> None:
        """Reset the processed-sample counter of every stage to zero."""
        self.running_count = {self.TRAIN_STAGE: 0, self.VAL_STAGE: 0}

    def __split_dataset(self, dataset: TorchLJSpeechDataset) -> Dict[str, Subset]:
        """
        Split the dataset into train/validation set

        Args:
            dataset: dataset to split

        Returns:
            splitted dataset
        """
        num_train_data = int(len(dataset) * self.train_split)
        num_val_data = len(dataset) - num_train_data
        train_dataset, val_dataset = random_split(
            dataset, [num_train_data, num_val_data])
        return {self.TRAIN_STAGE: train_dataset, self.VAL_STAGE: val_dataset}

    def __get_dataloaders(
            self, batch_size: int, num_workers: int) -> Dict[str, DataLoader]:
        """
        Create one dataloader per stage; only the training one is shuffled

        Args:
            batch_size: batch size
            num_workers: number of workers for dataloader

        Returns:
            mapping from stage name to dataloader
        """
        return {stage: DataLoader(
            dataset,
            shuffle=(stage == self.TRAIN_STAGE),
            collate_fn=TorchLJSpeechDataset.batch_tacotron_input,
            pin_memory=True,
            batch_size=batch_size,
            num_workers=num_workers)
            for stage, dataset in self.splitted_dataset.items()
        }
print('\nInitialising Tacotron Model...\n') # Instantiate MelSTFT Extractor mel_calc = MelSTFT(hp.n_fft, hp.hop_length, hp.win_length, hp.n_mels, hp.sampling_rate, hp.min_f, hp.max_f) # Instantiate Tacotron Model model = Tacotron( embed_dims=hp.embed_dims, num_chars=len(symbols), encoder_dims=hp.encoder_dims, decoder_dims=hp.decoder_dims, n_mels=hp.n_mels, fft_bins=hp.fft_bins, postnet_dims=hp.postnet_dims, encoder_K=hp.encoder_K, lstm_dims=hp.lstm_dims, postnet_K=hp.postnet_K, num_highways=hp.num_highways, dropout=hp.dropout, speaker_encoder_dims=hp.speaker_encoder_dims, speaker_latent_dims=hp.speaker_latent_dims, n_speakers=hp.n_speakers, noise_encoder_dims=hp.noise_encoder_dims, noise_latent_dims=hp.noise_latent_dims).to(device=hp.device) model.restore(str(hp.load_weight_file)) optimizer = optim.Adam(model.parameters()) current_step = model.get_step() stft = MelSTFT(filter_length=hp.n_fft,
def train(log_dir, args):
    """Run the Tacotron training loop until the coordinator asks to stop.

    Args:
        log_dir: directory receiving the checkpoints (``model.ckpt-<step>``),
            TensorBoard summaries, sample audio and alignment plots.
        args: namespace providing ``base_dir``, ``restore_step``,
            ``summary_interval`` and ``checkpoint_interval``.

    Any exception raised inside the session (including the deliberate
    "Loss Exploded" one) is logged, its traceback printed, and forwarded to
    the coordinator via ``request_stop``; it is not re-raised.
    """
    checkpoint_path = os.path.join(log_dir, 'model.ckpt')
    input_path = os.path.join(args.base_dir, 'training/train.txt')

    logger.log('Checkpoint path: %s' % checkpoint_path)
    logger.log('Loading training data from: %s' % input_path)

    # set up DataFeeder
    # compat.v1 endpoint, consistent with the rest of this function.
    coordi = tf.compat.v1.train.Coordinator()
    with tf.compat.v1.variable_scope('data_feeder'):
        feeder = DataFeeder(coordi, input_path)

    # set up Model
    global_step = tf.Variable(0, name='global_step', trainable=False)
    with tf.compat.v1.variable_scope('model'):
        model = Tacotron()
        model.init(feeder.inputs,
                   feeder.input_lengths,
                   mel_targets=feeder.mel_targets,
                   linear_targets=feeder.linear_targets)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model)

    # book keeping
    loss_window = ValueWindow(100)
    time_window = ValueWindow(100)
    saver = tf.compat.v1.train.Saver(max_to_keep=5,
                                     keep_checkpoint_every_n_hours=2)

    # start training already!
    with tf.compat.v1.Session() as sess:
        try:
            # ``tf.summary.FileWriter`` only exists in TF1; the compat.v1
            # endpoint is the same class and matches the other calls here.
            summary_writer = tf.compat.v1.summary.FileWriter(
                log_dir, sess.graph)

            # initialize parameters
            sess.run(tf.compat.v1.global_variables_initializer())

            # if requested, restore from step
            if args.restore_step:
                restore_path = '%s-%d' % (checkpoint_path, args.restore_step)
                saver.restore(sess, restore_path)
                logger.log('Resuming from checkpoint: %s' % restore_path)
            else:
                logger.log('Starting a new training!')

            feeder.start_in_session(sess)

            while not coordi.should_stop():
                start_time = time.time()
                # The optimize op is run for its side effect only.
                step, loss, _ = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)

                msg = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % (
                    step, time_window.average, loss, loss_window.average)
                logger.log(msg)

                if loss > 100 or math.isnan(loss):
                    # bad situation
                    logger.log('Loss exploded to %.05f at step %d!' %
                               (loss, step))
                    raise Exception('Loss Exploded')

                if step % args.summary_interval == 0:
                    # it's time to write summary
                    logger.log('Writing summary at step: %d' % step)
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.checkpoint_interval == 0:
                    # it's time to save a checkpoint
                    logger.log('Saving checkpoint to: %s-%d' %
                               (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)

                    logger.log('Saving audio and alignment...')
                    input_seq, spectrogram, alignment = sess.run([
                        model.inputs[0], model.linear_outputs[0],
                        model.alignments[0]
                    ])

                    # convert spectrogram to waveform
                    waveform = audio.spectrogram_to_wav(spectrogram.T)
                    # save it
                    audio.save_audio(
                        waveform,
                        os.path.join(log_dir, 'step-%d-audio.wav' % step))

                    plotter.plot_alignment(
                        alignment,
                        os.path.join(log_dir, 'step-%d-align.png' % step),
                        info='%s, %s, step=%d, loss=%.5f' %
                        ('tacotron', time_string(), step, loss))

                    logger.log('Input: %s' % sequence_to_text(input_seq))

        except Exception as e:
            # Top-level boundary of the training loop: report and stop.
            logger.log('Exiting due to exception %s' % e)
            traceback.print_exc()
            coordi.request_stop(e)
def main():
    """Load a Tacotron checkpoint and synthesize speech.

    In interactive mode sentences are read from the prompt until the user
    interrupts with Ctrl-C. Otherwise every non-blank line of
    ``args.test_file_path`` is synthesized into ``<result_dir><model>/``.
    The process always ends through ``sys.exit(0)``.

    Raises:
        RuntimeError: if ``args.interactive`` is neither true nor false.
    """
    #---initialize---#
    args = get_test_args()

    model = Tacotron(n_vocab=len(symbols),
                     embedding_dim=config.embedding_dim,
                     mel_dim=config.num_mels,
                     linear_dim=config.num_freq,
                     r=config.outputs_per_step,
                     padding_idx=config.padding_idx,
                     use_memory_mask=config.use_memory_mask)

    #---handle path---#
    checkpoint_path = os.path.join(
        args.ckpt_dir, args.checkpoint_name + args.model_name + '.pth')
    os.makedirs(args.result_dir, exist_ok=True)

    #---load and set model---#
    print('Loading model: ', checkpoint_path)
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint["state_dict"])

    if args.long_input:
        # Set large max_decoder steps to handle long sentence outputs
        model.decoder.max_decoder_steps = 500
    else:
        model.decoder.max_decoder_steps = 50

    # Explicit comparisons are deliberate: any value that is neither true
    # nor false falls through to the error branch below.
    if args.interactive == True:
        output_name = args.result_dir + args.model

        #---testing loop---#
        while True:
            try:
                text = str(input('< Tacotron > Text to speech: '))
                text = ch2pinyin(text)
                print('Model input: ', text)
                synthesis_speech(model, text=text, figures=args.plot,
                                 path=output_name)
            except KeyboardInterrupt:
                print()
                print('Terminating!')
                break

    elif args.interactive == False:
        output_name = args.result_dir + args.model + '/'
        os.makedirs(output_name, exist_ok=True)

        #---testing flow---#
        with open(args.test_file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        for idx, raw_line in enumerate(lines):
            # readlines() keeps the line terminator; strip it so it ends up
            # neither in the model input nor in the output file name.
            line = raw_line.strip()
            if not line:
                continue
            text = ch2pinyin(line)
            print("{}: {} - {} ({} words, {} chars)".format(
                idx, line, text, len(line), len(text)))
            synthesis_speech(model, text=text, figures=args.plot,
                             path=output_name + line)

        print("Finished! Check out {} for generated audio samples.".format(
            output_name))

    else:
        raise RuntimeError('Invalid mode!!!')

    sys.exit(0)