Example No. 1
def initialize_training(data_root,
                        meta_text,
                        checkpoint_dir=None,
                        model_name=None):

    dataloader = Dataloader(data_root, meta_text)

    model = Tacotron(n_vocab=len(symbols),
                     embedding_dim=config.embedding_dim,
                     mel_dim=config.num_mels,
                     linear_dim=config.num_freq,
                     r=config.outputs_per_step,
                     padding_idx=config.padding_idx,
                     attention=config.attention,
                     use_mask=config.use_mask)

    optimizer = optim.Adam(model.parameters(),
                           lr=config.initial_learning_rate,
                           betas=(config.adam_beta1, config.adam_beta2),
                           weight_decay=config.weight_decay)

    # Load checkpoint
    if model_name is not None:
        model, optimizer = warm_from_ckpt(checkpoint_dir, model_name, model,
                                          optimizer)

    return model, optimizer, dataloader
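A minimal usage sketch of the function above; the paths and the checkpoint name are placeholders, not part of the original snippet:

model, optimizer, dataloader = initialize_training(
    data_root='data/ljspeech',          # placeholder dataset path
    meta_text='metadata.csv',           # placeholder metadata file
    checkpoint_dir='checkpoints',
    model_name='tacotron_step50000')    # omit to train from scratch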
Example No. 2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_dir',
        help='Absolute path to the base preprocessed data directory')
    parser.add_argument('--output_dir',
                        help='Absolute path to the base output directory')
    parser.add_argument(
        '--dataset_name',
        help='The given dataset has to exist in the given input directory')
    parser.add_argument(
        '--model_name',
        default='main',
        help='Name of the model to be trained; defaults to main. '
             'This is only used for file-keeping.')
    parser.add_argument(
        '--hparams',
        default='',
        help='Hyperparameter overrides as a comma-separated list of '
             'name=value pairs')
    parser.add_argument('--restore_step',
                        type=int,
                        help='Global step to restore from checkpoint.')
    parser.add_argument('--summary_interval',
                        type=int,
                        default=100,
                        help='Steps between running summary ops.')
    parser.add_argument('--checkpoint_interval',
                        type=int,
                        default=1000,
                        help='Steps between writing checkpoints.')
    parser.add_argument('--msg_interval',
                        type=int,
                        default=100,
                        help='Steps between general training log messages.')
    # used for broadcasting training updates to slack.
    parser.add_argument('--slack_url',
                        help='Slack webhook URL to get periodic reports.')
    parser.add_argument('--tf_log_level',
                        type=int,
                        default=1,
                        help='TensorFlow C++ log level.')
    args = parser.parse_args()

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level)
    args.in_dir = os.path.join(args.input_dir, args.dataset_name)
    args.out_dir = os.path.join(args.output_dir, args.model_name)
    args.log_dir = os.path.join(args.out_dir, 'logs')
    args.meta_dir = os.path.join(args.out_dir, 'meta')
    args.sample_dir = os.path.join(args.out_dir, 'samples')
    # create the output directories if needed
    os.makedirs(args.log_dir, exist_ok=True)
    os.makedirs(args.meta_dir, exist_ok=True)
    os.makedirs(args.sample_dir, exist_ok=True)
    hparams.parse(args.hparams)
    model = Tacotron(hparams)
    model.train(args)
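--hparams expects a comma-separated list of name=value pairs. As a rough sketch of what a parser such as hparams.parse might do internally (an illustration, not the project's actual implementation):

def parse_overrides(spec):
    """Parse 'a=1,b=2' into {'a': '1', 'b': '2'}; values stay strings here."""
    overrides = {}
    for pair in filter(None, spec.split(',')):
        name, _, value = pair.partition('=')
        overrides[name.strip()] = value.strip()
    return overrides

assert parse_overrides('batch_size=32,lr=0.001') == {'batch_size': '32', 'lr': '0.001'}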
Example No. 3
class Synthesizer:
    """ Synthesizer """
    def init(self, checkpoint_path):
        """ Initialize the Synthesizer.

        @type   checkpoint_path   str
        @param  checkpoint_path   path to the checkpoint to be restored
        """
        print('Constructing Tacotron Model ...')

        inputs = tf.compat.v1.placeholder(tf.int32, [1, None], 'inputs')
        input_lengths = tf.compat.v1.placeholder(tf.int32, [1],
                                                 'input_lengths')

        with tf.compat.v1.variable_scope('model'):
            self.model = Tacotron()
            self.model.init(inputs, input_lengths)
            self.wav_output = audio.spectrogram_to_wav_tf(
                self.model.linear_outputs[0])

        print('Loading checkpoint: %s' % checkpoint_path)
        self.session = tf.compat.v1.Session()
        self.session.run(tf.compat.v1.global_variables_initializer())
        saver = tf.compat.v1.train.Saver()
        saver.restore(self.session, checkpoint_path)

    def synthesize(self, text):
        """ Convert text into synthesized speech.

        @type   text    str
        @param  text    text to be synthesized

        @rtype          bytes
        @return         the synthesized speech as WAV-encoded bytes
        """

        seq = text_to_sequence(text)

        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
        }

        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav)
        wav = wav[:audio.find_endpoint(wav)]
        out = io.BytesIO()
        audio.save_audio(wav, out)

        return out.getvalue()
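Since synthesize returns the encoded audio as bytes, saving or serving the result is one call away. A usage sketch (the checkpoint path is a placeholder):

synth = Synthesizer()
synth.init('logs-tacotron/model.ckpt-100000')  # placeholder checkpoint path
wav_bytes = synth.synthesize('Hello world.')
with open('hello.wav', 'wb') as f:
    f.write(wav_bytes)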
Example No. 4
def create_model(hparams):
    # Model config
    with open(hparams.tacotron_config, 'r') as f:
        model_cfg = json.load(f)
    if hparams.tacotron_version == "1":
        # Tacotron model
        model = Tacotron(n_vocab=hparams.num_symbols,
                         embed_dim=hparams.symbols_embed_dim,
                         mel_dim=hparams.mel_dim,
                         linear_dim=hparams.mel_dim,
                         max_decoder_steps=hparams.max_decoder_steps,
                         stop_threshold=hparams.stop_threshold,
                         r=hparams.r,
                         model_cfg=model_cfg)
        # Loss criterion
        criterion = TacotronLoss()
    elif hparams.tacotron_version == "2":
        # Tacotron2 model
        model = Tacotron2(n_vocab=hparams.num_symbols,
                          embed_dim=hparams.symbols_embed_dim,
                          mel_dim=hparams.mel_dim,
                          max_decoder_steps=hparams.max_decoder_steps,
                          stop_threshold=hparams.stop_threshold,
                          r=hparams.r,
                          model_cfg=model_cfg)
        # Loss criterion
        criterion = Tacotron2Loss()
    else:
        raise ValueError("Unsupported Tacotron version: {}".format(
            hparams.tacotron_version))

    return model, criterion
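A sketch of calling create_model; it assumes the hyperparameter container is a simple namespace exposing exactly the fields the function reads, and every value shown is a placeholder:

from types import SimpleNamespace

hparams = SimpleNamespace(
    tacotron_version="1",               # or "2" for Tacotron2
    tacotron_config="tacotron.json",    # placeholder config path
    num_symbols=148, symbols_embed_dim=256, mel_dim=80,
    max_decoder_steps=1000, stop_threshold=0.5, r=2)
model, criterion = create_model(hparams)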
Example No. 5
class Synthesizer:
    def load(self, checkpoint_dir, restore_step, model_name='tacotron'):
        print('Constructing model: %s' % model_name)
        inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
        input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
        # create a batch with a single input and no spectrograms
        b = Batch((inputs, input_lengths, None, None), prep=False)
        with tf.variable_scope('model'):
            self.model = Tacotron(hparams=hparams)
            self.model.initialize(b)
            self.wav_output = audio.spectrogram_tensorflow_inv(
                self.model.linear_outputs[0])

        print('Loading checkpoint: %s' % checkpoint_dir)
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        # Restore from a checkpoint if the user requested it.
        restore_dir = '%s-%d' % (checkpoint_dir, restore_step)
        saver = tf.train.Saver()
        saver.restore(self.session, restore_dir)

    def synthesize(self, text, synth_dir):
        seq = text_to_onehot(text, 'basic_cleaners')
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
        }
        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        wav = audio.pre_emphasis_inv(wav)
        wav = wav[:audio.find_endpoint(wav)]

        # create subfolders if not existing
        os.makedirs(os.path.join(synth_dir, 'wavs'), exist_ok=True)
        os.makedirs(os.path.join(synth_dir, 'text'), exist_ok=True)
        index_path = os.path.join(synth_dir, 'index.txt')
        # count existing entries so the new files get the next index number
        with open(index_path, 'a+') as index_file:
            index_file.seek(0)
            index = sum(1 for _ in index_file) + 1
            wav_path = os.path.join(synth_dir, 'wavs', 'synth-%03d.wav' % index)
            txt_path = os.path.join(synth_dir, 'text', 'text-%03d.txt' % index)
            index_file.write(txt_path + '|' + wav_path + '|' + text + '\n')
        with open(txt_path, 'w') as txt_file:
            txt_file.write(text)
        audio.save_wav(wav, wav_path)
        print('Sentence has been synthesized and is available at: ', wav_path)
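Each call appends a text_path|wav_path|text line to index.txt, so the synthesis log can be read back with a plain split (a sketch; synth_dir is a placeholder):

import os

synth_dir = 'output'  # placeholder
with open(os.path.join(synth_dir, 'index.txt')) as f:
    for line in f:
        txt_path, wav_path, text = line.rstrip('\n').split('|', 2)
        print(wav_path, '->', text)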
Example No. 6
def initialize_training(checkpoint_path):

    # Input dataset definitions
    X = FileSourceDataset(TextDataSource())
    Mel = FileSourceDataset(MelSpecDataSource())
    Y = FileSourceDataset(LinearSpecDataSource())

    # Dataset and Dataloader setup
    dataset = PyTorchDataset(X, Mel, Y)
    data_loader = data.DataLoader(dataset,
                                  batch_size=config.batch_size,
                                  num_workers=config.num_workers,
                                  shuffle=True,
                                  collate_fn=collate_fn,
                                  pin_memory=config.pin_memory)

    # Model
    model = Tacotron(n_vocab=len(symbols),
                     embedding_dim=config.embedding_dim,
                     mel_dim=config.num_mels,
                     linear_dim=config.num_freq,
                     r=config.outputs_per_step,
                     padding_idx=config.padding_idx,
                     use_memory_mask=config.use_memory_mask)

    optimizer = optim.Adam(model.parameters(),
                           lr=config.initial_learning_rate,
                           betas=(config.adam_beta1, config.adam_beta2),
                           weight_decay=config.weight_decay)

    # Load checkpoint
    if checkpoint_path is not None:
        print("Loading checkpoint from: {}".format(checkpoint_path))
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        try:
            global_step = checkpoint["global_step"]
            global_epoch = checkpoint["global_epoch"]
        except KeyError:
            print('Warning: unable to restore global step and global epoch!')
            sys.exit(1)

    return model, optimizer, data_loader
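The loader above expects checkpoints saved as a dict with state_dict, optimizer, global_step and global_epoch keys; the matching save call would look roughly like this (inferred from the loading code, not taken from the project):

import torch

def save_checkpoint(model, optimizer, global_step, global_epoch, path):
    torch.save({
        "state_dict": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "global_step": global_step,
        "global_epoch": global_epoch,
    }, path)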
Example No. 7
    def load(self, checkpoint_dir, restore_step, model_name='tacotron'):
        print('Constructing model: %s' % model_name)
        inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
        input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
        # create a batch with a single input and no spectrograms
        b = Batch((inputs, input_lengths, None, None), prep=False)
        with tf.variable_scope('model'):
            self.model = Tacotron(hparams=hparams)
            self.model.initialize(b)
            self.wav_output = audio.spectrogram_tensorflow_inv(
                self.model.linear_outputs[0])

        print('Loading checkpoint: %s' % checkpoint_dir)
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        # Restore from a checkpoint if the user requested it.
        restore_dir = '%s-%d' % (checkpoint_dir, restore_step)
        saver = tf.train.Saver()
        saver.restore(self.session, restore_dir)
Example No. 8
    def __init__(self, checkpoint_path, device="cuda"):
        self.checkpoint_path = checkpoint_path
        assert exists(checkpoint_path)
        self.device = torch.device(device)

        print('\nInitialising Tacotron Model...\n')

        # Instantiate Tacotron Model
        self.tacotron = tts_model = Tacotron(
            embed_dims=hp.embed_dims,
            num_chars=len(symbols),
            encoder_dims=hp.encoder_dims,
            decoder_dims=hp.decoder_dims,
            n_mels=hp.n_mels,
            fft_bins=hp.fft_bins,
            postnet_dims=hp.postnet_dims,
            encoder_K=hp.encoder_K,
            lstm_dims=hp.lstm_dims,
            postnet_K=hp.postnet_K,
            num_highways=hp.num_highways,
            dropout=hp.dropout,
            speaker_latent_dims=hp.speaker_latent_dims,
            speaker_encoder_dims=hp.speaker_encoder_dims,
            n_speakers=hp.n_speakers,
            noise_latent_dims=hp.noise_latent_dims,
            noise_encoder_dims=hp.noise_encoder_dims).to(device=self.device)

        print("\nInitializing STFT Model...\n")

        self.stft = MelSTFT(filter_length=hp.n_fft,
                            hop_length=hp.hop_length,
                            win_length=hp.win_length,
                            n_mel_channels=hp.n_mels,
                            sampling_rate=hp.sampling_rate,
                            mel_fmin=hp.min_f,
                            mel_fmax=hp.max_f).to(device=self.device)

        tts_model.restore(self.checkpoint_path)
        tts_model.eval()
        # print some information
        self.tts_k = tts_model.get_step() // 1000

        r = tts_model.get_r()

        simple_table([
            (f'Tacotron(r={r})', str(self.tts_k) + 'k'),
            ("Sample Rate", hp.sampling_rate),
            ("NFFT", hp.n_fft),
            ("NMel", hp.n_mels),
            ("Speakers", hp.n_speakers),
            ("SPKD", hp.speaker_latent_dims),
            ("NOID", hp.noise_latent_dims),
        ])
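A usage sketch for the constructor above, assuming the enclosing class is called Synthesizer (the class name and the checkpoint path are assumptions, not part of the snippet):

import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
synth = Synthesizer('checkpoints/tacotron_latest.pth', device=device)  # placeholder path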
Example No. 10
def create_model(is_training=True):
    encoder = Encoder()
    decoder = Decoder()
    postnet = PostNet()
    post_cbhg = PostCBHG()

    model = Tacotron(encoder=encoder,
                     decoder=decoder,
                     postnet=postnet,
                     post_cbhg=post_cbhg)
    if is_training:
        model.train()
    else:
        model.eval()
    return model
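Note that model.train() / model.eval() only switch layer behaviour such as dropout; they do not disable gradient tracking. For pure inference the eval branch is typically paired with torch.no_grad(), as in this sketch (example_batch is a placeholder input):

import torch

model = create_model(is_training=False)
with torch.no_grad():                  # eval() disables dropout; no_grad() skips autograd
    outputs = model(example_batch)     # `example_batch` is a placeholder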
Example No. 12
class TacotronTrainer:
    TRAIN_STAGE = 'train'
    VAL_STAGE = 'val'
    VERSION_FORMAT = 'VERSION_{}'
    MODEL_SAVE_FORMAT = 'version_{version:03}_model_{step:010}.pth'

    def __init__(self,
                 batch_size: int = 32,
                 num_epoch: int = 100,
                 train_split: float = 0.9,
                 log_interval: int = 1000,
                 log_audio_factor: int = 5,
                 lr: float = 0.001,
                 num_data: int = None,
                 log_root: str = './tb_logs',
                 save_root: str = './checkpoints',
                 num_workers: int = 4,
                 version: int = None,
                 num_test_samples: int = 5):
        """
        Initialize tacotron trainer
        Args:
            batch_size: batch size
            num_epoch: total number of epochs to train
            train_split: train ratio of train-val split
            log_interval: interval for test sample logging to tensorboard in
                epoch unit
            log_audio_factor: number of log_interval for logging audio
                which requires quite a lot of overhead
            num_data: number of datapoints to load in the dataset
            log_root: root directory for the tensorboard logging
            save_root: root directory for saving model
            num_workers: number of workers for dataloader
            version: version of training
            num_test_samples: number of test samples to generate
                for each logging
        """
        if not os.path.exists(log_root):
            os.makedirs(log_root)

        if not os.path.exists(save_root):
            os.makedirs(save_root)

        is_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda' if is_cuda else 'cpu')

        self.train_split = train_split

        self.epoch_num = num_epoch

        self.splitted_dataset = self.__split_dataset(
            TorchLJSpeechDataset(num_data=num_data))

        self.dataloaders = self.__get_dataloaders(
            batch_size, num_workers=num_workers)

        self.tacotron = Tacotron()
        self.tacotron.to(self.device)

        self.loss = TacotronLoss()
        self.optimizer = Adam(self.tacotron.parameters(), lr=lr)
        self.lr_scheduler = StepLR(
            optimizer=self.optimizer,
            step_size=10000,
            gamma=0.9)

        if version is None:
            versions = os.listdir(log_root)
            if not versions:
                self.version = 0
            else:
                # parse the numeric suffix of VERSION_{n} directory names
                self.version = max(
                    int(ver.split('_')[-1]) for ver in versions) + 1
        else:
            self.version = version

        log_dir = os.path.join(
            log_root, self.VERSION_FORMAT.format(self.version))
        if os.path.exists(log_dir):
            shutil.rmtree(log_dir)  # clear any stale log directory (requires `import shutil`)

        self.logger = SummaryWriter(log_dir)

        self.save_root = save_root

        self.log_interval = log_interval
        self.log_audio_factor = log_audio_factor

        self.global_step = 0
        self.running_count = {self.TRAIN_STAGE: 0,
                              self.VAL_STAGE: 0}
        self.running_loss = {self.TRAIN_STAGE: 0,
                             self.VAL_STAGE: 0}

        self.sample_indices = list(range(num_test_samples))

    def fit_from_checkpoint(self, checkpoint_file: str):
        self.tacotron.load(checkpoint_file, self.device)
        self.fit()

    def fit(self):
        for epoch in tqdm.tqdm(range(self.epoch_num),
                               total=self.epoch_num,
                               desc='Epoch'):
            self.__run_epoch(epoch)

    def __run_epoch(self, epoch: int):
        # reset running loss and count after each epoch
        self.__reset_loss()
        self.__reset_count()

        for stage, dataloader in self.dataloaders.items():
            prog_bar = tqdm.tqdm(dataloader,
                                 desc=f'{stage.capitalize()} in progress',
                                 total=len(dataloader))
            for batch in prog_bar:
                self.__run_step(batch, stage, prog_bar)

        # epoch vs global step
        self.logger.add_scalar('epoch', epoch, global_step=self.global_step)

        # add loss to logger
        loss_dict = {stage: self.__calculate_mean_loss(stage)
                     for stage in self.running_loss}
        self.logger.add_scalars('loss', loss_dict, global_step=epoch)


    def __run_step(self, batch: TorchLJSpeechBatch, stage: str,
                   prog_bar: tqdm.tqdm):
        if stage == self.TRAIN_STAGE:
            self.tacotron.train()
            self.optimizer.zero_grad()
        else:
            self.tacotron.eval()

        batch = batch.to(self.device)

        output = self.tacotron.forward_train(batch)
        loss_val = self.loss(batch.mel_spec, output.pred_mel_spec,
                             batch.lin_spec, output.pred_lin_spec)

        self.running_loss[stage] += loss_val.item() * batch.mel_spec.size(0)
        self.running_count[stage] += batch.mel_spec.size(0)

        if stage == self.TRAIN_STAGE:
            loss_val.backward()
            self.optimizer.step()
            self.lr_scheduler.step()

            if self.global_step % self.log_interval == 0:
                self.logger.add_scalar('training_loss',
                                       self.__calculate_mean_loss(stage),
                                       global_step=self.global_step)
                log_audio = (self.global_step %
                             (self.log_interval * self.log_audio_factor) == 0)
                sample_results = self.__get_sample_results()
                for sample_result in sample_results:
                    self.__log_sample_results(
                        self.global_step, sample_result, log_audio=log_audio)

                self.tacotron.train()
                save_file = os.path.join(
                    self.save_root,
                    self.MODEL_SAVE_FORMAT.format(
                        version=self.version, step=self.global_step)
                )
                torch.save(self.tacotron.state_dict(), save_file)

            self.global_step += 1

        prog_bar.set_postfix(
            {'Running Loss': f'{self.__calculate_mean_loss(stage):.3f}'})

    def __log_sample_results(self, steps: int,
                             sample_result: SampleResult,
                             log_mel: bool = True,
                             log_spec: bool = True,
                             log_attention: bool = True,
                             log_audio: bool = True) -> None:
        """
        Log the sample results into tensorboard
        Args:
            steps: current step
            sample_result: sample result to log
            log_mel: if True, log mel spectrogram
            log_spec: if True, log spectrogram
            log_attention: if True, log attention
            log_audio: if True, log audio

        """
        if log_mel:
            title = f'Log Mel Spectrogram, Step:{steps}, ' \
                    f'Uid: {sample_result.uid}'

            fig = self.__get_spec_plot(
                pred_spec=sample_result.pred_mel_spec,
                truth_spec=sample_result.truth_mel_spec,
                suptitle=title,
                ylabel='Mel')
            img_tensor = self.__get_plot_tensor(fig)
            tag = f'mel_spec/{sample_result.uid}'
            self.logger.add_image(tag, img_tensor, global_step=steps)

        if log_spec:
            title = f'Log Spectrogram, Step:{steps}, ' \
                    f'Uid: {sample_result.uid}'
            fig = self.__get_spec_plot(
                pred_spec=sample_result.pred_lin_spec,
                truth_spec=sample_result.truth_lin_spec,
                suptitle=title,
                ylabel='DFT bins')
            img_tensor = self.__get_plot_tensor(fig)
            tag = f'lin_spec/{sample_result.uid}'
            self.logger.add_image(tag, img_tensor, global_step=steps)

        if log_attention:
            title = f'Attention Weight, Epoch :{steps}, ' \
                    f'Uid: {sample_result.uid}'
            fig = self.__get_attention_plot(
                title=title,
                attention_weight=sample_result.attention_weight)
            img_tensor = self.__get_plot_tensor(fig)
            tag = f'attention/{sample_result.uid}'
            self.logger.add_image(tag, img_tensor, global_step=steps)

        if log_audio:
            pred_tag = f'audio/{sample_result.uid}_predicted'
            truth_tag = f'audio/{sample_result.uid}_truth'

            self.logger.add_audio(
                tag=pred_tag,
                snd_tensor=torch.from_numpy(
                    sample_result.pred_audio).unsqueeze(1),  # add channel dim
                global_step=steps,
                sample_rate=AudioProcessParam.sr
            )

            self.logger.add_audio(
                tag=truth_tag,
                snd_tensor=torch.from_numpy(
                    sample_result.truth_audio).unsqueeze(1),  # add channel dim
                global_step=steps,
                sample_rate=AudioProcessParam.sr
            )

    def __get_sample_results(self) -> List[SampleResult]:
        """
        Get sample results to show in tensorboard, including
            1. Predicted and ground truth spectrogram pairs
            2. Predicted and ground truth mel spectrogram pairs
            3. Predicted and ground truth audio pairs
            4. Attention weight
        Returns:
            list of sample results

        """
        val_dataset = self.splitted_dataset[self.VAL_STAGE]
        self.tacotron.eval()

        test_insts = []
        with torch.no_grad():
            for subset_i in self.sample_indices:
                datapoint: TorchLJSpeechData = val_dataset[subset_i]
                datapoint: TorchLJSpeechBatch = datapoint.add_batch_dim()
                datapoint = datapoint.to(self.device)

                ds_idx = val_dataset.indices[subset_i]
                uid = val_dataset.dataset.uids[ds_idx]

                # Transcription
                transcription = val_dataset.dataset.uid_to_transcription[uid]

                wav_filepath = os.path.join(
                    val_dataset.dataset.wav_save_dir, f'{uid}.wav')
                truth_audio = AudioProcessingHelper.load_audio(wav_filepath)

                taco_output = self.tacotron.forward_train(datapoint)

                spec = taco_output.pred_lin_spec.squeeze(0).cpu().numpy().T
                pred_audio = AudioProcessingHelper.spec2audio(spec)

                test_insts.append(
                    SampleResult(
                        uid=uid,
                        transcription=transcription,
                        truth_lin_spec=datapoint.lin_spec.squeeze(0).cpu().numpy().T,
                        pred_lin_spec=taco_output.pred_lin_spec.squeeze(0).cpu().numpy().T,
                        truth_mel_spec=datapoint.mel_spec.squeeze(0).cpu().numpy().T,
                        pred_mel_spec=taco_output.pred_mel_spec.squeeze(0).cpu().numpy().T,
                        attention_weight=taco_output.attention_weight.squeeze(0).cpu().numpy(),
                        truth_audio=truth_audio,
                        pred_audio=pred_audio
                    )
                )

        return test_insts

    @staticmethod
    def __get_attention_plot(
            title: str, attention_weight: np.ndarray) -> plt.Figure:
        """
        Get figure handle for attention plot

        Args:
            title: title of the plot
            attention_weight: attention weight to plot

        Returns:
            figure object

        """
        fig = plt.figure(figsize=(6, 5), dpi=80)
        plt.title(title)
        plt.imshow(attention_weight, aspect='auto')
        plt.colorbar()
        plt.xlabel('Encoder seq')
        plt.ylabel('Decoder seq')
        plt.gca().invert_yaxis()  # Let the x, y axis start from the left-bottom corner
        plt.close(fig)
        return fig

    @staticmethod
    def __get_spec_plot(pred_spec: np.ndarray, truth_spec: np.ndarray,
                        suptitle: str, ylabel: str) -> plt.Figure:
        """
        Get a juxtaposition two spectrograms with appropriate title
        Args:
            pred_spec: predicted spectrogram
            truth_spec: ground truth spectrogram
            suptitle: title of the plot
            ylabel: unit of frequency axis of the spectrograms

        Returns:
            figure object

        """
        vmin = min(np.min(truth_spec), np.min(pred_spec))
        vmax = max(np.max(truth_spec), np.max(pred_spec))

        fig = plt.figure(figsize=(11, 5), dpi=80)
        plt.suptitle(suptitle)

        ax1 = plt.subplot(121)
        plt.title('Ground Truth')
        plt.xlabel('Frame')
        plt.ylabel(ylabel)
        plt.imshow(truth_spec, vmin=vmin, vmax=vmax, aspect='auto')
        plt.gca().invert_yaxis()  # let the x, y axis start from the left-bottom corner

        ax2 = plt.subplot(122)
        plt.title('Predicted')
        plt.xlabel('Frame')
        im = plt.imshow(pred_spec, vmin=vmin, vmax=vmax, aspect='auto')
        plt.gca().invert_yaxis()  # let the x, y axis start from the left-bottom corner

        fig.tight_layout()
        fig.colorbar(im, ax=[ax1, ax2])
        plt.close(fig)

        return fig

    @staticmethod
    def __get_plot_tensor(fig) -> torch.Tensor:
        """
        Get tensor for the given figure object
        Args:
            fig: the figure object to convert into tensor

        Returns:
            tensor of the figure

        """
        buf = io.BytesIO()
        fig.savefig(buf, format='jpeg')
        buf.seek(0)
        image = PIL.Image.open(buf)
        image = ToTensor()(image)
        return image

    def __calculate_mean_loss(self, stage: str) -> float:
        """
        Calculate mean loss for given stage (train/val)
        Args:
            stage: train/val

        Returns:
            mean loss

        """
        return self.running_loss[stage] / self.running_count[stage]

    def __reset_loss(self) -> None:
        self.running_loss = {self.TRAIN_STAGE: 0,
                             self.VAL_STAGE: 0}

    def __reset_count(self) -> None:
        self.running_count = {self.TRAIN_STAGE: 0,
                              self.VAL_STAGE: 0}

    def __split_dataset(self, dataset: TorchLJSpeechDataset) -> Dict[str, Subset]:
        """
        Split the dataset into train/validation set
        Args:
            dataset: dataset to split

        Returns:
            splitted dataset

        """
        num_train_data = int(len(dataset) * self.train_split)
        num_val_data = len(dataset) - num_train_data
        train_dataset, val_dataset = random_split(
            dataset, [num_train_data, num_val_data])

        return {self.TRAIN_STAGE: train_dataset,
                self.VAL_STAGE: val_dataset}

    def __get_dataloaders(
            self, batch_size: int, num_workers: int) -> Dict[str, DataLoader]:
        return {stage: DataLoader(
            dataset, shuffle=(stage == self.TRAIN_STAGE),
            collate_fn=TorchLJSpeechDataset.batch_tacotron_input,
            pin_memory=True, batch_size=batch_size,
            num_workers=num_workers)
            for stage, dataset in self.splitted_dataset.items()
        }
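Given MODEL_SAVE_FORMAT, checkpoints land under save_root with zero-padded version and step numbers, so resuming looks like this sketch (the file name is a placeholder):

trainer = TacotronTrainer(batch_size=16, num_epoch=50)
trainer.fit_from_checkpoint(
    './checkpoints/version_000_model_0000010000.pth')  # placeholder checkpoint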
Example No. 13
    print('\nInitialising Tacotron Model...\n')

    # Instantiate MelSTFT Extractor
    mel_calc = MelSTFT(hp.n_fft, hp.hop_length, hp.win_length, hp.n_mels,
                       hp.sampling_rate, hp.min_f, hp.max_f)

    # Instantiate Tacotron Model
    model = Tacotron(
        embed_dims=hp.embed_dims,
        num_chars=len(symbols),
        encoder_dims=hp.encoder_dims,
        decoder_dims=hp.decoder_dims,
        n_mels=hp.n_mels,
        fft_bins=hp.fft_bins,
        postnet_dims=hp.postnet_dims,
        encoder_K=hp.encoder_K,
        lstm_dims=hp.lstm_dims,
        postnet_K=hp.postnet_K,
        num_highways=hp.num_highways,
        dropout=hp.dropout,
        speaker_encoder_dims=hp.speaker_encoder_dims,
        speaker_latent_dims=hp.speaker_latent_dims,
        n_speakers=hp.n_speakers,
        noise_encoder_dims=hp.noise_encoder_dims,
        noise_latent_dims=hp.noise_latent_dims).to(device=hp.device)

    model.restore(str(hp.load_weight_file))
    optimizer = optim.Adam(model.parameters())

    current_step = model.get_step()

    stft = MelSTFT(filter_length=hp.n_fft,
                   hop_length=hp.hop_length,
                   win_length=hp.win_length,
                   n_mel_channels=hp.n_mels,
                   sampling_rate=hp.sampling_rate,
                   mel_fmin=hp.min_f,
                   mel_fmax=hp.max_f).to(device=hp.device)
Example No. 14
def train(log_dir, args):
    checkpoint_path = os.path.join(log_dir, 'model.ckpt')
    input_path = os.path.join(args.base_dir, 'training/train.txt')

    logger.log('Checkpoint path: %s' % checkpoint_path)
    logger.log('Loading training data from: %s' % input_path)

    # set up DataFeeder
    coordi = tf.train.Coordinator()
    with tf.compat.v1.variable_scope('data_feeder'):
        feeder = DataFeeder(coordi, input_path)

    # set up Model
    global_step = tf.Variable(0, name='global_step', trainable=False)
    with tf.compat.v1.variable_scope('model'):
        model = Tacotron()
        model.init(feeder.inputs,
                   feeder.input_lengths,
                   mel_targets=feeder.mel_targets,
                   linear_targets=feeder.linear_targets)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model)

    # book keeping
    step = 0
    loss_window = ValueWindow(100)
    time_window = ValueWindow(100)
    saver = tf.compat.v1.train.Saver(max_to_keep=5,
                                     keep_checkpoint_every_n_hours=2)

    # start training already!
    with tf.compat.v1.Session() as sess:
        try:
            summary_writer = tf.compat.v1.summary.FileWriter(log_dir, sess.graph)

            # initialize parameters
            sess.run(tf.compat.v1.global_variables_initializer())

            # if requested, restore from step
            if args.restore_step:
                restore_path = '%s-%d' % (checkpoint_path, args.restore_step)
                saver.restore(sess, restore_path)
                logger.log('Resuming from checkpoint: %s' % restore_path)
            else:
                logger.log('Starting a new training!')

            feeder.start_in_session(sess)

            while not coordi.should_stop():
                start_time = time.time()

                step, loss, opt = sess.run(
                    [global_step, model.loss, model.optimize])

                time_window.append(time.time() - start_time)
                loss_window.append(loss)

                msg = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % (
                    step, time_window.average, loss, loss_window.average)

                logger.log(msg)

                if loss > 100 or math.isnan(loss):
                    # the loss has diverged; abort training
                    logger.log('Loss exploded to %.05f at step %d!' %
                               (loss, step))
                    raise RuntimeError('Loss exploded')

                if step % args.summary_interval == 0:
                    # it's time to write summary
                    logger.log('Writing summary at step: %d' % step)
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.checkpoint_interval == 0:
                    # it's time to save a checkpoint
                    logger.log('Saving checkpoint to: %s-%d' %
                               (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    logger.log('Saving audio and alignment...')

                    input_seq, spectrogram, alignment = sess.run([
                        model.inputs[0], model.linear_outputs[0],
                        model.alignments[0]
                    ])

                    # convert spectrogram to waveform
                    waveform = audio.spectrogram_to_wav(spectrogram.T)
                    # save it
                    audio.save_audio(
                        waveform,
                        os.path.join(log_dir, 'step-%d-audio.wav' % step))

                    plotter.plot_alignment(
                        alignment,
                        os.path.join(log_dir, 'step-%d-align.png' % step),
                        info='%s, %s, step=%d, loss=%.5f' %
                        ('tacotron', time_string(), step, loss))

                    logger.log('Input: %s' % sequence_to_text(input_seq))

        except Exception as e:
            logger.log('Exiting due to exception %s' % e)
            traceback.print_exc()
            coordi.request_stop(e)
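ValueWindow(100) above keeps a running average over the most recent 100 values; a minimal implementation along these lines would suffice (an assumption, not the project's actual class):

from collections import deque

class ValueWindow:
    def __init__(self, size=100):
        self._values = deque(maxlen=size)  # drops the oldest value automatically

    def append(self, x):
        self._values.append(x)

    @property
    def average(self):
        return sum(self._values) / max(len(self._values), 1)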
Example No. 15
def main():

	#---initialize---#
	args = get_test_args()

	model = Tacotron(n_vocab=len(symbols),
					 embedding_dim=config.embedding_dim,
					 mel_dim=config.num_mels,
					 linear_dim=config.num_freq,
					 r=config.outputs_per_step,
					 padding_idx=config.padding_idx,
					 use_memory_mask=config.use_memory_mask)

	#---handle path---#
	checkpoint_path = os.path.join(args.ckpt_dir, args.checkpoint_name + args.model_name + '.pth')
	os.makedirs(args.result_dir, exist_ok=True)
	
	#---load and set model---#
	print('Loading model: ', checkpoint_path)
	checkpoint = torch.load(checkpoint_path)
	model.load_state_dict(checkpoint["state_dict"])
	
	if args.long_input:
		model.decoder.max_decoder_steps = 500  # large max_decoder_steps to handle long output sentences
	else:
		model.decoder.max_decoder_steps = 50
		
	if args.interactive:
		output_name = args.result_dir + args.model

		#---testing loop---#
		while True:
			try:
				text = input('< Tacotron > Text to speech: ')
				text = ch2pinyin(text)
				print('Model input: ', text)
				synthesis_speech(model, text=text, figures=args.plot, path=output_name)
			except KeyboardInterrupt:
				print()
				print('Terminating!')
				break

	else:
		output_name = args.result_dir + args.model + '/'
		os.makedirs(output_name, exist_ok=True)

		#---testing flow---#
		with open(args.test_file_path, 'r', encoding='utf-8') as f:
			
			lines = f.readlines()
			for idx, line in enumerate(lines):
				text = ch2pinyin(line)
				print("{}: {} - {} ({} words, {} chars)".format(idx, line, text, len(line), len(text)))
				synthesis_speech(model, text=text, figures=args.plot, path=output_name+line)

		print("Finished! Check out {} for generated audio samples.".format(output_name))
	
	sys.exit(0)
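get_test_args is not shown; judging from the attributes the script reads, it plausibly wraps argparse roughly as follows (flag names and defaults are assumptions):

import argparse

def get_test_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--ckpt_dir', default='ckpt/')
    parser.add_argument('--checkpoint_name', default='checkpoint_step')
    parser.add_argument('--model_name', default='tacotron')
    parser.add_argument('--model', default='tacotron')
    parser.add_argument('--result_dir', default='result/')
    parser.add_argument('--test_file_path', default='test.txt')
    parser.add_argument('--interactive', action='store_true')
    parser.add_argument('--long_input', action='store_true')
    parser.add_argument('--plot', action='store_true')
    return parser.parse_args()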