Example #1
    def test_init_e0_load(self):
        # Try epochs=0, loading existing model.
        hparams = self._get_hparams()
        hparams.use_gpu = True
        hparams.epochs = 0
        hparams.out_dir = os.path.join(
            hparams.out_dir, "test_init_e0_load")  # Add function name to path.
        hparams.model_type = None

        with unittest.mock.patch.object(ModelTrainer.logger,
                                        "warning") as mock_logger:
            trainer = self._get_trainer(hparams)
            mock_logger.assert_called_with(
                "No CUDA device available, use CPU mode instead.")

        target_dir = os.path.join(hparams.out_dir, hparams.networks_dir)
        makedirs_safe(target_dir)
        shutil.copyfile(
            os.path.join("integration", "fixtures",
                         "test_model_in409_out67.nn"),
            os.path.join(target_dir, hparams.model_name))
        trainer.init(hparams)
        self.assertIsNotNone(trainer.model_handler.model)

        shutil.rmtree(hparams.out_dir)
Example #2
    def _get_synth_dir(hparams: ExtendedHParams,
                       use_model_name: bool = True,
                       epoch: int = None,
                       step: int = None) -> os.PathLike:
        if hparams.has_value("synth_dir"):
            save_dir = hparams.synth_dir
        else:
            if hparams.has_value("out_dir"):
                save_dir = [hparams.out_dir]
            else:
                save_dir = [os.path.curdir]

            if use_model_name and hparams.has_value("model_name"):
                save_dir.append(hparams.model_name)

            save_dir.append(Synthesiser.SYNTH_SUB_DIR)

            if epoch is not None:
                save_dir.append("e" + str(epoch))
            elif step is not None:
                save_dir.append("s" + str(step))

            save_dir = os.path.join(*save_dir)

        makedirs_safe(save_dir)
        logging.info("Selected {} as synthesis directory.".format(save_dir))
        return save_dir
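A minimal sketch of the path assembly above, for the branch where synth_dir is unset but out_dir and model_name are set (all values below are hypothetical, not project defaults):

import os

def synth_dir_sketch(out_dir="experiments", model_name="my_model",
                     sub_dir="synth", epoch=3):
    # Mirrors: [out_dir, model_name, SYNTH_SUB_DIR, "e<epoch>"] joined in order.
    parts = [out_dir, model_name, sub_dir]
    if epoch is not None:
        parts.append("e" + str(epoch))
    return os.path.join(*parts)

print(synth_dir_sketch())  # experiments/my_model/synth/e3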
Example #3
    def synthesize(self, file_id_list, synth_output, hparams):

        # Create speaker subdirectories if necessary.
        for id_name in file_id_list:
            sub_dir = os.path.dirname(id_name)
            if sub_dir:  # The id contains a speaker subdirectory.
                makedirs_safe(os.path.join(hparams.synth_dir, sub_dir))

        if hparams.synth_vocoder == "WORLD":
            Synthesiser.run_world_synth(synth_output, hparams)
        # elif hparams.synth_vocoder == "STRAIGHT":  # Add further vocoders here.

        elif hparams.synth_vocoder == "r9y9wavenet_mulaw_16k_world_feats_English":
            Synthesiser.run_r9y9wavenet_mulaw_world_feats_synth(
                synth_output, hparams)

        elif hparams.synth_vocoder == "raw":
            # The features in the synth_output dictionary are raw waveforms and can be written directly to the file.
            Synthesiser.run_raw_synth(synth_output, hparams)

        elif hparams.synth_vocoder == "80_SSRN_English_GL":
            # Use a pre-trained spectrogram super resolution network for English and Griffin-Lim.
            # The features in the synth_output should be mfbanks.
            raise NotImplementedError()  # TODO

        elif hparams.synth_vocoder == "r9y9wavenet":
            # Synthesise with a pre-trained r9y9 WaveNet. The hyper-parameters have to match the model.
            Synthesiser.run_wavenet_vocoder(synth_output, hparams)
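The vocoder choice above is a plain if/elif chain over hparams.synth_vocoder. As a design note, the same dispatch can be written as a lookup table; this is only a sketch reusing the handlers shown above, not the project's API:

# Sketch: name-to-handler table for the vocoders listed above.
VOCODER_HANDLERS = {
    "WORLD": Synthesiser.run_world_synth,
    "raw": Synthesiser.run_raw_synth,
    "r9y9wavenet": Synthesiser.run_wavenet_vocoder,
    "r9y9wavenet_mulaw_16k_world_feats_English":
        Synthesiser.run_r9y9wavenet_mulaw_world_feats_synth,
}

def synthesize_sketch(synth_output, hparams):
    try:
        handler = VOCODER_HANDLERS[hparams.synth_vocoder]
    except KeyError:
        raise NotImplementedError(hparams.synth_vocoder)
    return handler(synth_output, hparams)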
Example #4
def main():
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("-w", "--dir_wav", help="Directory containing the wav files.", type=str,
                        dest="dir_wav", required=True)
    parser.add_argument("-o", "--dir_out", help="Directory to save the trimmed files.", type=str,
                        dest="dir_out", required=True)
    parser.add_argument("-f", "--file_id_list", help="Full path to file containing the ids.", type=str,
                        dest="file_id_list", required=True)
    parser.add_argument("--format", help="Format of the audio file, e.g. WAV.", type=str,
                        dest="format", required=False, default='wav')

    # Parse arguments
    args = parser.parse_args()

    # Read which files to process.
    with open(args.file_id_list) as f:
        id_list = f.readlines()
    # Trim entries in-place.
    id_list[:] = [s.strip(' \t\n\r') for s in id_list]

    # Create output directory if missing.
    makedirs_safe(args.dir_out)

    # Start noise reduction.
    noise_reducer = SingleChannelNoiseReduction()
    noise_reducer.process_list(id_list, args.dir_wav, args.dir_out, args.format)
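Assuming this main() lives in a script named remove_noise.py (the file name is hypothetical), the invocation would look roughly like:

# python remove_noise.py -w database/wav -o database/wav_cleaned \
#     -f database/file_id_list.txt --format wav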
Example #5
    def synth(self, hparams, ids_input):
        """
        Synthesise all given ids with the self.synthesize function.

        :param hparams:        Hyper-parameter container.
        :param ids_input:      Can be full path to file with ids, list of ids, or one id.
        :return:               (Dictionary of network outputs, dictionary of post-processed (by self.OutputGen) network outputs)
        """

        assert (self.model_handler
                is not None)  # Check if trainer.init() was called before.
        assert (
            hparams.synth_dir is not None
        )  # Directory to store the generated audio files has to be set.
        makedirs_safe(hparams.synth_dir)
        id_list = ModelTrainer._input_to_str_list(ids_input)

        self.logger.info("Start synthesising [{0}]".format(", ".join(
            str(i) for i in id_list)))
        t_start = timer()
        model_output, model_output_post = self._forward_batched(
            hparams,
            id_list,
            hparams.batch_size_synth,
            load_target=False,
            synth=True,
            benchmark=False,
            gen_figure=hparams.synth_gen_figure)
        t_synth = timer() - t_start
        self.logger.info('Synthesis time for {} sample(s): {}'.format(
            len(id_list), timedelta(seconds=t_synth)))

        return model_output, model_output_post
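All three accepted forms of ids_input reduce to the same id list via ModelTrainer._input_to_str_list; a hedged usage sketch with hypothetical ids and paths:

# trainer.synth(hparams, "database/file_id_list.txt")  # Path to an id file.
# trainer.synth(hparams, ["utt_0001", "utt_0002"])     # List of ids.
# trainer.synth(hparams, "utt_0001")                   # A single id.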
Example #6
    def save_to_file(self, filename):
        if self.plt is None:
            logging.error(
                "No generated plot exists, please run 'gen_plot()' first.")
        else:
            makedirs_safe(os.path.dirname(filename))
            self.plt.savefig(filename, bbox_inches=0)
            logging.info("Figure saved as " + filename)
Example #7
    def test_train_exponential_decay(self):
        # logging.basicConfig(level=logging.INFO)

        hparams = self._get_hparams()
        hparams.out_dir = os.path.join(
            hparams.out_dir,
            "test_train_exponential_decay")  # Add function name to path.
        hparams.epochs = 1
        hparams.model_type = None
        target_dir = os.path.join(hparams.out_dir, hparams.networks_dir)
        makedirs_safe(target_dir)
        shutil.copyfile(
            os.path.join("integration", "fixtures",
                         "test_model_in409_out67.nn"),
            os.path.join(target_dir, hparams.model_name))
        # hparams.model_type = "RNNDYN-1_RELU_32-1_FC_{}".format(3 * hparams.num_coded_sps + 7)
        hparams.seed = 1234
        hparams.optimiser_args["lr"] = 0.001
        hparams.scheduler_type = "Exponential"
        hparams.scheduler_args["gamma"] = 0.9
        trainer = self._get_trainer(hparams)

        trainer.init(hparams)
        for group in trainer.model_handler.optimiser.param_groups:
            group.setdefault('initial_lr', hparams.optimiser_args["lr"]
                             )  # Add missing initial_lr to all groups.
        trainer.total_epoch = 10  # Artificially set the total number higher to compute the decay.
        trainer.train(hparams)

        expected_lr = hparams.optimiser_args["lr"] * hparams.scheduler_args[
            "gamma"]**(11 * len(trainer.id_list_train))
        self.assertEqual(
            expected_lr, trainer.model_handler.optimiser.param_groups[0]["lr"],
            "Exponential decay was not computed based on total number of epochs. "
            "Which should be the case when hparams.use_saved_learning_rate=True."
        )

        # Try again with reset learning rate.
        trainer = self._get_trainer(hparams)
        hparams.use_saved_learning_rate = False
        trainer.init(hparams)
        for group in trainer.model_handler.optimiser.param_groups:
            group.setdefault('initial_lr', hparams.optimiser_args["lr"]
                             )  # Add missing initial_lr to all groups.
        trainer.total_epoch = 10  # Artificially set the total number higher to compute the decay.
        trainer.train(hparams)

        expected_lr = hparams.optimiser_args["lr"] * hparams.scheduler_args[
            "gamma"]**len(trainer.id_list_train)
        self.assertEqual(
            expected_lr, trainer.model_handler.optimiser.param_groups[0]["lr"],
            "Exponential decay was not reset for this training loop, "
            "which should be the case when hparam.use_saved_learning_rate=False."
        )

        shutil.rmtree(hparams.out_dir)
Example #8
    def process_file(self, file, dir_audio, dir_out):

        raw, fs = soundfile.read(os.path.join(dir_audio, file))

        raw = self.highpass_filter(raw, fs)

        out_file = os.path.join(dir_out, file)
        makedirs_safe(os.path.dirname(out_file))
        soundfile.write(out_file, raw, samplerate=fs)

        return raw
Example #9
    def save_checkpoint(self,
                        model_path: Union[str, os.PathLike],
                        best_loss: np.ndarray = np.inf,
                        epoch: int = None, step: int = None,
                        save_as_best_model: bool = False,
                        save_as_epoch: bool = True,
                        save_as_last_model: bool = False,
                        save_as_step: bool = True):
        assert save_as_best_model or save_as_last_model or step is not None \
            or epoch is not None, "Epoch or step needs to be given."
        assert model_path is not None, "Given model_path cannot be None."

        if save_as_best_model:
            suffix = "best"
        elif save_as_last_model:
            suffix = "last"
        elif epoch is not None and save_as_epoch:
            suffix = "e{}".format(epoch)
        elif step is not None and save_as_step:
            suffix = "s{}".format(step)
        else:
            raise NotImplementedError()
        self.logger.info("Save {} checkpoint to {}.".format(suffix, model_path))
        makedirs_safe(model_path)

        config_json = self.model.get_config_as_json()
        # TODO: Dump hparams in it as well?
        with open(os.path.join(model_path, "config.json"), "w") as f:
            f.write(config_json)

        params = self.model.state_dict()
        if self.ema:
            # Update only the parameters which are shadowed.
            params.update(self.ema.shadow)
            self.logger.info("Updated checkpoint with EMA model parameters {}."
                             .format(", ".join(self.ema.shadow.keys())))

        checkpoint = {"params": params, "epoch": epoch, "step": step}
        torch.save(checkpoint, os.path.join(model_path, "params_" + suffix))

        if self.optimiser is not None:
            opt_params = self.optimiser.state_dict()
            checkpoint = {"params": opt_params, "epoch": epoch, "step": step,
                          "best_loss": best_loss}
            torch.save(checkpoint, os.path.join(model_path, "optimiser_" + suffix))

        if self.scheduler is not None:
            scheduler_params = self.scheduler.state_dict()
            checkpoint = {"params": scheduler_params, "epoch": epoch,
                          "step": step}
            torch.save(checkpoint, os.path.join(model_path, "scheduler_" + suffix))
Example #10
    def process_file(self, file, dir_audio, dir_out):

        raw, fs = soundfile.read(os.path.join(dir_audio, file))

        raw -= raw.mean()
        raw *= math.sqrt(len(raw) * self.ref_rms**2 / (raw**2).sum())

        out_file = os.path.join(dir_out, file)
        makedirs_safe(os.path.dirname(out_file))
        soundfile.write(out_file, raw, samplerate=fs)

        return raw
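The scaling line is RMS normalisation: with N = len(raw), the factor sqrt(N * ref_rms**2 / sum(raw**2)) equals ref_rms / RMS(raw), so the mean-free signal ends up with RMS exactly self.ref_rms. A self-contained check:

import math
import numpy as np

raw = np.random.randn(16000)  # Stand-in signal.
ref_rms = 0.1
raw -= raw.mean()
raw *= math.sqrt(len(raw) * ref_rms ** 2 / (raw ** 2).sum())
assert math.isclose(math.sqrt((raw ** 2).mean()), ref_rms)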
Example #11
    def process_file(self,
                     file,
                     dir_audio,
                     dir_out,
                     silence_threshold_db=-50,
                     hop_size_ms=None):

        raw, fs = soundfile.read(os.path.join(dir_audio, file))

        frame_length = AudioProcessing.fs_to_frame_length(fs)
        if hop_size_ms is None:
            hop_size_ms = min(self.min_silence_ms, 32)

        _, indices = librosa.effects.trim(raw,
                                          top_db=abs(silence_threshold_db),
                                          frame_length=frame_length,
                                          hop_length=int(fs / 1000 *
                                                         hop_size_ms))

        trim_start = indices[0] / fs * 1000
        trim_end = (len(raw) - indices[1]) / fs * 1000

        # Add silence to the front if the audio starts too early.
        if trim_start < self.min_silence_ms:
            # TODO: Find a robust way to create silence so that alignment still
            #       works (maybe concat mirrored segments).
            logging.warning(
                "File {} has only {} ms of silence in the beginning.".format(
                    file, trim_start))
            trim_start = 0
        else:
            trim_start -= self.min_silence_ms

        # Append silence if audio ends too late.
        if trim_end < self.min_silence_ms:
            # See TODO above.
            logging.warning(
                "File {} has only {} ms of silence in the end.".format(
                    file, trim_end))
            trim_end = 0
        else:
            trim_end -= self.min_silence_ms

        start_frame = int(trim_start * fs / 1000)
        end_frame = int(-trim_end * fs / 1000 - 1)
        trimmed_raw = raw[start_frame:end_frame]

        out_file = os.path.join(dir_out, file)
        makedirs_safe(os.path.dirname(out_file))
        soundfile.write(out_file, trimmed_raw, samplerate=fs)

        return trimmed_raw
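librosa.effects.trim returns the trimmed signal and a pair of sample indices [start, end]; the two lines after the call convert those into leading and trailing silence in milliseconds. A worked example with assumed numbers: at fs=16000, indices=[8000, 152000] on a 160000-sample file gives trim_start = 8000 / 16000 * 1000 = 500 ms and trim_end = (160000 - 152000) / 16000 * 1000 = 500 ms.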
Example #12
    def gen_data(dir_in, file_questions, dir_out=None, file_id_list="",
                 id_list=None, return_dict=False):
        """
        Generate question labels from HTK labels.

        :param dir_in:         Directory containing the HTK labels.
        :param file_questions: Full file path to the question file.
        :param dir_out:        Directory to store the question labels.
                               If None, labels are not saved.
        :param file_id_list:   Name of the file containing the ids.
                               Normalisation parameters are saved using
                               this name to differentiate parameters
                               between subsets.
        :param id_list:        The list of utterances to process. Should
                               have the form uttId1 \\n uttId2 \\n ...
                               \\n uttIdN. If None, all .lab files in
                               dir_in are used.
        :param return_dict:    If true, returns an OrderedDict of all
                               samples as first output.
        :return:               Returns two normalisation parameters as
                               tuple. If return_dict is True it returns
                               all processed labels in an OrderedDict
                               followed by the two normalisation
                               parameters.
        """

        # Fill file_id_list by .lab files in dir_in, if not given, and
        # set an appropriate file_id_list_name.
        if id_list is None:
            id_list = list()
            filenames = glob.glob(os.path.join(dir_in, "*.lab"))
            for filename in filenames:
                id_list.append(os.path.splitext(os.path.basename(filename))[0])
            file_id_list_name = "all"
        else:
            file_id_list_name = os.path.splitext(
                os.path.basename(file_id_list))[0]
            id_list = ['{}'.format(os.path.basename(element))  # Ignore full path.
                       for element in id_list]

        if dir_out is not None:
            makedirs_safe(dir_out)

        label_operator = HTSLabelNormalisation(file_questions)
        if return_dict:
            label_dict, norm_params = label_operator.perform_normalisation(
                file_id_list_name, id_list, dir_in, dir_out, return_dict=True)
            return label_dict, norm_params[0], norm_params[1]
        else:
            norm_params = label_operator.perform_normalisation(
                file_id_list_name, id_list, dir_in, dir_out, return_dict=False)
            return norm_params[0], norm_params[1]
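A usage sketch following the signature above; the class name and all paths are placeholders:

# norm_a, norm_b = QuestionLabelGen.gen_data(
#     dir_in="labels/label_state_align",
#     file_questions="questions/questions-en.hed",
#     dir_out="labels/questions",
#     file_id_list="file_id_list_train.txt")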
Example #13
    def process_file(self, file, dir_audio, dir_out):

        raw, fs = soundfile.read(os.path.join(dir_audio, file))

        data_noisy_matlab = self.nparray_to_matlab(raw)
        data_noisy_matlab = self.eng.transpose(data_noisy_matlab)

        enhanced = self.eng.runme(data_noisy_matlab, fs)

        out_file = os.path.join(dir_out, file)
        makedirs_safe(os.path.dirname(out_file))
        soundfile.write(out_file, enhanced, samplerate=fs)

        return enhanced
Example #14
    def test_init_load(self):
        # Try epochs=3, loading existing model.
        hparams = self._get_hparams()
        hparams.out_dir = os.path.join(
            hparams.out_dir, "test_init_load")  # Add function name to path.
        hparams.model_type = None
        target_dir = os.path.join(hparams.out_dir, hparams.networks_dir)
        makedirs_safe(target_dir)
        shutil.copyfile(
            os.path.join("integration", "fixtures",
                         "test_model_in409_out67.nn"),
            os.path.join(target_dir, hparams.model_name))
        trainer = self._get_trainer(hparams)
        trainer.init(hparams)
        self.assertIsNotNone(trainer.model_handler.model)

        shutil.rmtree(hparams.out_dir)
Example #15
    def _save_to_npz(file_path: os.PathLike, features: np.ndarray,
                     feature_name: str) -> None:

        makedirs_safe(os.path.dirname(file_path))
        file_path = str(file_path)  # Support os.PathLike input.
        if not file_path.endswith(".npz"):
            file_path += ".npz"
        file_path_backup = file_path + "_bak"

        clean_backup_file = False

        if os.path.isfile(file_path):
            saved_features = dict(np.load(file_path))

            os.rename(file_path, file_path_backup)
            clean_backup_file = True

            if feature_name in saved_features:
                logging.info("Overriding {} in {}.".format(
                    feature_name, file_path))
            saved_features[feature_name] = features
        else:
            saved_features = {feature_name: features}

        try:
            np.savez(file_path, **saved_features)
        except BaseException:
            if os.path.isfile(file_path_backup):
                logging.error("Error when writing {}, restoring backup".format(
                    file_path))
                if os.path.isfile(file_path):
                    os.remove(file_path)
                os.rename(file_path_backup, file_path)
                clean_backup_file = False
            else:
                logging.error("Error when writing {}.".format(file_path))
            raise

        if clean_backup_file:
            os.remove(file_path_backup)
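Reading a stored feature back is plain NumPy; the file path and feature name below are hypothetical:

import numpy as np

with np.load("features/utt_0001.npz") as data:
    mgc = data["mgc"]  # One array per feature_name written above.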
Example #16
def main():
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = "English"
    hparams.model_name = "WarpingLayerTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    # hparams.num_questions = 505
    hparams.num_questions = 425
    hparams.out_dir = "experiments/" + hparams.voice + "/VTLNArtificiallyWarped/"
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "warping_layer_test"
    hparams.synth_dir = hparams.out_dir
    batch_size = 2
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer(
        "experiments/" + hparams.voice + "/WORLD",
        "experiments/" + hparams.voice + "/questions", "ignored",
        hparams.num_questions, hparams)

    sp_mean = gen_in.norm_params[0][:hparams.num_coded_sps *
                                    (3 if hparams.add_deltas else 1)]
    sp_std_dev = gen_in.norm_params[1][:hparams.num_coded_sps *
                                       (3 if hparams.add_deltas else 1)]
    wl = WarpingLayer((hparams.num_coded_sps, ), (hparams.num_coded_sps, ),
                      hparams)
    wl.set_norm_params(sp_mean, sp_std_dev)

    # id_list = ["dorian/doriangray_16_00199"]
    id_list = ["p225/p225_051"]
    hparams.num_speakers = 1

    t_benchmark = 0
    for id_name in id_list:
        for idx, alpha in enumerate(np.arange(-0.15, 0.2, 0.05)):
            out_dir = hparams.out_dir + "alpha_{0:0.2f}/".format(alpha)
            makedirs_safe(out_dir)

            sample = WorldFeatLabelGen.load_sample(
                id_name,
                os.path.join("experiments", hparams.voice, "WORLD"),
                add_deltas=True,
                num_coded_sps=hparams.num_coded_sps)
            sample_pre = gen_in.preprocess_sample(sample)
            coded_sps = sample_pre[:, :hparams.num_coded_sps *
                                   (3 if hparams.add_deltas else 1)]

            alpha_vec = np.ones((coded_sps.shape[0], 1)) * alpha

            coded_sps = coded_sps[:len(alpha_vec), None, ...].repeat(
                batch_size, 1)  # Copy data in batch dimension.
            alpha_vec = alpha_vec[:, None, None].repeat(
                batch_size, 1)  # Copy data in batch dimension.

            t_start = timer()
            mfcc_warped, (_, nn_alpha) = wl(torch.from_numpy(coded_sps),
                                            None, (len(coded_sps), ),
                                            (len(coded_sps), ),
                                            alphas=torch.from_numpy(alpha_vec))
            mfcc_warped.sum().backward()
            t_benchmark += timer() - t_start
            assert ((mfcc_warped[:, 0] == mfcc_warped[:, 1]).all()
                    )  # Compare results for cloned coded_sps within batch.
            if np.isclose(alpha, 0.0):  # np.arange may not yield exactly 0.0.
                assert torch.allclose(
                    mfcc_warped, torch.from_numpy(coded_sps)
                )  # Compare results for no warping.
            sample_pre[:len(mfcc_warped), :hparams.num_coded_sps * (
                3 if hparams.add_deltas else 1)] = mfcc_warped[:, 0].detach()

            sample_post = gen_in.postprocess_sample(sample_pre)
            # Manually create samples without normalisation but with deltas.
            sample_pre = (sample_pre * gen_in.norm_params[1] +
                          gen_in.norm_params[0]).astype(np.float32)

            if np.isnan(sample_pre).any():
                raise ValueError(
                    "Detected nan values in output features for {}.".format(
                        id_name))
            # Save warped features.
            makedirs_safe(os.path.dirname(os.path.join(out_dir, id_name)))
            sample_pre.tofile(
                os.path.join(out_dir, id_name + WorldFeatLabelGen.ext_deltas))

            hparams.synth_dir = out_dir
            Synthesiser.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} runs: {}".format(
        len(id_list) * idx, timedelta(seconds=t_benchmark)))
Example #17
    def gen_data(self,
                 dir_in,
                 dir_out=None,
                 file_id_list=None,
                 id_list=None,
                 return_dict=False):
        """
        Prepare atom labels from wav files.
        If id_list is not None, only the ids listed there are generated,
        otherwise one label per .wav file in dir_in. Atoms are computed by
        the wcad algorithm; examples with more than 70 atoms are rejected.
        A file_id_list containing only the ids with successful extraction
        is written next to dir_in, but the current file_id_list is not
        replaced by it. The algorithm also saves the extracted phrase
        component in dir_out/id_name.phrase, if dir_out is not None.

        :param dir_in:           Directory containing the original wav files.
        :param dir_out:          Directory where the labels are stored. If None, no labels are stored.
        :param file_id_list:     Name of the file containing the ids. Normalisation parameters are saved using
                                 this name to differentiate parameters between subsets.
        :param id_list:          The list of utterances to process.
                                 Should have the form uttId1 \\n uttId2 \\n ...\\n uttIdN.
                                 If None, all .wav files in dir_in are used.
        :param return_dict:      If True, returns an OrderedDict of all samples as first output.
        :return:                 Returns mean=0.0, std_dev, min, max of atoms.
        """

        # Fill file_id_list by .wav files in dir_in if not given and set an appropriate file_id_list_name.
        if id_list is None:
            id_list = list()
            filenames = glob.glob(os.path.join(dir_in, "*.wav"))
            for filename in filenames:
                id_list.append(os.path.splitext(os.path.basename(filename))[0])
            file_id_list_name = "all"
        else:
            file_id_list_name = os.path.splitext(
                os.path.basename(file_id_list))[0]

        if dir_out is not None:
            makedirs_safe(dir_out)

        if return_dict:
            label_dict = OrderedDict()

        mean_std_ext_atom = MeanStdDevExtractor()
        min_max_ext_atom = MinMaxExtractor()
        mean_std_ext_phrase = MeanStdDevExtractor()
        min_max_ext_phrase = MinMaxExtractor()

        # Compute Atoms.
        from wcad import WaveInput, PitchExtractor, MultiphraseExtractor, DictionaryGenerator, AtomExtrator, ModelCreator, ModelSaver, Params, Paths
        correct_utts = list()
        self.logger.info("Create atom labels for " +
                         "[{0}]".format(", ".join(str(i) for i in id_list)))
        for id_name in id_list:
            self.logger.debug("Create atom labels for " + id_name)

            # Wcad has to be called in its root directory, therefore a change dir operation is necessary.
            cwd = os.getcwd()
            os.chdir(self.wcad_root)
            args = [dir_in + "/" + id_name + ".wav", dir_out]
            print(args)
            params = Params()
            # Overwrite the possible theta values by selected values.
            params.local_atoms_thetas = self.theta_interval
            params.k = [self.k]
            # params.min_atom_amp = 0.1
            paths = Paths(args, params)
            # Start the extraction process.
            start_t = time.time()
            waveform = WaveInput(paths.wav, params).read()
            pitch = PitchExtractor(waveform, params, paths).compute()
            # Compute the phrase component.
            phrase = MultiphraseExtractor(pitch, waveform, params,
                                          paths).compute()
            phrase_curve = phrase.curve
            # Extract atoms.
            dictionary = DictionaryGenerator(params, paths).compute()
            atoms = AtomExtrator(waveform, pitch, phrase, dictionary, params,
                                 paths).compute()
            # Create a model.
            model = ModelCreator(phrase, atoms, pitch).compute()
            print('Model created in %s seconds' % (time.time() - start_t))
            # Save the atoms.
            ModelSaver(model, params, paths).save()
            os.chdir(cwd)

            # Check if output can be correct.
            possible_extraction_failure = False
            if len(atoms) < 50 and not any(a.amp > 10 for a in atoms):
                correct_utts.append(id_name)
            else:
                self.logger.warning("Possible fail of atom extractor for " +
                                    id_name + " (atoms: " + str(len(atoms)) +
                                    ", frames: " + str(len(phrase_curve)) +
                                    ", max: " +
                                    str(max(a.amp for a in atoms)) + ").")
                possible_extraction_failure = True

            atoms.sort(key=lambda x: x.position)
            # print_atoms(atoms)

            # Get audio length needed to trim the atoms.
            duration = self.get_audio_length(id_name, dir_in,
                                             self.frame_size_ms)

            # The algorithm generates a few atoms at negative positions,
            # pad them into the first atom at positive position.
            padded_amp = 0
            padded_theta = 0
            for idx, atom in enumerate(atoms):
                if atom.position < 0:
                    padded_amp += atom.amp
                    padded_theta += atom.theta
                else:
                    atoms[idx].amp += padded_amp  # Pad the amplitude.
                    atoms[idx].theta = (atoms[idx].theta +
                                        padded_theta) / (idx + 1)
                    del atoms[:idx]  # Remove the negative atoms from the list.
                    break
            # print_atoms(atoms)

            # The algorithm might also generate a few atoms beyond the last label,
            # pad them into the last label.
            padded_amp = 0
            padded_theta = 0
            for idx, atom in reversed(list(enumerate(atoms))):
                if atom.position * self.frame_size_ms > duration:
                    padded_amp += atom.amp
                    padded_theta += atom.theta
                else:
                    atoms[idx].amp += padded_amp
                    atoms[idx].theta = (atoms[idx].theta +
                                        padded_theta) / (len(atoms) - idx)
                    atoms = atoms[:-(len(atoms) - idx - 1)
                                  or None]  # Remove atoms beyond last label.
                    break
            # print_atoms(atoms)

            # Create a label for each frame (size of frame_size_ms) with amplitude and theta of contained atoms.
            np_atom_labels = AtomLabelGen.atoms_to_labels(
                atoms, self.theta_interval, int(duration / self.frame_size_ms))

            np_atom_amps = np.sum(np_atom_labels, axis=1)

            if not possible_extraction_failure:  # Only add successful extractions to mean and std_dev computation.
                mean_std_ext_atom.add_sample(
                    np_atom_amps[np_atom_amps[:, 0] != 0.0]
                )  # Only compute std_dev from atoms.
                min_max_ext_atom.add_sample(np_atom_amps)
                # mean_std_ext_phrase.add_sample(phrase_curve)
                # min_max_ext_phrase.add_sample(phrase_curve)

            if return_dict:
                label_dict[id_name] = np_atom_labels
            if dir_out is not None:
                # Save phrase, because it might be used in synthesis.
                phrase_curve.astype('float32').tofile(
                    os.path.join(dir_out, id_name + self.ext_phrase))

                # Save atoms binary (float32).
                np_atom_labels.astype('float32').tofile(
                    os.path.join(dir_out, id_name + self.ext_atoms))

                # Create a readable version of the atom data.
                # np.savetxt(os.path.join(dir_out, id_name + self.ext_atoms + ".txt"), np_atom_labels)

        # Manually set mean of atoms to 0, otherwise frames without atom will have an amplitude.
        if mean_std_ext_atom.sum_length > 0:  # Make sure at least one atom was added.
            mean_std_ext_atom.sum_frames[:] = 0.0
        else:
            mean_std_ext_atom.sum_frames = np.zeros(np_atom_amps.shape[1:])
            mean_std_ext_atom.sum_squared_frames = np.zeros(
                np_atom_amps.shape[1:])
        mean_std_ext_atom.sum_squared_frames[
            1] = mean_std_ext_atom.sum_length * self.theta_interval[-1]

        if dir_out is not None:
            mean_std_ext_atom.save(os.path.join(dir_out, file_id_list_name))
            min_max_ext_atom.save(os.path.join(dir_out, file_id_list_name))
        # mean_std_ext_phrase.save(os.path.join(dir_out, file_id_list_name + '-phrase'))
        # min_max_ext_phrase.save(os.path.join(dir_out, file_id_list_name + '-phrase'))

        mean_atoms, std_atoms = mean_std_ext_atom.get_params()
        min_atoms, max_atoms = min_max_ext_atom.get_params()
        # mean_phrase, std_phrase = mean_std_ext_phrase.get_params()
        # min_phrase, max_phrase = min_max_ext_atom.get_params()

        # Use this block to save the part of the file_id_list for which atom extraction was successful into a new file.
        if correct_utts:
            with open(
                    os.path.join(
                        os.path.dirname(dir_in), "wcad_" +
                        os.path.basename(file_id_list_name) + ".txt"),
                    'w') as f:
                f.write('\n'.join(correct_utts) + '\n')

        if return_dict:
            # Return dict of labels for all utterances.
            return label_dict, \
                   mean_atoms, std_atoms, \
                   min_atoms, max_atoms
            # mean_phrase, std_phrase, \
            # min_phrase, max_phrase
        else:
            return mean_atoms, std_atoms, \
                   min_atoms, max_atoms
Example #18
    def process_file(self,
                     file,
                     dir_audio,
                     dir_out,
                     silence_threshold_db=-50,
                     hop_size_ms=None):
        # sound = AudioSegment.from_file(os.path.join(dir_audio, file), format=audio_format)
        # trim_start = self._detect_leading_silence(sound, silence_threshold_db, chunk_size_ms)
        # trim_end = self._detect_leading_silence(sound.reverse(), silence_threshold_db, chunk_size_ms)

        raw, fs = soundfile.read(os.path.join(dir_audio, file))

        frame_length = WorldFeatLabelGen.fs_to_frame_length(fs)
        if hop_size_ms is None:
            hop_size_ms = min(self.min_silence_ms, 32)

        _, indices = librosa.effects.trim(raw,
                                          top_db=abs(silence_threshold_db),
                                          frame_length=frame_length,
                                          hop_length=int(fs / 1000 *
                                                         hop_size_ms))
        trim_start = indices[0] / fs * 1000
        trim_end = (len(raw) - indices[1]) / fs * 1000

        # Add silence to the front if the audio starts too early.
        if trim_start < self.min_silence_ms:
            # TODO: Find a robust way to create silence so that HTK alignment still works (maybe concat mirrored segments).
            logging.warning(
                "File {} has only {} ms of silence in the beginning.".format(
                    file, trim_start))
            # AudioSegment.silent(duration=self.min_silence_ms-trim_start)
            # if trim_start > 0:
            #     silence = (sound[:trim_start] * (math.ceil(self.min_silence_ms / trim_start) - 1))[:self.min_silence_ms-trim_start]
            #     sound = silence + sound
            # elif trim_end > 0:
            #     silence = (sound[-trim_end:] * (math.ceil(self.min_silence_ms / trim_end) - 1))[:self.min_silence_ms-trim_end]
            #     sound = silence + sound
            # else:
            #     self.logger.warning("Cannot append silence to the front of " + file + ". No silence exists at front or end which can be copied.")
            trim_start = 0
        else:
            trim_start -= self.min_silence_ms

        # Append silence if audio ends too late.
        if trim_end < self.min_silence_ms:
            logging.warning(
                "File {} has only {} ms of silence in the end.".format(
                    file, trim_end))
            # silence = AudioSegment.silent(duration=self.min_silence_ms-trim_end)
            # if trim_end > 0:
            #     silence = (sound[-trim_end:] * (math.ceil(self.min_silence_ms / trim_end) - 1))[:self.min_silence_ms-trim_end]
            #     sound = sound + silence
            # elif trim_start > 0:
            #     silence = (sound[:trim_start] * (math.ceil(self.min_silence_ms / trim_start) - 1))[:self.min_silence_ms-trim_start]
            #     sound = sound + silence
            # else:
            #     self.logger.warning("Cannot append silence to the end of " + file + ". No silence exists at front or end which can be copied.")
            trim_end = 0
        else:
            trim_end -= self.min_silence_ms

        # Trim the sound.
        trimmed_raw = raw[int(trim_start * fs /
                              1000):int(-trim_end * fs / 1000 - 1)]
        # trimmed_sound = sound[trim_start:-trim_end-1]

        # Save trimmed sound to file.
        out_file = os.path.join(dir_out, file)
        makedirs_safe(os.path.dirname(out_file))
        soundfile.write(out_file, trimmed_raw, samplerate=fs)

        return trimmed_raw
Example #19
def main():
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("-w",
                        "--dir_wav",
                        help="Directory containing the wav files.",
                        type=str,
                        dest="dir_wav",
                        required=True)
    parser.add_argument("-o",
                        "--dir_out",
                        help="Directory to save the trimmed files.",
                        type=str,
                        dest="dir_out",
                        required=True)
    parser.add_argument("-f",
                        "--file_id_list",
                        help="Full path to file containing the ids.",
                        type=str,
                        dest="file_id_list",
                        required=True)
    parser.add_argument("--format",
                        help="Format of the audio file, e.g. WAV.",
                        type=str,
                        dest="format",
                        required=False,
                        default='wav')
    parser.add_argument(
        "--silence_db",
        help="Threshold until which a frame is considered to be silent.",
        type=int,
        dest="silence_threshold_db",
        required=False,
        default=-50)
    parser.add_argument(
        "--chunk_size",
        help="Size of the chunk (frame size) in ms on which db is computed.",
        type=int,
        dest="chunk_size_ms",
        required=False,
        default=10)
    parser.add_argument(
        "--min_silence_ms",
        help=
        "Milliseconds of silence which are always kept in front and back of audio file.",
        type=int,
        dest="min_silence_ms",
        required=False,
        default=200)

    # Parse arguments
    args = parser.parse_args()

    # Read which files to process.
    with open(args.file_id_list) as f:
        id_list = f.readlines()
    # Trim entries in-place.
    id_list[:] = [s.strip(' \t\n\r') for s in id_list]

    # Create output directory if missing.
    makedirs_safe(args.dir_out)

    # Start silence removal.
    silence_remover = SilenceRemover(args.min_silence_ms)
    silence_remover.process_list(id_list, args.dir_wav, args.dir_out,
                                 args.format, args.silence_threshold_db,
                                 args.chunk_size_ms)
Example #20
def main():
    """Create samples with artificial alpha for each phoneme."""
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = sys.argv[1]
    hparams.model_name = "WarpingLayerTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    alpha_range = 0.2
    num_phonemes = 70

    num_random_alphas = 7
    # num_random_alphas = 53

    # Randomly pick alphas for each phoneme.
    np.random.seed(42)
    # phonemes_to_alpha_tensor = ((np.random.choice(np.random.rand(num_random_alphas), num_phonemes) - 0.5) * 2 * alpha_range)
    phonemes_to_alpha_tensor = ((np.random.rand(num_phonemes) - 0.5) * 2 *
                                alpha_range)

    # hparams.num_questions = 505
    hparams.num_questions = 609
    # hparams.num_questions = 425

    hparams.out_dir = os.path.join("experiments", hparams.voice,
                                   "WORLD_artificially_warped")
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "warping_layer_test"
    hparams.synth_dir = hparams.out_dir
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    print(
        "Create artificially warped MGCs for {} in {} for {} questions, {} random alphas, and an alpha range of {}."
        .format(hparams.voice, hparams.out_dir, hparams.num_questions,
                len(np.unique(phonemes_to_alpha_tensor)), alpha_range))

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer(
        os.path.join("experiments", hparams.voice, "WORLD"),
        os.path.join("experiments", hparams.voice, "questions"), "ignored",
        hparams.num_questions, hparams)

    hparams.num_speakers = 1
    speaker = "p276"
    num_synth_files = 5  # Number of files to synthesise to check warping manually.

    sp_mean = gen_in.norm_params[0][:hparams.num_coded_sps *
                                    (3 if hparams.add_deltas else 1)]
    sp_std_dev = gen_in.norm_params[1][:hparams.num_coded_sps *
                                       (3 if hparams.add_deltas else 1)]
    wl = WarpingLayer((hparams.num_coded_sps, ), (hparams.num_coded_sps, ),
                      hparams)
    wl.set_norm_params(sp_mean, sp_std_dev)

    def _question_to_phoneme_index(questions):
        """Helper function to convert questions to their current phoneme index."""
        if questions.shape[-1] == 505:  # German question set.
            indices = np.arange(86, 347, 5, dtype=int)
        elif questions.shape[-1] == 425:  # English radio question set.
            indices = np.arange(58, 107, dtype=int)
        elif questions.shape[-1] == 609:  # English unilex question set.
            indices = np.arange(92, 162, dtype=int)
        else:
            raise NotImplementedError(
                "Unknown question set with {} questions.".format(
                    questions.shape[-1]))
        return QuestionLabelGen.questions_to_phoneme_indices(
            questions, indices)

    # with open(os.path.join(hparams.data_dir, "file_id_list_{}_train.txt".format(hparams.voice))) as f:
    with open(
            os.path.join(hparams.data_dir, "file_id_list_{}_adapt.txt".format(
                hparams.voice))) as f:
        id_list = f.readlines()
    id_list[:] = [s.strip(' \t\n\r') for s in id_list
                  if speaker in s]  # Trim line endings in-place.

    out_dir = hparams.out_dir
    makedirs_safe(out_dir)
    makedirs_safe(os.path.join(out_dir,
                               "cmp_mgc" + str(hparams.num_coded_sps)))
    t_benchmark = 0
    org_to_warped_mcd = 0.0
    for idx, id_name in enumerate(id_list):

        sample = WorldFeatLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "WORLD"),
            add_deltas=True,
            num_coded_sps=hparams.num_coded_sps)
        sample_pre = gen_in.preprocess_sample(sample)
        coded_sps = sample_pre[:, :hparams.num_coded_sps *
                               (3 if hparams.add_deltas else 1)]

        questions = QuestionLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "questions"),
            num_questions=hparams.num_questions)
        questions = questions[:len(coded_sps)]
        phoneme_indices = _question_to_phoneme_index(questions)
        alpha_vec = phonemes_to_alpha_tensor[phoneme_indices %
                                             len(phonemes_to_alpha_tensor),
                                             None]

        coded_sps = coded_sps[:len(alpha_vec), None,
                              ...]  # Create a batch dimension.
        alpha_vec = alpha_vec[:, None,
                              None]  # Create a batch and feature dimension.

        t_start = timer()
        mfcc_warped, (_, nn_alpha) = wl(torch.from_numpy(coded_sps),
                                        None, (len(coded_sps), ),
                                        (len(coded_sps), ),
                                        alphas=torch.from_numpy(alpha_vec))
        t_benchmark += timer() - t_start
        sample_pre[:len(mfcc_warped), :hparams.num_coded_sps *
                   (3 if hparams.add_deltas else 1)] = mfcc_warped[:,
                                                                   0].detach()

        sample_post = gen_in.postprocess_sample(sample_pre)
        # Manually create samples without normalisation but with deltas.
        sample_pre = (sample_pre * gen_in.norm_params[1] +
                      gen_in.norm_params[0]).astype(np.float32)

        if np.isnan(sample_pre).any():
            raise ValueError(
                "Detected nan values in output features for {}.".format(
                    id_name))

        # Compute error between warped version and original one.
        org_to_warped_mcd += metrics.melcd(
            sample[:, 0:hparams.num_coded_sps],
            sample_pre[:, 0:hparams.num_coded_sps])

        # Save warped features.
        sample_pre.tofile(
            os.path.join(
                out_dir, "cmp_mgc" + str(hparams.num_coded_sps),
                os.path.basename(id_name + WorldFeatLabelGen.ext_deltas)))

        hparams.synth_dir = out_dir
        if idx < num_synth_files:  # Only synthesise a few samples.
            trainer.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} warpings: {}. MCD caused by warping: {:.2f}".
          format(len(id_list), timedelta(seconds=t_benchmark),
                 org_to_warped_mcd / len(id_list)))

    # Copy normalisation files which are necessary for training.
    for feature in ["_bap", "_lf0", "_mgc{}".format(hparams.num_coded_sps)]:
        shutil.copyfile(
            os.path.join(
                gen_in.dir_labels, gen_in.dir_deltas,
                MeanCovarianceExtractor.file_name_appendix + feature + ".bin"),
            os.path.join(
                out_dir, "cmp_mgc" + str(hparams.num_coded_sps),
                MeanCovarianceExtractor.file_name_appendix + feature + ".bin"))
Example #21
    def _get_test_dir(self):
        out_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               type(self).__name__)
        makedirs_safe(out_dir)
        return out_dir
Example #22
    def gen_data(dir_in,
                 dir_out=None,
                 file_id_list="",
                 id_list=None,
                 return_dict=False):
        """
        Prepare durations from HTK labels (forced-aligned).
        Each numpy array has the dimension num_phonemes x PhonemeDurationLabelGen.num_states (default num_state=5).

        :param dir_in:         Directory where the HTK label files are stored (usually named label_state_align).
        :param dir_out:        Main directory where the labels and normalisation parameters are saved to.
                               If None, labels are not saved.
        :param file_id_list:   Name of the file containing the ids. Normalisation parameters are saved using
                               this name to differentiate parameters between subsets.
        :param id_list:        The list of utterances to process.
                               Should have the form uttId1 \\n uttId2 \\n ...\\n uttIdN.
                               If None, all files in dir_in are used.
        :param return_dict:    If true, returns an OrderedDict of all samples as first output.
        :return:               Returns two normalisation parameters as tuple. If return_dict is True it returns
                               all processed labels in an OrderedDict followed by the two normalisation parameters.
        """

        # Fill file_id_list by .wav files in dir_in if not given and set an appropriate file_id_list_name.
        if id_list is None:
            id_list = list()
            filenames = glob.glob(os.path.join(dir_in, "*.wav"))
            for filename in filenames:
                id_list.append(os.path.splitext(os.path.basename(filename))[0])
            file_id_list_name = "all"
        else:
            file_id_list_name = os.path.splitext(
                os.path.basename(file_id_list))[0]
            id_list = [
                '{}'.format(os.path.basename(element)) for element in id_list
            ]  # Ignore full path.

        # Create directories in dir_out if it is given.
        if dir_out is not None:
            makedirs_safe(dir_out)

        # Create the return dictionary if required.
        if return_dict:
            label_dict = OrderedDict()

        # Create normalisation computation units.
        norm_params_ext_dur = MeanStdDevExtractor()

        logging.info("Extract phoneme durations for " +
                     "[{0}]".format(", ".join(str(i) for i in id_list)))
        for file_name in id_list:
            logging.debug("Extract phoneme durations from " + file_name)

            with open(
                    os.path.join(
                        dir_in,
                        file_name + PhonemeDurationLabelGen.ext_phonemes),
                    'r') as f:
                htk_labels = [line.rstrip('\n').split()[:2] for line in f]
                timings = np.array(
                    htk_labels, dtype=np.float32
                ) / PhonemeDurationLabelGen.min_phoneme_length
                dur = timings[:, 1] - timings[:, 0]
                dur = dur.reshape(
                    -1, PhonemeDurationLabelGen.num_states).astype(np.float32)

            if return_dict:
                label_dict[file_name] = dur
            if dir_out is not None:
                dur.tofile(
                    os.path.join(
                        dir_out,
                        file_name + PhonemeDurationLabelGen.ext_durations))

            # Add sample to normalisation computation unit.
            norm_params_ext_dur.add_sample(dur)

        # Save mean and std dev of all features.
        if dir_out is not None:
            norm_params_ext_dur.save(os.path.join(dir_out, file_id_list_name))

        # Get normalisation parameters.
        norm_first, norm_second = norm_params_ext_dur.get_params()

        if return_dict:
            # Return dict of labels for all utterances.
            return label_dict, norm_first, norm_second
        else:
            return norm_first, norm_second
Example #23
    def setUpClass(cls):
        cls.out_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   type(cls()).__name__)
        makedirs_safe(cls.out_dir)  # Create class name directory.
Example #24
    def run_world_synth(synth_output, hparams):
        """Run the WORLD synthesize method."""

        fft_size = pyworld.get_cheaptrick_fft_size(hparams.synth_fs)

        save_dir = (hparams.synth_dir if hparams.synth_dir is not None
                    else hparams.out_dir if hparams.out_dir is not None
                    else os.path.curdir)
        for id_name, output in synth_output.items():
            logging.info(
                "Synthesise {} with the WORLD vocoder.".format(id_name))

            coded_sp, lf0, vuv, bap = WorldFeatLabelGen.convert_to_world_features(
                output,
                contains_deltas=False,
                num_coded_sps=hparams.num_coded_sps)
            amp_sp = WorldFeatLabelGen.decode_sp(
                coded_sp,
                hparams.sp_type,
                hparams.synth_fs,
                post_filtering=hparams.do_post_filtering).astype(np.double,
                                                                 copy=False)
            args = dict()
            for attr in "preemphasize", "f0_silence_threshold", "lf0_zero":
                if hasattr(hparams, attr):
                    args[attr] = getattr(hparams, attr)
            waveform = WorldFeatLabelGen.world_features_to_raw(
                amp_sp,
                lf0,
                vuv,
                bap,
                fs=hparams.synth_fs,
                n_fft=fft_size,
                **args)

            # f0 = np.exp(lf0, dtype=np.float64)
            # vuv[f0 < WorldFeatLabelGen.f0_silence_threshold] = 0  # WORLD throws an error for too small f0 values.
            # f0[vuv == 0] = 0.0
            # ap = pyworld.decode_aperiodicity(np.ascontiguousarray(bap.reshape(-1, 1), np.float64),
            #                                  hparams.synth_fs,
            #                                  fft_size)
            #
            # waveform = pyworld.synthesize(f0, amp_sp, ap, hparams.synth_fs)
            # waveform = waveform.astype(np.float32, copy=False)  # Does inplace conversion, if possible.

            # Always save as wav file first and convert afterwards if necessary.
            file_path = os.path.join(
                save_dir, "{}{}{}{}".format(
                    os.path.basename(id_name), "_" + hparams.model_name
                    if hparams.model_name is not None else "",
                    hparams.synth_file_suffix, "_WORLD"))
            makedirs_safe(save_dir)
            soundfile.write(file_path + ".wav", waveform, hparams.synth_fs)

            # Use PyDub for special audio formats.
            if hparams.synth_ext.lower() != 'wav':
                as_wave = pydub.AudioSegment.from_wav(file_path + ".wav")
                file = as_wave.export(file_path + "." + hparams.synth_ext,
                                      format=hparams.synth_ext)
                file.close()
                os.remove(file_path + ".wav")
Example #25
    def run_DM_AM(hparams, input_strings):
        """
        A function for TTS with a pre-trained duration and acoustic model.

        :param hparams:            Hyper-parameter container. The following parameters are used:
                                   front_end:                    Full path to the makeLabels.sh script in scripts/tts_frontend, depends on the language.
                                   festival_dir:                 Full path to the directory with the festival bin/ folder.
                                   front_end_accent (optional):  Give an accent to the front_end, used in tts_frontend.
                                   duration_labels_dir:          Full path to the folder containing the normalisation parameters used to train the duration model.
                                   file_symbol_dict:             A file containing all the used phonemes (has been used to train the duration model, usually mono_phone.list).
                                   duration_model:               Full path to the pre-trained duration model.
                                   num_phoneme_states:           Number of states per phoneme, for each a duration is predicted by the duration model.
                                   question_file:               Full path to question file used to train the acoustic model.
                                   question_labels_norm_file:    Full path to normalisation file of questions used to train the acoustic model.
                                   num_questions:                Number of questions which form the input dimension to the acoustic model.
                                   acoustic_model:               Full path to acoustic model.
        :param input_strings:
        :return:
        """
        # Create a temporary directory to store all files.
        with tempfile.TemporaryDirectory() as tmp_dir_name:
            # tmp_dir_name = os.path.realpath("TMP")
            # makedirs_safe(tmp_dir_name)
            hparams.out_dir = tmp_dir_name
            print("Created temporary directory", tmp_dir_name)
            id_list = ["synth" + str(idx) for idx in range(len(input_strings))]

            # Write the text to synthesise into a single synth.txt file with ids.
            utts_file = os.path.join(tmp_dir_name, "synth.txt")
            with open(utts_file, "w") as text_file:
                for idx, text in enumerate(input_strings):
                    text_file.write("synth{}\t{}\n".format(
                        idx, text))  # TODO: Remove parenthesis etc.

            # Call the front end on the synth.txt file.
            front_end_arguments = [
                hparams.front_end, hparams.festival_dir, utts_file
            ]
            if hasattr(hparams, "front_end_accent"
                       ) and hparams.front_end_accent is not None:
                front_end_arguments.append(hparams.front_end_accent)
            front_end_arguments.append(tmp_dir_name)
            subprocess.check_call(front_end_arguments)

            # Remove durations from mono labels.
            dir_mono_no_align = os.path.join(tmp_dir_name, "mono_no_align")
            dir_mono = os.path.join(tmp_dir_name, "labels", "mono")

            if os.path.isdir(dir_mono_no_align):
                shutil.rmtree(dir_mono_no_align)
            os.rename(dir_mono, dir_mono_no_align)
            for id_name in id_list:
                with open(os.path.join(dir_mono_no_align, id_name + ".lab"),
                          "r") as f:
                    old = f.read()
                    monophones = old.split()[2::3]
                with open(os.path.join(dir_mono_no_align, id_name + ".lab"),
                          "w") as f:
                    f.write("\n".join(monophones))

            # Run duration model.
            hparams.batch_size_test = len(input_strings)
            hparams.test_set_perc = 0.0
            hparams.val_set_perc = 0.0
            hparams.phoneme_label_type = "mono_no_align"
            hparams.output_norm_params_file_prefix = hparams.duration_norm_file_name if hasattr(
                hparams, "duration_norm_file_name") else None
            duration_model_trainer = DurationModelTrainer(
                os.path.join(tmp_dir_name, "mono_no_align"),
                hparams.duration_labels_dir, id_list,
                hparams.file_symbol_dict, hparams)
            assert hparams.duration_model is not None, "Path to duration model in hparams.duration_model is needed."
            hparams.model_path = hparams.duration_model
            hparams.model_name = os.path.basename(hparams.duration_model)

            # Predict durations. Durations are already converted to multiples of hparams.min_phoneme_length.
            hparams.load_from_checkpoint = True
            duration_model_trainer.init(hparams)
            _, output_dict_post = duration_model_trainer.forward(
                hparams, id_list)
            hparams.output_norm_params_file_prefix = None  # Reset again.

            # Write duration to full labels.
            dir_full = os.path.join(tmp_dir_name, "labels", "full")
            dir_label_state_align = os.path.join(tmp_dir_name, "labels",
                                                 "label_state_align")
            makedirs_safe(dir_label_state_align)
            for id_name in id_list:
                with open(os.path.join(dir_full, id_name + ".lab"), "r") as f:
                    full = f.read().split()[2::3]
                with open(
                        os.path.join(dir_label_state_align, id_name + ".lab"),
                        "w") as f:
                    current_time = 0
                    timings = output_dict_post[id_name]
                    for idx, monophone in enumerate(full):
                        for state in range(hparams.num_phoneme_states):
                            next_time = current_time + int(timings[idx, state])
                            f.write("{}\t{}\t{}[{}]\n".format(
                                current_time, next_time, monophone, state + 2))
                            current_time = next_time

            # Generate questions from HTK full labels.
            QuestionLabelGen.gen_data(dir_label_state_align,
                                      hparams.question_file,
                                      dir_out=tmp_dir_name,
                                      file_id_list="synth",
                                      id_list=id_list,
                                      return_dict=False)

            # Run acoustic model and synthesise.
            # Copy the question normalisation parameters into the same directory.
            shutil.copy2(hparams.question_labels_norm_file,
                         os.path.join(tmp_dir_name, "min-max.bin"))
            acoustic_model_trainer = AcousticModelTrainer(
                hparams.world_features_dir, tmp_dir_name, id_list,
                hparams.num_questions, hparams)
            assert hparams.acoustic_model is not None, "Path to acoustic model in hparams.acoustic_model is needed."
            hparams.model_path = hparams.acoustic_model
            hparams.model_name = os.path.basename(hparams.acoustic_model)
            hparams.load_from_checkpoint = True
            acoustic_model_trainer.init(hparams)
            hparams.model_name = ""  # No suffix in synthesised files.
            _, output_dict_post = acoustic_model_trainer.synth(
                hparams, id_list)

            logging.info("Synthesized files are in {}.".format(
                hparams.synth_dir))

        return 0
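A hedged usage sketch for run_DM_AM; create_hparams() follows the factory pattern of the other examples, and every path below is a placeholder that must point to an actual tts_frontend script, festival installation, and pre-trained models:

# Hypothetical call; all values below are placeholders, not shipped defaults.
hparams = ModelTrainer.create_hparams()
hparams.front_end = "scripts/tts_frontend/makeLabels.sh"
hparams.festival_dir = "/opt/festival"
hparams.duration_labels_dir = "experiments/English/durations"
hparams.file_symbol_dict = "experiments/English/mono_phone.list"
hparams.duration_model = "experiments/English/DurationModel/nn/duration.nn"
hparams.num_phoneme_states = 5
hparams.question_file = "questions-en-radio_dnn_400.hed"
hparams.question_labels_norm_file = "experiments/English/questions/min-max.bin"
hparams.num_questions = 425
hparams.acoustic_model = "experiments/English/AcousticModel/nn/acoustic.nn"
hparams.world_features_dir = "experiments/English/WORLD"
run_DM_AM(hparams, ["Hello world.", "This is a test."])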
Example #26
dir_questions = "questions"
dir_world = os.path.realpath("WORLD")
thetas = np.arange(0.03, 0.155, 0.03)
dir_atoms = "wcad-" + "_".join(map("{:.3f}".format, thetas))

if extract_features:
    # Generate labels.
    # # shutil.rmtree(dir_labels)
    # makedirs_safe(dir_labels)
    logging.warning("Label files are not recreated.")
    # TODO: Possible implementation at TTSModel.run_DM_AM().

    # Generate durations
    logging.info("Create duration files.")
    shutil.rmtree(dir_dur)
    makedirs_safe(dir_dur)
    PhonemeDurationLabelGen.gen_data(dir_labels, dir_dur, id_list=id_list)

    # Generate questions.
    logging.info("Create question files.")
    shutil.rmtree(dir_questions)
    makedirs_safe(dir_questions)
    QuestionLabelGen.gen_data(dir_labels,
                              "questions-en-radio_dnn_400.hed",
                              dir_questions,
                              id_list=id_list)

    # Generate WORLD features.
    logging.info("Create WORLD files.")
    shutil.rmtree(dir_world)
    makedirs_safe(dir_world)
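Example #26 is truncated before the actual WORLD extraction call. A sketch of the presumable continuation, assuming WorldFeatLabelGen (constructed as in Example #29) offers a gen_data interface analogous to the LF0 generator in Example #30; dir_audio is a hypothetical name for the directory holding the .wav files:

    # Hypothetical continuation, mirroring the duration and question steps above;
    # dir_audio and the gen_data signature are assumptions.
    generator = WorldFeatLabelGen(dir_world, add_deltas=True, num_coded_sps=30)
    generator.gen_data(dir_audio, dir_out=dir_world, id_list=id_list)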
Example #27
    @classmethod
    def setUpClass(cls):
        hparams = cls._get_hparams(cls())
        makedirs_safe(hparams.out_dir)  # Create class name directory.
        # Load test data.
        cls.id_list = cls._get_id_list()
Example #28
def main():
    logging.basicConfig(level=logging.INFO)

    hparams = VTLNTrainer.create_hparams()  # TODO: Parse input for hparams.

    # General parameters.
    hparams.num_questions = 609
    hparams.voice = "English"
    hparams.work_dir = os.path.realpath(
        os.path.join("experiments", hparams.voice))
    hparams.data_dir = os.path.realpath("database")
    hparams.out_dir = os.path.join(hparams.work_dir, "VTLNModel")
    hparams.num_speakers = 33
    hparams.speaker_emb_dim = 128
    hparams.frame_size_ms = 5
    hparams.seed = 1234
    hparams.num_coded_sps = 30
    hparams.add_deltas = True

    # Training parameters.
    hparams.epochs = 15
    hparams.use_gpu = True
    hparams.train_pre_net = True
    hparams.dropout = 0.05
    hparams.batch_size_train = 2
    hparams.batch_size_val = hparams.batch_size_train
    hparams.batch_size_benchmark = hparams.batch_size_train
    hparams.grad_clip_norm_type = 2
    hparams.grad_clip_max_norm = 1.0
    hparams.use_saved_learning_rate = False
    hparams.optimiser_args["lr"] = 0.001
    hparams.optimiser_type = "Adam"
    hparams.scheduler_type = "Plateau"
    hparams.scheduler_args["patience"] = 5
    hparams.start_with_test = True
    hparams.epochs_per_checkpoint = 5
    hparams.save_final_model = True
    hparams.use_best_as_final_model = True

    # hparams.model_type = None
    hparams.model_type = "VTLN"
    hparams.model_name = "VTLN-emb_all.nn"
    hparams.pre_net_model_name = "Bds-emb_all-dropout05-lr001.nn"
    hparams.pass_embs_to_pre_net = True
    hparams.f_get_emb_index = (vctk_utils.id_name_to_speaker_English, )

    # Training.
    makedirs_safe(os.path.join(hparams.out_dir, "nn"))
    source_model_path = os.path.join(hparams.work_dir, "BaselineModel", "nn",
                                     hparams.pre_net_model_name)
    target_model_path = os.path.join(hparams.out_dir, "nn",
                                     hparams.pre_net_model_name)
    logging.info("Copy {} to {}.".format(source_model_path, target_model_path))
    shutil.copyfile(source_model_path, target_model_path)

    trainer = VTLNTrainer(hparams)
    trainer.init(hparams)
    trainer.train(hparams)
    trainer.benchmark(hparams)

    # hparams.synth_gen_figure = False
    hparams.synth_vocoder = "WORLD"

    synth_list = dict()
    synth_list["train"] = ["p225/p225_010", "p226/p226_010", "p239/p239_010"]
    synth_list["val"] = ["p225/p225_051", "p226/p226_009", "p239/p239_066"]
    synth_list["test"] = ["p225/p225_033", "p226/p226_175", "p239/p239_056"]

    # with open(os.path.join(hparams.data_dir, "file_id_list_English_listening_test.txt" + sys.argv[1])) as f:
    #     id_list_val = f.readlines()
    # synth_list["val"] = [s.strip(' \t\n\r') for s in id_list_val]  # Trim line endings in-place.

    for key, value in synth_list.items():
        hparams.synth_file_suffix = "_" + str(key)
        trainer.synth(hparams, value)
Example #29
def main():
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = "English"
    hparams.model_name = "AllPassWarpModelTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    # hparams.num_questions = 505
    hparams.num_questions = 425
    hparams.out_dir = os.path.join("experiments", hparams.voice,
                                   "VTLNArtificiallyWarped")
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "all_pass_warp_test"
    hparams.synth_dir = hparams.out_dir
    batch_size = 2
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    # hparams.add_hparam("warp_matrix_size", hparams.num_coded_sps)
    hparams.alpha_ranges = [
        0.2,
    ]

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps,
                               num_bap=hparams.num_bap)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer(
        "experiments/" + hparams.voice + "/WORLD",
        "experiments/" + hparams.voice + "/questions", "ignored",
        hparams.num_questions, hparams)

    sp_mean = gen_in.norm_params[0][:hparams.num_coded_sps *
                                    (3 if hparams.add_deltas else 1)]
    sp_std_dev = gen_in.norm_params[1][:hparams.num_coded_sps *
                                       (3 if hparams.add_deltas else 1)]
    # AllPassWarpModel is used below but never imported in this snippet; the import
    # path is an assumption, following the inline-import style of this example.
    from idiaptts.src.neural_networks.pytorch.models.AllPassWarpModel import AllPassWarpModel
    all_pass_warp_model = AllPassWarpModel((hparams.num_coded_sps, ),
                                           (hparams.num_coded_sps, ), hparams)
    all_pass_warp_model.set_norm_params(sp_mean, sp_std_dev)

    # id_list = ["dorian/doriangray_16_00199"]
    # id_list = ["p225/p225_051", "p277/p277_012", "p278/p278_012", "p279/p279_012"]
    id_list = ["p225/p225_051"]

    t_benchmark = 0
    for id_name in id_list:
        sample = WorldFeatLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "WORLD"),
            add_deltas=True,
            num_coded_sps=hparams.num_coded_sps,
            num_bap=hparams.num_bap,
            sp_type=hparams.sp_type)
        sample_pre = gen_in.preprocess_sample(sample)
        coded_sps = sample_pre[:, :hparams.num_coded_sps *
                               (3 if hparams.add_deltas else 1)].copy()
        coded_sps = coded_sps[:, None,
                              ...].repeat(batch_size,
                                          1)  # Copy data in batch dimension.

        for idx, alpha in enumerate(np.arange(-0.2, 0.21, 0.05)):
            out_dir = os.path.join(hparams.out_dir,
                                   "alpha_{0:0.2f}".format(alpha))
            makedirs_safe(out_dir)

            alpha_vec = np.ones((coded_sps.shape[0], 1)) * alpha
            alpha_vec = alpha_vec[:, None].repeat(
                batch_size, 1)  # Copy data in batch dimension.

            t_start = timer()
            sp_warped, (_, nn_alpha) = all_pass_warp_model(
                torch.from_numpy(coded_sps.copy()),
                None, (len(coded_sps), ), (len(coded_sps), ),
                alphas=torch.tensor(alpha_vec, requires_grad=True))
            sp_warped.sum().backward()
            t_benchmark += timer() - t_start
            # assert((mfcc_warped[:, 0] == mfcc_warped[:, 1]).all())  # Compare results for cloned coded_sps within batch.
            if np.isclose(alpha, 0):
                assert np.isclose(
                    sp_warped.detach().cpu().numpy(),
                    coded_sps).all()  # Compare no warping results.
            sample_pre[:len(sp_warped), :hparams.num_coded_sps * (
                3 if hparams.add_deltas else 1)] = sp_warped[:, 0].detach()

            sample_post = gen_in.postprocess_sample(sample_pre,
                                                    apply_mlpg=False)
            # Manually create samples without normalisation but with deltas.
            sample_pre_with_deltas = (sample_pre * gen_in.norm_params[1] +
                                      gen_in.norm_params[0]).astype(np.float32)

            if np.isnan(sample_pre_with_deltas).any():
                raise ValueError(
                    "Detected nan values in output features for {}.".format(
                        id_name))
            # Save warped features.
            makedirs_safe(os.path.dirname(os.path.join(out_dir, id_name)))
            sample_pre_with_deltas.tofile(
                os.path.join(out_dir,
                             id_name + "." + WorldFeatLabelGen.ext_deltas))

            hparams.synth_dir = out_dir
            # sample_no_deltas = WorldFeatLabelGen.convert_from_world_features(*WorldFeatLabelGen.convert_to_world_features(sample, contains_deltas=hparams.add_deltas, num_coded_sps=hparams.num_coded_sps, num_bap=hparams.num_bap))
            Synthesiser.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} runs: {}, average: {}".format(
        len(id_list) * idx, timedelta(seconds=t_benchmark),
        timedelta(seconds=t_benchmark) / (len(id_list) * idx)))
Example #30
    def gen_data(self, dir_in, dir_out=None, file_id_list="", id_list=None, add_deltas=False, return_dict=False):
        """
        Prepare LF0 and V/UV features from audio files. If add_delta is false each numpy array has the dimension
        num_frames x 2 [f0, vuv], otherwise the deltas and double deltas are added between
        the features resulting in num_frames x 4 [lf0(3*1), vuv].

        :param dir_in:         Directory where the .wav files are stored for each utterance to process.
        :param dir_out:        Main directory where the labels and normalisation parameters are saved to subdirectories.
                               If None, labels are not saved.
        :param file_id_list:   Name of the file containing the ids. Normalisation parameters are saved using
                               this name to differentiate parameters between subsets.
        :param id_list:        The list of utterances to process.
                               Should have the form uttId1 \\n uttId2 \\n ...\\n uttIdN.
                               If None, all file in audio_dir are used.
        :param add_deltas:     Add deltas and double deltas to all features except vuv.
        :param return_dict:    If true, returns an OrderedDict of all samples as first output.
        :return:               Returns two normalisation parameters as tuple. If return_dict is True it returns
                               all processed labels in an OrderedDict followed by the two normalisation parameters.
        """

        # Fill file_id_list by .wav files in dir_in if not given and set an appropriate file_id_list_name.
        if id_list is None:
            id_list = list()
            filenames = glob.glob(os.path.join(dir_in, "*.wav"))
            for filename in filenames:
                id_list.append(os.path.splitext(os.path.basename(filename))[0])
            file_id_list_name = "all"
        else:
            file_id_list_name = os.path.splitext(os.path.basename(file_id_list))[0]

        # Create directories in dir_out if it is given.
        if dir_out is not None:
            if add_deltas:
                makedirs_safe(os.path.join(dir_out, LF0LabelGen.dir_deltas))
            else:
                makedirs_safe(os.path.join(dir_out, LF0LabelGen.dir_lf0))
                makedirs_safe(os.path.join(dir_out, LF0LabelGen.dir_vuv))

        # Create the return dictionary if required.
        if return_dict:
            label_dict = OrderedDict()

        # Create normalisation computation units.
        norm_params_ext_lf0 = MeanStdDevExtractor()
        # norm_params_ext_vuv = MeanStdDevExtractor()
        norm_params_ext_deltas = MeanStdDevExtractor()

        logging.info("Extract WORLD LF0 features for " + "[{0}]".format(", ".join(str(i) for i in id_list)))
        for file_name in id_list:
            logging.debug("Extract WORLD LF0 features from " + file_name)

            # Load audio file and extract features.
            audio_name = os.path.join(dir_in, file_name + ".wav")
            raw, fs = soundfile.read(audio_name)
            _f0, t = pyworld.dio(raw, fs)  # Raw pitch extraction. TODO: Use magphase here?
            f0 = pyworld.stonemask(raw, _f0, t, fs)  # Pitch refinement.

            # Compute lf0 and vuv information.
            lf0 = np.log(f0, dtype=np.float32)
            lf0[lf0 <= math.log(LF0LabelGen.f0_silence_threshold)] = LF0LabelGen.lf0_zero
            lf0, vuv = interpolate_lin(lf0)

            if add_deltas:
                # Compute the deltas and double deltas for all features.
                lf0_deltas, lf0_double_deltas = compute_deltas(lf0)

                # Combine them to a single feature sample.
                labels = np.concatenate((lf0, lf0_deltas, lf0_double_deltas, vuv), axis=1)

                # Save into return dictionary and/or file.
                if return_dict:
                    label_dict[file_name] = labels
                if dir_out is not None:
                    labels.tofile(os.path.join(dir_out, LF0LabelGen.dir_deltas, file_name + LF0LabelGen.ext_deltas))

                # Add sample to normalisation computation unit.
                norm_params_ext_deltas.add_sample(labels)
            else:
                # Save into return dictionary and/or file.
                if return_dict:
                    label_dict[file_name] = np.concatenate((lf0, vuv), axis=1)
                if dir_out is not None:
                    lf0.tofile(os.path.join(dir_out, LF0LabelGen.dir_lf0, file_name + LF0LabelGen.ext_lf0))
                    vuv.astype(np.float32).tofile(os.path.join(dir_out, LF0LabelGen.dir_vuv, file_name + LF0LabelGen.ext_vuv))

                # Add sample to normalisation computation unit.
                norm_params_ext_lf0.add_sample(lf0)
                # norm_params_ext_vuv.add_sample(vuv)

        # Manually set the vuv normalisation parameters (mean 0, variance 1).
        if add_deltas:
            norm_params_ext_deltas.sum_frames[-1] = 0.0  # Mean = 0.0
            norm_params_ext_deltas.sum_squared_frames[-1] = norm_params_ext_deltas.sum_length  # Variance = 1.0

        # Save mean and std dev of all features; skip saving when no output directory is given.
        if dir_out is not None:
            if add_deltas:
                norm_params_ext_deltas.save(os.path.join(dir_out, LF0LabelGen.dir_deltas, file_id_list_name))
            else:
                norm_params_ext_lf0.save(os.path.join(dir_out, LF0LabelGen.dir_lf0, file_id_list_name))
                # norm_params_ext_vuv.save(os.path.join(dir_out, LF0LabelGen.dir_vuv, file_id_list_name))

        # Get normalisation parameters.
        if not add_deltas:
            norm_lf0 = norm_params_ext_lf0.get_params()
            # norm_vuv = norm_params_ext_vuv.get_params()

            norm_first = np.concatenate((norm_lf0[0], (0.0,)), axis=0)
            norm_second = np.concatenate((norm_lf0[1], (1.0,)), axis=0)
        else:
            norm_first, norm_second = norm_params_ext_deltas.get_params()

        if return_dict:
            # Return dict of labels for all utterances.
            return label_dict, norm_first, norm_second
        else:
            return norm_first, norm_second
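A hedged usage sketch of this generator; the directory names are placeholders and the no-argument constructor is an assumption:

# Hypothetical usage; "wavs" is a placeholder directory containing .wav files,
# and the no-argument constructor is an assumption.
generator = LF0LabelGen()
label_dict, mean, std_dev = generator.gen_data("wavs",
                                               dir_out="lf0_labels",
                                               return_dict=True)
first_utt = next(iter(label_dict.values()))
assert first_utt.shape[1] == 2  # num_frames x 2 [lf0, vuv], as per the docstring.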