コード例 #1
0
def main():
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = "English"
    hparams.model_name = "WarpingLayerTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    # hparams.num_questions = 505
    hparams.num_questions = 425
    hparams.out_dir = "experiments/" + hparams.voice + "/VTLNArtificiallyWarped/"
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "warping_layer_test"
    hparams.synth_dir = hparams.out_dir
    batch_size = 2
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer(
        "experiments/" + hparams.voice + "/WORLD",
        "experiments/" + hparams.voice + "/questions", "ignored",
        hparams.num_questions, hparams)

    sp_mean = gen_in.norm_params[0][:hparams.num_coded_sps *
                                    (3 if hparams.add_deltas else 1)]
    sp_std_dev = gen_in.norm_params[1][:hparams.num_coded_sps *
                                       (3 if hparams.add_deltas else 1)]
    wl = WarpingLayer((hparams.num_coded_sps, ), (hparams.num_coded_sps, ),
                      hparams)
    wl.set_norm_params(sp_mean, sp_std_dev)

    # id_list = ["dorian/doriangray_16_00199"]
    id_list = ["p225/p225_051"]
    hparams.num_speakers = 1

    t_benchmark = 0
    for id_name in id_list:
        for idx, alpha in enumerate(np.arange(-0.15, 0.2, 0.05)):
            out_dir = hparams.out_dir + "alpha_{0:0.2f}/".format(alpha)
            makedirs_safe(out_dir)

            sample = WorldFeatLabelGen.load_sample(
                id_name,
                os.path.join("experiments", hparams.voice, "WORLD"),
                add_deltas=True,
                num_coded_sps=hparams.num_coded_sps)
            sample_pre = gen_in.preprocess_sample(sample)
            coded_sps = sample_pre[:, :hparams.num_coded_sps *
                                   (3 if hparams.add_deltas else 1)]

            alpha_vec = np.ones((coded_sps.shape[0], 1)) * alpha

            coded_sps = coded_sps[:len(alpha_vec), None, ...].repeat(
                batch_size, 1)  # Copy data in batch dimension.
            alpha_vec = alpha_vec[:, None, None].repeat(
                batch_size, 1)  # Copy data in batch dimension.

            t_start = timer()
            mfcc_warped, (_, nn_alpha) = wl(torch.from_numpy(coded_sps),
                                            None, (len(coded_sps), ),
                                            (len(coded_sps), ),
                                            alphas=torch.from_numpy(alpha_vec))
            mfcc_warped.sum().backward()
            t_benchmark += timer() - t_start
            assert ((mfcc_warped[:, 0] == mfcc_warped[:, 1]).all()
                    )  # Compare results for cloned coded_sps within batch.
            if alpha == 0:
                assert ((mfcc_warped == coded_sps).all()
                        )  # Compare results for no warping.
            sample_pre[:len(mfcc_warped), :hparams.num_coded_sps * (
                3 if hparams.add_deltas else 1)] = mfcc_warped[:, 0].detach()

            sample_post = gen_in.postprocess_sample(sample_pre)
            # Manually create samples without normalisation but with deltas.
            sample_pre = (sample_pre * gen_in.norm_params[1] +
                          gen_in.norm_params[0]).astype(np.float32)

            if np.isnan(sample_pre).any():
                raise ValueError(
                    "Detected nan values in output features for {}.".format(
                        id_name))
            # Save warped features.
            makedirs_safe(os.path.dirname(os.path.join(out_dir, id_name)))
            sample_pre.tofile(
                os.path.join(out_dir, id_name + WorldFeatLabelGen.ext_deltas))

            hparams.synth_dir = out_dir
            Synthesiser.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} runs: {}".format(
        len(id_list) * idx, timedelta(seconds=t_benchmark)))
コード例 #2
0
class AcousticModelTrainer(ModelTrainer):
    """
    Implementation of a ModelTrainer for the generation of acoustic data.

    Use question labels as input and WORLD features w/o deltas/double deltas (specified in hparams.add_deltas) as output.
    Synthesize audio from model output with MLPG smoothing.
    """
    logger = logging.getLogger(__name__)

    #########################
    # Default constructor
    #
    def __init__(self,
                 dir_world_features,
                 dir_question_labels,
                 id_list,
                 num_questions,
                 hparams=None):
        """Default constructor.

        :param dir_world_features:      Path to the directory containing the world features.
        :param dir_question_labels:     Path to the directory containing the question labels.
        :param id_list:                 List of ids, can contain a speaker directory.
        :param num_questions:           Number of questions in question file.
        :param hparams:                 Set of hyper parameters.
        """
        if hparams is None:
            hparams = self.create_hparams()
            hparams.out_dir = os.path.curdir

        # Write missing default parameters.
        if hparams.variable_sequence_length_train is None:
            hparams.variable_sequence_length_train = hparams.batch_size_train > 1
        if hparams.variable_sequence_length_test is None:
            hparams.variable_sequence_length_test = hparams.batch_size_test > 1
        if hparams.synth_dir is None:
            hparams.synth_dir = os.path.join(hparams.out_dir, "synth")

        super(AcousticModelTrainer, self).__init__(id_list, hparams)

        self.InputGen = QuestionLabelGen(dir_question_labels, num_questions)
        self.InputGen.get_normalisation_params(
            dir_question_labels, hparams.input_norm_params_file_prefix)

        self.OutputGen = WorldFeatLabelGen(dir_world_features,
                                           add_deltas=hparams.add_deltas,
                                           num_coded_sps=hparams.num_coded_sps,
                                           sp_type=hparams.sp_type)
        self.OutputGen.get_normalisation_params(
            dir_world_features, hparams.output_norm_params_file_prefix)

        self.dataset_train = LabelGensDataset(self.id_list_train,
                                              self.InputGen,
                                              self.OutputGen,
                                              hparams,
                                              match_lengths=True)
        self.dataset_val = LabelGensDataset(self.id_list_val,
                                            self.InputGen,
                                            self.OutputGen,
                                            hparams,
                                            match_lengths=True)

        if self.loss_function is None:
            self.loss_function = torch.nn.MSELoss(reduction='none')

        if hparams.scheduler_type == "default":
            hparams.scheduler_type = "Plateau"
            hparams.add_hparams(plateau_verbose=True)

    @staticmethod
    def create_hparams(hparams_string=None, verbose=False):
        """Create model hyper parameter container. Parse non default from given string."""
        hparams = ModelTrainer.create_hparams(hparams_string, verbose=False)

        hparams.add_hparams(
            num_questions=None,
            question_file=None,  # Used to add labels in plot.
            num_coded_sps=60,
            sp_type="mcep",
            add_deltas=True,
            synth_load_org_sp=False,
            synth_load_org_lf0=False,
            synth_load_org_vuv=False,
            synth_load_org_bap=False)

        if verbose:
            logging.info(hparams.get_debug_string())

        return hparams

    def gen_figure_from_output(self, id_name, label, hidden, hparams):

        labels_post = self.OutputGen.postprocess_sample(label)
        coded_sp, lf0, vuv, bap = WorldFeatLabelGen.convert_to_world_features(
            labels_post,
            contains_deltas=False,
            num_coded_sps=hparams.num_coded_sps)
        lf0, _ = interpolate_lin(lf0)

        # Load original lf0.
        org_labels_post = WorldFeatLabelGen.load_sample(
            id_name,
            self.OutputGen.dir_labels,
            add_deltas=self.OutputGen.add_deltas,
            num_coded_sps=hparams.num_coded_sps)
        original_mgc, original_lf0, original_vuv, *_ = WorldFeatLabelGen.convert_to_world_features(
            org_labels_post,
            contains_deltas=self.OutputGen.add_deltas,
            num_coded_sps=hparams.num_coded_sps)
        original_lf0, _ = interpolate_lin(original_lf0)

        # Get a data plotter.
        grid_idx = 0
        plotter = DataPlotter()
        net_name = os.path.basename(hparams.model_name)
        filename = str(os.path.join(hparams.out_dir, id_name + '.' + net_name))
        plotter.set_title(id_name + ' - ' + net_name)
        plotter.set_num_colors(3)
        # plotter.set_lim(grid_idx=0, ymin=math.log(60), ymax=math.log(250))
        plotter.set_label(grid_idx=grid_idx,
                          xlabel='frames [' + str(hparams.frame_size_ms) +
                          ' ms]',
                          ylabel='log(f0)')

        graphs = list()
        graphs.append((original_lf0, 'Original lf0'))
        graphs.append((lf0, 'PyTorch lf0'))
        plotter.set_data_list(grid_idx=grid_idx, data_list=graphs)
        plotter.set_area_list(grid_idx=grid_idx,
                              area_list=[(np.invert(vuv.astype(bool)), '0.8',
                                          1.0),
                                         (np.invert(original_vuv.astype(bool)),
                                          'red', 0.2)])

        grid_idx += 1
        import librosa
        plotter.set_label(grid_idx=grid_idx,
                          xlabel='frames [' + str(hparams.frame_size_ms) +
                          ' ms]',
                          ylabel='Original spectrogram')
        plotter.set_specshow(grid_idx=grid_idx,
                             spec=librosa.amplitude_to_db(np.absolute(
                                 WorldFeatLabelGen.mcep_to_amp_sp(
                                     original_mgc, hparams.synth_fs)),
                                                          top_db=None))

        grid_idx += 1
        plotter.set_label(grid_idx=grid_idx,
                          xlabel='frames [' + str(hparams.frame_size_ms) +
                          ' ms]',
                          ylabel='NN spectrogram')
        plotter.set_specshow(grid_idx=grid_idx,
                             spec=librosa.amplitude_to_db(np.absolute(
                                 WorldFeatLabelGen.mcep_to_amp_sp(
                                     coded_sp, hparams.synth_fs)),
                                                          top_db=None))

        if hasattr(hparams, "phoneme_indices") and hparams.phoneme_indices is not None \
           and hasattr(hparams, "question_file") and hparams.question_file is not None:
            questions = QuestionLabelGen.load_sample(
                id_name,
                "experiments/" + hparams.voice + "/questions/",
                num_questions=hparams.num_questions)[:len(lf0)]
            np_phonemes = QuestionLabelGen.questions_to_phonemes(
                questions, hparams.phoneme_indices, hparams.question_file)
            plotter.set_annotations(grid_idx, np_phonemes)

        plotter.gen_plot()
        plotter.save_to_file(filename + '.Org-PyTorch' +
                             hparams.gen_figure_ext)

    def compute_score(self, dict_outputs_post, dict_hiddens, hparams):

        # Get data for comparision.
        dict_original_post = dict()
        for id_name in dict_outputs_post.keys():
            dict_original_post[id_name] = WorldFeatLabelGen.load_sample(
                id_name,
                dir_out=self.OutputGen.dir_labels,
                add_deltas=True,
                num_coded_sps=hparams.num_coded_sps)

        f0_rmse = 0.0
        f0_rmse_max_id = "None"
        f0_rmse_max = 0.0
        all_rmse = []
        vuv_error_rate = 0.0
        vuv_error_max_id = "None"
        vuv_error_max = 0.0
        all_vuv = []
        mcd = 0.0
        mcd_max_id = "None"
        mcd_max = 0.0
        all_mcd = []
        bap_error = 0.0
        bap_error_max_id = "None"
        bap_error_max = 0.0
        all_bap_error = []

        for id_name, labels in dict_outputs_post.items():
            output_coded_sp, output_lf0, output_vuv, output_bap = self.OutputGen.convert_to_world_features(
                sample=labels,
                contains_deltas=False,
                num_coded_sps=hparams.num_coded_sps)
            output_vuv = output_vuv.astype(bool)

            # Get data for comparision.
            org_coded_sp, org_lf0, org_vuv, org_bap = self.OutputGen.convert_to_world_features(
                sample=dict_original_post[id_name],
                contains_deltas=self.OutputGen.add_deltas,
                num_coded_sps=hparams.num_coded_sps)

            # Compute f0 from lf0.
            org_f0 = np.exp(org_lf0.squeeze())[:len(
                output_lf0)]  # Fix minor negligible length mismatch.
            output_f0 = np.exp(output_lf0)

            # Compute MCD.
            org_coded_sp = org_coded_sp[:len(output_coded_sp)]
            current_mcd = metrics.melcd(
                output_coded_sp[:, 1:],
                org_coded_sp[:, 1:])  # TODO: Use aligned mcd.
            if current_mcd > mcd_max:
                mcd_max_id = id_name
                mcd_max = current_mcd
            mcd += current_mcd
            all_mcd.append(current_mcd)

            # Compute RMSE.
            f0_mse = (org_f0 - output_f0)**2
            current_f0_rmse = math.sqrt(
                (f0_mse * org_vuv[:len(output_lf0)]).sum() /
                org_vuv[:len(output_lf0)].sum())
            if current_f0_rmse != current_f0_rmse:
                logging.error(
                    "Computed NaN for F0 RMSE for {}.".format(id_name))
            else:
                if current_f0_rmse > f0_rmse_max:
                    f0_rmse_max_id = id_name
                    f0_rmse_max = current_f0_rmse
                f0_rmse += current_f0_rmse
                all_rmse.append(current_f0_rmse)

            # Compute error of VUV in percentage.
            num_errors = (org_vuv[:len(output_lf0)] != output_vuv)
            vuv_error_rate_tmp = float(num_errors.sum()) / len(output_lf0)
            if vuv_error_rate_tmp > vuv_error_max:
                vuv_error_max_id = id_name
                vuv_error_max = vuv_error_rate_tmp
            vuv_error_rate += vuv_error_rate_tmp
            all_vuv.append(vuv_error_rate_tmp)

            # Compute aperiodicity distortion.
            org_bap = org_bap[:len(output_bap)]
            if len(output_bap.shape) > 1 and output_bap.shape[1] > 1:
                current_bap_error = metrics.melcd(
                    output_bap, org_bap)  # TODO: Use aligned mcd?
            else:
                current_bap_error = math.sqrt(
                    ((org_bap - output_bap)**
                     2).mean()) * (10.0 / np.log(10) * np.sqrt(2.0))
            if current_bap_error > bap_error_max:
                bap_error_max_id = id_name
                bap_error_max = current_bap_error
            bap_error += current_bap_error
            all_bap_error.append(current_bap_error)

        f0_rmse /= len(dict_outputs_post)
        vuv_error_rate /= len(dict_outputs_post)
        mcd /= len(dict_original_post)
        bap_error /= len(dict_original_post)

        self.logger.info("Worst MCD: {} {:4.2f}dB".format(mcd_max_id, mcd_max))
        self.logger.info("Worst F0 RMSE: {} {:4.2f}Hz".format(
            f0_rmse_max_id, f0_rmse_max))
        self.logger.info("Worst VUV error: {} {:2.2f}%".format(
            vuv_error_max_id, vuv_error_max * 100))
        self.logger.info("Worst BAP error: {} {:4.2f}db".format(
            bap_error_max_id, bap_error_max))
        self.logger.info(
            "Benchmark score: MCD {:4.2f}dB, F0 RMSE {:4.2f}Hz, VUV {:2.2f}%, BAP error {:4.2f}db"
            .format(mcd, f0_rmse, vuv_error_rate * 100, bap_error))

        return mcd, f0_rmse, vuv_error_rate, bap_error

    def synthesize(self, id_list, synth_output, hparams):
        """
        Depending on hparams override the network output with the extracted features,
        then continue with normal synthesis pipeline.
        """

        if hparams.synth_load_org_sp\
                or hparams.synth_load_org_lf0\
                or hparams.synth_load_org_vuv\
                or hparams.synth_load_org_bap:
            for id_name in id_list:

                world_dir = hparams.world_dir if hasattr(hparams, "world_dir") and hparams.world_dir is not None\
                                              else os.path.join(self.OutputGen.dir_labels,
                                                                self.dir_extracted_acoustic_features)
                labels = WorldFeatLabelGen.load_sample(
                    id_name, world_dir, num_coded_sps=hparams.num_coded_sps)
                len_diff = len(labels) - len(synth_output[id_name])
                if len_diff > 0:
                    labels = WorldFeatLabelGen.trim_end_sample(labels,
                                                               int(len_diff /
                                                                   2),
                                                               reverse=True)
                    labels = WorldFeatLabelGen.trim_end_sample(
                        labels, len_diff - int(len_diff / 2))

                if hparams.synth_load_org_sp:
                    synth_output[
                        id_name][:len(labels), :self.OutputGen.
                                 num_coded_sps] = labels[:, :self.OutputGen.
                                                         num_coded_sps]

                if hparams.synth_load_org_lf0:
                    synth_output[id_name][:len(labels), -3] = labels[:, -3]

                if hparams.synth_load_org_vuv:
                    synth_output[id_name][:len(labels), -2] = labels[:, -2]

                if hparams.synth_load_org_bap:
                    synth_output[id_name][:len(labels), -1] = labels[:, -1]

        # Run the vocoder.
        ModelTrainer.synthesize(self, id_list, synth_output, hparams)
コード例 #3
0
def main():
    """Create samples with artificial alpha for each phoneme."""
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = sys.argv[1]
    hparams.model_name = "WarpingLayerTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    alpha_range = 0.2
    num_phonemes = 70

    num_random_alphas = 7
    # num_random_alphas = 53

    # Randomly pick alphas for each phoneme.
    np.random.seed(42)
    # phonemes_to_alpha_tensor = ((np.random.choice(np.random.rand(num_random_alphas), num_phonemes) - 0.5) * 2 * alpha_range)
    phonemes_to_alpha_tensor = ((np.random.rand(num_phonemes) - 0.5) * 2 *
                                alpha_range)

    # hparams.num_questions = 505
    hparams.num_questions = 609
    # hparams.num_questions = 425

    hparams.out_dir = os.path.join("experiments", hparams.voice,
                                   "WORLD_artificially_warped")
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "warping_layer_test"
    hparams.synth_dir = hparams.out_dir
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    print(
        "Create artificially warped MGCs for {} in {} for {} questions, {} random alphas, and an alpha range of {}."
        .format(hparams.voice, hparams.out_dir, hparams.num_questions,
                len(np.unique(phonemes_to_alpha_tensor)), alpha_range))

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer(
        os.path.join("experiments", hparams.voice, "WORLD"),
        os.path.join("experiments", hparams.voice, "questions"), "ignored",
        hparams.num_questions, hparams)

    hparams.num_speakers = 1
    speaker = "p276"
    num_synth_files = 5  # Number of files to synthesise to check warping manually.

    sp_mean = gen_in.norm_params[0][:hparams.num_coded_sps *
                                    (3 if hparams.add_deltas else 1)]
    sp_std_dev = gen_in.norm_params[1][:hparams.num_coded_sps *
                                       (3 if hparams.add_deltas else 1)]
    wl = WarpingLayer((hparams.num_coded_sps, ), (hparams.num_coded_sps, ),
                      hparams)
    wl.set_norm_params(sp_mean, sp_std_dev)

    def _question_to_phoneme_index(questions):
        """Helper function to convert questions to their current phoneme index."""
        if questions.shape[-1] == 505:  # German question set.
            indices = np.arange(86, 347, 5, dtype=np.int)
        elif questions.shape[-1] == 425:  # English radio question set.
            indices = np.arange(58, 107, dtype=np.int)
        elif questions.shape[-1] == 609:  # English unilex question set.
            indices = np.arange(92, 162, dtype=np.int)
        else:
            raise NotImplementedError(
                "Unknown question set with {} questions.".format(
                    questions.shape[-1]))
        return QuestionLabelGen.questions_to_phoneme_indices(
            questions, indices)

    # with open(os.path.join(hparams.data_dir, "file_id_list_{}_train.txt".format(hparams.voice))) as f:
    with open(
            os.path.join(hparams.data_dir, "file_id_list_{}_adapt.txt".format(
                hparams.voice))) as f:
        id_list = f.readlines()
    id_list[:] = [s.strip(' \t\n\r') for s in id_list
                  if speaker in s]  # Trim line endings in-place.

    out_dir = hparams.out_dir
    makedirs_safe(out_dir)
    makedirs_safe(os.path.join(out_dir,
                               "cmp_mgc" + str(hparams.num_coded_sps)))
    t_benchmark = 0
    org_to_warped_mcd = 0.0
    for idx, id_name in enumerate(id_list):

        sample = WorldFeatLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "WORLD"),
            add_deltas=True,
            num_coded_sps=hparams.num_coded_sps)
        sample_pre = gen_in.preprocess_sample(sample)
        coded_sps = sample_pre[:, :hparams.num_coded_sps *
                               (3 if hparams.add_deltas else 1)]

        questions = QuestionLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "questions"),
            num_questions=hparams.num_questions)
        questions = questions[:len(coded_sps)]
        phoneme_indices = _question_to_phoneme_index(questions)
        alpha_vec = phonemes_to_alpha_tensor[phoneme_indices %
                                             len(phonemes_to_alpha_tensor),
                                             None]

        coded_sps = coded_sps[:len(alpha_vec), None,
                              ...]  # Create a batch dimension.
        alpha_vec = alpha_vec[:, None,
                              None]  # Create a batch and feature dimension.

        t_start = timer()
        mfcc_warped, (_, nn_alpha) = wl(torch.from_numpy(coded_sps),
                                        None, (len(coded_sps), ),
                                        (len(coded_sps), ),
                                        alphas=torch.from_numpy(alpha_vec))
        t_benchmark += timer() - t_start
        sample_pre[:len(mfcc_warped), :hparams.num_coded_sps *
                   (3 if hparams.add_deltas else 1)] = mfcc_warped[:,
                                                                   0].detach()

        sample_post = gen_in.postprocess_sample(sample_pre)
        # Manually create samples without normalisation but with deltas.
        sample_pre = (sample_pre * gen_in.norm_params[1] +
                      gen_in.norm_params[0]).astype(np.float32)

        if np.isnan(sample_pre).any():
            raise ValueError(
                "Detected nan values in output features for {}.".format(
                    id_name))

        # Compute error between warped version and original one.
        org_to_warped_mcd += metrics.melcd(
            sample[:, 0:hparams.num_coded_sps],
            sample_pre[:, 0:hparams.num_coded_sps])

        # Save warped features.
        sample_pre.tofile(
            os.path.join(
                out_dir, "cmp_mgc" + str(hparams.num_coded_sps),
                os.path.basename(id_name + WorldFeatLabelGen.ext_deltas)))

        hparams.synth_dir = out_dir
        if idx < num_synth_files:  # Only synthesize a few of samples.
            trainer.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} warpings: {}. MCD caused by warping: {:.2f}".
          format(len(id_list), timedelta(seconds=t_benchmark),
                 org_to_warped_mcd / len(id_list)))

    # Copy normalisation files which are necessary for training.
    for feature in ["_bap", "_lf0", "_mgc{}".format(hparams.num_coded_sps)]:
        shutil.copyfile(
            os.path.join(
                gen_in.dir_labels, gen_in.dir_deltas,
                MeanCovarianceExtractor.file_name_appendix + feature + ".bin"),
            os.path.join(
                out_dir, "cmp_mgc" + str(hparams.num_coded_sps),
                MeanCovarianceExtractor.file_name_appendix + feature + ".bin"))
コード例 #4
0
def main():
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = "English"
    hparams.model_name = "AllPassWarpModelTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    # hparams.num_questions = 505
    hparams.num_questions = 425
    hparams.out_dir = os.path.join("experiments", hparams.voice,
                                   "VTLNArtificiallyWarped")
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "all_pass_warp_test"
    hparams.synth_dir = hparams.out_dir
    batch_size = 2
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    # hparams.add_hparam("warp_matrix_size", hparams.num_coded_sps)
    hparams.alpha_ranges = [
        0.2,
    ]

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps,
                               num_bap=hparams.num_bap)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer(
        "experiments/" + hparams.voice + "/WORLD",
        "experiments/" + hparams.voice + "/questions", "ignored",
        hparams.num_questions, hparams)

    sp_mean = gen_in.norm_params[0][:hparams.num_coded_sps *
                                    (3 if hparams.add_deltas else 1)]
    sp_std_dev = gen_in.norm_params[1][:hparams.num_coded_sps *
                                       (3 if hparams.add_deltas else 1)]
    all_pass_warp_model = AllPassWarpModel((hparams.num_coded_sps, ),
                                           (hparams.num_coded_sps, ), hparams)
    all_pass_warp_model.set_norm_params(sp_mean, sp_std_dev)

    # id_list = ["dorian/doriangray_16_00199"]
    # id_list = ["p225/p225_051", "p277/p277_012", "p278/p278_012", "p279/p279_012"]
    id_list = ["p225/p225_051"]

    t_benchmark = 0
    for id_name in id_list:
        sample = WorldFeatLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "WORLD"),
            add_deltas=True,
            num_coded_sps=hparams.num_coded_sps,
            num_bap=hparams.num_bap,
            sp_type=hparams.sp_type)
        sample_pre = gen_in.preprocess_sample(sample)
        coded_sps = sample_pre[:, :hparams.num_coded_sps *
                               (3 if hparams.add_deltas else 1)].copy()
        coded_sps = coded_sps[:, None,
                              ...].repeat(batch_size,
                                          1)  # Copy data in batch dimension.

        for idx, alpha in enumerate(np.arange(-0.2, 0.21, 0.05)):
            out_dir = os.path.join(hparams.out_dir,
                                   "alpha_{0:0.2f}".format(alpha))
            makedirs_safe(out_dir)

            alpha_vec = np.ones((coded_sps.shape[0], 1)) * alpha
            alpha_vec = alpha_vec[:, None].repeat(
                batch_size, 1)  # Copy data in batch dimension.

            t_start = timer()
            sp_warped, (_, nn_alpha) = all_pass_warp_model(
                torch.from_numpy(coded_sps.copy()),
                None, (len(coded_sps), ), (len(coded_sps), ),
                alphas=torch.tensor(alpha_vec, requires_grad=True))
            sp_warped.sum().backward()
            t_benchmark += timer() - t_start
            # assert((mfcc_warped[:, 0] == mfcc_warped[:, 1]).all())  # Compare results for cloned coded_sps within batch.
            if np.isclose(alpha, 0):
                assert np.isclose(
                    sp_warped.detach().cpu().numpy(),
                    coded_sps).all()  # Compare no warping results.
            sample_pre[:len(sp_warped), :hparams.num_coded_sps * (
                3 if hparams.add_deltas else 1)] = sp_warped[:, 0].detach()

            sample_post = gen_in.postprocess_sample(sample_pre,
                                                    apply_mlpg=False)
            # Manually create samples without normalisation but with deltas.
            sample_pre_with_deltas = (sample_pre * gen_in.norm_params[1] +
                                      gen_in.norm_params[0]).astype(np.float32)

            if np.isnan(sample_pre_with_deltas).any():
                raise ValueError(
                    "Detected nan values in output features for {}.".format(
                        id_name))
            # Save warped features.
            makedirs_safe(os.path.dirname(os.path.join(out_dir, id_name)))
            sample_pre_with_deltas.tofile(
                os.path.join(out_dir,
                             id_name + "." + WorldFeatLabelGen.ext_deltas))

            hparams.synth_dir = out_dir
            # sample_no_deltas = WorldFeatLabelGen.convert_from_world_features(*WorldFeatLabelGen.convert_to_world_features(sample, contains_deltas=hparams.add_deltas, num_coded_sps=hparams.num_coded_sps, num_bap=hparams.num_bap))
            Synthesiser.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} runs: {}, average: {}".format(
        len(id_list) * idx, timedelta(seconds=t_benchmark),
        timedelta(seconds=t_benchmark) / (len(id_list) * idx)))