Example #1
    def synthesize(self, file_id_list, synth_output, hparams):

        # Create speaker subdirectories if necessary.
        for id_name in file_id_list:
            path_split = id_name.split(os.sep)  # os.path.split always returns a 2-tuple, so split on the separator instead.
            if len(path_split) > 1:
                makedirs_safe(os.path.join(hparams.synth_dir, *path_split[:-1]))

        if hparams.synth_vocoder == "WORLD":
            self.run_world_synth(synth_output, hparams)
        # elif hparams.synth_vocoder == "STRAIGHT":  # Add further vocoders here.
        elif hparams.synth_vocoder == "r9y9wavenet_quantized_16k_world_feats_English":

            # If no path is given, use pre-trained model.
            if not hasattr(hparams, "synth_vocoder_path") or hparams.synth_vocoder_path is None:
                parent_dirs = os.path.realpath(__file__).split(os.sep)
                dir_itts = str.join(os.sep, parent_dirs[:parent_dirs.index(main_dir) + 1])
                hparams.synth_vocoder_path = os.path.join(dir_itts, "misc", "pretrained", "r9y9wavenet_quantized_16k_world_feats_English.nn")

            # Default quantization is with mu=255.
            if not hasattr(hparams, "mu") or hparams.mu is None:
                hparams.mu = 255

            org_frame_rate_output_Hz = hparams.frame_rate_output_Hz if hasattr(hparams, 'frame_rate_output_Hz') else None
            hparams.frame_rate_output_Hz = 16000
            self.run_r9y9wavenet_quantized_16k_world_feats_synth(synth_output, hparams)
            hparams.frame_rate_output_Hz = org_frame_rate_output_Hz
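A minimal usage sketch, assuming a trainer instance exposing synthesize() and an hparams object with the attributes read above; all concrete names and values here are illustrative:

    # Illustrative only: `trainer`, `model_outputs` and `hparams` come from the surrounding framework.
    hparams.synth_dir = "synthesised"
    hparams.synth_vocoder = "WORLD"
    file_id_list = ["speaker1/utt001", "speaker1/utt002"]  # Speaker subdirectories are created automatically.
    synth_output = {id_name: model_outputs[id_name] for id_name in file_id_list}
    trainer.synthesize(file_id_list, synth_output, hparams)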
Example #2
    def save_to_file(self, filename):
        if self.plt is None:
            logging.error(
                "No generated plot exists, please run 'gen_plot()' first.")
        else:
            makedirs_safe(os.path.dirname(filename))
            self.plt.savefig(filename, bbox_inches='tight')
            logging.info("Figure saved as " + filename)
Example #3
    def synth(self, hparams, ids_input):
        assert hparams.synth_dir is not None, "Directory to store the generated audio files has to be set."
        makedirs_safe(hparams.synth_dir)
        id_list = ModelTrainer._input_to_str_list(ids_input)

        self.logger.info("Start synthesising [{0}]".format(", ".join(str(i) for i in id_list)))
        t_start = timer()
        model_output, model_output_post = self._forward_batched(hparams, id_list, hparams.batch_size_synth, load_target=False, synth=True, benchmark=False, gen_figure=hparams.synth_gen_figure)
        t_synth = timer() - t_start
        self.logger.info('Synthesis time for {} sample(s): {}'.format(len(id_list), timedelta(seconds=t_synth)))
        return model_output, model_output_post
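A hedged usage sketch; `trainer` is assumed to be a ready ModelTrainer subclass, and batch_size_synth and synth_gen_figure are assumed to be set on hparams, as the method requires:

    # Illustrative only.
    hparams.synth_dir = "synthesised"
    hparams.batch_size_synth = 48
    hparams.synth_gen_figure = False
    model_output, model_output_post = trainer.synth(hparams, ["utt001", "utt002"])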
Example #4
    def gen_data(dir_in,
                 file_questions,
                 dir_out=None,
                 file_id_list=None,
                 id_list=None,
                 return_dict=False):
        """
        Generate question labels from HTK labels.

        :param dir_in:         Directory containing the HTK labels.
        :param file_questions: Full file path to the question file.
        :param dir_out:        Directory to store the question labels. If None, labels are not saved.
        :param file_id_list:   Name of the file containing the ids. Normalisation parameters are saved using
                               this name to differentiate parameters between subsets.
        :param id_list:        The list of utterances to process.
                               Should have the form uttId1 \\n uttId2 \\n ...\\n uttIdN.
                               If None, all .lab files in dir_in are used.
        :param return_dict:    If true, returns an OrderedDict of all samples as first output.
        :return:               Returns two normalisation parameters as tuple. If return_dict is True it returns
                               all processed labels in an OrderedDict followed by the two normalisation parameters.
        """

        # Fill file_id_list by .lab files in dir_in if not given and set an appropriate file_id_list_name.
        if id_list is None:
            id_list = list()
            filenames = glob.glob(os.path.join(dir_in, "*.lab"))
            for filename in filenames:
                id_list.append(os.path.splitext(os.path.basename(filename))[0])
            file_id_list_name = "all"
        else:
            file_id_list_name = os.path.splitext(
                os.path.basename(file_id_list))[0]
            id_list = [
                '{}'.format(os.path.basename(element)) for element in id_list
            ]  # Ignore full path.

        # Create directories in dir_out if it is given.
        if dir_out is not None:
            makedirs_safe(dir_out)

        # Get question generation class.
        label_operator = HTSLabelNormalisation(file_questions)
        if return_dict:
            label_dict, norm_params = label_operator.perform_normalisation(
                file_id_list_name, id_list, dir_in, dir_out, return_dict=True)
            # self.norm_params = (samples_min, samples_max)
            return label_dict, norm_params[0], norm_params[1]
        else:
            norm_params = label_operator.perform_normalisation(
                file_id_list_name, id_list, dir_in, dir_out, return_dict=False)
            return norm_params[0], norm_params[1]
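A hedged call of this generator; the paths and the hosting class name (QuestionLabelGen) are assumptions for illustration:

    with open("file_id_list.txt") as f:
        id_list = [line.strip() for line in f]
    # Returns (min, max) normalisation parameters when return_dict is False.
    norm_min, norm_max = QuestionLabelGen.gen_data(
        dir_in="labels/label_state_align",
        file_questions="questions.hed",
        dir_out="questions",
        file_id_list="file_id_list.txt",
        id_list=id_list)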
Example #5
    def init(self, hparams):
        # Create and initialize model.
        self.logger.info("Create ModelHandler.")
        self.model_handler = ModelHandlerPyTorch(hparams)

        self.logger.info("Model handler ready.")
        self.logger.info("CPU memory: " + str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1e3) + " MB.")
        if hparams.use_gpu:
            self.logger.info("GPU memory: " + str(get_gpu_memory_map()) + " MB.")

        # Create the necessary directories.
        makedirs_safe(os.path.join(hparams.out_dir, hparams.networks_dir, hparams.checkpoints_dir))
        # Create the default model path if not set.
        if hparams.model_dir is None:
            hparams.model_dir = os.path.join(hparams.out_dir, hparams.networks_dir)

        model_path = os.path.join(hparams.model_dir, hparams.model_name)
        if hparams.epochs <= 0:
            # Try to load the model. If it doesn't exist, create a new one and save it.
            # Return the loaded/created model, because no training was requested.
            try:
                self.model_handler.load_model(model_path,
                                              hparams.use_gpu,
                                              hparams.optimiser_args["lr"] if hparams.optimiser_args["lr"] is not None
                                                                           else hparams.learning_rate)
            except FileNotFoundError:
                if hparams.model_type is None:
                    self.logger.error("Model does not exist at {} and you didn't give model_type to create a new one.".format(model_path))
                    raise  # This will rethrow the last exception.
                else:
                    self.logger.warning('Model does not exist at {}. Creating a new one instead and saving it.'.format(model_path))
                    dim_in, dim_out = self.dataset_train.get_dims()
                    self.model_handler.create_model(hparams, dim_in, dim_out)
                    self.model_handler.save_model(model_path)

            self.logger.info("Model ready.")
            return self.model_handler

        if hparams.model_type is None:
            self.model_handler.load_model(model_path, hparams.use_gpu, hparams.optimiser_args["lr"] if hparams.optimiser_args["lr"] is not None else hparams.learning_rate)
        else:
            dim_in, dim_out = self.dataset_train.get_dims()
            self.model_handler.create_model(hparams, dim_in, dim_out)

        self.logger.info("Model ready.")
Example #6
    def run_world_synth(self, synth_output, hparams):
        """Run the WORLD synthesize method."""
        fft_size = pyworld.get_cheaptrick_fft_size(hparams.synth_fs)

        save_dir = hparams.synth_dir if hparams.synth_dir is not None \
            else hparams.out_dir if hparams.out_dir is not None \
            else os.path.curdir
        for id_name, output in synth_output.items():
            logging.info("Synthesise {} with the WORLD vocoder.".format(id_name))

            coded_sp, lf0, vuv, bap = WorldFeatLabelGen.convert_to_world_features(output, contains_deltas=False, num_coded_sps=hparams.num_coded_sps)
            ln_sp = pysptk.mgc2sp(np.ascontiguousarray(coded_sp, dtype=np.float64), alpha=WorldFeatLabelGen.mgc_alpha, gamma=0.0, fftlen=fft_size)
            # sp = np.exp(sp.real * 2.0)
            # sp.imag = sp.imag * 180.0 / np.pi
            sp = np.exp(ln_sp.real)
            sp = np.power(sp.real / 32768.0, 2)
            # sp = np.exp(np.power(sp.real, 2))
            # sp = pyworld.decode_spectral_envelope(np.ascontiguousarray(coded_sp, np.float64), self.synth_fs, fft_size)  # Cepstral version.
            f0 = np.exp(lf0, dtype=np.float64)
            vuv[f0 < WorldFeatLabelGen.f0_silence_threshold] = 0  # WORLD throws an error for too small f0 values.
            f0[vuv == 0] = 0.0
            ap = pyworld.decode_aperiodicity(np.ascontiguousarray(bap.reshape(-1, 1), np.float64), hparams.synth_fs, fft_size)

            waveform = pyworld.synthesize(f0, sp, ap, hparams.synth_fs)
            waveform = waveform.astype(np.float32, copy=False)  # Does inplace conversion, if possible.

            # Always save as wav file first and convert afterwards if necessary.
            wav_file_path = os.path.join(save_dir, "{}{}{}{}.wav".format(
                os.path.basename(id_name),
                "_" + hparams.model_name if hparams.model_name is not None else "",
                hparams.synth_file_suffix,
                "_WORLD"))
            makedirs_safe(save_dir)
            soundfile.write(wav_file_path, waveform, hparams.synth_fs)

            # Use PyDub for special audio formats.
            if hparams.synth_ext.lower() != 'wav':
                as_wave = pydub.AudioSegment.from_wav(wav_file_path)
                as_wave.export(os.path.join(save_dir, id_name + "." + hparams.synth_ext), format=hparams.synth_ext)
                os.remove(wav_file_path)
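A hedged sketch of the expected input: synth_output maps utterance ids to acoustic feature matrices of width num_coded_sps + 3 (coded spectrum, lf0, vuv, bap), matching convert_to_world_features above:

    # Illustrative only; `acoustic_output` is an ndarray of shape (num_frames, num_coded_sps + 3).
    hparams.synth_fs = 16000
    hparams.num_coded_sps = 60
    synth_output = {"utt001": acoustic_output}
    trainer.run_world_synth(synth_output, hparams)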
Example #7
                        parent_dirs[:parent_dirs.index("IdiapTTS") + 1])
    sys.path.append(
        dir_itts
    )  # Adds the IdiapTTS folder to the path, required to work on grid.
from misc.utils import makedirs_safe
"""
Function that down-samples a list of audio files.

python down_sampling.py <dir_audio> <dir_out> <file_id_list> <target_sampling_rate>
"""

# Read which files to process.
dir_audio = sys.argv[1]
dir_out = sys.argv[2]
file_id_list = sys.argv[3]
target_sampling_rate = int(sys.argv[4])

with open(file_id_list) as f:
    id_list = f.readlines()
# Trim entries in-place.
id_list[:] = [s.strip(' \t\n\r') for s in id_list]

for file_id in id_list:
    full_path_in = os.path.join(dir_audio, file_id + ".wav")
    print("Downsample " + full_path_in)
    sound = AudioSegment.from_file(full_path_in)
    sound = sound.set_frame_rate(target_sampling_rate)
    full_path_out = os.path.join(dir_out, file_id + ".wav")
    makedirs_safe(os.path.dirname(full_path_out))
    sound.export(full_path_out, format="wav")
Example #8
    def gen_data(self,
                 dir_in,
                 dir_out=None,
                 file_id_list=None,
                 id_list=None,
                 return_dict=False):
        """
        Prepare atom labels from wav files.
        If id_list is not None, only the listed ids are processed, otherwise one label per .wav file in dir_in
        is generated. Atoms are computed by the wcad algorithm. Utterances with 50 or more atoms, or any atom
        with an amplitude above 10, are flagged as possible extraction failures and excluded from the
        normalisation statistics; a file_id_list containing only the successful utterances is written next to
        dir_in, but the current file_id_list is not replaced by it. The algorithm also saves the extracted
        phrase component in dir_out/id_name.phrase, if dir_out is not None.

        :param dir_in:           Directory containing the original wav files.
        :param dir_out:          Directory where the labels are stored. If None, no labels are stored.
        :param file_id_list:     Name of the file containing the ids. Normalisation parameters are saved using
                                 this name to differentiate parameters between subsets.
        :param id_list:          The list of utterances to process.
                                 Should have the form uttId1 \\n uttId2 \\n ...\\n uttIdN.
                                 If None, all wav files in audio_dir are used.
        :param return_dict:      If True, returns an OrderedDict of all samples as first output.
        :return:                 Returns mean=0.0, std_dev, min, max of atoms.
        """

        # Fill file_id_list by .wav files in dir_in if not given and set an appropriate file_id_list_name.
        if id_list is None:
            id_list = list()
            filenames = glob.glob(os.path.join(dir_in, "*.wav"))
            for filename in filenames:
                id_list.append(os.path.splitext(os.path.basename(filename))[0])
            file_id_list_name = "all"
        else:
            file_id_list_name = os.path.splitext(
                os.path.basename(file_id_list))[0]

        if dir_out is not None:
            makedirs_safe(dir_out)

        if return_dict:
            label_dict = OrderedDict()

        mean_std_ext_atom = MeanStdDevExtractor()
        min_max_ext_atom = MinMaxExtractor()
        mean_std_ext_phrase = MeanStdDevExtractor()
        min_max_ext_phrase = MinMaxExtractor()

        # Compute Atoms.
        from wcad import WaveInput, PitchExtractor, MultiphraseExtractor, DictionaryGenerator, AtomExtrator, ModelCreator, ModelSaver, Params, Paths
        correct_utts = list()
        self.logger.info("Create atom labels for " +
                         "[{0}]".format(", ".join(str(i) for i in id_list)))
        for id_name in id_list:
            self.logger.debug("Create atom labels for " + id_name)

            # Wcad has to be called in its root directory, therefore a change dir operation is necessary.
            cwd = os.getcwd()
            os.chdir(self.wcad_root)
            args = [dir_in + "/" + id_name + ".wav", dir_out]
            print(args)
            params = Params()
            # Overwrite the possible theta values by selected values.
            params.local_atoms_thetas = self.theta_interval
            params.k = [self.k]
            # params.min_atom_amp = 0.1
            paths = Paths(args, params)
            # Start the extraction process.
            start_t = time.time()
            waveform = WaveInput(paths.wav, params).read()
            pitch = PitchExtractor(waveform, params, paths).compute()
            # Compute the phrase component.
            phrase = MultiphraseExtractor(pitch, waveform, params,
                                          paths).compute()
            phrase_curve = phrase.curve
            # Extract atoms.
            dictionary = DictionaryGenerator(params, paths).compute()
            atoms = AtomExtrator(waveform, pitch, phrase, dictionary, params,
                                 paths).compute()
            # Create a model.
            model = ModelCreator(phrase, atoms, pitch).compute()
            print('Model created in %s seconds' % (time.time() - start_t))
            # Save the atoms.
            ModelSaver(model, params, paths).save()
            os.chdir(cwd)

            # Check if output can be correct.
            possible_extraction_failure = False
            if len(atoms) < 50 and not any(a.amp > 10 for a in atoms):
                correct_utts.append(id_name)
            else:
                self.logger.warning("Possible fail of atom extractor for " +
                                    id_name + " (atoms: " + str(len(atoms)) +
                                    ", frames: " + str(len(phrase_curve)) +
                                    ", max: " +
                                    str(max(a.amp for a in atoms)) + ").")
                possible_extraction_failure = True

            atoms.sort(key=lambda x: x.position)
            # print_atoms(atoms)

            # Get audio length needed to trim the atoms.
            duration = self.get_audio_length(id_name, dir_in,
                                             self.frame_size_ms)

            # The algorithm generates a few atoms at negative positions,
            # pad them into the first atom at positive position.
            padded_amp = 0
            padded_theta = 0
            for idx, atom in enumerate(atoms):
                if atom.position < 0:
                    padded_amp += atom.amp
                    padded_theta += atom.theta
                else:
                    atoms[idx].amp += padded_amp  # Pad the amplitude.
                    atoms[idx].theta = (atoms[idx].theta +
                                        padded_theta) / (idx + 1)
                    del atoms[:idx]  # Remove the negative atoms from the list.
                    break
            # print_atoms(atoms)

            # The algorithm might also generate a few atoms beyond the last label,
            # pad them into the last label.
            padded_amp = 0
            padded_theta = 0
            for idx, atom in reversed(list(enumerate(atoms))):
                if atom.position * self.frame_size_ms > duration:
                    padded_amp += atom.amp
                    padded_theta += atom.theta
                else:
                    atoms[idx].amp += padded_amp
                    atoms[idx].theta = (atoms[idx].theta +
                                        padded_theta) / (len(atoms) - idx)
                    atoms = atoms[:-(len(atoms) - idx - 1)
                                  or None]  # Remove atoms beyond last label.
                    break
            # print_atoms(atoms)

            # Create a label for each frame (size of frame_size_ms) with amplitude and theta of contained atoms.
            np_atom_labels = AtomLabelGen.atoms_to_labels(
                atoms, self.theta_interval, int(duration / self.frame_size_ms))

            np_atom_amps = np.sum(np_atom_labels, axis=1)

            if not possible_extraction_failure:  # Only add successful extractions to mean and std_dev computation.
                mean_std_ext_atom.add_sample(
                    np_atom_amps[np_atom_amps[:, 0] != 0.0]
                )  # Only compute std_dev from atoms.
                min_max_ext_atom.add_sample(np_atom_amps)
                # mean_std_ext_phrase.add_sample(phrase_curve)
                # min_max_ext_phrase.add_sample(phrase_curve)

            if return_dict:
                label_dict[id_name] = np_atom_labels
            if dir_out is not None:
                # Save phrase, because it might be used in synthesis.
                phrase_curve.astype('float32').tofile(
                    os.path.join(dir_out, id_name + self.ext_phrase))

                # Save atoms binary (float32).
                np_atom_labels.astype('float32').tofile(
                    os.path.join(dir_out, id_name + self.ext_atoms))

                # Create a readable version of the atom data.
                # np.savetxt(os.path.join(dir_out, id_name + self.ext_atoms + ".txt"), np_atom_labels)

        # Manually set mean of atoms to 0, otherwise frames without atom will have an amplitude.
        mean_std_ext_atom.sum_frames[:] = 0.0
        mean_std_ext_atom.sum_squared_frames[
            1] = mean_std_ext_atom.sum_length * self.theta_interval[-1]
        mean_std_ext_atom.save(os.path.join(dir_out, file_id_list_name))
        min_max_ext_atom.save(os.path.join(dir_out, file_id_list_name))
        # mean_std_ext_phrase.save(os.path.join(dir_out, file_id_list_name + '-phrase'))
        # min_max_ext_phrase.save(os.path.join(dir_out, file_id_list_name + '-phrase'))

        mean_atoms, std_atoms = mean_std_ext_atom.get_params()
        min_atoms, max_atoms = min_max_ext_atom.get_params()
        # mean_phrase, std_phrase = mean_std_ext_phrase.get_params()
        # min_phrase, max_phrase = min_max_ext_atom.get_params()

        # Use this block to save the part of the file_id_list for which atom extraction was successful into a new file.
        if correct_utts:
            with open(
                    os.path.join(
                        os.path.dirname(dir_in), "wcad_" +
                        os.path.basename(file_id_list_name) + ".txt"),
                    'w') as f:
                f.write('\n'.join(correct_utts) + '\n')

        if return_dict:
            # Return dict of labels for all utterances.
            return label_dict, \
                   mean_atoms, std_atoms, \
                   min_atoms, max_atoms
            # mean_phrase, std_phrase, \
            # min_phrase, max_phrase
        else:
            return mean_atoms, std_atoms, \
                   min_atoms, max_atoms
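A hedged usage sketch, assuming an instance (here called atom_gen) of the class defining this method, configured with wcad_root, theta_interval, k and frame_size_ms:

    # Illustrative only.
    mean_atoms, std_atoms, min_atoms, max_atoms = atom_gen.gen_data(
        dir_in="wav",
        dir_out="atoms",
        file_id_list="file_id_list.txt",
        id_list=["utt001", "utt002"])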
Example #9
    def process_file(self,
                     file,
                     dir_wav,
                     dir_out,
                     audio_format="wav",
                     silence_threshold_db=-50,
                     chunk_size_ms=10):
        sound = AudioSegment.from_file(os.path.join(dir_wav, file),
                                       format=audio_format)

        trim_start = self._detect_leading_silence(sound, silence_threshold_db,
                                                  chunk_size_ms)
        trim_end = self._detect_leading_silence(sound.reverse(),
                                                silence_threshold_db,
                                                chunk_size_ms)

        # Add silence to the front if the audio starts too early.
        if trim_start < self.min_silence_ms:
            # TODO: Find a robust way to create silence so that HTK alignment still works (maybe concat mirrored segments).
            logging.warning(
                "File {} has only {} ms of silence in the beginning.".format(
                    file, trim_start))
            # AudioSegment.silent(duration=self.min_silence_ms-trim_start)
            # if trim_start > 0:
            #     silence = (sound[:trim_start] * (math.ceil(self.min_silence_ms / trim_start) - 1))[:self.min_silence_ms-trim_start]
            #     sound = silence + sound
            # elif trim_end > 0:
            #     silence = (sound[-trim_end:] * (math.ceil(self.min_silence_ms / trim_end) - 1))[:self.min_silence_ms-trim_end]
            #     sound = silence + sound
            # else:
            #     self.logger.warning("Cannot append silence to the front of " + file + ". No silence exists at front or end which can be copied.")
            trim_start = 0
        else:
            trim_start -= self.min_silence_ms

        # Append silence to the end if the audio ends too late.
        if trim_end < self.min_silence_ms:
            logging.warning(
                "File {} has only {} ms of silence in the end.".format(
                    file, trim_end))
            # silence = AudioSegment.silent(duration=self.min_silence_ms-trim_end)
            # if trim_end > 0:
            #     silence = (sound[-trim_end:] * (math.ceil(self.min_silence_ms / trim_end) - 1))[:self.min_silence_ms-trim_end]
            #     sound = sound + silence
            # elif trim_start > 0:
            #     silence = (sound[:trim_start] * (math.ceil(self.min_silence_ms / trim_start) - 1))[:self.min_silence_ms-trim_start]
            #     sound = sound + silence
            # else:
            #     self.logger.warning("Cannot append silence to the end of " + file + ". No silence exists at front or end which can be copied.")
            trim_end = 0
        else:
            trim_end -= self.min_silence_ms

        # Trim the sound; slice relative to the length so that trim_end == 0 keeps the last chunk.
        trimmed_sound = sound[trim_start:len(sound) - trim_end]

        # Save trimmed sound to file.
        out_file = os.path.join(dir_out, file)
        makedirs_safe(os.path.dirname(out_file))
        trimmed_sound.export(out_file, format=audio_format)

        return trimmed_sound
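A hedged single-file example, assuming the class is named SilenceRemover (as suggested by Example #10) and min_silence_ms is a plain attribute:

    remover = SilenceRemover()
    remover.min_silence_ms = 200  # Keep 200 ms of silence at both ends.
    trimmed = remover.process_file("utt001.wav", dir_wav="wav", dir_out="wav_trimmed")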
Example #10
def main():
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("-w",
                        "--dir_wav",
                        help="Directory containing the wav files.",
                        type=str,
                        dest="dir_wav",
                        required=True)
    parser.add_argument("-o",
                        "--dir_out",
                        help="Directory to save the trimmed files.",
                        type=str,
                        dest="dir_out",
                        required=True)
    parser.add_argument("-f",
                        "--file_id_list",
                        help="Full path to file containing the ids.",
                        type=str,
                        dest="file_id_list",
                        required=True)
    parser.add_argument("--format",
                        help="Format of the audio file, e.g. WAV.",
                        type=str,
                        dest="format",
                        required=False,
                        default='wav')
    parser.add_argument(
        "--silence_db",
        help="Threshold until which a frame is considered to be silent.",
        type=int,
        dest="silence_threshold_db",
        required=False,
        default=-50)
    parser.add_argument(
        "--chunk_size",
        help="Size of the chunk (frame size) in ms on which db is computed.",
        type=int,
        dest="chunk_size_ms",
        required=False,
        default=10)
    parser.add_argument(
        "--min_silence_ms",
        help="Milliseconds of silence which are always kept at the front and back of the audio file.",
        type=int,
        dest="min_silence_ms",
        required=False)

    # Parse arguments
    args = parser.parse_args()

    # Read which files to process.
    with open(args.file_id_list) as f:
        id_list = f.readlines()
    # Trim entries in-place.
    id_list[:] = [s.strip(' \t\n\r') for s in id_list]

    # Create output directory if missing.
    makedirs_safe(args.dir_out)

    # Start silence removal.
    silence_remover = SilenceRemover()
    if args.min_silence_ms is not None:
        silence_remover.min_silence_ms = args.min_silence_ms
    silence_remover.process_list(id_list, args.dir_wav, args.dir_out,
                                 args.format, args.silence_threshold_db,
                                 args.chunk_size_ms)
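The script presumably ends with the standard entry-point guard, which is not part of the excerpt:

    if __name__ == "__main__":
        main()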
Example #11
    def gen_data(self,
                 dir_in,
                 dir_out=None,
                 file_id_list=None,
                 id_list=None,
                 add_deltas=False,
                 return_dict=False):
        """
        Prepare WORLD features from audio files. If add_deltas is False, labels have the dimension
        num_frames x (num_coded_sps + 3) [mgc(num_coded_sps), lf0, vuv, bap(1)]; otherwise
        deltas and double deltas are added to every feature except vuv, resulting in
        num_frames x (3*num_coded_sps + 7) [mgc(3*num_coded_sps), lf0(3*1), vuv, bap(3*1)].

        :param dir_in:         Directory where the .wav files are stored for each utterance to process.
        :param dir_out:        Main directory where the labels and normalisation parameters are saved to subdirectories.
                               If None, labels are not saved.
        :param file_id_list:   Name of the file containing the ids. Normalisation parameters are saved using
                               this name to differentiate parameters between subsets.
        :param id_list:        The list of utterances to process.
                               Should have the form uttId1 \\n uttId2 \\n ...\\n uttIdN.
                               If None, all .wav files in dir_in are used.
        :param add_deltas:     Add deltas and double deltas to all features except vuv.
        :param return_dict:    If true, returns an OrderedDict of all samples as first output.
        :return:               Returns two normalisation parameters as tuple. If return_dict is True it returns
                               all processed labels in an OrderedDict followed by the two normalisation parameters.
        """

        # Fill file_id_list by .wav files in dir_in if not given and set an appropriate file_id_list_name.
        if id_list is None:
            id_list = list()
            filenames = glob.glob(os.path.join(dir_in, "*.wav"))
            for filename in filenames:
                id_list.append(os.path.splitext(os.path.basename(filename))[0])
            file_id_list_name = "all"
        else:
            file_id_list_name = os.path.splitext(
                os.path.basename(file_id_list))[0]

        # Create directories in dir_out if it is given.
        if dir_out is not None:
            if add_deltas:
                makedirs_safe(os.path.join(dir_out, self.dir_deltas))
            else:
                makedirs_safe(os.path.join(dir_out, self.dir_lf0))
                makedirs_safe(os.path.join(dir_out, self.dir_vuv))
                makedirs_safe(os.path.join(dir_out, self.dir_coded_sps))
                makedirs_safe(os.path.join(dir_out, self.dir_bap))

        # Create the return dictionary if required.
        if return_dict:
            label_dict = OrderedDict()

        if add_deltas:
            # Create normalisation computation units.
            norm_params_ext_coded_sp = MeanCovarianceExtractor()
            norm_params_ext_lf0 = MeanCovarianceExtractor()
            norm_params_ext_bap = MeanCovarianceExtractor()
        else:
            # Create normalisation computation units.
            norm_params_ext_coded_sp = MeanStdDevExtractor()
            norm_params_ext_lf0 = MeanStdDevExtractor()
            # norm_params_ext_vuv = MeanStdDevExtractor()
            norm_params_ext_bap = MeanStdDevExtractor()

        logging.info("Extract WORLD{} features for".format(
            "" if not add_deltas else " deltas") +
                     "[{0}]".format(", ".join(str(i) for i in id_list)))
        for file_name in id_list:

            # Load audio file and extract features.
            audio_name = os.path.join(dir_in, file_name + ".wav")
            raw, fs = soundfile.read(audio_name)
            logging.debug("Extract WORLD{} features from {} at {}Hz.".format(
                "" if not add_deltas else " deltas", file_name, fs))
            f0, sp, ap = pyworld.wav2world(raw, fs)

            file_name = os.path.basename(file_name)  # Remove speaker.

            # Compute lf0 and vuv information.
            lf0 = np.log(f0.clip(min=1E-10), dtype=np.float32)
            lf0[lf0 <= math.log(self.f0_silence_threshold)] = self.lf0_zero
            lf0, vuv = interpolate_lin(lf0)
            lf0 = lf0.astype(dtype=np.float32)
            vuv = vuv.astype(dtype=np.float32)
            # Warn when less than 5% of all frames are voiced.
            if vuv.sum() / len(vuv) < 0.05:
                self.logger.warning(
                    "Detected only {:.0f}% [{}/{}] voiced frames in {}.".
                    format(vuv.sum() / len(vuv) * 100.0, int(vuv.sum()),
                           len(vuv), file_name))

            # Decode spectrum to a lower dimension and aperiodicity to one band aperiodicity.
            # coded_sp = pyworld.code_spectral_envelope(sp, fs, WorldFeatLabelGen.num_coded_sps)  # Cepstral version.
            coded_sp = np.sqrt(sp) * 32768.0
            coded_sp = np.array(pysptk.mcep(coded_sp,
                                            order=self.num_coded_sps - 1,
                                            alpha=self.mgc_alpha,
                                            eps=1.0e-8,
                                            min_det=0.0,
                                            etype=1,
                                            itype=3),
                                dtype=np.float32)
            bap = np.array(pyworld.code_aperiodicity(ap, fs), dtype=np.float32)

            if add_deltas:
                # Compute the deltas and double deltas for all features.
                lf0_deltas, lf0_double_deltas = compute_deltas(lf0)
                coded_sp_deltas, coded_sp_double_deltas = compute_deltas(
                    coded_sp)
                bap_deltas, bap_double_deltas = compute_deltas(bap)

                coded_sp = np.concatenate(
                    (coded_sp, coded_sp_deltas, coded_sp_double_deltas),
                    axis=1)
                lf0 = np.concatenate((lf0, lf0_deltas, lf0_double_deltas),
                                     axis=1)
                bap = np.concatenate((bap, bap_deltas, bap_double_deltas),
                                     axis=1)

                # Combine them to a single feature sample.
                labels = np.concatenate((coded_sp, lf0, vuv, bap), axis=1)

                # Save into return dictionary and/or file.
                if return_dict:
                    label_dict[file_name] = labels
                if dir_out is not None:
                    labels.tofile(
                        os.path.join(dir_out, self.dir_deltas,
                                     file_name + self.ext_deltas))

            else:
                # Save into return dictionary and/or file.
                if return_dict:
                    label_dict[file_name] = np.concatenate(
                        (coded_sp, lf0, vuv, bap), axis=1)
                if dir_out is not None:
                    coded_sp.tofile(
                        os.path.join(dir_out, self.dir_coded_sps,
                                     file_name + self.ext_coded_sp))
                    lf0.tofile(
                        os.path.join(dir_out, self.dir_lf0,
                                     file_name + self.ext_lf0))
                    vuv.astype(np.float32).tofile(
                        os.path.join(dir_out, self.dir_vuv,
                                     file_name + self.ext_vuv))
                    bap.tofile(
                        os.path.join(dir_out, self.dir_bap,
                                     file_name + self.ext_bap))

            # Add sample to normalisation computation unit.
            norm_params_ext_coded_sp.add_sample(coded_sp)
            norm_params_ext_lf0.add_sample(lf0)
            # norm_params_ext_vuv.add_sample(vuv)
            norm_params_ext_bap.add_sample(bap)

        # Save mean and std dev of all features.
        if not add_deltas:
            norm_params_ext_coded_sp.save(
                os.path.join(dir_out, self.dir_coded_sps, file_id_list_name))
            norm_params_ext_lf0.save(
                os.path.join(dir_out, self.dir_lf0, file_id_list_name))
            # norm_params_ext_vuv.save(os.path.join(dir_out, WorldFeatLabelGen.dir_vuv, file_id_list_name))
            norm_params_ext_bap.save(
                os.path.join(dir_out, self.dir_bap, file_id_list_name))
        else:
            self.logger.info("Write norm_prams to{}".format(
                os.path.join(dir_out, self.dir_deltas, "_".join(
                    (file_id_list_name, self.dir_coded_sps)))))
            norm_params_ext_coded_sp.save(
                os.path.join(dir_out, self.dir_deltas, "_".join(
                    (file_id_list_name, self.dir_coded_sps))))
            norm_params_ext_lf0.save(
                os.path.join(dir_out, self.dir_deltas, "_".join(
                    (file_id_list_name, self.dir_lf0))))
            norm_params_ext_bap.save(
                os.path.join(dir_out, self.dir_deltas, "_".join(
                    (file_id_list_name, self.dir_bap))))

        # Get normalisation parameters.
        if not add_deltas:
            norm_coded_sp = norm_params_ext_coded_sp.get_params()
            norm_lf0 = norm_params_ext_lf0.get_params()
            # norm_vuv = norm_params_ext_vuv.get_params()
            norm_bap = norm_params_ext_bap.get_params()

            norm_first = np.concatenate(
                (norm_coded_sp[0], norm_lf0[0], (0.0, ), norm_bap[0]), axis=0)
            norm_second = np.concatenate(
                (norm_coded_sp[1], norm_lf0[1], (1.0, ), norm_bap[1]), axis=0)

        else:
            norm_coded_sp = norm_params_ext_coded_sp.get_params()
            norm_lf0 = norm_params_ext_lf0.get_params()
            # norm_vuv = norm_params_ext_vuv.get_params()
            norm_bap = norm_params_ext_bap.get_params()

            norm_first = (norm_coded_sp[0], norm_lf0[0], (0.0, ), norm_bap[0])
            norm_second = (norm_coded_sp[1], norm_lf0[1], (1.0, ), norm_bap[1])

        if return_dict:
            # Return dict of labels for all utterances.
            return label_dict, norm_first, norm_second
        else:
            return norm_first, norm_second
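A hedged call, assuming an instance (world_gen) of the class defining this method, with num_coded_sps and mgc_alpha configured:

    # Illustrative only.
    norm_first, norm_second = world_gen.gen_data(
        dir_in="wav",
        dir_out="WORLD",
        file_id_list="file_id_list.txt",
        id_list=["utt001", "utt002"],
        add_deltas=True)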
Example #12
    def gen_data(self, dir_in, dir_out=None, file_id_list=None, id_list=None, add_deltas=False, return_dict=False):
        """
        Prepare LF0 and V/UV features from audio files. If add_deltas is False, each numpy array has the dimension
        num_frames x 2 [lf0, vuv]; otherwise deltas and double deltas are added for lf0,
        resulting in num_frames x 4 [lf0(3*1), vuv].

        :param dir_in:         Directory where the .wav files are stored for each utterance to process.
        :param dir_out:        Main directory where the labels and normalisation parameters are saved to subdirectories.
                               If None, labels are not saved.
        :param file_id_list:   Name of the file containing the ids. Normalisation parameters are saved using
                               this name to differentiate parameters between subsets.
        :param id_list:        The list of utterances to process.
                               Should have the form uttId1 \\n uttId2 \\n ...\\n uttIdN.
                               If None, all .wav files in dir_in are used.
        :param add_deltas:     Add deltas and double deltas to all features except vuv.
        :param return_dict:    If true, returns an OrderedDict of all samples as first output.
        :return:               Returns two normalisation parameters as tuple. If return_dict is True it returns
                               all processed labels in an OrderedDict followed by the two normalisation parameters.
        """

        # Fill file_id_list by .wav files in dir_in if not given and set an appropriate file_id_list_name.
        if id_list is None:
            id_list = list()
            filenames = glob.glob(os.path.join(dir_in, "*.wav"))
            for filename in filenames:
                id_list.append(os.path.splitext(os.path.basename(filename))[0])
            file_id_list_name = "all"
        else:
            file_id_list_name = os.path.splitext(os.path.basename(file_id_list))[0]

        # Create directories in dir_out if it is given.
        if dir_out is not None:
            if add_deltas:
                makedirs_safe(os.path.join(dir_out, LF0LabelGen.dir_deltas))
            else:
                makedirs_safe(os.path.join(dir_out, LF0LabelGen.dir_lf0))
                makedirs_safe(os.path.join(dir_out, LF0LabelGen.dir_vuv))

        # Create the return dictionary if required.
        if return_dict:
            label_dict = OrderedDict()

        # Create normalisation computation units.
        norm_params_ext_lf0 = MeanStdDevExtractor()
        # norm_params_ext_vuv = MeanStdDevExtractor()
        norm_params_ext_deltas = MeanStdDevExtractor()

        logging.info("Extract WORLD LF0 features for " + "[{0}]".format(", ".join(str(i) for i in id_list)))
        for file_name in id_list:
            logging.debug("Extract WORLD LF0 features from " + file_name)

            # Load audio file and extract features.
            audio_name = os.path.join(dir_in, file_name + ".wav")
            raw, fs = soundfile.read(audio_name)
            _f0, t = pyworld.dio(raw, fs)  # Raw pitch extraction. TODO: Use magphase here?
            f0 = pyworld.stonemask(raw, _f0, t, fs)  # Pitch refinement.

            # Compute lf0 and vuv information.
            lf0 = np.log(f0.clip(min=1E-10), dtype=np.float32)  # Clip zeros to avoid -inf from log(0).
            lf0[lf0 <= math.log(LF0LabelGen.f0_silence_threshold)] = LF0LabelGen.lf0_zero
            lf0, vuv = interpolate_lin(lf0)

            if add_deltas:
                # Compute the deltas and double deltas for all features.
                lf0_deltas, lf0_double_deltas = compute_deltas(lf0)

                # Combine them to a single feature sample.
                labels = np.concatenate((lf0, lf0_deltas, lf0_double_deltas, vuv), axis=1)

                # Save into return dictionary and/or file.
                if return_dict:
                    label_dict[file_name] = labels
                if dir_out is not None:
                    labels.tofile(os.path.join(dir_out, LF0LabelGen.dir_deltas, file_name + LF0LabelGen.ext_deltas))

                # Add sample to normalisation computation unit.
                norm_params_ext_deltas.add_sample(labels)
            else:
                # Save into return dictionary and/or file.
                if return_dict:
                    label_dict[file_name] = np.concatenate((lf0, vuv), axis=1)
                if dir_out is not None:
                    lf0.tofile(os.path.join(dir_out, LF0LabelGen.dir_lf0, file_name + LF0LabelGen.ext_lf0))
                    vuv.astype(np.float32).tofile(os.path.join(dir_out, LF0LabelGen.dir_vuv, file_name + LF0LabelGen.ext_vuv))

                # Add sample to normalisation computation unit.
                norm_params_ext_lf0.add_sample(lf0)
                # norm_params_ext_vuv.add_sample(vuv)

        # Save mean and std dev of all features.
        if not add_deltas:
            norm_params_ext_lf0.save(os.path.join(dir_out, LF0LabelGen.dir_lf0, file_id_list_name))
            # norm_params_ext_vuv.save(os.path.join(dir_out, LF0LabelGen.dir_vuv, file_id_list_name))
        else:
            # Manually set vuv normalisation parameters before saving.
            norm_params_ext_deltas.sum_frames[-1] = 0.0  # Mean = 0.0
            norm_params_ext_deltas.sum_squared_frames[-1] = norm_params_ext_deltas.sum_length  # Variance = 1.0
            norm_params_ext_deltas.save(os.path.join(dir_out, LF0LabelGen.dir_deltas, file_id_list_name))

        # Get normalisation parameters.
        if not add_deltas:
            norm_lf0 = norm_params_ext_lf0.get_params()
            # norm_vuv = norm_params_ext_vuv.get_params()

            norm_first = np.concatenate((norm_lf0[0], (0.0,)), axis=0)
            norm_second = np.concatenate((norm_lf0[1], (1.0,)), axis=0)
        else:
            norm_first, norm_second = norm_params_ext_deltas.get_params()

        if return_dict:
            # Return dict of labels for all utterances.
            return label_dict, norm_first, norm_second
        else:
            return norm_first, norm_second
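A hedged call mirroring the previous example, assuming an LF0LabelGen instance:

    # Illustrative only.
    norm_first, norm_second = lf0_gen.gen_data(
        dir_in="wav",
        dir_out="lf0",
        file_id_list="file_id_list.txt",
        id_list=["utt001"],
        add_deltas=False)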