Example #1
    def load_vocoder(self, model_file, model_config, use_cuda):
        self.vocoder_config = load_config(model_config)
        self.vocoder_model = setup_generator(self.vocoder_config)
        self.vocoder_model.load_state_dict(
            torch.load(model_file, map_location="cpu")["model"])
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0

        if use_cuda:
            self.vocoder_model.cuda()
        self.vocoder_model.eval()
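
A minimal usage sketch for the method above; synthesizer stands for whatever object carries this method, and both file names are placeholders:

    # hypothetical usage; the paths below are placeholders
    synthesizer.load_vocoder('vocoder_model.pth.tar', 'vocoder_config.json', use_cuda=False)
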
Example #2
 def __init__(self, path, INPUT_SR, TARGET_SR, WINDOW_LENGTH):
     self.INPUT_SR = INPUT_SR
     self.TARGET_SR = TARGET_SR
     self.WINDOW_LENGTH = WINDOW_LENGTH
     self.CONFIG = load_config('config_fr.json')
     self.CONFIG['audio']['sample_rate'] = self.INPUT_SR
     self.AP_INPUT = AudioProcessor(**self.CONFIG['audio'])
     self.CONFIG['audio']['sample_rate'] = self.TARGET_SR
     self.AP_TARGET = AudioProcessor(**self.CONFIG['audio'])
     self.files = glob.glob(path + '/**/*.wav', recursive=True)
     # If you change your dataset, delete cache.json
     if os.path.isfile('./cache.json'):
         with open('./cache.json', "r") as json_file:
             self.pre_repertoir = json.load(json_file)
     else:
         print("> Computing wave files length...")
         self.pre_repertoir = [
             librosa.get_duration(filename=file)
             for file in tqdm(self.files)
         ]
         with open('./cache.json', mode="w") as json_file:
             json.dump(self.pre_repertoir, json_file)
     self.repertoir = [
         int(item / WINDOW_LENGTH) for item in self.pre_repertoir
     ]
     self.length = self.get_len()
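
A hedged usage sketch, assuming the __init__ above belongs to a torch-style dataset class (called WavDataset here, a hypothetical name) whose get_len() derives the total from the per-file window counts:

    # hypothetical usage; WavDataset and the argument values are stand-ins
    dataset = WavDataset('/data/wavs', INPUT_SR=48000, TARGET_SR=22050, WINDOW_LENGTH=2.0)
    print(f"windows of {dataset.WINDOW_LENGTH}s available: {dataset.length}")
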
Example #3
    def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(
            lib_path)  # set this if WaveRNN is not installed globally
        #pylint: disable=import-outside-toplevel
        from WaveRNN.models.wavernn import Model
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(model_config)
        # This is the default architecture we use for our models.
        # You might need to update it
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        )  # moved to GPU below only when use_cuda is set

        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()
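
Note that load_wavernn reads self.ap.hop_length, so the audio processor must already be set up; in this code base that happens in load_tts (see Example #6). A sketch of the assumed call order, with placeholder paths:

    # placeholder paths; load_tts must run first because it initializes self.ap
    synthesizer.load_tts('tts_checkpoint.pth.tar', 'tts_config.json', use_cuda=False)
    synthesizer.load_wavernn('/path/to/WaveRNN', 'wavernn.pth.tar', 'wavernn_config.json', use_cuda=False)
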
Example #4
 def test_in_out(self):
     self._create_random_model()
     config = load_config(
         os.path.join(get_tests_input_path(), 'server_config.json'))
     tts_root_path = get_tests_output_path()
     config['tts_checkpoint'] = os.path.join(tts_root_path,
                                             config['tts_checkpoint'])
     config['tts_config'] = os.path.join(tts_root_path,
                                         config['tts_config'])
     synthesizer = Synthesizer(config)
     synthesizer.tts("Better this test works!!")
Example #5
    def _create_random_model(self):
        # pylint: disable=global-statement
        global symbols, phonemes
        config = load_config(
            os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
        if 'characters' in config.keys():
            symbols, phonemes = make_symbols(**config.characters)

        num_chars = len(phonemes) if config.use_phonemes else len(symbols)
        model = setup_model(num_chars, 0, config)
        output_path = get_tests_output_path()
        save_checkpoint(model, None, 10, 10, 1, output_path)
Example #6
    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        # pylint: disable=global-statement
        global symbols, phonemes

        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)

        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)

        if 'characters' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)

        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # TODO: fix this for multi-speaker model - load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp:
            self.tts_model.decoder.set_r(cp['r'])
            print(f" > model reduction factor: {cp['r']}")
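
Judging only by the keys this loader reads, a compatible checkpoint looks roughly like the sketch below ('model' is required, 'r' is optional):

    # minimal sketch of the expected checkpoint layout; 'r' (the reduction
    # factor) is optional and only applied when present
    checkpoint = {'model': tts_model.state_dict(), 'r': 2}
    torch.save(checkpoint, 'tts_checkpoint.pth.tar')
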
Example #7
def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of spectrogram features.")
    parser.add_argument(
        "--config_path",
        type=str,
        required=True,
        help="TTS config file path to define audio processing parameters.")
    parser.add_argument("--out_path",
                        required=True,
                        type=str,
                        help="directory to save the output file.")
    args = parser.parse_args()

    # load config
    CONFIG = load_config(args.config_path)
    CONFIG.audio['signal_norm'] = False  # do not apply earlier normalization
    CONFIG.audio['stats_path'] = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio)

    # load the meta data of target dataset
    dataset_items = load_meta_data(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0
    for item in tqdm(dataset_items):
        # compute features
        wav = ap.load_wav(item[1])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # compute stats
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel**2).sum(axis=1)
        linear_square_sum += (linear**2).sum(axis=1)

    mel_mean = mel_sum / N
    mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)

    output_file_path = os.path.join(args.out_path, "scale_stats.npy")
    stats = {}
    stats['mel_mean'] = mel_mean
    stats['mel_std'] = mel_scale
    stats['linear_mean'] = linear_mean
    stats['linear_std'] = linear_scale

    print(f' > Avg mel spec mean: {mel_mean.mean()}')
    print(f' > Avg mel spec scale: {mel_scale.mean()}')
    print(f' > Avg linear spec mean: {linear_mean.mean()}')
    print(f' > Avg linear spec scale: {linear_scale.mean()}')

    # set default config values for mean-var scaling
    CONFIG.audio['stats_path'] = output_file_path
    CONFIG.audio['signal_norm'] = True
    # remove redundant values
    del CONFIG.audio['max_norm']
    del CONFIG.audio['min_level_db']
    del CONFIG.audio['symmetric_norm']
    del CONFIG.audio['clip_norm']
    stats['audio_config'] = CONFIG.audio
    np.save(output_file_path, stats, allow_pickle=True)
    print(f' > scale_stats.npy is saved to {output_file_path}')
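
Reading the stats file back needs only standard numpy; np.save wraps the dict in a 0-d object array, so .item() recovers it:

    # load the saved stats dict back
    stats = np.load('scale_stats.npy', allow_pickle=True).item()
    print(sorted(stats.keys()))
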
Example #8
                        default="",
                        help='DISTRIBUTED: process group id.')
    args = parser.parse_args()

    if args.continue_path != '':
        args.output_path = args.continue_path
        args.config_path = os.path.join(args.continue_path, 'config.json')
        list_of_files = glob.glob(
            args.continue_path +
            "/*.pth.tar")  # '*' matches every checkpoint; use e.g. '*.csv' for a specific format
        latest_model_file = max(list_of_files, key=os.path.getctime)
        args.restore_path = latest_model_file
        print(f" > Training continues for {args.restore_path}")

    # setup output paths and read configs
    c = load_config(args.config_path)
    # check_config(c)
    _ = os.path.dirname(os.path.realpath(__file__))

    OUT_PATH = args.continue_path
    if args.continue_path == '':
        OUT_PATH = create_experiment_folder(c.output_path, c.run_name,
                                            args.debug)

    AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')

    c_logger = ConsoleLogger()

    if args.rank == 0:
        os.makedirs(AUDIO_PATH, exist_ok=True)
        new_fields = {}
Example #9
import os
import unittest

import tensorflow as tf
import torch

from tests import get_tests_input_path

from mozilla_voice_tts.tts.tf.models.tacotron2 import Tacotron2
from mozilla_voice_tts.tts.tf.utils.tflite import (convert_tacotron2_to_tflite,
                                                   load_tflite_model)
from mozilla_voice_tts.utils.io import load_config

tf.get_logger().setLevel('INFO')

#pylint: disable=unused-variable

torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))


class TacotronTFTrainTest(unittest.TestCase):
    @staticmethod
    def generate_dummy_inputs():
        chars_seq = torch.randint(0, 24, (8, 128)).long().to(device)
        chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device)
        chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0]
        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
        mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
        stop_targets = torch.zeros(8, 30, 1).float().to(device)
        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)

        chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy())
Example #10
import os
import unittest

import torch as T
from tests import get_tests_input_path

from mozilla_voice_tts.speaker_encoder.losses import GE2ELoss, AngleProtoLoss
from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
from mozilla_voice_tts.utils.io import load_config

file_path = get_tests_input_path()
c = load_config(os.path.join(file_path, "test_config.json"))


class SpeakerEncoderTests(unittest.TestCase):
    # pylint: disable=R0201
    def test_in_out(self):
        dummy_input = T.rand(4, 20, 80)  # B x T x D
        dummy_hidden = [T.rand(2, 4, 128), T.rand(2, 4, 128)]
        model = SpeakerEncoder(input_dim=80,
                               proj_dim=256,
                               lstm_dim=768,
                               num_lstm_layers=3)
        # computing d vectors
        output = model.forward(dummy_input)
        assert output.shape[0] == 4
        assert output.shape[1] == 256
        output = model.inference(dummy_input)
        assert output.shape[0] == 4
        assert output.shape[1] == 256
        # compute d vectors by passing LSTM hidden
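
As a follow-up that is not part of the original test, d-vectors are typically compared with cosine similarity; a quick sketch in plain torch:

    # not in the original test: compare two d-vectors by cosine similarity
    import torch.nn.functional as F
    similarity = F.cosine_similarity(output[0:1], output[1:2])
    print(similarity.item())
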
Example #11
parser = argparse.ArgumentParser()
parser.add_argument('--torch_model_path',
                    type=str,
                    help='Path to target torch model to be converted to TF.')
parser.add_argument('--config_path',
                    type=str,
                    help='Path to config file of torch model.')
parser.add_argument(
    '--output_path',
    type=str,
    help='path to output file including file name to save TF model.')
args = parser.parse_args()

# load model config
config_path = args.config_path
c = load_config(config_path)
num_speakers = 0

# init torch model
model = setup_generator(c)
checkpoint = torch.load(args.torch_model_path,
                        map_location=torch.device('cpu'))
state_dict = checkpoint['model']
model.load_state_dict(state_dict)
model.remove_weight_norm()
state_dict = model.state_dict()

# init tf model
model_tf = setup_tf_generator(c)

common_sufix = '/.ATTRIBUTES/VARIABLE_VALUE'
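
An illustrative sanity check (not from the original script), assuming model_tf is a Keras-style model exposing .weights: before mapping torch tensors onto TF variables, verify the parameter counts line up.

    # illustrative only: the real conversion maps each torch key to a TF variable
    print(f"torch params: {len(state_dict)}, tf weights: {len(model_tf.weights)}")
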
Example #12
                        help="JSON file for multi-speaker model.",
                        default="")
    parser.add_argument(
        '--speaker_fileid',
        type=str,
        help="If CONFIG.use_external_speaker_embedding_file is true, the name of the speaker embedding reference file in speakers.json; otherwise the target speaker_fileid for a multi-speaker model.",
        default=None)
    parser.add_argument(
        '--gst_style',
        help="Wav file path for GST style reference.",
        default=None)

    args = parser.parse_args()

    # load the config
    C = load_config(args.config_path)
    C.forward_attn_mask = True

    # load the audio processor
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if 'characters' in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    speaker_embedding = None
    speaker_embedding_dim = None
    num_speakers = 0

    # load speakers
    if args.speakers_json != '':