def __init__(self,
                 model_name: str = 'multiband_mel_gan_vctk',
                 device='cpu'):
        super().__init__()
        assert model_name in PARAMS['models'], \
            'Model name {} is not valid! Choose from {}'.format(
                model_name, str(PARAMS['models'].keys()))

        model_name_mapping = PARAMS['models'][model_name]

        self.device = device
        self.encoder = MelSpectrogram(**PARAMS['audio'][model_name])
        self.vocoder = load_model(
            download_pretrained_model(model_name_mapping)).to(device).eval()
        self.vocoder.remove_weight_norm()

        # make stat tensors
        param_key = 'vctk' if 'vctk' in model_name else 'lj'
        stats = MULTI_BAND_MEL_GAN_PARAMS[param_key]
        self.mean = torch.FloatTensor(
            stats['mean']).unsqueeze(0).unsqueeze(-1).to(device)
        self.scale = torch.FloatTensor(
            stats['scale']).unsqueeze(0).unsqueeze(-1).to(device)

        # print params
        print('Total model params: {}.'.format(self.num_params(self.vocoder)))
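
A minimal usage sketch for the wrapper above (the class name PretrainedVocoder and the (batch, n_mels, time) layout of the encoder output are assumptions, not confirmed by the snippet):

# hypothetical usage of the wrapper defined above
voc = PretrainedVocoder('multiband_mel_gan_vctk', device='cpu')
wav = torch.randn(1, 24000)                  # dummy one-second waveform
mel = voc.encoder(wav)                       # assumed (1, n_mels, T) log-mel
mel = (mel - voc.mean) / voc.scale           # normalize with the stored stats
with torch.no_grad():
    # parallel_wavegan generators take a (T, n_mels) tensor
    out = voc.vocoder.inference(mel[0].transpose(0, 1)).view(-1)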
Example 2
    def __init__(self, worker_id=1, audio_dest='audio/', audio_format='.wav'):
        self.id = worker_id
        # Model selection
        self.fs = 22050
        self.lang = "English"
        self.tag = "kan-bayashi/ljspeech_tacotron2"
        self.vocoder_tag = "ljspeech_parallel_wavegan.v1"

        # Model setup
        self.d = ModelDownloader()
        self.text2speech = Text2Speech(
            **self.d.download_and_unpack(self.tag),
            device="cpu",
            # Only for Tacotron 2
            threshold=0.5,
            minlenratio=0.0,
            maxlenratio=10.0,
            use_att_constraint=False,
            backward_window=1,
            forward_window=3,
        )
        self.vocoder = load_model(download_pretrained_model(
            self.vocoder_tag)).to("cpu").eval()

        self.text2speech.spc2wav = None
        self.vocoder.remove_weight_norm()

        self.audio_d = audio_dest
        self.audio_f = audio_format
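
A hedged sketch of a synthesis method such a worker might expose (the method name, the soundfile dependency, and the output naming are assumptions; the text2speech/vocoder calls mirror the pattern used in the later examples):

    def synthesize(self, text, fname='out'):
        import torch
        import soundfile as sf
        # acoustic model predicts a mel spectrogram; the vocoder renders it
        with torch.no_grad():
            _, c, *_ = self.text2speech(text)
            wav = self.vocoder.inference(c)
        # write the waveform into the configured destination directory
        sf.write(self.audio_d + str(self.id) + '_' + fname + self.audio_f,
                 wav.view(-1).cpu().numpy(), self.fs, "PCM_16")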
Example 3
    def __init__(
        self,
        model_file: Union[Path, str],
        config_file: Optional[Union[Path, str]] = None,
    ):
        """Initialize ParallelWaveGANPretrainedVocoder module."""
        super().__init__()
        try:
            from parallel_wavegan.utils import load_model
        except ImportError:
            logging.error(
                "`parallel_wavegan` is not installed. "
                "Please install via `pip install -U parallel_wavegan`.")
            raise
        if config_file is None:
            dirname = os.path.dirname(str(model_file))
            config_file = os.path.join(dirname, "config.yml")
        with open(config_file) as f:
            config = yaml.load(f, Loader=yaml.Loader)
        self.fs = config["sampling_rate"]
        self.vocoder = load_model(model_file, config)
        if hasattr(self.vocoder, "remove_weight_norm"):
            self.vocoder.remove_weight_norm()
        self.normalize_before = False
        if hasattr(self.vocoder, "mean"):
            self.normalize_before = True
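
A hedged sketch of the matching forward pass (recent parallel_wavegan generators accept a normalize_before flag in inference(); the exact forward signature in ESPnet may differ):

    @torch.no_grad()
    def forward(self, feats: torch.Tensor) -> torch.Tensor:
        """Generate a waveform from a (T, n_mels) feature tensor."""
        # let the vocoder apply its registered mean/scale stats when present
        return self.vocoder.inference(
            feats, normalize_before=self.normalize_before).view(-1)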
Example 4
    def __init__(self, torch_device=None):
        if torch_device is None:
            if torch.cuda.is_available():
                torch_device = 'cuda'
            else:
                torch_device = 'cpu'

        self.tacotron_file_config = path.join(TTS_WORK_DIR, TTS_CONFIG_FILE)
        self.tacotron_file_checkpoints = path.join(TTS_WORK_DIR, TTS_MODEL_WEIGHTS)

        self.vocoder_file_config = path.join(TTS_WORK_DIR, TTS_CONFIG_FILE)
        self.vocoder_file_checkpoints = path.join(VOCODER_WORK_DIR, VOCODER_MODEL_WEIGHTS)

        # Tacotron2 Loading
        self.tacotron_instance = Text2Speech(
            self.tacotron_file_config,
            self.tacotron_file_checkpoints,
            device=torch_device,
            threshold=0.5,
            minlenratio=0.0,
            maxlenratio=10.0,
            use_att_constraint=False,
            backward_window=1,
            forward_window=3
        )
        self.tacotron_instance.spc2wav = None

        # Vocoder Loading
        self.vocoder = load_model(self.vocoder_file_checkpoints)\
            .to(torch_device)\
            .eval()
        self.vocoder.remove_weight_norm()
Example 5
    def setup_model(self):
        try:
            self.model_reload_needed = False
            self.output_status("Loading nltk...")

            # setup nltk
            import nltk
            nltk.data.path.append(MODEL_DIR + '/nltk_models')
            try:
                nltk.data.find('tokenizers/punkt')
            except LookupError:
                nltk.download('punkt', download_dir=MODEL_DIR + "/nltk_models")

            self.output_status("Loading torch...", end=" ")

            # setup model
            import torch
            from espnet_model_zoo.downloader import ModelDownloader
            from espnet2.bin.tts_inference import Text2Speech
            from parallel_wavegan.utils import download_pretrained_model
            from parallel_wavegan.utils import load_model

            self.mlDevice = "cuda" if torch.cuda.is_available() else "cpu"
            self.output_status("Running on " + self.mlDevice)

            self.output_status("Loading espnet...")

            d = ModelDownloader(MODEL_DIR + "/espnet_models")
            self.text2speech = Text2Speech(
                **d.download_and_unpack(self.tag),
                device=self.mlDevice,
                # Only for Tacotron 2
                threshold=0.5,
                minlenratio=0.0,
                maxlenratio=10.0,
                use_att_constraint=False,
                backward_window=1,
                forward_window=3,
                # Only for FastSpeech & FastSpeech2
                speed_control_alpha=1.0,
            )
            self.text2speech.spc2wav = None  # Disable griffin-lim
            # NOTE: Sometimes the download fails with "Permission denied"; that
            #   is a limitation of Google Drive. Please retry after several hours.

            self.output_status("Loading vocoder models...")

            self.vocoder = load_model(
                download_pretrained_model(self.vocoder_tag,
                                          download_dir=MODEL_DIR +
                                          "/vocoder_models")).to(
                                              self.mlDevice).eval()
            self.vocoder.remove_weight_norm()
            self.output_status("Model setup completed.")
        except Exception as e:
            self.output_err("Model error", e)
            raise HandledException()
Example 6
    def load_new_model(self, checkpoint):

        # setup model
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
        self.device = device

        model = load_model(checkpoint, self.config)
        logging.info(f"Loaded model parameters from {checkpoint}.")
        model.remove_weight_norm()

        model = model.eval().to(device)
        return model
Example 7
def perform_tts(input_text):
    idim, odim, train_args = get_model_conf(model_path)
    model_class = dynamic_import(train_args.model_module)
    model = model_class(idim, odim, train_args)
    torch_load(model_path, model)
    model = model.eval().to(device)
    inference_args = Namespace(
        **{
            "threshold": 0.5,
            "minlenratio": 0.0,
            "maxlenratio": 10.0,
            # Only for Tacotron 2
            "use_attention_constraint": True,
            "backward_window": 1,
            "forward_window": 3,
            # Only for fastspeech (lower than 1.0 is faster speech, higher than 1.0 is slower speech)
            "fastspeech_alpha": 1.0,
        })

    # define neural vocoder
    fs = 22050
    vocoder = load_model(vocoder_path)
    vocoder.remove_weight_norm()
    vocoder = vocoder.eval().to(device)

    # define text frontend

    with open(dict_path) as f:
        lines = f.readlines()
    lines = [line.replace("\n", "").split(" ") for line in lines]
    char_to_id = {c: int(i) for c, i in lines}
    g2p = G2p()

    print('input : ', input_text)
    with torch.no_grad():
        start = time.time()
        x = frontend(input_text, g2p, char_to_id, idim)
        c, _, _ = model.inference(x, inference_args)
        y = vocoder.inference(c)
    rtf = (time.time() - start) / (len(y) / fs)
    print(f"RTF = {rtf:5f}")
    print(y)
    write("static/test.wav", fs, y.view(-1).cpu().numpy())
Example 8
    def __init__(self, checkpoint, config):
        # load config
        if config is None:
            dirname = os.path.dirname(checkpoint)
            config = os.path.join(dirname, "config.yml")
        with open(config) as f:
            config = yaml.load(f, Loader=yaml.Loader)
        self.config = config

        # setup model
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
        self.device = device

        model = load_model(checkpoint, config)
        logging.info(f"Loaded model parameters from {checkpoint}.")
        model.remove_weight_norm()

        self.model = model.eval().to(device)
Example 9
def test_parallel_wavegan_compatibility():
    from parallel_wavegan.utils import download_pretrained_model
    from parallel_wavegan.utils import load_model

    ckpt_path = download_pretrained_model("ljspeech_hifigan.v1")
    state_dict = torch.load(ckpt_path,
                            map_location="cpu")["model"]["generator"]
    model_pwg = load_model(ckpt_path)
    model_espnet2 = HiFiGANGenerator()
    model_espnet2.load_state_dict(state_dict)
    model_pwg.eval()
    model_espnet2.eval()

    with torch.no_grad():
        c = torch.randn(5, 80)
        out_pwg = model_pwg.inference(c)
        out_espnet2 = model_espnet2.inference(c)
        np.testing.assert_array_equal(
            out_pwg.cpu().numpy(),
            out_espnet2.cpu().numpy(),
        )
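
Exact equality holds here because both modules run identical weights through the same ops; if the implementations ever diverge in op ordering, a tolerance-based check is the usual fallback (a sketch, not part of the original test):

    np.testing.assert_allclose(
        out_pwg.cpu().numpy(),
        out_espnet2.cpu().numpy(),
        rtol=1e-5, atol=1e-6,  # loose float32 tolerances
    )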
Example 10
    def __init__(self, checkpoint, config=None):
        """
        Parameters
        ----------
            checkpoint: str, the path of model checkpoint file.
            config: str, the path of model configuration file.
        """

        # load config
        if config is None:
            dirname = os.path.dirname(checkpoint)
            config = os.path.join(dirname, "config.yml")
        with open(config) as f:
            self._config = yaml.load(f, Loader=yaml.Loader)

        # setup model
        if torch.cuda.is_available():
            self._device = torch.device("cuda")
        else:
            self._device = torch.device("cpu")
        self._model = load_model(checkpoint, self._config)
        self._model.remove_weight_norm()
        self._model = self._model.eval().to(self._device)
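
A hedged sketch of an inference method for this wrapper (the method name mel2wav is an assumption; the call follows the decode pattern used elsewhere on this page):

    def mel2wav(self, mel):
        """Convert a (T, n_mels) mel spectrogram into a waveform tensor."""
        with torch.no_grad():
            c = torch.tensor(mel, dtype=torch.float).to(self._device)
            return self._model.inference(c).view(-1)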
Example 11
    # scaler.mean_ = np.load(args.stats)[0]
    # scaler.scale_ = np.load(args.stats)[1]
else:
    raise ValueError("support only hdf5 (and normally npy - but not now) format.")
# from scikit-learn 0.23.0 on, this attribute is required before calling transform
scaler.n_features_in_ = scaler.mean_.shape[0]
mel = scaler.transform(mel)

# plt.imshow(mel)
# plt.show()

#==============================================Put it through network==================================================
# converter.output_to_wav([[mel]])
print(f"Now loading in pretrained melGAN model")
download_pretrained_model("vctk_multi_band_melgan.v2", "melgan")
model = load_model("melgan/vctk_multi_band_melgan.v2/checkpoint-1000000steps.pkl")
model.remove_weight_norm()
model = model.eval().to(device)


result = model.inference(torch.tensor(mel, dtype=torch.float).to(device)).view(-1)
# from playsound import playsound

# import pyaudio
# p = pyaudio.PyAudio()

# stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
#                 channels=wf.getnchannels(),
#                 rate=wf.getframerate(),
#                 output=True)
x, sr = sf.read(utility.get_full_path(".\\input\\p225\\p225_001.wav"))
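
A hedged continuation that writes the generated waveform next to the loaded reference (the 24 kHz rate is an assumption based on the vctk_multi_band_melgan.v2 recipe; sf is the soundfile import used above):

sf.write("generated_p225_001.wav",              # hypothetical output path
         result.cpu().numpy(), 24000, "PCM_16")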
Example 12
    from espnet_model_zoo.downloader import ModelDownloader
    import soundfile as sf
    import librosa
    import numpy as np
    import os
    import kaldiio

    d = ModelDownloader()
    # tag = 'kan-bayashi/libritts_gst+xvector_transformer'
    text2speech = Text2Speech(
        "/nolan/test/espnet/egs2/vctk/tts1/exp/tts_train_gst_fastspeech2_raw_phn_tacotron_g2p_en_no_space/config.yaml",
        "/nolan/test/espnet/egs2/vctk/tts1/exp/tts_train_gst_fastspeech2_raw_phn_tacotron_g2p_en_no_space/train.loss.best.pth",
        device="cuda")
    # text2speech.spc2wav = None
    vocoder = load_model(
        download_pretrained_model("libritts_parallel_wavegan.v1.long")).to(
            "cuda").eval()

    vocoder.remove_weight_norm()
    spembs = None
    if text2speech.use_speech:
        speech, fs = sf.read("/nolan/VCTK-Corpus/wav48/p226/p226_001.wav")
        # speech, _ = librosa.load("/nolan/VCTK-Corpus/wav48/p225/p225_001.wav", text2speech.fs)
        speech = torch.from_numpy(speech).float().cuda()
        # speech = torch.randn(50000,)

    texts = [
        "Mostly I would recommend giving a quick look to the figures beyond the introduction.",
    ]
    for i, text in enumerate(texts):
        # synthesis follows the same pattern as the other examples on this
        # page: acoustic model -> mel spectrogram -> neural vocoder
        with torch.no_grad():
            wav, c, *_ = text2speech(
                text, speech=speech if text2speech.use_speech else None,
                spembs=spembs)
            wav = vocoder.inference(c)
        # hypothetical output path
        sf.write(f"out_{i}.wav", wav.view(-1).cpu().numpy(),
                 text2speech.fs, "PCM_16")
Example 13
def main():
    """Run decoding process."""
    parser = argparse.ArgumentParser(
        description=
        "Decode dumped features with trained Parallel WaveGAN Generator "
        "(See detail in parallel_wavegan/bin/decode.py).")
    parser.add_argument("--feats-scp",
                        "--scp",
                        default=None,
                        type=str,
                        help="kaldi-style feats.scp file. "
                        "you need to specify either feats-scp or dumpdir.")
    parser.add_argument("--dumpdir",
                        default=None,
                        type=str,
                        help="directory including feature files. "
                        "you need to specify either feats-scp or dumpdir.")
    parser.add_argument("--outdir",
                        type=str,
                        required=True,
                        help="directory to save generated speech.")
    parser.add_argument("--checkpoint",
                        type=str,
                        required=True,
                        help="checkpoint file to be loaded.")
    parser.add_argument(
        "--config",
        default=None,
        type=str,
        help="yaml format configuration file. if not explicitly provided, "
        "it will be searched in the checkpoint directory. (default=None)")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning("Skip DEBUG/INFO messages")

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # load config
    if args.config is None:
        dirname = os.path.dirname(args.checkpoint)
        args.config = os.path.join(dirname, "config.yml")
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check arguments
    if (args.feats_scp is not None and args.dumpdir is not None) or \
            (args.feats_scp is None and args.dumpdir is None):
        raise ValueError("Please specify either --dumpdir or --feats-scp.")

    # get dataset
    if args.dumpdir is not None:
        if config["format"] == "hdf5":
            mel_query = "*.h5"
            mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
        elif config["format"] == "npy":
            mel_query = "*-feats.npy"
            mel_load_fn = np.load
        else:
            raise ValueError("Support only hdf5 or npy format.")
        dataset = MelDataset(
            args.dumpdir,
            mel_query=mel_query,
            mel_load_fn=mel_load_fn,
            return_utt_id=True,
        )
    else:
        dataset = MelSCPDataset(
            feats_scp=args.feats_scp,
            return_utt_id=True,
        )
    logging.info(f"The number of features to be decoded = {len(dataset)}.")

    # setup model
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model = load_model(args.checkpoint, config)
    logging.info(f"Loaded model parameters from {args.checkpoint}.")
    model.remove_weight_norm()
    model = model.eval().to(device)

    # start generation
    total_rtf = 0.0
    with torch.no_grad(), tqdm(dataset, desc="[decode]") as pbar:
        for idx, (utt_id, c) in enumerate(pbar, 1):
            # generate
            c = torch.tensor(c, dtype=torch.float).to(device)
            start = time.time()
            y = model.inference(c).view(-1)
            rtf = (time.time() - start) / (len(y) / config["sampling_rate"])
            pbar.set_postfix({"RTF": rtf})
            total_rtf += rtf

            # save as PCM 16 bit wav file
            sf.write(os.path.join(config["outdir"], f"{utt_id}_gen.wav"),
                     y.cpu().numpy(), config["sampling_rate"], "PCM_16")

    # report average RTF
    logging.info(
        f"Finished generation of {idx} utterances (RTF = {total_rtf / idx:.03f})."
    )
Example 14
import os
import torch
import soundfile as sf
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.tts_inference import Text2Speech
from parallel_wavegan.utils import download_pretrained_model
from parallel_wavegan.utils import load_model

d = ModelDownloader()
text2speech = Text2Speech(
    **d.download_and_unpack(tag),
    device="cuda",
    speed_control_alpha=1.0,
)
text2speech.spc2wav = None  # Disable griffin-lim
vocoder = load_model(download_pretrained_model(vocoder_tag)).to("cuda").eval()
vocoder.remove_weight_norm()
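
The server socket s and the sampling rate fs used below are set up earlier in the original script; a hedged sketch of that setup (host, port, and the 22050 Hz rate are assumptions):

import socket
import time

fs = 22050                                       # assumed LJSpeech sampling rate
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind(("0.0.0.0", 5000))                        # hypothetical host/port
s.listen(1)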

while True:
    conn, addr = s.accept()
    data = conn.recv(1024)
    encoding = 'utf-8'
    data = str(data, encoding)
    conn.close()
    # synthesis
    with torch.no_grad():
        start = time.time()
        wav, c, *_ = text2speech(data)
        wav = vocoder.inference(c)
    rtf = (time.time() - start) / (len(wav) / fs)
    print(f"RTF = {rtf:5f}")
Example 15
    def get_vocoder(self):
        vocoder = load_model(self.vocoder_model_path).to(self.device).eval()
        vocoder.remove_weight_norm()
        return vocoder
Example 16
File: svs.py Project: r9y9/nnsvs
    def __init__(self, model_dir, device="cpu"):
        self.device = device

        if isinstance(model_dir, str):
            model_dir = Path(model_dir)

        # search for config.yaml
        assert (model_dir / "config.yaml").exists()
        self.config = OmegaConf.load(model_dir / "config.yaml")

        # qst
        self.binary_dict, self.numeric_dict = hts.load_question_set(model_dir /
                                                                    "qst.hed")

        self.pitch_idx = len(self.binary_dict) + 1
        self.pitch_indices = np.arange(len(self.binary_dict),
                                       len(self.binary_dict) + 3)

        # Time-lag model
        self.timelag_config = OmegaConf.load(model_dir / "timelag_model.yaml")
        self.timelag_model = instantiate(self.timelag_config.netG).to(device)
        checkpoint = torch.load(
            model_dir / "timelag_model.pth",
            map_location=device,
        )
        self.timelag_model.load_state_dict(checkpoint["state_dict"])

        self.timelag_in_scaler = MinMaxScaler(
            np.load(model_dir / "in_timelag_scaler_min.npy"),
            np.load(model_dir / "in_timelag_scaler_scale.npy"),
        )
        self.timelag_out_scaler = StandardScaler(
            np.load(model_dir / "out_timelag_scaler_mean.npy"),
            np.load(model_dir / "out_timelag_scaler_var.npy"),
            np.load(model_dir / "out_timelag_scaler_scale.npy"),
        )
        self.timelag_model.eval()

        # Duration model
        self.duration_config = OmegaConf.load(model_dir /
                                              "duration_model.yaml")
        self.duration_model = instantiate(self.duration_config.netG).to(device)
        checkpoint = torch.load(
            model_dir / "duration_model.pth",
            map_location=device,
        )
        self.duration_model.load_state_dict(checkpoint["state_dict"])

        self.duration_in_scaler = MinMaxScaler(
            np.load(model_dir / "in_duration_scaler_min.npy"),
            np.load(model_dir / "in_duration_scaler_scale.npy"),
        )
        self.duration_out_scaler = StandardScaler(
            np.load(model_dir / "out_duration_scaler_mean.npy"),
            np.load(model_dir / "out_duration_scaler_var.npy"),
            np.load(model_dir / "out_duration_scaler_scale.npy"),
        )
        self.duration_model.eval()

        # Acoustic model
        self.acoustic_config = OmegaConf.load(model_dir /
                                              "acoustic_model.yaml")
        self.acoustic_model = instantiate(self.acoustic_config.netG).to(device)
        checkpoint = torch.load(
            model_dir / "acoustic_model.pth",
            map_location=device,
        )
        self.acoustic_model.load_state_dict(checkpoint["state_dict"])
        self.acoustic_in_scaler = MinMaxScaler(
            np.load(model_dir / "in_acoustic_scaler_min.npy"),
            np.load(model_dir / "in_acoustic_scaler_scale.npy"),
        )
        self.acoustic_out_scaler = StandardScaler(
            np.load(model_dir / "out_acoustic_scaler_mean.npy"),
            np.load(model_dir / "out_acoustic_scaler_var.npy"),
            np.load(model_dir / "out_acoustic_scaler_scale.npy"),
        )
        self.acoustic_model.eval()

        # Post-filter
        if (model_dir / "postfilter_model.yaml").exists():
            self.postfilter_config = OmegaConf.load(model_dir /
                                                    "postfilter_model.yaml")
            self.postfilter_model = instantiate(
                self.postfilter_config.netG).to(device)
            checkpoint = torch.load(
                model_dir / "postfilter_model.pth",
                map_location=device,
            )
            self.postfilter_model.load_state_dict(checkpoint["state_dict"])
            self.postfilter_model.eval()
            self.postfilter_out_scaler = StandardScaler(
                np.load(model_dir / "out_postfilter_scaler_mean.npy"),
                np.load(model_dir / "out_postfilter_scaler_var.npy"),
                np.load(model_dir / "out_postfilter_scaler_scale.npy"),
            )
        else:
            self.postfilter_model = None

        # Vocoder model
        if (model_dir / "vocoder_model.yaml").exists():
            if not _pwg_available:
                warn(
                    "parallel_wavegan is not installed. Vocoder model is disabled."
                )
                self.vocoder = None
            else:
                self.vocoder_config = OmegaConf.load(model_dir /
                                                     "vocoder_model.yaml")
                self.vocoder = load_model(
                    model_dir / "vocoder_model.pth",
                    config=self.vocoder_config).to(device)
                self.vocoder.eval()
                self.vocoder.remove_weight_norm()
                self.vocoder_in_scaler = StandardScaler(
                    np.load(model_dir / "in_vocoder_scaler_mean.npy"),
                    np.load(model_dir / "in_vocoder_scaler_var.npy"),
                    np.load(model_dir / "in_vocoder_scaler_scale.npy"),
                )
        else:
            self.vocoder = None
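
A hedged sketch of how the loaded vocoder and its input scaler are typically applied to predicted acoustic features (the method name _vocode is an assumption; StandardScaler here is nnsvs's sklearn-like scaler):

    def _vocode(self, acoustic_features):
        # map features into the vocoder's training distribution, then generate
        feats = self.vocoder_in_scaler.transform(acoustic_features)
        feats = torch.from_numpy(feats).float().to(self.device)
        with torch.no_grad():
            wav = self.vocoder.inference(feats).view(-1)
        return wav.cpu().numpy()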
Example 17
fs = 22050


def get_args():
    parser = argparse.ArgumentParser(
        description="", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--text", help="your text", required=True)
    print(' '.join(sys.argv))
    args = parser.parse_args()
    return args


## specify the path to the vocoder's checkpoint
vocoder_checkpoint = "exp/vocoder/checkpoint-400000steps.pkl"
vocoder = load_model(vocoder_checkpoint).to("cuda").eval()
vocoder.remove_weight_norm()

## specify the path to the main model (Transformer/Tacotron 2/FastSpeech) and its config file
config_file = "exp/tts_train_raw_char/config.yaml"
model_path = "exp/tts_train_raw_char/train.loss.ave_5best.pth"

text2speech = Text2Speech(
    config_file,
    model_path,
    device="cuda",
    # Only for Tacotron 2
    threshold=0.5,
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=True,
    backward_window=1,
    forward_window=3,
)
Example 18
    def validate_one_epoch(
        cls,
        model: torch.nn.Module,
        iterator: Iterable[Dict[str, torch.Tensor]],
        reporter: SubReporter,
        options: TrainerOptions,
        distributed_option: DistributedOption,
    ) -> None:
        assert check_argument_types()
        ngpu = options.ngpu
        no_forward_run = options.no_forward_run
        distributed = distributed_option.distributed

        model.eval()

        #############################
        ###  setup vocoder model  ###
        #############################

        print(f"options: {options}")

        if options.vocoder_checkpoint != "":
            # load config
            if options.vocoder_config == "":
                dirname = os.path.dirname(options.vocoder_checkpoint)
                print(f"dirname: {dirname}")
                options.vocoder_config = os.path.join(dirname, "config.yml")
            logging.info(f"options.vocoder_config: {options.vocoder_config}")
            with open(options.vocoder_config) as f:
                config = yaml.load(f, Loader=yaml.Loader)
            config.update(vars(options))

            model_vocoder = load_model(options.vocoder_checkpoint, config)
            logging.info(
                f"Loaded model parameters from {options.vocoder_checkpoint}.")
            # if options.normalize_before:
            # if True:
            #     assert hasattr(model_vocoder, "mean"), "Feature stats are not registered."
            #     assert hasattr(model_vocoder, "scale"), "Feature stats are not registered."
            model_vocoder.remove_weight_norm()
            model_vocoder = model_vocoder.eval().to(
                "cuda" if ngpu > 0 else "cpu")
        else:
            model_vocoder = None

        # [For distributed] Because iteration counts are not always equal between
        # processes, send a stop flag to the other processes if the iterator is finished
        iterator_stop = torch.tensor(0).to("cuda" if ngpu > 0 else "cpu")
        for (index, batch) in iterator:
            assert isinstance(batch, dict), type(batch)
            if distributed:
                torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
                if iterator_stop > 0:
                    break

            batch = to_device(batch, "cuda" if ngpu > 0 else "cpu")
            if no_forward_run:
                continue

            del_keys = [
                "pitch_aug", "pitch_aug_lengths", "time_aug",
                "time_aug_lengths"
            ]
            for key in del_keys:
                if key in batch.keys():
                    del batch[key]

            retval = model(**batch, flag_IsValid=True)
            if isinstance(retval, dict):
                stats = retval["stats"]
                weight = retval["weight"]
            else:
                # _, stats, weight = retval
                _, stats, weight, spec_predicted, spec_gt, length = retval

                # monitor spec during validation stage
                # [batch size, max length, feat dim]
                spec_predicted_denorm, _ = model.normalize.inverse(
                    spec_predicted.clone())
                spec_gt_denorm, _ = model.normalize.inverse(spec_gt.clone())

                cls.log_figure(
                    model,
                    model_vocoder,
                    index[0],
                    spec_predicted_denorm,
                    spec_gt_denorm,
                    length,
                    Path(options.output_dir) / "valid",
                )

            if ngpu > 1 or distributed:
                # Apply weighted averaging for stats.
                # if distributed, this method can also apply all_reduce()
                stats, weight = recursive_average(stats, weight, distributed)

            reporter.register(stats, weight)
            reporter.next()

        else:
            if distributed:
                iterator_stop.fill_(1)
                torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
Example 19
    device=mlDevice,
    # Only for Tacotron 2
    threshold=0.5,
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2
    speed_control_alpha=1.0,
)
text2speech.spc2wav = None  # Disable griffin-lim
# NOTE: Sometimes the download fails with "Permission denied"; that is a
#   limitation of Google Drive. Please retry after several hours.
vocoder = load_model(
    download_pretrained_model(
        vocoder_tag, download_dir='./vocoder_models')).to(mlDevice).eval()
vocoder.remove_weight_norm()

import scipy.io.wavfile as wv
import os

if os.path.isfile(out_name + ".wav"): os.remove(out_name + ".wav")

from concurrent.futures import ThreadPoolExecutor
executor = ThreadPoolExecutor(max_workers=5)


def save_wav(wav, count=-1):
    # print("Outputting wav file...")
    out_arr = wav.view(-1).cpu().numpy()
    # minimal completion of the truncated original: write the waveform,
    # assuming `fs` and `out_name` are defined earlier in the script
    wv.write(out_name + ".wav", fs, out_arr)