Code example #1
File: talknet.py  Project: silencelearner/NeMo
    def generate_spectrogram(self, tokens: torch.Tensor,
                             **kwargs) -> torch.Tensor:
        assert hasattr(self, '_durs_model') and hasattr(self, '_pitch_model')

        if self.blanking:
            # Interleave blank tokens around each input token.
            tokens = [
                AudioToCharWithDursF0Dataset.interleave(
                    x=torch.empty(len(t) + 1,
                                  dtype=torch.long,
                                  device=t.device).fill_(self.vocab.blank),
                    y=t,
                ) for t in tokens
            ]
            tokens = AudioToCharWithDursF0Dataset.merge(tokens,
                                                        value=self.vocab.pad,
                                                        dtype=torch.long)

        text_len = torch.tensor(tokens.shape[-1],
                                dtype=torch.long).unsqueeze(0)
        durs = self._durs_model(tokens, text_len)
        # Invert the log(1 + d) encoding used by the duration model and
        # clamp to non-negative integer frame counts.
        durs = durs.exp() - 1
        durs[durs < 0.0] = 0.0
        durs = durs.round().long()

        # Pitch: predict a silence mask and normalized F0, denormalize with
        # the pitch model's statistics, and zero F0 in silent regions.
        f0_sil, f0_body = self._pitch_model(tokens, text_len, durs)
        sil_mask = f0_sil.sigmoid() > 0.5
        f0 = f0_body * self._pitch_model.f0_std + self._pitch_model.f0_mean
        f0 = (~sil_mask * f0).float()

        # Spectrogram: condition the main model on tokens, durations, and F0.
        mel = self(tokens, text_len, durs, f0)

        return mel
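A hedged usage sketch of the method above: it assumes the surrounding class is exposed through NeMo's SpectrogramGenerator base (which provides from_pretrained and parse), and the checkpoint name is purely illustrative.

from nemo.collections.tts.models.base import SpectrogramGenerator

# Minimal sketch, not verbatim project code; the checkpoint name and the
# exact parse() signature are assumptions.
model = SpectrogramGenerator.from_pretrained("tts_en_talknet")  # hypothetical name
tokens = model.parse("Hello world.")              # [1, N] token ids
spec = model.generate_spectrogram(tokens=tokens)  # [1, n_mels, T] mel spectrogram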
Code example #2
    def __init__(self, cfg: DictConfig, trainer: Trainer = None):
        if isinstance(cfg, dict):
            cfg = OmegaConf.create(cfg)

        self.learn_alignment = False
        if "learn_alignment" in cfg:
            self.learn_alignment = cfg.learn_alignment
        self._parser = None
        self._tb_logger = None
        super().__init__(cfg=cfg, trainer=trainer)

        schema = OmegaConf.structured(FastPitchConfig)
        # ModelPT ensures that cfg is a DictConfig, but do this second check in case ModelPT changes
        if isinstance(cfg, dict):
            cfg = OmegaConf.create(cfg)
        elif not isinstance(cfg, DictConfig):
            raise ValueError(
                f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig"
            )
        # Ensure passed cfg is compliant with schema
        OmegaConf.merge(cfg, schema)

        self.bin_loss_warmup_epochs = 100
        self.aligner = None
        self.log_train_images = False
        self.mel_loss = MelLoss()
        loss_scale = 0.1 if self.learn_alignment else 1.0
        self.pitch_loss = PitchLoss(loss_scale=loss_scale)
        self.duration_loss = DurationLoss(loss_scale=loss_scale)
        input_fft_kwargs = {}
        if self.learn_alignment:
            self.aligner = instantiate(self._cfg.alignment_module)
            self.forward_sum_loss = ForwardSumLoss()
            self.bin_loss = BinLoss()
            self.vocab = AudioToCharWithDursF0Dataset.make_vocab(
                **self._cfg.train_ds.dataset.vocab)
            input_fft_kwargs["n_embed"] = len(self.vocab.labels)
            input_fft_kwargs["padding_idx"] = self.vocab.pad

        self.preprocessor = instantiate(self._cfg.preprocessor)

        input_fft = instantiate(self._cfg.input_fft, **input_fft_kwargs)
        output_fft = instantiate(self._cfg.output_fft)
        duration_predictor = instantiate(self._cfg.duration_predictor)
        pitch_predictor = instantiate(self._cfg.pitch_predictor)

        self.fastpitch = FastPitchModule(
            input_fft,
            output_fft,
            duration_predictor,
            pitch_predictor,
            self.aligner,
            cfg.n_speakers,
            cfg.symbols_embedding_dim,
            cfg.pitch_embedding_kernel_size,
            cfg.n_mel_channels,
        )
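The OmegaConf.merge(cfg, schema) call above is used purely for its validation side effect. A self-contained sketch of the idea, with an illustrative dataclass standing in for the real FastPitchConfig (the sketch merges the config into the schema, the direction OmegaConf's docs use for validation):

from dataclasses import dataclass
from omegaconf import OmegaConf

@dataclass
class ToyConfig:  # illustrative stand-in for FastPitchConfig
    n_mel_channels: int = 80
    symbols_embedding_dim: int = 384

schema = OmegaConf.structured(ToyConfig)
cfg = OmegaConf.create({"n_mel_channels": 80, "symbols_embedding_dim": 384})
validated = OmegaConf.merge(schema, cfg)  # raises on unknown keys or bad types
print(validated.symbols_embedding_dim)    # 384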
Code example #3
File: talknet.py  Project: sycomix/NeMo
    def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
        super().__init__(cfg=cfg, trainer=trainer)

        cfg = self._cfg
        self.vocab = AudioToCharWithDursF0Dataset.make_vocab(
            **cfg.train_ds.dataset.vocab)
        self.embed = nn.Embedding(len(self.vocab.labels), cfg.d_char)
        self.encoder = instantiate(cfg.encoder)
        d_out = cfg.encoder.jasper[-1].filters
        self.proj = nn.Conv1d(d_out, 1, kernel_size=1)
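The trailing nn.Conv1d with kernel_size=1 acts as a per-timestep linear projection from the encoder's channel dimension down to a single duration logit per token. A toy shape check with made-up dimensions:

import torch
import torch.nn as nn

d_out, T = 256, 42                 # toy dimensions, not from a NeMo config
proj = nn.Conv1d(d_out, 1, kernel_size=1)
feats = torch.randn(2, d_out, T)   # [B, d_out, T] encoder output
out = proj(feats)                  # [B, 1, T]: one scalar per timestep
assert out.shape == (2, 1, T)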
Code example #4
File: talknet.py  Project: researchase/NeMo
    def __init__(self, cfg: DictConfig):
        super().__init__(cfg=cfg)
        typecheck.set_typecheck_enabled(enabled=False)

        cfg = self._cfg
        self.vocab = AudioToCharWithDursF0Dataset.make_vocab(**cfg.train_ds.dataset.vocab)
        self.embed = nn.Embedding(len(self.vocab.labels), cfg.d_char)
        self.model = instantiate(cfg.model)
        d_out = cfg.model.jasper[-1].filters
        self.proj = nn.Conv1d(d_out, 1, kernel_size=1)
Code example #5
File: talknet.py  Project: sycomix/NeMo
    def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
        super().__init__(cfg=cfg, trainer=trainer)

        cfg = self._cfg
        self.vocab = AudioToCharWithDursF0Dataset.make_vocab(
            **cfg.train_ds.dataset.vocab)
        self.embed = GaussianEmbedding(self.vocab, cfg.d_char)
        self.encoder = instantiate(cfg.encoder)
        d_out = cfg.encoder.jasper[-1].filters
        self.sil_proj = nn.Conv1d(d_out, 1, kernel_size=1)
        self.body_proj = nn.Conv1d(d_out, 1, kernel_size=1)
        self.f0_mean, self.f0_std = cfg.f0_mean, cfg.f0_std
Code example #6
File: talknet.py  Project: researchase/NeMo
    def __init__(self, cfg: DictConfig):
        super().__init__(cfg=cfg)
        typecheck.set_typecheck_enabled(enabled=False)

        cfg = self._cfg
        self.vocab = AudioToCharWithDursF0Dataset.make_vocab(**cfg.train_ds.dataset.vocab)
        self.preprocessor = instantiate(cfg.preprocessor)
        self.embed = GaussianEmbedding(self.vocab, cfg.d_char)
        self.norm_f0 = MaskedInstanceNorm1d(1)
        self.res_f0 = StyleResidual(cfg.d_char, 1, kernel_size=3)
        self.model = instantiate(cfg.model)
        d_out = cfg.model.jasper[-1].filters
        self.proj = nn.Conv1d(d_out, cfg.n_mels, kernel_size=1)
Code example #7
File: talknet.py  Project: sycomix/NeMo
    def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
        super().__init__(cfg=cfg, trainer=trainer)

        cfg = self._cfg
        self.vocab = AudioToCharWithDursF0Dataset.make_vocab(
            **cfg.train_ds.dataset.vocab)
        self.blanking = cfg.train_ds.dataset.blanking
        self.preprocessor = instantiate(cfg.preprocessor)
        self.embed = GaussianEmbedding(self.vocab, cfg.d_char)
        self.norm_f0 = MaskedInstanceNorm1d(1)
        self.res_f0 = StyleResidual(cfg.d_char, 1, kernel_size=3)
        self.encoder = instantiate(cfg.encoder)
        d_out = cfg.encoder.jasper[-1].filters
        self.proj = nn.Conv1d(d_out, cfg.n_mels, kernel_size=1)
Code example #8
File: aligner.py  Project: silencelearner/NeMo
    def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
        super().__init__(cfg=cfg, trainer=trainer)
        typecheck.set_typecheck_enabled(enabled=False)

        cfg = self._cfg
        self.vocab = AudioToCharWithDursF0Dataset.make_vocab(**cfg.train_ds.dataset.vocab)
        self.embed = nn.Embedding(len(self.vocab.labels), cfg.d_char)
        self.preprocessor = instantiate(cfg.preprocessor)
        self.alignment_encoder = instantiate(cfg.alignment_encoder)

        self.forward_sum_loss = ForwardSumLoss()
        self.bin_loss = BinLoss()

        self.bin_start_ratio = cfg.bin_start_ratio
        self.add_bin_loss = False
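add_bin_loss starts out disabled, and bin_start_ratio reads like the fraction of training after which the binarization loss is switched on. A hedged sketch of how such a switch could be wired into a Lightning hook (the hook name and arithmetic below are assumptions, not this project's code):

    def on_train_epoch_start(self):
        # Hypothetical: enable the bin loss once the configured fraction
        # of max_epochs has elapsed.
        if self.current_epoch >= self.bin_start_ratio * self.trainer.max_epochs:
            self.add_bin_loss = True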
Code example #9
    def parser(self):
        if self._parser is not None:
            return self._parser

        if self.learn_alignment:
            vocab = AudioToCharWithDursF0Dataset.make_vocab(
                **self._cfg.train_ds.dataset.vocab)
            self._parser = vocab.encode
        else:
            self._parser = parsers.make_parser(
                labels=self._cfg.labels,
                name='en',
                unk_id=-1,
                blank_id=-1,
                do_normalize=True,
                abbreviation_version="fastpitch",
                make_table=False,
            )
        return self._parser
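Either branch caches and returns a callable from raw text to token ids, so callers stay agnostic of whether a vocabulary encoder or a character parser sits underneath. A hedged usage sketch (model stands for an already constructed instance; if the full source decorates parser as a @property, drop the call parentheses):

import torch

encode = model.parser()            # text -> list of token ids
token_ids = encode("Hello world.")
tokens = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)  # [1, N] batch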
Code example #10
File: talknet.py  Project: askaydevs/ITN_Phore
    def forward(self, text, durs):
        """See base class."""
        # Fake padding
        text = F.pad(text, [0, 2, 0, 0], value=self.pad)
        durs = F.pad(durs, [0, 2, 0, 0], value=0)

        repeats = AudioToCharWithDursF0Dataset.repeat_merge(text, durs, self.pad)
        total_time = repeats.shape[-1]

        # Centroids: [B,T,N]
        c = (durs / 2.0) + F.pad(torch.cumsum(durs, dim=-1)[:, :-1], [1, 0, 0, 0], value=0)
        c = c.unsqueeze(1).repeat(1, total_time, 1)

        # Sigmas: [B,T,N]
        sigmas = durs
        sigmas = sigmas.float() / self.sigma_c
        sigmas = sigmas.unsqueeze(1).repeat(1, total_time, 1) + self.EPS
        assert c.shape == sigmas.shape

        # Times at indexes
        t = torch.arange(total_time, device=c.device).view(1, -1, 1).repeat(durs.shape[0], 1, durs.shape[-1]).float()
        t = t + 0.5

        ns = slice(None)
        if self.merge_blanks:
            ns = slice(1, None, 2)

        # Weights: [B,T,N]
        d = torch.distributions.normal.Normal(c, sigmas)
        w = d.log_prob(t).exp()[:, :, ns]  # [B,T,N]
        pad_mask = (text == self.pad)[:, ns].unsqueeze(1).repeat(1, total_time, 1)
        w.masked_fill_(pad_mask, 0.0)  # noqa
        w = w / (w.sum(-1, keepdim=True) + self.EPS)
        pad_mask = (repeats == self.pad).unsqueeze(-1).repeat(1, 1, text[:, ns].size(1))  # noqa
        w.masked_fill_(pad_mask, 0.0)  # noqa
        pad_mask[:, :, :-1] = False
        w.masked_fill_(pad_mask, 1.0)  # noqa

        # Embeds
        u = torch.bmm(w, self.embed(text)[:, ns, :])  # [B,T,E]

        return u
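The heart of this forward pass is soft, differentiable upsampling: every output frame attends to every token with a Gaussian weight centered on that token's span, rather than hard-repeating embeddings. A toy, self-contained illustration of just the weight computation (the durations, sigma scaling, and frame count are made up):

import torch
import torch.nn.functional as F

durs = torch.tensor([[2.0, 3.0, 1.0]])  # 3 tokens over 6 total frames
centers = durs / 2.0 + torch.cumsum(F.pad(durs[:, :-1], (1, 0)), dim=-1)
t = torch.arange(6).float().view(1, -1, 1) + 0.5  # frame midpoints [B, T, 1]
dist = torch.distributions.Normal(centers.unsqueeze(1), durs.unsqueeze(1) / 10.0 + 1e-5)
w = dist.log_prob(t).exp()                # unnormalized weights [B, T, N]
w = w / (w.sum(-1, keepdim=True) + 1e-5)  # each frame's row sums to ~1
print(w.shape)                            # torch.Size([1, 6, 3])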
Code example #11
File: fastpitch.py  Project: quuhua911/NeMo
    def __init__(self, cfg: DictConfig, trainer: Trainer = None):
        # Convert to Hydra 1.0 compatible DictConfig
        cfg = model_utils.convert_model_config_to_dict_config(cfg)
        cfg = model_utils.maybe_update_config_version(cfg)

        # Set up the normalizer
        self.normalizer = None
        self.text_normalizer_call = None
        self.text_normalizer_call_kwargs = {}
        self._setup_normalizer(cfg)

        self.learn_alignment = cfg.get("learn_alignment", False)

        # Set up the vocabulary (tokenizer) and input_fft_kwargs (supported only with self.learn_alignment=True)
        input_fft_kwargs = {}
        if self.learn_alignment:
            self.vocab = None
            self.ds_class_name = cfg.train_ds.dataset._target_.split(".")[-1]

            if self.ds_class_name == "AudioToCharWithPriorAndPitchDataset":
                self.vocab = AudioToCharWithDursF0Dataset.make_vocab(
                    **cfg.train_ds.dataset.vocab)
                input_fft_kwargs["n_embed"] = len(self.vocab.labels)
                input_fft_kwargs["padding_idx"] = self.vocab.pad
            elif self.ds_class_name == "TTSDataset":
                self._setup_tokenizer(cfg)
                assert self.vocab is not None
                input_fft_kwargs["n_embed"] = len(self.vocab.tokens)
                input_fft_kwargs["padding_idx"] = self.vocab.pad
            else:
                raise ValueError(
                    f"Unknown dataset class: {self.ds_class_name}")

        self._parser = None
        self._tb_logger = None
        super().__init__(cfg=cfg, trainer=trainer)

        self.bin_loss_warmup_epochs = cfg.get("bin_loss_warmup_epochs", 100)
        self.log_train_images = False

        loss_scale = 0.1 if self.learn_alignment else 1.0
        dur_loss_scale = loss_scale
        pitch_loss_scale = loss_scale
        if "dur_loss_scale" in cfg:
            dur_loss_scale = cfg.dur_loss_scale
        if "pitch_loss_scale" in cfg:
            pitch_loss_scale = cfg.pitch_loss_scale

        self.mel_loss = MelLoss()
        self.pitch_loss = PitchLoss(loss_scale=pitch_loss_scale)
        self.duration_loss = DurationLoss(loss_scale=dur_loss_scale)

        self.aligner = None
        if self.learn_alignment:
            self.aligner = instantiate(self._cfg.alignment_module)
            self.forward_sum_loss = ForwardSumLoss()
            self.bin_loss = BinLoss()

        self.preprocessor = instantiate(self._cfg.preprocessor)
        input_fft = instantiate(self._cfg.input_fft, **input_fft_kwargs)
        output_fft = instantiate(self._cfg.output_fft)
        duration_predictor = instantiate(self._cfg.duration_predictor)
        pitch_predictor = instantiate(self._cfg.pitch_predictor)

        self.fastpitch = FastPitchModule(
            input_fft,
            output_fft,
            duration_predictor,
            pitch_predictor,
            self.aligner,
            cfg.n_speakers,
            cfg.symbols_embedding_dim,
            cfg.pitch_embedding_kernel_size,
            cfg.n_mel_channels,
        )
        self._input_types = self._output_types = None
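For completeness, a hedged end-to-end sketch of constructing and running this model; the YAML filename below is an assumption (NeMo ships FastPitch configs under examples/tts/conf), and parse/generate_spectrogram come from the SpectrogramGenerator base class.

from omegaconf import OmegaConf

cfg = OmegaConf.load("fastpitch_align.yaml")      # hypothetical config file
model = FastPitchModel(cfg=cfg.model, trainer=None)
tokens = model.parse("Hello world.")
spec = model.generate_spectrogram(tokens=tokens)  # [1, n_mel_channels, T]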