def __init__(self, cfg: DictConfig):
    """Create the vocabulary, character embedding, backbone and output projection.

    Args:
        cfg: Model configuration; forwarded unchanged to the parent constructor.
    """
    super().__init__(cfg=cfg)
    typecheck.set_typecheck_enabled(enabled=False)

    # From here on read the config held on the instance (presumably stored
    # there by the parent constructor) rather than the raw argument.
    config = self._cfg

    vocab_kwargs = config.train_ds.dataset.vocab
    self.vocab = AudioToCharWithDursF0Dataset.make_vocab(**vocab_kwargs)

    # One embedding row per vocabulary label.
    n_labels = len(self.vocab.labels)
    self.embed = nn.Embedding(num_embeddings=n_labels, embedding_dim=config.d_char)

    self.model = instantiate(config.model)

    # The projection input width is the `filters` value of the last
    # `jasper` entry in the model config; the output has a single channel.
    last_block = config.model.jasper[-1]
    backbone_channels = last_block.filters
    self.proj = nn.Conv1d(in_channels=backbone_channels, out_channels=1, kernel_size=1)
def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
    """Create the vocabulary, embedding, backbone, two output heads and f0 statistics.

    Args:
        cfg: Model configuration; forwarded unchanged to the parent constructor.
        trainer: Optional trainer object; forwarded unchanged to the parent
            constructor.
    """
    super().__init__(cfg=cfg, trainer=trainer)
    typecheck.set_typecheck_enabled(enabled=False)

    # From here on read the config held on the instance (presumably stored
    # there by the parent constructor) rather than the raw argument.
    config = self._cfg

    vocab_kwargs = config.train_ds.dataset.vocab
    self.vocab = AudioToCharWithDursF0Dataset.make_vocab(**vocab_kwargs)
    self.embed = GaussianEmbedding(self.vocab, config.d_char)

    self.model = instantiate(config.model)

    # Both heads take the `filters` value of the last `jasper` entry as
    # their input width and emit a single channel.
    backbone_channels = config.model.jasper[-1].filters
    self.sil_proj = nn.Conv1d(in_channels=backbone_channels, out_channels=1, kernel_size=1)
    self.body_proj = nn.Conv1d(in_channels=backbone_channels, out_channels=1, kernel_size=1)

    # Read both statistics before storing either, mirroring a tuple assignment.
    f0_mean = config.f0_mean
    f0_std = config.f0_std
    self.f0_mean = f0_mean
    self.f0_std = f0_std
def __init__(self, cfg: DictConfig):
    """Create the vocabulary, preprocessor, embedding, f0 modules, backbone and projection.

    Args:
        cfg: Model configuration; forwarded unchanged to the parent constructor.
    """
    super().__init__(cfg=cfg)
    typecheck.set_typecheck_enabled(enabled=False)

    # From here on read the config held on the instance (presumably stored
    # there by the parent constructor) rather than the raw argument.
    config = self._cfg

    vocab_kwargs = config.train_ds.dataset.vocab
    self.vocab = AudioToCharWithDursF0Dataset.make_vocab(**vocab_kwargs)

    self.preprocessor = instantiate(config.preprocessor)
    self.embed = GaussianEmbedding(self.vocab, config.d_char)

    # f0-related modules; argument meanings are defined by the project
    # classes themselves and are passed through exactly as configured.
    self.norm_f0 = MaskedInstanceNorm1d(1)
    self.res_f0 = StyleResidual(config.d_char, 1, kernel_size=3)

    self.model = instantiate(config.model)

    # The projection maps the `filters` value of the last `jasper` entry
    # to `n_mels` output channels.
    backbone_channels = config.model.jasper[-1].filters
    self.proj = nn.Conv1d(
        in_channels=backbone_channels,
        out_channels=config.n_mels,
        kernel_size=1,
    )
def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
    """Create the vocabulary, embedding, preprocessor, alignment encoder and losses.

    Args:
        cfg: Model configuration; forwarded unchanged to the parent constructor.
        trainer: Optional trainer object; forwarded unchanged to the parent
            constructor.
    """
    super().__init__(cfg=cfg, trainer=trainer)
    typecheck.set_typecheck_enabled(enabled=False)

    # From here on read the config held on the instance (presumably stored
    # there by the parent constructor) rather than the raw argument.
    config = self._cfg

    vocab_kwargs = config.train_ds.dataset.vocab
    self.vocab = AudioToCharWithDursF0Dataset.make_vocab(**vocab_kwargs)

    # One embedding row per vocabulary label.
    n_labels = len(self.vocab.labels)
    self.embed = nn.Embedding(num_embeddings=n_labels, embedding_dim=config.d_char)

    self.preprocessor = instantiate(config.preprocessor)
    self.alignment_encoder = instantiate(config.alignment_encoder)

    self.forward_sum_loss = ForwardSumLoss()
    self.bin_loss = BinLoss()

    # The bin-loss flag starts off; `bin_start_ratio` is stored alongside it.
    # NOTE(review): presumably the ratio decides when the flag is switched
    # on elsewhere in the class — confirm against the training code.
    self.bin_start_ratio = config.bin_start_ratio
    self.add_bin_loss = False