Example 1
    def init_multispeaker(self, config: Coqpit, data: List = None):
        """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
        or with external `d_vectors` computed from a speaker encoder model.

        If you need a different behaviour, override this function for your model.

        Args:
            config (Coqpit): Model configuration.
            data (List, optional): Dataset items to infer number of speakers. Defaults to None.
        """
        if hasattr(config, "model_args"):
            config = config.model_args
        self.embedded_speaker_dim = 0
        # init speaker manager
        self.speaker_manager = get_speaker_manager(config, data=data)
        if config.num_speakers > 0 and self.speaker_manager.num_speakers == 0:
            self.speaker_manager.num_speakers = config.num_speakers
        self.num_speakers = self.speaker_manager.num_speakers
        # init speaker embedding layer
        if config.use_speaker_embedding and not config.use_d_vector_file:
            self.embedded_speaker_dim = config.speaker_embedding_channels
            self.emb_g = nn.Embedding(config.num_speakers,
                                      config.speaker_embedding_channels)
        # init d-vector usage
        if config.use_d_vector_file:
            self.embedded_speaker_dim = config.d_vector_dim
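
A standalone sketch of the branching above, with types.SimpleNamespace standing in for the Coqpit config; only the config fields read by the method come from the source, every other name here is illustrative.

from types import SimpleNamespace
import torch.nn as nn

def speaker_conditioning_dim(config):
    # Mirrors init_multispeaker above: a learned speaker embedding vs. external
    # d-vectors produced by a speaker encoder model.
    if config.use_speaker_embedding and not config.use_d_vector_file:
        return config.speaker_embedding_channels, nn.Embedding(
            config.num_speakers, config.speaker_embedding_channels)
    if config.use_d_vector_file:
        return config.d_vector_dim, None   # d-vectors are loaded externally
    return 0, None                         # single-speaker model

config = SimpleNamespace(num_speakers=4, use_speaker_embedding=True,
                         use_d_vector_file=False,
                         speaker_embedding_channels=256, d_vector_dim=256)
dim, emb_g = speaker_conditioning_dim(config)   # dim == 256, emb_g == Embedding(4, 256)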
Example 2
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data, speaker_manager

    # Audio processor
    ap = AudioProcessor(**c.audio)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets,
                                                     eval_split=args.eval)

    # use eval and training partitions
    meta_data = meta_data_train + meta_data_eval

    # parse speakers
    speaker_manager = get_speaker_manager(c, args, meta_data_train)

    # setup model
    model = setup_model(c)

    # restore model
    model.load_checkpoint(c, args.checkpoint_path, eval=True)

    if use_cuda:
        model.cuda()

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)
    # set r
    r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
    own_loader = setup_loader(ap, r, verbose=True)

    extract_spectrograms(
        own_loader,
        model,
        ap,
        args.output_path,
        quantized_wav=args.quantized,
        save_audio=args.save_audio,
        debug=args.debug,
        metada_name="metada.txt",
    )
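
The attributes read on args above (checkpoint_path, output_path, eval, quantized, save_audio, debug) suggest an argparse front end; a hypothetical sketch of that wiring, not taken from the source, could look like this.

import argparse

parser = argparse.ArgumentParser(description="Extract spectrograms with a trained TTS model.")
parser.add_argument("--checkpoint_path", type=str, required=True)
parser.add_argument("--output_path", type=str, required=True)
parser.add_argument("--eval", action="store_true")        # controls eval_split in load_meta_data
parser.add_argument("--quantized", action="store_true")
parser.add_argument("--save_audio", action="store_true")
parser.add_argument("--debug", action="store_true")

if __name__ == "__main__":
    main(parser.parse_args())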
Example 3
    def init_multispeaker(self, config: Coqpit, data: List = None):
        """Initialize a speaker embedding layer if needen and define expected embedding channel size for defining
        `in_channels` size of the connected layers.

        This implementation yields 3 possible outcomes:

        1. If `config.use_speaker_embedding` and `config.use_d_vector_file` are False, do nothing.
        2. If `config.use_d_vector_file` is True, set expected embedding channel size to `config.d_vector_dim` or 512.
        3. If `config.use_speaker_embedding`, initialize a speaker embedding layer with channel size of
        `config.d_vector_dim` or 512.

        You can override this function for new models.

        Args:
            config (Coqpit): Model configuration.
            data (List, optional): Dataset items to infer number of speakers. Defaults to None.
        """
        # init speaker manager
        self.speaker_manager = get_speaker_manager(config, data=data)

        # set number of speakers - if num_speakers is set in config, use it, otherwise use speaker_manager
        if data is not None or self.speaker_manager.speaker_ids:
            self.num_speakers = self.speaker_manager.num_speakers
        else:
            self.num_speakers = (config.num_speakers
                                 if "num_speakers" in config
                                 and config.num_speakers != 0 else
                                 self.speaker_manager.num_speakers)

        # set ultimate speaker embedding size
        if config.use_speaker_embedding or config.use_d_vector_file:
            self.embedded_speaker_dim = (
                config.d_vector_dim if "d_vector_dim" in config
                and config.d_vector_dim is not None else 512)
        # init speaker embedding layer
        if config.use_speaker_embedding and not config.use_d_vector_file:
            self.speaker_embedding = nn.Embedding(self.num_speakers,
                                                  self.embedded_speaker_dim)
            self.speaker_embedding.weight.data.normal_(0, 0.3)
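
Outcome 3 above amounts to a learned lookup table over speaker ids; a minimal, self-contained illustration of that lookup (the batch contents and speaker count are made up, while the 512-channel fallback and the normal_(0, 0.3) init are the ones named above).

import torch
import torch.nn as nn

num_speakers, embedded_speaker_dim = 10, 512
speaker_embedding = nn.Embedding(num_speakers, embedded_speaker_dim)
speaker_embedding.weight.data.normal_(0, 0.3)

speaker_ids = torch.tensor([3, 7])        # one speaker id per utterance in the batch
g = speaker_embedding(speaker_ids)        # [2, 512] conditioning vectors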
Example 4
    def init_multispeaker(self, config: "Coqpit", data: list = None) -> None:
        """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
        or with external `d_vectors` computed from a speaker encoder model.

        If you need a different behaviour, override this function for your model.

        Args:
            config (Coqpit): Model configuration.
            data (List, optional): Dataset items to infer number of speakers. Defaults to None.
        """
        # init speaker manager
        self.speaker_manager = get_speaker_manager(config, data=data)
        self.num_speakers = self.speaker_manager.num_speakers
        if config.use_d_vector_file:
            self.external_d_vector_dim = config.d_vector_dim
        else:
            self.external_d_vector_dim = 0
        # init speaker embedding layer
        if config.use_speaker_embedding and not config.use_d_vector_file:
            self.embedded_speaker_dim = self.c_in_channels
            self.emb_g = nn.Embedding(self.num_speakers,
                                      self.embedded_speaker_dim)
            nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
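
This variant only records the d-vector size in self.external_d_vector_dim; how such an external d-vector is typically consumed (broadcast over time and concatenated onto frame-level features) is sketched below with illustrative shapes that are not taken from the source.

import torch

d_vector_dim = 256                          # would come from config.d_vector_dim
x = torch.randn(2, 80, 120)                 # [batch, channels, frames]
d_vectors = torch.randn(2, d_vector_dim)    # one speaker-encoder d-vector per utterance
g = d_vectors.unsqueeze(-1).expand(-1, -1, x.size(-1))
x_cond = torch.cat([x, g], dim=1)           # [2, 80 + 256, 120]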
Example 5
def get_speaker_manager(config: Coqpit,
                        restore_path: str,
                        data: List,
                        out_path: str = None) -> SpeakerManager:
    # Thin wrapper: forwards to the module-level `get_speaker_manager` helper
    # (assumed to be imported from TTS.tts.utils.speakers); without that import
    # this call would recurse on itself.
    return get_speaker_manager(config, restore_path, data, out_path)