Beispiel #1
0
    def __init__(
        self,
        input_size: int,
        rnn_type: str = "lstm",
        bidirectional: bool = True,
        use_projection: bool = True,
        num_layers: int = 4,
        hidden_size: int = 320,
        output_size: int = 320,
        dropout: float = 0.0,
        in_channel: int = 1,
    ):
        """Assemble a ``VGG2L`` front-end followed by a recurrent stack.

        The result is stored in ``self.enc`` as a two-element
        ``torch.nn.ModuleList``: ``[VGG2L, RNNP]`` when ``use_projection``
        is true, ``[VGG2L, RNN]`` otherwise.

        Args:
            input_size: Input feature dimension; forwarded to
                ``get_vgg2l_odim`` to obtain the recurrent input size.
            rnn_type: ``"lstm"`` or ``"gru"``.
            bidirectional: If true, ``"b"`` is prepended to the type string
                handed to the recurrent module.
            use_projection: Choose ``RNNP`` (true) or ``RNN`` (false).
            num_layers: Forwarded to the recurrent module.
            hidden_size: Forwarded to the recurrent module.
            output_size: Forwarded to the recurrent module and kept in
                ``self._output_size``.
            dropout: Forwarded to the recurrent module.
            in_channel: Forwarded to ``VGG2L`` and ``get_vgg2l_odim``.

        Raises:
            ValueError: If ``rnn_type`` is neither ``"lstm"`` nor ``"gru"``.
        """
        assert check_argument_types()
        super().__init__()

        self._output_size = output_size
        self.rnn_type = rnn_type
        self.bidirectional = bidirectional
        self.use_projection = use_projection

        if rnn_type not in ("lstm", "gru"):
            raise ValueError(f"Not supported rnn_type={rnn_type}")

        # Subsample is not used for VGGRNN
        no_subsample = np.ones(num_layers + 1, dtype=np.int64)
        typ = f"b{rnn_type}" if bidirectional else rnn_type

        # Build the front-end before the recurrent stack so that modules are
        # created in the same order in both configurations.
        frontend = VGG2L(in_channel)
        rnn_idim = get_vgg2l_odim(input_size, in_channel=in_channel)

        if use_projection:
            recurrent = RNNP(
                rnn_idim,
                num_layers,
                hidden_size,
                output_size,
                no_subsample,
                dropout,
                typ=typ,
            )
        else:
            recurrent = RNN(
                rnn_idim,
                num_layers,
                hidden_size,
                output_size,
                dropout,
                typ=typ,
            )

        self.enc = torch.nn.ModuleList([frontend, recurrent])
Beispiel #2
0
    def __init__(
        self,
        etype,
        idim,
        elayers_sd,
        elayers_rec,
        eunits,
        eprojs,
        subsample,
        dropout,
        num_spkrs=2,
        in_channel=1,
    ):
        """Initialize the encoder of single-channel multi-speaker ASR.

        Three sub-module groups are created:

        * ``enc_mix``: a ``ModuleList`` holding one ``VGG2L``.
        * ``enc_sd``: ``num_spkrs`` single-element ``ModuleList`` objects,
          each holding an ``RNNP`` that receives ``subsample[:elayers_sd + 1]``.
        * ``enc_rec``: a ``ModuleList`` holding one ``RNNP`` that receives
          ``subsample[elayers_sd:]``.

        Args:
            etype: Architecture name. Only names of the form
                ``"vgg" + <rnn> + "p"`` are accepted; ``<rnn>`` is expected to
                be one of ``lstm``, ``gru``, ``blstm``, ``bgru``.
            idim: Input feature dimension, forwarded to ``get_vgg2l_odim``.
            elayers_sd: Layer count given to each ``enc_sd`` ``RNNP``; also
                the split point of ``subsample``.
            elayers_rec: Layer count given to the ``enc_rec`` ``RNNP``.
            eunits: Forwarded to every ``RNNP`` (third positional argument).
            eprojs: Forwarded to every ``RNNP`` (fourth positional argument)
                and used as the input size of the ``enc_rec`` ``RNNP``.
            subsample: Sliceable sequence split between ``enc_sd`` and
                ``enc_rec`` as described above.
            dropout: Forwarded to every ``RNNP``.
            num_spkrs: Number of entries created in ``enc_sd``.
            in_channel: Forwarded to ``VGG2L`` and ``get_vgg2l_odim``.

        Raises:
            SystemExit: If ``etype`` does not start with ``"vgg"`` or does not
                end with ``"p"`` (an error is logged first).
        """
        super(EncoderMix, self).__init__()

        # str.lstrip/rstrip remove *character sets*, not affixes:
        # "vgggrup".lstrip("vgg") would eat the leading "g" of "gru" as well.
        # Remove the literal "vgg" prefix and a single "p" suffix instead.
        typ = etype
        if typ.startswith("vgg"):
            typ = typ[len("vgg"):]
        if typ.endswith("p"):
            typ = typ[:-1]

        # Unknown RNN names are reported but construction still proceeds.
        if typ not in ["lstm", "gru", "blstm", "bgru"]:
            logging.error(
                "Error: need to specify an appropriate encoder architecture")

        # Only the "vgg...p" family is supported by this encoder.
        if not (etype.startswith("vgg") and etype.endswith("p")):
            logging.error(
                f"Error: need to specify an appropriate encoder architecture. "
                f"Illegal name {etype}")
            sys.exit()

        self.enc_mix = torch.nn.ModuleList([VGG2L(in_channel)])
        self.enc_sd = torch.nn.ModuleList([
            torch.nn.ModuleList([
                RNNP(
                    get_vgg2l_odim(idim, in_channel=in_channel),
                    elayers_sd,
                    eunits,
                    eprojs,
                    subsample[:elayers_sd + 1],
                    dropout,
                    typ=typ,
                )
            ]) for _ in range(num_spkrs)
        ])
        self.enc_rec = torch.nn.ModuleList([
            RNNP(
                eprojs,
                elayers_rec,
                eunits,
                eprojs,
                subsample[elayers_sd:],
                dropout,
                typ=typ,
            )
        ])
        # ``typ`` already carries the direction prefix ("b" or none), so do
        # not hard-code an extra "B" in the message.
        logging.info("Use CNN-VGG + " + typ.upper() + "P for encoder")

        self.num_spkrs = num_spkrs
Beispiel #3
0
    def __init__(self,
                 etype,
                 idim,
                 elayers_sd,
                 elayers_rec,
                 eunits,
                 eprojs,
                 subsample,
                 dropout,
                 num_spkrs=2,
                 in_channel=1):
        """Build the VGG + projected-RNN encoder for multi-speaker input.

        Three sub-module groups are created:

        * ``enc_mix``: a ``ModuleList`` holding one ``VGG2L``.
        * ``enc_sd``: ``num_spkrs`` single-element ``ModuleList`` objects,
          each holding an ``RNNP`` that receives ``subsample[:elayers_sd + 1]``.
        * ``enc_rec``: a ``ModuleList`` holding one ``RNNP`` that receives
          ``subsample[elayers_sd:]``.

        Args:
            etype: Architecture name. Only names starting with ``"vgg"`` and
                ending with ``"p"`` are accepted; the part in between, minus
                an optional leading ``"b"``, is expected to be ``lstm`` or
                ``gru``.
            idim: Input feature dimension, forwarded to ``get_vgg2l_odim``.
            elayers_sd: Layer count given to each ``enc_sd`` ``RNNP``; also
                the split point of ``subsample``.
            elayers_rec: Layer count given to the ``enc_rec`` ``RNNP``.
            eunits: Forwarded to every ``RNNP`` (third positional argument).
            eprojs: Forwarded to every ``RNNP`` (fourth positional argument)
                and used as the input size of the ``enc_rec`` ``RNNP``.
            subsample: Sliceable sequence split between ``enc_sd`` and
                ``enc_rec`` as described above.
            dropout: Forwarded to every ``RNNP``.
            num_spkrs: Number of entries created in ``enc_sd``.
            in_channel: Forwarded to ``VGG2L`` and ``get_vgg2l_odim``.

        Raises:
            SystemExit: If ``etype`` does not start with ``"vgg"`` or does not
                end with ``"p"`` (an error is logged first).
        """
        super(Encoder, self).__init__()

        # str.lstrip/rstrip remove *character sets*, not affixes:
        # "vgggrup".lstrip("vgg") would eat the leading "g" of "gru" as well.
        # Remove the literal "vgg" prefix, one leading "b" and one trailing
        # "p" instead.
        typ = etype
        if typ.startswith("vgg"):
            typ = typ[len("vgg"):]
        if typ.startswith("b"):
            typ = typ[1:]
        if typ.endswith("p"):
            typ = typ[:-1]
        # NOTE(review): the "b" direction prefix is dropped before ``typ`` is
        # handed to RNNP below - confirm RNNP does not rely on it.

        # Unknown RNN names are reported but construction still proceeds.
        if typ != "lstm" and typ != "gru":
            logging.error(
                "Error: need to specify an appropriate encoder architecture")

        # A "vgg..." name without the trailing "p" used to fall through
        # silently and leave enc_mix/enc_sd/enc_rec undefined; treat it like
        # any other unsupported architecture name.
        if not (etype.startswith("vgg") and etype.endswith("p")):
            logging.error(
                "Error: need to specify an appropriate encoder architecture")
            sys.exit()

        self.enc_mix = torch.nn.ModuleList([VGG2L(in_channel)])
        self.enc_sd = torch.nn.ModuleList([
            torch.nn.ModuleList([
                RNNP(get_vgg2l_odim(idim, in_channel=in_channel),
                     elayers_sd,
                     eunits,
                     eprojs,
                     subsample[:elayers_sd + 1],
                     dropout,
                     typ=typ)
            ]) for _ in range(num_spkrs)
        ])
        self.enc_rec = torch.nn.ModuleList([
            RNNP(eprojs,
                 elayers_rec,
                 eunits,
                 eprojs,
                 subsample[elayers_sd:],
                 dropout,
                 typ=typ)
        ])
        logging.info('Use CNN-VGG + B' + typ.upper() + 'P for encoder')

        self.num_spkrs = num_spkrs