Example #1
    def __init__(self,
                 kernel_size,
                 in_chan,
                 n_src,
                 bn_chan,
                 chunk_size,
                 hop_size=None,
                 mask_act="relu"):
        super(SingleDecoder, self).__init__()
        self.kernel_size = kernel_size
        self.in_chan = in_chan
        self.bn_chan = bn_chan
        self.chunk_size = chunk_size
        hop_size = hop_size if hop_size is not None else chunk_size // 2
        self.hop_size = hop_size
        self.n_src = n_src
        self.mask_act = mask_act

        # Masking in 3D space
        net_out_conv = nn.Conv2d(bn_chan, n_src * bn_chan, 1)
        self.first_out = nn.Sequential(nn.PReLU(), net_out_conv)
        # Gating and masking in 2D space (after fold)
        self.net_out = nn.Sequential(nn.Conv1d(bn_chan, bn_chan, 1), nn.Tanh())
        self.net_gate = nn.Sequential(nn.Conv1d(bn_chan, bn_chan, 1),
                                      nn.Sigmoid())
        self.mask_net = nn.Conv1d(bn_chan, in_chan, 1, bias=False)

        # Get activation function.
        mask_nl_class = activations.get(mask_act)
        # For softmax, feed the source dimension.
        if has_arg(mask_nl_class, "dim"):
            self.output_act = mask_nl_class(dim=1)
        else:
            self.output_act = mask_nl_class()

        _, self.trans_conv = make_enc_dec("free",
                                          kernel_size=kernel_size,
                                          n_filters=in_chan)
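
All four examples build their mask activation the same way: activations.get returns an activation class, and if its constructor takes a dim argument (softmax), the source dimension is passed in. Below is a minimal, self-contained sketch of that selection step using plain torch.nn and inspect instead of asteroid's activations.get / has_arg helpers; the _ACTIVATIONS table and build_mask_act name are only illustrative.

import inspect

import torch.nn as nn

# Illustrative stand-in for asteroid's activations registry (not the real table).
_ACTIVATIONS = {"relu": nn.ReLU, "sigmoid": nn.Sigmoid, "softmax": nn.Softmax}


def build_mask_act(name):
    mask_nl_class = _ACTIVATIONS[name]
    # Mirrors has_arg(mask_nl_class, "dim"): softmax needs to be told which
    # dimension indexes the sources (dim=1 in the snippets above).
    if "dim" in inspect.signature(mask_nl_class).parameters:
        return mask_nl_class(dim=1)
    return mask_nl_class()


print(build_mask_act("relu"))     # ReLU()
print(build_mask_act("softmax"))  # Softmax(dim=1)
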
Example #2
    def __init__(
        self,
        in_chan,
        n_src,
        n_heads=4,
        ff_hid=256,
        chunk_size=100,
        hop_size=None,
        n_repeats=6,
        norm_type="gLN",
        ff_activation="relu",
        mask_act="relu",
        bidirectional=True,
        dropout=0,
    ):
        super(DPTransformer, self).__init__()
        self.in_chan = in_chan
        self.n_src = n_src
        self.n_heads = n_heads
        self.ff_hid = ff_hid
        self.chunk_size = chunk_size
        hop_size = hop_size if hop_size is not None else chunk_size // 2
        self.hop_size = hop_size
        self.n_repeats = n_repeats
        self.norm_type = norm_type
        self.ff_activation = ff_activation
        self.mask_act = mask_act
        self.bidirectional = bidirectional
        self.dropout = dropout

        self.mha_in_dim = ceil(self.in_chan / self.n_heads) * self.n_heads
        if self.in_chan % self.n_heads != 0:
            warnings.warn(
                f"DPTransformer input dim ({self.in_chan}) is not a multiple of the number of "
                f"heads ({self.n_heads}). Adding extra linear layer at input to accommodate "
                f"(size [{self.in_chan} x {self.mha_in_dim}])")
            self.input_layer = nn.Linear(self.in_chan, self.mha_in_dim)
        else:
            self.input_layer = None

        self.in_norm = norms.get(norm_type)(self.mha_in_dim)
        self.ola = DualPathProcessing(self.chunk_size, self.hop_size)

        # Succession of dual-path transformer blocks (intra-chunk, then inter-chunk).
        self.layers = nn.ModuleList([])
        for _ in range(self.n_repeats):
            self.layers.append(
                nn.ModuleList([
                    ImprovedTransformedLayer(
                        self.mha_in_dim,
                        self.n_heads,
                        self.ff_hid,
                        self.dropout,
                        self.ff_activation,
                        True,  # the intra-chunk layer is always bidirectional
                        self.norm_type,
                    ),
                    ImprovedTransformedLayer(
                        self.mha_in_dim,
                        self.n_heads,
                        self.ff_hid,
                        self.dropout,
                        self.ff_activation,
                        self.bidirectional,
                        self.norm_type,
                    ),
                ]))
        net_out_conv = nn.Conv2d(self.mha_in_dim, n_src * self.in_chan, 1)
        self.first_out = nn.Sequential(nn.PReLU(), net_out_conv)
        # Gating and masking in 2D space (after fold)
        self.net_out = nn.Sequential(nn.Conv1d(self.in_chan, self.in_chan, 1),
                                     nn.Tanh())
        self.net_gate = nn.Sequential(nn.Conv1d(self.in_chan, self.in_chan, 1),
                                      nn.Sigmoid())

        # Get activation function.
        mask_nl_class = activations.get(mask_act)
        # For softmax, feed the source dimension.
        if has_arg(mask_nl_class, "dim"):
            self.output_act = mask_nl_class(dim=1)
        else:
            self.output_act = mask_nl_class()
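
The mha_in_dim handling above rounds the embedding size up to the next multiple of n_heads so multi-head attention can split it evenly, inserting an extra nn.Linear only when in_chan is not already a multiple. A standalone sketch of just that padding step; the sizes and the (batch, time, features) layout are illustrative, since the real model applies it to chunked features.

from math import ceil

import torch
import torch.nn as nn

in_chan, n_heads = 70, 4
mha_in_dim = ceil(in_chan / n_heads) * n_heads                 # 72
input_layer = nn.Linear(in_chan, mha_in_dim) if in_chan % n_heads else None

x = torch.randn(3, 100, in_chan)                               # (batch, time, features)
if input_layer is not None:
    x = input_layer(x)                                         # pad features to 72
print(mha_in_dim, x.shape)                                     # 72 torch.Size([3, 100, 72])
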
Example #3
    def __init__(
        self,
        in_chan,
        n_src,
        n_heads=4,
        ff_hid=256,
        chunk_size=100,
        hop_size=None,
        n_repeats=6,
        norm_type="gLN",
        ff_activation="relu",
        mask_act="relu",
        bidirectional=True,
        dropout=0,
    ):
        super(DPTransformer, self).__init__()
        self.in_chan = in_chan
        self.n_src = n_src
        self.n_heads = n_heads
        self.ff_hid = ff_hid
        self.chunk_size = chunk_size
        hop_size = hop_size if hop_size is not None else chunk_size // 2
        self.hop_size = hop_size
        self.n_repeats = n_repeats
        self.norm_type = norm_type
        self.ff_activation = ff_activation
        self.mask_act = mask_act
        self.bidirectional = bidirectional
        self.dropout = dropout

        self.in_norm = norms.get(norm_type)(in_chan)

        # Succession of dual-path transformer blocks (intra-chunk, then inter-chunk).
        self.layers = nn.ModuleList([])
        for _ in range(self.n_repeats):
            self.layers.append(
                nn.ModuleList([
                    ImprovedTransformedLayer(
                        self.in_chan,
                        self.n_heads,
                        self.ff_hid,
                        self.dropout,
                        self.ff_activation,
                        True,  # the intra-chunk layer is always bidirectional
                        self.norm_type,
                    ),
                    ImprovedTransformedLayer(
                        self.in_chan,
                        self.n_heads,
                        self.ff_hid,
                        self.dropout,
                        self.ff_activation,
                        self.bidirectional,
                        self.norm_type,
                    ),
                ]))
        net_out_conv = nn.Conv2d(self.in_chan, n_src * self.in_chan, 1)
        self.first_out = nn.Sequential(nn.PReLU(), net_out_conv)
        # Gating and masking in 2D space (after fold)
        self.net_out = nn.Sequential(nn.Conv1d(self.in_chan, self.in_chan, 1),
                                     nn.Tanh())
        self.net_gate = nn.Sequential(nn.Conv1d(self.in_chan, self.in_chan, 1),
                                      nn.Sigmoid())

        # Get activation function.
        mask_nl_class = activations.get(mask_act)
        # For softmax, feed the source dimension.
        if has_arg(mask_nl_class, "dim"):
            self.output_act = mask_nl_class(dim=1)
        else:
            self.output_act = mask_nl_class()
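
Example #3 is the same constructor as Example #2 minus the mha_in_dim padding and the DualPathProcessing attribute, but both rely on the same chunking parameters: hop_size defaults to chunk_size // 2, i.e. 50% overlapping chunks. The rough, self-contained illustration below uses torch's unfold directly as a stand-in for DualPathProcessing; it applies no padding (so trailing frames can be dropped) and all sizes are made up.

import torch
import torch.nn.functional as F

chunk_size = 100
hop_size = chunk_size // 2                       # the default used above (50% overlap)
batch, n_feats, n_frames = 2, 64, 400

x = torch.randn(batch, n_feats, n_frames)
chunks = F.unfold(
    x.unsqueeze(-1),                             # (batch, n_feats, n_frames, 1)
    kernel_size=(chunk_size, 1),
    stride=(hop_size, 1),
)                                                # (batch, n_feats * chunk_size, n_chunks)
n_chunks = chunks.shape[-1]
chunks = chunks.reshape(batch, n_feats, chunk_size, n_chunks)
print(chunks.shape)                              # torch.Size([2, 64, 100, 7])
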
Example #4
    def __init__(
        self,
        in_chan,  # encoder out channel 64
        n_src,
        out_chan=None,
        bn_chan=64,
        n_heads=4,
        ff_hid=256,
        rnn_hid=128,
        rnn_layers=1,
        pe_conv_k=3,
        chunk_size=100,
        hop_size=None,  # 50
        n_repeats=6,  # 2
        norm_type="gLN",
        ff_activation="relu",
        mask_act="relu",  # sigmoid
        bidirectional=True,
        dropout=0,
    ):
        super(DualTransformer, self).__init__()
        self.in_chan = in_chan
        out_chan = out_chan if out_chan is not None else in_chan
        self.out_chan = out_chan
        self.bn_chan = bn_chan
        self.n_src = n_src
        self.n_heads = n_heads
        self.ff_hid = ff_hid
        self.rnn_hid = rnn_hid
        self.rnn_layers = rnn_layers
        self.chunk_size = chunk_size
        hop_size = hop_size if hop_size is not None else chunk_size // 2
        self.hop_size = hop_size
        self.n_repeats = n_repeats
        self.norm_type = norm_type
        self.ff_activation = ff_activation
        self.mask_act = mask_act
        self.bidirectional = bidirectional
        self.dropout = dropout

        # gLN vs. cLN: gLN computes the mean/variance over the channels and the
        # whole sequence (gamma/beta stay per-channel), while cLN normalizes
        # each frame separately.
        # self.in_norm = norms.get(norm_type)(in_chan)
        layer_norm = norms.get(norm_type)(in_chan)
        bottleneck_conv = nn.Conv1d(in_chan, bn_chan, 1)
        self.bottleneck = nn.Sequential(layer_norm, bottleneck_conv)

        pe_conv_list = []
        for i in range(pe_conv_k):
            pe_conv_list.append(
                nn.Conv2d(bn_chan,
                          bn_chan,
                          kernel_size=3,
                          stride=1,
                          padding=1,
                          bias=False))
            pe_conv_list.append(norms.get(norm_type)(bn_chan))
            pe_conv_list.append(activations.get(ff_activation)())
        self.pe_conv = nn.Sequential(*pe_conv_list)
        d_model = self.bn_chan

        # # *2 for PE
        # self.pe = PositionalEmbedding(in_chan)
        # d_model = self.in_chan * 2

        # Succession of dual-path blocks (SingleRNNBlock + AcousticTransformerLayer per repeat).
        self.layers = nn.ModuleList([])
        for _ in range(self.n_repeats):
            self.layers.append(
                nn.ModuleList([
                    # ImprovedTransformedLayer(
                    #     d_model,
                    #     self.n_heads,
                    #     self.ff_hid,
                    #     self.dropout,
                    #     self.ff_activation,
                    #     True,
                    #     self.norm_type,
                    # ),
                    # ImprovedTransformedLayer(
                    #     d_model,
                    #     self.n_heads,
                    #     self.ff_hid,
                    #     self.dropout,
                    #     self.ff_activation,
                    #     self.bidirectional,
                    #     self.norm_type,
                    # ),
                    SingleRNNBlock(
                        in_chan=d_model,
                        hid_size=self.rnn_hid,
                        norm_type=self.norm_type,
                        bidirectional=self.bidirectional,
                        rnn_type='LSTM',
                        num_layers=1,
                        dropout=self.dropout,
                    ),

                    # DualTransformedLayer(
                    #     d_model,
                    #     self.n_heads,
                    #     self.ff_hid,
                    #     self.dropout,
                    #     self.ff_activation,
                    #     self.norm_type,
                    # ),
                    AcousticTransformerLayer(
                        d_model,
                        self.n_heads,
                        self.ff_hid,
                        self.dropout,
                        self.ff_activation,
                        self.norm_type,
                    ),
                ]))
            # self.layers.append(
            #     nn.ModuleList(
            #         [
            #             DualTransformedLayer(
            #                 d_model,
            #                 self.n_heads,
            #                 self.ff_hid,
            #                 self.dropout,
            #                 self.ff_activation,
            #                 self.norm_type,
            #             ),
            #             DualTransformedLayer(
            #                 d_model,
            #                 self.n_heads,
            #                 self.ff_hid,
            #                 self.dropout,
            #                 self.ff_activation,
            #                 self.norm_type,
            #             ),
            #         ]
            #     )
            # )
        # 1x1 conv
        # *2 for PE
        self.strnn_norm_out = norms.get(norm_type)(self.bn_chan)
        net_out_conv = nn.Conv2d(d_model, n_src * self.bn_chan, 1)
        self.first_out = nn.Sequential(nn.PReLU(), net_out_conv)
        # Gating and masking in 2D space (after fold)
        self.net_out = nn.Sequential(nn.Conv1d(self.bn_chan, self.bn_chan, 1),
                                     nn.Tanh())
        self.net_gate = nn.Sequential(nn.Conv1d(self.bn_chan, self.bn_chan, 1),
                                      nn.Sigmoid())
        self.mask_net = nn.Conv1d(bn_chan, out_chan, 1, bias=False)

        # Get activation function.
        mask_nl_class = activations.get(mask_act)
        # For softmax, feed the source dimension.
        if has_arg(mask_nl_class, "dim"):
            self.output_act = mask_nl_class(dim=1)
        else:
            self.output_act = mask_nl_class()
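
After the dual-path blocks, every variant shown here produces masks through the same gated output stage: a Tanh branch multiplied by a Sigmoid gate, followed by a 1x1 conv and the chosen mask activation. The sketch below runs that stage on an already-folded tensor; the channel sizes and the ReLU mask activation are illustrative, not taken from a specific configuration.

import torch
import torch.nn as nn

batch, n_src, bn_chan, out_chan, n_frames = 2, 2, 64, 64, 400

net_out = nn.Sequential(nn.Conv1d(bn_chan, bn_chan, 1), nn.Tanh())
net_gate = nn.Sequential(nn.Conv1d(bn_chan, bn_chan, 1), nn.Sigmoid())
mask_net = nn.Conv1d(bn_chan, out_chan, 1, bias=False)
output_act = nn.ReLU()                           # stands in for activations.get(mask_act)

# Pretend this is the per-source representation after first_out + overlap-add,
# flattened to (batch * n_src, bn_chan, n_frames).
folded = torch.randn(batch * n_src, bn_chan, n_frames)
gated = net_out(folded) * net_gate(folded)
masks = output_act(mask_net(gated)).view(batch, n_src, out_chan, n_frames)
print(masks.shape)                               # torch.Size([2, 2, 64, 400])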