def __init__(self, args, dictionary, audio_features=40):
        """Build a 2D-conv + Transformer speech encoder from parsed CLI ``args``.

        Args:
            args: parsed argument namespace (fairseq-style); read fields include
                ``encoder_convolutions``, ``dropout``, ``attn_2d``,
                ``distance_penalty``, ``encoder_embed_dim``, ``encoder_layers``,
                ``ctc_compress_out`` and friends.
            dictionary: target-side dictionary, forwarded to the base encoder and
                used to size the optional CTC output projection.
            audio_features (int): number of input feature channels per frame
                (e.g. 40 filterbank coefficients); used to compute the flattened
                dimension after the 2D convolutions.
        """
        super().__init__(dictionary)
        # NOTE(review): eval() on a config string — acceptable for trusted CLI
        # args, but unsafe if args ever come from untrusted input.
        convolutions = eval(args.encoder_convolutions) if args.encoder_convolutions is not None else ((512, 3),) * 2
        stride = 2
        self.dropout = args.dropout

        self.activation_fn = utils.get_activation_fn(
            activation=getattr(args, "activation_fn", "relu")
        )

        # Normalize the conv spec (fills in defaults for missing tuple entries).
        convolutions = fconv.extend_conv_spec(convolutions)
        self.convolutions = nn.ModuleList()
        in_channels = 1  # input is treated as a single-channel 2D "image" (time x features)
        for i, (out_channels, kernel_size, kernel_width) in enumerate(convolutions):
            if kernel_size % 2 == 1:
                # "same"-style padding for odd kernels; even kernels get none.
                padding = kernel_size // 2
            else:
                padding = 0
            # stride=2 halves both the time and the feature dimension per layer.
            self.convolutions.append(Conv2D(
                in_channels, out_channels, kernel_size, dropout=self.dropout, padding=padding, stride=stride))
            in_channels = out_channels
        if args.attn_2d:
            # Two 2D conv-attention layers applied after the conv stack
            # (4 presumably is the number of attention heads — confirm in ConvAttention2D).
            self.attn_2d = nn.ModuleList(
                [ConvAttention2D(out_channels, 4, dropout=self.dropout) for _ in range(2)])
        # One batch-norm per conv layer.
        self.bn = nn.ModuleList([BatchNorm(out_channels) for _ in range(len(convolutions))])

        # NOTE(review): `== True` also matches 1; this normalizes a plain boolean
        # flag to the default 'log' penalty variant.
        if args.distance_penalty == True:
            args.distance_penalty = 'log'

        # Track how the feature dimension shrinks through the conv stack:
        # with stride 2 and the padding above, each odd-kernel layer maps
        # f -> ceil(f / 2). Flattened size = reduced features * channels.
        flat_dim = audio_features
        for _ in range(len(self.convolutions)):
            flat_dim = math.ceil(flat_dim / stride)
        flat_dim *= out_channels
        self.fc3 = Linear(flat_dim, args.encoder_embed_dim)
        self.embed_positions = PositionalEmbeddingAudio(
            args.max_source_positions, args.encoder_embed_dim, 0, learned=args.encoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None
        self.layer_wise_attention = getattr(args, "layer_wise_attention", False)
        self.encoder_layerdrop = args.encoder_layerdrop

        self.layers = nn.ModuleList([])
        self.layers.extend(
            [ConvTransformerEncoderLayer(args) for _ in range(args.encoder_layers)]
        )
        self.num_layers = len(self.layers)

        if args.encoder_normalize_before:
            self.layer_norm = LayerNorm(args.encoder_embed_dim)
        else:
            self.layer_norm = None
        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(args.encoder_embed_dim)
        else:
            self.layernorm_embedding = None
        # Optional CTC-based sequence compression inside the encoder: requires
        # the matching multi-loss criterion, and projects encoder states to the
        # dictionary vocabulary at layer `ctc_encoder_layer`.
        self.ctc_compress_out = args.ctc_compress_out
        if self.ctc_compress_out:
            self.ctc_fc = nn.Linear(args.encoder_embed_dim, len(dictionary))
            assert args.criterion == "ctc_multi_loss"
            self.ctc_layer = args.ctc_encoder_layer
            self.ctc_compress_method = getattr(CTCCompressStrategy, args.ctc_compress_strategy)
# Beispiel #2
# 0
    def __init__(
        self,
        conv_layers_before=None,
        input_size=83,
        embed_dim=512,
        convolutions=((512, 3), ) * 20,
        dropout=0.1,
    ):
        """Construct a fully-convolutional (fconv) encoder over speech features.

        Args:
            conv_layers_before: optional module stack applied before the fconv
                layers (kept as-is; may be None).
            input_size (int): per-frame input feature dimension.
            embed_dim (int): embedding dimension entering/leaving the conv stack.
            convolutions: sequence of (out_channels, kernel_size[, residual])
                specs, normalized via ``extend_conv_spec``.
            dropout (float): dropout probability passed to the linear/conv layers.
        """
        super().__init__(None)  # no src dictionary
        self.dropout = dropout
        self.num_attention_layers = None
        self.conv_layers_before = conv_layers_before

        # Only project the raw features when their size differs from embed_dim.
        if input_size != embed_dim:
            self.fc0 = Linear(input_size, embed_dim, dropout=dropout)
        else:
            self.fc0 = None

        conv_specs = extend_conv_spec(convolutions)
        first_channels = conv_specs[0][0]
        self.fc1 = Linear(embed_dim, first_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        self.residuals = []

        # channel_history[-r] is the channel count r layers back, used to size
        # the projection for a residual connection that skips r layers.
        channel_history = [first_channels]
        prev_channels = first_channels
        for out_channels, kernel_width, residual in conv_specs:
            if residual == 0:
                skip_dim = out_channels
            else:
                skip_dim = channel_history[-residual]
            # Project the residual input only when the dimensions disagree.
            if skip_dim != out_channels:
                self.projections.append(Linear(skip_dim, out_channels))
            else:
                self.projections.append(None)
            # "same"-style padding for odd kernels; even kernels get none.
            pad = kernel_width // 2 if kernel_width % 2 == 1 else 0
            # 2x output channels: the conv feeds a gated linear unit downstream.
            self.convolutions.append(
                ConvTBC(
                    prev_channels,
                    out_channels * 2,
                    kernel_width,
                    dropout=dropout,
                    padding=pad,
                )
            )
            self.residuals.append(residual)
            prev_channels = out_channels
            channel_history.append(out_channels)

        self.fc2 = Linear(prev_channels, embed_dim)