Beispiel #1
0
    def __init__(self, dim, cfg: Wav2vec_UConfig):
        """Assemble the 1-D convolutional discriminator network.

        The resulting ``self.net`` is: an embedding stage, a dropout, then
        ``cfg.discriminator_depth - 1`` hidden conv blocks followed by a final
        conv that projects down to a single output channel.

        Args:
            dim: number of input channels fed to the embedding convolution.
            cfg: configuration object; only its ``discriminator_*`` fields are
                read here.
        """
        super().__init__()

        hidden = cfg.discriminator_dim
        ksize = cfg.discriminator_kernel
        dilation = cfg.discriminator_dilation
        self.max_pool = cfg.discriminator_max_pool
        causal = cfg.discriminator_causal

        # Causal mode pads by the full receptive field minus one; otherwise
        # half the kernel is used.
        pad = ksize - 1 if causal else ksize // 2

        def build_conv(c_in, c_out, width, pad_amount=0, dilated=True):
            # Plain Conv1d, optionally wrapped in spectral norm (takes
            # precedence) or weight norm depending on the config flags.
            layer = nn.Conv1d(
                c_in,
                c_out,
                kernel_size=width,
                padding=pad_amount,
                dilation=dilation if dilated else 1,
            )
            if cfg.discriminator_spectral_norm:
                return nn.utils.spectral_norm(layer)
            if cfg.discriminator_weight_norm:
                return nn.utils.weight_norm(layer)
            return layer

        # Hidden blocks are created before the embedding stage, so parameter
        # initialisation happens in that order.
        trunk = []
        for _ in range(cfg.discriminator_depth - 1):
            trunk.append(
                nn.Sequential(
                    build_conv(hidden, hidden, ksize, pad),
                    # NOTE(review): SamePad presumably trims the surplus
                    # frames introduced by the padding - confirm.
                    SamePad(kernel_size=ksize, causal=causal),
                    nn.Dropout(cfg.discriminator_dropout),
                    nn.GELU(),
                )
            )
        # Output projection to one channel; this conv is never dilated.
        trunk.append(build_conv(hidden, 1, ksize, pad, dilated=False))
        trunk.append(SamePad(kernel_size=ksize, causal=causal))

        if cfg.discriminator_linear_emb:
            # Kernel size 1 with no padding.
            stem = [build_conv(dim, hidden, 1)]
        else:
            stem = [
                build_conv(dim, hidden, ksize, pad),
                SamePad(kernel_size=ksize, causal=causal),
            ]

        if cfg.discriminator_act_after_linear:
            stem.append(nn.GELU())

        self.net = nn.Sequential(
            *stem,
            nn.Dropout(cfg.discriminator_dropout),
            *trunk,
        )
Beispiel #2
0
    def __init__(self, args):
        """Create the positional convolution, encoder layers and final norm.

        Args:
            args: configuration namespace. Fields read directly here are
                ``dropout``, ``encoder_embed_dim``,
                ``required_seq_len_multiple``, ``conv_pos``,
                ``conv_pos_groups``, ``encoder_layers``, ``layer_norm_first``
                and ``encoder_layerdrop``; the whole object is also passed to
                ``self.build_encoder_layer``.
        """
        super().__init__()

        self.dropout = args.dropout
        self.embedding_dim = args.encoder_embed_dim
        self.required_seq_len_multiple = args.required_seq_len_multiple

        width = args.conv_pos
        channels = self.embedding_dim

        conv = nn.Conv1d(
            channels,
            channels,
            kernel_size=width,
            padding=width // 2,
            groups=args.conv_pos_groups,
        )
        # Weights ~ N(0, 4 / (kernel * channels)); bias starts at zero.
        nn.init.normal_(conv.weight, mean=0,
                        std=math.sqrt(4.0 / (width * channels)))
        nn.init.constant_(conv.bias, 0)

        # Weight normalisation is applied after the explicit initialisation.
        conv = nn.utils.weight_norm(conv, name="weight", dim=2)
        self.pos_conv = nn.Sequential(conv, SamePad(width), nn.GELU())

        encoder_layers = [
            self.build_encoder_layer(args) for _ in range(args.encoder_layers)
        ]
        self.layers = nn.ModuleList(encoder_layers)
        self.layer_norm_first = args.layer_norm_first
        self.layer_norm = LayerNorm(self.embedding_dim)
        self.layerdrop = args.encoder_layerdrop

        # Run init_bert_params over everything constructed above.
        self.apply(init_bert_params)
Beispiel #3
0
    def __init__(self, args):
        """Create the positional convolution and a stack of encoder layers
        that all use an attention module obtained from ``AttentionBuilder``.

        Args:
            args: configuration namespace. Fields read here are ``dropout``,
                ``encoder_embed_dim``, ``conv_pos``, ``conv_pos_groups``,
                ``attention_dropout``, ``clusters``, ``iterations``, ``topk``,
                ``bits``, ``hash_bias``, ``length_limit``,
                ``encoder_attention_heads``, ``attention_type``,
                ``encoder_ffn_embed_dim``, ``activation_dropout``,
                ``activation_fn``, ``layer_norm_first``, ``encoder_layers``
                and ``encoder_layerdrop``.
        """
        super().__init__()

        self.dropout = args.dropout
        self.embedding_dim = args.encoder_embed_dim

        self.pos_conv = nn.Conv1d(
            self.embedding_dim,
            self.embedding_dim,
            kernel_size=args.conv_pos,
            padding=args.conv_pos // 2,
            groups=args.conv_pos_groups,
        )
        # Weights ~ N(0, 4 / (kernel * channels)); bias starts at zero.
        dropout = 0
        std = math.sqrt(
            (4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
        nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
        nn.init.constant_(self.pos_conv.bias, 0)

        self.pos_conv = nn.utils.weight_norm(self.pos_conv,
                                             name="weight",
                                             dim=2)
        self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos),
                                      nn.GELU())

        builder = AttentionBuilder.from_kwargs(
            attention_dropout=args.attention_dropout,
            clusters=args.clusters,
            iterations=args.iterations,
            topk=args.topk,
            bits=args.bits,
            hash_bias=args.hash_bias,
            length_limit=args.length_limit,
            # Per-head dimension is a size, so it must be an int: use floor
            # division rather than true division (which yields a float).
            # NOTE(review): assumes encoder_embed_dim is divisible by
            # encoder_attention_heads - confirm against the config.
            query_dimensions=args.encoder_embed_dim //
            args.encoder_attention_heads,
        )
        # NOTE(review): a single attention object is built once and handed to
        # every layer below; confirm that sharing it is intended.
        inner_attention = builder.get(args.attention_type)

        self.layers = nn.ModuleList([
            TransformerSentenceEncoderLayer(
                inner_attention=inner_attention,
                embedding_dim=self.embedding_dim,
                ffn_embedding_dim=args.encoder_ffn_embed_dim,
                num_attention_heads=args.encoder_attention_heads,
                dropout=self.dropout,
                attention_dropout=args.attention_dropout,
                activation_dropout=args.activation_dropout,
                activation_fn=args.activation_fn,
                layer_norm_first=args.layer_norm_first,
            ) for _ in range(args.encoder_layers)
        ])

        self.layer_norm_first = args.layer_norm_first
        self.layer_norm = LayerNorm(self.embedding_dim)
        self.layerdrop = args.encoder_layerdrop

        # Run init_bert_params over everything constructed above.
        self.apply(init_bert_params)
Beispiel #4
0
    def __init__(self, args):
        """Create the positional convolution and the encoder layer stack,
        optionally wrapping each layer for activation checkpointing.

        Args:
            args: configuration namespace. Fields read here are ``dropout``,
                ``encoder_embed_dim``, ``conv_pos``, ``conv_pos_groups``,
                ``encoder_layers``, ``encoder_ffn_embed_dim``,
                ``encoder_attention_heads``, ``attention_dropout``,
                ``activation_dropout``, ``activation_fn``,
                ``layer_norm_first``, ``checkpoint_activations`` and
                ``encoder_layerdrop``.
        """
        super().__init__()

        self.dropout = args.dropout
        self.embedding_dim = args.encoder_embed_dim

        width = args.conv_pos
        channels = self.embedding_dim

        conv = nn.Conv1d(
            channels,
            channels,
            kernel_size=width,
            padding=width // 2,
            groups=args.conv_pos_groups,
        )
        # Weights ~ N(0, 4 / (kernel * channels)); bias starts at zero.
        nn.init.normal_(conv.weight, mean=0,
                        std=math.sqrt(4.0 / (width * channels)))
        nn.init.constant_(conv.bias, 0)

        conv = nn.utils.weight_norm(conv, name="weight", dim=2)
        self.pos_conv = nn.Sequential(conv, SamePad(width), nn.GELU())

        def new_layer():
            # One encoder layer; when checkpointing is enabled it is wrapped
            # by fsdp_wrap first and checkpoint_wrapper second.
            block = TransformerSentenceEncoderLayer(
                embedding_dim=self.embedding_dim,
                ffn_embedding_dim=args.encoder_ffn_embed_dim,
                num_attention_heads=args.encoder_attention_heads,
                dropout=self.dropout,
                attention_dropout=args.attention_dropout,
                activation_dropout=args.activation_dropout,
                activation_fn=args.activation_fn,
                layer_norm_first=args.layer_norm_first,
            )
            if not args.checkpoint_activations:
                return block
            return checkpoint_wrapper(fsdp_wrap(block))

        self.layers = nn.ModuleList(
            [new_layer() for _ in range(args.encoder_layers)])

        self.layer_norm_first = args.layer_norm_first
        self.layer_norm = LayerNorm(self.embedding_dim)
        self.layerdrop = args.encoder_layerdrop

        # Run init_bert_params over everything constructed above.
        self.apply(init_bert_params)
Beispiel #5
0
 def make_conv_block(e, k, g, l):
     """Return ``l`` identical conv stages chained in one ``nn.Sequential``.

     Args:
         e: channel count used for both input and output of every conv.
         k: kernel size; each conv is padded by ``k // 2``.
         g: ``groups`` argument of each conv.
         l: how many stages to stack.
     """
     stages = []
     for _ in range(l):
         conv = nn.Conv1d(e, e, kernel_size=k, padding=k // 2, groups=g)
         stages.append(
             nn.Sequential(
                 conv,
                 SamePad(k),
                 # NOTE(review): the TransposeLast pair presumably moves
                 # channels to the last axis for LayerNorm and back - confirm.
                 TransposeLast(),
                 LayerNorm(e, elementwise_affine=False),
                 TransposeLast(),
                 nn.GELU(),
             )
         )
     return nn.Sequential(*stages)
Beispiel #6
0
def make_conv_pos(e, k, g):
    """Build the weight-normalised positional convolution module.

    Args:
        e: channel count used for both input and output of the conv.
        k: kernel size; the conv is padded by ``k // 2``.
        g: ``groups`` argument of the conv.

    Returns:
        ``nn.Sequential`` of the conv, ``SamePad(k)`` and a GELU.
    """
    conv = nn.Conv1d(e, e, kernel_size=k, padding=k // 2, groups=g)

    # Weights ~ N(0, 4 / (k * e)); bias starts at zero.
    nn.init.normal_(conv.weight, mean=0, std=math.sqrt(4.0 / (k * e)))
    nn.init.constant_(conv.bias, 0)

    # Weight normalisation is applied after the explicit initialisation.
    normed = nn.utils.weight_norm(conv, name="weight", dim=2)
    return nn.Sequential(normed, SamePad(k), nn.GELU())
Beispiel #7
0
    def __init__(self, args, alway_mask=False):
        """Build the speech encoder: conv feature extractor, optional feature
        projection, convolutional positional embedding, masking settings and
        the transformer layer stack.

        Args:
            args: configuration namespace; forwarded to the parent constructor
                and to every ``TransformerEncoderLayer``. The ``speech_*``,
                ``conv_*``, ``dropout*``, ``encoder_*``, ``no_scale_feature``
                and ``feature_grad_mult`` fields are read here.
            alway_mask: stored unchanged on ``self.alway_mask``.
        """
        super().__init__(args)
        self.args = args
        self.dropout = args.dropout
        self.embedding_dim = args.encoder_embed_dim
        # Feature scale is sqrt(embed_dim) unless scaling is disabled.
        self.feat_scale = math.sqrt(args.encoder_embed_dim)
        if args.no_scale_feature:
            self.feat_scale = 1.0

        # NOTE(security): eval() executes arbitrary Python contained in the
        # config string; this is only safe when args come from a trusted
        # source. The string is evaluated once and the result reused.
        # Assumes ConvFeatureExtractionModel does not mutate the spec it
        # receives - TODO confirm.
        feature_enc_layers = eval(args.conv_feature_layers)
        subsample = ConvFeatureExtractionModel(
            conv_layers=feature_enc_layers,
            dropout=0.0,
            mode=args.speech_extractor_mode,  # default, layer_norm
            conv_bias=args.speech_conv_bias,
        )
        self.subsample = subsample

        # First entry of the last layer spec is used as the extractor's
        # output width; project only when it differs from the embedding dim.
        feat_dim = feature_enc_layers[-1][0]
        self.feat_proj = (
            nn.Linear(feat_dim, self.embedding_dim)
            if feat_dim != self.embedding_dim else None)

        self.feat_layer_norm = LayerNorm(feat_dim)

        self.embed_positions = nn.Conv1d(
            self.embedding_dim,
            self.embedding_dim,
            kernel_size=args.conv_pos,
            padding=args.conv_pos // 2,
            groups=args.conv_pos_groups,
        )
        # Weights ~ N(0, 4 / (kernel * channels)); bias starts at zero.
        std = math.sqrt(4 / (args.conv_pos * self.embedding_dim))
        nn.init.normal_(self.embed_positions.weight, mean=0, std=std)
        nn.init.constant_(self.embed_positions.bias, 0)

        self.embed_positions = nn.utils.weight_norm(self.embed_positions,
                                                    name="weight",
                                                    dim=2)
        self.embed_positions = nn.Sequential(self.embed_positions,
                                             SamePad(args.conv_pos), nn.GELU())

        # Masking settings, copied from args.
        self.mask_prob = args.speech_mask_prob
        self.mask_selection = args.speech_mask_selection
        self.mask_other = args.speech_mask_other
        self.mask_length = args.speech_mask_length
        self.no_mask_overlap = args.speech_no_mask_overlap
        self.mask_min_space = args.speech_mask_min_space

        # Channel masking settings, copied from args.
        self.mask_channel_prob = args.speech_mask_channel_prob
        self.mask_channel_selection = args.speech_mask_channel_selection
        self.mask_channel_other = args.speech_mask_channel_other
        self.mask_channel_length = args.speech_mask_channel_length
        self.no_mask_channel_overlap = args.speech_no_mask_channel_overlap
        self.mask_channel_min_space = args.speech_mask_channel_min_space

        self.dropout_input = nn.Dropout(args.dropout_input)
        self.dropout_features = nn.Dropout(args.dropout_features)

        self.feature_grad_mult = args.feature_grad_mult

        # Learnable vector of size embed_dim, initialised uniformly in [0, 1).
        self.mask_emb = nn.Parameter(
            torch.FloatTensor(args.encoder_embed_dim).uniform_())

        self.layers = nn.ModuleList([
            TransformerEncoderLayer(args) for _ in range(args.encoder_layers)
        ])
        self.layer_norm = LayerNorm(args.encoder_embed_dim)
        self.normalize_before = args.encoder_normalize_before
        self.alway_mask = alway_mask