def __init__(self, dim, cfg: Wav2vec_UConfig):
    super().__init__()

    inner_dim = cfg.discriminator_dim
    kernel = cfg.discriminator_kernel
    dilation = cfg.discriminator_dilation
    self.max_pool = cfg.discriminator_max_pool

    if cfg.discriminator_causal:
        # over-pad by kernel - 1; SamePad(causal=True) trims the trailing frames
        padding = kernel - 1
    else:
        padding = kernel // 2

    def make_conv(in_d, out_d, k, p=0, has_dilation=True):
        conv = nn.Conv1d(
            in_d,
            out_d,
            kernel_size=k,
            padding=p,
            dilation=dilation if has_dilation else 1,
        )
        if cfg.discriminator_spectral_norm:
            conv = nn.utils.spectral_norm(conv)
        elif cfg.discriminator_weight_norm:
            conv = nn.utils.weight_norm(conv)
        return conv

    inner_net = [
        nn.Sequential(
            make_conv(inner_dim, inner_dim, kernel, padding),
            SamePad(kernel_size=kernel, causal=cfg.discriminator_causal),
            nn.Dropout(cfg.discriminator_dropout),
            nn.GELU(),
        )
        for _ in range(cfg.discriminator_depth - 1)
    ] + [
        # final projection to a single logit channel, without dilation
        make_conv(inner_dim, 1, kernel, padding, has_dilation=False),
        SamePad(kernel_size=kernel, causal=cfg.discriminator_causal),
    ]

    if cfg.discriminator_linear_emb:
        # 1x1 conv acts as a per-frame linear embedding
        emb_net = [make_conv(dim, inner_dim, 1)]
    else:
        emb_net = [
            make_conv(dim, inner_dim, kernel, padding),
            SamePad(kernel_size=kernel, causal=cfg.discriminator_causal),
        ]

    if cfg.discriminator_act_after_linear:
        emb_net.append(nn.GELU())

    self.net = nn.Sequential(
        *emb_net,
        nn.Dropout(cfg.discriminator_dropout),
        *inner_net,
    )
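# --- Hedged usage sketch (not from the source) -------------------------------
# The constructor above relies on a pad-then-trim trick: the causal branch pads
# by kernel - 1 on both sides and lets SamePad(causal=True) cut the trailing
# frames, so sequence length is preserved without looking at future context.
# A minimal standalone check, assuming fairseq's SamePad is importable:
import torch
import torch.nn as nn
from fairseq.modules import SamePad

k = 8
causal_conv = nn.Sequential(
    nn.Conv1d(4, 4, kernel_size=k, padding=k - 1),  # pads both sides by k - 1
    SamePad(kernel_size=k, causal=True),            # trims k - 1 trailing frames
)
x = torch.randn(1, 4, 50)                           # (batch, channels, time)
assert causal_conv(x).shape == x.shape              # length preserved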
def __init__(self, args):
    super().__init__()

    self.dropout = args.dropout
    self.embedding_dim = args.encoder_embed_dim
    self.required_seq_len_multiple = args.required_seq_len_multiple

    # Convolutional relative positional embedding (wav2vec 2.0 style).
    self.pos_conv = nn.Conv1d(
        self.embedding_dim,
        self.embedding_dim,
        kernel_size=args.conv_pos,
        padding=args.conv_pos // 2,
        groups=args.conv_pos_groups,
    )
    dropout = 0
    std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
    nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
    nn.init.constant_(self.pos_conv.bias, 0)

    self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
    self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU())

    self.layers = nn.ModuleList(
        [self.build_encoder_layer(args) for _ in range(args.encoder_layers)]
    )
    self.layer_norm_first = args.layer_norm_first
    self.layer_norm = LayerNorm(self.embedding_dim)
    self.layerdrop = args.encoder_layerdrop

    self.apply(init_bert_params)
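# --- Hedged sanity sketch (not from the source) ------------------------------
# weight_norm with dim=2 reparameterizes the positional conv per kernel
# position into direction (weight_v) and magnitude (weight_g) parameters;
# plain PyTorch is enough to see the effect:
import torch.nn as nn

conv = nn.Conv1d(16, 16, kernel_size=4, padding=2, groups=4)
conv = nn.utils.weight_norm(conv, name="weight", dim=2)
print(sorted(n for n, _ in conv.named_parameters()))
# ['bias', 'weight_g', 'weight_v']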
def __init__(self, args):
    super().__init__()

    self.dropout = args.dropout
    self.embedding_dim = args.encoder_embed_dim

    # Convolutional relative positional embedding.
    self.pos_conv = nn.Conv1d(
        self.embedding_dim,
        self.embedding_dim,
        kernel_size=args.conv_pos,
        padding=args.conv_pos // 2,
        groups=args.conv_pos_groups,
    )
    dropout = 0
    std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
    nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
    nn.init.constant_(self.pos_conv.bias, 0)

    self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
    self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU())

    # Build the inner attention module via the fast-transformers AttentionBuilder.
    builder = AttentionBuilder.from_kwargs(
        attention_dropout=args.attention_dropout,
        clusters=args.clusters,
        iterations=args.iterations,
        topk=args.topk,
        bits=args.bits,
        hash_bias=args.hash_bias,
        length_limit=args.length_limit,
        # per-head dimension must be an int, so use floor division
        query_dimensions=args.encoder_embed_dim // args.encoder_attention_heads,
    )
    inner_attention = builder.get(args.attention_type)

    self.layers = nn.ModuleList([
        TransformerSentenceEncoderLayer(
            inner_attention=inner_attention,
            embedding_dim=self.embedding_dim,
            ffn_embedding_dim=args.encoder_ffn_embed_dim,
            num_attention_heads=args.encoder_attention_heads,
            dropout=self.dropout,
            attention_dropout=args.attention_dropout,
            activation_dropout=args.activation_dropout,
            activation_fn=args.activation_fn,
            layer_norm_first=args.layer_norm_first,
        )
        for _ in range(args.encoder_layers)
    ])
    self.layer_norm_first = args.layer_norm_first
    self.layer_norm = LayerNorm(self.embedding_dim)
    self.layerdrop = args.encoder_layerdrop

    self.apply(init_bert_params)
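# --- Hedged builder sketch (not from the source) -----------------------------
# The builder call above matches the pytorch-fast-transformers API, where
# from_kwargs sets shared parameters and get() instantiates one registered
# attention type. A reduced example with hypothetical dimensions (768-dim
# model, 12 heads) and "linear" as the attention type:
from fast_transformers.builders import AttentionBuilder

builder = AttentionBuilder.from_kwargs(
    attention_dropout=0.1,
    query_dimensions=768 // 12,  # per-head size must be an int, hence // above
)
inner_attention = builder.get("linear")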
def __init__(self, args):
    super().__init__()

    self.dropout = args.dropout
    self.embedding_dim = args.encoder_embed_dim

    # Convolutional relative positional embedding.
    self.pos_conv = nn.Conv1d(
        self.embedding_dim,
        self.embedding_dim,
        kernel_size=args.conv_pos,
        padding=args.conv_pos // 2,
        groups=args.conv_pos_groups,
    )
    dropout = 0
    std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
    nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
    nn.init.constant_(self.pos_conv.bias, 0)

    self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
    self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU())

    layers = []
    for _ in range(args.encoder_layers):
        layer = TransformerSentenceEncoderLayer(
            embedding_dim=self.embedding_dim,
            ffn_embedding_dim=args.encoder_ffn_embed_dim,
            num_attention_heads=args.encoder_attention_heads,
            dropout=self.dropout,
            attention_dropout=args.attention_dropout,
            activation_dropout=args.activation_dropout,
            activation_fn=args.activation_fn,
            layer_norm_first=args.layer_norm_first,
        )
        if args.checkpoint_activations:
            # shard parameters under FSDP, then recompute activations in backward
            layer = fsdp_wrap(layer)
            layer = checkpoint_wrapper(layer)
        layers.append(layer)
    self.layers = nn.ModuleList(layers)

    self.layer_norm_first = args.layer_norm_first
    self.layer_norm = LayerNorm(self.embedding_dim)
    self.layerdrop = args.encoder_layerdrop

    self.apply(init_bert_params)
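# --- Hedged import note (not from the source) --------------------------------
# The two wrappers used above are assumed to come from fairseq's usual
# locations; both are no-ops on the layer's interface:
from fairseq.distributed import fsdp_wrap
from fairseq.modules.checkpoint_activations import checkpoint_wrapper
# fsdp_wrap shards the layer's parameters under fully-sharded data parallel;
# checkpoint_wrapper discards activations in forward and recomputes them in
# backward, trading extra compute for memory.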
def make_conv_block(e, k, g, l):
    return nn.Sequential(
        *[
            nn.Sequential(
                nn.Conv1d(
                    e,
                    e,
                    kernel_size=k,
                    padding=k // 2,
                    groups=g,
                ),
                SamePad(k),
                TransposeLast(),
                LayerNorm(e, elementwise_affine=False),
                TransposeLast(),
                nn.GELU(),
            )
            for _ in range(l)
        ]
    )
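# --- Hedged usage sketch (not from the source) -------------------------------
# make_conv_block keeps the (batch, channels, time) shape: an odd kernel is
# fully compensated by padding (SamePad then trims nothing), and TransposeLast
# moves channels last so LayerNorm normalizes over the feature dimension.
# Hypothetical sizes, assuming fairseq's SamePad/TransposeLast/LayerNorm:
import torch

block = make_conv_block(e=256, k=3, g=16, l=2)  # two conv + LN + GELU stages
x = torch.randn(4, 256, 100)                    # (batch, channels, time)
assert block(x).shape == x.shape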
def make_conv_pos(e, k, g):
    pos_conv = nn.Conv1d(
        e,
        e,
        kernel_size=k,
        padding=k // 2,
        groups=g,
    )
    dropout = 0
    std = math.sqrt((4 * (1.0 - dropout)) / (k * e))
    nn.init.normal_(pos_conv.weight, mean=0, std=std)
    nn.init.constant_(pos_conv.bias, 0)

    pos_conv = nn.utils.weight_norm(pos_conv, name="weight", dim=2)
    pos_conv = nn.Sequential(pos_conv, SamePad(k), nn.GELU())

    return pos_conv
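# --- Hedged usage sketch (not from the source) -------------------------------
# With an even kernel, padding=k // 2 leaves one extra output frame, which
# SamePad(k) trims, so the positional conv is length-preserving. Sizes follow
# the wav2vec 2.0 defaults (conv_pos=128, conv_pos_groups=16):
import torch

pos_conv = make_conv_pos(e=768, k=128, g=16)
x = torch.randn(2, 768, 100)                    # (batch, embed_dim, time)
assert pos_conv(x).shape == x.shape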
def __init__(self, args, alway_mask=False):
    super().__init__(args)

    self.args = args
    self.dropout = args.dropout
    self.embedding_dim = args.encoder_embed_dim
    self.feat_scale = math.sqrt(args.encoder_embed_dim)
    if args.no_scale_feature:
        self.feat_scale = 1.0

    # conv_feature_layers is a string like "[(dim, kernel, stride), ...]"
    feature_enc_layers = eval(args.conv_feature_layers)
    subsample = ConvFeatureExtractionModel(
        conv_layers=feature_enc_layers,
        dropout=0.0,
        mode=args.speech_extractor_mode,  # default, layer_norm
        conv_bias=args.speech_conv_bias,
    )
    self.subsample = subsample
    self.feat_proj = (
        nn.Linear(feature_enc_layers[-1][0], self.embedding_dim)
        if feature_enc_layers[-1][0] != self.embedding_dim
        else None
    )
    self.feat_layer_norm = LayerNorm(feature_enc_layers[-1][0])

    # Convolutional relative positional embedding.
    self.embed_positions = nn.Conv1d(
        self.embedding_dim,
        self.embedding_dim,
        kernel_size=args.conv_pos,
        padding=args.conv_pos // 2,
        groups=args.conv_pos_groups,
    )
    std = math.sqrt(4 / (args.conv_pos * self.embedding_dim))
    nn.init.normal_(self.embed_positions.weight, mean=0, std=std)
    nn.init.constant_(self.embed_positions.bias, 0)
    self.embed_positions = nn.utils.weight_norm(
        self.embed_positions, name="weight", dim=2
    )
    self.embed_positions = nn.Sequential(
        self.embed_positions, SamePad(args.conv_pos), nn.GELU()
    )

    # Time-masking parameters.
    self.mask_prob = args.speech_mask_prob
    self.mask_selection = args.speech_mask_selection
    self.mask_other = args.speech_mask_other
    self.mask_length = args.speech_mask_length
    self.no_mask_overlap = args.speech_no_mask_overlap
    self.mask_min_space = args.speech_mask_min_space

    # Channel-masking parameters.
    self.mask_channel_prob = args.speech_mask_channel_prob
    self.mask_channel_selection = args.speech_mask_channel_selection
    self.mask_channel_other = args.speech_mask_channel_other
    self.mask_channel_length = args.speech_mask_channel_length
    self.no_mask_channel_overlap = args.speech_no_mask_channel_overlap
    self.mask_channel_min_space = args.speech_mask_channel_min_space

    self.dropout_input = nn.Dropout(args.dropout_input)
    self.dropout_features = nn.Dropout(args.dropout_features)

    self.feature_grad_mult = args.feature_grad_mult

    self.mask_emb = nn.Parameter(
        torch.FloatTensor(args.encoder_embed_dim).uniform_()
    )

    self.layers = nn.ModuleList(
        [TransformerEncoderLayer(args) for _ in range(args.encoder_layers)]
    )
    self.layer_norm = LayerNorm(args.encoder_embed_dim)
    self.normalize_before = args.encoder_normalize_before
    self.alway_mask = alway_mask
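# --- Hedged masking sketch (not from the source) -----------------------------
# The speech_mask_* fields stored above mirror the arguments of fairseq's
# compute_mask_indices, which samples SpecAugment-style span masks; a minimal
# call with hypothetical values:
import torch
from fairseq.data.data_utils import compute_mask_indices

B, T = 2, 100
mask = compute_mask_indices(
    (B, T),
    padding_mask=None,
    mask_prob=0.65,        # cf. self.mask_prob
    mask_length=10,        # cf. self.mask_length
    mask_type="static",    # cf. self.mask_selection
    mask_other=0.0,        # cf. self.mask_other
    min_masks=2,
    no_overlap=False,      # cf. self.no_mask_overlap
    min_space=1,           # cf. self.mask_min_space
)
mask = torch.from_numpy(mask)  # bool mask of shape (B, T)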