def __init__(self, args):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    if args.max_relative_length == -1:
        self.self_attn = MultiheadAttention(
            self.embed_dim, args.encoder_attention_heads,
            dropout=args.attention_dropout,
        )
    else:
        self.self_attn = RelativeMultiheadAttention(
            self.embed_dim, args.encoder_attention_heads,
            args.max_relative_length, dropout=args.attention_dropout,
        )
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for _ in range(2)])
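# Illustrative sketch, not from this repo: RelativeMultiheadAttention above takes
# args.max_relative_length, and the usual Shaw-style treatment (an assumption here)
# clips relative offsets to [-k, k], so the position table needs only 2k + 1 rows
# regardless of sequence length.
def relative_position_bucket(i: int, j: int, k: int) -> int:
    """Map the offset between query position i and key position j to a table index."""
    return max(-k, min(k, j - i)) + k  # clip, then shift into [0, 2k]

assert relative_position_bucket(0, 100, k=8) == 16  # far offsets share an edge bucket
assert relative_position_bucket(5, 5, k=8) == 8     # zero offset sits in the middle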
def __init__(self, args, kernel_size=0):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.conv_dim = args.encoder_conv_dim
    # symmetric padding: a single int for odd kernels, a (left, right) pair for even ones
    padding_l = (
        kernel_size // 2
        if kernel_size % 2 == 1
        else ((kernel_size - 1) // 2, kernel_size // 2)
    )
    if args.encoder_glu:
        self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
        self.act = nn.GLU()
    else:
        self.linear1 = Linear(self.embed_dim, self.conv_dim)
        self.act = None
    if args.encoder_conv_type == 'lightweight':
        self.conv = LightweightConv(
            self.conv_dim, kernel_size, padding_l=padding_l,
            weight_softmax=args.weight_softmax,
            num_heads=args.encoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    elif args.encoder_conv_type == 'dynamic':
        self.conv = DynamicConv(
            self.conv_dim, kernel_size, padding_l=padding_l,
            weight_softmax=args.weight_softmax,
            num_heads=args.encoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    else:
        raise NotImplementedError
    self.linear2 = Linear(self.conv_dim, self.embed_dim)
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.input_dropout = args.input_dropout
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for _ in range(2)])
    self.rezero = False
    if args.rezero:
        self.rezero = True
        self.alpha = torch.nn.Parameter(torch.zeros(1))
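# A minimal, self-contained check (not part of the model; seq_len is a made-up
# example value) that the padding_l rule above keeps the convolution output length
# equal to the input length for both odd and even kernel sizes.
def check_same_length_padding(kernel_size: int, seq_len: int = 10) -> bool:
    pad = (
        kernel_size // 2
        if kernel_size % 2 == 1
        else ((kernel_size - 1) // 2, kernel_size // 2)
    )
    left, right = (pad, pad) if isinstance(pad, int) else pad
    out_len = seq_len + left + right - (kernel_size - 1)
    return out_len == seq_len

assert all(check_same_length_padding(k) for k in range(1, 16))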
def __init__(self, args, dictionary, embed_tokens, left_pad=True):
    super().__init__(dictionary)
    self.dropout = args.dropout
    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, embed_dim, self.padding_idx,
        left_pad=left_pad, learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.layers = nn.ModuleList([])
    self.layers.extend([
        connEnDeTransformerEncoderLayer(args)
        for _ in range(args.encoder_layers)
    ])
    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.encoder_normalize_before
    # self.history = CreateLayerHistory(args, is_encoder=True)
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
    # alternative (disabled) single-linear projection:
    # self.fc1 = Linear(args.encoder_embed_dim, args.encoder_embed_dim)
    # self.fc2 = Linear(args.encoder_embed_dim, args.encoder_embed_dim)
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([3]))
    self.dropout = args.dropout
    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, embed_dim, self.padding_idx,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.layer_wise_attention = getattr(args, 'layer_wise_attention', False)
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerEncoderLayer(args)
        for _ in range(args.encoder_layers)
    ])
    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
    if not hasattr(args, 'language_embedding'):
        args.language_embedding = False
    if args.language_embedding:
        self.embed_language = LanguageEmbedding(embed_dim)
    else:
        self.embed_language = None
def __init__(self, args):
    super().__init__(None)
    self.encoder_freezing_updates = args.encoder_freezing_updates
    self.num_updates = 0
    self.dropout_module = FairseqDropout(
        p=args.dropout, module_name=self.__class__.__name__)
    self.embed_scale = math.sqrt(args.encoder_embed_dim)
    if args.no_scale_embedding:
        self.embed_scale = 1.0
    self.padding_idx = 1
    self.subsample = Conv1dSubsampler(
        args.input_feat_per_channel * args.input_channels,
        args.conv_channels,
        args.encoder_embed_dim,
        [int(k) for k in args.conv_kernel_sizes.split(",")],
    )
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, args.encoder_embed_dim, self.padding_idx)
    self.transformer_layers = nn.ModuleList([
        TransformerEncoderLayer(args) for _ in range(args.encoder_layers)
    ])
    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(args.encoder_embed_dim)
    else:
        self.layer_norm = None
    self.ctc_proj = None
    if getattr(args, "ctc_weight", 0.) > 0.:
        self.ctc_proj = nn.Linear(args.encoder_embed_dim, args.tgt_dict_size)
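# A rough, self-contained sketch (an assumption about Conv1dSubsampler, not code
# from this file) of how the subsampler shrinks the time axis: each of its
# stride-2 conv layers roughly halves the sequence length, so the default two
# layers give a ~4x reduction before the transformer layers run.
def subsampled_length(in_len: int, num_conv_layers: int = 2) -> int:
    out = in_len
    for _ in range(num_conv_layers):
        out = (out - 1) // 2 + 1  # length after one stride-2 convolution
    return out

assert subsampled_length(100) == 25  # ~4x fewer frames reach the transformer stack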
def __init__(self, args, dictionary, embed_tokens, lang2idx2idx, M, N):
    super().__init__(dictionary)
    self.dropout = args.dropout
    embed_dim = embed_tokens.embedding_dim
    # map each language's vocab id to its index in the syntactic matrix
    self.lang2idx2idx = torch.LongTensor(lang2idx2idx)
    # import pdb; pdb.set_trace()
    # semantic (M) and syntactic (N) matrices
    no_langs = len([i for i in self.lang2idx2idx if i > -1])
    self.M = M
    self.N = N
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, embed_dim, self.padding_idx,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerEncoderLayer(args)
        for _ in range(args.encoder_layers)
    ])
    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.encoder_normalize_before
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
def __init__(self, args):
    super().__init__(args)
    self.args = args
    self.dropout = args.dropout
    self.embedding_dim = args.encoder_embed_dim
    self.pos_enc_type = args.pos_enc_type
    max_source_positions = self.max_positions()
    if self.pos_enc_type == "rel_pos":
        self.embed_positions = RelPositionalEncoding(
            max_source_positions, self.embedding_dim)
    elif self.pos_enc_type == "rope":
        self.embed_positions = None
    else:
        raise Exception("Unsupported positional encoding type")
    self.layers = nn.ModuleList([
        self.build_encoder_layer(args) for _ in range(args.encoder_layers)
    ])
    self.layer_norm_first = args.layer_norm_first
    self.layer_norm = LayerNorm(self.embedding_dim)
    self.layerdrop = args.encoder_layerdrop
    self.apply(init_bert_params)
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.dropout = args.dropout
    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, embed_dim, self.padding_idx,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerEncoderLayer(args)
        for _ in range(args.encoder_layers)
    ])
    self.normalize = args.encoder_normalize_before
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
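# A minimal standalone sketch (toy sizes and tokens, illustrative only) of how the
# sqrt(d_model) embed_scale above is typically applied in the forward pass: token
# embeddings are scaled up before positional embeddings are added, so the two
# signals have comparable magnitude.
import math
import torch
import torch.nn as nn

embed = nn.Embedding(100, 16, padding_idx=0)
tokens = torch.tensor([[5, 7, 2, 0]])  # (batch, src_len), 0 is padding
x = math.sqrt(16) * embed(tokens)      # scale by sqrt(embed_dim)
print(x.shape)                         # torch.Size([1, 4, 16])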
def __init__(
    self,
    out_channels,
    embed_dim,
    num_heads,
    project_input=False,
    gated=False,
    downsample=False,
):
    super().__init__()
    self.attention = DownsampledMultiHeadAttention(
        out_channels,
        embed_dim,
        num_heads,
        dropout=0,
        bias=True,
        project_input=project_input,
        gated=gated,
        downsample=downsample,
    )
    self.in_proj_q = Linear(out_channels, embed_dim)
    self.in_proj_k = Linear(out_channels, embed_dim)
    self.in_proj_v = Linear(out_channels, embed_dim)
    self.ln = LayerNorm(out_channels)
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.share_embed = args.share_all_embeddings
    embed_dim = args.encoder_embed_dim
    self.padding_idx = dictionary.pad()
    self.max_source_positions = args.max_source_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, embed_dim, self.padding_idx,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.layers = nn.ModuleList([])
    self.layers.extend([
        NATEncoderLayer(args) for _ in range(args.encoder_layers)
    ])
    self.dropout = nn.Dropout(args.dropout)
    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.encoder_normalize_before
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
def __init__(self, layer_id, args):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.self_attn = MultiheadAttention820(
        self.embed_dim,
        args.encoder_attention_heads,
        layer_id=layer_id,
        args=args,
        dropout=args.attention_dropout,
        cur_attn_type='es',
    )
    self.self_attn_layer_norm = LayerNorm(self.embed_dim, args=args)
    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, 'activation_fn', 'relu')
    )
    self.activation_dropout = getattr(args, 'activation_dropout', 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, 'relu_dropout', 0)
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim,
                      layer_id=layer_id, cur_linear='fc1', args=args)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim,
                      layer_id=layer_id, cur_linear='fc2', args=args)
    self.info_linear = None
    self.se = None
    self.input_dropout = args.input_dropout if 'input_dropout' in args else 0.0
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    # temporarily zero out args.decoder_layers so the parent constructor builds
    # no decoder layers; this module defines its own single layer below, then
    # restores the original value
    decoder_layers = args.decoder_layers
    args.decoder_layers = 0
    super().__init__(args, dictionary, embed_tokens, no_encoder_attn)
    self.layers = nn.ModuleList([])
    self.layers.extend([
        EndorsementDetectorDecoderLayer(args, no_encoder_attn)
        for _ in range(1)
    ])
    args.decoder_layers = decoder_layers
    self.padding_idx = embed_tokens.padding_idx
    self.proj_prob = Linear(args.decoder_embed_dim, 1, bias=True)
    self.eds_fc1 = Linear(args.decoder_embed_dim, args.decoder_embed_dim, bias=False)
    self.eds_fc2 = Linear(args.decoder_embed_dim, args.decoder_embed_dim, bias=False)
    self.eds_layer_norm = LayerNorm(args.decoder_embed_dim)
    self.self_attn = MultiheadAttention(
        args.decoder_embed_dim, args.decoder_attention_heads,
        dropout=0, self_attention=True,
    )
def __init__(self, args, embed_tokens, dictionary):
    super().__init__()
    self.share_input_output_embed = args.share_decoder_input_output_embed
    self.embed_tokens = embed_tokens
    self.output_embed_dim = args.decoder_output_dim
    embed_dim = args.decoder_embed_dim
    self.project_out_dim = (
        Linear(embed_dim, self.output_embed_dim, bias=False)
        if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
        else None
    )
    self.adaptive_softmax = None
    if args.adaptive_softmax_cutoff is not None:
        assert not isinstance(embed_tokens, nn.ModuleList)
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        # untied output matrix replaces the input embedding reference
        self.embed_tokens = nn.Parameter(
            torch.Tensor(len(dictionary), self.output_embed_dim))
        nn.init.normal_(self.embed_tokens, mean=0,
                        std=self.output_embed_dim ** -0.5)
    if args.decoder_normalize_before and not getattr(
            args, "no_decoder_final_norm", False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
def __init__(self, args, kernel_size=1):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.conv_dim = args.encoder_conv_dim
    if args.encoder_glu:
        self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
        self.act = nn.GLU()
    else:
        self.linear1 = Linear(self.embed_dim, self.conv_dim)
        self.act = None
    self.conv = TaLKConv(
        self.conv_dim,
        offsets_dropout=args.weight_dropout,
        decode=False,
        num_heads=args.encoder_attention_heads,
        min_len_left=kernel_size,
        min_len_right=kernel_size,
    )
    self.linear2 = Linear(self.conv_dim, self.embed_dim)
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.input_dropout = args.input_dropout
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for _ in range(2)])
def __init__(self, args, dictionary):
    super().__init__(dictionary)
    self.dropout_module = FairseqDropout(
        p=args.dropout, module_name=self.__class__.__name__)
    self.embed_scale = math.sqrt(args.encoder_embed_dim)
    if args.no_scale_embedding:
        self.embed_scale = 1.0
    self.padding_idx = 1
    self.subsample = Conv1dSubsampler(
        args.input_feat_per_channel * args.input_channels,
        args.conv_channels,
        args.encoder_embed_dim,
        [int(k) for k in args.conv_kernel_sizes.split(",")],
    )
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, args.encoder_embed_dim, self.padding_idx)
    self.transformer_layers = nn.ModuleList([
        TransformerEncoderLayer(args) for _ in range(args.encoder_layers)
    ])
    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(args.encoder_embed_dim)
    else:
        self.layer_norm = None
    # CTC
    self.ctc_compress_out = args.ctc_compress_out
    if self.ctc_compress_out:
        self.ctc_fc = nn.Linear(args.encoder_embed_dim, len(dictionary))
        assert args.criterion == "ctc_multi_loss"
        self.ctc_layer = args.ctc_encoder_layer
        self.ctc_compress_method = getattr(
            CTCCompressStrategy, args.ctc_compress_strategy)
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([3]))
    self.dropout = args.dropout
    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, embed_dim, self.padding_idx,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.sde = args.sde
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerEncoderLayer(args)
        for _ in range(args.encoder_layers)
    ])
    if args.scale_norm:
        scale = embed_dim ** 0.5
    else:
        scale = None
    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim, scale=scale)
    else:
        self.layer_norm = None
    self.tracker = VariableTracker()
    self.track_gradients = False
def __init__(self, args, dictionary, embed_tokens, use_linear_se=False):
    super().__init__(dictionary)
    self.dropout = args.dropout
    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, embed_dim, self.padding_idx,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.layers = nn.ModuleList([])
    self.layers.extend([
        LightConvEncoderLayer(
            args, kernel_size=args.encoder_kernel_size_list[i],
            use_linear_se=use_linear_se)
        for i in range(args.encoder_layers)
    ])
    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.encoder_normalize_before
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
def __init__(self, embed_dim, output_dim, activation_fn, weight=None,
             share_emb_pro=None, bias=None):
    super().__init__()
    self.dense = nn.Linear(embed_dim, embed_dim)
    self.activation_fn = utils.get_activation_fn(activation_fn)
    self.layer_norm = LayerNorm(embed_dim)
    # default, so the attribute exists on every path
    self.add_one_linear = False
    if weight is None:
        weight = nn.Linear(embed_dim, output_dim, bias=False).weight
    elif share_emb_pro is not None:
        self.add_one_linear = True
        self.share_emb_pro = share_emb_pro
    self.weight = weight
    if bias is None:
        self.bias = nn.Parameter(torch.zeros(output_dim))
    else:
        self.bias = bias
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.dropout_module = FairseqDropout(
        args.dropout, module_name=self.__class__.__name__
    )
    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = (
        PositionalEmbedding(
            args.max_source_positions,
            embed_dim,
            self.padding_idx,
            learned=args.encoder_learned_pos,
        )
        if not args.no_token_positional_embeddings
        else None
    )
    self.layers = nn.ModuleList([])
    self.layers.extend(
        [
            LightConvEncoderLayer(
                args, kernel_size=args.encoder_kernel_size_list[i]
            )
            for i in range(args.encoder_layers)
        ]
    )
    self.register_buffer("version", torch.Tensor([2]))
    self.normalize = args.encoder_normalize_before
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    self.args = args
    super().__init__(args, dictionary, embed_tokens)
    self.register_buffer("version", torch.Tensor([3]))
    self._future_mask = torch.empty(0)
    self.dropout = args.dropout
    self.decoder_layerdrop = args.decoder_layerdrop
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.embed_dim = embed_dim
    self.output_embed_dim = args.decoder_output_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)
    if not args.adaptive_input and args.quant_noise_pq > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(embed_dim, embed_dim, bias=False),
            args.quant_noise_pq,
            args.quant_noise_pq_block_size,
        )
    else:
        self.quant_noise = None
    self.project_in_dim = (
        Linear(input_embed_dim, embed_dim, bias=False)
        if embed_dim != input_embed_dim
        else None
    )
    self.embed_positions = (
        PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            self.padding_idx,
            learned=args.decoder_learned_pos,
        )
        if not args.no_token_positional_embeddings
        else None
    )
    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None
    self.cross_self_attention = getattr(args, "cross_self_attention", False)
    if self.decoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([
        self.build_decoder_layer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])
    self.num_layers = len(self.layers)
    if args.decoder_normalize_before and not getattr(
            args, "no_decoder_final_norm", False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
    self.project_out_dim = (
        Linear(embed_dim, self.output_embed_dim, bias=False)
        if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
        else None
    )
    self.adaptive_softmax = None
    self.output_projection = None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif self.share_input_output_embed:
        self.output_projection = nn.Linear(
            self.embed_tokens.weight.shape[1],
            self.embed_tokens.weight.shape[0],
            bias=False,
        )
        self.output_projection.weight = self.embed_tokens.weight
    else:
        self.output_projection = nn.Linear(
            self.output_embed_dim, len(dictionary), bias=False
        )
        nn.init.normal_(
            self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5
        )
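# A small, self-contained sketch (toy sizes, not this model's code) of the
# share_input_output_embed branch above: the output projection reuses the input
# embedding matrix, so logits are dot products against the embedding rows and the
# model stores one (vocab, dim) matrix instead of two.
import torch
import torch.nn as nn

embed_tokens = nn.Embedding(1000, 64)
output_projection = nn.Linear(64, 1000, bias=False)
output_projection.weight = embed_tokens.weight  # tie: one shared parameter

h = torch.randn(2, 5, 64)       # decoder states (batch, tgt_len, dim)
logits = output_projection(h)   # (2, 5, 1000)
assert output_projection.weight.data_ptr() == embed_tokens.weight.data_ptr()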
def __init__(
    self,
    cfg: Wav2Vec2Seq2SeqConfig,
    dictionary,
    embed_tokens,
    no_encoder_attn=False,
):
    super().__init__(dictionary)
    self.dropout = cfg.decoder_dropout
    self.share_input_output_embed = cfg.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = cfg.decoder_embed_dim
    self.output_embed_dim = cfg.decoder_embed_dim
    self.layerdrop = cfg.decoder_layerdrop
    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = cfg.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
    self.project_in_dim = (
        Linear(input_embed_dim, embed_dim, bias=False)
        if embed_dim != input_embed_dim else None
    )
    self.embed_positions = (
        PositionalEmbedding(
            cfg.max_target_positions,
            embed_dim,
            padding_idx,
            learned=cfg.decoder_learned_pos,
        ) if not cfg.no_token_positional_embeddings else None
    )
    # TODO: update this when transformer gets converted to dataclass configs
    transformer_cfg = copy.deepcopy(cfg)
    with open_dict(transformer_cfg):
        transformer_cfg.dropout = transformer_cfg.decoder_dropout
        transformer_cfg.attention_dropout = (
            transformer_cfg.decoder_attention_dropout)
        transformer_cfg.activation_dropout = (
            transformer_cfg.decoder_activation_dropout)
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(transformer_cfg, no_encoder_attn)
        for _ in range(transformer_cfg.decoder_layers)
    ])
    if not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), self.output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)
    if transformer_cfg.decoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
def __init__(
    self,
    padding_idx: int,
    vocab_size: int,
    projection_length: int = 128,
    num_encoder_layers: int = 12,
    embedding_dim: int = 768,
    ffn_embedding_dim: int = 3072,
    num_attention_heads: int = 12,
    num_projected_attention_heads: int = 12,
    dropout: float = 0.1,
    attention_dropout: float = 0.1,
    activation_dropout: float = 0.1,
    layerdrop: float = 0.0,
    max_seq_len: int = 512,
    num_segments: int = 0,
    use_position_embeddings: bool = True,
    offset_positions_by_padding: bool = True,
    layernorm_embedding: bool = False,
    normalize_before: bool = False,
    dynamic_projection: bool = True,
    tie_kv=False,
    apply_bert_init: bool = False,
    activation_fn: str = "gelu",
    learned_pos_embedding: bool = True,
    embed_scale: float = None,
    freeze_embeddings: bool = False,
    n_trans_layers_to_freeze: int = 0,
    export: bool = False,
    traceable: bool = False,
) -> None:
    super().__init__()
    self.padding_idx = padding_idx
    self.vocab_size = vocab_size
    self.proj_len = projection_length
    self.dynamic_projection = dynamic_projection
    self.dropout_module = FairseqDropout(
        dropout, module_name=self.__class__.__name__)
    self.layerdrop = layerdrop
    self.max_seq_len = max_seq_len
    self.embedding_dim = embedding_dim
    self.num_segments = num_segments
    self.use_position_embeddings = use_position_embeddings
    self.apply_bert_init = apply_bert_init
    self.learned_pos_embedding = learned_pos_embedding
    self.traceable = traceable
    self.tpu = False  # whether we're on TPU
    self.embed_tokens = self.build_embedding(
        self.vocab_size, self.embedding_dim, self.padding_idx)
    self.embed_scale = embed_scale
    if self.num_segments > 0:
        self.segment_embeddings = nn.Embedding(
            self.num_segments, self.embedding_dim, padding_idx=None)
        nn.init.normal_(self.segment_embeddings.weight,
                        mean=0.0, std=self.embedding_dim ** -0.5)
    else:
        self.segment_embeddings = None
    self.embed_positions = (
        PositionalEmbedding(
            self.max_seq_len,
            self.embedding_dim,
            padding_idx=(self.padding_idx if offset_positions_by_padding else None),
            learned=self.learned_pos_embedding,
        ) if self.use_position_embeddings else None
    )
    self.projected_embeddings = Parameter(
        torch.Tensor(self.proj_len, self.embedding_dim))
    nn.init.normal_(self.projected_embeddings,
                    mean=0.0, std=self.embedding_dim ** -0.5)
    if self.use_position_embeddings and not self.learned_pos_embedding:
        projected_positions = get_sinusoidal_positional_embedding(
            self.proj_len, self.embedding_dim)
        if self.embed_scale is None:
            self.embed_scale = math.sqrt(self.embedding_dim)
    else:
        projected_positions = None
    self.register_buffer("projected_positions", projected_positions)
    if self.layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([
        self.build_luna_sentence_encoder_layer(
            embedding_dim=self.embedding_dim,
            ffn_embedding_dim=ffn_embedding_dim,
            num_attention_heads=num_attention_heads,
            num_projected_attention_heads=num_projected_attention_heads,
            dropout=self.dropout_module.p,
            attention_dropout=attention_dropout,
            activation_dropout=activation_dropout,
            activation_fn=activation_fn,
            normalize_before=normalize_before,
            tie_kv=tie_kv,
            export=export,
        )
        for _ in range(num_encoder_layers)
    ])
    assert not layernorm_embedding or not normalize_before
    if layernorm_embedding:
        self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export)
        self.proj_emb_layer_norm = LayerNorm(self.embedding_dim, export=export)
    else:
        self.emb_layer_norm = None
        self.proj_emb_layer_norm = None
    if normalize_before:
        self.layer_norm = LayerNorm(self.embedding_dim, export=export)
        self.proj_layer_norm = LayerNorm(self.embedding_dim, export=export)
    else:
        self.layer_norm = None
        self.proj_layer_norm = None

    # Apply initialization of model params after building the model
    if self.apply_bert_init:
        self.apply(init_bert_params)

    def freeze_module_params(m):
        if m is not None:
            for p in m.parameters():
                p.requires_grad = False

    if freeze_embeddings:
        self.projected_embeddings.requires_grad = False
        freeze_module_params(self.embed_tokens)
        freeze_module_params(self.segment_embeddings)
        freeze_module_params(self.embed_positions)
        freeze_module_params(self.emb_layer_norm)
        freeze_module_params(self.proj_emb_layer_norm)

    for layer in range(n_trans_layers_to_freeze):
        freeze_module_params(self.layers[layer])

    log_class_usage(__class__)
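# An illustrative sketch (assumed behavior, not the fairseq LayerDropModuleList
# implementation) of the LayerDrop choice above: during training each layer is
# skipped independently with probability p, which regularizes deep stacks and
# makes them more robust to pruning at inference time.
import random

def forward_with_layerdrop(x, layers, p=0.1, training=True):
    for layer in layers:
        if training and random.random() < p:
            continue  # skip this layer entirely for this batch
        x = layer(x)
    return x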
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False,
             left_pad=False, final_norm=True):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    output_embed_dim = args.decoder_output_dim
    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
    self.project_in_dim = Linear(
        input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, embed_dim, padding_idx,
        left_pad=left_pad, learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.layers = nn.ModuleList([])
    self.layers.extend([
        connEnDeTransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])
    self.w1 = Linear(embed_dim, embed_dim)
    self.w2 = Linear(embed_dim, embed_dim, bias=False)
    # alternative (disabled) per-layer linear projections:
    # self.linearLayers = nn.ModuleList([])
    # self.linearLayers.extend([
    #     Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    #     for _ in range(args.decoder_layers)
    # ])
    self.adaptive_softmax = None
    self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \
        if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
def __init__(self, cfg: Wav2Vec2Config):
    super().__init__()
    self.cfg = cfg
    feature_enc_layers = eval(cfg.conv_feature_layers)
    self.embed = feature_enc_layers[-1][0]
    self.feature_extractor = ConvFeatureExtractionModel(
        conv_layers=feature_enc_layers,
        dropout=0.0,
        mode=cfg.extractor_mode,
        conv_bias=cfg.conv_bias,
    )
    self.post_extract_proj = (
        nn.Linear(self.embed, cfg.encoder_embed_dim)
        if self.embed != cfg.encoder_embed_dim and not cfg.quantize_input
        else None
    )
    self.mask_prob = cfg.mask_prob
    self.mask_selection = cfg.mask_selection
    self.mask_other = cfg.mask_other
    self.mask_length = cfg.mask_length
    self.no_mask_overlap = cfg.no_mask_overlap
    self.mask_min_space = cfg.mask_min_space
    self.mask_channel_prob = cfg.mask_channel_prob
    self.mask_channel_selection = cfg.mask_channel_selection
    self.mask_channel_other = cfg.mask_channel_other
    self.mask_channel_length = cfg.mask_channel_length
    self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
    self.mask_channel_min_space = cfg.mask_channel_min_space
    self.dropout_input = nn.Dropout(cfg.dropout_input)
    self.dropout_features = nn.Dropout(cfg.dropout_features)
    self.feature_grad_mult = cfg.feature_grad_mult
    self.quantizer = None
    self.input_quantizer = None
    self.n_negatives = cfg.num_negatives
    self.cross_sample_negatives = cfg.cross_sample_negatives
    self.codebook_negatives = cfg.codebook_negatives
    self.negatives_from_everywhere = cfg.negatives_from_everywhere
    self.logit_temp = cfg.logit_temp
    final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim
    if cfg.quantize_targets:
        vq_dim = cfg.latent_dim if cfg.latent_dim > 0 else final_dim
        self.quantizer = GumbelVectorQuantizer(
            dim=self.embed,
            num_vars=cfg.latent_vars,
            temp=cfg.latent_temp,
            groups=cfg.latent_groups,
            combine_groups=False,
            vq_dim=vq_dim,
            time_first=True,
        )
        self.project_q = nn.Linear(vq_dim, final_dim)
    else:
        self.project_q = nn.Linear(self.embed, final_dim)
    if cfg.quantize_input:
        if cfg.same_quantizer and self.quantizer is not None:
            vq_dim = final_dim
            self.input_quantizer = self.quantizer
        else:
            vq_dim = cfg.latent_dim if cfg.latent_dim > 0 else cfg.encoder_embed_dim
            self.input_quantizer = GumbelVectorQuantizer(
                dim=self.embed,
                num_vars=cfg.latent_vars,
                temp=cfg.latent_temp,
                groups=cfg.latent_groups,
                combine_groups=False,
                vq_dim=vq_dim,
                time_first=True,
            )
        self.project_inp = nn.Linear(vq_dim, cfg.encoder_embed_dim)
    self.mask_emb = nn.Parameter(
        torch.FloatTensor(cfg.encoder_embed_dim).uniform_())
    self.encoder = TransformerEncoder(cfg)
    self.layer_norm = LayerNorm(self.embed)
    self.target_glu = None
    if cfg.target_glu:
        self.target_glu = nn.Sequential(
            nn.Linear(final_dim, final_dim * 2), nn.GLU())
    self.final_proj = nn.Linear(cfg.encoder_embed_dim, final_dim)
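# A self-contained sketch of what eval(cfg.conv_feature_layers) produces. The
# string below is the usual wav2vec 2.0 default (an assumption here, not read from
# this file); each tuple is (dim, kernel, stride), and the product of the strides
# is the waveform-to-feature downsampling rate: 5 * 2**6 = 320 samples per frame,
# i.e. 20 ms at 16 kHz.
conv_feature_layers = "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2"
layers = eval(conv_feature_layers)
downsample_rate = 1
for _dim, _kernel, stride in layers:
    downsample_rate *= stride
assert layers[-1][0] == 512    # matches self.embed: the last conv layer's dim
assert downsample_rate == 320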
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([3]))
    self.dropout = args.dropout
    self.decoder_layerdrop = args.decoder_layerdrop
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.output_embed_dim = args.decoder_output_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)
    self.project_in_dim = Linear(
        input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, embed_dim, self.padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.cross_self_attention = getattr(args, 'cross_self_attention', False)
    self.layer_wise_attention = getattr(args, 'layer_wise_attention', False)
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(args, no_encoder_attn, layer_id=i)
        for i in range(args.decoder_layers)
    ])
    self.adaptive_softmax = None
    self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \
        if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), self.output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)
    if args.decoder_normalize_before and not getattr(
            args, 'no_decoder_final_norm', False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
    if getattr(args, 'layernorm_embedding', False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None
def __init__(self, args, no_encoder_attn=False, kernel_size=0):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.conv_dim = args.decoder_conv_dim
    if args.decoder_glu:
        self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
        self.act = nn.GLU()
    else:
        self.linear1 = Linear(self.embed_dim, self.conv_dim)
        self.act = None
    if args.decoder_conv_type == "lightweight":
        self.conv = LightweightConv(
            self.conv_dim,
            kernel_size,
            padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=args.decoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    elif args.decoder_conv_type == "dynamic":
        self.conv = DynamicConv(
            self.conv_dim,
            kernel_size,
            padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=args.decoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    else:
        raise NotImplementedError
    self.linear2 = Linear(self.conv_dim, self.embed_dim)
    self.dropout_module = FairseqDropout(
        args.dropout, module_name=self.__class__.__name__
    )
    self.relu_dropout_module = FairseqDropout(
        args.relu_dropout, module_name=self.__class__.__name__
    )
    self.input_dropout_module = FairseqDropout(
        args.input_dropout, module_name=self.__class__.__name__
    )
    self.normalize_before = args.decoder_normalize_before
    self.conv_layer_norm = LayerNorm(self.embed_dim)
    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
            encoder_decoder_attention=True,
        )
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim)
    self.need_attn = True
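# A minimal sketch (toy values, plain Conv1d rather than LightweightConv) of why
# the decoder uses padding_l = kernel_size - 1, unlike the encoder's symmetric
# padding: putting all padding on the left keeps the convolution causal, so the
# output at position t never sees inputs to its right.
import torch
import torch.nn.functional as F

kernel_size = 3
x = torch.arange(1., 6.).view(1, 1, 5)     # (batch, channels, time)
w = torch.ones(1, 1, kernel_size)
x_padded = F.pad(x, (kernel_size - 1, 0))  # all padding on the left
y = F.conv1d(x_padded, w)
assert y.shape[-1] == x.shape[-1]          # same length, no future leakage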
def __init__(
    self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True
):
    super().__init__(dictionary)
    self.dropout_module = FairseqDropout(
        args.dropout, module_name=self.__class__.__name__
    )
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    output_embed_dim = args.decoder_output_dim
    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
    self.project_in_dim = (
        Linear(input_embed_dim, embed_dim, bias=False)
        if embed_dim != input_embed_dim
        else None
    )
    self.embed_positions = (
        PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            padding_idx,
            learned=args.decoder_learned_pos,
        )
        if not args.no_token_positional_embeddings
        else None
    )
    self.layers = nn.ModuleList([])
    self.layers.extend(
        [
            LightConvDecoderLayer(
                args, no_encoder_attn, kernel_size=args.decoder_kernel_size_list[i]
            )
            for i in range(args.decoder_layers)
        ]
    )
    self.adaptive_softmax = None
    self.project_out_dim = (
        Linear(embed_dim, output_embed_dim, bias=False)
        if embed_dim != output_embed_dim and not args.tie_adaptive_weights
        else None
    )
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            output_embed_dim,
            utils.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), output_embed_dim)
        )
        nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
    self.register_buffer("version", torch.Tensor([2]))
    self.normalize = args.decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
def __init__(
    self,
    padding_idx: int,
    vocab_size: int,
    num_encoder_layers: int = 6,
    embedding_dim: int = 768,
    ffn_embedding_dim: int = 3072,
    num_attention_heads: int = 8,
    dropout: float = 0.1,
    attention_dropout: float = 0.1,
    activation_dropout: float = 0.1,
    max_seq_len: int = 256,
    # num_segments: int = 2,
    use_position_embeddings: bool = True,
    offset_positions_by_padding: bool = True,
    encoder_normalize_before: bool = False,
    apply_bert_init: bool = True,
    activation_fn: str = "relu",
    learned_pos_embedding: bool = True,
    add_bias_kv: bool = False,
    add_zero_attn: bool = False,
    embed_scale: float = None,
    freeze_embeddings: bool = False,
    n_trans_layers_to_freeze: int = 0,
    export: bool = False,
    use_residual: bool = True,
    use_norm: bool = True,
    use_pretrain: bool = False,
    pretrain_vectors=None,
    pretrain_dim: int = 200,
) -> None:
    super().__init__()
    self.padding_idx = padding_idx
    self.vocab_size = vocab_size
    self.dropout = dropout
    self.max_seq_len = max_seq_len
    self.embedding_dim = embedding_dim
    # self.num_segments = num_segments
    self.use_position_embeddings = use_position_embeddings
    self.apply_bert_init = apply_bert_init
    self.learned_pos_embedding = learned_pos_embedding
    self.use_pretrain = use_pretrain
    self.pretrain_dim = pretrain_dim
    assert self.embedding_dim % num_attention_heads == 0

    # word embedding
    if self.use_pretrain:
        self.embed_tokens = nn.Embedding(
            self.vocab_size, self.pretrain_dim, self.padding_idx)
    else:
        self.embed_tokens = nn.Embedding(
            self.vocab_size, self.embedding_dim, self.padding_idx)
    if self.use_pretrain and self.pretrain_dim % num_attention_heads != 0:
        self.pretrain_emb_transfer = nn.Linear(
            self.pretrain_dim, self.embedding_dim)
        print("Use the pre-trained embedding transfer layer")
    else:
        self.pretrain_emb_transfer = None
    self.embed_scale = embed_scale

    # position embedding
    self.embed_positions = (
        PositionalEmbedding(
            self.max_seq_len,
            self.embedding_dim,
            padding_idx=(self.padding_idx if offset_positions_by_padding else None),
            learned=self.learned_pos_embedding,
        ) if self.use_position_embeddings else None
    )
    print(f'position embedding: {self.embed_positions}')

    # transformer layers
    self.layers = nn.ModuleList([
        TransformerSentenceEncoderLayer(
            embedding_dim=self.embedding_dim,
            ffn_embedding_dim=ffn_embedding_dim,
            num_attention_heads=num_attention_heads,
            dropout=self.dropout,
            attention_dropout=attention_dropout,
            activation_dropout=activation_dropout,
            activation_fn=activation_fn,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            export=export,
            use_residual=use_residual,
            use_norm=use_norm,
        )
        for _ in range(num_encoder_layers)
    ])

    # layer norm
    if encoder_normalize_before:
        self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export)
    else:
        self.emb_layer_norm = None

    # Apply initialization of model params after building the model
    if self.apply_bert_init:
        self.apply(init_bert_params)
    # self.embed_tokens.weight.data.normal_(mean=0.0, std=0.02)
    # if use_position_embeddings:
    #     self.embed_positions.weight.data.normal_(mean=0.0, std=0.02)

    if self.use_pretrain:
        # self.embed_tokens.from_pretrained(pretrain_vectors, freeze=freeze_embeddings)
        self.embed_tokens.weight.data.copy_(pretrain_vectors)
        self.embed_tokens.weight.requires_grad = not freeze_embeddings
        print(f'Use the pre-train embedding(freeze={freeze_embeddings}):')
        # print(pretrain_vectors)

    def freeze_module_params(m):
        if m is not None:
            for p in m.parameters():
                p.requires_grad = False

    if freeze_embeddings and use_pretrain:
        freeze_module_params(self.embed_tokens)
def __init__(
    self,
    cfg: HubertConfig,
    task_cfg: HubertPretrainingConfig,
    dictionaries: List[Dictionary],
) -> None:
    super().__init__()
    logger.info(f"HubertModel Config: {cfg}")
    feature_enc_layers = eval(cfg.conv_feature_layers)  # noqa
    self.embed = feature_enc_layers[-1][0]
    self.feature_extractor = ConvFeatureExtractionModel(
        conv_layers=feature_enc_layers,
        dropout=0.0,
        mode=cfg.extractor_mode,
        conv_bias=cfg.conv_bias,
    )
    feature_ds_rate = np.prod([s for _, _, s in feature_enc_layers])
    self.feat2tar_ratio = cfg.label_rate * feature_ds_rate / task_cfg.sample_rate
    self.post_extract_proj = (
        nn.Linear(self.embed, cfg.encoder_embed_dim)
        if self.embed != cfg.encoder_embed_dim
        else None
    )
    self.mask_prob = cfg.mask_prob
    self.mask_selection = cfg.mask_selection
    self.mask_other = cfg.mask_other
    self.mask_length = cfg.mask_length
    self.no_mask_overlap = cfg.no_mask_overlap
    self.mask_min_space = cfg.mask_min_space
    self.mask_channel_prob = cfg.mask_channel_prob
    self.mask_channel_selection = cfg.mask_channel_selection
    self.mask_channel_other = cfg.mask_channel_other
    self.mask_channel_length = cfg.mask_channel_length
    self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
    self.mask_channel_min_space = cfg.mask_channel_min_space
    self.dropout_input = nn.Dropout(cfg.dropout_input)
    self.dropout_features = nn.Dropout(cfg.dropout_features)
    self.feature_grad_mult = cfg.feature_grad_mult
    self.logit_temp = cfg.logit_temp
    self.skip_masked = cfg.skip_masked
    self.skip_nomask = cfg.skip_nomask
    final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim
    self.mask_emb = nn.Parameter(
        torch.FloatTensor(cfg.encoder_embed_dim).uniform_())
    self.encoder = TransformerEncoder(cfg)
    self.layer_norm = LayerNorm(self.embed)
    self.target_glu = None
    if cfg.target_glu:
        self.target_glu = nn.Sequential(
            nn.Linear(final_dim, final_dim * 2), nn.GLU())
    self.untie_final_proj = cfg.untie_final_proj
    if self.untie_final_proj:
        self.final_proj = nn.Linear(
            cfg.encoder_embed_dim, final_dim * len(dictionaries))
    else:
        self.final_proj = nn.Linear(cfg.encoder_embed_dim, final_dim)

    # modules below are not needed during fine-tuning
    if any([d is None for d in dictionaries]):
        logger.info("cannot find dictionary. assume will be used for fine-tuning")
    else:
        self.num_classes = [len(d) for d in dictionaries]
        self.label_embs_concat = nn.Parameter(
            torch.FloatTensor(sum(self.num_classes), final_dim))
        nn.init.uniform_(self.label_embs_concat)
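# A worked example (illustrative numbers, not read from this file) of the
# feat2tar_ratio computed above: with 16 kHz audio and a 320x convolutional
# downsampling rate, the feature extractor emits 50 frames per second, so 50 Hz
# labels align one-to-one with features.
sample_rate = 16000
feature_ds_rate = 320
label_rate = 50
feat2tar_ratio = label_rate * feature_ds_rate / sample_rate
assert feat2tar_ratio == 1.0  # one label per feature frame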
def __init__(self, config: Config, embed_dim: int, padding_idx: Tensor) -> None:
    super().__init__(config)
    self.padding_idx = padding_idx
    self.pooling_type = config.pooling_type
    self.dropout = nn.Dropout(config.encoder_config.dropout)
    input_embed_dim = embed_dim
    self.embed_scale = math.sqrt(input_embed_dim)  # todo: try with input_embed_dim
    self.max_source_positions = config.encoder_config.max_source_positions
    self.no_token_positional_embeddings = (
        config.encoder_config.no_token_positional_embeddings)
    # creating this is also conditional
    self.project_in_dim = (
        Linear(input_embed_dim, config.encoder_config.encoder_embed_dim)
        if config.encoder_config.encoder_embed_dim != input_embed_dim
        else PlaceholderIdentity())
    layers = []
    # overwrite config.layer_config.encoder_embed_dim so that it always matches
    # config.encoder_config.encoder_embed_dim
    config.layer_config.encoder_embed_dim = config.encoder_config.encoder_embed_dim
    for size in config.encoder_kernel_size_list:
        layers.append(create_module(config.layer_config, kernel_size=size))
    self.layers = nn.ModuleList(layers)
    self.embed_layer_norm = LayerNorm(config.encoder_config.encoder_embed_dim)
    self.combine_pos_embed = config.encoder_config.combine_pos_embed.value
    if config.encoder_config.combine_pos_embed == PostionalEmbedCombine.SUM:
        pos_embed_dim = config.encoder_config.encoder_embed_dim
    elif config.encoder_config.combine_pos_embed == PostionalEmbedCombine.CONCAT:
        pos_embed_dim = config.encoder_config.encoder_embed_dim - input_embed_dim
    else:
        raise NotImplementedError
    if not config.encoder_config.no_token_positional_embeddings:
        if (config.encoder_config.positional_embedding_type
                == PostionalEmbedType.LEARNED):
            self.embed_positions = PositionalEmbedding(
                config.encoder_config.max_source_positions,
                pos_embed_dim,
                self.padding_idx,
            )
        elif (config.encoder_config.positional_embedding_type
                == PostionalEmbedType.SINUSOIDAL
                or config.encoder_config.positional_embedding_type
                == PostionalEmbedType.HYBRID):
            self.embed_positions = SinusoidalPositionalEmbedding(
                pos_embed_dim,
                self.padding_idx,
                init_size=config.encoder_config.max_source_positions,
                learned_embed=(
                    config.encoder_config.positional_embedding_type
                    == PostionalEmbedType.HYBRID),
            )
        else:
            raise NotImplementedError("Positional embedding type not supported")
    else:
        self.embed_positions = PlaceholderIdentity()
    self.normalize = config.encoder_config.encoder_normalize_before
    if self.normalize:
        self.layer_norm = LayerNorm(config.encoder_config.encoder_embed_dim)
    else:
        self.layer_norm = PlaceholderIdentity()
    log_class_usage(__class__)
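# A short sketch (made-up dimensions) of the two combine_pos_embed modes above:
# SUM adds a position vector of the full model width, while CONCAT reserves
# encoder_embed_dim - input_embed_dim channels for positions and concatenates
# them onto the token embedding, so the widths must add up exactly.
encoder_embed_dim, input_embed_dim = 256, 200

sum_pos_dim = encoder_embed_dim                       # SUM: token + position, same width
concat_pos_dim = encoder_embed_dim - input_embed_dim  # CONCAT: remaining channels
assert input_embed_dim + concat_pos_dim == encoder_embed_dim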