def __init__(self, in_dim, out_dim, n_layers=3, kernel_size=3, stride=2,
             layerdrop=0., layernorm=False, proj=False):
    """Stack of strided Conv1d layers with optional pre/post MLP projections.

    Each conv emits ``out_dim * 2`` channels (presumably halved by a GLU in
    ``forward`` -- confirm there). ``layerdrop`` and ``layernorm`` are stored
    for use by the forward pass.
    """
    super().__init__()
    # Projection MLPs (4x expansion) and their layer norms; only built
    # when ``proj`` is requested, otherwise left as None sentinels.
    self.proj = self.proj_ln = None
    self.post_proj = self.post_proj_ln = None
    if proj:
        self.proj = nn.Sequential(
            nn.Linear(in_dim, in_dim * 4),
            nn.ReLU(),
            nn.Linear(in_dim * 4, in_dim),
        )
        self.proj_ln = LayerNorm(in_dim)
        self.post_proj = nn.Sequential(
            nn.Linear(out_dim, out_dim * 4),
            nn.ReLU(),
            nn.Linear(out_dim * 4, out_dim),
        )
        self.post_proj_ln = LayerNorm(out_dim)
    # First conv maps in_dim -> out_dim*2; the rest map out_dim -> out_dim*2.
    convs = []
    for idx in range(n_layers):
        convs.append(
            nn.Conv1d(
                in_dim if idx == 0 else out_dim,
                out_dim * 2,
                kernel_size,
                stride=stride,
                padding=kernel_size // 2,
            )
        )
    self.layers = nn.ModuleList(convs)
    self.stride = stride
    self.layerdrop = layerdrop
    self.layernorm = LayerNorm(in_dim) if layernorm else None
def __init__(
    self,
    embedding_dim: int = 768,
    ffn_embedding_dim: int = 3072,
    num_attention_heads: int = 8,
    dropout: float = 0.1,
    attention_dropout: float = 0.1,
    activation_dropout: float = 0.1,
    activation_fn: str = 'relu',
    export: bool = False,
    q_noise: float = 0.0,
    qn_block_size: int = 8,
    init_fn: Callable = None,
) -> None:
    """Build one Transformer sentence-encoder layer (self-attention + FFN)."""
    super().__init__()
    # Optional caller-supplied hook, run before any submodule is created.
    if init_fn is not None:
        init_fn()
    self.embedding_dim = embedding_dim
    mod_name = self.__class__.__name__
    self.dropout_module = FairseqDropout(dropout, module_name=mod_name)
    self.activation_dropout_module = FairseqDropout(
        activation_dropout, module_name=mod_name
    )
    self.activation_fn = utils.get_activation_fn(activation_fn)
    # Self-attention sub-block and its dedicated layer norm.
    self.self_attn = self.build_self_attention(
        self.embedding_dim,
        num_attention_heads,
        dropout=attention_dropout,
        self_attention=True,
        q_noise=q_noise,
        qn_block_size=qn_block_size,
    )
    self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export)
    # Position-wise feed-forward sub-block and its layer norm.
    self.fc1 = self.build_fc1(
        self.embedding_dim,
        ffn_embedding_dim,
        q_noise=q_noise,
        qn_block_size=qn_block_size,
    )
    self.fc2 = self.build_fc2(
        ffn_embedding_dim,
        self.embedding_dim,
        q_noise=q_noise,
        qn_block_size=qn_block_size,
    )
    self.final_layer_norm = LayerNorm(self.embedding_dim, export=export)
def __init__(self, args, config, dictionary, embedding_token):
    """Answer decoder: Transformer attention + pointer layer over the vocab.

    ``embedding_token`` supplies the word-embedding table; when weight
    sharing is enabled the output projection reuses it directly.
    """
    super().__init__()
    self.args = args
    self.config = config
    self.dict = dictionary
    self.embedding_token = embedding_token
    decoder_dim = args.decoder_dim
    hidden_size = args.hidden_size
    # With reduce_dim active, answers are projected down to decoder_dim.
    if args.reduce_dim > 0:
        self.linear_answer = nn.Linear(hidden_size, decoder_dim)
        self.ln_answer = LayerNorm(decoder_dim)
    self.attention_layer = TransformerDecoder(
        decoder_dim,
        self.args.decoder_head,
        decoder_dim * 4,
        args.decoder_layers,
        0.2,
    )
    self.pointer_layer = PointerDecoder(decoder_dim, decoder_dim, dropout=0.2)
    self.vocab_size = len(dictionary)
    self.out = nn.Linear(decoder_dim, self.vocab_size)
    # Initialize first, then tie weights -- tying after init keeps the
    # shared embedding matrix untouched by init_bert_weights.
    self.apply(self.init_bert_weights)
    if args.share_decoder_input_output_embed:
        if self.args.reduce_dim > 0:
            self.project = nn.Linear(decoder_dim, hidden_size)
        self.out.weight = embedding_token.word_embeddings.weight
def __init__(self, args):
    """Feed-forward block whose output projection starts at zero.

    Zeroing ``ff2.weight`` makes the block an identity at initialization
    (presumably added residually in ``forward`` -- confirm there).
    """
    super().__init__()
    # Fall back to relu when activation_fn is missing or falsy.
    act_name = getattr(args, 'activation_fn', 'relu') or "relu"
    self.activation_fn = utils.get_activation_fn(activation=act_name)
    embed_dim = args.decoder_embed_dim
    ffn_dim = args.decoder_ffn_embed_dim
    self.norm = LayerNorm(embed_dim, export=False)
    self.ff1 = torch.nn.Linear(embed_dim, ffn_dim)
    self.ff2 = torch.nn.Linear(ffn_dim, embed_dim)
    self.ff2.weight.data.zero_()
def __init__(
    self,
    embedding_dim: int = 768,
    ffn_embedding_dim: int = 3072,
    num_attention_heads: int = 8,
    dropout: float = 0.1,
    attention_dropout: float = 0.1,
    activation_dropout: float = 0.1,
    activation_fn: str = "relu",
    layer_norm_first: bool = False,
) -> None:
    """Transformer encoder layer (wav2vec style): self-attention + FFN.

    Fix: the three dimension/head-count parameters were annotated
    ``float``; they are integers (defaults unchanged, so the interface is
    fully backward compatible).

    Args:
        embedding_dim: model width.
        ffn_embedding_dim: hidden width of the position-wise FFN.
        num_attention_heads: number of self-attention heads.
        dropout: dropout applied around attention/FFN outputs.
        attention_dropout: dropout on attention weights.
        activation_dropout: dropout after the FFN activation.
        activation_fn: activation name resolved by utils.get_activation_fn.
        layer_norm_first: pre-norm vs post-norm ordering (presumably
            consumed in ``forward`` -- confirm there).
    """
    super().__init__()
    self.embedding_dim = embedding_dim
    self.dropout = dropout
    self.activation_dropout = activation_dropout
    self.activation_fn = utils.get_activation_fn(activation_fn)
    self.self_attn = MultiheadAttention(
        self.embedding_dim,
        num_attention_heads,
        dropout=attention_dropout,
        self_attention=True,
    )
    # Three independent dropout modules: post-attention, post-activation,
    # and post-FFN respectively.
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(self.activation_dropout)
    self.dropout3 = nn.Dropout(dropout)
    self.layer_norm_first = layer_norm_first
    # Layer norm associated with the self-attention sub-block.
    self.self_attn_layer_norm = LayerNorm(self.embedding_dim)
    self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
    self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
    # Layer norm associated with the position-wise feed-forward sub-block.
    self.final_layer_norm = LayerNorm(self.embedding_dim)
def __init__(self, args):
    """Wav2Vec-based encoder with an adaptor and selective fine-tuning.

    Parameters of the underlying w2v model are frozen unless matched by
    ``args.finetune_w2v_params`` (decided by ``need_finetuning``).
    """
    super().__init__(None)
    self.w2v_encoder = Wav2VecEncoder(args)
    # v0 architecture predates the adaptor projection layer norm.
    self.is_v0_arch = not args.adaptor_proj
    self.w2v_proj_ln = None
    if not self.is_v0_arch and self.w2v_encoder.proj is not None:
        self.w2v_proj_ln = LayerNorm(args.decoder_embed_dim)
    self.adaptor = self.build_adaptor(args)
    self.num_updates = 0
    self.freezing_updates = args.w2v_freezing_updates
    self.finetuning_params = args.finetune_w2v_params
    # Selectively enable gradients on the pretrained w2v parameters.
    for name, param in self.w2v_encoder.w2v_model.named_parameters():
        param.requires_grad = need_finetuning(self.finetuning_params, name)
def __init__(self, in_dim, out_dim, n_layers=3, kernel_size=3, stride=2,
             add_layernorm=False):
    """Strided Conv1d stack, optionally followed by per-layer LayerNorms.

    Convs output ``out_dim * 2`` channels (presumably GLU-halved in
    ``forward`` -- confirm there).
    """
    super().__init__()
    convs = []
    for idx in range(n_layers):
        src_dim = in_dim if idx == 0 else out_dim
        convs.append(
            nn.Conv1d(src_dim, out_dim * 2, kernel_size,
                      stride=stride, padding=kernel_size // 2)
        )
    self.layers = nn.ModuleList(convs)
    # One LayerNorm per conv layer when requested; None otherwise.
    if add_layernorm:
        self.layernorms = nn.ModuleList(
            LayerNorm(out_dim) for _ in range(n_layers)
        )
    else:
        self.layernorms = None
    self.stride = stride
def __init__(self, args):
    """Transformer encoder with a convolutional positional embedding
    (wav2vec 2.0 style)."""
    super().__init__()
    self.dropout = args.dropout
    self.embedding_dim = args.encoder_embed_dim
    # Grouped conv over time provides relative positional information.
    self.pos_conv = nn.Conv1d(
        self.embedding_dim,
        self.embedding_dim,
        kernel_size=args.conv_pos,
        padding=args.conv_pos // 2,
        groups=args.conv_pos_groups,
    )
    # Init std per the wav2vec 2.0 recipe; the original formula
    # sqrt(4 * (1 - dropout) / (k * d)) is evaluated with dropout fixed
    # at 0, which reduces to sqrt(4 / (k * d)).
    std = math.sqrt(4.0 / (args.conv_pos * self.embedding_dim))
    nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
    nn.init.constant_(self.pos_conv.bias, 0)
    self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
    self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU())
    # Stack of identical encoder layers.
    self.layers = nn.ModuleList(
        TransformerSentenceEncoderLayer(
            embedding_dim=self.embedding_dim,
            ffn_embedding_dim=args.encoder_ffn_embed_dim,
            num_attention_heads=args.encoder_attention_heads,
            dropout=self.dropout,
            attention_dropout=args.attention_dropout,
            activation_dropout=args.activation_dropout,
            activation_fn=args.activation_fn,
            layer_norm_first=args.layer_norm_first,
        )
        for _ in range(args.encoder_layers)
    )
    self.layer_norm_first = args.layer_norm_first
    self.layer_norm = LayerNorm(self.embedding_dim)
    self.layerdrop = args.encoder_layerdrop
    self.apply(init_bert_params)
def __init__(self, config, *inputs, **kwargs):
    """BERT-backed encoder that can shrink its output width via reduce_dim.

    Requires ``kwargs["args"]``; mutates it so downstream modules see the
    resolved ``decoder_dim`` / ``hidden_size``.
    """
    super().__init__(config)
    self.config = config
    args = kwargs["args"]
    self.is_defined_position = args.defined_position
    self.max_source_positions = args.max_source_positions
    self.bert = BertPreTrainedModel(config)
    self.reduce_dim = args.reduce_dim
    # When reduce_dim is set, project BERT hidden states down to it;
    # otherwise the decoder runs at BERT's native width.
    if self.reduce_dim > 0:
        decoder_dim = self.reduce_dim
        self.linear_context = nn.Linear(config.hidden_size, decoder_dim)
        self.ln_context = LayerNorm(decoder_dim)
    else:
        decoder_dim = config.hidden_size
    # Propagate the resolved dimensions back onto args (intentional
    # side effect relied on by other modules built from the same args).
    args.decoder_dim = decoder_dim
    args.hidden_size = config.hidden_size
    self.apply(self.init_bert_weights)
def __init__(self, args):
    """wav2vec-2.0-style model: conv feature extractor, masking config,
    optional input/target quantizers, and a Transformer encoder.

    All hyper-parameters come from ``args``; the submodules
    (ConvFeatureExtractionModel, GumbelVectorQuantizer, TransformerEncoder,
    LayerNorm) are project-defined.
    """
    super().__init__()
    self.args = args
    # NOTE(review): eval() on a config string -- assumes trusted config.
    # ast.literal_eval would be safer for a literal like "[(512,10,5), ...]".
    feature_enc_layers = eval(args.conv_feature_layers)
    # Channel count of the last conv layer (first field of its spec tuple).
    self.embed = feature_enc_layers[-1][0]
    self.feature_extractor = ConvFeatureExtractionModel(
        conv_layers=feature_enc_layers,
        dropout=0.0,
        mode=args.extractor_mode,
        conv_bias=args.conv_bias,
    )
    # Project extractor output to encoder width unless the dims already
    # match, or input quantization will produce encoder-width features.
    self.post_extract_proj = (
        nn.Linear(self.embed, args.encoder_embed_dim)
        if self.embed != args.encoder_embed_dim and not args.quantize_input
        else None
    )
    # Time-axis masking hyper-parameters (consumed elsewhere).
    self.mask_prob = args.mask_prob
    self.mask_selection = args.mask_selection
    self.mask_other = args.mask_other
    self.mask_length = args.mask_length
    self.no_mask_overlap = args.no_mask_overlap
    self.mask_min_space = args.mask_min_space
    # Channel-axis masking hyper-parameters.
    self.mask_channel_prob = args.mask_channel_prob
    self.mask_channel_selection = args.mask_channel_selection
    self.mask_channel_other = args.mask_channel_other
    self.mask_channel_length = args.mask_channel_length
    self.no_mask_channel_overlap = args.no_mask_channel_overlap
    self.mask_channel_min_space = args.mask_channel_min_space
    self.dropout_input = nn.Dropout(args.dropout_input)
    self.dropout_features = nn.Dropout(args.dropout_features)
    self.feature_grad_mult = args.feature_grad_mult
    # Quantizers default to None; built below depending on args.
    self.quantizer = None
    self.input_quantizer = None
    # Negative-sampling configuration for the contrastive objective.
    self.n_negatives = args.num_negatives
    self.cross_sample_negatives = args.cross_sample_negatives
    self.codebook_negatives = args.codebook_negatives
    self.negatives_from_everywhere = args.negatives_from_everywhere
    self.logit_temp = args.logit_temp
    final_dim = args.final_dim if args.final_dim > 0 else args.encoder_embed_dim
    if args.quantize_targets:
        vq_dim = args.latent_dim if args.latent_dim > 0 else final_dim
        # NOTE(review): latent_temp is also eval'd from a string --
        # presumably a tuple literal like "(2, 0.5, 0.999995)"; confirm.
        self.quantizer = GumbelVectorQuantizer(
            dim=self.embed,
            num_vars=args.latent_vars,
            temp=eval(args.latent_temp),
            groups=args.latent_groups,
            combine_groups=False,
            vq_dim=vq_dim,
            time_first=True,
        )
        self.project_q = nn.Linear(vq_dim, final_dim)
    else:
        self.project_q = nn.Linear(self.embed, final_dim)
    if args.quantize_input:
        # Reuse the target quantizer when requested and available
        # (requires quantize_targets to have built it above).
        if args.same_quantizer and self.quantizer is not None:
            vq_dim = final_dim
            self.input_quantizer = self.quantizer
        else:
            vq_dim = (
                args.latent_dim if args.latent_dim > 0 else args.encoder_embed_dim
            )
            self.input_quantizer = GumbelVectorQuantizer(
                dim=self.embed,
                num_vars=args.latent_vars,
                temp=eval(args.latent_temp),
                groups=args.latent_groups,
                combine_groups=False,
                vq_dim=vq_dim,
                time_first=True,
            )
        self.project_inp = nn.Linear(vq_dim, args.encoder_embed_dim)
    # Learned embedding substituted at masked timesteps.
    self.mask_emb = nn.Parameter(
        torch.FloatTensor(args.encoder_embed_dim).uniform_()
    )
    self.encoder = TransformerEncoder(args)
    self.layer_norm = LayerNorm(self.embed)
    # Optional GLU transform applied to targets.
    self.target_glu = None
    if args.target_glu:
        self.target_glu = nn.Sequential(
            nn.Linear(final_dim, final_dim * 2), nn.GLU()
        )
    self.final_proj = nn.Linear(args.encoder_embed_dim, final_dim)
def __init__(
    self,
    padding_idx: int,
    vocab_size: int,
    num_encoder_layers: int = 6,
    embedding_dim: int = 768,
    ffn_embedding_dim: int = 3072,
    num_attention_heads: int = 8,
    dropout: float = 0.1,
    attention_dropout: float = 0.1,
    activation_dropout: float = 0.1,
    layerdrop: float = 0.0,
    max_seq_len: int = 256,
    num_segments: int = 2,
    use_position_embeddings: bool = True,
    offset_positions_by_padding: bool = True,
    encoder_normalize_before: bool = False,
    apply_bert_init: bool = False,
    activation_fn: str = "relu",
    learned_pos_embedding: bool = True,
    embed_scale: float = None,  # NOTE(review): effectively Optional[float]
    freeze_embeddings: bool = False,
    n_trans_layers_to_freeze: int = 0,
    export: bool = False,
    traceable: bool = False,
    q_noise: float = 0.0,
    qn_block_size: int = 8,
) -> None:
    """BERT-style sentence encoder: token/segment/position embeddings
    plus a stack of Transformer layers, with optional freezing.

    Raises nothing itself; relies on project helpers (FairseqDropout,
    PositionalEmbedding, LayerDropModuleList, LayerNorm, ...).
    """
    super().__init__()
    self.padding_idx = padding_idx
    self.vocab_size = vocab_size
    self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__)
    self.layerdrop = layerdrop
    self.max_seq_len = max_seq_len
    self.embedding_dim = embedding_dim
    self.num_segments = num_segments
    self.use_position_embeddings = use_position_embeddings
    self.apply_bert_init = apply_bert_init
    self.learned_pos_embedding = learned_pos_embedding
    self.traceable = traceable
    self.tpu = False  # whether we're on TPU
    self.embed_tokens = self.build_embedding(
        self.vocab_size, self.embedding_dim, self.padding_idx
    )
    # Optional scalar multiplier on token embeddings (None = no scaling).
    self.embed_scale = embed_scale
    # Quantization noise on a dummy linear when q_noise > 0.
    if q_noise > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(self.embedding_dim, self.embedding_dim, bias=False),
            q_noise,
            qn_block_size,
        )
    else:
        self.quant_noise = None
    # Segment (token-type) embeddings, if segments are configured.
    self.segment_embeddings = (
        nn.Embedding(self.num_segments, self.embedding_dim, padding_idx=None)
        if self.num_segments > 0
        else None
    )
    # Positional embeddings; offsetting by padding_idx matches fairseq's
    # convention for learned positions.
    self.embed_positions = (
        PositionalEmbedding(
            self.max_seq_len,
            self.embedding_dim,
            padding_idx=(self.padding_idx if offset_positions_by_padding else None),
            learned=self.learned_pos_embedding,
        )
        if self.use_position_embeddings
        else None
    )
    # LayerDropModuleList stochastically skips layers during training.
    if self.layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([
        self.build_transformer_sentence_encoder_layer(
            embedding_dim=self.embedding_dim,
            ffn_embedding_dim=ffn_embedding_dim,
            num_attention_heads=num_attention_heads,
            dropout=self.dropout_module.p,
            attention_dropout=attention_dropout,
            activation_dropout=activation_dropout,
            activation_fn=activation_fn,
            export=export,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
        )
        for _ in range(num_encoder_layers)
    ])
    if encoder_normalize_before:
        self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export)
    else:
        self.emb_layer_norm = None
    # Apply initialization of model params after building the model
    if self.apply_bert_init:
        self.apply(init_bert_params)
    def freeze_module_params(m):
        # Disable gradients for every parameter of m (no-op on None).
        if m is not None:
            for p in m.parameters():
                p.requires_grad = False
    if freeze_embeddings:
        freeze_module_params(self.embed_tokens)
        freeze_module_params(self.segment_embeddings)
        freeze_module_params(self.embed_positions)
        freeze_module_params(self.emb_layer_norm)
    # Freeze the bottom n transformer layers, if requested.
    for layer in range(n_trans_layers_to_freeze):
        freeze_module_params(self.layers[layer])
def __init__(self, d_model, **kargs):
    """Attention sublayer and feed-forward sublayer, each paired with its
    own LayerNorm. Extra kwargs are forwarded to both sublayers.
    """
    # Explicit nn.Module init (bypasses any intermediate super().__init__).
    nn.Module.__init__(self)
    sublayers = (
        MultiHeadSeqAttention(d_model=d_model, **kargs),
        LayerNorm(d_model),
        FeedForwardLayer(d_model=d_model, **kargs),
        LayerNorm(d_model),
    )
    # Attribute assignment order preserved (attn, norm1, ff, norm2) so the
    # state_dict/parameter ordering matches the original.
    self.attn, self.norm1, self.ff, self.norm2 = sublayers
def __init__(self, layer, d_model, dropout_ratio):
    """Wrap ``layer`` with dropout and a LayerNorm; the residual/compose
    logic presumably lives in ``forward`` -- confirm there.
    """
    super().__init__()
    self.layer = layer
    self.dropout = nn.Dropout(p=dropout_ratio)
    self.layernorm = LayerNorm(d_model)