def __init__(self, args):
    super().__init__()
    self.encoders = []
    for i in range(0, 4):
        self.encoders.append({})
        self.encoders[i]['embed_dim'] = args.encoder_embed_dim
        self.encoders[i]['quant_noise'] = getattr(args, "quant_noise_pq", 0)
        self.encoders[i]['quant_noise_block_size'] = getattr(args, "quant_noise_pq_block_size", 8)
        self.encoders[i]['self_attn'] = self.build_self_attention(self.encoders[i]['embed_dim'], i, args)
        self.encoders[i]['self_attn_layer_norm'] = LayerNorm(self.encoders[i]['embed_dim'])
        self.encoders[i]['dropout_module'] = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
        self.encoders[i]['activation_fn'] = utils.get_activation_fn(
            activation=getattr(args, "activation_fn", "relu")
        )
        activation_dropout_p = getattr(args, "activation_dropout", 0)
        if activation_dropout_p == 0:
            # for backwards compatibility with models that use args.relu_dropout
            activation_dropout_p = getattr(args, "relu_dropout", 0)
        self.encoders[i]['activation_dropout_module'] = FairseqDropout(
            float(activation_dropout_p), module_name=self.__class__.__name__
        )
        self.encoders[i]['normalize_before'] = args.encoder_normalize_before
        self.encoders[i]['fc1'] = self.build_fc1(
            self.encoders[i]['embed_dim'],
            args.encoder_ffn_embed_dim,
            self.encoders[i]['quant_noise'],
            self.encoders[i]['quant_noise_block_size'],
        )
        self.encoders[i]['fc2'] = self.build_fc2(
            args.encoder_ffn_embed_dim,
            self.encoders[i]['embed_dim'],
            self.encoders[i]['quant_noise'],
            self.encoders[i]['quant_noise_block_size'],
        )
        self.encoders[i]['final_layer_norm'] = LayerNorm(self.encoders[i]['embed_dim'])
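# Hedged sketch (not part of the original): the constructor above keeps its
# sub-layers in a plain Python list of dicts, so PyTorch never registers them
# as submodules and their weights are invisible to .parameters(), .to() and
# state_dict(). If registration is desired, one possible variant wraps the
# nn.Module values in an nn.ModuleDict inside an nn.ModuleList, while plain
# settings (ints, booleans, callables) stay as ordinary attributes. The class
# name and dimensions below are illustrative assumptions.
import torch.nn as nn

class RegisteredEncoderStack(nn.Module):
    def __init__(self, embed_dim=512, ffn_dim=2048, num_encoders=4):
        super().__init__()
        self.encoders = nn.ModuleList(
            nn.ModuleDict({
                "self_attn_layer_norm": nn.LayerNorm(embed_dim),
                "fc1": nn.Linear(embed_dim, ffn_dim),
                "fc2": nn.Linear(ffn_dim, embed_dim),
                "final_layer_norm": nn.LayerNorm(embed_dim),
            })
            for _ in range(num_encoders)
        )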
def __init__(self, cfg, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False):
    super().__init__()
    self.embed_dim = cfg.decoder.embed_dim
    self.dropout_module = FairseqDropout(
        cfg.dropout, module_name=self.__class__.__name__
    )
    self.quant_noise = cfg.quant_noise.pq
    self.quant_noise_block_size = cfg.quant_noise.pq_block_size
    self.cross_self_attention = cfg.cross_self_attention

    self.self_attn = self.build_self_attention(
        self.embed_dim,
        cfg,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
    )
    self.activation_fn = utils.get_activation_fn(activation=cfg.activation_fn)
    activation_dropout_p = cfg.activation_dropout
    if activation_dropout_p == 0:
        # for backwards compatibility with models that use cfg.relu_dropout
        activation_dropout_p = cfg.relu_dropout or 0
    self.activation_dropout_module = FairseqDropout(
        float(activation_dropout_p), module_name=self.__class__.__name__
    )
    self.normalize_before = cfg.decoder.normalize_before
    self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)

    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = self.build_encoder_attention(self.embed_dim, cfg)
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)

    self.fc1 = self.build_fc1(
        self.embed_dim,
        cfg.decoder.ffn_embed_dim,
        self.quant_noise,
        self.quant_noise_block_size,
    )
    self.fc2 = self.build_fc2(
        cfg.decoder.ffn_embed_dim,
        self.embed_dim,
        self.quant_noise,
        self.quant_noise_block_size,
    )

    self.final_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)
    self.need_attn = True
    self.onnx_trace = False
def __init__(
    self,
    embedding_dim: int = 768,
    ffn_embedding_dim: int = 3072,
    num_attention_heads: int = 8,
    dropout: float = 0.1,
    attention_dropout: float = 0.1,
    activation_dropout: float = 0.1,
    activation_fn: str = "relu",
    export: bool = False,
    q_noise: float = 0.0,
    qn_block_size: int = 8,
    init_fn: Callable = None,
) -> None:
    super().__init__()

    if init_fn is not None:
        init_fn()

    # Initialize parameters
    self.embedding_dim = embedding_dim
    self.dropout_module = FairseqDropout(
        dropout, module_name=self.__class__.__name__
    )
    self.activation_dropout_module = FairseqDropout(
        activation_dropout, module_name=self.__class__.__name__
    )

    # Initialize blocks
    self.activation_fn = utils.get_activation_fn(activation_fn)
    self.self_attn = self.build_self_attention(
        self.embedding_dim,
        num_attention_heads,
        dropout=attention_dropout,
        self_attention=True,
        q_noise=q_noise,
        qn_block_size=qn_block_size,
    )

    # layer norm associated with the self attention layer
    self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export)

    self.fc1 = self.build_fc1(
        self.embedding_dim,
        ffn_embedding_dim,
        q_noise=q_noise,
        qn_block_size=qn_block_size,
    )
    self.fc2 = self.build_fc2(
        ffn_embedding_dim,
        self.embedding_dim,
        q_noise=q_noise,
        qn_block_size=qn_block_size,
    )

    # layer norm associated with the position wise feed-forward NN
    self.final_layer_norm = LayerNorm(self.embedding_dim, export=export)
def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    dropout=0.0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    self_attention=False,
    encoder_decoder_attention=False,
    q_noise=0.0,
    qn_block_size=8,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

    self.num_heads = num_heads
    self.dropout_module = FairseqDropout(
        dropout, module_name=self.__class__.__name__
    )

    self.head_dim = embed_dim // num_heads
    assert (
        self.head_dim * num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5

    self.self_attention = self_attention
    self.encoder_decoder_attention = encoder_decoder_attention

    assert not self.self_attention or self.qkv_same_dim, (
        "Self-attention requires query, key and value to be of the same size"
    )

    self.k_proj = quant_noise(
        nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.v_proj = quant_noise(
        nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.q_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.out_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )

    if add_bias_kv:
        self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
        self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
    else:
        self.bias_k = self.bias_v = None

    self.add_zero_attn = add_zero_attn

    self.reset_parameters()

    self.onnx_trace = False
    self.enable_fairseq_version = True
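# Illustration only (assumed values, not from the source): the head-dimension
# arithmetic used by the attention constructor above. embed_dim must split
# evenly across num_heads, and queries are scaled by head_dim ** -0.5 before
# the dot-product attention.
embed_dim, num_heads = 512, 8
head_dim = embed_dim // num_heads            # 64
assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
scaling = head_dim ** -0.5                   # 1 / sqrt(64) = 0.125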
def __init__(self, args):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.self_attn = MultiheadAttention(
        self.embed_dim,
        args.encoder_attention_heads,
        dropout=args.attention_dropout,
        self_attention=True,
    )
    self.self_attn_layer_norm = LayerNorm(self.embed_dim)
    self.dropout_module = FairseqDropout(
        args.dropout, module_name=self.__class__.__name__
    )
    # replace with gelu
    # self.activation_fn = utils.get_activation_fn(
    #     activation=getattr(args, 'activation_fn', 'relu') or "relu"
    # )
    # activation_dropout_p = getattr(args, "activation_dropout", 0) or 0
    # if activation_dropout_p == 0:
    #     # for backwards compatibility with models that use args.relu_dropout
    #     activation_dropout_p = getattr(args, "relu_dropout", 0) or 0
    # self.activation_dropout_module = FairseqDropout(
    #     float(activation_dropout_p), module_name=self.__class__.__name__
    # )
    self.normalize_before = args.encoder_normalize_before
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.fc1 = nn.Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = nn.Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    # self.layer_norms = nn.ModuleList([BertLayerNorm(self.embed_dim) for i in range(2)])
    self.final_layer_norm = LayerNorm(self.embed_dim)
def __init__(
    self,
    input_size,
    kernel_size=1,
    padding_l=None,
    weight_softmax=False,
    num_heads=1,
    weight_dropout=0.0,
    bias=False,
    renorm_padding=False,
    conv_bias=False,
    query_size=None,
):
    super(DynamicconvLayer, self).__init__()
    self.input_size = input_size
    self.query_size = input_size if query_size is None else query_size
    self.kernel_size = kernel_size
    self.padding_l = padding_l
    self.num_heads = num_heads
    self.weight_softmax = weight_softmax
    self.weight_dropout_module = FairseqDropout(
        weight_dropout, module_name=self.__class__.__name__
    )
    self.renorm_padding = renorm_padding
    self.bias = bias

    self.weight_linear = nn.Linear(input_size, num_heads * kernel_size, bias)
    if conv_bias:
        self.conv_bias = nn.Parameter(torch.Tensor(input_size))
    else:
        self.conv_bias = None
    self.reset_parameters()
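# Illustration only (assumed shapes, not from the source): weight_linear above
# predicts, for every timestep, num_heads dynamic convolution kernels of length
# kernel_size; with weight_softmax=True they are normalised over the kernel axis.
import torch
import torch.nn as nn

T, B, input_size, num_heads, kernel_size = 10, 2, 512, 8, 3
weight_linear = nn.Linear(input_size, num_heads * kernel_size, bias=False)
x = torch.randn(T, B, input_size)
weights = weight_linear(x).view(T, B, num_heads, kernel_size)
weights = torch.softmax(weights, dim=-1)     # the weight_softmax=True case
print(weights.shape)                         # torch.Size([10, 2, 8, 3])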
def __init__(self, args, input_dim, middle_dim, output_dim):
    super(ClassificationLayer, self).__init__()
    self.fc_1 = nn.Linear(input_dim, middle_dim)
    self.fc_2 = nn.Linear(middle_dim, output_dim)
    self.dropout = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
    self.grad_reversal_scaling_factor = args.grad_reversal_scaling_factor
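# Hedged sketch (not part of the original): grad_reversal_scaling_factor above
# suggests a gradient-reversal layer of the kind used in domain-adversarial
# training. A minimal autograd.Function of that sort is sketched below; the
# name _GradReverse and its placement are assumptions.
import torch

class _GradReverse(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, scale):
        ctx.scale = scale
        return x.view_as(x)          # identity on the forward pass

    @staticmethod
    def backward(ctx, grad_output):
        # flip and scale the gradient on the backward pass
        return -ctx.scale * grad_output, None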
def __init__(self, args):
    super().__init__()
    self.blockatt = (
        args.use_module_communication == "True"
        or args.use_module_communication == "true"
    )
    self.embed_dim = args.encoder_embed_dim
    self.quant_noise = getattr(args, "quant_noise_pq", 0)
    self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8)

    print('encoder embed_dim', self.embed_dim)

    self.nb = args.num_modules
    self.norm_blocks = args.num_modules

    # should divide embed_dim by nb. Then raise embed_dim in args
    self.self_attn = self.build_self_attention(self.embed_dim, args)
    self.self_attn_layer_norm = NormLayer(
        self.norm_blocks, self.embed_dim // self.norm_blocks
    )
    self.dropout_module = FairseqDropout(
        args.dropout, module_name=self.__class__.__name__
    )
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, "activation_fn", "relu")
    )

    print("SETUP TRANSFORMER LAYER", 'blocks', self.nb)

    activation_dropout_p = getattr(args, "activation_dropout", 0)
    if activation_dropout_p == 0:
        # for backwards compatibility with models that use args.relu_dropout
        activation_dropout_p = getattr(args, "relu_dropout", 0)
    self.activation_dropout_module = FairseqDropout(
        float(activation_dropout_p), module_name=self.__class__.__name__
    )
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = self.build_fc1(
        self.embed_dim,
        args.encoder_ffn_embed_dim,
        self.quant_noise,
        self.quant_noise_block_size,
    )
    self.fc2 = self.build_fc2(
        args.encoder_ffn_embed_dim,
        self.embed_dim,
        self.quant_noise,
        self.quant_noise_block_size,
    )
    self.final_layer_norm = NormLayer(
        self.norm_blocks, self.embed_dim // self.norm_blocks
    )

    if self.blockatt:
        self.comm = Attention(args.encoder_attention_heads, self.nb, self.embed_dim)
        self.comm_norm = NormLayer(
            self.norm_blocks, self.embed_dim // self.norm_blocks
        )
def __init__(self, args):
    super().__init__(args)
    self.domain_dropout = FairseqDropout(
        args.dropout, module_name=self.__class__.__name__
    )
    self.domain_projection = self.build_domain_projection(
        self.embed_dim,
        self.embed_dim,
        self.quant_noise,
        self.quant_noise_block_size,
    )
def __init__(
    self,
    embed_dim,
    ffn_embed_dim,
    nhead,
    dropout,
    attn_dropout,
    activation_dropout,
    normalize_before=True,
    activation_fn="relu",
    quant_noise=0,
    quant_noise_block_size=8,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.quant_noise = quant_noise
    self.quant_noise_block_size = quant_noise_block_size

    self.self_attn = self.build_self_attention(self.embed_dim, nhead, attn_dropout)
    self.self_attn_layer_norm = LayerNorm(self.embed_dim)
    self.dropout_module = FairseqDropout(
        dropout, module_name=self.__class__.__name__
    )
    self.activation_fn = utils.get_activation_fn(activation=activation_fn)
    activation_dropout_p = activation_dropout
    self.activation_dropout_module = FairseqDropout(
        float(activation_dropout_p), module_name=self.__class__.__name__
    )
    self.normalize_before = normalize_before
    self.fc1 = self.build_fc1(
        self.embed_dim,
        ffn_embed_dim,
        self.quant_noise,
        self.quant_noise_block_size,
    )
    self.fc2 = self.build_fc2(
        ffn_embed_dim,
        self.embed_dim,
        self.quant_noise,
        self.quant_noise_block_size,
    )
    self.final_layer_norm = LayerNorm(self.embed_dim)
def __init__(
    self,
    embed_dim,
    num_heads,
    dropout=0.0,
    bias=True,
    tie_kv=True,
    q_noise=0.0,
    qn_block_size=8,
    parallel=True,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.num_heads = num_heads
    self.parallel = parallel
    self.dropout_module = FairseqDropout(
        dropout, module_name=self.__class__.__name__
    )

    self.head_dim = embed_dim // num_heads
    assert (
        self.head_dim * num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5

    self.pq_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.q_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.pc_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    if tie_kv:
        self.c_proj = quant_noise(
            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
        )
        self.k_proj = self.v_proj = None
    else:
        self.k_proj = quant_noise(
            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
        )
        self.v_proj = quant_noise(
            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
        )
        self.c_proj = None
    self.out_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )

    self.reset_parameters()
    self.onnx_trace = False
    self.tpu = False
def __init__(self, args, no_encoder_attn=False):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.dropout_module = FairseqDropout(
        args.dropout, module_name=self.__class__.__name__
    )
    self.self_attn = MultiheadAttention(
        self.embed_dim,
        args.decoder_attention_heads,
        dropout=args.attention_dropout,
    )
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    '''
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, "activation_fn", "gelu")
    )
    self.activation_dropout = getattr(args, "activation_dropout", 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, "relu_dropout", 0)
    '''
    self.normalize_before = args.decoder_normalize_before

    export = getattr(args, "char_inputs", False)
    # self.self_attn_layer_norm = BertLayerNorm(self.embed_dim)
    self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
        )
        # self.encoder_attn_layer_norm = BertLayerNorm(self.embed_dim)
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

    self.fc1 = nn.Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = nn.Linear(args.decoder_ffn_embed_dim, self.embed_dim)

    # self.final_layer_norm = BertLayerNorm(self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
    self.need_attn = True
    self.onnx_trace = False
def __init__(
    self,
    cfg,
    return_fc=False,
    positional_embedding: Optional[RelativePositionalEmbedding] = None,
):
    super().__init__()
    self.cfg = cfg
    self.return_fc = return_fc
    self.embed_dim = cfg.encoder.embed_dim
    self.quant_noise = cfg.quant_noise.pq
    self.quant_noise_block_size = cfg.quant_noise.pq_block_size

    self.ffn1 = FeedForwardModule(
        input_feat=self.embed_dim,
        hidden_units=cfg.encoder.ffn_embed_dim,
        dropout1=cfg.activation_dropout,
        dropout2=cfg.dropout,
        activation_fn="swish",
    )

    self.self_attn = self.build_self_attention(
        self.embed_dim, cfg, positional_embedding=positional_embedding
    )
    self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)
    self.dropout_module = FairseqDropout(
        cfg.dropout, module_name=self.__class__.__name__
    )

    self.conv_module = ConvolutionModule(
        embed_dim=self.embed_dim,
        channels=self.embed_dim,
        depthwise_kernel_size=cfg.encoder.depthwise_conv_kernel_size,
        dropout=cfg.dropout,
        activation_fn="swish",
    )

    self.ffn2 = FeedForwardModule(
        input_feat=self.embed_dim,
        hidden_units=cfg.encoder.ffn_embed_dim,
        dropout1=cfg.activation_dropout,
        dropout2=cfg.dropout,
        activation_fn="swish",
    )
    self.final_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)
def __init__(self, args, dictionary, embed_tokens, embed_scale=None, left_pad=False):
    super().__init__(dictionary)
    self.dropout = args.dropout

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions
    self.eos_idx = dictionary.eos()

    self.dropout_module = FairseqDropout(
        args.dropout, module_name=self.__class__.__name__
    )

    self.embed_tokens = embed_tokens
    self.embed_scale = (
        math.sqrt(args.encoder_embed_dim) if embed_scale is None else embed_scale
    )
    self.embed_positions = (
        PositionalEmbedding(
            args.max_source_positions,
            embed_dim,
            self.padding_idx,
            # left_pad=left_pad,
            learned=args.encoder_learned_pos,
        )
        if not args.no_enc_token_positional_embeddings
        else None
    )
    self.embed_lengths = nn.Embedding(args.max_target_positions, embed_dim)
    nn.init.normal_(self.embed_lengths.weight, mean=0, std=0.02)

    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None

    self.layers = nn.ModuleList([])
    self.layers.extend(
        [TransformerEncoderLayer(args) for i in range(args.encoder_layers)]
    )
    self.register_buffer('version', torch.Tensor([2]))

    self.normalize = args.encoder_normalize_before
    self.layer_norm = None
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)