def __init__(self, args, src_dict, dst_dict, embed_tokens, left_pad=False):
    super().__init__(dst_dict)
    self.dropout = args.dropout

    embed_dim = embed_tokens.embedding_dim
    self.embed_tokens = embed_tokens

    self.lstm_units = args.decoder_lstm_units
    self.attention_dim = args.encoder_embed_dim
    self.num_layers = args.decoder_layers

    # First LSTM layer consumes the target token embeddings.
    self.initial_rnn_layer = nn.LSTM(
        input_size=embed_dim, hidden_size=self.lstm_units)

    # Project LSTM states to the attention dimension when the two differ.
    self.proj_layer = None
    if self.lstm_units != self.attention_dim:
        self.proj_layer = fairseq_transformer.Linear(
            self.lstm_units, self.attention_dim)

    self.attention = fairseq_transformer.MultiheadAttention(
        self.attention_dim,
        args.decoder_attention_heads,
        dropout=args.attention_dropout,
    )

    # Subsequent LSTM layers consume the previous hidden state concatenated
    # with the attention context.
    self.extra_rnn_layers = nn.ModuleList([])
    for _ in range(self.num_layers - 1):
        self.extra_rnn_layers.append(
            nn.LSTM(
                input_size=self.lstm_units + self.attention_dim,
                hidden_size=self.lstm_units,
            ))

    out_embed_dim = args.decoder_out_embed_dim
    self.bottleneck_layer = fairseq_transformer.Linear(
        self.attention_dim + self.lstm_units, out_embed_dim)

    self.embed_out = nn.Parameter(
        torch.Tensor(len(dst_dict), out_embed_dim))
    nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5)

    self.vocab_reduction_module = None
    if args.vocab_reduction_params:
        self.vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16)

    self.onnx_trace = False
def _init_components(self, args, src_dict, dst_dict, embed_tokens):
    self.initial_rnn_layer = nn.LSTM(
        input_size=self.initial_input_dim, hidden_size=self.lstm_units
    )

    self.proj_encoder_layer = None
    if self.attention_dim != self.encoder_output_dim:
        self.proj_encoder_layer = fairseq_transformer.Linear(
            self.encoder_output_dim, self.attention_dim
        )

    self.proj_layer = None
    if self.lstm_units != self.attention_dim:
        self.proj_layer = fairseq_transformer.Linear(
            self.lstm_units, self.attention_dim
        )

    self.attention = MultiheadAttention(
        self.attention_dim,
        self.num_attention_heads,
        dropout=args.attention_dropout,
        encoder_decoder_attention=True,
    )

    self.extra_rnn_layers = nn.ModuleList([])
    for _ in range(self.num_layers - 1):
        self.extra_rnn_layers.append(
            nn.LSTM(input_size=self.input_dim, hidden_size=self.lstm_units)
        )

    self.bottleneck_layer = None
    if self.bottleneck_dim is not None:
        self.out_embed_dim = self.bottleneck_dim
        self.bottleneck_layer = fairseq_transformer.Linear(
            self.input_dim, self.out_embed_dim
        )
    else:
        self.out_embed_dim = self.input_dim

    self.embed_out = nn.Parameter(torch.Tensor(len(dst_dict), self.out_embed_dim))
    nn.init.normal_(self.embed_out, mean=0, std=self.out_embed_dim ** -0.5)

    self.vocab_reduction_module = None
    if args.vocab_reduction_params:
        self.vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16
        )

    self.onnx_trace = False
def __init__(self, args, dictionary, embed_tokens, left_pad=True):
    super().__init__(dictionary)
    self.dropout = args.dropout

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024,
        embed_dim,
        self.padding_idx,
        left_pad=left_pad,
        learned=args.encoder_learned_pos,
    )
    self.all_layer_position_embed = args.all_layer_position_embed

    self.layers = nn.ModuleList([])
    self.layers.extend([
        fairseq_transformer.TransformerEncoderLayer(args)
        for i in range(args.encoder_layers)
    ])

    self.output_fc = None
    if args.encoder_embed_dim != args.decoder_embed_dim:
        self.output_fc = fairseq_transformer.Linear(
            embed_dim, args.decoder_embed_dim)

    # Variable tracker
    self.tracker = VariableTracker()

    # Initialize adversarial mode
    self.set_gradient_tracking_mode(False)
def __init__(self, args):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.cross_self_attention = getattr(args, "cross_self_attention", False)

    self.avg_attn = AverageAttention(self.embed_dim, dropout=args.attention_dropout)

    # Unlike the original paper, we use a single gate.
    self.aan_gating_fc = fairseq_transformer.Linear(
        self.embed_dim * 2, self.embed_dim)

    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, "activation_fn", "relu"))
    self.activation_dropout = getattr(args, "activation_dropout", 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, "relu_dropout", 0)
    self.normalize_before = args.decoder_normalize_before

    # use LayerNorm rather than FusedLayerNorm for exporting.
    # char_inputs can be used to determine this.
    # TODO remove this once we update apex with the fix
    export = getattr(args, "char_inputs", False)
    self.avg_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

    self.encoder_attn = MultiheadAttention(
        self.embed_dim,
        args.decoder_attention_heads,
        kdim=getattr(args, "encoder_embed_dim", None),
        vdim=getattr(args, "encoder_embed_dim", None),
        dropout=args.attention_dropout,
        encoder_decoder_attention=True,
    )
    self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

    self.fc1 = fairseq_transformer.Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = fairseq_transformer.Linear(args.decoder_ffn_embed_dim, self.embed_dim)

    self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
    self.need_attn = True

    self.onnx_trace = False
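# A hedged sketch (an assumption about the forward pass, which is defined
# elsewhere in this class) of how the single gate produced by aan_gating_fc
# could blend the layer input with the AverageAttention output. The names
# x and avg_out are hypothetical stand-ins for those two tensors.
import torch

def single_gate_blend(aan_gating_fc, x, avg_out):
    # One sigmoid gate replaces the separate input/forget gates of the
    # original Average Attention Network formulation.
    gate = torch.sigmoid(aan_gating_fc(torch.cat([x, avg_out], dim=-1)))
    return gate * x + (1.0 - gate) * avg_out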
def __init__(self, args, src_dict, dst_dict, embed_tokens):
    super().__init__(dst_dict)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed

    embed_dim = embed_tokens.embedding_dim
    padding_idx = embed_tokens.padding_idx

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024, embed_dim, padding_idx, learned=args.decoder_learned_pos)

    self.layers = nn.ModuleList([])
    self.layers.extend([
        fairseq_transformer.TransformerDecoderLayer(args)
        for i in range(args.decoder_layers)
    ])

    self.adaptive_softmax = None
    self.bottleneck_layer = None
    out_embed_dim = embed_dim
    if args.decoder_out_embed_dim is not None:
        assert (
            not args.share_all_embeddings
            and not args.share_decoder_input_output_embed
        ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
        self.bottleneck_layer = fairseq_transformer.Linear(
            embed_dim, args.decoder_out_embed_dim)
        out_embed_dim = args.decoder_out_embed_dim

    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dst_dict),
            out_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.dropout,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dst_dict), out_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5)

    self.vocab_reduction_module = None
    if args.vocab_reduction_params:
        assert (
            self.adaptive_softmax is None
        ), "vocabulary reduction not compatible with adaptive softmax!"
        self.vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16)

    self.onnx_trace = False
def _build_layer(self):
    self.src_len_norm = getattr(self.args, 'src_len_norm', 'sqrt')
    self.dwstack_proj_act = getattr(self.args, 'dwstack_proj_act', 'none')

    self.head_dim = self.embed_dim // self.num_heads
    assert (
        self.head_dim * self.num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim**-0.5

    # Packed query/key/value input projection (hence 3 * embed_dim output rows).
    self.in_proj_weight = Parameter(
        torch.Tensor(3 * self.embed_dim, self.embed_dim))
    if self.bias:
        self.in_proj_bias = Parameter(torch.Tensor(3 * self.embed_dim))
    else:
        self.register_parameter('in_proj_bias', None)
    self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.bias)

    if self.add_bias_kv:
        self.bias_k = Parameter(torch.Tensor(1, 1, self.embed_dim))
        self.bias_v = Parameter(torch.Tensor(1, 1, self.embed_dim))
    else:
        self.bias_k = self.bias_v = None

    self.add_zero_attn = self.add_zero_attn

    self.nstack_linear_layer = NstackLinear(
        self.head_dim, self.head_dim, False) if self.nstack_linear else None

    # Per-head projection used for the dwstack keys, with an optional non-linearity.
    self.dwstack_linear = transformer.Linear(self.embed_dim, self.num_heads)
    self.project_dwstack_key = lambda x: self.dwstack_linear(x)
    if self.dwstack_proj_act == 'sigmoid':
        self.project_dwstack_key = lambda x: self.dwstack_linear(x).sigmoid()
    elif self.dwstack_proj_act == 'tanh':
        self.project_dwstack_key = lambda x: self.dwstack_linear(x).tanh()

    self.hier_embed_positions = self.get_hier_embed_positions()

    self.embed_positions = PositionalEmbedding(
        self.args.max_source_positions,
        self.head_dim,
        self.padding_idx,
        left_pad=False,
        learned=self.nstack_pos_embed_learned,
    ) if self.nstack_pos_embed else None

    assert not (
        self.hier_embed_positions is not None and self.embed_positions is not None
    )

    self.reset_parameters()
    self.onnx_trace = False
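# A worked example of the head_dim / scaling arithmetic above, with assumed
# values (embed_dim = 512, num_heads = 8) rather than the model's actual args;
# the assertion only holds when embed_dim is an exact multiple of num_heads.
embed_dim, num_heads = 512, 8
head_dim = embed_dim // num_heads            # 512 // 8 = 64
scaling = head_dim ** -0.5                   # 1 / sqrt(64) = 0.125
assert head_dim * num_heads == embed_dim     # would fail for, e.g., embed_dim = 500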
def __init__(self, args, proj_to_decoder):
    super().__init__()
    self.layers = nn.ModuleList([])
    self.layers.extend([
        fairseq_transformer.TransformerEncoderLayer(args)
        for i in range(args.encoder_layers)
    ])
    self.output_fc = None
    if args.encoder_embed_dim != args.decoder_embed_dim and proj_to_decoder:
        self.output_fc = fairseq_transformer.Linear(
            args.encoder_embed_dim, args.decoder_embed_dim)
def __init__(self, args, num_embeddings, embedding_dim, padding_idx,
             dictionary, pretrain_path, freeze=True):
    super().__init__()
    self.args = args
    # self.pretrain_dim = getattr(args, 'pretrain_dim', 300)
    self.tune_epoch = getattr(args, 'tune_epoch', 10000000)
    self.bert_name = getattr(args, 'bert_name', 'bert-base-uncased')
    self.bert_layer = getattr(args, 'bert_layer', 11)
    self.freeze = freeze
    self.current_epoch = 0
    self.finetuning = False
    self.flip_switch = True
    self.embedding_dim = embedding_dim
    self.padding_idx = padding_idx
    self.num_embeddings = num_embeddings
    self.dictionary = dictionary
    self.pretrain_path = pretrain_path
    self.unknown_idx = None
    self.mask_factor = None
    """
    dictionary: {word: idx}
    bert: {word: idx}
    """
    self.pretrain_dim = self.get_pretrain_dim()
    self.index_remapping, self.bert_model = self.build_bert_dict_remapping()
    if self.embedding_dim != self.pretrain_dim:
        self.reproj = transformer.Linear(self.pretrain_dim, self.embedding_dim)
    else:
        self.reproj = lambda x: x
    self.embedding = Embedding(num_embeddings, self.embedding_dim, padding_idx)
    self.weight = self.embedding.weight
def __init__(self, args, num_embeddings, embedding_dim, padding_idx,
             dictionary, pretrain_path):
    super().__init__()
    self.args = args
    self.dropout = getattr(args, 'dropout', 0.0)
    self.pretrain_dim = getattr(args, 'pretrain_dim', 300)
    self.dropout_layer = nn.Dropout(self.dropout)
    self.embedding_dim = embedding_dim
    self.padding_idx = padding_idx
    self.num_embeddings = num_embeddings
    self.dictionary = dictionary
    self.pretrain_path = pretrain_path
    self.embedding = PretrainedEmbedding(
        num_embeddings, self.pretrain_dim, padding_idx, dictionary, pretrain_path)
    self.linear = transformer.Linear(self.pretrain_dim, embedding_dim, bias=False)
    self.layer = nn.Sequential(self.embedding, self.dropout_layer, self.linear)
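# A minimal sketch of the embedding pipeline above, with a plain nn.Embedding
# standing in for PretrainedEmbedding and assumed toy sizes (not the real
# ones): token ids -> pretrain_dim vectors -> dropout -> linear to embedding_dim.
import torch
import torch.nn as nn

num_embeddings, pretrain_dim, embedding_dim, padding_idx = 1000, 300, 512, 1
layer = nn.Sequential(
    nn.Embedding(num_embeddings, pretrain_dim, padding_idx=padding_idx),
    nn.Dropout(0.0),
    nn.Linear(pretrain_dim, embedding_dim, bias=False),
)
tokens = torch.tensor([[5, 7, 1]])                    # (batch, time) token ids
assert layer(tokens).shape == (1, 3, embedding_dim)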
def __init__(
    self,
    args,
    dictionary,
    embed_tokens,
    num_chars=50,
    embed_dim=32,
    char_cnn_params="[(128, 3), (128, 5)]",
    char_cnn_nonlinear_fn="tanh",
    char_cnn_pool_type="max",
    char_cnn_num_highway_layers=0,
    char_cnn_output_dim=-1,
    use_pretrained_weights=False,
    finetune_pretrained_weights=False,
    weights_file=None,
):
    super().__init__(dictionary)
    convolutions_params = literal_eval(char_cnn_params)
    self.char_cnn_encoder = char_encoder.CharCNNModel(
        dictionary,
        num_chars,
        embed_dim,
        convolutions_params,
        char_cnn_nonlinear_fn,
        char_cnn_pool_type,
        char_cnn_num_highway_layers,
        char_cnn_output_dim,
        use_pretrained_weights,
        finetune_pretrained_weights,
        weights_file,
    )
    self.embed_tokens = embed_tokens
    token_embed_dim = embed_tokens.embedding_dim
    self.word_layer_norm = nn.LayerNorm(token_embed_dim)

    char_embed_dim = (
        char_cnn_output_dim
        if char_cnn_output_dim != -1
        else sum(out_dim for (out_dim, _) in convolutions_params)
    )
    self.char_layer_norm = nn.LayerNorm(char_embed_dim)
    self.word_dim = char_embed_dim + token_embed_dim
    self.char_scale = math.sqrt(char_embed_dim / self.word_dim)
    self.word_scale = math.sqrt(token_embed_dim / self.word_dim)
    if self.word_dim != args.encoder_embed_dim:
        self.word_to_transformer_embed = fairseq_transformer.Linear(
            self.word_dim, args.encoder_embed_dim
        )

    self.dropout = args.dropout
    self.padding_idx = dictionary.pad()
    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024,
        args.encoder_embed_dim,
        self.padding_idx,
        learned=args.encoder_learned_pos,
    )

    self.transformer_encoder_given_embeddings = TransformerEncoderGivenEmbeddings(
        args=args, proj_to_decoder=True
    )

    # Variable tracker
    self.tracker = VariableTracker()

    # Initialize adversarial mode
    self.set_gradient_tracking_mode(False)
    self.set_embed_noising_mode(False)

    # disables sorting and word-length thresholding if True
    # (enables ONNX tracing of length-sorted input with batch_size = 1)
    self.onnx_export_model = False
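# A worked example of the char/word scaling above, using the default
# char_cnn_params "[(128, 3), (128, 5)]" and an assumed token embedding
# dimension of 256 (illustrative values, not the model's actual args).
import math

char_embed_dim = 128 + 128                           # sum of CNN output channels
token_embed_dim = 256                                # assumed embed_tokens.embedding_dim
word_dim = char_embed_dim + token_embed_dim          # 512
char_scale = math.sqrt(char_embed_dim / word_dim)    # ~0.707
word_scale = math.sqrt(token_embed_dim / word_dim)   # ~0.707
# The squared scales always sum to 1, so each half of the concatenated word
# representation is weighted by its share of word_dim.
assert abs(char_scale ** 2 + word_scale ** 2 - 1.0) < 1e-9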
def __init__(self, args, src_dict, dst_dict, embed_tokens):
    super().__init__(dst_dict)
    self.dropout = args.dropout

    self.decoder_layerdrop = 0
    if hasattr(args, "decoder_layerdrop") and args.decoder_layerdrop > 0:
        self.decoder_layerdrop = args.decoder_layerdrop

    self.share_input_output_embed = args.share_decoder_input_output_embed

    embed_dim = embed_tokens.embedding_dim
    padding_idx = embed_tokens.padding_idx

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024, embed_dim, padding_idx, learned=args.decoder_learned_pos)

    self.aan = args.aan
    decoder_layer_class = (
        AANDecoderLayer if self.aan else fairseq_transformer.TransformerDecoderLayer)
    self.layers = nn.ModuleList([])
    self.layers.extend(
        [decoder_layer_class(args) for i in range(args.decoder_layers)])

    if hasattr(args, "decoder_layers_to_keep") and args.decoder_layers_to_keep:
        # Map each original layer id to its index in the pruned stack.
        layers_to_keep = sorted(
            int(x) for x in args.decoder_layers_to_keep.split(","))
        self.decoder_layers_to_keep = {
            layer_id: layer_idx for layer_idx, layer_id in enumerate(layers_to_keep)
        }

    self.adaptive_softmax = None
    self.bottleneck_layer = None
    out_embed_dim = embed_dim
    if args.decoder_out_embed_dim is not None:
        assert (
            not args.share_all_embeddings
            and not args.share_decoder_input_output_embed
        ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
        self.bottleneck_layer = fairseq_transformer.Linear(
            embed_dim, args.decoder_out_embed_dim)
        out_embed_dim = args.decoder_out_embed_dim

    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dst_dict),
            out_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.dropout,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dst_dict), out_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5)

    self.vocab_reduction_module = None
    if args.vocab_reduction_params:
        assert (
            self.adaptive_softmax is None
        ), "vocabulary reduction not compatible with adaptive softmax!"
        self.vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16)

    self.onnx_trace = False

    # Use quantizable nn.Linear for output projection instead of F.linear.
    # bias=False keeps it equivalent to the weight-only F.linear projection.
    self.output_projection = None
    if self.vocab_reduction_module is None:
        if self.share_input_output_embed:
            self.output_projection = nn.Linear(
                self.embed_tokens.weight.shape[1],
                self.embed_tokens.weight.shape[0],
                bias=False,
            )
            self.output_projection.weight = self.embed_tokens.weight
        else:
            self.output_projection = nn.Linear(
                self.embed_out.shape[1], self.embed_out.shape[0], bias=False)
            self.output_projection.weight = self.embed_out
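# A minimal sketch (toy sizes, not the original model's) showing why a
# quantizable nn.Linear with a tied weight and no bias produces the same
# logits as the weight-only F.linear projection it replaces.
import torch
import torch.nn as nn
import torch.nn.functional as F

vocab_size, embed_dim = 100, 16
embed_tokens = nn.Embedding(vocab_size, embed_dim)
output_projection = nn.Linear(embed_dim, vocab_size, bias=False)
output_projection.weight = embed_tokens.weight       # tie the parameters

features = torch.randn(2, 5, embed_dim)              # (batch, time, embed_dim)
assert torch.allclose(output_projection(features),
                      F.linear(features, embed_tokens.weight))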