Example 1
    def __init__(self, args, src_dict, dst_dict, embed_tokens, left_pad=False):
        super().__init__(dst_dict)
        self.dropout = args.dropout

        embed_dim = embed_tokens.embedding_dim
        self.embed_tokens = embed_tokens

        self.lstm_units = args.decoder_lstm_units
        self.attention_dim = args.encoder_embed_dim
        self.num_layers = args.decoder_layers

        self.initial_rnn_layer = nn.LSTM(input_size=embed_dim,
                                         hidden_size=self.lstm_units)

        self.proj_layer = None
        if self.lstm_units != self.attention_dim:
            self.proj_layer = fairseq_transformer.Linear(
                self.lstm_units, self.attention_dim)

        self.attention = fairseq_transformer.MultiheadAttention(
            self.attention_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
        )

        self.extra_rnn_layers = nn.ModuleList([])
        for _ in range(self.num_layers - 1):
            self.extra_rnn_layers.append(
                nn.LSTM(
                    input_size=self.lstm_units + self.attention_dim,
                    hidden_size=self.lstm_units,
                ))

        out_embed_dim = args.decoder_out_embed_dim
        self.bottleneck_layer = fairseq_transformer.Linear(
            self.attention_dim + self.lstm_units, out_embed_dim)

        self.embed_out = nn.Parameter(
            torch.Tensor(len(dst_dict), out_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5)

        self.vocab_reduction_module = None
        if args.vocab_reduction_params:
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict,
                dst_dict,
                args.vocab_reduction_params,
                fp16=args.fp16)

        self.onnx_trace = False
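A minimal standalone sketch (sizes invented) of how an output-embedding parameter like embed_out above is typically turned into vocabulary logits with F.linear:

import torch
import torch.nn.functional as F

vocab_size, out_embed_dim = 1000, 256                 # illustrative sizes
embed_out = torch.empty(vocab_size, out_embed_dim)
torch.nn.init.normal_(embed_out, mean=0, std=out_embed_dim ** -0.5)

features = torch.randn(7, 2, out_embed_dim)           # (seq_len, batch, out_embed_dim)
logits = F.linear(features, embed_out)                # (seq_len, batch, vocab_size)

Keeping the projection as a bare parameter rather than an nn.Linear makes it easy to gather a reduced set of rows first, which is presumably why the vocab-reduction variants in these examples do it this way.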
Example 2
    def _init_components(self, args, src_dict, dst_dict, embed_tokens):
        self.initial_rnn_layer = nn.LSTM(
            input_size=self.initial_input_dim, hidden_size=self.lstm_units
        )

        self.proj_encoder_layer = None
        if self.attention_dim != self.encoder_output_dim:
            self.proj_encoder_layer = fairseq_transformer.Linear(
                self.encoder_output_dim, self.attention_dim
            )

        self.proj_layer = None
        if self.lstm_units != self.attention_dim:
            self.proj_layer = fairseq_transformer.Linear(
                self.lstm_units, self.attention_dim
            )

        self.attention = MultiheadAttention(
            self.attention_dim,
            self.num_attention_heads,
            dropout=args.attention_dropout,
            encoder_decoder_attention=True,
        )

        self.extra_rnn_layers = nn.ModuleList([])
        for _ in range(self.num_layers - 1):
            self.extra_rnn_layers.append(
                nn.LSTM(input_size=self.input_dim, hidden_size=self.lstm_units)
            )

        self.bottleneck_layer = None
        if self.bottleneck_dim is not None:
            self.out_embed_dim = self.bottleneck_dim
            self.bottleneck_layer = fairseq_transformer.Linear(
                self.input_dim, self.out_embed_dim
            )
        else:
            self.out_embed_dim = self.input_dim

        self.embed_out = nn.Parameter(torch.Tensor(len(dst_dict), self.out_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=self.out_embed_dim ** -0.5)

        self.vocab_reduction_module = None
        if args.vocab_reduction_params:
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16
            )

        self.onnx_trace = False
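The two optional projections above exist only to reconcile dimensions. A self-contained sketch, with deliberately mismatched invented sizes, of the role proj_layer plays between the LSTM and the attention module:

import torch
import torch.nn as nn

input_dim, lstm_units, attention_dim = 128, 512, 256   # illustrative, deliberately mismatched
rnn = nn.LSTM(input_size=input_dim, hidden_size=lstm_units)
proj = nn.Linear(lstm_units, attention_dim)            # stands in for proj_layer

x = torch.randn(7, 2, input_dim)                       # (seq_len, batch, input_dim)
out, _ = rnn(x)                                        # (seq_len, batch, lstm_units)
query = proj(out)                                      # (seq_len, batch, attention_dim), ready for attention

proj_encoder_layer does the same job on the encoder side, mapping encoder_output_dim to attention_dim.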
Example 3
    def __init__(self, args, dictionary, embed_tokens, left_pad=True):
        super().__init__(dictionary)
        self.dropout = args.dropout

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024,
            embed_dim,
            self.padding_idx,
            left_pad=left_pad,
            learned=args.encoder_learned_pos,
        )
        self.all_layer_position_embed = args.all_layer_position_embed

        self.layers = nn.ModuleList([])
        self.layers.extend([
            fairseq_transformer.TransformerEncoderLayer(args)
            for i in range(args.encoder_layers)
        ])

        self.output_fc = None
        if args.encoder_embed_dim != args.decoder_embed_dim:
            self.output_fc = fairseq_transformer.Linear(
                embed_dim, args.decoder_embed_dim)

        # Variable tracker
        self.tracker = VariableTracker()

        # Initialize adversarial mode
        self.set_gradient_tracking_mode(False)
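A rough, self-contained sketch of how these pieces are usually combined at the start of a fairseq-style encoder forward pass; the fairseq PositionalEmbedding is replaced here by a plain nn.Embedding and all sizes are invented:

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

vocab, embed_dim, max_pos, pad = 1000, 512, 1024, 1
embed_tokens = nn.Embedding(vocab, embed_dim, padding_idx=pad)
embed_positions = nn.Embedding(max_pos, embed_dim)      # stand-in for PositionalEmbedding

src_tokens = torch.randint(2, vocab, (2, 7))            # (batch, seq_len)
positions = torch.arange(src_tokens.size(1)).unsqueeze(0).expand_as(src_tokens)

x = math.sqrt(embed_dim) * embed_tokens(src_tokens)     # embed_scale * token embeddings
x = x + embed_positions(positions)
x = F.dropout(x, p=0.1, training=True)
x = x.transpose(0, 1)                                   # (seq_len, batch, embed_dim) for the layer stack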
Example 4
    def __init__(self, args):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.cross_self_attention = getattr(args, "cross_self_attention",
                                            False)

        self.avg_attn = AverageAttention(self.embed_dim,
                                         dropout=args.attention_dropout)

        # differently from the original paper, we use a single gate
        self.aan_gating_fc = fairseq_transformer.Linear(
            self.embed_dim * 2, self.embed_dim)

        self.dropout = args.dropout
        self.activation_fn = utils.get_activation_fn(
            activation=getattr(args, "activation_fn", "relu"))
        self.activation_dropout = getattr(args, "activation_dropout", 0)
        if self.activation_dropout == 0:
            # for backwards compatibility with models that use args.relu_dropout
            self.activation_dropout = getattr(args, "relu_dropout", 0)
        self.normalize_before = args.decoder_normalize_before

        # use LayerNorm rather than FusedLayerNorm for exporting.
        # char_inputs can be used to determine this.
        # TODO: remove this once we update apex with the fix
        export = getattr(args, "char_inputs", False)
        self.avg_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

        self.encoder_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            kdim=getattr(args, "encoder_embed_dim", None),
            vdim=getattr(args, "encoder_embed_dim", None),
            dropout=args.attention_dropout,
            encoder_decoder_attention=True,
        )
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

        self.fc1 = fairseq_transformer.Linear(self.embed_dim,
                                              args.decoder_ffn_embed_dim)
        self.fc2 = fairseq_transformer.Linear(args.decoder_ffn_embed_dim,
                                              self.embed_dim)

        self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
        self.need_attn = True

        self.onnx_trace = False
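The AverageAttention above comes from the Average Attention Network of Zhang et al. (2018). Its central idea, sketched here in isolation (the bare concept only, not the module's actual forward), is that each target position attends to all previous positions with uniform weights, i.e. a cumulative mean:

import torch

x = torch.randn(7, 2, 512)                              # (seq_len, batch, embed_dim)
steps = torch.arange(1, x.size(0) + 1, dtype=x.dtype).view(-1, 1, 1)
avg = x.cumsum(dim=0) / steps                           # position t sees the mean of positions 1..t

The aan_gating_fc then mixes this average with the layer input through a learned gate; as the comment above notes, this variant uses a single gate.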
Example 5
    def __init__(self, args, src_dict, dst_dict, embed_tokens):
        super().__init__(dst_dict)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        embed_dim = embed_tokens.embedding_dim
        padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024, embed_dim, padding_idx, learned=args.decoder_learned_pos)

        self.layers = nn.ModuleList([])
        self.layers.extend([
            fairseq_transformer.TransformerDecoderLayer(args)
            for i in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.bottleneck_layer = None
        out_embed_dim = embed_dim
        if args.decoder_out_embed_dim is not None:
            assert (
                not args.share_all_embeddings
                and not args.share_decoder_input_output_embed
            ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
            self.bottleneck_layer = fairseq_transformer.Linear(
                embed_dim, args.decoder_out_embed_dim)
            out_embed_dim = args.decoder_out_embed_dim

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dst_dict),
                out_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.dropout,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dst_dict), out_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5)

        self.vocab_reduction_module = None
        if args.vocab_reduction_params:
            assert (
                self.adaptive_softmax is None
            ), "vocabulary reduction not compatible with adaptive softmax!"
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict,
                dst_dict,
                args.vocab_reduction_params,
                fp16=args.fp16)

        self.onnx_trace = False
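For the AdaptiveSoftmax branch, PyTorch ships a built-in analogue, nn.AdaptiveLogSoftmaxWithLoss, which illustrates the same idea (a cheaper output layer for large vocabularies that clusters classes by frequency). This sketch uses invented sizes and is not fairseq's AdaptiveSoftmax:

import torch
import torch.nn as nn

vocab, out_embed_dim = 1000, 256
adaptive = nn.AdaptiveLogSoftmaxWithLoss(
    in_features=out_embed_dim,
    n_classes=vocab,
    cutoffs=[100, 500],                     # frequency-ordered class clusters, illustrative
)
features = torch.randn(14, out_embed_dim)   # flattened (seq_len * batch, out_embed_dim)
targets = torch.randint(0, vocab, (14,))
result = adaptive(features, targets)        # result.output: target log-probs, result.loss: mean NLL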
Example 6
    def _build_layer(self):
        self.src_len_norm = getattr(self.args, 'src_len_norm', 'sqrt')
        self.dwstack_proj_act = getattr(self.args, 'dwstack_proj_act', 'none')

        self.head_dim = self.embed_dim // self.num_heads
        assert self.head_dim * self.num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.in_proj_weight = Parameter(
            torch.Tensor(3 * self.embed_dim, self.embed_dim))
        if self.bias:
            self.in_proj_bias = Parameter(torch.Tensor(3 * self.embed_dim))
        else:
            self.register_parameter('in_proj_bias', None)
        self.out_proj = nn.Linear(self.embed_dim,
                                  self.embed_dim,
                                  bias=self.bias)

        if self.add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, self.embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, self.embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = self.add_zero_attn

        self.nstack_linear_layer = NstackLinear(
            self.head_dim, self.head_dim,
            False) if self.nstack_linear else None

        self.dwstack_linear = transformer.Linear(self.embed_dim,
                                                 self.num_heads)
        self.project_dwstack_key = lambda x: self.dwstack_linear(x)
        if self.dwstack_proj_act == 'sigmoid':
            self.project_dwstack_key = lambda x: self.dwstack_linear(
                x).sigmoid()
        elif self.dwstack_proj_act == 'tanh':
            self.project_dwstack_key = lambda x: self.dwstack_linear(x).tanh()

        self.hier_embed_positions = self.get_hier_embed_positions()

        self.embed_positions = PositionalEmbedding(
            self.args.max_source_positions,
            self.head_dim,
            self.padding_idx,
            left_pad=False,
            learned=self.nstack_pos_embed_learned,
        ) if self.nstack_pos_embed else None

        assert not (self.hier_embed_positions is not None
                    and self.embed_positions is not None)

        self.reset_parameters()

        self.onnx_trace = False
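The packed in_proj_weight of shape (3 * embed_dim, embed_dim) is the conventional way to fuse the query/key/value projections of multi-head attention into a single matrix multiply. A standalone sketch of that convention (not this module's exact forward):

import torch
import torch.nn.functional as F

embed_dim, num_heads = 512, 8
head_dim = embed_dim // num_heads
scaling = head_dim ** -0.5

in_proj_weight = torch.randn(3 * embed_dim, embed_dim)
in_proj_bias = torch.zeros(3 * embed_dim)

x = torch.randn(7, 2, embed_dim)                        # (seq_len, batch, embed_dim)
q, k, v = F.linear(x, in_proj_weight, in_proj_bias).chunk(3, dim=-1)
q = q * scaling                                         # queries pre-scaled by 1/sqrt(head_dim)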
Example 7
    def __init__(self, args, proj_to_decoder):
        super().__init__()

        self.layers = nn.ModuleList([])
        self.layers.extend([
            fairseq_transformer.TransformerEncoderLayer(args)
            for i in range(args.encoder_layers)
        ])

        self.output_fc = None
        if args.encoder_embed_dim != args.decoder_embed_dim and proj_to_decoder:
            self.output_fc = fairseq_transformer.Linear(
                args.encoder_embed_dim, args.decoder_embed_dim)
Example 8
    def __init__(self,
                 args,
                 num_embeddings,
                 embedding_dim,
                 padding_idx,
                 dictionary,
                 pretrain_path,
                 freeze=True):
        super().__init__()

        self.args = args
        # self.pretrain_dim = getattr(args, 'pretrain_dim', 300)
        self.tune_epoch = getattr(args, 'tune_epoch', 10000000)
        self.bert_name = getattr(args, 'bert_name', 'bert-base-uncased')
        self.bert_layer = getattr(args, 'bert_layer', 11)
        self.freeze = freeze

        self.current_epoch = 0
        self.finetuning = False
        self.flip_switch = True

        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.num_embeddings = num_embeddings
        self.dictionary = dictionary
        self.pretrain_path = pretrain_path

        self.unknown_idx = None
        self.mask_factor = None
        """
        dictionary: {word: idx}
        bert:       {word: idx}
        """

        self.pretrain_dim = self.get_pretrain_dim()
        self.index_remapping, self.bert_model = self.build_bert_dict_remapping()

        if self.embedding_dim != self.pretrain_dim:
            self.reproj = transformer.Linear(self.pretrain_dim,
                                             self.embedding_dim)
        else:
            self.reproj = lambda x: x

        self.embedding = Embedding(num_embeddings, self.embedding_dim,
                                   padding_idx)
        self.weight = self.embedding.weight
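A small sketch of the reproj pattern above: a learned projection when the pretrained (e.g. BERT) hidden size differs from the model's embedding size, otherwise an identity; the sizes are assumptions:

import torch
import torch.nn as nn

pretrain_dim, embedding_dim = 768, 512                  # e.g. BERT hidden size vs. model embed size
reproj = (nn.Linear(pretrain_dim, embedding_dim)
          if embedding_dim != pretrain_dim else (lambda x: x))

bert_features = torch.randn(2, 7, pretrain_dim)         # stand-in for frozen BERT layer outputs
x = reproj(bert_features)                               # (2, 7, embedding_dim) in either branch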
Example 9
    def __init__(self, args, num_embeddings, embedding_dim, padding_idx,
                 dictionary, pretrain_path):
        super().__init__()
        self.args = args
        self.dropout = getattr(args, 'dropout', 0.0)
        self.pretrain_dim = getattr(args, 'pretrain_dim', 300)
        self.dropout_layer = nn.Dropout(self.dropout)
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.num_embeddings = num_embeddings
        self.dictionary = dictionary
        self.pretrain_path = pretrain_path

        self.embedding = PretrainedEmbedding(num_embeddings, self.pretrain_dim,
                                             padding_idx, dictionary,
                                             pretrain_path)
        self.linear = transformer.Linear(self.pretrain_dim,
                                         embedding_dim,
                                         bias=False)
        self.layer = nn.Sequential(self.embedding, self.dropout_layer,
                                   self.linear)
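Because nn.Sequential applies its modules in order, self.layer above behaves as embed -> dropout -> project. A self-contained equivalent with PretrainedEmbedding swapped for a plain nn.Embedding and invented sizes:

import torch
import torch.nn as nn

vocab, pretrain_dim, embedding_dim = 1000, 300, 512
layer = nn.Sequential(
    nn.Embedding(vocab, pretrain_dim),                  # stands in for PretrainedEmbedding
    nn.Dropout(0.1),
    nn.Linear(pretrain_dim, embedding_dim, bias=False),
)
tokens = torch.randint(0, vocab, (2, 7))
x = layer(tokens)                                       # (2, 7, embedding_dim)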
Example 10
    def __init__(
        self,
        args,
        dictionary,
        embed_tokens,
        num_chars=50,
        embed_dim=32,
        char_cnn_params="[(128, 3), (128, 5)]",
        char_cnn_nonlinear_fn="tanh",
        char_cnn_pool_type="max",
        char_cnn_num_highway_layers=0,
        char_cnn_output_dim=-1,
        use_pretrained_weights=False,
        finetune_pretrained_weights=False,
        weights_file=None,
    ):
        super().__init__(dictionary)

        convolutions_params = literal_eval(char_cnn_params)
        self.char_cnn_encoder = char_encoder.CharCNNModel(
            dictionary,
            num_chars,
            embed_dim,
            convolutions_params,
            char_cnn_nonlinear_fn,
            char_cnn_pool_type,
            char_cnn_num_highway_layers,
            char_cnn_output_dim,
            use_pretrained_weights,
            finetune_pretrained_weights,
            weights_file,
        )

        self.embed_tokens = embed_tokens
        token_embed_dim = embed_tokens.embedding_dim
        self.word_layer_norm = nn.LayerNorm(token_embed_dim)

        char_embed_dim = (
            char_cnn_output_dim
            if char_cnn_output_dim != -1
            else sum(out_dim for (out_dim, _) in convolutions_params)
        )
        self.char_layer_norm = nn.LayerNorm(char_embed_dim)
        self.word_dim = char_embed_dim + token_embed_dim
        self.char_scale = math.sqrt(char_embed_dim / self.word_dim)
        self.word_scale = math.sqrt(token_embed_dim / self.word_dim)
        if self.word_dim != args.encoder_embed_dim:
            self.word_to_transformer_embed = fairseq_transformer.Linear(
                self.word_dim, args.encoder_embed_dim
            )

        self.dropout = args.dropout

        self.padding_idx = dictionary.pad()
        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024,
            args.encoder_embed_dim,
            self.padding_idx,
            learned=args.encoder_learned_pos,
        )

        self.transformer_encoder_given_embeddings = TransformerEncoderGivenEmbeddings(
            args=args, proj_to_decoder=True
        )

        # Variable tracker
        self.tracker = VariableTracker()
        # Initialize adversarial mode
        self.set_gradient_tracking_mode(False)
        self.set_embed_noising_mode(False)

        # disables sorting and word-length thresholding if True
        # (enables ONNX tracing of length-sorted input with batch_size = 1)
        self.onnx_export_model = False
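The char_scale/word_scale factors above rescale the character-CNN and token components relative to the size of the concatenated vector. One plausible way they are applied (a guess consistent with the definitions above, not necessarily this encoder's actual forward):

import math
import torch

char_embed_dim, token_embed_dim = 256, 512              # invented sizes
word_dim = char_embed_dim + token_embed_dim
char_scale = math.sqrt(char_embed_dim / word_dim)
word_scale = math.sqrt(token_embed_dim / word_dim)

char_emb = torch.randn(2, 7, char_embed_dim)            # layer-normed char-CNN features (illustrative)
tok_emb = torch.randn(2, 7, token_embed_dim)            # layer-normed token embeddings (illustrative)
combined = torch.cat([char_scale * char_emb, word_scale * tok_emb], dim=-1)   # (2, 7, word_dim)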
Example 11
    def __init__(self, args, src_dict, dst_dict, embed_tokens):
        super().__init__(dst_dict)
        self.dropout = args.dropout
        self.decoder_layerdrop = 0
        if hasattr(args, "decoder_layerdrop") and args.decoder_layerdrop > 0:
            self.decoder_layerdrop = args.decoder_layerdrop

        self.share_input_output_embed = args.share_decoder_input_output_embed

        embed_dim = embed_tokens.embedding_dim
        padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024, embed_dim, padding_idx, learned=args.decoder_learned_pos)

        self.aan = args.aan
        decoder_layer_class = (AANDecoderLayer if self.aan else
                               fairseq_transformer.TransformerDecoderLayer)

        self.layers = nn.ModuleList([])
        self.layers.extend(
            [decoder_layer_class(args) for i in range(args.decoder_layers)])
        if hasattr(args,
                   "decoder_layers_to_keep") and args.decoder_layers_to_keep:
            layers_to_keep = sorted(
                int(x) for x in args.decoder_layers_to_keep.split(","))
            self.decoder_layers_to_keep = {
                layer_id: layer_idx
                for layer_idx, layer_id in enumerate(layers_to_keep)
            }

        self.adaptive_softmax = None

        self.bottleneck_layer = None
        out_embed_dim = embed_dim
        if args.decoder_out_embed_dim is not None:
            assert (
                not args.share_all_embeddings
                and not args.share_decoder_input_output_embed
            ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
            self.bottleneck_layer = fairseq_transformer.Linear(
                embed_dim, args.decoder_out_embed_dim)
            out_embed_dim = args.decoder_out_embed_dim

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dst_dict),
                out_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.dropout,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dst_dict), out_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5)

        self.vocab_reduction_module = None
        if args.vocab_reduction_params:
            assert (
                self.adaptive_softmax is None
            ), "vocabulary reduction not compatible with adaptive softmax!"
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict,
                dst_dict,
                args.vocab_reduction_params,
                fp16=args.fp16)

        self.onnx_trace = False

        # Use quantizable nn.Linear for output projection instead of F.linear
        self.output_projection = None
        if self.vocab_reduction_module is None:
            if self.share_input_output_embed:
                self.output_projection = nn.Linear(
                    self.embed_tokens.weight.shape[1],
                    self.embed_tokens.weight.shape[0])
                self.output_projection.weight = self.embed_tokens.weight
            else:
                self.output_projection = nn.Linear(self.embed_out.shape[1],
                                                   self.embed_out.shape[0])
                self.output_projection.weight = self.embed_out
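To illustrate the comment about preferring a quantizable nn.Linear over a bare F.linear call, here is the weight-tying move in isolation (sizes invented); assigning the embedding's Parameter to the linear layer's weight makes the two modules share one tensor:

import torch
import torch.nn as nn

vocab, embed_dim = 1000, 512
embed_tokens = nn.Embedding(vocab, embed_dim)

output_projection = nn.Linear(embed_dim, vocab)         # same shapes as the shared branch above
output_projection.weight = embed_tokens.weight          # tie: one Parameter, two views of it

features = torch.randn(7, 2, embed_dim)
logits = output_projection(features)                    # (7, 2, vocab)

Because the projection is a module that owns its weight, post-training dynamic quantization (e.g. torch.quantization.quantize_dynamic) can locate and replace it, which a functional F.linear call would not allow; that is presumably the motivation for the comment above.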