Example no. 1
0
    def __init__(self, args):
        super().__init__()
        self.embedding_dim = args.decoder_embed_dim
        self.self_attn1 = MultiheadAttention(
            self.embedding_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
        )
        self.self_attn2 = MultiheadAttention(
            self.embedding_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
        )
        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.normalize_before = args.decoder_normalize_before

        self.self_attn_layer_norm_1 = LayerNorm(self.embedding_dim)
        self.self_attn_layer_norm_2 = LayerNorm(self.embedding_dim)
        self.self_attn_layer_norm = LayerNorm(self.embedding_dim * 2)

        self.fc1 = Linear(self.embedding_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embedding_dim)

        self.final_layer_norm = LayerNorm(self.embedding_dim)

        self.need_attn = True

        self.onnx_trace = False
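All of these constructors read their hyperparameters from an args namespace. As a minimal sketch (not taken from the source; the field names are inferred from the attribute accesses in Example no. 1 and the values are illustrative defaults), such a namespace can be built by hand for quick instantiation or unit tests:

# Hypothetical args namespace for the layer in Example no. 1; field names
# inferred from the attributes it reads, values are illustrative only.
from argparse import Namespace

args = Namespace(
    decoder_embed_dim=512,
    decoder_attention_heads=8,
    attention_dropout=0.1,
    dropout=0.1,
    relu_dropout=0.0,
    decoder_normalize_before=False,
    decoder_ffn_embed_dim=2048,
)
# layer = DecoderLayerWithTwoSelfAttns(args)  # hypothetical class name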
Example no. 2
0
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        # Lite
        lite_args = deepcopy(args)
        lite_args.decoder_layers = 1
        super().__init__(
            lite_args,
            dictionary,
            embed_tokens,
            no_encoder_attn,
        )
        # Always do encoder attention in NAT
        self.bottom_nat = NATransformerDecoder(
            args,
            dictionary,
            embed_tokens,
            no_encoder_attn=False,
        )
        self.bos = dictionary.bos()
        self.unk = dictionary.unk()
        self.eos = dictionary.eos()
        self.pad = dictionary.pad()
        if self.args.project_nat:
            self.project_nat = Linear(self.output_embed_dim,
                                      self.output_embed_dim,
                                      bias=True)
        if self.args.project_at:
            self.project_at = Linear(self.output_embed_dim,
                                     self.output_embed_dim,
                                     bias=True)
    def __init__(self, args):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.self_attn = BidirectionalMultiheadSelfAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
            mask_curr_state=not args.unmask_curr_state,
        )
        self.dropout = args.dropout
        self.activation_fn = utils.get_activation_fn(
            activation=getattr(args, 'activation_fn', 'relu'))
        self.activation_dropout = getattr(args, 'activation_dropout', 0)
        if self.activation_dropout == 0:
            # for backwards compatibility with models that use args.relu_dropout
            self.activation_dropout = getattr(args, 'relu_dropout', 0)
        self.normalize_before = args.decoder_normalize_before

        self.fwd_layer_norm = LayerNorm(self.embed_dim,
                                        export=args.char_inputs)
        self.bwd_layer_norm = LayerNorm(self.embed_dim,
                                        export=args.char_inputs)

        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)

        self.final_layer_norm = LayerNorm(self.embed_dim,
                                          export=args.char_inputs)
    def build_model(cls, args, task):
        mode = {
            e.split('=')[0]: e.split('=')[1] if len(e.split('=')) > 1 else None
            for e in args.user_mode.split(',')
        }
        if 'gated' in mode:
            tmodel = GatedTransformerModel.build_model(args, task)
        elif any([m in mode for m in ['decomposable', 'sep_lm', 'sep_lm1']]):
            tmodel = DecomposableTransformerModel.build_model(args, task)
        elif any([m in mode for m in ['attn_endorse', 'dbg_log_endorsement']]):
            tmodel = SimpleTransformerModel.build_model(
                args, task, DecoderModelLayer=UserTransformerDecoderLayer)
        else:
            tmodel = SimpleTransformerModel.build_model(args, task)

        model = DistantTransformerModel(tmodel)
        model.args = args
        model.user_mode = mode
        model.sampler_grad = SequenceGeneratorGrad(
            model.model.decoder.dictionary, beam_size=1, max_len_b=60)
        model.sampler = SequenceGenerator(model.model.decoder.dictionary,
                                          beam_size=1,
                                          max_len_b=60)
        model.decoder = ProxyDecoder(tmodel, model.user_mode, args, task,
                                     model.sampler_grad, model.sampler)
        model.encoder = ProxyEncoder(tmodel, model.user_mode, args, task,
                                     model.sampler_grad, model.sampler)
        tmodel.encoder.user_mode = mode
        tmodel.decoder.user_mode = mode
        if any([
                m in mode for m in [
                    'diff_lm', 'pretrain_lm', 'sep_lm', 'max_lm_margin',
                    'sep_lm2', 'sep_lm3'
                ]
        ]):
            model.lm = TransformerDecoder(args,
                                          tmodel.decoder.dictionary,
                                          tmodel.decoder.embed_tokens,
                                          no_encoder_attn=True)
            model.decoder.lm = model.lm
        if 'sep_lm3' in mode:
            tmodel.decoder.gate_fc1 = Linear(
                len(tmodel.decoder.dictionary) * 2,
                len(tmodel.decoder.dictionary))
            tmodel.decoder.gate_fc2 = Linear(len(tmodel.decoder.dictionary), 1)
        if any([m in mode for m in ['endorsement', 'rl_edm', 'beam_endorse']]):
            model.edm = EndorsementDetectorModel.build_model(args, task)
            model.decoder.edm = model.encoder.edm = model.edm
            model.encoder.edm.decoder.user_mode = model.encoder.edm.encoder.user_mode = mode
            if any([m in mode for m in ['self_align']]):
                model.self_edm = EndorsementDetectorModel.build_model(
                    args, task)
                model.decoder.self_edm = model.encoder.self_edm = model.self_edm
                model.encoder.self_edm.decoder.user_mode = model.encoder.self_edm.encoder.user_mode = mode

        return model
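For reference, the mode dictionary built at the top of build_model comes from a comma-separated user_mode string: keys without "=" map to None, keys with "=" keep the value as a string. A small stand-alone illustration of the same dict comprehension (the example string here is invented):

# Stand-alone illustration of the user_mode parsing in build_model above;
# the example string is made up, only the parsing logic is mirrored.
user_mode = "gated,beam=4,dbg_log_endorsement"
mode = {
    e.split('=')[0]: e.split('=')[1] if len(e.split('=')) > 1 else None
    for e in user_mode.split(',')
}
assert mode == {"gated": None, "beam": "4", "dbg_log_endorsement": None}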
    def __init__(self, model, user_mode, args, task, sampler_grad, sampler):
        super(ProxyDecoder, self).__init__(model, user_mode, args, task)
        self.sampler_grad = sampler_grad
        self.sampler = sampler
        if self.has_mode('sep_lm2'):
            self.gate_fc1 = Linear(
                len(self.model.decoder.dictionary) * 2,
                len(self.model.decoder.dictionary))
            self.gate_fc2 = Linear(len(self.model.decoder.dictionary),
                                   len(self.model.decoder.dictionary))
Example no. 6
0
    def __init__(self, args, domain_adv):
        super().__init__()

        self.embed_dim = args.encoder_embed_dim
        self.domain_adv = domain_adv
        self.label = dict()
        for i, domain in enumerate(args.domains):
            self.label[domain] = i
        self.fc1 = Linear(self.embed_dim, self.embed_dim, bias=False)
        self.fc2 = Linear(self.embed_dim, 1, bias=False)
        self.fc3 = Linear(self.embed_dim, len(args.domains), bias=False)
    def __init__(self,
                 args,
                 no_encoder_attn=False,
                 add_bias_kv=False,
                 add_zero_attn=False):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.cross_self_attention = getattr(args, "cross_self_attention",
                                            False)
        self.self_attn = MultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=args.decoder_attention_heads,
            dropout=args.attention_dropout,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            self_attention=not self.cross_self_attention,
        )
        self.dropout = args.dropout
        self.activation_fn = utils.get_activation_fn(
            activation=getattr(args, "activation_fn", "relu"))
        self.activation_dropout = getattr(args, "activation_dropout", 0)
        if self.activation_dropout == 0:
            # for backwards compatibility with models that use args.relu_dropout
            self.activation_dropout = getattr(args, "relu_dropout", 0)
        self.normalize_before = args.decoder_normalize_before

        # use LayerNorm rather than FusedLayerNorm for exporting.
        # char_inputs can be used to determine this.
        # TODO: remove this once we update apex with the fix
        export = getattr(args, "char_inputs", False)
        self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = MultiheadAttention(
                self.embed_dim,
                args.decoder_attention_heads,
                kdim=getattr(args, "encoder_embed_dim", None),
                vdim=getattr(args, "encoder_embed_dim", None),
                dropout=args.attention_dropout,
                encoder_decoder_attention=True,
            )
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim,
                                                     export=export)

        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)

        self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
        self.need_attn = True

        self.onnx_trace = False
Example no. 8
0
    def __init__(self, args):
        super().__init__()
        self.embed_dim = args.encoder_embed_dim
        self.self_attn = MultiheadAttention(
            self.embed_dim,
            args.encoder_attention_heads,
            dropout=args.attention_dropout,
        )
        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.normalize_before = args.encoder_normalize_before
        self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
        self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)

        self.layer_norms = nn.ModuleList(
            [LayerNorm(self.embed_dim) for i in range(2)])
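The normalize_before flag and the pair of LayerNorms above follow the usual fairseq pre-/post-norm pattern. A minimal sketch of the helper that conventionally accompanies them (an assumption here, since the forward pass of this layer is not shown):

# Sketch of the conventional maybe_layer_norm helper paired with
# normalize_before; not copied from the source (the forward is not shown).
def maybe_layer_norm(layer_norm, x, before=False, after=False,
                     normalize_before=False):
    assert before ^ after, "call with exactly one of before/after"
    if after ^ normalize_before:
        return layer_norm(x)  # pre-norm before attention/FFN, or post-norm after
    return x                  # otherwise leave the input untouched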
Example no. 9
0
    def __init__(
        self,
        args,
        conv_layers_before=None,
        input_size=83,
        transformer_context=None,
        num_targets=None,
        chunk_width=None,
        chunk_left_context=0,
        training_stage=True,
    ):
        super().__init__(
            args,
            conv_layers_before=conv_layers_before,
            input_size=input_size,
            transformer_context=transformer_context,
        )
        receptive_field_radius = sum(conv.padding[0] for conv in conv_layers_before.convolutions) \
            if conv_layers_before is not None else 0
        assert chunk_width is None or chunk_width > 0
        assert (conv_layers_before is None and chunk_left_context >= 0) or \
            (conv_layers_before is not None and chunk_left_context >= receptive_field_radius)
        self.out_chunk_begin = self.output_lengths(chunk_left_context + 1) - 1
        self.out_chunk_end = self.output_lengths(chunk_left_context + chunk_width) \
            if chunk_width is not None else None
        self.training_stage = training_stage

        # only for encoder-only model
        self.fc_out = Linear(args.encoder_embed_dim, num_targets, dropout=self.dropout_module.p) \
            if num_targets is not None else None
Example no. 10
0
    def __init__(
        self,
        args,
        dictionary,
        embed_tokens,
        embed_other_list,
        no_encoder_attn,
        channel_sizes,
    ):
        super().__init__(args,
                         dictionary,
                         embed_tokens,
                         no_encoder_attn=no_encoder_attn)

        # embed each channel and project if dimensions do not match
        self.embed_other_list = torch.nn.ModuleList(embed_other_list)
        self.proj_other_list = torch.nn.ModuleList()
        dim = embed_tokens.embedding_dim
        for embed_other in embed_other_list:
            other_dim = 1 if embed_other is None else embed_other.embedding_dim
            self.proj_other_list.append(
                nn.Linear(other_dim, dim) if other_dim != dim else None)

        # transformer output to prediction
        self.channel_sizes = channel_sizes
        self.project_out_dim = Linear(embed_tokens.embedding_dim,
                                      sum(channel_sizes),
                                      bias=False)
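The single project_out_dim above emits sum(channel_sizes) features per position. A short stand-alone sketch (shapes and sizes invented; the original forward is not shown) of how such a concatenated output can be split back into per-channel predictions:

# Illustration only: splitting a concatenated multi-channel projection
# back into per-channel logits. The sizes below are made up.
import torch

channel_sizes = [100, 50, 50]
out = torch.randn(2, 7, sum(channel_sizes))            # (batch, time, channels)
per_channel = torch.split(out, channel_sizes, dim=-1)
assert [t.size(-1) for t in per_channel] == channel_sizes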
Example no. 11
0
    def __init__(self, args):
        super().__init__(args)

        self.spk_emb_proj = None
        if args.target_speaker_embed:
            self.spk_emb_proj = Linear(
                args.encoder_embed_dim + args.speaker_embed_dim,
                args.encoder_embed_dim)
Example no. 12
0
    def __init__(self,
                 args,
                 conv_layers_before=None,
                 input_size=83,
                 transformer_context=None):
        self.args = args
        super(TransformerEncoder, self).__init__(None)  # no src dictionary
        self.register_buffer("version", torch.Tensor([3]))

        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__)
        self.encoder_layerdrop = args.encoder_layerdrop

        embed_dim = args.encoder_embed_dim
        self.max_source_positions = args.max_source_positions

        self.conv_layers_before = conv_layers_before
        self.fc0 = Linear(input_size,
                          embed_dim) if input_size != embed_dim else None

        self.embed_positions = (PositionalEmbedding(
            self.output_lengths(self.max_source_positions),
            embed_dim,
            0,
            learned=args.encoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None)

        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        if not args.adaptive_input and args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        if self.encoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend([
            self.build_encoder_layer(args) for i in range(args.encoder_layers)
        ])
        self.num_layers = len(self.layers)

        if args.encoder_normalize_before:
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None

        self.transformer_context = transformer_context
Example no. 13
0
    def __init__(self, args, no_encoder_attn=False):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.self_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
        )
        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.normalize_before = args.decoder_normalize_before

        self.self_attn_layer_norm = LayerNorm(self.embed_dim)

        if no_encoder_attn:
            self.source_encoder_attn = None
            self.mask_encoder_attn = None
            self.encoder_attn_layer_norm = None
            self.concat_dense = None
        else:
            self.source_encoder_attn = MultiheadAttention(
                self.embed_dim,
                args.decoder_attention_heads,
                dropout=args.attention_dropout,
            )
            self.mask_encoder_attn = MultiheadAttention(
                self.embed_dim,
                args.decoder_attention_heads,
                dropout=args.attention_dropout,
            )
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
            self.concat_dense = Linear(2 * self.embed_dim,
                                       self.embed_dim,
                                       bias=True)

        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)

        self.final_layer_norm = LayerNorm(self.embed_dim)
        self.need_attn = True

        self.onnx_trace = False
    def __init__(self, args):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.self_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
            add_bias_kv=not args.no_bias_kv,
            add_zero_attn=args.no_bias_kv,
        )
        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.normalize_before = args.decoder_normalize_before

        self.self_attn_layer_norm = LayerNorm(self.embed_dim)

        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)

        self.final_layer_norm = LayerNorm(self.embed_dim)
    def __init__(self, args):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.self_attn = BidirectionalMultiheadSelfAttention(
            self.embed_dim,
            (args.decoder_attention_heads *
             2) if args.double_final_heads else args.decoder_attention_heads,
            dropout=args.attention_dropout,
            concat_final_q=args.concat_final_q,
        )
        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.normalize_before = args.decoder_normalize_before

        self.fwd_layer_norm = LayerNorm(self.embed_dim)
        self.bwd_layer_norm = LayerNorm(self.embed_dim)

        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)

        self.final_layer_norm = LayerNorm(self.embed_dim)
Example no. 16
0
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        # use the TransformerDecoder's __init__
        super(LevenshteinTransformerDecoder, self).__init__(
            args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn
        )

        self.dictionary = dictionary
        self.bos = dictionary.bos()
        self.unk = dictionary.unk()
        self.eos = dictionary.eos()
        self.pool_out = Linear(self.output_embed_dim * 2, self.output_embed_dim)

        self.label_tau = getattr(args, "label_tau", None)
    def __init__(self,
                 args,
                 dictionary,
                 embed_tokens,
                 char_model,
                 no_encoder_attn=False):
        super().__init__(args,
                         dictionary,
                         embed_tokens,
                         no_encoder_attn=no_encoder_attn)
        self.char_model = char_model
        self.project_in_combine_dim = Linear(args.input_dim,
                                             args.decoder_embed_dim,
                                             bias=False)
Example no. 18
0
    def __init__(self, num_embeddings, embed_dim, padding_idx, num_stacked=1):
        super().__init__(num_embeddings, embed_dim, padding_idx)
        # follow transformer.Embedding
        nn.init.normal_(self.weight, mean=0, std=embed_dim**-0.5)
        nn.init.constant_(self.weight[padding_idx], 0)

        self.offset = (
            4  # skip <bos>, <pad>, <eos>, <unk>, specific to fairseq dictionary
        )
        self.vocab_size = num_embeddings - self.offset
        self.num_stacked = num_stacked

        if self.num_stacked > 1:
            self.project_in_dim = Linear(embed_dim * num_stacked,
                                         embed_dim,
                                         bias=False)
Example no. 19
0
    def __init__(
        self,
        args,
        dictionary,
        embed_tokens,
        no_encoder_attn=False,
        output_projection=None,
    ):
        super().__init__(args, dictionary, embed_tokens, no_encoder_attn,
                         output_projection)
        self.n_frames_per_step = args.n_frames_per_step

        self.out_proj_n_frames = (Linear(
            self.output_embed_dim,
            self.output_embed_dim * self.n_frames_per_step,
            bias=False,
        ) if self.n_frames_per_step > 1 else None)
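When n_frames_per_step > 1, out_proj_n_frames widens each decoder state to n_frames_per_step frames. A shape-only sketch (an assumption; the corresponding forward is not shown and the source model may reshape differently) of how that widened output is typically unfolded back along the time axis:

# Shape-only sketch of the n_frames_per_step projection; values invented.
import torch

bsz, tgt_len, embed_dim, n_frames = 2, 5, 256, 4
x = torch.randn(bsz, tgt_len, embed_dim * n_frames)   # after out_proj_n_frames
x = x.view(bsz, tgt_len * n_frames, embed_dim)        # one frame per position
assert x.shape == (bsz, tgt_len * n_frames, embed_dim)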
Example no. 20
0
    def __init__(self, args, conv_layers_before=None, input_size=83):
        super(TransformerEncoder, self).__init__(None)  # no src dictionary
        self.register_buffer('version', torch.Tensor([3]))

        self.dropout = args.dropout

        self.conv_layers_before = conv_layers_before
        self.fc0 = Linear(input_size, args.encoder_embed_dim) \
            if input_size != args.encoder_embed_dim else None
        self.max_source_positions = args.max_source_positions

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerEncoderLayer(args) for i in range(args.encoder_layers)
        ])

        if args.encoder_normalize_before:
            self.layer_norm = LayerNorm(args.encoder_embed_dim)
        else:
            self.layer_norm = None
Example no. 21
0
    def __init__(self, args, dictionary, embed_tokens, add_topic_pre, add_topic_post):
        super().__init__(args, dictionary, embed_tokens)

        self.add_topic_pre, self.add_topic_post = add_topic_pre, add_topic_post

        with open("/cache/code_dir/ETM/checkpoint", 'rb') as f:
            sys.modules["etm"] = etm
            m = torch.load(f)
        m = m.cuda()

        self.topic_embedding = m.rho.weight

        with open('/cache/code_dir/ETM/vocab.pkl', 'rb') as f:
            self.vo = pickle.load(f)

        self.vocab = []
        with open('/cache/data_dir/dict.txt') as f:
            for row in f.readlines():
                self.vocab.append(row.split(" ")[0])

        self.t = Linear(300, 512)
    def __init__(self, args, dictionary, embed_tokens, left_pad=False):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        self.unk_idx = dictionary.unk()
        self.eos_idx = dictionary.eos()
        self.max_target_positions = args.max_target_positions
        self.output_dim = args.decoder_embed_dim

        self.self_target = args.self_target
        self.future_target = args.future_target
        self.past_target = args.past_target

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)

        self.input_dropout = torch.tensor(
            args.input_dropout) if args.input_dropout > 0 else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            self.padding_idx,
            left_pad=left_pad,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.forward_layers = nn.ModuleList([
            TransformerDecoderLayer(args) for _ in range(args.decoder_layers)
        ])
        self.backward_layers = nn.ModuleList([
            TransformerDecoderLayer(args) for _ in range(args.decoder_layers)
        ]) if not args.single_tower else self.forward_layers
        self.single_tower = args.single_tower

        self.full_attn_layer = None
        self.full_linear_layer = None

        if self.self_target:
            if args.linear_final_layer:
                self.full_linear_layer = Linear(embed_dim * 2, embed_dim,
                                                args.linear_final_layer_bias)
            else:
                self.full_attn_layer = BidirectionalTransformerDecoderLayer(
                    args)

        self.load_softmax = not getattr(args, 'remove_head', False)
        self.embed_out = None
        self.adaptive_softmax = None

        if self.load_softmax:
            if args.adaptive_softmax_cutoff is not None:
                self.adaptive_softmax = AdaptiveSoftmax(
                    len(dictionary),
                    args.decoder_embed_dim,
                    options.eval_str_list(args.adaptive_softmax_cutoff,
                                          type=int),
                    dropout=args.adaptive_softmax_dropout,
                    adaptive_inputs=embed_tokens
                    if args.tie_adaptive_weights else None,
                    factor=args.adaptive_softmax_factor,
                    tie_proj=args.tie_adaptive_proj,
                )
            elif not self.share_input_output_embed:
                self.embed_out = nn.Parameter(
                    torch.Tensor(len(dictionary), embed_dim))
                nn.init.normal_(self.embed_out, mean=0, std=embed_dim**-0.5)
        else:
            self.share_input_output_embed = False
    def __init__(
        self,
        cfg,
        dictionary,
        embed_tokens,
        no_encoder_attn=False,
        output_projection=None,
        scheduled_sampling_rate_scheduler=None,
    ):
        is_no_token_positional_embeddings_changed = False
        if (not cfg.no_token_positional_embeddings
                and cfg.decoder.relative_positional_embeddings):
            cfg.no_token_positional_embeddings = True
            is_no_token_positional_embeddings_changed = True
            logger.info(
                "disabled decoder's absolute positional embeddings as decoder_relative_positional_embeddings is True."
            )

        self.cfg = cfg
        super(TransformerDecoderBase, self).__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))
        self._future_mask = torch.empty(0)

        self.dropout_module = FairseqDropout(
            cfg.dropout,
            module_name=module_name_fordropout(self.__class__.__name__))
        self.decoder_layerdrop = cfg.decoder.layerdrop
        self.share_input_output_embed = cfg.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = cfg.decoder.embed_dim
        self.embed_dim = embed_dim
        self.output_embed_dim = cfg.decoder.output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = cfg.max_target_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if cfg.no_scale_embedding else math.sqrt(
            embed_dim)

        if not cfg.adaptive_input and cfg.quant_noise.pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                cfg.quant_noise.pq,
                cfg.quant_noise.pq_block_size,
            )
        else:
            self.quant_noise = None

        self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                               if embed_dim != input_embed_dim else None)
        self.embed_positions = (PositionalEmbedding(
            self.max_target_positions,
            embed_dim,
            self.padding_idx,
            learned=cfg.decoder.learned_pos,
        ) if not cfg.no_token_positional_embeddings else None)
        if cfg.layernorm_embedding:
            self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export)
        else:
            self.layernorm_embedding = None

        self.cross_self_attention = cfg.cross_self_attention

        if cfg.decoder.relative_positional_embeddings:
            if cfg.decoder.learned_pos:
                rel_pos_embed_list = [
                    RelativePositionalEmbedding(
                        cfg.decoder.embed_dim,
                        padding_idx=None,
                        max_size=cfg.max_target_positions,
                        learned=True,
                    ) for _ in range(cfg.decoder.layers)
                ]
            else:
                rel_pos_embed = RelativePositionalEmbedding(
                    cfg.decoder.embed_dim,
                    padding_idx=None,
                    max_size=None,
                    learned=False,
                )
                # single instance referenced across layers
                rel_pos_embed_list = [rel_pos_embed] * cfg.decoder.layers
        else:
            rel_pos_embed_list = [None] * cfg.decoder.layers

        if self.decoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend([
            self.build_decoder_layer(
                cfg,
                no_encoder_attn,
                positional_embedding=rel_pos_embed_list[i])
            for i in range(cfg.decoder.layers)
        ])
        self.num_layers = len(self.layers)

        if cfg.decoder.normalize_before and not cfg.no_decoder_final_norm:
            self.layer_norm = LayerNorm(embed_dim, export=cfg.export)
        else:
            self.layer_norm = None

        self.project_out_dim = (Linear(
            embed_dim, self.output_embed_dim, bias=False)
                                if embed_dim != self.output_embed_dim
                                and not cfg.tie_adaptive_weights else None)

        self.adaptive_softmax = None
        self.output_projection = output_projection
        if self.output_projection is None:
            self.build_output_projection(cfg, dictionary, embed_tokens)

        if is_no_token_positional_embeddings_changed:
            cfg.no_token_positional_embeddings = not cfg.no_token_positional_embeddings

        self.scheduled_sampling_rate_scheduler = scheduled_sampling_rate_scheduler
        for layer in self.layers:
            if isinstance(
                    layer,
                    TransformerWithRelativePositionalEmbeddingDecoderLayerBase
            ):
                layer.need_attn = False  # make validation fast
Example no. 24
0
    def __init__(self,
                 args,
                 dictionary,
                 embed_tokens,
                 no_encoder_decoder_attn=False):
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))

        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(
            embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                               if embed_dim != input_embed_dim else None)

        self.embed_positions = (PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None)

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerDecoderLayerPhase2(args, no_encoder_decoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = (Linear(
            embed_dim, self.output_embed_dim, bias=False)
                                if embed_dim != self.output_embed_dim
                                and not args.tie_adaptive_weights else None)

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), self.output_embed_dim))
            nn.init.normal_(self.embed_out,
                            mean=0,
                            std=self.output_embed_dim**-0.5)

        if args.decoder_normalize_before and not getattr(
                args, "no_decoder_final_norm", False):
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None
Example no. 25
0
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([3]))

        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        embed_dim = embed_tokens.embedding_dim
        self.embed_tokens = embed_tokens

        self.lstm_units = args.decoder_lstm_units
        self.num_layers = args.decoder_layers
        self.initial_input_dim = embed_dim

        self.encoder_output_dim = args.encoder_embed_dim
        if args.decoder_reduced_attention_dim is None:
            self.attention_dim = self.encoder_output_dim
        else:
            self.attention_dim = args.decoder_reduced_attention_dim
        self.input_dim = self.lstm_units + self.attention_dim

        self.num_attention_heads = args.decoder_attention_heads
        self.bottleneck_dim = args.decoder_out_embed_dim

        self.initial_rnn_layer = nn.LSTM(
            input_size=self.initial_input_dim, hidden_size=self.lstm_units
        )
        self.initial_layernorm = LayerNorm(self.lstm_units)

        self.proj_encoder_layer = None
        if self.attention_dim != self.encoder_output_dim:
            self.proj_encoder_layer = Linear(
                self.encoder_output_dim, self.attention_dim
            )

        self.proj_layer = None
        if self.lstm_units != self.attention_dim:
            self.proj_layer = Linear(
                self.lstm_units, self.attention_dim
            )

        self.attention = MultiheadAttention(
            self.attention_dim,
            self.num_attention_heads,
            dropout=args.attention_dropout,
            encoder_decoder_attention=True,
        )

        self.extra_rnn_layers = nn.ModuleList([])
        self.extra_layernorms = nn.ModuleList([])
        for _ in range(self.num_layers - 1):
            self.extra_rnn_layers.append(
                nn.LSTM(input_size=self.input_dim, hidden_size=self.lstm_units)
            )
            self.extra_layernorms.append(
                LayerNorm(self.lstm_units)
            )

        self.bottleneck_layer = None
        if self.bottleneck_dim is not None:
            self.out_embed_dim = self.bottleneck_dim
            self.bottleneck_layer = Linear(
                self.input_dim, self.out_embed_dim
            )
        else:
            self.out_embed_dim = self.input_dim

        if not self.share_input_output_embed:
            self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), self.out_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=self.out_embed_dim ** -0.5)
        else:
            assert self.bottleneck_dim == args.decoder_embed_dim, (self.bottleneck_dim, args.decoder_embed_dim)
    def __init__(self, embed_dim, num_classes):
        super().__init__()
        self.proj = Linear(2 * embed_dim, num_classes)
    def __init__(self,
                 args,
                 dictionary,
                 embed_tokens,
                 classification_head=None):
        super().__init__(dictionary)
        self.onnx_trace = False
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        self.embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.self_target = args.self_target
        self.future_target = args.future_target
        self.past_target = args.past_target
        self.char_inputs = args.char_inputs

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(self.embed_dim)

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions,
            self.embed_dim,
            self.padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.forward_layers = nn.ModuleList([
            TransformerDecoderLayer(
                args,
                no_encoder_attn=True,
                add_bias_kv=not args.no_bias_kv,
                add_zero_attn=args.no_bias_kv,
            ) for _ in range(args.decoder_layers)
        ])
        self.backward_layers = nn.ModuleList([
            TransformerDecoderLayer(
                args,
                no_encoder_attn=True,
                add_bias_kv=not args.no_bias_kv,
                add_zero_attn=args.no_bias_kv,
            ) for _ in range(args.decoder_layers)
        ])

        self.full_attn_layer = None
        self.full_linear_layer = None

        if self.self_target:
            if args.linear_final_layer:
                self.full_linear_layer = Linear(self.embed_dim * 2,
                                                self.embed_dim,
                                                args.linear_final_layer_bias)
            else:
                self.full_attn_layer = BidirectionalTransformerDecoderLayer(
                    args)

        self.load_softmax = not getattr(args, 'remove_head', False)
        self.embed_out = None
        self.adaptive_softmax = None
        self.classification_head = classification_head

        if self.load_softmax:
            if args.adaptive_softmax_cutoff is not None:
                self.adaptive_softmax = AdaptiveSoftmax(
                    len(dictionary),
                    args.decoder_embed_dim,
                    options.eval_str_list(args.adaptive_softmax_cutoff,
                                          type=int),
                    dropout=args.adaptive_softmax_dropout,
                )
            elif not self.share_input_output_embed:
                self.embed_out = nn.Parameter(
                    torch.Tensor(len(dictionary), self.embed_dim))
                nn.init.normal_(self.embed_out,
                                mean=0,
                                std=self.embed_dim**-0.5)
Example no. 28
0
    def __init__(
        self,
        cfg,
        return_fc=False,
        pre_encoder=None,
        input_size=83,
        transformer_context=None,
    ):
        self.cfg = cfg
        super(TransformerEncoderBase, self).__init__(None)  # no src dictionary
        self.register_buffer("version", torch.Tensor([3]))

        self.dropout_module = FairseqDropout(
            cfg.dropout, module_name=module_name_fordropout(self.__class__.__name__)
        )
        self.encoder_layerdrop = cfg.encoder.layerdrop
        self.return_fc = return_fc

        embed_dim = cfg.encoder.embed_dim
        self.max_source_positions = cfg.max_source_positions

        self.pre_encoder = pre_encoder
        self.fc0 = Linear(input_size, embed_dim) if input_size != embed_dim else None

        self.embed_scale = (
            1.0
            if cfg.no_scale_embedding
            or self.fc0 is not None  # always disable scaling if fc0 is present
            else math.sqrt(embed_dim)
        )

        if (
            not cfg.no_token_positional_embeddings
            and cfg.encoder.relative_positional_embeddings
        ):
            logger.info(
                "disabled encoder's absolute positional embeddings as encoder_relative_positional_embeddings is True."
            )
        self.embed_positions = (
            PositionalEmbedding(
                self.output_lengths(self.max_source_positions),
                embed_dim,
                0,
                learned=cfg.encoder.learned_pos,
            )
            if not cfg.no_token_positional_embeddings
            and not cfg.encoder.relative_positional_embeddings
            else None
        )

        if cfg.layernorm_embedding:
            self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export)
        else:
            self.layernorm_embedding = None

        if not cfg.adaptive_input and cfg.quant_noise.pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                cfg.quant_noise.pq,
                cfg.quant_noise.pq_block_size,
            )
        else:
            self.quant_noise = None

        if cfg.encoder.relative_positional_embeddings:
            if cfg.encoder.learned_pos:
                rel_pos_embed_list = [
                    RelativePositionalEmbedding(
                        cfg.encoder.embed_dim,
                        padding_idx=None,
                        max_size=self.output_lengths(cfg.max_source_positions),
                        learned=True,
                    )
                    for _ in range(cfg.encoder.layers)
                ]
            else:
                rel_pos_embed = RelativePositionalEmbedding(
                    cfg.encoder.embed_dim,
                    padding_idx=None,
                    max_size=None,
                    learned=False,
                )
                # single instance referenced across layers
                rel_pos_embed_list = [rel_pos_embed] * cfg.encoder.layers
        else:
            rel_pos_embed_list = [None] * cfg.encoder.layers

        if self.encoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend(
            [
                self.build_encoder_layer(
                    cfg, positional_embedding=rel_pos_embed_list[i]
                )
                for i in range(cfg.encoder.layers)
            ]
        )
        self.num_layers = len(self.layers)

        if cfg.encoder.normalize_before and cfg.encoder.layer_type != "conformer":
            self.layer_norm = LayerNorm(embed_dim, export=cfg.export)
        else:
            self.layer_norm = None

        self.transformer_context = transformer_context
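A side note on the "single instance referenced across layers" comment in the relative-positional-embedding branches above: multiplying a one-element list shares that module (and its parameters) across all layers, while a list comprehension creates independent copies. A tiny stand-alone check, using nn.Linear only as a placeholder module (not the RelativePositionalEmbedding used above):

# Demonstrates shared vs. independent module instances; nn.Linear is just
# a placeholder for any nn.Module.
import torch.nn as nn

shared = [nn.Linear(4, 4)] * 3                      # one instance, three references
independent = [nn.Linear(4, 4) for _ in range(3)]   # three distinct instances
assert shared[0] is shared[2]
assert independent[0] is not independent[2]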