def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([3]))
    self.dropout = args.dropout

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, embed_dim, self.padding_idx,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None

    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerEncoderLayer(args)
        for i in range(args.encoder_layers)
    ])

    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None

    self.add_template = args.add_template
    if self.add_template:
        self.template_layers = nn.ModuleList([])
        self.template_layers.extend([
            TransformerEncoderLayer(args)
            for i in range(args.encoder_layers)
        ])
        if args.encoder_normalize_before:
            self.tp_layer_norm = LayerNorm(embed_dim)
        else:
            self.tp_layer_norm = None
        self.positionwise = PositionWise(embed_dim, embed_dim, self.dropout)
        self.two_encoder_mix = nn.Linear(2 * embed_dim, embed_dim)
        self.attention = MultiheadAttention(
            embed_dim, args.encoder_attention_heads,
            dropout=args.attention_dropout,
        )
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([3]))
    self.dropout = args.dropout
    self.encoder_layerdrop = args.encoder_layerdrop

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, embed_dim, self.padding_idx,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None

    self.layer_wise_attention = getattr(args, 'layer_wise_attention', False)

    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerEncoderLayer(args)
        for i in range(args.encoder_layers)
    ])

    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
def __init__(self, args):
    super().__init__(None)
    self.dropout_module = FairseqDropout(
        p=args.dropout, module_name=self.__class__.__name__)
    self.embed_scale = math.sqrt(args.encoder_embed_dim)
    if args.no_scale_embedding:
        self.embed_scale = 1.0
    self.padding_idx = 1

    self.subsample = Conv1dSubsampler(
        args.input_feat_per_channel * args.input_channels,
        args.conv_channels,
        args.encoder_embed_dim,
        [int(k) for k in args.conv_kernel_sizes.split(",")],
    )

    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, args.encoder_embed_dim, self.padding_idx)

    self.transformer_layers = nn.ModuleList([
        TransformerEncoderLayer(args) for _ in range(args.encoder_layers)
    ])
    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(args.encoder_embed_dim)
    else:
        self.layer_norm = None
def __init__(self, args, dictionary, embed_tokens): super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) # pdb.set_trace() # self.dropout = [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3] self.dropout = [0, 0, 0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3] self.index = None self.encoder_layerdrop = args.encoder_layerdrop # self.embedding_hidden_mapping_in = SlimmableLinear([embed_dim, embed_dim, embed_dim, embed_dim], # [int(embed_dim / 4), int(embed_dim * 2 / 4), int(embed_dim * 3 / 4), embed_dim]) embed_dim = embed_tokens.embedding_dim self.embed_dim = embed_dim # self.embedding_hidden_mapping_in = SlimmableLinear([embed_dim, embed_dim, embed_dim, embed_dim], # [int(embed_dim / 4), int(embed_dim * 2 / 4), int(embed_dim * 3 / 4), embed_dim]) self.embedding_hidden_mapping_in = SlimmableLinear([embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim], [int(embed_dim * 4 / 16), int(embed_dim * 5 / 16), int(embed_dim * 6 / 16), int(embed_dim * 7 / 16), int(embed_dim * 8 / 16), int(embed_dim * 9 / 16), int(embed_dim * 10 / 16), int(embed_dim * 11 / 16), int(embed_dim * 12 / 16), int(embed_dim * 13 / 16), int(embed_dim * 14 / 16), int(embed_dim * 15 / 16), embed_dim]) self.padding_idx = embed_tokens.padding_idx self.max_source_positions = args.max_source_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) self.embed_positions = ( PositionalEmbedding( args.max_source_positions, embed_dim, self.padding_idx, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None ) self.layer_wise_attention = getattr(args, "layer_wise_attention", False) self.layers = nn.ModuleList([]) self.layers.extend( [TransformerEncoderLayer(args) for i in range(args.encoder_layers)] ) self.num_layers = len(self.layers) if args.encoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, "layernorm_embedding", False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None
def build_encoder_layer(self, args, i=None):
    modular_layer_indices = eval(args.encoder_modular_layer_indices)
    if type(modular_layer_indices) is int:
        modular_layer_indices = [modular_layer_indices]
    if i in modular_layer_indices:
        return TransformerModularEncoderLayer(args)
    else:
        return TransformerEncoderLayer(args)
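# Aside (not part of the original snippet): `eval` on a config string works here but
# will execute arbitrary code. A minimal, hedged sketch of a safer parse, assuming
# `args.encoder_modular_layer_indices` is a Python literal such as "(2, 4)" or "3";
# the helper name below is hypothetical:
import ast

def parse_modular_layer_indices(spec):
    # ast.literal_eval only accepts literals (ints, tuples, lists, ...), never expressions
    indices = ast.literal_eval(spec)
    if isinstance(indices, int):
        indices = [indices]
    return list(indices)

# Example: parse_modular_layer_indices("(2, 4)") -> [2, 4]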
def build_encoder_layer(self, args): layer = TransformerEncoderLayer(args) # if we are checkpointing, enforce that FSDP always wraps the # checkpointed layer, regardless of layer size min_params_to_wrap = (getattr(args, "min_params_to_wrap", DEFAULT_MIN_PARAMS_TO_WRAP)) layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap) return layer
def __init__(self, args, dictionary, embed_tokens): super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) self.dropout = args.dropout self.encoder_layerdrop = args.encoder_layerdrop embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx self.max_source_positions = args.max_source_positions if args.Tfixup: temp_state_dict = embed_tokens.state_dict() temp_state_dict["weight"] = (9 * args.encoder_layers) ** (- 1 / 4) * temp_state_dict["weight"] embed_tokens.load_state_dict(temp_state_dict) self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) self.embed_positions = ( PositionalEmbedding( args.max_source_positions, embed_dim, self.padding_idx, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None ) self.layer_wise_attention = getattr(args, "layer_wise_attention", False) self.layers = nn.ModuleList([]) self.layers.extend( [TransformerEncoderLayer(args) for i in range(args.encoder_layers)] ) # ======================== fixup calling initialization in layers ================================== if args.Tfixup: for encoder_layer in self.layers: encoder_layer.fixup_initialization(args) # ================================================================= self.num_layers = len(self.layers) if args.encoder_normalize_before and not args.dont_use_layernorm: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, "layernorm_embedding", False) and not args.dont_use_layernorm: self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None
def __init__(self, args, task):
    super().__init__(None)
    assert args.pretrained_model is not None
    pretrained_models, _ = checkpoint_utils.load_model_ensemble(
        [args.pretrained_model], task=task)
    self.audio_encoder = pretrained_models[0].encoder
    if args.freeze_pretrained != "none":
        for p_name, p_val in self.audio_encoder.named_parameters():
            p_val.requires_grad = False
    self.n_layers = args.context_encoder_layers
    self.dropout = args.dropout
    self.layers = nn.ModuleList(
        [TransformerEncoderLayer(args) for _ in range(self.n_layers)])
def build_encoder_layer(self, args): layer = TransformerEncoderLayer(args) checkpoint = getattr(args, "checkpoint_activations", False) if checkpoint: offload_to_cpu = getattr(args, "offload_activations", False) layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu) # if we are checkpointing, enforce that FSDP always wraps the # checkpointed layer, regardless of layer size min_params_to_wrap = (getattr(args, "min_params_to_wrap", DEFAULT_MIN_PARAMS_TO_WRAP) if not checkpoint else 0) layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap) return layer
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([3]))
    self.dropout = args.dropout
    self.encoder_layerdrop = args.encoder_layerdrop
    self.layer_wise_attention = getattr(args, 'layer_wise_attention', False)
    self.layer_norm = LayerNorm(args.encoder_embed_dim)
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerEncoderLayer(args)
        for i in range(args.encoder_layers)
    ])
def __init__(self, ninp, nhead, nhid, nout, nlayers, dropout=0.5):
    super(TransformerEnc, self).__init__()
    from torch.nn import TransformerEncoder, TransformerEncoderLayer
    self.model_type = 'Transformer'
    self.src_mask = None
    self.pos_encoder = PositionalEncoding(ninp, dropout, max_len=100)
    encoder_layers = TransformerEncoderLayer(nhid, nhead, nhid, dropout)
    # self.linear_pos_enc = LinearPositionalEmbedding(max_len=100)
    self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
    # self.encoder = nn.Embedding(ntoken, ninp)
    self.ninp = ninp
    self.hidden2pose_projection = nn.Linear(nhid, nout)
    self.pose2hidden_projection = nn.Linear(ninp, nhid)
def __init__(self, args, src_dict, embed_speaker):
    super().__init__(src_dict)
    self.padding_idx = src_dict.pad()
    self.embed_speaker = embed_speaker
    self.spk_emb_proj = None
    if embed_speaker is not None:
        self.spk_emb_proj = nn.Linear(
            args.encoder_embed_dim + args.speaker_embed_dim, args.encoder_embed_dim
        )

    self.dropout_module = FairseqDropout(
        p=args.dropout, module_name=self.__class__.__name__
    )
    self.embed_tokens = nn.Embedding(
        len(src_dict), args.encoder_embed_dim, padding_idx=self.padding_idx
    )
    assert args.encoder_conv_kernel_size % 2 == 1
    self.prenet = nn.ModuleList(
        nn.Sequential(
            nn.Conv1d(
                args.encoder_embed_dim,
                args.encoder_embed_dim,
                kernel_size=args.encoder_conv_kernel_size,
                padding=((args.encoder_conv_kernel_size - 1) // 2),
            ),
            nn.BatchNorm1d(args.encoder_embed_dim),
            nn.ReLU(),
            nn.Dropout(args.encoder_dropout),
        )
        for _ in range(args.encoder_conv_layers)
    )
    self.prenet_proj = nn.Linear(args.encoder_embed_dim, args.encoder_embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, args.encoder_embed_dim, self.padding_idx
    )
    self.pos_emb_alpha = nn.Parameter(torch.ones(1))

    self.transformer_layers = nn.ModuleList(
        TransformerEncoderLayer(args)
        for _ in range(args.encoder_transformer_layers)
    )
    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(args.encoder_embed_dim)
    else:
        self.layer_norm = None

    self.apply(encoder_init)
def __init__(self, args, dictionary, embed_tokens): super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) self.dropout = args.dropout self.encoder_layerdrop = args.encoder_layerdrop embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx self.max_source_positions = args.max_source_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt( embed_dim) self.embed_positions = (PositionalEmbedding( args.max_source_positions, embed_dim, self.padding_idx, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None) self.layer_wise_attention = getattr(args, "layer_wise_attention", False) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerEncoderLayer(args) for i in range(args.encoder_layers) ]) self.num_layers = len(self.layers) if args.encoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, "layernorm_embedding", False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None self.src_drop = args.src_drop self.RTD = args.RTD self.drop_method = args.drop_method if self.drop_method == 'drop_tag': self.mask = dictionary.indices['<dropped>'] elif self.drop_method == 'unk_tag': self.mask = dictionary.indices['<unk>']
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([3]))
    self.dropout = args.dropout
    # self.args = args

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    print(self.padding_idx)
    self.max_source_positions = args.max_source_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, embed_dim, self.padding_idx,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None

    self.layer_wise_attention = getattr(args, 'layer_wise_attention', False)

    self.layers = nn.ModuleList([])
    # if 'adaptive' in args.init_type and not args.encoder_normalize_before:
    #     print('adaptive init')
    self.layers.extend([
        TransformerEncoderLayer(args, LayerNum=i)
        for i in range(args.encoder_layers)
    ])
    # else:
    #     self.layers.extend([
    #         TransformerEncoderLayer(args)
    #         for i in range(args.encoder_layers)
    #     ])

    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None

    if args.fp16:
        self.out_type = torch.half
    else:
        self.out_type = torch.float
def __init__(self, args, dictionary, word_encoder_embed_dim, encoder_embed_dim):
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))

    self.dropout = args.dropout
    self.encoder_layerdrop = args.encoder_layerdrop

    if word_encoder_embed_dim != encoder_embed_dim:
        self.word_projection_layer = Linear(word_encoder_embed_dim, encoder_embed_dim)
    else:
        self.word_projection_layer = None

    embed_dim = encoder_embed_dim
    self.output_units = encoder_embed_dim
    self.padding_idx = dictionary.pad()
    self.max_source_positions = args.max_source_positions

    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)
    self.embed_positions = (
        PositionalEmbedding(
            args.max_source_positions,
            embed_dim,
            self.padding_idx,
            learned=args.encoder_learned_pos,
        )
        if not args.no_token_positional_embeddings
        else None
    )

    self.layer_wise_attention = getattr(args, "layer_wise_attention", False)

    self.layers = nn.ModuleList([])
    self.layers.extend(
        [TransformerEncoderLayer(args) for i in range(args.encoder_layers)]
    )
    self.num_layers = len(self.layers)

    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None
def __init__(self, args): """Construct an Encoder object.""" super().__init__(None) self.dropout = args.dropout self.embed_scale = (1.0 if args.no_scale_embedding else math.sqrt( args.encoder_embed_dim)) self.padding_idx = 1 self.in_channels = 1 self.input_dim = args.input_feat_per_channel self.conv = torch.nn.Sequential( torch.nn.Conv2d(1, args.conv_out_channels, 3, stride=2, padding=3 // 2), torch.nn.ReLU(), torch.nn.Conv2d( args.conv_out_channels, args.conv_out_channels, 3, stride=2, padding=3 // 2, ), torch.nn.ReLU(), ) transformer_input_dim = self.infer_conv_output_dim( self.in_channels, self.input_dim, args.conv_out_channels) self.out = torch.nn.Linear(transformer_input_dim, args.encoder_embed_dim) self.embed_positions = PositionalEmbedding( args.max_source_positions, args.encoder_embed_dim, self.padding_idx, learned=False, ) self.transformer_layers = nn.ModuleList([]) self.transformer_layers.extend([ TransformerEncoderLayer(args) for i in range(args.encoder_layers) ]) if args.encoder_normalize_before: self.layer_norm = LayerNorm(args.encoder_embed_dim) else: self.layer_norm = None
def __init__(self, args):
    # pass an empty dictionary
    super().__init__(dict())
    self.register_buffer('version', torch.Tensor([3]))
    self.dropout = args.dropout
    self.encoder_layerdrop = args.encoder_layerdrop

    embed_dim = args.encoder_embed_dim
    self.max_source_positions = args.max_source_positions

    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)
    # self.embed_scale = 1.0
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, embed_dim, padding_idx=0,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None

    self.layer_wise_attention = getattr(args, 'layer_wise_attention', False)

    # downsize the input embedding
    self.early_proj = args.early_proj
    if self.early_proj:
        self.inp_fc = Linear(args.src_embed_dim, embed_dim)

    # add encoder layers
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerEncoderLayer(args)
        for i in range(args.encoder_layers)
    ])

    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None

    if getattr(args, 'layernorm_embedding', False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([3]))
    self.dropout = args.dropout

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions
    self.agg_method = args.agg_method
    self.agg_layers = args.agg_layers

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, embed_dim, self.padding_idx,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None

    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerEncoderLayer(args)
        for i in range(args.encoder_layers)
    ])

    self.attn = MultiheadAttention(
        embed_dim, args.encoder_attention_heads,
        dropout=args.attention_dropout, encoder_decoder_attention=True,
    )
    self.fc = Linear(args.agg_layers * embed_dim, embed_dim)
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, 'activation_fn', 'relu'))
    self.activation_dropout = getattr(args, 'activation_dropout', 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, 'relu_dropout', 0)

    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
def __init__(self, args, embed_dim, dictionary=None):
    super(VideoTransformerEncoder, self).__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    self.args = args
    self.dropout = args.dropout

    resnet18 = models.resnet18(pretrained=False)
    self.spatio_enc = nn.Sequential(*list(resnet18.children())[:-1])
    if args.cnn_parameters_freeze:
        for p in self.spatio_enc.parameters():
            p.requires_grad = False
    if args.cnn_normalize_after:
        self.batchnorm = nn.BatchNorm1d(embed_dim)
        self.relu = nn.ReLU()

    self.padding_idx = 0  # note: this differs from the padding_idx of tgt_dict
    self.embed_positions = (PositionalEmbedding(
        args.max_source_positions,
        embed_dim,
        self.padding_idx,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None)

    self.layer_wise_attention = getattr(args, "layer_wise_attention", False)
    self.encoder_layerdrop = args.encoder_layerdrop

    self.temporal_enc_layers = nn.ModuleList([])
    self.temporal_enc_layers.extend([
        TransformerEncoderLayer(args) for i in range(args.encoder_layers)
    ])
    self.num_layer = len(self.temporal_enc_layers)

    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None
def __init__(self, args, tgt_dict, embed_tokens):
    super().__init__(tgt_dict)
    self.n_layers = args.context_encoder_layers
    self.embed_dim = args.decoder_embed_dim
    self.embed_scale = math.sqrt(self.embed_dim)  # todo: try with input_embed_dim
    self.embed_tokens = embed_tokens
    self.dropout = args.dropout
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions,
        self.embed_dim,
        self.embed_tokens.padding_idx,
        learned=args.decoder_learned_pos,
    )
    self.layers = nn.ModuleList(
        [TransformerEncoderLayer(args) for _ in range(self.n_layers)])
    input_embed_dim = embed_tokens.embedding_dim
    self.project_in_dim = Linear(input_embed_dim, self.embed_dim, bias=False) \
        if self.embed_dim != input_embed_dim else None
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([3]))
    self.args = args
    self.dropout = args.dropout
    self.bgt_setting = self.args.bgt_setting

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, embed_dim, self.padding_idx,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None

    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerEncoderLayer(args)
        for i in range(args.encoder_layers)
    ])

    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None

    self.hidden2mean = nn.Linear(embed_dim, self.args.latent_size, bias=False)
    if self.bgt_setting == "bgt":
        self.hidden2logv = nn.Linear(embed_dim, self.args.latent_size, bias=False)
        self.latent2hidden = nn.Linear(self.args.latent_size, embed_dim, bias=False)
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([3]))
    self.dropout = args.dropout

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, embed_dim, self.padding_idx,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None

    self.join_layer = args.join_layer
    if self.join_layer:
        self.num_branches = args.encoder_branches
        self.branch_dropout = args.branch_dropout
        self.layers_branches = nn.ModuleList([])
        for _ in range(args.encoder_layers):
            layer_i_branches = nn.ModuleList([])
            self.layers_branches.append(layer_i_branches)
            for _ in range(self.num_branches):
                layer_i_branches.append(TransformerEncoderLayer(args))
    else:
        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerMBEncoderLayer(args)
            for i in range(args.encoder_layers)
        ])

    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([3]))
    self.dropout = args.dropout

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, embed_dim, self.padding_idx,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None

    self.sde = args.sde

    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerEncoderLayer(args)
        for i in range(args.encoder_layers)
    ])

    if args.scale_norm:
        scale = embed_dim ** 0.5
    else:
        scale = None
    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim, scale=scale)
    else:
        self.layer_norm = None

    self.tracker = VariableTracker()
    self.track_gradients = False
def __init__(self, args, dictionary):
    super().__init__(dictionary)
    self.dropout_module = FairseqDropout(
        p=args.dropout, module_name=self.__class__.__name__)
    self.embed_scale = math.sqrt(args.encoder_embed_dim)
    if args.no_scale_embedding:
        self.embed_scale = 1.0
    self.padding_idx = 1

    self.subsample = Conv1dSubsampler(
        args.input_feat_per_channel * args.input_channels,
        args.conv_channels,
        args.encoder_embed_dim,
        [int(k) for k in args.conv_kernel_sizes.split(",")],
    )

    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, args.encoder_embed_dim, self.padding_idx)

    self.transformer_layers = nn.ModuleList([
        TransformerEncoderLayer(args) for _ in range(args.encoder_layers)
    ])
    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(args.encoder_embed_dim)
    else:
        self.layer_norm = None

    # ctc
    self.ctc_compress_out = args.ctc_compress_out
    if self.ctc_compress_out:
        self.ctc_fc = nn.Linear(args.encoder_embed_dim, len(dictionary))
        assert args.criterion == "ctc_multi_loss"
        self.ctc_layer = args.ctc_encoder_layer
        self.ctc_compress_method = getattr(CTCCompressStrategy, args.ctc_compress_strategy)
def build_encoder_layer(self, args):
    return TransformerEncoderLayer(args)
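# Aside (illustrative, not from the original snippets): build_encoder_layer is a factory
# hook; the encoders above populate their layer stacks with the same pattern. A minimal
# sketch of how such a hook is typically consumed, assuming `args.encoder_layers` exists
# (the helper name below is hypothetical):
def build_encoder_layers(self, args):
    # one encoder layer per args.encoder_layers, created through the overridable hook
    return nn.ModuleList(
        [self.build_encoder_layer(args) for _ in range(args.encoder_layers)]
    )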
def __init__(self, args, dictionary, embed_tokens): super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) try: if args.PRUNE_BOOL: self.encoder_self_attn_path = args.ENCODER_SELF_ATTN_PATH self.encoder_self_attn_pattern = torch.from_numpy(np.load(self.encoder_self_attn_path)) #(no_layers, 1, no_head, 1024, 1024) if args.CUDA: self.encoder_self_attn_pattern = self.encoder_self_attn_pattern.cuda() except: #backward compatibility args.PRUNE_BOOL = False args.PRUNE_ENC_SELF_ATTN = False args.PRUNE_DEC_SELF_ATTN = False args.PRUNE_ENC_DEC_ATTN = False args.TAU = 0 args.USE_ENTMAX = False args.ENCODER_SELF_ATTN_PATH = None args.DECODER_SELF_ATTN_PATH = None args.ENCODER_DECODER_ATTN_PATH = None args.CUDA = True args.RANDOM_PRUNE = False self.dropout = args.dropout self.encoder_layerdrop = args.encoder_layerdrop embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx self.max_source_positions = args.max_source_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) self.embed_positions = ( PositionalEmbedding( args.max_source_positions, embed_dim, self.padding_idx, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None ) self.layer_wise_attention = getattr(args, "layer_wise_attention", False) self.layers = nn.ModuleList([]) if args.PRUNE_BOOL: self.layers.extend( [TransformerEncoderLayer(args, self.encoder_self_attn_pattern[i]) for i in range(args.encoder_layers)] ) else: self.layers.extend( [TransformerEncoderLayer(args, None) for i in range(args.encoder_layers)] ) self.num_layers = len(self.layers) if args.encoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, "layernorm_embedding", False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([3]))
    self.dropout = args.dropout

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, embed_dim, self.padding_idx,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None

    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerEncoderLayer(args)
        for i in range(args.encoder_layers)
    ])

    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None

    # image section
    self.img_dim = 2048
    self.text_dim = embed_dim
    self.L2norm = args.L2norm
    self.total_num_img = args.total_num_img
    self.per_num_img = args.per_num_img

    # cap2image_file = args.cap2image_file
    # image_embedding_file = args.image_embedding_file
    cap2image_file = getattr(args, "cap2image_file", "data/cap2image.pickle")
    image_embedding_file = getattr(args, "image_embedding_file",
                                   "features_resnet50/train-resnet50-avgpool.npy")

    self.cap2image = pickle.load(open(cap2image_file, "rb"))  # cap_id to image_id

    # print("image embedding processing...")
    embeding_weights = np.load(image_embedding_file)
    img_vocab, img_dim = embeding_weights.shape
    embeddings_matrix = np.zeros((img_vocab + 1, img_dim))
    embeddings_matrix[1:] = embeding_weights
    self.img_embeddings = nn.Embedding.from_pretrained(
        torch.FloatTensor(embeddings_matrix), freeze=args.image_emb_fix)
    # update embedding
    # self.img_embeddings.load_state_dict({'weight': embeddings_matrix})
    # if args.image_emb_fix:
    #     self.img_embeddings.weight.requires_grad = False

    self.merge_option = args.merge_option
    self.dense = nn.Linear(self.img_dim, self.text_dim)
    self.mergeImage = nn.Linear(self.total_num_img, 1)
    if self.merge_option == "att-mul-concat":
        self.proj_attention = SCAttention(self.text_dim, 128)
        self.dense2 = nn.Linear(self.text_dim, 384)
    elif self.merge_option == "att-concat":
        self.dense2 = nn.Linear(2 * self.text_dim, self.text_dim)
    elif self.merge_option == "att-gate":
        self.gate_type = args.gate_type
        self.proj_attention = SCAttention(self.text_dim, self.text_dim)
        if self.gate_type == "neural-gate":
            self.sigmoid = nn.Sigmoid()
            self.gate_dense = nn.Linear(2 * self.text_dim, self.text_dim)
        elif self.gate_type == "scalar-gate":
            self.sigmoid = nn.Sigmoid()
            self.gate_dense = nn.Linear(2 * self.text_dim, 1)
        else:
            self.image_weight = args.image_weight
    else:
        self.proj_attention = SCAttention(self.text_dim, self.text_dim)
def __init__(
    self,
    input_feat_per_channel,
    vggblock_config=DEFAULT_ENC_VGGBLOCK_CONFIG,
    transformer_config=DEFAULT_ENC_TRANSFORMER_CONFIG,
    encoder_output_dim=512,
    in_channels=1,
    transformer_context=None,
    transformer_sampling=None,
):
    """constructor for VGGTransformerEncoder

    Args:
        - input_feat_per_channel: feature dim (not including stacked,
          just base feature)
        - in_channel: # input channels (e.g., if stack 8 feature vectors
          together, this is 8)
        - vggblock_config: configuration of vggblock, see comments on
          DEFAULT_ENC_VGGBLOCK_CONFIG
        - transformer_config: configuration of transformer layer, see comments on
          DEFAULT_ENC_TRANSFORMER_CONFIG
        - encoder_output_dim: final transformer output embedding dimension
        - transformer_context: (left, right) if set, self-attention will be focused
          on (t-left, t+right)
        - transformer_sampling: an iterable of int, must match with
          len(transformer_config), transformer_sampling[i] indicates sampling
          factor for i-th transformer layer, after multihead att and feedforward
          part
    """
    super().__init__(None)

    self.num_vggblocks = 0
    if vggblock_config is not None:
        if not isinstance(vggblock_config, Iterable):
            raise ValueError("vggblock_config is not iterable")
        self.num_vggblocks = len(vggblock_config)

    self.conv_layers = nn.ModuleList()
    self.in_channels = in_channels
    self.input_dim = input_feat_per_channel
    self.pooling_kernel_sizes = []

    if vggblock_config is not None:
        for _, config in enumerate(vggblock_config):
            (
                out_channels,
                conv_kernel_size,
                pooling_kernel_size,
                num_conv_layers,
                layer_norm,
            ) = config
            self.conv_layers.append(
                VGGBlock(
                    in_channels,
                    out_channels,
                    conv_kernel_size,
                    pooling_kernel_size,
                    num_conv_layers,
                    input_dim=input_feat_per_channel,
                    layer_norm=layer_norm,
                ))
            self.pooling_kernel_sizes.append(pooling_kernel_size)
            in_channels = out_channels
            input_feat_per_channel = self.conv_layers[-1].output_dim

    # transformer_input_dim is the output dimension of the VGG part
    transformer_input_dim = self.infer_conv_output_dim(
        self.in_channels, self.input_dim)

    self.validate_transformer_config(transformer_config)
    self.transformer_context = self.parse_transformer_context(
        transformer_context)
    self.transformer_sampling = self.parse_transformer_sampling(
        transformer_sampling, len(transformer_config))

    self.transformer_layers = nn.ModuleList()
    if transformer_input_dim != transformer_config[0][0]:
        self.transformer_layers.append(
            Linear(transformer_input_dim, transformer_config[0][0]))
    self.transformer_layers.append(
        TransformerEncoderLayer(
            prepare_transformer_encoder_params(*transformer_config[0])))

    for i in range(1, len(transformer_config)):
        if transformer_config[i - 1][0] != transformer_config[i][0]:
            self.transformer_layers.append(
                Linear(transformer_config[i - 1][0], transformer_config[i][0]))
        self.transformer_layers.append(
            TransformerEncoderLayer(
                prepare_transformer_encoder_params(*transformer_config[i])))

    self.encoder_output_dim = encoder_output_dim
    self.transformer_layers.extend([
        Linear(transformer_config[-1][0], encoder_output_dim),
        LayerNorm(encoder_output_dim),
    ])
def build_encoder_layer(self, args): layer = TransformerEncoderLayer(args) if getattr(args, "checkpoint_activations", False): layer = checkpoint_wrapper(layer) return layer
def build_encoder_layer(self, args): layer = TransformerEncoderLayer(args) if getattr(args, "checkpoint_activations", False): offload_to_cpu = getattr(args, "offload_activations", False) layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu) return layer