def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
    super(LMDecoderLayer, self).__init__()
    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
    self.multihead_tgt = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static, share=1)

    ff_p = p
    feedforward = FeedForward(d_model, d_ff, ff_p, static=onmt.Constants.static)
    self.feedforward = Bottle(feedforward)
def __init__(self, h, d_model, p, d_ff, attn_p=0.1, version=1.0):
    super(ParallelEncoderLayer, self).__init__()
    self.version = version
    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
    self.multihead = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    self.feedforward = Bottle(feedforward)
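# A quick, self-contained check of the maxout pool-size arithmetic used above: `k` is chosen
# so that the maxout layer has at least d_ff pre-activation units per output.
# The sizes below are illustrative, not taken from any real config.
import math

d_model, d_ff = 512, 2048
k = int(math.ceil(d_ff / d_model))  # pieces per maxout unit, as in the layer above
assert d_model * k >= d_ff
print(k)  # 4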
def __init__(self, opt, dicts, positional_encoder):
    super(TransformerEncoder, self).__init__()
    self.model_size = opt.model_size
    self.n_heads = opt.n_heads
    self.inner_size = opt.inner_size
    self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.time = opt.time
    self.version = opt.version

    self.word_lut = nn.Embedding(dicts.size(), self.model_size, padding_idx=onmt.Constants.PAD)

    if opt.time == 'positional_encoding':
        self.time_transformer = positional_encoder
    elif opt.time == 'gru':
        self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
    elif opt.time == 'lstm':
        self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

    self.positional_encoder = positional_encoder

    self.layer_modules = nn.ModuleList([EncoderLayer(self.n_heads, self.model_size, self.dropout,
                                                     self.inner_size, self.attn_dropout)
                                        for _ in range(self.layers)])

    self.pretrained_point = -1
def __init__(self, opt, embedding, positional_encoder, attribute_embeddings=None, ignore_source=False):
    super(TransformerDecoder, self).__init__()
    self.model_size = opt.model_size
    self.n_heads = opt.n_heads
    self.inner_size = opt.inner_size
    self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.time = opt.time
    self.version = opt.version
    self.encoder_type = opt.encoder_type
    self.ignore_source = ignore_source
    self.encoder_cnn_downsampling = opt.cnn_downsampling

    if opt.time == 'positional_encoding':
        self.time_transformer = positional_encoder
    else:
        raise NotImplementedError

    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

    self.word_lut = embedding

    # Using feature embeddings in models
    if attribute_embeddings is not None:
        self.use_feature = True
        self.attribute_embeddings = attribute_embeddings
        self.feature_projector = nn.Linear(opt.model_size + opt.model_size * attribute_embeddings.size(),
                                           opt.model_size)
    else:
        self.use_feature = None

    self.positional_encoder = positional_encoder

    len_max = self.positional_encoder.len_max
    mask = torch.ByteTensor(np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
    self.register_buffer('mask', mask)

    self.build_modules()
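# The `mask` buffer registered above is a strictly upper-triangular matrix used for causal
# self-attention. A minimal sketch (plain PyTorch, illustrative sizes) of how such a mask is
# built and then sliced to the current target length, mirroring the np.triu(..., k=1) call:
import torch

len_max = 6
mask = torch.triu(torch.ones(len_max, len_max, dtype=torch.uint8), diagonal=1)

len_tgt = 4
step_mask = mask[:len_tgt, :len_tgt]  # sliced exactly like self.mask[:len_tgt, :len_tgt]
print(step_mask)
# tensor([[0, 1, 1, 1],
#         [0, 0, 1, 1],
#         [0, 0, 0, 1],
#         [0, 0, 0, 0]], dtype=torch.uint8)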
def __init__(self, opt, dicts, positional_encoder):
    super(StochasticTransformerDecoder, self).__init__()
    self.model_size = opt.model_size
    self.n_heads = opt.n_heads
    self.inner_size = opt.inner_size
    self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.time = opt.time
    self.death_rate = opt.death_rate
    if hasattr(opt, 'grow_dropout'):
        self.grow_dropout = opt.grow_dropout

    if opt.time == 'positional_encoding':
        self.time_transformer = positional_encoder
    elif opt.time == 'gru':
        self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
    elif opt.time == 'lstm':
        self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

    # self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d',
                                              static=onmt.Constants.static)
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

    self.word_lut = nn.Embedding(dicts.size(), self.model_size, padding_idx=onmt.Constants.PAD)

    self.positional_encoder = positional_encoder

    self.layer_modules = nn.ModuleList()
    for l in range(self.layers):
        # linearly decay the death rate
        death_r = (l + 1) / self.layers * self.death_rate
        block = StochasticDecoderLayer(self.n_heads, self.model_size, self.dropout,
                                       self.inner_size, self.attn_dropout, death_rate=death_r)
        self.layer_modules.append(block)

    e_length = expected_length(self.layers, self.death_rate)
    print("Stochastic Decoder with %.2f expected layers" % e_length)
    # self.layer_modules = nn.ModuleList([DecoderLayer(self.n_heads, self.model_size, self.dropout,
    #                                                  self.inner_size, self.attn_dropout)
    #                                     for _ in range(e_length)])

    len_max = self.positional_encoder.len_max
    # print(len_max)
    mask = torch.ByteTensor(np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
    self.register_buffer('mask', mask)
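# `expected_length` is not defined in this snippet. A plausible sketch, assuming it sums the
# per-layer survival probabilities implied by the linearly decaying death_r computed above:
def expected_length(num_layers, death_rate):
    # E[#layers kept] = sum over layers of P(layer survives), with the death rate
    # growing linearly with depth exactly as in the loop above.
    e = 0.0
    for l in range(num_layers):
        survival = 1.0 - (l + 1.0) / num_layers * death_rate
        e += survival
    return e

# e.g. 6 layers with death_rate=0.5 -> 6 - 0.5 * (1 + 2 + ... + 6) / 6 = 4.25 expected layers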
def __init__(self, opt, embedding, positional_encoder, encoder_type='text'):
    super(TransformerEncoder, self).__init__()
    self.model_size = opt.model_size
    self.n_heads = opt.n_heads
    self.inner_size = opt.inner_size
    if hasattr(opt, 'encoder_layers') and opt.encoder_layers != -1:
        self.layers = opt.encoder_layers
    else:
        self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.time = opt.time
    self.version = opt.version
    self.input_type = encoder_type
    self.cnn_downsampling = opt.cnn_downsampling

    self.channels = 1
    feature_size = opt.input_size

    if encoder_type != "text":
        if not self.cnn_downsampling:
            self.audio_trans = nn.Linear(feature_size, self.model_size)
            torch.nn.init.xavier_uniform_(self.audio_trans.weight)
        else:
            channels = self.channels
            cnn = [nn.Conv2d(channels, 64, kernel_size=(3, 3), stride=2), nn.ReLU(True), nn.BatchNorm2d(64),
                   nn.Conv2d(64, 64, kernel_size=(3, 3), stride=2), nn.ReLU(True), nn.BatchNorm2d(64)]
            self.audio_trans = nn.Sequential(*cnn)

            # feat_size is the flattened feature dimension after the two stride-2 convolutions
            feat_size = (((feature_size // channels) - 3) // 4) * 64
            assert self.model_size == feat_size, \
                "The model dimension doesn't match with the feature dim, expecting %d " % feat_size
    else:
        self.word_lut = embedding

    if opt.time == 'positional_encoding':
        self.time_transformer = positional_encoder
    elif opt.time == 'gru':
        self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
    elif opt.time == 'lstm':
        self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

    self.positional_encoder = positional_encoder

    self.build_modules()
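# The feat_size formula above corresponds to two unpadded 3x3 convolutions with stride 2
# along the feature axis, times 64 output channels. A small, self-contained check with an
# illustrative feature size (not a real config):
def conv_out(n, kernel=3, stride=2):
    # output length of a single unpadded convolution
    return (n - kernel) // stride + 1

feature_size, channels = 40, 1
freq = feature_size // channels
after_two_convs = conv_out(conv_out(freq))  # 40 -> 19 -> 9
shortcut = (freq - 3) // 4                  # the closed form used in the code: 9
assert after_two_convs == shortcut          # they agree for this size
feat_size = shortcut * 64                   # 64 output channels flattened into the feature dim
print(feat_size)                            # 576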
def __init__(self, opt, dicts, positional_encoder):
    super(FCTransformerDecoder, self).__init__()
    self.model_size = opt.model_size
    self.n_heads = opt.n_heads
    self.inner_size = opt.inner_size
    self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.time = opt.time
    self.version = opt.version

    if opt.time == 'positional_encoding':
        self.time_transformer = positional_encoder
    elif opt.time == 'gru':
        self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
    elif opt.time == 'lstm':
        self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)

    if self.version == 1.0:
        self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

    self.word_lut = nn.Embedding(dicts.size(), self.model_size, padding_idx=onmt.Constants.PAD)

    self.positional_encoder = positional_encoder

    self.layer_modules = nn.ModuleList([FCTDecoderLayer(self.n_heads, self.model_size, self.dropout,
                                                        self.inner_size, self.attn_dropout)
                                        for _ in range(self.layers)])

    len_max = self.positional_encoder.len_max
    mask = torch.ByteTensor(np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
    self.register_buffer('mask', mask)
def __init__(self, opt, dicts, positional_encoder):
    super(TransformerEncoder, self).__init__()
    self.model_size = opt.model_size
    self.n_heads = opt.n_heads
    self.inner_size = opt.inner_size
    if hasattr(opt, 'encoder_layers') and opt.encoder_layers != -1:
        self.layers = opt.encoder_layers
    else:
        self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.time = opt.time
    self.version = opt.version
    self.input_type = opt.encoder_type

    if opt.encoder_type != "text":
        self.audio_trans = nn.Linear(dicts, self.model_size)
    else:
        self.word_lut = nn.Embedding(dicts.size(), self.model_size, padding_idx=onmt.Constants.PAD)

    if opt.time == 'positional_encoding':
        self.time_transformer = positional_encoder
    elif opt.time == 'gru':
        self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
    elif opt.time == 'lstm':
        self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

    self.positional_encoder = positional_encoder

    self.build_modules()
def __init__(self, opt, dicts, positional_encoder):
    super(StochasticTransformerEncoder, self).__init__()
    self.model_size = opt.model_size
    self.n_heads = opt.n_heads
    self.inner_size = opt.inner_size
    self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.time = opt.time
    self.death_rate = opt.death_rate
    if hasattr(opt, 'grow_dropout'):
        self.grow_dropout = opt.grow_dropout

    self.word_lut = nn.Embedding(dicts.size(), self.model_size, padding_idx=onmt.Constants.PAD)

    if opt.time == 'positional_encoding':
        self.time_transformer = positional_encoder
    elif opt.time == 'gru':
        self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
    elif opt.time == 'lstm':
        self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

    # self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d',
                                              static=onmt.Constants.static)
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n', elementwise_affine=True)

    self.positional_encoder = positional_encoder

    self.layer_modules = nn.ModuleList()
    for l in range(self.layers):
        # linearly decay the death rate
        death_r = (l + 1.0) / self.layers * self.death_rate
        block = StochasticEncoderLayer(self.n_heads, self.model_size, self.dropout,
                                       self.inner_size, self.attn_dropout, death_rate=death_r)
        self.layer_modules.append(block)

    e_length = expected_length(self.layers, self.death_rate)
    print("Stochastic Encoder with %.2f expected layers" % e_length)
def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
    super(EncoderLayer, self).__init__()
    self.preprocess_rnn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_rnn = PrePostProcessing(d_model, p, sequence='da')
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da')

    self.rnn = nn.LSTM(d_model, d_model // 2, 1, bidirectional=True)

    # feedforward = FeedForward(d_model, d_ff, p)
    self.ffn = FeedForward(d_model, d_ff, p)
def __init__(self, opt, dicts, positional_encoder, ignore_source=False):
    super(TransformerDecoder, self).__init__()
    self.model_size = opt.model_size
    self.n_heads = opt.n_heads
    self.inner_size = opt.inner_size
    self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.time = opt.time
    self.version = opt.version
    self.encoder_type = opt.encoder_type
    self.ignore_source = ignore_source

    if opt.time == 'positional_encoding':
        self.time_transformer = positional_encoder
    else:
        raise NotImplementedError
    # elif opt.time == 'gru':
    #     self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
    # elif opt.time == 'lstm':
    #     self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

    self.word_lut = nn.Embedding(dicts.size(), self.model_size, padding_idx=onmt.Constants.PAD)

    self.positional_encoder = positional_encoder

    len_max = self.positional_encoder.len_max
    mask = torch.ByteTensor(np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
    self.register_buffer('mask', mask)

    self.build_modules()
def __init__(self, opt, dicts, positional_encoder):
    super(TransformerEncoder, self).__init__()
    self.model_size = opt.model_size  # d_model: the dimension used between sublayers
    self.n_heads = opt.n_heads  # number of heads in multi-head attention
    self.inner_size = opt.inner_size  # size of the feed-forward network inside each sublayer
    self.layers = opt.layers  # number of stacked encoder/decoder layers in the model
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout  # D.S: dropout applied when converting the input to embeddings
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.time = opt.time
    self.residual_dropout = opt.residual_dropout

    self.word_lut = nn.Embedding(dicts.size(), self.model_size, padding_idx=onmt.Constants.PAD)

    if opt.time == 'positional_encoding':
        self.time_transformer = positional_encoder
    elif opt.time == 'gru':
        self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
    elif opt.time == 'lstm':
        self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

    # performs preprocessing (here: dropout)
    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
    # performs postprocessing (here: layer norm)
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

    self.positional_encoder = positional_encoder

    self.build_modules()
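# word_dropout above is applied through `embedded_dropout` elsewhere in this codebase.
# A minimal sketch of the idea, with hypothetical names: zero out whole embedding rows
# (word types) rather than individual activations, rescaling the survivors.
import torch.nn.functional as F

def embedded_dropout_sketch(embed, words, dropout=0.1):
    # embed: nn.Embedding, words: LongTensor of token indices
    if dropout > 0:
        keep = embed.weight.new_empty((embed.weight.size(0), 1)).bernoulli_(1 - dropout) / (1 - dropout)
        masked_weight = embed.weight * keep  # drop entire rows of the embedding table
    else:
        masked_weight = embed.weight
    return F.embedding(words, masked_weight, padding_idx=embed.padding_idx)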
def __init__(self, opt, embeddings, positional_encoder, attribute_embeddings=None, generator=None):
    """
    :param opt: Options
    :param embeddings: a list of two embedding tables [src, tgt]
    :param positional_encoder: The sinusoidal positional encoding
    :param attribute_embeddings: To be implemented
    """
    super(RelativeTransformer, self).__init__()
    self.model_size = opt.model_size
    self.n_heads = opt.n_heads
    self.inner_size = opt.inner_size
    self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.time = opt.time
    self.version = opt.version
    self.encoder_type = opt.encoder_type
    self.encoder_cnn_downsampling = opt.cnn_downsampling
    self.variational_dropout = opt.variational_dropout
    self.switchout = opt.switchout
    self.death_rate = opt.death_rate
    self.layer_modules = None
    self.use_feature = False
    self.d_head = self.model_size // self.n_heads

    # disable word dropout when switchout is in action
    if self.switchout > 0:
        self.word_dropout = 0

    self.positional_encoder = positional_encoder
    self.relative = True

    # two embedding layers for src and tgt
    self.src_word_lut = embeddings[0]
    self.tgt_word_lut = embeddings[1]

    self.generator = generator

    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d',
                                              variational=self.variational_dropout)
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

    self.build_modules()
def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
    super(RelativeTransformerEncoderLayer, self).__init__()
    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.preprocess_attn_rev = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)

    self.d_head = d_head = d_model // h
    self.multihead_fwd = RelPartialLearnableMultiHeadAttn(h // 2, d_model, d_head, dropatt=attn_p)
    self.multihead_bwd = RelPartialLearnableMultiHeadAttn(h // 2, d_model, d_head, dropatt=attn_p)
    self.attn_out = Linear(h * self.d_head, d_model)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p, static=onmt.Constants.static)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    elif onmt.Constants.activation_layer == 'linear_swish_linear':
        ff_p = p
        feedforward = FeedForwardSwish(d_model, d_ff, ff_p, static=onmt.Constants.static)
    self.feedforward = feedforward
def add_layers(self, n_new_layer):
    self.new_modules = list()
    self.layers += n_new_layer

    for i in range(n_new_layer):
        layer = ParallelEncoderLayer(self.n_heads, self.model_size, self.dropout,
                                     self.inner_size, self.attn_dropout)

        # the first layer will use the preprocessing which is the last postprocessing
        if i == 0:
            layer.preprocess_attn.load_state_dict(self.postprocess_layer.state_dict())
            # layer.preprocess_attn.layer_norm.function.weight.requires_grad = False
            # layer.preprocess_attn.layer_norm.function.bias.requires_grad = False
            # if hasattr(layer.postprocess_attn, 'k'):
            #     layer.postprocess_attn.k.data.fill_(0.01)

            # replace the last postprocessing layer with a new one
            self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

        self.layer_modules.append(layer)
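# A hedged usage sketch of the growing workflow that add_layers and mark_pretrained imply.
# The constructor arguments and the training loop are assumed and not shown in this snippet.
encoder = ParallelTransformerEncoder(opt, dicts, positional_encoder)  # assumed constructor
# ... pretrain the shallow model ...
encoder.mark_pretrained()   # records self.pretrained_point = self.layers
encoder.add_layers(2)       # grow by two layers; the first new layer inherits the old final layer norm
# ... continue training; forward(input, grow=True) runs the pretrained stack without gradients ...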
def __init__(self, opt, dicts):
    super(RecurrentDecoder, self).__init__()
    self.model_size = opt.model_size
    self.n_heads = opt.n_heads
    self.inner_size = opt.inner_size
    self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout

    self.word_lut = nn.Embedding(dicts.size(), self.model_size, padding_idx=onmt.Constants.PAD)

    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d')
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

    self.layer_modules = nn.ModuleList([DecoderLayer(self.n_heads, self.model_size, self.dropout,
                                                     self.inner_size, self.attn_dropout)
                                        for _ in range(self.layers)])
def __init__(self, h, d_model, p, d_ff, attn_p=0.1, variational=False):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.variational = variational

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)

    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', variational=self.variational)

    d_head = d_model // h
    self.multihead_tgt = RelPartialLearnableMultiHeadAttn(h, d_model, d_head, dropatt=attn_p)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p, variational=self.variational)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    elif onmt.Constants.activation_layer == 'linear_swish_linear':
        ff_p = p
        feedforward = FeedForwardSwish(d_model, d_ff, ff_p, variational=self.variational)
    else:
        raise NotImplementedError
    self.feedforward = Bottle(feedforward)
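# variational=True in the 'da' post-processing suggests dropout with a single mask shared
# across time steps (variational / locked dropout). A minimal sketch of that idea, assuming
# a [len, batch, dim] layout; names are illustrative, not the PrePostProcessing internals.
def variational_dropout_sketch(x, p=0.1, training=True):
    # sample one dropout mask per (batch, feature) position and reuse it at every time step
    if not training or p == 0:
        return x
    mask = x.new_empty(1, x.size(1), x.size(2)).bernoulli_(1 - p) / (1 - p)
    return x * mask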
def __init__(self, opt, dicts, positional_encoder, time_encoder):
    super(UniversalTransformerDecoder, self).__init__()
    self.model_size = opt.model_size
    self.n_heads = opt.n_heads
    self.inner_size = opt.inner_size
    self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.time = opt.time

    self.positional_encoder = positional_encoder
    self.time_encoder = time_encoder

    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d',
                                              static=onmt.Constants.static)
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

    self.word_lut = nn.Embedding(dicts.size(), self.model_size, padding_idx=onmt.Constants.PAD)

    self.recurrent_layer = UniversalDecoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size,
                                                 self.positional_encoder, self.time_encoder, self.attn_dropout)

    len_max = self.positional_encoder.len_max
    mask = torch.ByteTensor(np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
    self.register_buffer('mask', mask)
def __init__(self, h, d_model, p, d_ff, attn_p=0.1, version=1.0, ignore_source=False):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)

    self.ignore_source = ignore_source
    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
        self.multihead_src = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static, share=2)

    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)

    # self.multihead_tgt = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static, share=1)
    d_head = d_model // h
    self.multihead_tgt = RelPartialLearnableMultiHeadAttn(h, d_model, d_head, dropatt=attn_p)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p, static=onmt.Constants.static)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    elif onmt.Constants.activation_layer == 'linear_swish_linear':
        ff_p = p
        feedforward = FeedForwardSwish(d_model, d_ff, ff_p, static=onmt.Constants.static)
    self.feedforward = feedforward
def __init__(self, h, d_model, p, d_ff, pos_encoder, time_encoder, attn_p=0.1, version=1.0):
    super(UniversalEncoderLayer, self).__init__()
    self.version = version

    # position and time embeddings are added to the input before the layer
    self.pos_encoder = pos_encoder
    self.time_encoder = time_encoder

    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=onmt.Constants.static)
    self.multihead = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    self.feedforward = Bottle(feedforward)
def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
    super(FCTEncoderLayer, self).__init__()
    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da', static=True)
    # self.multihead = HierarchicalMultiHeadAttention(h, d_model, attn_p=attn_p)
    self.multihead = UniformMultiHeadAttention(h, d_model, attn_p=attn_p)

    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da', static=True)

    if onmt.Constants.activation_layer == 'linear_relu_linear':
        ff_p = p
        feedforward = FeedForward(d_model, d_ff, ff_p)
    elif onmt.Constants.activation_layer == 'maxout':
        k = int(math.ceil(d_ff / d_model))
        feedforward = MaxOut(d_model, d_model, k)
    self.feedforward = Bottle(feedforward)
def __init__(self, opt, dicts):
    super().__init__()
    self.model_size = opt.model_size
    self.n_heads = opt.n_heads
    self.inner_size = opt.inner_size
    self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.time = opt.time
    self.encoder_type = opt.encoder_type

    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)

    self.word_lut = nn.Embedding(dicts.size(), self.model_size, padding_idx=onmt.Constants.PAD)

    self.rnn = nn.LSTM(self.model_size, self.model_size, num_layers=3, dropout=self.dropout)

    self.postprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)

    self.h = None
    self.c = None
def __init__(self, opt, dicts, positional_encoder, time_encoder):
    super(UniversalTransformerEncoder, self).__init__()
    self.model_size = opt.model_size
    self.n_heads = opt.n_heads
    self.inner_size = opt.inner_size
    self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.time = opt.time

    self.word_lut = nn.Embedding(dicts.size(), self.model_size, padding_idx=onmt.Constants.PAD)

    self.positional_encoder = positional_encoder
    self.time_encoder = time_encoder

    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d',
                                              static=onmt.Constants.static)
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

    self.recurrent_layer = UniversalEncoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size,
                                                 self.positional_encoder, self.time_encoder, self.attn_dropout)
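# Unlike the stacked encoders above, the Universal Transformer keeps a single recurrent_layer.
# A hedged sketch of how a forward pass would reuse it across depth steps; the loop and the
# `step` argument below are assumptions for illustration, not taken from this snippet.
# context: [batch, len_src, d_model] after embedding and preprocessing (assumed)
# for t in range(self.layers):
#     # the same weights are applied at every depth step; position/time signals are
#     # injected via pos_encoder / time_encoder inside UniversalEncoderLayer
#     context = self.recurrent_layer(context, mask_src, step=t)
# context = self.postprocess_layer(context)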
def add_layers(self, n_new_layer):
    self.new_modules = list()
    self.layers += n_new_layer

    for i in range(n_new_layer):
        layer = EncoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout)

        # the first layer will use the preprocessing which is the last postprocessing
        if i == 0:
            layer.preprocess_attn = self.postprocess_layer

            # replace the last postprocessing layer with a new one
            self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

        self.layer_modules.append(layer)
def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
    super(DecoderLayer, self).__init__()
    self.preprocess_rnn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_rnn = PrePostProcessing(d_model, p, sequence='da')
    self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da')
    self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
    self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da')

    self.multihead_src = MultiHeadAttention(h, d_model, attn_p=attn_p)
    self.rnn = nn.LSTM(d_model, d_model, 1, bidirectional=False)

    feedforward = FeedForward(d_model, d_ff, p)
    self.feedforward = feedforward
class ParallelTransformerEncoder(nn.Module):
    """Encoder in 'Attention is all you need'

    Args:
        opt: list of options (see train.py)
        dicts: dictionary (for source language)
    """

    def __init__(self, opt, dicts, positional_encoder):
        super(ParallelTransformerEncoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        if hasattr(opt, 'grow_dropout'):
            self.grow_dropout = opt.grow_dropout

        self.word_lut = nn.Embedding(dicts.size(), self.model_size, padding_idx=onmt.Constants.PAD)

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        elif opt.time == 'gru':
            self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
        elif opt.time == 'lstm':
            self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

        # self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
        self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d',
                                                  static=onmt.Constants.static)
        self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

        self.positional_encoder = positional_encoder

        self.layer_modules = nn.ModuleList([ParallelEncoderLayer(self.n_heads, self.model_size, self.dropout,
                                                                 self.inner_size, self.attn_dropout)
                                            for _ in range(self.layers)])

    def add_layers(self, n_new_layer):
        self.new_modules = list()
        self.layers += n_new_layer

        for i in range(n_new_layer):
            layer = ParallelEncoderLayer(self.n_heads, self.model_size, self.dropout,
                                         self.inner_size, self.attn_dropout)

            # the first layer will use the preprocessing which is the last postprocessing
            if i == 0:
                layer.preprocess_attn.load_state_dict(self.postprocess_layer.state_dict())
                # layer.preprocess_attn.layer_norm.function.weight.requires_grad = False
                # layer.preprocess_attn.layer_norm.function.bias.requires_grad = False
                # if hasattr(layer.postprocess_attn, 'k'):
                #     layer.postprocess_attn.k.data.fill_(0.01)

                # replace the last postprocessing layer with a new one
                self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

            self.layer_modules.append(layer)

    def mark_pretrained(self):
        self.pretrained_point = self.layers

    def forward(self, input, grow=False):
        """
        Inputs Shapes:
            input: batch_size x len_src (want to transpose)
        Outputs Shapes:
            out: batch_size x len_src x d_model
            mask_src
        """
        if grow:
            return self.forward_grow(input)

        # Embedding: batch_size x len_src x d_model
        emb = embedded_dropout(self.word_lut, input, dropout=self.word_dropout if self.training else 0)

        # Scale the emb by sqrt(d_model)
        if self.time == 'positional_encoding':
            emb = emb * math.sqrt(self.model_size)

        # Adding positional encoding
        emb = self.time_transformer(emb)
        if isinstance(emb, tuple):
            emb = emb[0]
        emb = self.preprocess_layer(emb)

        mask_src = input.data.eq(onmt.Constants.PAD).unsqueeze(1)  # batch_size x 1 x len_src for broadcasting
        pad_mask = torch.autograd.Variable(input.data.ne(onmt.Constants.PAD))  # batch_size x len_src
        # pad_mask = None

        context = emb.contiguous()

        memory_bank = list()

        for i, layer in enumerate(self.layer_modules):
            if len(self.layer_modules) - i <= onmt.Constants.checkpointing and self.training:
                context, norm_input = checkpoint(custom_layer(layer), context, mask_src, pad_mask)
                # print(type(context))
            else:
                context, norm_input = layer(context, mask_src, pad_mask)  # batch_size x len_src x d_model

            if i > 0:  # don't keep the norm input of the first layer (a.k.a. embedding)
                memory_bank.append(norm_input)

        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        context = self.postprocess_layer(context)

        # make a huge memory bank on the encoder side
        memory_bank.append(context)
        memory_bank = torch.stack(memory_bank)

        return memory_bank, mask_src

    def forward_grow(self, input):
        """
        Inputs Shapes:
            input: batch_size x len_src (want to transpose)
        Outputs Shapes:
            out: batch_size x len_src x d_model
            mask_src
        """
        with torch.no_grad():
            # Embedding: batch_size x len_src x d_model
            emb = embedded_dropout(self.word_lut, input, dropout=self.word_dropout if self.training else 0)

            # Scale the emb by sqrt(d_model)
            if self.time == 'positional_encoding':
                emb = emb * math.sqrt(self.model_size)

            # Adding positional encoding
            emb = self.time_transformer(emb)
            if isinstance(emb, tuple):
                emb = emb[0]
            emb = self.preprocess_layer(emb)

            mask_src = input.data.eq(onmt.Constants.PAD).unsqueeze(1)  # batch_size x 1 x len_src for broadcasting
            pad_mask = torch.autograd.Variable(input.data.ne(onmt.Constants.PAD))  # batch_size x len_src

            context = emb.contiguous()

            memory_bank = list()

            for i in range(self.pretrained_point):
                layer = self.layer_modules[i]
                context, norm_input = layer(context, mask_src, pad_mask)  # batch_size x len_src x d_model
                if i > 0:  # don't keep the norm input of the first layer (a.k.a. embedding)
                    memory_bank.append(norm_input)

        for i in range(self.layers - self.pretrained_point):
            res_drop_rate = 0.0
            if i == 0:
                res_drop_rate = self.grow_dropout

            layer = self.layer_modules[self.pretrained_point + i]
            context, norm_input = layer(context, mask_src, pad_mask,
                                        residual_dropout=res_drop_rate)  # batch_size x len_src x d_model
            memory_bank.append(norm_input)

        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        context = self.postprocess_layer(context)

        # make a huge memory bank on the encoder side
        memory_bank.append(context)
        memory_bank = torch.stack(memory_bank)

        return memory_bank, mask_src
def __init__(self, opt, vec_linear, positional_encoder, encoder_type='text'):
    super(TransformerEncoder, self).__init__()

    # by me
    # assert bert_embeddings is not None

    self.model_size = opt.model_size
    self.n_heads = opt.n_heads
    self.inner_size = opt.inner_size
    if hasattr(opt, 'encoder_layers') and opt.encoder_layers != -1:
        self.layers = opt.encoder_layers
    else:
        self.layers = opt.layers
    self.dropout = opt.hidden_dropout
    # word dropout for the source side is handled inside BERT
    # self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.enc_emb_dropout = opt.enc_emb_dropout
    self.enc_gradient_checkpointing = opt.enc_gradient_checkpointing
    self.time = opt.time
    self.version = opt.version
    self.input_type = encoder_type
    self.cnn_downsampling = opt.cnn_downsampling
    self.switchout = opt.switchout
    self.variational_dropout = opt.variational_dropout
    self.fp16 = opt.fp16

    # disable word dropout when switchout is in action
    # if self.switchout > 0.0:
    #     self.word_dropout = 0.0

    feature_size = opt.input_size
    self.channels = 1  # n. audio channels
    if opt.upsampling:
        feature_size = feature_size // 4

    if encoder_type != "text":
        if not self.cnn_downsampling:
            self.audio_trans = nn.Linear(feature_size, self.model_size)
            torch.nn.init.xavier_uniform_(self.audio_trans.weight)
        else:
            channels = self.channels
            cnn = [nn.Conv2d(channels, 32, kernel_size=(3, 3), stride=2), nn.ReLU(True), nn.BatchNorm2d(32),
                   nn.Conv2d(32, 32, kernel_size=(3, 3), stride=2), nn.ReLU(True), nn.BatchNorm2d(32)]

            feat_size = (((feature_size // channels) - 3) // 4) * 32
            # cnn.append()
            self.audio_trans = nn.Sequential(*cnn)
            self.linear_trans = nn.Linear(feat_size, self.model_size)
            # assert self.model_size == feat_size, \
            #     "The model dimension doesn't match with the feature dim, expecting %d " % feat_size
    else:
        self.word_lut = None
        # [4 * 768, model_size]
        self.vec_linear = vec_linear  # [bert_hidden_size, transformer_model_size]

    if opt.time == 'positional_encoding':
        self.time_transformer = positional_encoder
    elif opt.time == 'gru':
        self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
    elif opt.time == 'lstm':
        self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

    self.preprocess_layer = PrePostProcessing(self.model_size, self.enc_emb_dropout, sequence='d',
                                              variational=self.variational_dropout)
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

    self.positional_encoder = positional_encoder

    self.build_modules()
def __init__(self, opt, dicts, positional_encoder, encoder_type):
    super(TransformerEncoder, self).__init__()
    self.model_size = opt.model_size
    self.n_heads = opt.n_heads
    self.inner_size = opt.inner_size
    if hasattr(opt, 'encoder_layers') and opt.encoder_layers != -1:
        self.layers = opt.encoder_layers
    else:
        self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.time = opt.time
    self.version = opt.version
    self.input_type = encoder_type

    # input lookup table
    if encoder_type != "text":
        self.audio_trans = nn.Linear(dicts, self.model_size)
    else:
        self.word_lut = nn.Embedding(dicts.size(), self.model_size, padding_idx=onmt.Constants.PAD)

    if opt.time == 'positional_encoding':
        self.time_transformer = positional_encoder
    elif opt.time == 'gru':
        self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
    elif opt.time == 'lstm':
        self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

    self.positional_encoder = positional_encoder

    self.limit_rhs_steps = opt.limit_rhs_steps
    self.build_modules(limit_rhs_steps=opt.limit_rhs_steps)

    if self.limit_rhs_steps is not None:
        largest_rhs_mask = positional_encoder.len_max + self.limit_rhs_steps
        rhs_mask = torch.BoolTensor(np.triu(np.ones((largest_rhs_mask, largest_rhs_mask)),
                                            k=1 + self.limit_rhs_steps).astype('uint8'))
        self.register_buffer('rhs_mask', rhs_mask)

    if opt.freeze_encoder:
        for p in self.parameters():
            p.requires_grad = False
            print(p.requires_grad)
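# The rhs_mask above bounds how far to the right a source position may attend. A small
# worked example of the same np.triu(..., k=1 + limit_rhs_steps) construction, with
# illustrative sizes rather than the real len_max:
import numpy as np
import torch

limit_rhs_steps = 2
n = 5
rhs_mask = torch.BoolTensor(np.triu(np.ones((n, n)), k=1 + limit_rhs_steps).astype('uint8'))
print(rhs_mask.int())
# tensor([[0, 0, 0, 1, 1],
#         [0, 0, 0, 0, 1],
#         [0, 0, 0, 0, 0],
#         [0, 0, 0, 0, 0],
#         [0, 0, 0, 0, 0]], dtype=torch.int32)
# position i may attend up to position i + limit_rhs_steps; everything further right is masked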
def __init__(self, opt, dicts, positional_encoder, ignore_source=False, feature_embedding=None):
    super(TransformerDecoder, self).__init__()
    self.model_size = opt.model_size
    self.n_heads = opt.n_heads
    self.inner_size = opt.inner_size
    self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.time = opt.time
    self.version = opt.version
    self.encoder_type = opt.encoder_type
    self.ignore_source = ignore_source

    self.fixed_target_length = 0
    if hasattr(opt, 'fixed_target_length'):
        if opt.fixed_target_length == "int":
            self.fixed_target_length = 1
            print('Embedding')
        elif opt.fixed_target_length == "encoding":
            self.fixed_target_length = 2
            print('Encoding')
        elif opt.fixed_target_length == "forward_backward_encoding":
            self.fixed_target_length = 3
            print('Forward backward encoding')
        elif opt.fixed_target_length == "no":
            print('No fixed target len.')
        else:
            raise NotImplementedError

    if opt.time == 'positional_encoding':
        self.time_transformer = positional_encoder
    else:
        raise NotImplementedError
    # elif opt.time == 'gru':
    #     self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
    # elif opt.time == 'lstm':
    #     self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

    self.word_lut = nn.Embedding(dicts.size(), self.model_size, padding_idx=onmt.Constants.PAD)

    # self.feat_lut = feature_embedding
    # if self.feat_lut is not None:
    #     self.enable_feature = True
    #     self.feature_projector = nn.Linear(opt.model_size * 2, opt.model_size)
    # else:
    self.enable_feature = False

    self.positional_encoder = positional_encoder

    if self.fixed_target_length == 1:
        self.length_lut = nn.Embedding(8192, opt.model_size, padding_idx=onmt.Constants.PAD)
        self.length_projector = nn.Linear(opt.model_size * 2, opt.model_size)

    len_max = self.positional_encoder.len_max
    mask = torch.ByteTensor(np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
    self.register_buffer('mask', mask)

    self.build_modules()
class ParallelTransformerDecoder(nn.Module):
    """Decoder in 'Attention is all you need'

    Args:
        opt
        dicts
    """

    def __init__(self, opt, dicts, positional_encoder):
        super(ParallelTransformerDecoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        if hasattr(opt, 'grow_dropout'):
            self.grow_dropout = opt.grow_dropout

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        elif opt.time == 'gru':
            self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
        elif opt.time == 'lstm':
            self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

        # self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
        self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d',
                                                  static=onmt.Constants.static)
        self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

        self.word_lut = nn.Embedding(dicts.size(), self.model_size, padding_idx=onmt.Constants.PAD)

        self.positional_encoder = positional_encoder

        self.layer_modules = nn.ModuleList([DecoderLayer(self.n_heads, self.model_size, self.dropout,
                                                         self.inner_size, self.attn_dropout)
                                            for _ in range(self.layers)])

        len_max = self.positional_encoder.len_max
        mask = torch.ByteTensor(np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
        self.register_buffer('mask', mask)

    def renew_buffer(self, new_len):
        self.positional_encoder.renew(new_len)
        mask = torch.ByteTensor(np.triu(np.ones((new_len, new_len)), k=1).astype('uint8'))
        self.register_buffer('mask', mask)

    def mark_pretrained(self):
        self.pretrained_point = self.layers

    def add_layers(self, n_new_layer):
        self.new_modules = list()
        self.layers += n_new_layer

        for i in range(n_new_layer):
            layer = DecoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout)

            # the first layer will use the preprocessing which is the last postprocessing
            if i == 0:
                # layer.preprocess_attn = self.postprocess_layer
                layer.preprocess_attn.load_state_dict(self.postprocess_layer.state_dict())
                # layer.preprocess_attn.layer_norm.function.weight.requires_grad = False
                # layer.preprocess_attn.layer_norm.function.bias.requires_grad = False
                # if hasattr(layer.postprocess_attn, 'k'):
                #     layer.postprocess_attn.k.data.fill_(0.01)

                # replace the last postprocessing layer with a new one
                self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

            self.layer_modules.append(layer)

    def forward(self, input, context, src, grow=False):
        """
        Inputs Shapes:
            input: (Variable) batch_size x len_tgt (want to transpose)
            context: (Variable) batch_size x len_src x d_model
            mask_src: (Tensor) batch_size x len_src
        Outputs Shapes:
            out: batch_size x len_tgt x d_model
            coverage: batch_size x len_tgt x len_src
        """
        if grow:
            return self.forward_grow(input, context, src)

        # Embedding: batch_size x len_tgt x d_model
        emb = embedded_dropout(self.word_lut, input, dropout=self.word_dropout if self.training else 0)
        if self.time == 'positional_encoding':
            emb = emb * math.sqrt(self.model_size)

        # Adding positional encoding
        emb = self.time_transformer(emb)
        if isinstance(emb, tuple):
            emb = emb[0]
        emb = self.preprocess_layer(emb)

        mask_src = src.data.eq(onmt.Constants.PAD).unsqueeze(1)
        pad_mask_src = torch.autograd.Variable(src.data.ne(onmt.Constants.PAD))

        len_tgt = input.size(1)
        mask_tgt = input.data.eq(onmt.Constants.PAD).unsqueeze(1) + self.mask[:len_tgt, :len_tgt]
        mask_tgt = torch.gt(mask_tgt, 0)

        output = emb.contiguous()

        pad_mask_tgt = torch.autograd.Variable(input.data.ne(onmt.Constants.PAD))  # batch_size x len_tgt
        pad_mask_src = torch.autograd.Variable(1 - mask_src.squeeze(1))

        # memory_bank = None

        for i, layer in enumerate(self.layer_modules):
            if len(self.layer_modules) - i <= onmt.Constants.checkpointing and self.training:
                output, coverage = checkpoint(custom_layer(layer), output, context[i], mask_tgt, mask_src,
                                              pad_mask_tgt, pad_mask_src)  # batch_size x len_tgt x d_model
            else:
                output, coverage = layer(output, context[i], mask_tgt, mask_src,
                                         pad_mask_tgt, pad_mask_src)  # batch_size x len_tgt x d_model

        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        output = self.postprocess_layer(output)

        return output, coverage

    def forward_grow(self, input, context, src):
        """
        Inputs Shapes:
            input: (Variable) batch_size x len_tgt (want to transpose)
            context: (Variable) batch_size x len_src x d_model
            mask_src: (Tensor) batch_size x len_src
        Outputs Shapes:
            out: batch_size x len_tgt x d_model
            coverage: batch_size x len_tgt x len_src
        """
        with torch.no_grad():
            # Embedding: batch_size x len_tgt x d_model
            emb = embedded_dropout(self.word_lut, input, dropout=self.word_dropout if self.training else 0)
            if self.time == 'positional_encoding':
                emb = emb * math.sqrt(self.model_size)

            # Adding positional encoding
            emb = self.time_transformer(emb)
            if isinstance(emb, tuple):
                emb = emb[0]
            emb = self.preprocess_layer(emb)

            mask_src = src.data.eq(onmt.Constants.PAD).unsqueeze(1)
            pad_mask_src = torch.autograd.Variable(src.data.ne(onmt.Constants.PAD))

            len_tgt = input.size(1)
            mask_tgt = input.data.eq(onmt.Constants.PAD).unsqueeze(1) + self.mask[:len_tgt, :len_tgt]
            mask_tgt = torch.gt(mask_tgt, 0)

            output = emb.contiguous()

            pad_mask_tgt = torch.autograd.Variable(input.data.ne(onmt.Constants.PAD))  # batch_size x len_tgt
            pad_mask_src = torch.autograd.Variable(1 - mask_src.squeeze(1))

            for i in range(self.pretrained_point):
                layer = self.layer_modules[i]
                output, coverage = layer(output, context[i], mask_tgt, mask_src,
                                         pad_mask_tgt, pad_mask_src)  # batch_size x len_tgt x d_model

        for i in range(self.layers - self.pretrained_point):
            res_drop_rate = 0.0
            if i == 0:
                res_drop_rate = self.grow_dropout

            layer = self.layer_modules[self.pretrained_point + i]
            output, coverage = layer(output, context[self.pretrained_point + i], mask_tgt, mask_src,
                                     pad_mask_tgt, pad_mask_src,
                                     residual_dropout=res_drop_rate)  # batch_size x len_tgt x d_model

        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        output = self.postprocess_layer(output)

        return output, coverage

    # def step(self, input, context, src, buffer=None):
    def step(self, input, decoder_state):
        """
        Inputs Shapes:
            input: (Variable) batch_size x len_tgt (want to transpose)
            context: (Variable) batch_size x len_src x d_model
            mask_src: (Tensor) batch_size x len_src
            buffer: (list of tensors) batch_size x (len_tgt - 1) x d_model per layer,
                used to avoid recomputing self-attention
        Outputs Shapes:
            out: batch_size x len_tgt x d_model
            coverage: batch_size x len_tgt x len_src
        """
        # note: transpose 1-2 because the first dimension (0) is the number of layers
        context = decoder_state.context.transpose(1, 2)
        buffer = decoder_state.buffer
        src = decoder_state.src.transpose(0, 1)

        if decoder_state.input_seq is None:
            decoder_state.input_seq = input
        else:
            # concatenate the last input to the previous input sequence
            decoder_state.input_seq = torch.cat([decoder_state.input_seq, input], 0)
        input = decoder_state.input_seq.transpose(0, 1)

        output_buffer = list()
        batch_size = input.size(0)
        input_ = input[:, -1].unsqueeze(1)
        # print(input_.size())

        # Embedding: batch_size x 1 x d_model
        emb = self.word_lut(input_)

        if self.time == 'positional_encoding':
            emb = emb * math.sqrt(self.model_size)

        # Adding positional encoding
        if self.time == 'positional_encoding':
            emb = self.time_transformer(emb, t=input.size(1))
        else:
            prev_h = buffer[0] if buffer is not None else None
            emb = self.time_transformer(emb, prev_h)
            buffer[0] = emb[1]

        if isinstance(emb, tuple):
            emb = emb[0]  # emb should be batch_size x 1 x dim

        # Preprocess layer: adding dropout
        emb = self.preprocess_layer(emb)

        # batch_size x 1 x len_src
        mask_src = src.data.eq(onmt.Constants.PAD).unsqueeze(1)
        pad_mask_src = torch.autograd.Variable(src.data.ne(onmt.Constants.PAD))

        len_tgt = input.size(1)
        mask_tgt = input.data.eq(onmt.Constants.PAD).unsqueeze(1) + self.mask[:len_tgt, :len_tgt]
        # mask_tgt = self.mask[:len_tgt, :len_tgt].unsqueeze(0).repeat(batch_size, 1, 1)
        mask_tgt = torch.gt(mask_tgt, 0)
        mask_tgt = mask_tgt[:, -1, :].unsqueeze(1)

        output = emb.contiguous()

        pad_mask_tgt = torch.autograd.Variable(input.data.ne(onmt.Constants.PAD))  # batch_size x len_tgt
        pad_mask_src = torch.autograd.Variable(1 - mask_src.squeeze(1))

        memory_bank = None

        for i, layer in enumerate(self.layer_modules):
            buffer_ = buffer[i] if buffer is not None else None
            assert output.size(1) == 1
            output, coverage, buffer_ = layer.step(output, context[i], mask_tgt, mask_src,
                                                   pad_mask_tgt=None, pad_mask_src=None,
                                                   buffer=buffer_)  # batch_size x 1 x d_model
            output_buffer.append(buffer_)

        buffer = torch.stack(output_buffer)

        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        output = self.postprocess_layer(output)

        decoder_state._update_state(buffer)

        return output, coverage
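# A hedged sketch of how `step` is driven at inference time: one new target token per call,
# with decoder_state carrying the source context, the accumulated input_seq and the per-layer
# buffers updated through _update_state. The generator and beam-search plumbing below are
# assumptions for illustration, not part of this snippet.
# for t in range(max_len):                                     # hypothetical greedy loop
#     out, coverage = decoder.step(next_token, decoder_state)  # out: batch_size x 1 x d_model
#     logits = generator(out)                                  # assumed projection to the vocabulary
#     next_token = logits.argmax(dim=-1)                       # pick the next token greedily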