def forward(self, input, **kwargs): """ Inputs Shapes: input: (Variable) len_tgt x batch_size Outputs Shapes: out: len_tgt x batch_size x d_model """ emb = embedded_dropout( self.word_lut, input, dropout=self.word_dropout if self.training else 0) emb = self.preprocess_layer(emb) if self.h is None: lstm_mem = None else: lstm_mem = (self.h.detach(), self.c.detach()) output, (h, c) = self.rnn(emb, lstm_mem) output = self.postprocess_layer(output) output_dict = defaultdict(lambda: None) output_dict['hidden'] = output output_dict['lstm_mem'] = (h, c) self.h = h self.c = c return output_dict
def process_embedding(self, input, atbs=None):
    # switchout is currently disabled:
    # if self.switchout == 0:
    #     input_ = input
    # if self.switchout > 0 and self.training:
    #     vocab_size = self.word_lut.weight.size(0)
    #     input_ = switchout(input, vocab_size, self.switchout)
    # else:
    input_ = input

    emb = embedded_dropout(self.word_lut, input_,
                           dropout=self.word_dropout if self.training else 0)
    if self.time == 'positional_encoding':
        emb = emb * math.sqrt(self.model_size)
    """ Adding positional encoding """
    emb = self.time_transformer(emb)

    if self.use_feature:
        len_tgt = emb.size(1)
        # B x H -> B x len_tgt x H
        atb_emb = self.attribute_embeddings(atbs).unsqueeze(1).repeat(1, len_tgt, 1)
        emb = torch.cat([emb, atb_emb], dim=-1)
        emb = torch.relu(self.feature_projector(emb))

    return emb
def forward(self, input, context, src, hidden=None):
    """
    Inputs:
        context (Variable): len_src x batch_size x H
        input   (Variable): len_tgt x batch_size
        src     (Variable): len_src x batch_size
    """
    emb = embedded_dropout(self.word_lut, input,
                           dropout=self.word_dropout if self.training else 0)

    # transpose to batch-first to fit the attention format
    mask_src = src.data.eq(onmt.Constants.PAD).transpose(0, 1).unsqueeze(1)

    # normalize the embedding
    emb = self.preprocess_layer(emb)

    output = emb
    rnn_hiddens = list()

    for layer in self.layer_modules:
        output, rnn_hidden, coverage = layer(output, context, mask_src)
        rnn_hiddens.append(rnn_hidden)

    output = self.postprocess_layer(output)

    return output, rnn_hiddens, coverage
def forward(self, input, context, src, **kwargs):
    """
    Input Shapes:
        input:    (Variable) batch_size x len_tgt
        context:  (Variable) batch_size x len_src x d_model
        mask_src: (Tensor)   batch_size x len_src
    Output Shapes:
        out:      batch_size x len_tgt x d_model
        coverage: batch_size x len_tgt x len_src
    """

    """ Embedding: batch_size x len_tgt x d_model """
    emb = embedded_dropout(self.word_lut, input,
                           dropout=self.word_dropout if self.training else 0)
    if self.time == 'positional_encoding':
        emb = emb * math.sqrt(self.model_size)
    """ Adding positional encoding """
    emb = self.time_transformer(emb)
    if isinstance(emb, tuple):
        emb = emb[0]
    emb = self.preprocess_layer(emb)

    mask_src = src.data.eq(onmt.Constants.PAD).unsqueeze(1)
    pad_mask_src = torch.autograd.Variable(src.data.ne(onmt.Constants.PAD))

    len_tgt = input.size(1)
    mask_tgt = input.data.eq(onmt.Constants.PAD).unsqueeze(1) + self.mask[:len_tgt, :len_tgt]
    mask_tgt = torch.gt(mask_tgt, 0)

    output = emb.contiguous()

    pad_mask_tgt = torch.autograd.Variable(input.data.ne(onmt.Constants.PAD))  # batch_size x len_tgt
    pad_mask_src = torch.autograd.Variable(1 - mask_src.squeeze(1))

    for i, layer in enumerate(self.layer_modules):
        if len(self.layer_modules) - i <= onmt.Constants.checkpointing and self.training:
            output, coverage = checkpoint(custom_layer(layer), output, context,
                                          mask_tgt, mask_src, pad_mask_tgt, pad_mask_src)
        else:
            output, coverage = layer(output, context, mask_tgt, mask_src,
                                     pad_mask_tgt, pad_mask_src)  # batch_size x len_tgt x d_model

    # From Google T2T:
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    output = self.postprocess_layer(output)

    return output, coverage
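# --- Hedged illustration (not part of the module above) ---
# How the combined target mask is built in the decoder forward() above: a padding mask
# (non-zero at PAD positions) is broadcast-added to an upper-triangular "future" mask,
# and torch.gt(., 0) turns the sum back into a boolean mask. PAD = 0 and the tiny batch
# below are assumed only for the example.
import torch

PAD = 0
tgt = torch.tensor([[5, 6, 7, PAD],
                    [8, 9, PAD, PAD]])                          # batch_size x len_tgt
len_tgt = tgt.size(1)

pad_mask = tgt.eq(PAD).unsqueeze(1).to(torch.uint8)             # batch_size x 1 x len_tgt
future_mask = torch.triu(torch.ones(len_tgt, len_tgt, dtype=torch.uint8), diagonal=1)

mask_tgt = torch.gt(pad_mask + future_mask, 0)                  # batch_size x len_tgt x len_tgt
print(mask_tgt[0].int())  # True/1 entries are the positions that must not be attended to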
def forward(self, input): """ Inputs Shapes: input: batch_size x len_src (wanna tranpose) Outputs Shapes: out: batch_size x len_src x d_model mask_src """ """ Embedding: batch_size x len_src x d_model """ emb = embedded_dropout( self.word_lut, input, dropout=self.word_dropout if self.training else 0) """ Scale the emb by sqrt(d_model) """ if self.time == 'positional_encoding': emb = emb * math.sqrt(self.model_size) """ Adding positional encoding """ emb = self.time_transformer(emb) if isinstance(emb, tuple): emb = emb[0] emb = self.preprocess_layer(emb) mask_src = input.data.eq(onmt.Constants.PAD).unsqueeze( 1) # batch_size x len_src x 1 for broadcasting pad_mask = torch.autograd.Variable(input.data.ne( onmt.Constants.PAD)) # batch_size x len_src #~ pad_mask = None context = emb.contiguous() memory_bank = None for i, layer in enumerate(self.layer_modules): if len(self.layer_modules ) - i <= onmt.Constants.checkpointing and self.training: context, memory_bank = checkpoint(custom_layer(layer), context, memory_bank, mask_src, pad_mask) #~ print(type(context)) else: context, memory_bank = layer( context, memory_bank, mask_src, pad_mask) # batch_size x len_src x d_model # From Google T2T # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. context = self.postprocess_layer(context) # make a huge memory bank on the encoder side memory_bank = torch.cat([memory_bank, context.unsqueeze(0)], dim=0) return memory_bank, mask_src
def forward(self, input, **kwargs): """ Inputs Shapes: input: batch_size x len_src (wanna tranpose) Outputs Shapes: out: batch_size x len_src x d_model mask_src """ """ Embedding: batch_size x len_src x d_model """ if self.input_type == "text": mask_src = input.data.eq(onmt.Constants.PAD).unsqueeze( 1) # batch_size x len_src x 1 for broadcasting emb = embedded_dropout( self.word_lut, input, dropout=self.word_dropout if self.training else 0) else: mask_src = input.narrow(2, 0, 1).squeeze(2).eq( onmt.Constants.PAD).unsqueeze(1) input = input.narrow(2, 1, input.size(2) - 1) emb = self.audio_trans(input.contiguous().view( -1, input.size(2))).view(input.size(0), input.size(1), -1) """ Scale the emb by sqrt(d_model) """ emb = emb * math.sqrt(self.model_size) """ Adding positional encoding """ emb = self.time_transformer(emb) emb = self.preprocess_layer(emb) context = emb.transpose(0, 1).contiguous() for i, layer in enumerate(self.layer_modules): if len(self.layer_modules ) - i <= onmt.Constants.checkpointing and self.training: context = checkpoint(custom_layer(layer), context, mask_src) else: context = layer(context, mask_src) # batch_size x len_src x d_model # From Google T2T # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. context = self.postprocess_layer(context) output_dict = {'context': context, 'src_mask': mask_src} # return context, mask_src return output_dict
def forward(self, input, **kwargs): """ Inputs Shapes: input: batch_size x len_src (wanna tranpose) Outputs Shapes: out: batch_size x len_src x d_model mask_src """ """ Embedding: batch_size x len_src x d_model """ #D.S: self.training is always 0 #D.S: word_lut is look up table which contains embedding for each emb = embedded_dropout( self.word_lut, input, dropout=self.word_dropout if self.training else 0) """ Scale the emb by sqrt(d_model) """ emb = emb * math.sqrt(self.model_size) """ Adding positional encoding """ emb = self.time_transformer(emb) emb = self.preprocess_layer(emb) #D.S. tensor.eq computes elementwise equality (Compares each element. If elements are the same then return tensor has 1 at this element position, 0 otherwise. #D.S: Input tensor have to be the same dimensions #D.S: mask_src is 1 where input is 0. Mask is set size one in dimension 1 #D.S: TODO: mask_src: Not sure how this is working?? mask_src = input.eq(onmt.Constants.PAD).unsqueeze( 1) # batch_size x len_src x 1 for broadcasting #~ pad_mask = input.ne(onmt.Constants.PAD)) # batch_size x len_src context = emb.transpose(0, 1).contiguous() for i, layer in enumerate(self.layer_modules): #D.S: TODO: self.training is never set, so if always fails if len(self.layer_modules ) - i <= onmt.Constants.checkpointing and self.training: context = checkpoint(custom_layer(layer), context, mask_src) else: context = layer(context, mask_src) # batch_size x len_src x d_model # From Google T2T # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. context = self.postprocess_layer(context) return context, mask_src
def forward(self, input): """ Inputs Shapes: input: len_src x batch_size (wanna tranpose) """ # first, create the inputs for packed sequence mask = input.data.ne(onmt.Constants.PAD) lengths = Variable(torch.sum(mask, dim=0)) # sort the lengths by descending order # remember the ind to unsort the output tensors sorted_lengths, ind = torch.sort(lengths, 0, descending=True) # sort the input by length sorted_input = input.index_select(1, ind) packed_input = pack(sorted_input, sorted_lengths) batch_sizes = packed_input.batch_sizes emb = embedded_dropout(self.word_lut, packed_input.data, dropout=self.word_dropout if self.training else 0) # add dropout ( works on 2D tensor) emb = self.preprocess_layer(emb) # pack the input in a PackedSequence packed_input = PackedSequence(emb, batch_sizes) rnn_hiddens = [] output = packed_input for layer in self.layer_modules: output, rnn_hidden = layer(output) # len_src x batch_size x d_model rnn_hiddens.append(rnn_hidden) output = PackedSequence(self.postprocess_layer(output.data), batch_sizes) # restore the mask to the tensor context = unpack(output)[0] # unsort the context and the rnn_hiddens context = unsort(context, ind, dim=1) #~ #~ for i, hidden in rnn_hiddens: #~ rnn_hiddens[i] = unsort(hidden, ind, dim=1) return context, rnn_hiddens
def process_embedding(self, input, atbs=None):
    emb = embedded_dropout(self.word_lut, input,
                           dropout=self.word_dropout if self.training else 0)
    if self.time == 'positional_encoding':
        emb = emb * math.sqrt(self.model_size)
    """ Adding positional encoding """
    emb = self.time_transformer(emb)

    # Adding dropout
    emb = self.preprocess_layer(emb)

    if self.use_feature:
        # B x H, expanded to match the embedding shape
        atb_emb = self.attribute_embeddings(atbs).unsqueeze(1).expand_as(emb)
        emb = torch.cat([emb, atb_emb], dim=-1)
        emb = torch.relu(self.feature_projector(emb))

    return emb
def forward(self, input, **kwargs): """ Inputs Shapes: input: (Variable) batch_size x len_tgt (wanna tranpose) context: (Variable) batch_size x len_src x d_model mask_src (Tensor) batch_size x len_src Outputs Shapes: out: batch_size x len_tgt x d_model coverage: batch_size x len_tgt x len_src """ """ Embedding: batch_size x len_tgt x d_model """ emb = embedded_dropout( self.word_lut, input, dropout=self.word_dropout if self.training else 0) if self.time == 'positional_encoding': emb = emb * math.sqrt(self.model_size) """ Adding positional encoding """ emb = self.time_transformer(emb) if isinstance(emb, tuple): emb = emb[0] emb = self.preprocess_layer(emb) len_tgt = input.size(1) mask_tgt = input.data.eq( onmt.Constants.PAD).unsqueeze(1) + self.mask[:len_tgt, :len_tgt] mask_tgt = torch.gt(mask_tgt, 0) output = emb.transpose(0, 1).contiguous() for i, layer in enumerate(self.layer_modules): output, coverage = layer( output, mask_tgt) # batch_size x len_src x d_model # From Google T2T # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. output = self.postprocess_layer(output) output_dict = {'hidden': output, 'coverage': coverage} # return output, None return output_dict
def embedding_processing(self, input, input_attbs, freeze_embeddings=False):
    # len_tgt = input.size(1)  # target length
    # input_attbs = input_attbs.unsqueeze(1).repeat(1, len_tgt)  # expand to the target length
    #
    # if self.switchout > 0 and self.training:
    #     vocab_size = self.word_lut.weight.size(0)
    #     input = switchout(input, vocab_size, self.switchout)
    #
    # if freeze_embeddings:
    #     with torch.no_grad():
    #         emb = embedded_dropout(self.word_lut, input,
    #                                dropout=self.word_dropout if self.training else 0)
    #         if self.feat_lut is not None:
    #             attb_emb = self.feat_lut(input_attbs)
    #         else:
    #             attb_emb = []
    # else:
    emb = embedded_dropout(self.word_lut, input,
                           dropout=self.word_dropout if self.training else 0)
    # if self.feat_lut is not None:
    #     attb_emb = self.feat_lut(input_attbs)
    # else:
    attb_emb = []

    if self.time == 'positional_encoding':
        emb = emb * math.sqrt(self.model_size)

    """ Adding positional encoding """
    if self.fixed_target_length == 2 or self.fixed_target_length == 3:
        if self.fixed_target_length == 3:
            emb = self.time_transformer(emb)
            emb = emb * math.sqrt(self.model_size)

        # add a target-length encoding: each position is encoded with the number
        # of tokens remaining until the end of the target sequence
        tgt_length = input.data.ne(onmt.Constants.PAD).sum(1).unsqueeze(1).expand_as(input.data)
        index = torch.arange(input.data.size(1)).unsqueeze(0).expand_as(tgt_length).type_as(tgt_length)
        tgt_length = (tgt_length - index) * input.data.ne(onmt.Constants.PAD).long()

        num_timescales = self.model_size // 2
        log_timescale_increment = math.log(10000) / (num_timescales - 1)
        inv_timescales = torch.exp(torch.arange(0, num_timescales).float() * -log_timescale_increment)
        scaled_time = tgt_length.float().unsqueeze(2) * inv_timescales.unsqueeze(0).unsqueeze(0).type_as(emb)
        pos_emb = torch.cat((torch.sin(scaled_time), torch.cos(scaled_time)), 2)

        emb = emb + pos_emb
    else:
        emb = self.time_transformer(emb)

    if isinstance(emb, tuple):
        emb = emb[0]

    # emb now has size B x T x H
    if self.enable_feature:
        emb = torch.cat([emb, attb_emb], dim=-1)
        emb = torch.relu(self.feature_projector(emb))

    if self.fixed_target_length == 1:
        # encode the remaining target length with a learned lookup table instead
        tgt_length = input.data.ne(onmt.Constants.PAD).sum(1).unsqueeze(1).expand_as(input.data)
        index = torch.arange(input.data.size(1)).unsqueeze(0).expand_as(tgt_length).type_as(tgt_length)
        tgt_length = (tgt_length - index) * input.data.ne(onmt.Constants.PAD).long()
        tgt_emb = self.length_lut(tgt_length)
        emb = torch.cat([emb, tgt_emb], dim=-1)
        emb = torch.relu(self.length_projector(emb))

    return emb
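# --- Hedged illustration (not part of the module above) ---
# The "remaining target length" sinusoidal encoding computed in embedding_processing()
# (fixed_target_length == 2 or 3), shown standalone: each position t of a sequence of
# length L is encoded with sin/cos features of (L - t), so the model always knows how
# many tokens are left. PAD = 0 and model_size = 8 are assumed only for the example.
import math
import torch

PAD = 0
model_size = 8
tgt = torch.tensor([[3, 4, 5, 6, PAD]])                       # batch_size x len_tgt

lengths = tgt.ne(PAD).sum(1, keepdim=True).expand_as(tgt)     # L at every position
index = torch.arange(tgt.size(1)).unsqueeze(0).expand_as(lengths)
remaining = (lengths - index) * tgt.ne(PAD).long()            # L - t, zeroed on PAD

num_timescales = model_size // 2
log_inc = math.log(10000) / (num_timescales - 1)
inv_timescales = torch.exp(torch.arange(0, num_timescales).float() * -log_inc)

scaled = remaining.float().unsqueeze(2) * inv_timescales.view(1, 1, -1)
pos_emb = torch.cat((torch.sin(scaled), torch.cos(scaled)), dim=2)
print(remaining)        # tensor([[4, 3, 2, 1, 0]])
print(pos_emb.shape)    # torch.Size([1, 5, 8])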
def forward_seq2seq(self, batch, target_masking=None, zero_encoder=False):
    """
    Input Shapes:
        batch: contains 'source' (len_src x batch_size) and 'target_input' (len_tgt x batch_size)
    Output Shapes:
        hidden:   len_tgt x batch_size x d_model
        logprobs: log-probabilities over the target vocabulary
    """
    src = batch.get('source')
    tgt = batch.get('target_input')

    input = torch.cat([src, tgt], dim=0)

    """ Embedding: we work with two embedding tables at the same time """
    src_emb = embedded_dropout(self.src_word_lut, src,
                               dropout=self.word_dropout if self.training else 0)
    tgt_emb = embedded_dropout(self.tgt_word_lut, tgt,
                               dropout=self.word_dropout if self.training else 0)

    # concatenate the embeddings along the time dimension
    emb = torch.cat([src_emb, tgt_emb], dim=0)

    # add dropout and scale
    emb = self.preprocess_layer(emb)
    emb = emb * math.sqrt(self.model_size)

    klen, batch_size = emb.size(0), emb.size(1)

    # prepare the (relative) positional encoding
    pos_seq = torch.arange(klen - 1, -1, -1.0, device=emb.device, dtype=emb.dtype)
    pos_emb = self.preprocess_layer(self.positional_encoder(pos_seq))

    if self.use_feature:
        raise NotImplementedError  # no features/attributes for the moment

    # attention masking
    qlen = klen
    mlen = 0  # we don't have any memory in this mode

    dec_attn_mask = torch.triu(emb.new_ones(qlen, klen), diagonal=1 + mlen).byte()[:, :, None]  # T x T x 1
    pad_mask = input.eq(onmt.Constants.PAD).byte().unsqueeze(1)  # T x 1 x B

    mask = dec_attn_mask + pad_mask
    mask = torch.gt(mask, 0).bool()

    output = emb

    for i, layer in enumerate(self.layer_modules):
        output, coverage = layer(output, pos_emb, self.r_w_bias, self.r_r_bias, mask)

    # From Google T2T:
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    output = self.postprocess_layer(output)

    all_output = output

    # the first src_len positions belong to the source, the remaining ones to the target
    src_len = src.size(0)
    tgt_len = tgt.size(0)
    context = output[:src_len, :, :]
    tgt_hiddens = output[src_len:, :, :]

    output_dict = defaultdict(lambda: None)
    output_dict['hidden'] = tgt_hiddens
    output_dict['encoder'] = context
    output_dict['src_mask'] = mask[src_len:, :, :]

    output = tgt_hiddens

    # this step removes the padding to reduce the load on the final layer
    if target_masking is not None:
        output = output.contiguous().view(-1, output.size(-1))

        mask = target_masking
        """ We remove all positions with PAD """
        flattened_mask = mask.view(-1)
        non_pad_indices = torch.nonzero(flattened_mask).squeeze(1)
        output = output.index_select(0, non_pad_indices)

    # final layer: compute the output log-probabilities
    logprobs = self.generator[0](output)
    output_dict['logprobs'] = logprobs

    return output_dict
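# --- Hedged illustration (not part of the module above) ---
# The padding-removal step at the end of forward_seq2seq(): the target hidden states are
# flattened to (len*batch) x d_model and only the non-PAD positions are kept before the
# (expensive) output projection. The shapes below are invented for the example.
import torch

d_model = 4
hidden = torch.randn(3, 2, d_model)                 # len_tgt x batch_size x d_model
target_mask = torch.tensor([[1, 1],
                            [1, 0],
                            [1, 0]])                # 1 = real token, 0 = PAD

flat = hidden.contiguous().view(-1, d_model)        # (len_tgt*batch) x d_model
non_pad_indices = torch.nonzero(target_mask.view(-1)).squeeze(1)
kept = flat.index_select(0, non_pad_indices)
print(kept.shape)                                   # torch.Size([4, 4])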
def forward(self, input, **kwargs): """ Inputs Shapes: input: batch_size x len_src (wanna tranpose) Outputs Shapes: out: batch_size x len_src x d_model mask_src """ """ Embedding: batch_size x len_src x d_model """ if self.input_type == "text": mask_src = input.eq(onmt.Constants.PAD).unsqueeze( 1) # batch_size x len_src x 1 for broadcasting # apply switchout # if self.switchout > 0 and self.training: # vocab_size = self.word_lut.weight.size(0) # input = switchout(input, vocab_size, self.switchout) emb = embedded_dropout( self.word_lut, input, dropout=self.word_dropout if self.training else 0) else: if not self.cnn_downsampling: mask_src = input.narrow(2, 0, 1).squeeze(2).eq( onmt.Constants.PAD).unsqueeze(1) input = input.narrow(2, 1, input.size(2) - 1) emb = self.audio_trans(input.contiguous().view( -1, input.size(2))).view(input.size(0), input.size(1), -1) else: long_mask = input.narrow(2, 0, 1).squeeze(2).eq(onmt.Constants.PAD) input = input.narrow(2, 1, input.size(2) - 1) # first resizing to fit the CNN format input = input.view(input.size(0), input.size(1), -1, self.channels) input = input.permute(0, 3, 1, 2) input = self.audio_trans(input) input = input.permute(0, 2, 1, 3).contiguous() input = input.view(input.size(0), input.size(1), -1) # print(input.size()) input = self.linear_trans(input) mask_src = long_mask[:, 0:input.size(1) * 4:4].unsqueeze(1) # the size seems to be B x T ? emb = input if torch_version >= 1.2: mask_src = mask_src.bool() """ Scale the emb by sqrt(d_model) """ emb = emb * math.sqrt(self.model_size) """ Adding positional encoding """ emb = self.time_transformer(emb) # B x T x H -> T x B x H context = emb.transpose(0, 1) context = self.preprocess_layer(context) for i, layer in enumerate(self.layer_modules): if len(self.layer_modules ) - i <= onmt.Constants.checkpointing and self.training: context = checkpoint(custom_layer(layer), context, mask_src) else: context = layer(context, mask_src) # batch_size x len_src x d_model # From Google T2T # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. context = self.postprocess_layer(context) output_dict = {'context': context, 'src_mask': mask_src} # return context, mask_src return output_dict
def forward_grow(self, input, context, src):
    """
    Input Shapes:
        input:    (Variable) batch_size x len_tgt
        context:  (Variable) batch_size x len_src x d_model
        mask_src: (Tensor)   batch_size x len_src
    Output Shapes:
        out:      batch_size x len_tgt x d_model
        coverage: batch_size x len_tgt x len_src
    """

    """ Embedding: batch_size x len_tgt x d_model """
    with torch.no_grad():
        emb = embedded_dropout(self.word_lut, input,
                               dropout=self.word_dropout if self.training else 0)
        if self.time == 'positional_encoding':
            emb = emb * math.sqrt(self.model_size)
        """ Adding positional encoding """
        emb = self.time_transformer(emb)
        if isinstance(emb, tuple):
            emb = emb[0]
        emb = self.preprocess_layer(emb)

        mask_src = src.data.eq(onmt.Constants.PAD).unsqueeze(1)
        pad_mask_src = torch.autograd.Variable(src.data.ne(onmt.Constants.PAD))

        len_tgt = input.size(1)
        mask_tgt = input.data.eq(onmt.Constants.PAD).unsqueeze(1) + self.mask[:len_tgt, :len_tgt]
        mask_tgt = torch.gt(mask_tgt, 0)

        output = emb.contiguous()

        pad_mask_tgt = torch.autograd.Variable(input.data.ne(onmt.Constants.PAD))  # batch_size x len_tgt
        pad_mask_src = torch.autograd.Variable(1 - mask_src.squeeze(1))

        # run the pretrained layers without gradients
        for i in range(self.pretrained_point):
            layer = self.layer_modules[i]
            output, coverage = layer(output, context[i], mask_tgt, mask_src,
                                     pad_mask_tgt, pad_mask_src)  # batch_size x len_tgt x d_model

    # the newly grown layers are trained normally
    for i in range(self.layers - self.pretrained_point):
        res_drop_rate = 0.0
        if i == 0:
            res_drop_rate = self.grow_dropout

        layer = self.layer_modules[self.pretrained_point + i]
        output, coverage = layer(output, context[self.pretrained_point + i], mask_tgt, mask_src,
                                 pad_mask_tgt, pad_mask_src,
                                 residual_dropout=res_drop_rate)  # batch_size x len_tgt x d_model

    # From Google T2T:
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    output = self.postprocess_layer(output)

    return output, coverage
def forward_grow(self, input): """ Inputs Shapes: input: batch_size x len_src (wanna tranpose) Outputs Shapes: out: batch_size x len_src x d_model mask_src """ with torch.no_grad(): """ Embedding: batch_size x len_src x d_model """ emb = embedded_dropout(self.word_lut, input, dropout=self.word_dropout if self.training else 0) """ Scale the emb by sqrt(d_model) """ if self.time == 'positional_encoding': emb = emb * math.sqrt(self.model_size) """ Adding positional encoding """ emb = self.time_transformer(emb) if isinstance(emb, tuple): emb = emb[0] emb = self.preprocess_layer(emb) mask_src = input.data.eq(onmt.Constants.PAD).unsqueeze(1) # batch_size x len_src x 1 for broadcasting pad_mask = torch.autograd.Variable(input.data.ne(onmt.Constants.PAD)) # batch_size x len_src #~ pad_mask = None context = emb.contiguous() memory_bank = list() for i in range(self.pretrained_point): layer = self.layer_modules[i] context, norm_input = layer(context, mask_src, pad_mask) # batch_size x len_src x d_model if i > 0: # don't keep the norm input of the first layer (a.k.a embedding) memory_bank.append(norm_input) for i in range(self.layers - self.pretrained_point): res_drop_rate = 0.0 if i == 0: res_drop_rate = self.grow_dropout layer = self.layer_modules[self.pretrained_point + i] context, norm_input = layer(context, mask_src, pad_mask, residual_dropout=res_drop_rate) # batch_size x len_src x d_model memory_bank.append(norm_input) # From Google T2T # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. context = self.postprocess_layer(context) # make a huge memory bank on the encoder side memory_bank.append(context) memory_bank = torch.stack(memory_bank) return memory_bank, mask_src
def forward(self, input, context, src, atbs=None, **kwargs):
    """
    Input Shapes:
        input:    (Variable) batch_size x len_tgt (transposed internally to len_tgt x batch_size)
        context:  (Variable) batch_size x len_src x d_model
        mask_src: (Tensor)   batch_size x len_src
    Output Shapes:
        out:      batch_size x len_tgt x d_model
        coverage: batch_size x len_tgt x len_src
    """

    """ Embedding: batch_size x len_tgt x d_model """
    input = input.transpose(0, 1)  # B x T -> T x B
    klen, batch_size = input.size()
    emb = embedded_dropout(self.word_lut, input,
                           dropout=self.word_dropout if self.training else 0)

    # Adding dropout
    emb = self.preprocess_layer(emb)
    emb = emb * math.sqrt(self.model_size)

    # prepare the (relative) positional encoding
    pos_seq = torch.arange(klen - 1, -1, -1.0, device=emb.device, dtype=emb.dtype)
    pos_emb = self.preprocess_layer(self.positional_encoder(pos_seq))

    if self.use_feature:
        raise NotImplementedError

    if context is not None:
        if self.encoder_type == "audio":
            if not self.encoder_cnn_downsampling:
                mask_src = src.narrow(2, 0, 1).squeeze(2).eq(onmt.Constants.PAD).unsqueeze(1)
            else:
                long_mask = src.narrow(2, 0, 1).squeeze(2).eq(onmt.Constants.PAD)
                mask_src = long_mask[:, 0:context.size(0) * 4:4].unsqueeze(1)
        else:
            mask_src = src.eq(onmt.Constants.PAD).unsqueeze(1)
    else:
        mask_src = None

    # attention masking: combine the padding mask with the causal (upper-triangular) mask
    qlen = klen
    mask_tgt = input.t().eq(onmt.Constants.PAD).unsqueeze(1) + \
        torch.triu(emb.new_ones(qlen, klen), diagonal=1).byte()
    mask_tgt = torch.gt(mask_tgt, 0)

    output = emb

    for i, layer in enumerate(self.layer_modules):
        output, coverage = layer(output, pos_emb, context, mask_tgt, mask_src)
        # len_tgt x batch_size x d_model

    # From Google T2T:
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    output = self.postprocess_layer(output)

    output_dict = {'hidden': output, 'coverage': coverage}

    return output_dict
def forward(self, input, **kwargs): """ Inputs Shapes: input: batch_size x len_src (wanna tranpose) Outputs Shapes: out: batch_size x len_src x d_model mask_src """ """ Embedding: batch_size x len_src x d_model """ emb = embedded_dropout( self.word_lut, input, dropout=self.word_dropout if self.training else 0) """ Scale the emb by sqrt(d_model) """ if self.time == 'positional_encoding': emb = emb * math.sqrt(self.model_size) """ Adding positional encoding """ #~ emb = self.time_transformer(emb) if isinstance(emb, tuple): emb = emb[0] emb = self.preprocess_layer(emb) mask_src = input.data.eq(onmt.Constants.PAD).unsqueeze( 1) # batch_size x len_src x 1 for broadcasting pad_mask = torch.autograd.Variable(input.data.ne( onmt.Constants.PAD)) # batch_size x len_src #~ pad_mask = None context = emb.contiguous() memory_bank = list() for t in range(self.layers): context = self.recurrent_layer( context, mask_src, t, pad_mask) # batch_size x len_src x d_model #~ for i, layer in enumerate(self.layer_modules): #~ #~ #~ if len(self.layer_modules) - i <= onmt.Constants.checkpointing and self.training: #~ context, norm_input = checkpoint(custom_layer(layer), context, mask_src, pad_mask) #~ #~ print(type(context)) #~ else: #~ context, norm_input = layer(context, mask_src, pad_mask) # batch_size x len_src x d_model #~ #~ if i > 0: # don't keep the norm input of the first layer (a.k.a embedding) #~ memory_bank.append(norm_input) #~ # From Google T2T # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. context = self.postprocess_layer(context) return context, mask_src
def forward(self, input, context, src, atbs=None, **kwargs):
    """
    Input Shapes:
        input:    (Variable) batch_size x len_tgt
        context:  (Variable) batch_size x len_src x d_model
        mask_src: (Tensor)   batch_size x len_src
    Output Shapes:
        out:      batch_size x len_tgt x d_model
        coverage: batch_size x len_tgt x len_src
    """

    """ Embedding: batch_size x len_tgt x d_model """
    self.history.clean()

    emb = embedded_dropout(self.word_lut, input,
                           dropout=self.word_dropout if self.training else 0)
    if self.time == 'positional_encoding':
        emb = emb * math.sqrt(self.model_size)
    """ Adding positional encoding """
    emb = self.time_transformer(emb)
    if isinstance(emb, tuple):
        emb = emb[0]
    emb = self.preprocess_layer(emb)

    if self.use_feature:
        # B x H -> B x len_tgt x H
        atb_emb = self.attribute_embeddings(atbs).unsqueeze(1).repeat(1, emb.size(1), 1)
        emb = torch.cat([emb, atb_emb], dim=-1)
        emb = torch.relu(self.feature_projector(emb))

    if context is not None:
        if self.encoder_type == "audio":
            mask_src = src.data.narrow(2, 0, 1).squeeze(2).eq(onmt.Constants.PAD).unsqueeze(1)
        else:
            mask_src = src.data.eq(onmt.Constants.PAD).unsqueeze(1)
    else:
        mask_src = None

    len_tgt = input.size(1)
    mask_tgt = input.data.eq(onmt.Constants.PAD).unsqueeze(1) + self.mask[:len_tgt, :len_tgt]
    mask_tgt = torch.gt(mask_tgt, 0)

    output = emb.transpose(0, 1).contiguous()

    self.history.push(output)

    for i, layer in enumerate(self.layer_modules):
        output = self.history.pop()

        if len(self.layer_modules) - i <= onmt.Constants.checkpointing and self.training:
            output, coverage = checkpoint(custom_layer(layer), output, context,
                                          mask_tgt, mask_src)
        else:
            output, coverage = layer(output, context, mask_tgt, mask_src)
            # len_tgt x batch_size x d_model

        # write the new layer output into the history memory
        self.history.push(output)

    # From Google T2T:
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    output = self.history.pop()
    output = self.postprocess_layer(output)

    output_dict = {'hidden': output, 'coverage': coverage}

    return output_dict
def forward(self, input, **kwargs): """ Inputs Shapes: input: batch_size x len_src Outputs Shapes: out: batch_size x len_src x d_model mask_src """ """ Embedding: batch_size x len_src x d_model """ # if self.input_type == "text": # mask_src = input.eq(onmt.Constants.PAD).byte() # batch_size x len_src x 1 for broadcasting # emb = embedded_dropout(self.word_lut, input, dropout=self.word_dropout if self.training else 0) # else: # raise NotImplementedError # if not self.cnn_downsampling: # mask_src = input.narrow(2, 0, 1).squeeze(2).eq(onmt.Constants.PAD) # input = input.narrow(2, 1, input.size(2) - 1) # emb = self.audio_trans(input.contiguous().view(-1, input.size(2))).view(input.size(0), # input.size(1), -1) # else: # long_mask = input.narrow(2, 0, 1).squeeze(2).eq(onmt.Constants.PAD) # input = input.narrow(2, 1, input.size(2) - 1) # # # first resizing to fit the CNN format # input = input.view(input.size(0), input.size(1), -1, self.channels) # input = input.permute(0, 3, 1, 2) # # input = self.audio_trans(input) # input = input.permute(0, 2, 1, 3).contiguous() # input = input.view(input.size(0), input.size(1), -1) # # mask_src = long_mask[:, 0:input.size(1) * 4:4] # emb = input input = input.transpose(0, 1) # B x T to T x B klen, batch_size = input.size() """ Scale the emb by sqrt(d_model) """ emb = embedded_dropout( self.word_lut, input, dropout=self.word_dropout if self.training else 0) emb = emb * (math.sqrt(self.model_size)) # Adding dropout emb = self.preprocess_layer(emb) # Prepare positional encoding: pos_seq = torch.arange(klen - 1, -1, -1.0, device=emb.device, dtype=emb.dtype) pos_emb = self.preprocess_layer(self.positional_encoder(pos_seq)) # attention masking qlen = klen mask = torch.triu(emb.new_ones(qlen, klen), diagonal=1).byte() mask_fwd = input.t().eq(onmt.Constants.PAD).unsqueeze(1).byte() + mask mask_fwd = torch.gt(mask_fwd, 0) # mask_fwd = mask_fwd.bool() input_flip = flip(input, 0) # mask_bwd = mask + input_flip.eq(onmt.Constants.PAD).unsqueeze(0).byte() mask_bwd = input_flip.t().eq(onmt.Constants.PAD).unsqueeze(1).byte() + \ torch.triu(emb.new_ones(qlen, klen), diagonal=1).byte() mask_bwd = torch.gt(mask_bwd, 0) # convert all 2s to 1 # mask_bwd = mask_bwd.bool() context = emb for i, layer in enumerate(self.layer_modules): context = layer(context, pos_emb, mask_fwd, mask_bwd) # batch_size x len_src x d_model # From Google T2T # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. context = self.postprocess_layer(context) output_dict = {'context': context, 'src_mask': None} # return context, mask_src return output_dict