def test_S2SModelLoader(s2smodel_data):
    path, train, valid, test = s2smodel_data
    fields = [
        ("english", Field(init_token="__init__", eos_token="__eos__", lower=True)),
        ("french", Field(init_token="__init__", eos_token="__eos__", lower=True)),
        ("german", Field(init_token="__init__", eos_token="__eos__", lower=True))
    ]
    ds = TabularDatasetFromFiles(path=path / train, fields=fields)
    for name, field in fields:
        field.build_vocab(ds)
    bs = 2
    ml = S2SDataLoader(dataset=ds, batch_size=bs, source_names=["english", "french"],
                       target_names=["french"])
    assert len(ml) == 200
    index = 0
    for index, (*X, Y) in enumerate(ml):
        assert_dims(X, [2, None, bs])
        assert_dims(Y, [None, bs])
        assert X[1].shape[0] == Y.shape[0] + 1
    assert len(ml) == index + 1
def _greedy_forward(self, inputs):
    inputs = inputs[:1]  # inputs should be only the first token initially [1, bs]
    sl, bs = inputs.size()
    finished = to_gpu(torch.zeros(bs).byte())
    iteration = 0
    self.beam_outputs = inputs.clone()
    layer_outputs = [[] for _ in range(self.nlayers)]
    raw_layer_outputs = [[] for _ in range(self.nlayers)]
    while not finished.all() and iteration < self.max_iterations:
        # output should be List[[sl, bs, layer_dim], ...]; sl should be one
        raw_output, output = self.forward(inputs, 0)
        for layer_index in range(self.nlayers):
            layer_outputs[layer_index].append(output[layer_index])
            raw_layer_outputs[layer_index].append(raw_output[layer_index])
        # inputs are the indices, dims [1, bs]
        _, inputs = output[-1].max(dim=-1)
        assert_dims(inputs, [1, bs])
        iteration += 1
        self.beam_outputs = assert_dims(
            torch.cat([self.beam_outputs, inputs], dim=0), [iteration + 1, bs])
        new_finished = inputs.data == self.eos_token
        finished = finished | new_finished
    self.beam_outputs = self.beam_outputs.view(-1, bs, 1)
    # ensure the outputs are a list of layers where each layer is [sl, bs, layer_dim]
    raw_outputs = [torch.cat(i, dim=0) for i in raw_layer_outputs]
    outputs = [torch.cat(i, dim=0) for i in layer_outputs]
    return raw_outputs, outputs
def _greedy_forward(self, inputs, hidden=None, constraints=None):
    inputs = inputs[:1]  # inputs should be only the first token initially [1, bs]
    sl, bs = inputs.size()
    finished = to_gpu(torch.zeros(bs).byte())
    iteration = 0
    self.beam_outputs = inputs.clone()
    layer_outputs = [[] for _ in range(self.nlayers)]
    while not finished.all() and iteration < self.max_iterations:
        # output should be List[[sl, bs, layer_dim], ...]; sl should be one
        output = self.forward(inputs, hidden=hidden, num_beams=0)
        for layer_index in range(self.nlayers):
            layer_outputs[layer_index].append(output[layer_index])
        # step_inputs have shape [1, bs]
        _, step_inputs = output[-1][-1:].max(dim=-1)
        iteration += 1
        self.beam_outputs = assert_dims(
            torch.cat([self.beam_outputs, step_inputs], dim=0), [iteration + 1, bs])
        new_finished = step_inputs.data == self.eos_token
        inputs = torch.cat([inputs, step_inputs], dim=0)
        assert_dims(inputs, [iteration + 1, bs])
        finished = finished | new_finished
    self.beam_outputs = self.beam_outputs.view(-1, bs, 1)
    outputs = [torch.cat(i, dim=0) for i in layer_outputs]
    return outputs
def forward(self, *inputs, num_beams=0):
    with torch.set_grad_enabled(self.training):
        encoder_inputs, decoder_inputs = inputs
        # reset the states for the new batch
        bs = encoder_inputs.size(1)
        self.encoder.reset(bs)
        self.decoder.reset(bs)
        outputs = self.encoder(encoder_inputs)
        # as the initial state we use the initial decoder state (zeros)
        state = self.decoder.hidden
        assert_dims(outputs, [self.nlayers[0], None, bs, (self.nhid[0], self.emb_sz[0])])
        # pass the encoder outputs as keys to the attention projection_layer
        self.decoder.projection_layer.reset(keys=outputs[-1])
        if self.training:
            self.decoder.pr_force = self.pr_force
            nb = 1 if self.pr_force < 1 else 0
        else:
            nb = num_beams
        outputs_dec = self.decoder(decoder_inputs, hidden=state, num_beams=nb)
        predictions = outputs_dec[:decoder_inputs.size(0)] if num_beams == 0 else self.decoder.beam_outputs
    return predictions, [*outputs, *outputs_dec]
def _greedy_forward(self, inputs, hidden=None, constraints=None):
    dec_inputs = inputs
    max_iterations = min(dec_inputs.size(0), self.MAX_STEPS_ALLOWED) if self.training else self.max_iterations
    inputs = V(inputs[:1].data)  # inputs should be only the first token initially [1, bs]
    sl, bs = inputs.size()
    finished = to_gpu(torch.zeros(bs).byte())
    iteration = 0
    self.beam_outputs = inputs.clone()
    final_outputs = []
    # the iteration count is capped so the output cannot grow too big to fit in memory
    while not finished.all() and iteration < max_iterations:
        # output should be List[[sl, bs, layer_dim], ...]; sl should be one
        if 0 < iteration and self.training and 0. < self.random() < self.pr_force:
            # teacher forcing: feed the ground-truth token instead of the previous prediction
            inputs = dec_inputs[iteration].unsqueeze(0)
        output = self.forward(inputs, hidden=hidden, num_beams=0, constraints=constraints)
        hidden = self.decoder_layer.hidden
        final_outputs.append(output)  # dim should be [sl=1, bs, nt]
        # inputs are the indices, dims [1, bs]
        # repackage the variable to avoid backpropagating through previous steps
        inputs = assert_dims(V(output.data.max(dim=-1)[1]), [1, bs])
        iteration += 1
        self.beam_outputs = assert_dims(torch.cat([self.beam_outputs, inputs], dim=0), [iteration + 1, bs])
        new_finished = inputs.data == self.eos_token
        finished = finished | new_finished
    self.beam_outputs = self.beam_outputs.view(-1, bs, 1)
    # outputs should be [sl, bs, nt]
    outputs = torch.cat(final_outputs, dim=0)
    return outputs
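# Illustrative sketch (not library code): how the `finished` byte mask used by the greedy
# decoding loops above accumulates once every sequence in the batch has emitted the eos token.
# The token values and eos id below are made up for the example.
import torch

eos_token = 2
finished = torch.zeros(3).byte()
for step_tokens in [torch.tensor([[5, 2, 7]]), torch.tensor([[2, 1, 2]])]:  # two decoding steps, shape [1, bs]
    finished = finished | (step_tokens[0] == eos_token).byte()
assert bool(finished.all())  # all three sequences have produced eos, so the loop would stop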
def test_layer_norm():
    sl = 10
    bs = 2
    in_features = 32
    inputs = to_gpu(V(tr.randn([sl, bs, in_features])))
    layernorm = to_gpu(LayerNorm(in_features))
    outputs = layernorm(inputs)
    assert_dims(outputs, [sl, bs, in_features])
def test_transfomer_layer():
    sl = 10
    bs = 2
    in_features = 32
    inputs = tr.randn([sl, bs, in_features])
    inputs = to_gpu(V(T(inputs)))
    transfomer = to_gpu(TransformerLayer(in_features=in_features, num_heads=8))
    outputs = transfomer(inputs)
    assert_dims(outputs, [sl, bs, in_features])
def forward(self, decoder_inputs, encoder_inputs):
    output_tensors = []
    sl, bs, input_size = decoder_inputs.size()
    dec_inputs = assert_dims(decoder_inputs, [sl, bs, self.input_size])
    # nlayers, sl, bs, input_size
    encoder_inputs = assert_dims(encoder_inputs, [self.nlayers, None, bs, self.input_size])
    for enc_inputs, layer in zip(encoder_inputs, self.layers):
        dec_inputs = layer(enc_inputs, dec_inputs)
        output_tensors.append(dec_inputs)
    assert_dims(output_tensors, [self.nlayers, sl, bs, self.input_size])
    return output_tensors
def test_MultiHeadAttention_with_mask(self_attention_setup):
    keys, query = self_attention_setup
    slk, bs, ek = keys.size()
    slq, bs, eq = query.size()
    num_heads = 4
    nhid = 10
    attention = to_gpu(
        MultiHeadAttention(num_heads=num_heads, nhid=nhid, keys_dim=ek, query_dim=eq,
                           values_dim=ek, dropout=0.3))
    mask = T(np.tril(np.ones((bs, num_heads, slq, slk)))).float()
    result = attention(query=V(query), keys=V(keys), values=V(keys), mask=mask)
    assert_dims(result, [slq, bs, num_heads * nhid])
def test_MultiHeadAttention(self_attention_setup):
    keys, query = self_attention_setup
    slk, bs, ek = keys.size()
    slq, bs, eq = query.size()
    num_heads = 4
    nhid = 10
    attention = to_gpu(
        MultiHeadAttention(num_heads=num_heads, nhid=nhid, keys_dim=ek, query_dim=eq,
                           values_dim=ek, dropout=0.3))
    result = attention(query=V(query), keys=V(keys), values=V(keys))
    assert_dims(result, [slq, bs, num_heads * nhid])
def test_transformer_encoder():
    sl = 10
    bs = 2
    in_features = 300
    num_layers = 5
    inputs = tr.randn([sl, bs, in_features])
    inputs = to_gpu(V(T(inputs)))
    transformer = to_gpu(
        TransformerEncoderLayers(input_size=in_features, num_heads=8, nhid=512, num_layers=num_layers))
    layer_outputs = transformer(inputs)
    assert_dims(layer_outputs, [num_layers, sl, bs, in_features])
def _train_forward(self, inputs, hidden=None, constraints=None):
    sl, bs = inputs.size()
    emb = self.embedding_layer(inputs)
    final_outputs = []
    for step in emb:
        step = torch.cat([step, self.projection_layer.get_attention_output(step)], dim=-1).unsqueeze_(0)
        step = assert_dims(step, [1, bs, self.emb_size * 2])
        outputs = self._rnn_step(step, hidden=hidden)
        rnn_out = assert_dims(outputs[-1], [1, bs, self.emb_size])
        final_outputs.append(self.projection_layer(rnn_out[0]))
    outputs = torch.cat(final_outputs, dim=0)
    return outputs
def test_attention_projection(attention_projection_setup):
    encoder_outputs, decoder_output, params = attention_projection_setup
    module = to_gpu(AttentionProjection(**params))
    # When I reset the module
    module.reset(keys=encoder_outputs)
    # the attention output will be a zeros array with shape equal to the input
    assert to_np(module.get_attention_output(decoder_output)).sum() == 0
    assert module.get_attention_output(decoder_output) is not module._attention_output
    # when I pass an input for the decoder output
    results = module(decoder_output)
    assert_dims(results, [1, 2, params['n_out']])
    # the new attention_output is calculated from the attention module and is no longer zero
    assert to_np(module.get_attention_output(decoder_output)).sum() != 0
    assert module.get_attention_output(decoder_output) is module._attention_output
    assert_dims(module._attention_output, [2, params['n_in']])
def forward(self, *inputs, num_beams=0):
    encoder_inputs, decoder_inputs = inputs
    # reset the states for the new batch
    bs = encoder_inputs.size(1)
    self.encoder.reset(bs)
    self.decoder.reset(bs)
    raw_outputs, outputs = self.encoder(encoder_inputs)
    state = self.decoder.hidden
    assert_dims(outputs, [self.nlayers[0], None, bs, (self.nhid[0], self.emb_sz[0])])
    # pass the encoder outputs as keys to the attention projection_layer
    self.decoder.projection_layer.reset(keys=outputs[-1])
    raw_outputs_dec, outputs_dec = self.decoder(decoder_inputs, hidden=state, num_beams=num_beams)
    # outputs_dec[-1].shape == (sl, bs, num_tokens)
    predictions = outputs_dec[-1] if num_beams == 0 else self.decoder.beam_outputs
    return predictions, [*raw_outputs, *raw_outputs_dec], [*outputs, *outputs_dec]
def test_MultiHeadAttention(attention_setup):
    keys, query = attention_setup
    bs = query.size(0)
    ed = keys.size(2)
    eq = query.size(1)
    num_heads = 4
    nhid = 10
    attention = to_gpu(
        MultiHeadAttention(num_heads=num_heads, nhid=nhid, keys_dim=ed, query_dim=eq, values_dim=eq))
    result = attention(query=V(query), keys=V(keys), values=V(keys))
    assert_dims(result, [bs, num_heads * nhid])
def forward(self, query, keys, values, mask=None):
    # query dim [slQ, bs, dimQ]
    # keys dim [sl, bs, dimK]
    # values dim [sl, bs, dimV]
    sl, bs, dimK = keys.size()
    slq = query.size(0)
    # [slQ, bs, dimH * NH] -> [bs, NH, slQ, dimH]
    query_projection = self.query_linear(query).view(slq, bs, self.num_heads, self.nhid).permute(1, 2, 0, 3)
    # [sl, bs, dimH * NH] -> [bs, NH, dimH, sl]
    keys_projection = self.keys_linear(keys).view(sl, bs, self.num_heads, self.nhid).permute(1, 2, 3, 0)
    # [sl, bs, dimH * NH] -> [bs, NH, sl, dimH]
    values_projection = self.values_linear(values).view(sl, bs, self.num_heads, self.nhid).permute(1, 2, 0, 3)
    # [bs, NH, slQ, dimH] x [bs, NH, dimH, sl] = [bs, NH, slQ, sl]
    scores = query_projection @ keys_projection
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e20)
    weights = F.softmax(scores, dim=-1)
    if self.dropout is not None:
        weights = self.dropout(weights)
    # [bs, NH, slQ, sl] x [bs, NH, sl, dimH] = [bs, NH, slQ, dimH] -> [slQ, bs, NH * dimH]
    attention = (weights @ values_projection).permute(2, 0, 1, 3).contiguous().view(slq, bs, self.num_heads * self.nhid)
    output = self.linear(attention)
    return assert_dims(output, [slq, bs, self.out_dim])
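# Illustrative sketch (not library code): the masking convention used in the forward above,
# where positions with mask == 0 are filled with a large negative number before the softmax,
# shown on plain tensors so the effect on the attention weights is visible.
import torch
import torch.nn.functional as F

bs, num_heads, slq, slk = 2, 4, 3, 5
scores = torch.randn(bs, num_heads, slq, slk)
# causal mask: a query position may only attend to keys at the same or earlier positions
mask = torch.tril(torch.ones(bs, num_heads, slq, slk))
weights = F.softmax(scores.masked_fill(mask == 0, -1e20), dim=-1)
# masked positions end up with (numerically) zero attention weight
assert (weights.masked_select(mask == 0) < 1e-12).all()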
def forward(self, *inputs, num_beams=0):
    with torch.set_grad_enabled(self.training):
        encoder_inputs, decoder_inputs = assert_dims(inputs, [2, None, None])  # dims: [sl, bs] for encoder and decoder
        # reset the states for the new batch
        bs = encoder_inputs.size(1)
        self.encoder.reset(bs)
        self.decoder.reset(bs)
        outputs = self.encoder(encoder_inputs)
        state = concat_bidir_state(self.encoder.encoder_layer.hidden,
                                   cell_type=self.cell_type, nlayers=self.nlayers,
                                   bidir=self.bidir)
        if self.training:
            self.decoder.pr_force = self.pr_force
            nb = 1 if self.pr_force < 1 else 0
        else:
            nb = num_beams
        outputs_dec = self.decoder(decoder_inputs, hidden=state, num_beams=nb)
        predictions = outputs_dec[:decoder_inputs.size(0)] if num_beams == 0 else self.decoder.beam_outputs
    return predictions, [*outputs, *outputs_dec]
def _train_forward(self, inputs, hidden=None, constraints=None):
    sl, bs = inputs.size()
    emb = self.embedding_layer(inputs)
    layer_outputs = [[] for _ in range(self.nlayers)]
    for step in emb:
        step = torch.cat([step, self.projection_layer.get_attention_output(step)], dim=-1).unsqueeze_(0)
        step = assert_dims(step, [1, bs, self.emb_size * 2])
        outputs = self._rnn_step(step, hidden=hidden)
        for layer_index in range(self.nlayers):
            layer_outputs[layer_index].append(outputs[layer_index])
        rnn_out = assert_dims(outputs[-1], [1, bs, self.emb_size])
        layer_outputs[-1][-1] = self.projection_layer(rnn_out[0])
    outputs = [torch.cat(i, dim=0) for i in layer_outputs]
    return outputs
def forward(self, *inputs, num_beams=0):
    encoder_inputs, decoder_inputs = assert_dims(inputs, [2, None, None])  # dims: [sl, bs] for encoder and decoder
    # reset the states for the new batch
    bs = encoder_inputs.size(2)
    self.session_encoder.reset(bs)
    self.decoder.reset(bs)
    query_encoder_outputs = []
    outputs = []
    num_utterances, max_sl, *_ = encoder_inputs.size()
    for index, context in enumerate(encoder_inputs):
        self.query_encoder.reset(bs)
        outputs = self.query_encoder(context)  # context has size [sl, bs]
        # BPTT: if the dialogue is too long, repackage the first half of the outputs
        # to truncate gradient backpropagation and fit into memory
        # to test before adding back
        out = repackage_var(outputs[-1][-1]) \
            if max_sl * num_utterances > self.BPTT_MAX_UTTERANCES and index <= num_utterances // 2 \
            else outputs[-1][-1]
        query_encoder_outputs.append(out)  # get the last sl output of the query_encoder
    query_encoder_outputs = torch.stack(query_encoder_outputs, dim=0)  # [cl, bs, nhid]
    session_outputs = self.session_encoder(query_encoder_outputs)
    self.decoder.projection_layer.reset(keys=session_outputs[-1])
    if self.training:
        self.decoder.pr_force = self.pr_force
        nb = 1 if self.pr_force < 1 else 0
    else:
        nb = num_beams
    state = self.decoder.hidden
    outputs_dec = self.decoder(decoder_inputs, hidden=state, num_beams=nb)
    predictions = outputs_dec[-1][:decoder_inputs.size(0)] if num_beams == 0 else self.decoder.beam_outputs
    return predictions, [*outputs, *outputs_dec]
def forward(self, *inputs, num_beams=0):
    with torch.set_grad_enabled(self.training):
        encoder_inputs, decoder_inputs = assert_dims(inputs, [2, None, None])  # dims: [sl, bs] for encoder and decoder
        # reset the states for the new batch
        num_utterances, max_sl, bs = encoder_inputs.size()
        self.reset_encoders(bs)
        outputs, session = self.encoder(encoder_inputs)
        self.encoder.query_encoder.reset(bs)
        decoder_outputs = self.encoder.query_encoder(decoder_inputs)
        decoder_out = concat_bidir_state(self.encoder.query_encoder_layer.get_last_hidden_state(),
                                         cell_type=self.cell_type, nlayers=1,
                                         bidir=self.encoder.bidir)
        x = torch.cat([session, decoder_out], dim=-1)
        prior_log_var, prior_mu, recog_log_var, recog_mu, session = self.variational_encoding(session, x)
        bow_logits = self.bow_network(session).squeeze(0) if num_beams == 0 else None
        state, constraints = self.encoder_hidden_state_projection(session)
        outputs_dec, predictions = self.decoding(decoder_inputs, num_beams, state)
        if num_beams == 0:
            return [predictions, recog_mu, recog_log_var, prior_mu, prior_log_var, bow_logits], [*outputs, *outputs_dec]
        else:
            return predictions, [*outputs, *outputs_dec]
def forward(self, query, keys, values, mask=None):
    # query dim [bs, dimQ]
    # keys dim [sl, bs, dimK]
    # values dim [sl, bs, dimV]
    # [bs, dimH * NH]
    query_projection = self.query_linear(query)
    sl, bs, dimK = keys.size()
    # [sl, bs, dimH * NH]
    keys_projection = self.keys_linear(keys)
    # [sl, bs, dimH * NH]
    values_projection = self.values_linear(values)
    scores = (query_projection * keys_projection).view(
        sl, bs, self.num_heads, self.nhid).sum(dim=-1).contiguous() / self.scale
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e20)
    weights = F.softmax(scores, dim=0)
    if self.dropout is not None:
        weights = self.dropout(weights)
    attention = (weights.unsqueeze(-1) * values_projection.view(sl, bs, self.num_heads, self.nhid)).sum(0)
    output = self.linear(attention.view(bs, -1))
    return assert_dims(output, [bs, self.out_dim])
def test_S2SModelData_from_file(generalmodel):
    assert generalmodel is not None
    # number of batches
    assert 200 == len(generalmodel.trn_dl)
    train_iter = iter(generalmodel.trn_dl)
    batch = next(train_iter)
    assert isinstance(batch, list)
    # shape should be equal to sl, bs
    # the number of elements in the batch equals the sum of source_names and target_names (in this case 4):
    # the first three are the sources (inputs to the encoder) and the last is the target_names (input to the decoder)
    assert_dims(batch, [4, None, 2])
    sentences = to_np(batch[0])
    batch_sentences = generalmodel.itos(sentences, "english")
    for beam_sentence in batch_sentences:
        for sentence in beam_sentence:
            assert sentence in {"goodbye", "hello", "i like to read", "i am hungry"}
def test_transfomer_layer_decoder():
    sl = 10
    bs = 2
    in_features = 32
    tr.random.manual_seed(0)
    encoder_inputs = tr.randn([sl, bs, in_features])
    decoder_inputs = tr.randn([sl, bs, in_features])
    encoder_inputs = to_gpu(V(T(encoder_inputs)))
    decoder_inputs = to_gpu(V(T(decoder_inputs)))
    transformer = to_gpu(
        TransformerLayerDecoder(input_size=in_features, num_heads=8, nhid=64, dropout=0))
    outputs = transformer(encoder_inputs, decoder_inputs)
    assert_dims(outputs, [sl, bs, in_features])
    outputs1 = transformer(encoder_inputs, decoder_inputs[:1])
    assert_dims(outputs1, [1, bs, in_features])
    assert ((outputs[0] - outputs1[0]).abs() < 1E-6).all()
def test_MultiHeadAttention_with_mask(attention_setup):
    keys, query = attention_setup
    bs = query.size(0)
    ed = keys.size(2)
    sl = keys.size(0)
    eq = query.size(1)
    num_heads = 4
    nhid = 10
    attention = to_gpu(
        MultiHeadAttention(num_heads=num_heads, nhid=nhid, keys_dim=ed, query_dim=eq,
                           values_dim=ed, dropout=0.3))
    mask = V(T(np.zeros((sl, bs, num_heads))))
    mask[0] = 1
    result = attention(query=V(query), keys=V(keys), values=V(keys), mask=mask)
    assert_dims(result, [bs, num_heads * nhid])
def forward(self, *inputs, num_beams=0):
    encoder_inputs, decoder_inputs = assert_dims(inputs, [2, None, None])  # dims: [sl, bs] for encoder and decoder
    encoder_outputs = self.encoder(encoder_inputs)
    decoder_outputs = self.decoder(decoder_inputs, encoder_outputs, num_beams=num_beams)
    predictions = decoder_outputs[-1][:decoder_inputs.size(0)] if num_beams == 0 else self.decoder.beam_outputs
    return predictions, decoder_outputs
def test_transformer_decoder(num_beams, decoder_inputs_transformer):
    batch_size, emb_size, nlayers, sl, vin, ven = decoder_inputs_transformer
    ntokens, nhid, max_tokens = 10, 2, 20
    embedding = TransformerEmbeddings(ntokens=ntokens, emb_size=emb_size, dropout=0.0, pad_token=1)
    encoder = TransformerDecoderLayers(nlayers=nlayers, input_size=emb_size, num_heads=2, nhid=emb_size)
    projection_layer = Projection(output_size=ntokens, input_size=emb_size, tie_encoder=None, dropout=0.0)
    decoder = TransformerDecoder(decoder_layer=encoder, projection_layer=projection_layer,
                                 pad_token=1, eos_token=2, max_tokens=max_tokens,
                                 embedding_layer=embedding)
    decoder = to_gpu(decoder)
    outputs = decoder(vin, ven, num_beams=num_beams)
    if num_beams > 0:
        assert_dims(outputs, [None, num_beams * batch_size, (emb_size, ntokens)])
        # actual beam outputs can be found in beam_outputs
        assert decoder.beam_outputs is not None
        assert_dims(decoder.beam_outputs, [None, batch_size, num_beams])
        # the sl can go up to max_tokens + 1 (for the extra 0 token at the end)
        assert 0 < decoder.beam_outputs.shape[0] <= max_tokens + 1
    else:
        assert_dims(outputs, [None, batch_size, (emb_size, ntokens)])
        assert decoder.beam_outputs is None
def process_minibatch(self, minibatch: List[Example]) -> Tuple[LT, LT, LT]:
    max_sl = max([max(ex.sl) for ex in minibatch])
    max_conv = max([len(ex.roles) for ex in minibatch])
    padded_examples, targets, padded_lengths, padded_roles = [], [], [], []
    for example in minibatch:
        examples, lens, roles = self.pad(example, max_sl=max_sl, max_conv=max_conv, field=self.text_field)
        padded_examples.extend(examples)
        padded_lengths.extend(lens)
        padded_roles.append(roles)
        targets.append(example.response)
    self.text_field.include_lengths = False
    data = self.text_field.numericalize(padded_examples, device=self.device, train=self.train)
    batch_size = len(minibatch)
    assert_dims(data, [max_sl, max_conv * batch_size])
    data = data.view(max_sl, batch_size, max_conv).transpose(2, 0).transpose(2, 1).contiguous()
    self.text_field.fix_length = None
    padded_targets = self.text_field.pad(targets)
    targets = self.text_field.numericalize(padded_targets, device=self.device,
                                           train=self.train)  # [max_sl, batch_size]
    assert_dims(data, [max_conv, max_sl, batch_size])
    assert_dims(targets, [None, batch_size])
    return data, targets, targets[1:]
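# Illustrative sketch (not part of the loader): the view/transpose chain used in
# process_minibatch, which turns the flat numericalized batch [max_sl, max_conv * batch_size]
# into a conversation-major tensor [max_conv, max_sl, batch_size]. The sizes below are made up.
import torch

max_sl, batch_size, max_conv = 7, 3, 4
data = torch.arange(max_sl * max_conv * batch_size).view(max_sl, max_conv * batch_size)
reshaped = data.view(max_sl, batch_size, max_conv).transpose(2, 0).transpose(2, 1).contiguous()
assert reshaped.shape == (max_conv, max_sl, batch_size)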
def test_rnn_decoder(rnn_decoder, decoder_inputs):
    dec_ins, keys = decoder_inputs
    decoder, params = rnn_decoder
    decoder.reset(params.batch_size)
    hidden = decoder.hidden
    decoder.projection_layer.keys = keys
    outputs = decoder(dec_ins, hidden=hidden, num_beams=params.num_beams)
    assert params.nlayers == len(outputs)
    if params.num_beams > 0:
        assert_dims(outputs, [params.nlayers, None, params.num_beams * params.batch_size,
                              (params.nhid, params.ntokens)])
        # actual beam outputs can be found in beam_outputs
        assert decoder.beam_outputs is not None
        assert_dims(decoder.beam_outputs, [None, params.batch_size, params.num_beams])
        # the sl can go up to max_tokens + 1 (for the extra 0 token at the end)
        assert 0 < decoder.beam_outputs.shape[0] <= params.max_tokens + 1
    else:
        assert_dims(outputs, [params.nlayers, None, params.batch_size,
                              (params.nhid, params.ntokens)])
        assert decoder.beam_outputs is None
def process_minibatch(self, minibatch: List[Example]) -> Tuple[LT, LT, LT]:
    max_sl = max([max(ex.sl) for ex in minibatch])
    max_conv = max([len(ex.roles) for ex in minibatch]) + 1  # add an extra padding sentence for the target
    padded_examples, padded_targets, padded_lengths, padded_roles = [], [], [], []
    for example in minibatch:
        examples, lens, roles = self.pad(example, max_sl=max_sl, max_conv=max_conv, field=self.text_field)
        padded_examples.extend(examples)
        padded_lengths.extend(lens)
        padded_roles.append(roles)
        # if self.target_roles is not None we pad the roles we do not want to train on;
        # this allows learning only the responses we are interested in
        targets, *_ = self.pad(example, max_sl=max_sl, max_conv=max_conv, field=self.text_field,
                               target_roles=self.target_roles)
        padded_targets.extend(targets)
    self.text_field.include_lengths = False
    data = self.text_field.numericalize(padded_examples, device=self.device, train=self.train)
    batch_size = len(minibatch)
    assert_dims(data, [max_sl, max_conv * batch_size])
    data = data.view(max_sl, batch_size, max_conv).transpose(2, 0).transpose(2, 1).contiguous()
    source = data[:-1]  # remove the extra padding sentence added above
    targets = self.text_field.numericalize(padded_targets, device=self.device, train=self.train)
    targets = targets.view(max_sl, batch_size, max_conv).transpose(2, 0).transpose(2, 1).contiguous()
    # source has shape [max_conv - 1, max_sl, batch_size]; targets keep the extra sentence
    assert_dims(source, [max_conv - 1, max_sl, batch_size])
    assert_dims(targets, [max_conv, max_sl, batch_size])
    return source, targets[1:], targets[1:, 1:]
def forward(self, *inputs, num_beams=0):
    encoder_inputs, decoder_inputs = assert_dims(inputs, [2, None, None])  # dims: [sl, bs] for encoder and decoder
    # reset the states for the new batch
    bs = encoder_inputs.size(2)
    self.session_encoder.reset(bs)
    self.decoder.reset(bs)
    query_encoder_raw_outputs, query_encoder_outputs = [], []
    raw_outputs, outputs = [], []
    num_utterances = encoder_inputs.shape[0]
    for index, context in enumerate(encoder_inputs):
        self.query_encoder.reset(bs)
        raw_outputs, outputs = self.query_encoder(context)
        query_encoder_raw_outputs.append(raw_outputs)
        # BPTT: if the dialogue is too long, repackage the first half of the outputs
        # to truncate gradient backpropagation and fit into memory
        out = repackage_var(outputs[-1]) if num_utterances > 20 and index <= num_utterances // 2 else outputs[-1]
        query_encoder_outputs.append(out)
    query_encoder_outputs = torch.cat(query_encoder_outputs, dim=0)
    raw_outputs_session, session_outputs = self.session_encoder(query_encoder_outputs)
    state = self.decoder.hidden
    state[0] = self.create_decoder_state(session_outputs[-1])
    raw_outputs_dec, outputs_dec = self.decoder(decoder_inputs, hidden=state, num_beams=num_beams)
    if num_beams == 0:
        # use the output of the projection module
        predictions = assert_dims(outputs_dec[-1], [None, bs, self.nt])  # dims: [sl, bs, nt]
    else:
        # use argmax or beam search predictions
        predictions = assert_dims(self.decoder.beam_outputs, [None, bs, num_beams])  # dims: [sl, bs, nb]
    return predictions, [*raw_outputs, *raw_outputs_dec], [*outputs, *outputs_dec]
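# Illustrative sketch, assuming repackage_var behaves like Tensor.detach (same values, no
# gradient history): this is how the hierarchical encoders above truncate backpropagation
# through the earlier utterances of a long dialogue.
import torch

h = torch.randn(2, 5, requires_grad=True)
out = h * 3                      # carries gradient history back to h
detached = out.detach()          # analogous to repackage_var: same values, the graph is cut
assert detached.requires_grad is False
assert torch.equal(detached, out.data)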