def speak(self, you_token, eos_token=None):
    empty_resp_indices = th.autograd.Variable(cu(th.LongTensor([[0, 1]])))
    empty_resp_len = th.autograd.Variable(cu(th.LongTensor([2])))
    response_predict, response_score = self.dialogue(empty_resp_indices, empty_resp_len,
                                                     persist=False, eos_token=eos_token)
    del response_score['target']
    return response_predict, response_score
def compute_reward(sel, other_sel, goal_indices):
    assert goal_indices.size()[1] == NUM_ITEMS * 2, goal_indices.size()
    # Goals are stored as interleaved (count, value) pairs, one pair per item.
    counts = goal_indices[:, cu(th.LongTensor(range(0, NUM_ITEMS * 2, 2)))]
    values = goal_indices[:, cu(th.LongTensor(range(1, NUM_ITEMS * 2, 2)))]
    total_claimed = sel + other_sel
    # feasible = (total_claimed >= 0).prod() * (total_claimed <= counts).prod()
    feasible = (total_claimed == counts).prod().long()
    return ((values * sel).sum(1) * feasible).float()
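# Hypothetical usage sketch (not part of the original module), assuming NUM_ITEMS == 3
# and that goal_indices carries raw interleaved (count, value) pairs:
#
#   goals = th.autograd.Variable(cu(th.LongTensor([[1, 4, 2, 1, 2, 2]])))  # counts [1, 2, 2], values [4, 1, 2]
#   mine = th.autograd.Variable(cu(th.LongTensor([[1, 0, 2]])))
#   theirs = th.autograd.Variable(cu(th.LongTensor([[0, 2, 0]])))
#   compute_reward(mine, theirs, goals)  # claims cover the counts exactly -> reward 1*4 + 0*1 + 2*2 = 8
#   compute_reward(mine, mine, goals)    # claims don't match the counts -> infeasible, reward 0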
def __init__(self, negotiator, partner, vectorizer, options):
    super(RLNegotiator, self).__init__()
    self.negotiator = negotiator
    self.partner = partner
    self.vectorizer = vectorizer

    self.eos = cu(th.LongTensor(self.vectorizer.resp_vec.vectorize(['<eos>'])[0])[0])
    self.you = cu(th.LongTensor(self.vectorizer.resp_vec.vectorize(['YOU:'])[0])[0])

    self.epsilon = options.rl_epsilon
    self.max_dialogue_len = options.max_dialogue_len
def start(self):
    self.negotiator = self.models[0].model.module
    self.vectorizer = self.models[0].model.vectorizer
    self.tokenize, self.detokenize = tokenizers.TOKENIZERS[self.models[0].options.tokenizer]

    with self.use_device():
        resp_vec = self.vectorizer.resp_vec
        self.eos = cu(th.LongTensor(resp_vec.vectorize(['<eos>'])[0])[0])
        self.you = cu(th.LongTensor(resp_vec.vectorize(['YOU:'])[0])[0])
        self.them = cu(th.LongTensor(resp_vec.vectorize(['THEM:'])[0])[0])
        self.sel_token = cu(th.LongTensor(resp_vec.vectorize(['<selection>'])[0])[0])
def forward(self, goal_indices, partner_goal_indices,
            resp_indices_, resp_len_, sel_indices_,
            feasible_sels, num_feasible_sels):
    num_feasible_sels = th.autograd.Variable(cu(th.LongTensor(
        [feasible_sels.size()[1]]
    )))
    self.negotiator.context(goal_indices)
    self.partner.context(goal_indices)

    # Alternate turns until one agent produces <selection> or the dialogue
    # reaches max_dialogue_len.
    my_turn = rng.choice([True, False])
    dialogue = []
    policy_scores = []
    for _ in range(self.max_dialogue_len):
        me = self.negotiator if my_turn else self.partner
        other = self.partner if my_turn else self.negotiator

        output_predict, output_score = me.speak(self.you, self.eos)
        (me_resp_indices, resp_len), policy_score = self.policy(output_predict, output_score)
        start_with_you = th.autograd.Variable(cu(th.LongTensor([[self.you]])))
        me_resp_indices = th.cat([start_with_you.expand(resp_len.size()[0], 1),
                                  me_resp_indices], 1)
        me.listen(me_resp_indices, resp_len + 1)
        other_resp_indices = self.transform_dialogue(me_resp_indices)
        other.listen(other_resp_indices, resp_len + 1)

        dialogue.append(((me_resp_indices if my_turn else other_resp_indices), resp_len))
        policy_scores.append(policy_score)

        if is_selection(me_resp_indices, resp_len, self.sel_token):
            break
        my_turn = not my_turn

    empty_sel_indices = th.autograd.Variable(cu(th.LongTensor([0])))
    # TODO: epsilon-greedy here too?
    selection_predict, selection_score = self.negotiator.selection(empty_sel_indices,
                                                                   feasible_sels,
                                                                   num_feasible_sels)
    sel_a = selection_predict['beam']
    sel_b = self.partner.selection(empty_sel_indices, feasible_sels, num_feasible_sels)[0]['beam']

    reward = compute_reward(sel_a, sel_b, goal_indices)
    partner_reward = compute_reward(sel_b, sel_a, partner_goal_indices)

    result = (dialogue, sel_a, sel_b, reward, partner_reward)
    return ({'sample': result, 'beam': result},
            (th.stack(policy_scores, 0)[:, 0], selection_score))
def __init__(self, module, loss, optimizer, optimizer_params, vectorizer):
    self.get_options()
    self.module = cu(module)
    self.loss = cu(loss)
    self.optimizer_class = optimizer
    self.optimizer_params = optimizer_params
    self.build_optimizer()
    self.vectorizer = vectorizer

    summary_path = config.get_file_path('monitoring.tfevents')
    if summary_path:
        self.summary_writer = summary.SummaryWriter(summary_path)
    else:
        self.summary_writer = None

    self.step = 0
    self.last_timestamp = datetime.datetime.now()
def forward(self, predict, score):
    dialogue, sel_a, sel_b, reward, partner_reward = predict
    response_scores, selection_score = score

    reward_transformed = self.transform_reward(reward)

    # Discounted per-turn rewards: gamma^t * r for turn t.
    step_rewards = []
    discount = th.autograd.Variable(cu(th.FloatTensor([1.0])))
    for i in range(len(response_scores)):
        step_rewards.append(discount * reward_transformed)
        discount = discount * self.gamma

    # REINFORCE: negate each response's log-probability weighted by its discounted reward.
    loss = th.autograd.Variable(cu(th.FloatTensor([0.0])))
    for score, step_reward in zip(response_scores, step_rewards):
        loss -= score * step_reward
    return loss
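# Worked form of the loss above (notation mine): with transformed reward r, discount
# factor gamma, and log-probability score s_t of the agent's response at turn t,
#
#     loss = -sum_t gamma^t * r * s_t
#
# so minimizing it performs REINFORCE with a per-turn discounted return. For example,
# with hypothetical values gamma = 0.99 and r = 5.0 over three turns, the step rewards
# are [5.0, 4.95, 4.9005].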
def forward(self, src_indices, src_lengths):
    a = self.activations

    # TODO: PackedSequence?
    batch_size = src_indices.size()[0]
    max_len = src_lengths.data.max()
    a.in_embed = self.enc_embedding(src_indices[:, :max_len])
    conv_stack = [a.in_embed.transpose(1, 2)]
    for i in range(max_len - 1):
        conv_stack.append(self.conv(self.nonlinearity(conv_stack[-1])))
    a.conv_repr = (th.stack([
        conv_stack[n - 1][j, :, 0]
        for j, n in enumerate(src_lengths.data)
    ], 0).view(1, batch_size, self.cell_size).repeat(self.num_layers, 1, 1))

    init_var = th.autograd.Variable(cu(th.FloatTensor([1.0])))
    c_init = (self.c_init(init_var).view(self.num_layers, 1, self.cell_size).repeat(
        1, batch_size, 1))

    result = a.conv_repr, c_init

    if not self.monitor_activations:
        # Free up memory
        a.__dict__.clear()

    return result
def make_selection(self):
    empty_sel_indices = th.autograd.Variable(cu(th.LongTensor([0])))
    sel_predict, sel_score = self.negotiator.selection(empty_sel_indices,
                                                       self.feasible_sels,
                                                       self.num_feasible_sels)
    return parse_selection(' '.join(self.vectorizer.sel_vec.unvectorize(
        thutils.to_numpy(sel_predict['sample'])[0]
    )), self.game[0])
def forward(self, outputs, src_lengths):
    a = self.activations

    assert outputs.dim() == 3, outputs.size()
    assert outputs.size()[2] == self.repr_size, (outputs.size(), self.repr_size)
    batch_size, max_len, repr_size = outputs.size()

    a.attn_h1 = th.nn.Tanh()(self.hidden1(outputs))
    a.attn_h2 = self.hidden2(outputs)
    assert a.attn_h2.size() == (batch_size, max_len, repr_size), \
        (a.attn_h2.size(), (batch_size, max_len, repr_size))

    init_var = th.autograd.Variable(cu(th.FloatTensor([1.0])))
    a.target = self.target(init_var)
    assert a.target.size() == (repr_size,), (a.target.size(), repr_size)

    a.attn_scores = th.matmul(a.attn_h2, a.target)
    assert a.attn_scores.size() == (batch_size, max_len), \
        (a.attn_scores.size(), (batch_size, max_len))

    # Mask positions past each sequence's length by adding log(0) = -inf to their scores.
    attn_mask = th.autograd.Variable(cu(
        th.log((lrange(max_len)[None, :] < src_lengths.data[:, None]).float())
    ))
    a.attn_weights = th.exp(th.nn.LogSoftmax(dim=1)(a.attn_scores + attn_mask))
    assert a.attn_weights.size() == (batch_size, max_len), \
        (a.attn_weights.size(), (batch_size, max_len))

    a.attn_out = th.matmul(a.attn_weights[:, None, :], outputs)[:, 0, :]
    assert a.attn_out.size() == (batch_size, repr_size), \
        (a.attn_out.size(), (batch_size, repr_size))

    self.dump_weights(a.attn_weights.data)

    result = a.attn_out, a.attn_weights

    if not self.monitor_activations:
        # Free up memory
        a.__dict__.clear()

    return result
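# Worked form of the attention above (notation mine): with H = self.hidden2(outputs),
# learned query t = self.target(1), and mask m_ij = 0 for j < src_lengths[i], -inf otherwise,
#
#     alpha_ij = softmax_j(H_ij . t + m_ij),    attn_out_i = sum_j alpha_ij * outputs_ij
#
# (the scores come from hidden2 directly; a.attn_h1 is recorded but does not enter the score).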
def generate_rnn_state(encoder, h_init_mod, c_init_mod, batch_size):
    init_var = th.autograd.Variable(cu(th.FloatTensor([1.0])))
    h_init = (h_init_mod(init_var).view(
        encoder.num_layers * encoder.num_directions, 1,
        encoder.cell_size // encoder.num_directions).repeat(1, batch_size, 1))
    if encoder.use_c:
        c_init = (c_init_mod(init_var).view(
            encoder.num_layers * encoder.num_directions, 1,
            encoder.cell_size // encoder.num_directions).repeat(1, batch_size, 1))
        return (h_init, c_init)
    else:
        return h_init
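# Shape sketch (hypothetical sizes): for an encoder with num_layers = 2, num_directions = 1,
# cell_size = 64 and batch_size = 5, h_init (and c_init when encoder.use_c) has size
# (num_layers * num_directions, batch_size, cell_size // num_directions) = (2, 5, 64),
# matching the initial hidden state expected by torch.nn.GRU/LSTM.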
def context(self, goal_indices):
    # "GRU_g": encode goals (values of items)
    a = self.activations

    batch_size, goal_size = goal_indices.size()
    assert goal_size == GOAL_SIZE, goal_indices.size()
    goal_len = th.autograd.Variable(cu(
        (th.ones(batch_size) * goal_size).int()
    ))
    assert goal_len.size() == (batch_size,), goal_len.size()
    a.context_repr_seq, _ = self.context_encoder(goal_indices, goal_len)
    assert a.context_repr_seq.dim() == 3, a.context_repr_seq.size()
    assert a.context_repr_seq.size()[:2] == (batch_size, goal_size), a.context_repr_seq.size()
    a.context_repr = a.context_repr_seq[:, -1, :]
    context_repr_size = a.context_repr_seq.size()[2]
    assert a.context_repr.size() == (batch_size, context_repr_size), a.context_repr.size()

    self.dec_state = seq2seq.generate_rnn_state(self.response_encoder,
                                                self.h_init, self.c_init,
                                                batch_size)
    if not isinstance(self.dec_state, tuple):
        self.dec_state = (self.dec_state,)
def forward(self, enc_state, extra_inputs=None, extra_delimiter=None):
    if not isinstance(enc_state, tuple):
        enc_state = (enc_state,)
    assert len(enc_state[0].size()) == 3, enc_state[0].size()
    num_layers, batch_size, h_size = enc_state[0].size()
    state_sizes = []
    state = []
    for enc_c in enc_state:
        assert len(enc_c.size()) == 3, enc_c.size()
        assert enc_c.size()[:2] == (num_layers, batch_size), enc_c.size()
        c_size = enc_c.size()[2]
        state_sizes.append(c_size)
        state.append(enc_c[:, :, None, :].expand(num_layers, batch_size,
                                                 self.beam_size, c_size))
    if extra_inputs is None:
        extra_inputs = []
    else:
        extra_inputs = [
            inp[:, None, ...].expand(
                (inp.size()[0], self.beam_size) + tuple(inp.size()[1:])
            ).contiguous().view(
                (inp.size()[0] * self.beam_size, 1) + tuple(inp.size()[1:])
            )
            for inp in extra_inputs
        ]

    # ravel: reshape (..., batch * beam, feat) -> (..., batch, beam, feat);
    # unravel is the inverse, flattening the batch and beam dimensions.
    def ravel(x):
        return x.contiguous().view(*tuple(x.size()[:-2]) +
                                   (batch_size, self.beam_size, x.size()[-1]))

    def unravel(x):
        return x.contiguous().view(*tuple(x.size()[:-3]) +
                                   (batch_size * self.beam_size, x.size()[-1]))

    beam = th.autograd.Variable(cu(
        th.LongTensor(batch_size, self.beam_size, 1).fill_(self.delimiters[0])
    ))
    beam_scores = th.autograd.Variable(cu(th.zeros(batch_size, self.beam_size)))
    beam_lengths = th.autograd.Variable(cu(th.LongTensor(batch_size, self.beam_size).zero_()))

    outputs = []
    states = []
    for length in itertools.count(1):
        last_tokens = beam[:, :, -1:]
        assert last_tokens.size() == (batch_size, self.beam_size, 1), last_tokens.size()
        word_scores, (dec_out, state) = self.decode_fn(
            unravel(last_tokens), tuple(unravel(c) for c in state),
            extra_inputs=extra_inputs)
        word_scores = ravel(word_scores[:, 0, :])
        state = tuple(ravel(c) for c in state)
        states.append(state)
        outputs.append(dec_out)
        assert word_scores.size()[:2] == (batch_size, self.beam_size), word_scores.size()
        beam, beam_lengths, beam_scores = self.step(word_scores, length,
                                                    beam, beam_scores, beam_lengths,
                                                    extra_delimiter=extra_delimiter)
        # Stop when every hypothesis has terminated or the length limit is reached.
        if (beam_lengths.data != length).prod() or \
                (self.max_len is not None and length == self.max_len):
            break

    all_states_collated = [th.stack(s, dim=3) for s in zip(*states)]
    final_indices = th.clamp(beam_lengths.data, max=self.max_len - 1)
    final_states = [
        s[:, lrange(batch_size)[:, None], lrange(self.beam_size)[None, :], final_indices, :]
        for s in all_states_collated
    ]
    all_outputs = th.stack(outputs, dim=1)
    return (beam, th.clamp(beam_lengths, max=self.max_len), beam_scores,
            (all_outputs, final_states))
def policy(self, output_predict, output_score):
    # With probability epsilon, use the sampled response along with its score (which
    # carries the policy gradient); otherwise take the beam-search response with a
    # constant zero score, so greedy turns contribute no gradient.
    if rng.random_sample() <= self.epsilon:
        return output_predict['sample'], output_score['sample']
    else:
        return output_predict['beam'], th.autograd.Variable(cu(th.FloatTensor([0.0])))
def vectorize_response(self, response, you_them):
    tag = th.autograd.Variable(cu(th.LongTensor([[you_them]])))
    resp_indices, resp_len = self.vectorizer.resp_vec.vectorize(self.tokenize(response))
    tagged_resp_indices = th.cat([tag.expand(1, 1),
                                  thutils.to_torch(resp_indices)[None, :]], 1)
    return (tagged_resp_indices, thutils.to_torch(resp_len + 1))
def transform_and_predict(self, arrays):
    return self.module(*(th.autograd.Variable(cu(th.from_numpy(a))) for a in arrays))
def selection(self, sel_indices, feasible_sels, num_feasible_sels):
    # "GRU_o": encode dialogue for selection
    a = self.activations

    assert sel_indices.dim() == 1, sel_indices.size()
    batch_size = sel_indices.size()[0]

    a.combined_repr = self.combined_layer(th.cat([a.context_repr, a.dialogue_repr], dim=1))
    assert a.combined_repr.dim() == 2, a.combined_repr.size()
    assert a.combined_repr.size()[0] == batch_size, (a.combined_repr.size(), batch_size)

    a.all_item_scores = log_softmax(self.selection_layer(a.combined_repr))
    assert a.all_item_scores.size() == (batch_size, self.selection_layer.out_features), \
        (a.all_item_scores.size(), (batch_size, self.selection_layer.out_features))

    a.feasible_item_scores = a.all_item_scores[
        lrange(a.all_item_scores.size()[0])[:, None, None],
        feasible_sels.data
    ]
    assert a.feasible_item_scores.size() == (batch_size, MAX_FEASIBLE + 3, NUM_ITEMS), \
        (a.feasible_item_scores.size(), batch_size)

    num_feasible_mask = th.autograd.Variable(cu(
        (lrange(a.feasible_item_scores.size()[1])[None, :, None] <=
         num_feasible_sels.data[:, None, None]).float()
    ))
    a.feasible_masked = a.feasible_item_scores + th.log(num_feasible_mask)
    a.full_selection_scores = log_softmax(a.feasible_item_scores.sum(dim=2), dim=1)
    assert a.full_selection_scores.size() == (batch_size, MAX_FEASIBLE + 3), \
        (a.full_selection_scores.size(), batch_size)

    a.selection_beam_score, selection_beam = a.full_selection_scores.max(dim=1)
    assert selection_beam.size() == (batch_size,), (selection_beam.size(), batch_size)
    selection_sample = th.multinomial(th.exp(a.full_selection_scores), 1, replacement=True)[:, 0]
    a.selection_sample_score = th.exp(a.full_selection_scores)[
        lrange(a.full_selection_scores.size()[0]),
        selection_sample.data
    ]
    assert selection_sample.size() == (batch_size,), (selection_sample.size(), batch_size)

    selection_predict = {
        'beam': self.sel_indices_to_selection(feasible_sels, selection_beam),
        'sample': self.sel_indices_to_selection(feasible_sels, selection_sample),
    }
    assert selection_predict['beam'].size() == (batch_size, NUM_ITEMS), \
        (selection_predict['beam'].size(), batch_size)
    assert selection_predict['sample'].size() == (batch_size, NUM_ITEMS), \
        (selection_predict['sample'].size(), batch_size)

    a.selection_target_score = a.full_selection_scores[
        lrange(a.full_selection_scores.size()[0]),
        sel_indices.data
    ]
    assert a.selection_target_score.size() == (batch_size,), \
        (a.selection_target_score.size(), batch_size)

    selection_score = {
        'target': a.selection_target_score,
        'beam': a.selection_beam_score,
        'sample': a.selection_sample_score,
    }
    return selection_predict, selection_score