def score_step(self, step_batch, train=False):
    # Joint loss for the flat policy and the segmenter: the policy is
    # trained to reproduce the demonstrated actions, and the segmenter to
    # predict the `final` (segment-boundary) flags for each step.
    feats = self._featurizer(step_batch.init_obs, step_batch.obs)
    pol_loss = self._logprob_of(self._flat_policy, feats, step_batch).mean()
    seg_loss = self._segmenter_obj(self._segmenter(feats), step_batch.final)
    return pol_loss + seg_loss, {
        'pol_loss': unwrap(pol_loss)[0],
        'seg_loss': unwrap(seg_loss)[0],
    }
def best_desc(self, seq_batch, i_task, start, end, descs):
    # Score every candidate description of the span [start, end) and
    # return the (score, desc) pair with the lowest total loss.
    scores = [
        self.score_span(seq_batch, i_task, start, end, desc).sum()
        for desc in descs]
    scores = [unwrap(score)[0] for score in scores]
    return min(zip(scores, descs))
def decode(self, init_state, max_len, sample=False):
    # Greedy (or sampled) decoding of token sequences from an initial RNN
    # state.  Decoding runs for at most max_len steps, or until every
    # sequence in the batch has emitted the stop token.
    n_stack, n_batch, _ = init_state.shape
    out = [[self._start_id] for _ in range(n_batch)]
    tok_inp = [self._start_id for _ in range(n_batch)]
    done = [False for _ in range(n_batch)]
    state = init_state
    for _ in range(max_len):
        # One-hot encode the previous tokens as the next decoder input.
        hot_inp = np.zeros((1, n_batch, len(self._vocab)))
        for i, t in enumerate(tok_inp):
            hot_inp[0, i, t] = 1
        hot_inp = Variable(torch.FloatTensor(hot_inp))
        if init_state.is_cuda:
            hot_inp = hot_inp.cuda()
        new_state, label_logits = self(state, hot_inp)
        label_logits = label_logits.squeeze(0)
        label_probs = unwrap(self._softmax(label_logits))
        new_tok_inp = []
        for i, row in enumerate(label_probs):
            if sample:
                tok = np.random.choice(row.size, p=row)
            else:
                tok = row.argmax()
            new_tok_inp.append(tok)
            # Finished sequences keep feeding tokens to the decoder but no
            # longer append to their output.
            if not done[i]:
                out[i].append(tok)
            done[i] = done[i] or tok == self._stop_id
        state = new_state
        tok_inp = new_tok_inp
        if all(done):
            break
    return out
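# A hedged usage sketch for decode (not part of the original module; the
# names `describer`, `feats`, and `dataset` are illustrative assumptions).
# `feats` stands in for a state-feature tensor shaped like the decoder's
# initial hidden state, (n_stack, n_batch, n_hidden):
#
#     token_ids = describer.decode(feats, max_len=20, sample=False)
#     texts = [dataset.render_desc(ids) for ids in token_ids]
#
# Passing sample=True draws each token from the softmax distribution rather
# than taking the argmax, which yields more diverse candidate descriptions
# at some cost in fluency.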
def act(self, step_batch, sample=True):
    # Sample (or greedily pick) a low-level action and an action position
    # for every element of the batch under the flat policy.
    feats = self._featurizer(step_batch.init_obs, step_batch.obs)
    (act_logits, act_pos_logits), _ = self._flat_policy(feats, step_batch)
    act_probs = unwrap(self._flat_policy._act_softmax(act_logits))
    act_pos_probs = unwrap(
        self._flat_policy._act_pos_softmax(act_pos_logits))
    out = []
    for i in range(act_probs.shape[0]):
        arow = act_probs[i, :]
        aprow = act_pos_probs[i, :]
        if sample:
            a = np.random.choice(arow.size, p=arow)
            ap = np.random.choice(aprow.size, p=aprow)
        else:
            a = arow.argmax()
            ap = aprow.argmax()
        a, ap = self._dataset.unravel_action((a, ap))
        out.append((a, ap))
    return out
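# Hedged usage sketch for act in a rollout loop (illustrative only;
# `trainer`, `envs`, and `make_step_batch` are hypothetical names for the
# surrounding code, not guaranteed APIs):
#
#     step_batch = make_step_batch(envs)          # hypothetical helper
#     actions = trainer.act(step_batch, sample=True)
#     for (a, a_pos), env in zip(actions, envs):
#         env.step(a, a_pos)
#
# sample=True explores by drawing from the policy's action distribution;
# sample=False acts greedily, the usual setting for evaluation.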
def score_seq(self, seq_batch, train=False):
    # Description loss over whole sequences: featurize the initial and
    # final observations, then score the target description token by
    # token.  Logits are flattened to (n_tok * n_batch, n_pred) so a
    # standard per-token cross-entropy objective can be applied.
    state_feats, _ = self._featurizer(
        seq_batch.init_obs(), seq_batch.last_obs())
    _, desc_logits = self._describer(state_feats.unsqueeze(0), seq_batch.desc)
    n_tok, n_batch, n_pred = desc_logits.shape
    desc_loss = self._describer_obj(
        desc_logits.view(n_tok * n_batch, n_pred),
        seq_batch.desc_tgt.view(n_tok * n_batch))
    return desc_loss, {
        'desc_loss': unwrap(desc_loss)[0],
    }
def step(self, train_loss=None, val_loss=None, hier_loss=None):
    # Apply whichever updates were requested: a gradient step on the flat
    # model, a scheduler step driven by the scalar validation loss, and/or
    # a gradient step on the hierarchical model, which has its own
    # optimizer so the two policies do not share gradient state.
    if train_loss is not None:
        self._opt.zero_grad()
        train_loss.backward()
        self._opt.step()
    if val_loss is not None:
        self._sched.step(unwrap(val_loss)[0])
    if hier_loss is not None:
        self._hier_opt.zero_grad()
        hier_loss.backward()
        self._hier_opt.step()
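# Hedged sketch of one training iteration built from these pieces (loop
# structure and batch variables are assumptions for illustration):
#
#     loss, stats = trainer.score_step(train_batch, train=True)
#     val_loss, _ = trainer.score_step(val_batch)
#     hier_loss, _ = trainer.score_hier(hier_batch, train=True)
#     trainer.step(train_loss=loss, val_loss=val_loss, hier_loss=hier_loss)
#
# Any of the three losses may be omitted, in which case the corresponding
# update is simply skipped.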
def act_hier(self, step_batch, sample=True, feats=None):
    # Two-level action selection: the hierarchical policy first picks a
    # top-level action for each batch element; wherever it picks SAY, the
    # current description is replaced with a freshly decoded one, and the
    # flat policy then acts under the (possibly updated) descriptions.
    if feats is None:
        feats = self._featurizer(step_batch.init_obs, step_batch.obs)
    (act_logits, _), _ = self._hier_policy(feats, step_batch)
    descs = self._hier_policy.decode(feats, step_batch)
    act_probs = unwrap(self._hier_policy._act_softmax(act_logits))
    top_actions = []
    for i in range(act_probs.shape[0]):
        arow = act_probs[i, :]
        if sample:
            a = np.random.choice(arow.size, p=arow)
        else:
            a = arow.argmax()
        top_actions.append(a)
    # TODO modify init obs
    next_descs = []
    for i, a in enumerate(top_actions):
        if a == self._env.SAY:
            next_descs.append(descs[i])
        else:
            next_descs.append(step_batch.desc[i])
    next_descs = Variable(
        data.load_desc_data(next_descs, self._dataset, tokenize=False))
    flat_batch = step_batch._replace(desc_in=next_descs).cuda()
    return self.act(flat_batch, sample=sample)
def _parse_inner(self, seq_batch, i_task, start, end, remaining_depth,
        top_desc):
    # Recursively segment the action sequence for task i_task between
    # start and end, choosing at each level a split point and, for each
    # sub-span, either a new description or the inherited parent one,
    # whichever explains the actions more cheaply.
    if remaining_depth <= 0:
        return []
    if end - start < 2:
        return []
    # Cost of explaining the actions with the parent description alone.
    # TODO only this segment
    root_scores = self.score_span(
        seq_batch, i_task, 0, len(seq_batch.act[i_task]) - 1, top_desc)
    # Propose candidate split points and candidate descriptions for the
    # two sub-spans induced by each split.
    splits = unwrap(self.propose_splits(seq_batch, i_task, start, end))
    splits = [int(k) for k in splits]
    indices = [[(i_task, start, k), (i_task, k, end)] for k in splits]
    indices = sum(indices, [])
    descs = self.propose_descs(seq_batch, indices)
    desc_pairs = [(descs[2 * i], descs[2 * i + 1])
        for i in range(len(splits))]
    candidates = []
    for k, (descs1, descs2) in zip(splits, desc_pairs):
        # Best child description for each sub-span, versus the cost of
        # keeping the parent description for that sub-span.
        s1c, desc1 = self.best_desc(seq_batch, i_task, start, k, descs1)
        s2c, desc2 = self.best_desc(seq_batch, i_task, k, end, descs2)
        s1p, = unwrap(root_scores[start:k].sum())
        s2p, = unwrap(root_scores[k:end].sum())
        pick = [None, None]
        if s1c < s1p:
            s1 = s1c
            pick[0] = desc1
        else:
            s1 = s1p
        if s2c < s2p:
            s2 = s2c
            pick[1] = desc2
        else:
            s2 = s2p
        candidates.append((s1 + s2, k, tuple(pick)))
    if len(candidates) == 0:
        return []
    score, split, (d1, d2) = min(candidates)
    actions = [a for a, ap in seq_batch.act[i_task]]
    out = [(d1, (start, split)), (d2, (split, end))]
    if not (d1 is None and d2 is None):
        # Trace output: render the chosen decomposition for inspection.
        print(self._dataset.render_desc(top_desc), ':',
            self._dataset.render_desc(d1) if d1 else '_',
            '>',
            self._dataset.render_desc(d2) if d2 else '_')
        print(actions[start:split], actions[split:end])
        print()
    # Recurse into the two sub-spans, inheriting the parent description
    # wherever no child description was picked.
    for start_, end_, desc_ in [(start, split, d1), (split, end, d2)]:
        if desc_ is None:
            desc_ = top_desc
        out += self._parse_inner(
            seq_batch, i_task, start_, end_, remaining_depth - 1, desc_)
    return out
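# Hedged note on _parse_inner's return value (inferred from the code above,
# not documented in the original): a flat list of (description, (start, end))
# pairs, one per discovered segment, where description is None when the
# parent description was kept.  A top-level call might look like:
#
#     spans = self._parse_inner(seq_batch, i_task, 0,
#         len(seq_batch.act[i_task]), max_depth, seq_batch.desc[i_task])
#
# with max_depth bounding the recursion; the exact caller is an assumption.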
def score_hier(self, step_batch, train=False):
    feats = self._featurizer(step_batch.init_obs, step_batch.obs)
    hier_loss = self._logprob_of(
        self._hier_policy, feats, step_batch).mean()
    return hier_loss, {'hier_loss': unwrap(hier_loss)[0]}