def gumbel_softmax(logits, dim=-1, tau=1, hard=False, eps=1e-10):
    """Sample from the Gumbel-Softmax distribution and optionally discretize.

    Args:
        logits: [batch_size, n_class] unnormalized log-probs.
        dim: the dimension along which the softmax is performed.
        tau: non-negative scalar temperature.
        hard: if True, take argmax, but differentiate w.r.t. the soft sample y.
        eps: numerical-stability constant.

    Returns:
        [batch_size, n_class] sample from the Gumbel-Softmax distribution.
        If hard=True, the returned sample will be one-hot, otherwise it will be
        a probability distribution that sums to 1 across classes.

    Constraints:
        - This implementation only works on batch_size x num_features tensors for now.

    Based on
    https://github.com/ericjang/gumbel-softmax/blob/3c8584924603869e90ca74ac20a6a03d99a91ef9/Categorical%20VAE.ipynb
    (MIT license).
    """
    # Pass dim through so the soft sample is normalized along the same
    # dimension that the hard path uses for the argmax.
    y_soft = _gumbel_softmax_sample(logits, dim=dim, tau=tau, eps=eps)
    if hard:
        _, k = y_soft.data.max(dim=dim)
        # This bit is based on
        # https://discuss.pytorch.org/t/stop-gradients-for-st-gumbel-softmax/530/5
        y_hard = torch.zeros_like(as_tensor(logits))
        set_index_one_hot_(y_hard, dim, k, 1.0)
        # This cool bit of code achieves two things:
        # - makes the output value exactly one-hot (since we add then
        #   subtract the y_soft value)
        # - makes the gradient equal to the y_soft gradient (since we strip
        #   all other gradients)
        y = var_with(y_hard - as_tensor(y_soft), y_soft) + y_soft
    else:
        y = y_soft
    return y
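# Illustrative usage sketch (not from the original repo): the straight-through
# trick above returns an exactly one-hot forward value while routing gradients
# through the soft sample. Plain PyTorch stand-ins are used; Gumbel noise is
# omitted for brevity.
def _demo_straight_through_gumbel():
    import torch
    import torch.nn.functional as F
    logits = torch.randn(4, 10, requires_grad=True)
    y_soft = F.softmax(logits, dim=-1)                          # soft sample
    index = y_soft.argmax(dim=-1, keepdim=True)
    y_hard = torch.zeros_like(y_soft).scatter_(-1, index, 1.0)  # exact one-hot
    y = (y_hard - y_soft).detach() + y_soft                     # forward: one-hot, backward: soft
    y.sum().backward()                                          # gradients reach `logits`
    assert logits.grad is not None
    assert torch.allclose(y.sum(dim=-1), torch.ones(4))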
def instance_accuracy_nqueens(label, raw_pred, return_float=True, feed_dict=None, args=None):
    """Get instance-wise accuracy for a structured prediction task instead of a pointwise task."""
    pred = as_tensor(raw_pred)
    pred = (pred > 0.5).float()
    label = as_tensor(label).float()

    diff = torch.abs(label - pred)
    point_acc = 1 - torch.sum(diff) / label.numel()
    incorrect_count = torch.sum(diff, dim=1)
    incorrect = len(torch.nonzero(incorrect_count))
    in_acc = 1 - incorrect / len(label)

    errors = []
    reward = []
    corrected_acc = 0
    # Per-instance indicator of whether the prediction is a valid solution.
    acc_vector = torch.zeros(len(pred))
    for i, x in enumerate(pred):
        if match_query(feed_dict["query"][i][:, 0].float(), x) and is_safe_nqueens(x):
            corrected_acc += 1
            acc_vector[i] = 1.0
        else:
            errors.append(feed_dict["count"][i].item())
            diff = torch.sum(torch.abs(feed_dict["target_set"][i].float() - x), dim=1)
            reward.append(diff)

    corrected_acc /= len(pred)
    reward = torch.stack(reward)
    if args is not None and args.rl_reward == 'count':
        reward = -1 * reward.float()
    else:
        # Binary reward: clamp the per-target error count to {0, 1}.
        reward = -1 * torch.clamp(reward, max=1).float()

    # No auxiliary classifier output is produced here.
    classification_acc = torch.zeros(1)

    if return_float:
        return {
            "accuracy": in_acc,
            "corrected accuracy": corrected_acc,
            "pointwise accuracy": point_acc.item(),
            "classification accuracy": classification_acc.item()
        }, errors, reward
    return {
        "accuracy": torch.tensor(in_acc),
        "corrected accuracy": torch.tensor(corrected_acc),
        "pointwise accuracy": point_acc,
        "classification accuracy": classification_acc
    }, errors, reward
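# `match_query` and `is_safe_nqueens` are defined elsewhere in the repo. A rough,
# hypothetical sketch of the kind of validity check the latter presumably
# performs on a flattened n x n board (illustration only, not the repo's helper):
def _is_safe_nqueens_sketch(x):
    import torch
    n = int(round(len(x) ** 0.5))
    board = x.reshape(n, n)
    # Exactly one queen per row and per column.
    if not (board.sum(dim=0).eq(1).all() and board.sum(dim=1).eq(1).all()):
        return False
    # No two queens on the same diagonal.
    queens = board.nonzero(as_tuple=False).tolist()
    for i in range(len(queens)):
        for j in range(i + 1, len(queens)):
            (r1, c1), (r2, c2) = queens[i], queens[j]
            if abs(r1 - r2) == abs(c1 - c2):
                return False
    return True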
def make_data(traj, gamma):
    """Aggregate data as a batch for RL optimization."""
    q = 0
    discount_rewards = []
    for reward in traj['rewards'][::-1]:
        q = q * gamma + reward
        discount_rewards.append(q)
    discount_rewards.reverse()

    traj['states'] = as_tensor(np.array(traj['states']))
    traj['actions'] = as_tensor(np.array(traj['actions']))
    traj['discount_rewards'] = as_tensor(np.array(discount_rewards)).float()
    return traj
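# Quick self-contained check (toy numbers, not part of the original code) of the
# backward accumulation above: G_t = r_t + gamma * G_{t+1}, computed by scanning
# the rewards in reverse.
def _demo_discounted_returns():
    rewards, gamma = [1.0, 0.0, 2.0], 0.5
    q, returns = 0.0, []
    for r in rewards[::-1]:
        q = q * gamma + r
        returns.append(q)
    returns.reverse()
    assert returns == [1.5, 1.0, 2.0]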
def make_data(traj, gamma):
    """Aggregate data as a batch for RL optimization."""
    Q = 0
    discount_rewards = []
    for reward in traj['rewards'][::-1]:
        Q = Q * gamma + reward
        discount_rewards.append(Q)
    discount_rewards.reverse()

    traj['states'] = as_tensor(np.array(traj['states']))
    if args.is_path_task:
        traj['relations'] = as_tensor(np.array(traj['relations']))
    traj['actions'] = as_tensor(np.array(traj['actions']))
    traj['discount_rewards'] = as_tensor(np.array(discount_rewards)).float()
    return traj
def instance_accuracy_futoshiki(label, raw_pred, return_float=True, feed_dict=None, pred_aux=None):
    """Get instance-wise accuracy for a structured prediction task instead of a pointwise task."""
    pred = as_tensor(raw_pred)
    pred = (pred > 0.5).float()
    label = as_tensor(label).float()

    diff = torch.abs(label - pred)
    point_acc = 1 - torch.sum(diff) / label.numel()
    incorrect_count = torch.sum(diff, dim=1)
    incorrect = len(torch.nonzero(incorrect_count))
    in_acc = 1 - incorrect / len(label)

    errors = []
    corrected_acc = 0
    for i, x in enumerate(pred):
        constraints = feed_dict["query"][i][:, 1:]
        if is_safe_futoshiki(x, constraints):
            corrected_acc += 1
        else:
            errors.append(feed_dict["count"][i].item())
    corrected_acc /= len(pred)

    if pred_aux is not None:
        pred_aux = (pred_aux > 0.5).float()
        classification_acc = 1 - torch.sum(
            torch.abs(pred_aux - feed_dict["is_ambiguous"].float())) / len(pred_aux)
    else:
        classification_acc = torch.zeros(1)

    if return_float:
        return {
            "accuracy": in_acc,
            "corrected accuracy": corrected_acc,
            "pointwise accuracy": point_acc.item(),
            "classification accuracy": classification_acc.item()
        }, errors
    return {
        "accuracy": torch.tensor(in_acc),
        "corrected accuracy": torch.tensor(corrected_acc),
        "pointwise accuracy": point_acc,
        "classification accuracy": classification_acc
    }, errors
def step(self, feed_dict, reduce_func=default_reduce_func, cast_tensor=False, measure_time=False):
    if hasattr(self.model, 'train_step'):
        return self.model.train_step(self.optimizer, feed_dict)

    assert self._model.training, 'Cannot step an evaluation-mode model.'
    extra = dict()
    self.trigger_event('step:before', self)

    if cast_tensor:
        feed_dict = as_tensor(feed_dict)

    if measure_time:
        end_time = cuda_time()

    self.trigger_event('forward:before', self, feed_dict)
    loss, monitors, output_dict = self._model(feed_dict)
    self.trigger_event('forward:after', self, feed_dict, loss, monitors, output_dict)

    if measure_time:
        extra['time/forward'] = cuda_time() - end_time
        end_time = cuda_time(False)

    loss = reduce_func('loss', loss)
    monitors = {k: reduce_func(k, v) for k, v in monitors.items()}
    loss_f = as_float(loss)
    monitors_f = as_float(monitors)

    if measure_time:
        extra['time/loss'] = cuda_time() - end_time
        end_time = cuda_time(False)

    self._optimizer.zero_grad()
    self.trigger_event('backward:before', self, feed_dict, loss, monitors, output_dict)
    if loss.requires_grad:
        loss.backward()

    if measure_time:
        extra['time/backward'] = cuda_time() - end_time
        end_time = cuda_time(False)

    self.trigger_event('backward:after', self, feed_dict, loss, monitors, output_dict)
    if loss.requires_grad:
        self._optimizer.step()

    if measure_time:
        extra['time/optimize'] = cuda_time() - end_time
        end_time = cuda_time(False)

    self.trigger_event('step:after', self)
    return loss_f, monitors_f, output_dict, extra
def step(self, feed_dict, grad_clip=0., reduce_func=default_reduce_func, cast_tensor=False, measure_time=False):
    if hasattr(self.model, 'train_step'):
        try:
            return self.model.train_step(
                self.optimizer, feed_dict,
                grad_clip=grad_clip, reduce_func=reduce_func, cast_tensor=False)
        except NotImplementedError:
            pass

    extra = dict()
    self.prepare()

    if measure_time:
        end_time = cuda_time()

    if cast_tensor:
        feed_dict = as_tensor(feed_dict)

    self.trigger_event('forward:before', self, feed_dict)
    loss, monitors, output_dict = self._model(feed_dict)
    self.trigger_event('forward:after', self, feed_dict, loss, monitors, output_dict)

    if measure_time:
        extra['time/forward'] = cuda_time() - end_time
        end_time = cuda_time(False)

    return self.update(feed_dict, loss, monitors, output_dict,
                       grad_clip=grad_clip, reduce_func=reduce_func,
                       measure_time=measure_time, extra=extra)
def evaluate(self, feed_dict, cast_tensor=False):
    assert not self._model.training, 'Evaluating a training-mode model.'
    begin = time.time()
    if cast_tensor:
        feed_dict = as_tensor(feed_dict)
    with torch.no_grad():
        output_dict = self._model(feed_dict)
    end = time.time()
    return output_dict, dict(gpu_time=end - begin)
def validate_step(self, feed_dict, metric, meters=None):
    feed_dict_np = as_numpy(feed_dict)
    feed_dict = as_tensor(feed_dict)

    with torch.no_grad():
        output_dict = self._model(feed_dict)
    output_dict_np = as_numpy(output_dict)

    result = as_float(metric(feed_dict_np, output_dict_np))
    if meters is not None:
        meters.update(result)
    return result
def make_data(traj, gamma):
    """Aggregate data as a batch for RL optimization."""
    q = 0
    discount_rewards = []
    for reward in traj['rewards'][::-1]:
        q = q * gamma + reward
        discount_rewards.append(q)
    discount_rewards.reverse()

    if type(traj['states'][0]) is list:
        # Graph-structured states arrive as [node_features, relations] pairs.
        f1 = [f[0] for f in traj['states']]
        f2 = [f[1] for f in traj['states']]
        traj['states'] = [torch.cat(f1, dim=0), torch.cat(f2, dim=0)]
    else:
        traj['states'] = as_tensor(np.array(traj['states']))
    traj['actions'] = as_tensor(np.array(traj['actions']))
    traj['discount_rewards'] = as_tensor(np.array(discount_rewards)).float()
    return traj
def zero_state(self, input):
    batch_dim = 0 if self.batch_first else 1
    batch_size = input.size(batch_dim)
    hidden_size = self.rnn.hidden_size
    nr_layers = self.rnn.num_layers * (int(self.rnn.bidirectional) + 1)
    state_shape = (nr_layers, batch_size, hidden_size)

    gen = lambda: torch.zeros(*state_shape, device=input.device)
    if self.state_is_tuple:
        return (gen(), gen())
    return gen()
def _gumbel_softmax_sample(logits, dim=-1, tau=1, eps=1e-10):
    """Draw a sample from the Gumbel-Softmax distribution.

    Based on
    https://github.com/ericjang/gumbel-softmax/blob/3c8584924603869e90ca74ac20a6a03d99a91ef9/Categorical%20VAE.ipynb
    (MIT license).
    """
    gumbel_noise = _sample_gumbel(logits.size(), eps=eps, out=as_tensor(logits).new())
    y = logits + var_with(gumbel_noise, logits)
    return F.softmax(y / tau, dim=dim)
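# `_sample_gumbel` is not shown here. A minimal sketch of the standard
# construction it presumably follows (uniform noise pushed through the inverse
# Gumbel CDF, with eps guarding the logs); treat this as an assumption, not the
# repo's actual helper:
def _sample_gumbel_sketch(shape, eps=1e-10):
    import torch
    u = torch.rand(shape)
    return -torch.log(-torch.log(u + eps) + eps)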
def _get_result_given_player(self, index, meters, number, player, mode):
    assert mode in ['train', 'test', 'mining', 'inherit']
    params = dict(eval_only=True,
                  number=number,
                  play_name='{}_epoch{}_episode{}'.format(mode, self.current_epoch, index))
    backup = None
    if mode == 'train':
        params['eval_only'] = False
        params['dataset'] = self.valid_action_dataset
        params['entropy_beta'] = self.entropy_beta
        meters.update(lr=self.lr, entropy_beta=self.entropy_beta)
    elif mode == 'test':
        params['dump'] = True
        params['use_argmax'] = True
    else:
        backup = copy.deepcopy(player)
        params['use_argmax'] = self.is_candidate

    succ, score, traj, length = run_episode(player, self.model, **params)
    meters.update(number=number, succ=succ, score=score, length=length)

    if mode == 'train':
        feed_dict = make_data(traj, args.gamma)
        feed_dict['entropy_beta'] = as_tensor(self.entropy_beta).float()
        # Content from the valid_move dataset.
        states, actions, labels = \
            self.valid_action_dataset.sample_batch(args.batch_size)
        feed_dict['pred_states'] = as_tensor(states)
        feed_dict['pred_actions'] = as_tensor(actions)
        feed_dict['valid'] = as_tensor(labels).float()
        if args.use_gpu:
            feed_dict = as_cuda(feed_dict)
        return feed_dict
    else:
        message = ('> {} iter={iter}, number={number}, succ={succ}, '
                   'score={score:.4f}, length={length}').format(mode, iter=index, **meters.val)
        return message, dict(succ=succ, number=number, backup=backup)
def step(self, closure=None):
    loss = None
    if closure is not None:
        loss = closure()

    self._current += 1
    for group in self._base_optimizer.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue

            # Source gradient.
            d_p = p.grad.data
            param_state = self._base_optimizer.state[p]

            # MJY:: we ensure that grad_buffer does not require grad.
            if 'grad_buffer' not in param_state:
                buf = param_state['grad_buffer'] = []
            else:
                buf = param_state['grad_buffer']

            # MZ: cannot simply add because of different batch sizes.
            # buf.add_(d_p)
            buf.append(d_p.clone())

            # MZ: FIX
            if 'exp_avg' not in param_state:
                self._base_optimizer.state[p]['exp_avg'] = torch.zeros_like(
                    p, memory_format=torch.preserve_format)
                self._base_optimizer.state[p]['exp_avg_sq'] = torch.zeros_like(
                    p, memory_format=torch.preserve_format)
                self._base_optimizer.state[p]['step'] = 0

            if self._current >= self._nr_acc:
                assert len(self.batch_sizes) == self._current
                # buf.mul_(1. / self._current)
                # Merge the accumulated gradients as a weighted average, with
                # weights proportional to each micro-batch size.
                r = (torch.stack(buf, -1) *
                     as_tensor(np.array(self.batch_sizes) / sum(self.batch_sizes)).to(
                         buf[0].device)).sum(-1)
                p.grad.data.copy_(r)
                # buf.zero_()
                buf.clear()

    if self._current >= self._nr_acc:
        self._base_optimizer.step()
        self._current = 0
        self.batch_sizes.clear()
    return loss
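# Toy check (made-up numbers, independent of the optimizer wrapper) of the
# batch-size-weighted gradient averaging above: micro-batches of size 2 and 6
# give the second gradient three times the weight of the first.
def _demo_weighted_grad_merge():
    import numpy as np
    import torch
    buf = [torch.tensor([1.0, 1.0]), torch.tensor([3.0, 5.0])]
    batch_sizes = [2, 6]
    weights = torch.tensor(np.array(batch_sizes) / sum(batch_sizes), dtype=torch.float32)
    merged = (torch.stack(buf, -1) * weights).sum(-1)
    assert torch.allclose(merged, torch.tensor([2.5, 4.0]))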
def train_step(self, feed_dict, meters=None):
    assert self._model.training
    feed_dict = as_tensor(feed_dict)

    self._optimizer.zero_grad()
    loss, monitors, output_dict = self._model(feed_dict)
    loss.backward()
    self._optimizer.step()

    loss, monitors = map(as_float, [loss, monitors])
    if meters is not None:
        meters.update(loss=loss)
        meters.update(monitors)
    return as_float(loss)
def binary_accuracy(label, raw_pred, eps=1e-20, return_float=True):
    """Get accuracy for a binary classification problem."""
    pred = as_tensor(raw_pred).squeeze(-1)
    pred = (pred > 0.5).float()
    label = as_tensor(label).float()

    # $acc is the micro accuracy: the number of correct ones / total.
    acc = label.eq(pred).float()

    # $balanced_accuracy is the macro accuracy, with class-wise balance.
    nr_total = torch.ones(label.size(), dtype=label.dtype, device=label.device).sum(dim=-1)
    nr_pos = label.sum(dim=-1)
    nr_neg = nr_total - nr_pos
    pos_cnt = (acc * label).sum(dim=-1)
    neg_cnt = acc.sum(dim=-1) - pos_cnt
    balanced_acc = ((pos_cnt + eps) / (nr_pos + eps) +
                    (neg_cnt + eps) / (nr_neg + eps)) / 2.0

    # $sat means the saturation rate of the prediction,
    # i.e. how close the predictions are to 0 or 1.
    sat = 1 - (raw_pred - pred).abs()
    if return_float:
        acc = as_float(acc.mean())
        balanced_acc = as_float(balanced_acc.mean())
        sat_mean = as_float(sat.mean())
        sat_min = as_float(sat.min())
    else:
        sat_mean = sat.mean(dim=-1)
        sat_min = sat.min(dim=-1)[0]

    return {
        'accuracy': acc,
        'balanced_accuracy': balanced_acc,
        'saturation/mean': sat_mean,
        'saturation/min': sat_min,
    }
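# Toy illustration (made-up labels, not part of the original code) of the
# balanced-accuracy computation above: 3 positives with recall 2/3 and 1
# negative with recall 0 give a balanced accuracy of 1/3, while the micro
# accuracy is 2/4.
def _demo_balanced_accuracy():
    import torch
    label = torch.tensor([1., 1., 1., 0.])
    pred = torch.tensor([1., 1., 0., 1.])
    acc = label.eq(pred).float()                                # [1, 1, 0, 0]
    pos_recall = (acc * label).sum() / label.sum()              # 2/3
    neg_recall = (acc * (1 - label)).sum() / (1 - label).sum()  # 0/1
    balanced = (pos_recall + neg_recall) / 2
    assert abs(balanced.item() - 1 / 3) < 1e-6 and acc.mean().item() == 0.5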
def _get_result_given_player(self, index, meters, number, player, mode):
    assert mode in ['train', 'test', 'mining', 'inherit']
    params = dict(eval_only=True,
                  number=number,
                  play_name='{}_epoch{}_episode{}'.format(mode, self.current_epoch, index))
    backup = None
    if mode == 'train':
        params['eval_only'] = False
        params['entropy_beta'] = self.entropy_beta
        meters.update(lr=self.lr, entropy_beta=self.entropy_beta)
    elif mode == 'test':
        params['dump'] = True
        params['use_argmax'] = True
    else:
        backup = copy.deepcopy(player)
        params['use_argmax'] = self.is_candidate

    succ, score, traj, length, optimal = \
        run_episode(player, self.model, **params)
    meters.update(number=number, succ=succ, score=score, length=length, optimal=optimal)

    if mode == 'train':
        feed_dict = make_data(traj, args.gamma)
        feed_dict['entropy_beta'] = as_tensor(self.entropy_beta).float()
        if args.use_gpu:
            feed_dict = as_cuda(feed_dict)
        return feed_dict
    else:
        message = ('> {} iter={iter}, number={number}, succ={succ}, '
                   'score={score:.4f}, length={length}, optimal={optimal}').format(
                       mode, iter=index, **meters.val)
        return message, dict(succ=succ, number=number, backup=backup)
def _inference_model(self, feed_dict):
    feed_dict = as_tensor(feed_dict)
    with torch.no_grad():
        return as_numpy(self._model(feed_dict))
def run_episode(env, model, number, play_name='', dump=False, eval_only=False,
                use_argmax=False, need_restart=False, entropy_beta=0.0):
    """Run one episode using the model with $number nodes/numbers."""
    is_over = False
    traj = collections.defaultdict(list)
    score = 0
    moves = []
    # If dump_play=True, store the states and actions in a json file
    # for visualization.
    dump_play = args.dump_play and dump

    if need_restart:
        env.restart()

    if args.is_path_task:
        optimal = env.unwrapped.dist
        relation = env.unwrapped.graph.get_edges()
        relation = np.stack([relation, relation.T], axis=-1)
        st, ed = env.current_state
        nodes_trajectory = [int(st)]
        destination = int(ed)
        policies = []
    elif args.is_sort_task:
        optimal = env.unwrapped.optimal
        array = [str(i) for i in env.unwrapped.array]

    while not is_over:
        if args.is_path_task:
            st, ed = env.current_state
            state = np.zeros((relation.shape[0], 2))
            state[st, 0] = 1
            state[ed, 1] = 1
            feed_dict = dict(states=np.array([state]), relations=np.array([relation]))
        elif args.is_sort_task:
            state = env.current_state
            feed_dict = dict(states=np.array([state]))
        feed_dict['entropy_beta'] = as_tensor(entropy_beta).float()
        feed_dict = as_tensor(feed_dict)
        if args.use_gpu:
            feed_dict = as_cuda(feed_dict)

        with torch.set_grad_enabled(not eval_only):
            output_dict = model(feed_dict)

        policy = output_dict['policy']
        p = as_numpy(policy.data[0])
        action = p.argmax() if use_argmax else random.choice(len(p), p=p)
        reward, is_over = env.action(action)

        # Collect moves information.
        if dump_play:
            if args.is_path_task:
                moves.append(int(action))
                nodes_trajectory.append(int(env.current_state[0]))
                logits = as_numpy(output_dict['logits'].data[0])
                tops = np.argsort(p)[-10:][::-1]
                tops = list(
                    map(lambda x: (int(x), float(p[x]), float(logits[x])), tops))
                policies.append(tops)
            if args.is_sort_task:
                # Need to ensure that env.utils.MapActionProxy is the outermost class.
                mapped_x, mapped_y = env.mapping[action]
                moves.append([mapped_x, mapped_y])

        # For now, assume reward=1 only when succeed, otherwise reward=0.
        # Manipulate the reward and get success information according to reward.
        if reward == 0 and args.penalty is not None:
            reward = args.penalty
        succ = 1 if is_over and reward > 0.99 else 0

        score += reward
        traj['states'].append(state)
        if args.is_path_task:
            traj['relations'].append(relation)
        traj['rewards'].append(reward)
        traj['actions'].append(action)

    # Dump json file storing information of playing.
    if dump_play and not (args.dump_fail_only and succ):
        if args.is_path_task:
            num = env.unwrapped.nr_nodes
            graph = relation[:, :, 0].tolist()
            coordinates = env.unwrapped.graph.get_coordinates().tolist()
            json_str = json.dumps(
                dict(graph=graph,
                     coordinates=coordinates,
                     policies=policies,
                     destination=destination,
                     current=nodes_trajectory,
                     moves=moves))
        if args.is_sort_task:
            num = env.unwrapped.nr_numbers
            json_str = json.dumps(dict(array=array, moves=moves))
        dump_file = os.path.join(args.current_dump_dir,
                                 '{}_size{}.json'.format(play_name, num))
        with open(dump_file, 'w') as f:
            f.write(json_str)

    length = len(traj['rewards'])
    return succ, score, traj, length, optimal
def rms(p):
    """Root mean square function."""
    return as_float((as_tensor(p) ** 2).mean() ** 0.5)
def run_episode(env, model, number, play_name='', dump=False, dataset=None, eval_only=False,
                use_argmax=False, need_restart=False, entropy_beta=0.0):
    """Run one episode using the model with $number blocks."""
    is_over = False
    traj = collections.defaultdict(list)
    score = 0
    if need_restart:
        env.restart()

    nr_objects = number + 1
    # If dump_play=True, store the states and actions in a json file
    # for visualization.
    dump_play = args.dump_play and dump
    if dump_play:
        array = env.unwrapped.current_state
        moves, new_pos, policies = [], [], []

    while not is_over:
        state = env.current_state
        feed_dict = dict(states=np.array([state]))
        feed_dict['entropy_beta'] = as_tensor(entropy_beta).float()
        feed_dict = as_tensor(feed_dict)
        if args.use_gpu:
            feed_dict = as_cuda(feed_dict)

        with torch.set_grad_enabled(not eval_only):
            output_dict = model(feed_dict)

        policy = output_dict['policy']
        p = as_numpy(policy.data[0])
        action = p.argmax() if use_argmax else random.choice(len(p), p=p)
        # Need to ensure that env.utils.MapActionProxy is the outermost class.
        mapped_x, mapped_y = env.mapping[action]
        # env.unwrapped to get the innermost Env class.
        valid = env.unwrapped.world.moveable(mapped_x, mapped_y)
        reward, is_over = env.action(action)

        if dump_play:
            moves.append([mapped_x, mapped_y])
            res = tuple(env.current_state[mapped_x][2:])
            new_pos.append((int(res[0]), int(res[1])))
            logits = as_numpy(output_dict['logits'].data[0])
            tops = np.argsort(p)[-10:][::-1]
            tops = list(
                map(lambda x: (env.mapping[x], float(p[x]), float(logits[x])), tops))
            policies.append(tops)

        # For now, assume reward=1 only when succeed, otherwise reward=0.
        # Manipulate the reward and get success information according to reward.
        if reward == 0 and args.penalty is not None:
            reward = args.penalty
        succ = 1 if is_over and reward > 0.99 else 0

        score += reward
        traj['states'].append(state)
        traj['rewards'].append(reward)
        traj['actions'].append(action)
        if not eval_only and dataset is not None and mapped_x != mapped_y:
            dataset.append(nr_objects, state, action, valid)

    # Dump json file as record of the playing.
    if dump_play and not (args.dump_fail_only and succ):
        array = array[:, 2:].astype('int32').tolist()
        array = [array[:nr_objects], array[nr_objects:]]
        json_str = json.dumps(
            # Let indent=True for an indented view of json files.
            dict(array=array, moves=moves, new_pos=new_pos, policies=policies))
        dump_file = os.path.join(
            args.current_dump_dir,
            '{}_blocks{}.json'.format(play_name, env.unwrapped.nr_blocks))
        with open(dump_file, 'w') as f:
            f.write(json_str)

    length = len(traj['rewards'])
    return succ, score, traj, length
def __getitem__(self, index):
    if self._value is None:
        self._value = random.rand()
    time.sleep(0.1)
    return as_tensor(np.array([self._value]))
def forward(self, *args, cast_tensor=False, **kwargs):
    if cast_tensor:
        args = as_tensor(args)
        kwargs = as_tensor(kwargs)
    outputs = self._model(*args, **kwargs)
    return outputs
def step(self, feed_dict, reduce_func=default_reduce_func, cast_tensor=False):
    assert self._model.training, 'Cannot step an evaluation-mode model.'
    self.num_iters += 1
    self.trigger_event('step:before', self)
    loss_latent = 0.0
    if cast_tensor:
        feed_dict = as_tensor(feed_dict)

    begin = time.time()
    self.trigger_event('forward:before', self, feed_dict)
    rl_loss = 0.0
    if self.mode == 'warmup':
        loss, monitors, output_dict = self._model(feed_dict)
    else:
        if args.no_static:
            loss, monitors, output_dict = self._model(feed_dict, return_loss_matrix=True)
            y_hat = output_dict['pred'].detach()
        else:
            with torch.no_grad():
                #y_hat = self._static_model(feed_dict)['pred'].detach()
                static_model_output = self._static_model(feed_dict, return_loss_matrix=True)
                if isinstance(static_model_output, dict):
                    y_hat = static_model_output['pred'].detach()
                    output_dict = static_model_output
                else:
                    y_hat = static_model_output[2]['pred'].detach()
                    output_dict = static_model_output[2]

        keys = ['mask', 'n', 'query', 'count', 'is_ambiguous', 'qid',
                'target_set', 'relations', 'gtlt']
        expanded_feed_dict = {}
        for key in keys:
            if key in feed_dict:
                expanded_feed_dict[key] = expand_tensor(feed_dict[key], feed_dict["count"])

        # Unravel target set to obtain the different targets.
        expanded_feed_dict["target"] = unravel_tensor(feed_dict["target_set"], feed_dict["count"])
        # Copy the intermediate y for each target.
        y_hat = expand_tensor(y_hat, feed_dict["count"])

        # Insert the detached loss in the expanded_feed_dict for the deterministic latent model.
        #Pdb().set_trace()
        if 'loss_matrix' in output_dict:
            expanded_feed_dict['loss'] = unravel_tensor(
                output_dict['loss_matrix'], feed_dict['count']).detach()
        if args.latent_model == 'eg':
            expanded_feed_dict['minloss_eg_prob'] = unravel_minloss_epsilon_greedy(
                output_dict['loss_matrix'], feed_dict['count'], args.minloss_eg_eps).detach()

        # Compute the latent variable, i.e. the scores for each of the possible targets.
        z_latent = self._latent_model(expanded_feed_dict, y_hat, output_dict)['latent_z']

        # start_index and end_index are markers for start and end indices
        # of each query in the expanded feed dict.
        start_index = torch.cumsum(feed_dict["count"], 0) - feed_dict["count"]
        end_index = torch.cumsum(feed_dict["count"], 0)
        min_indices = []
        action_prob = []
        #rl_weights = []
        weights = []
        # Loop over each query.
        for s, e in zip(start_index, end_index):
            dis2 = z_latent[s:e].squeeze(1)
            probs = get_prob_from_dis(dis2)
            weights.append(
                F.pad(probs, (0, feed_dict['target_set'].size(1) - probs.size(0)), "constant", 0))

        selected_feed_dict = feed_dict
        if args.rl_exploration:
            selected_feed_dict["weights"] = rl_sampling(torch.stack(weights).detach().clone())
        else:
            selected_feed_dict["weights"] = torch.stack(weights).detach().clone()

        loss = 0
        if not args.no_static:
            # Pdb().set_trace()
            loss, monitors, output_dict = self._model(selected_feed_dict)
        else:
            loss = (output_dict['loss_matrix'] * selected_feed_dict['weights']).sum() \
                / selected_feed_dict['weights'].sum()

        if feed_dict['is_ambiguous'].sum() > 0:
            if not args.rl_exploration:
                avg_reward = ((output_dict['reward'] * feed_dict['mask'].float()).sum(dim=1) /
                              feed_dict['mask'].sum(dim=1).float()).unsqueeze(-1)
                #avg_reward = (output_dict['reward']*(feed_dict['mask'].float())).sum()/(feed_dict['mask'].sum().float())
                rewards = (output_dict['reward'] - avg_reward) * feed_dict['mask'].float()
                rl_loss = -1.0 * (rewards * torch.stack(weights)).sum() \
                    / feed_dict['is_ambiguous'].sum()
            else:
                # Use selected_feed_dict['weights']; rewards should be only for non-zero samples.
                # Also, now we use REINFORCE: maximize reward * log(p_action).
                rl_loss = -1.0 * (
                    (output_dict['reward'] + 0.5) * selected_feed_dict['weights'] *
                    torch.log(torch.stack(weights) + 1.0 - selected_feed_dict['weights'])
                ).sum() / feed_dict['is_ambiguous'].sum().float()
        loss_latent = rl_loss

    self.trigger_event('forward:after', self, feed_dict, loss, monitors, output_dict)

    loss = reduce_func('loss', loss)
    loss_f = as_float(loss)
    monitors = {k: reduce_func(k, v) for k, v in monitors.items()}
    if self.mode == 'hot':
        monitors['loss_latent'] = loss_latent
    monitors_f = as_float(monitors)

    self._optimizer.zero_grad()
    if self.mode in ['hot']:
        if torch.is_tensor(loss_latent):
            loss_latent = reduce_func('loss_latent', loss_latent)
        # self._latent_optimizer.zero_grad()

    self.trigger_event('backward:before', self, feed_dict, loss, monitors, output_dict)
    if loss.requires_grad:
        loss.backward()
        if self.mode in ['hot']:
            if torch.is_tensor(loss_latent):
                loss_latent.backward()
    # print("Grad:",self._latent_model.digit_embed.weight.grad[2,:2],self._latent_model.atn_across_steps.grad)
    # Pdb().set_trace()
    #print('Latent: ',self.digit_embed.weight.data[2,:4], self.row_embed.weight.data[2,:4])
    #print('Atn over steps: ',self.atn_across_steps)
    self.trigger_event('backward:after', self, feed_dict, loss, monitors, output_dict)

    loss_latent_f = loss_latent.item() if torch.is_tensor(loss_latent) else loss_latent
    (grad_norm_before_clip, grad_norm_after_clip, param_norm_before_clip,
     lgrad_norm_before_clip, lgrad_norm_after_clip, lparam_norm_before_clip) = 0, 0, 0, -1, -1, 0

    if loss.requires_grad:
        grad_norm_before_clip, grad_norm_after_clip, param_norm_before_clip = \
            utils.gradient_normalization(self._model, grad_norm=args.grad_clip)
        #glogger.info(','.join(map(lambda x: str(round(x,6)),[self.current_epoch, self.num_iters, loss_f, loss_latent_f, grad_norm_before_clip.item(), grad_norm_after_clip.item(), param_norm_before_clip.item()])))
        if grad_norm_before_clip <= args.upper_limit_on_grad_norm:
            self._optimizer.step()
        else:
            self.num_bad_updates += 1
            logger.info('not taking optim step. Grad too high {}. Num bad updates: {}'.format(
                round(grad_norm_before_clip, 2), self.num_bad_updates))
            #self._optimizer.step()
        if self.mode in ['hot']:
            lgrad_norm_before_clip, lgrad_norm_after_clip, lparam_norm_before_clip = \
                utils.gradient_normalization(self._latent_model, grad_norm=args.grad_clip)
            self._latent_optimizer.step()

    glogger.info(','.join(
        map(lambda x: str(round(x, 6)), [
            self.current_epoch, self.num_iters, loss_f, loss_latent_f,
            grad_norm_before_clip, grad_norm_after_clip, param_norm_before_clip,
            lgrad_norm_before_clip, lgrad_norm_after_clip, lparam_norm_before_clip
        ])))
    end = time.time()

    self.trigger_event('step:after', self)
    return loss_f, monitors_f, output_dict, {'time/gpu': end - begin}
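# Minimal, self-contained sketch of the REINFORCE estimator used in the
# exploration branch above (score-function gradient of an expected reward);
# the toy policy and reward table are made up for illustration.
def _demo_reinforce_step():
    import torch
    logits = torch.zeros(3, requires_grad=True)          # scores over 3 candidate targets
    probs = torch.softmax(logits, dim=0)
    action = torch.multinomial(probs, 1).item()          # sample one target
    reward = torch.tensor([0.0, 1.0, 0.0])[action]       # made-up reward per target
    loss = -reward * torch.log(probs[action])            # REINFORCE surrogate: -r * log pi(a)
    loss.backward()                                      # gradient w.r.t. the latent scores
    assert logits.grad is not None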
def _instance_accuracy(label, raw_pred, compare_func, return_float=True, feed_dict=None, args=None):
    """Get instance-wise accuracy for a structured prediction task instead of a pointwise task."""
    # Discretize output predictions.
    if not args.task_is_sudoku:
        pred = as_tensor(raw_pred)
        pred = (pred > 0.5).float()
    else:
        step_pred = as_tensor(raw_pred.argmax(dim=1)).float()
        pred = step_pred[:, :, -1]
        # step_pred is batch_size x 81 x num_steps;
        # transpose for more efficient reward calculation.
        # New shape is batch_size x num_steps x 81.
        step_pred = step_pred.transpose(1, 2)

    label = as_tensor(label).type(pred.dtype)

    diff = (label == pred)
    point_acc = torch.sum(diff).float() / label.numel()
    incorrect = torch.min(diff, dim=1)[0]
    in_acc = torch.sum(incorrect).float() / len(label)

    errors = []
    corrected_acc = 0
    reward = []
    new_targets = []
    acc_vector = []
    for i, x in enumerate(pred):
        if compare_func(x, feed_dict['query'][i].type(x.dtype)):
            corrected_acc += 1
            acc_vector.append(1)
            # Check if pred matches any target.
            if ((feed_dict['target_set'][i].type(x.dtype) == x).sum(dim=1) == x.shape[0]).sum() > 0:
                new_targets.append((None, None))
            else:
                new_targets.append((x, 0))
        else:
            acc_vector.append(0)
            errors.append(feed_dict["count"][i].item())
            new_targets.append((None, None))

        if args.task_is_sudoku:
            #if args.use_gpu:
            #    diff = torch.zeros(len(feed_dict['target_set'][i]), step_pred.shape[1], device=torch.device("cuda"))
            #else:
            #    diff = torch.zeros(len(feed_dict['target_set'][i]), step_pred.shape[1]).cuda()
            #for target_idx, target in enumerate(feed_dict['target_set'][i, :feed_dict['count'][i]].float()):
            #    diff[target_idx] = torch.sum(~(step_pred[i] == target), dim=1).float()
            #for target_idx in range(feed_dict['count'][i], diff.shape[0]):
            #    diff[target_idx] = diff[target_idx - 1]
            #
            # Alternative tensor way.
            NS, NN, TS = step_pred.size(1), step_pred.size(2), feed_dict['target_set'].size(1)
            diff = (step_pred[i].unsqueeze(-1).expand(NS, NN, TS).transpose(0, 2).float() !=
                    feed_dict['target_set'][i].unsqueeze(-1).expand(TS, NN, NS).float()
                    ).sum(dim=1).float()
            if args.rl_reward == 'count':
                reward.append(diff.mean(dim=1))
            else:
                reward.append(torch.clamp_max(diff, 1).mean(dim=1))
        else:
            diff = torch.sum(~(feed_dict["target_set"][i].type(x.dtype) == x), dim=1).float()
            if args.rl_reward == 'count':
                reward.append(diff)
            else:
                reward.append(torch.clamp_max(diff, 1))

    corrected_acc /= len(pred)
    reward = -torch.stack(reward)
    target_set_accuracy = (reward.max(dim=1)[0] >= 0).float().mean()

    if return_float:
        return {
            "accuracy": in_acc.item(),
            "corrected accuracy": corrected_acc,
            "pointwise accuracy": point_acc.item(),
            "target set accuracy": target_set_accuracy.item()
        }, errors, reward  # , acc_vector
    return {
        "accuracy": torch.tensor(in_acc),
        "corrected accuracy": torch.tensor(corrected_acc),
        "pointwise accuracy": point_acc,
        "target set accuracy": target_set_accuracy
    }, errors, reward, new_targets
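# Toy illustration (made-up shapes, not part of the original code) of the
# broadcasted mismatch count used in the "alternative tensor way" above: for
# every (target, step) pair, count the cells on which they disagree. The tiny
# tensors stand in for num_steps x 81 Sudoku grids against a target set.
def _demo_stepwise_target_mismatch():
    import torch
    step_pred_i = torch.tensor([[1, 2, 3], [1, 2, 4]]).float()   # NS=2 steps, NN=3 cells
    target_set_i = torch.tensor([[1, 2, 3], [9, 9, 9]]).float()  # TS=2 targets
    NS, NN = step_pred_i.shape
    TS = target_set_i.shape[0]
    diff = (step_pred_i.unsqueeze(-1).expand(NS, NN, TS).transpose(0, 2) !=
            target_set_i.unsqueeze(-1).expand(TS, NN, NS)).sum(dim=1).float()
    assert diff.tolist() == [[0.0, 1.0], [3.0, 3.0]]             # rows: targets, cols: steps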
def _get_result_given_player(self, index, meters, number, player, mode):
    assert mode in ['train', 'test', 'mining', 'mining-deter', 'mining-stoch',
                    'inherit', 'test-inter', 'test-inter-deter', 'test-deter']
    params = dict(
        eval_only=True,
        number=number,
        play_name='{}_epoch{}_episode{}'.format(mode, self.current_epoch, index))
    backup = None
    if mode == 'train':
        params['eval_only'] = False
        params['dataset'] = self.valid_action_dataset
        params['entropy_beta'] = self.entropy_beta
        meters.update(lr=self.lr, entropy_beta=self.entropy_beta)
    elif 'test' in mode:
        params['dump'] = True
        params['use_argmax'] = 'deter' in mode
    else:
        backup = copy.deepcopy(player)
        params['use_argmax'] = index < (args.mining_epoch_size // 2)

    if mode == 'train':
        if args.use_gpu:
            self.model.cpu()
        mergedfc = []
        for i in range(args.ntrajectory):
            succ, score, traj, length, optimal = run_episode(
                player, self.model, mode, need_restart=(i != 0), **params)
            if args.task in ['sort', 'path']:
                meters.update(number=number, succ=succ, score=score, length=length, optimal=optimal)
            else:
                meters.update(number=number, succ=succ, score=score, length=length)

            feed_dict = make_data(traj, args.gamma)
            # Content from the valid_move dataset.
            if args.pred_weight != 0.0:
                states, actions, labels = self.valid_action_dataset.sample_batch(args.batch_size)
                feed_dict['pred_states'] = as_tensor(states)
                feed_dict['pred_actions'] = as_tensor(actions)
                feed_dict['valid'] = as_tensor(labels).float()
            mergedfc.append(feed_dict)

        for k in feed_dict.keys():
            if k not in ["rewards", "entropy_beta"]:
                # Rewards are not used to update the loss.
                if type(mergedfc[0][k]) is list:
                    f1 = [j[k][0] for j in mergedfc]
                    f2 = [j[k][1] for j in mergedfc]
                    feed_dict[k] = [torch.cat(f1, dim=0), torch.cat(f2, dim=0)]
                else:
                    feed_dict[k] = torch.cat([j[k] for j in mergedfc], dim=0)

        feed_dict['entropy_beta'] = as_tensor(self.entropy_beta).float()
        feed_dict['training'] = as_tensor(True)

        if args.norm_rewards:
            if args.accum_grad > 1:
                feed_dict['discount_rewards'] = self.model.rnorm.obs_filter(feed_dict['discount_rewards'])
            elif feed_dict['discount_rewards'].shape[0] > 1:
                feed_dict['discount_rewards'] = (
                    (feed_dict['discount_rewards'] - feed_dict['discount_rewards'].mean()) /
                    (feed_dict['discount_rewards'].std() + 10 ** -7))

        # Dirty trick: tell the accumulating optimizer the size of this batch.
        if args.accum_grad > 1:
            self.optimizer.provide_batch_size(feed_dict['discount_rewards'].shape[0])

        if args.use_gpu:
            feed_dict = as_cuda(feed_dict)
            self.model.cuda()
        self.model.train()
        return feed_dict
    else:
        if args.use_gpu:
            self.model.cpu()
        succ, score, traj, length, optimal = run_episode(player, self.model, mode, **params)
        if args.task in ['sort', 'path']:
            meters.update(number=number, succ=succ, score=score, length=length, optimal=optimal)
            message = ('> {} iter={iter}, number={number}, succ={succ}, '
                       'score={score:.4f}, length={length}, optimal={optimal}').format(
                           mode, iter=index, **meters.val)
        else:
            meters.update(number=number, succ=succ, score=score, length=length)
            message = ('> {} iter={iter}, number={number}, succ={succ}, '
                       'score={score:.4f}, length={length}').format(mode, iter=index, **meters.val)
        return message, dict(succ=succ, number=number, backup=backup)
def run_episode(env, model, mode, number, play_name='', dump=False, dataset=None,
                eval_only=False, use_argmax=False, need_restart=False, entropy_beta=0.0):
    """Run one episode using the model with $number blocks."""
    is_over = False
    traj = collections.defaultdict(list)
    score = 0
    if need_restart:
        env.restart()

    optimal = None
    if args.task == 'path':
        optimal = env.unwrapped.dist
        relation = env.unwrapped.graph.get_edges()
        relation = np.stack([relation, relation.T], axis=-1).astype(dtype=np.float32)
        st, ed = env.current_state
        nodes_trajectory = [int(st)]
        destination = int(ed)
        policies = []
    elif args.task == 'sort':
        optimal = env.unwrapped.optimal
        array = [str(i) for i in env.unwrapped.array]

    # If dump_play=True, store the states and actions in a json file
    # for visualization.
    dump_play = args.dump_play and dump
    if dump_play:
        nr_objects = number + 1
        array = env.unwrapped.current_state
        moves, new_pos, policies = [], [], []

    if args.model == 'dlm':
        # By default the network isn't in training mode during data collection,
        # but with dlm we don't want to use argmax only, except in 2 cases
        # (testing the interpretability or the last mining phase to get an
        # interpretable policy):
        if ('inter' in mode) or (('mining' in mode) or ('inherit' in mode) and number == args.curriculum_graduate):
            model.lowernoise()
        else:
            model.train(True)
            if args.dlm_noise == 1 and (('mining' in mode) or ('inherit' in mode) or ('test' in mode)):
                model.lowernoise()
            elif args.dlm_noise == 2:
                model.lowernoise()

    step = 0
    while not is_over:
        if args.task == 'path':
            st, ed = env.current_state
            state = np.zeros((relation.shape[0], 2), dtype=np.float32)
            state[st, 0] = 1
            state[ed, 1] = 1
            feed_dict = dict(states=[np.array([state]), np.array([relation])])
        else:
            state = env.current_state
            if 'nlrl' not in args.task or args.task == 'sort':
                feed_dict = dict(states=np.array([state]))
            else:
                feed_dict = dict(states=state)
        feed_dict['entropy_beta'] = as_tensor(entropy_beta).float()
        feed_dict['training'] = as_tensor(False)
        feed_dict = as_tensor(feed_dict)

        with torch.set_grad_enabled(False):
            output_dict = model(feed_dict)

        policy = output_dict['policy']
        p = as_numpy(policy.data[0])
        action = p.argmax() if use_argmax else random.choice(len(p), p=p)

        if args.pred_weight != 0.0:
            # Need to ensure that env.utils.MapActionProxy is the outermost class.
            mapped_x, mapped_y = env.mapping[action]
            # env.unwrapped to get the innermost Env class.
            valid = env.unwrapped.world.moveable(mapped_x, mapped_y)
        reward, is_over = env.action(action)
        step += 1

        if dump_play:
            moves.append([mapped_x, mapped_y])
            res = tuple(env.current_state[mapped_x][2:])
            new_pos.append((int(res[0]), int(res[1])))
            logits = as_numpy(output_dict['logits'].data[0])
            tops = np.argsort(p)[-10:][::-1]
            tops = list(
                map(lambda x: (env.mapping[x], float(p[x]), float(logits[x])), tops))
            policies.append(tops)

        # For now, assume reward=1 only when succeed, otherwise reward=0.
        # Manipulate the reward and get success information according to reward.
        if reward == 0 and args.penalty is not None:
            reward = args.penalty
        succ = 1 if is_over and reward > 0.99 else 0

        score += reward
        if type(feed_dict['states']) is list:
            traj['states'].append([f for f in feed_dict['states']])
        else:
            traj['states'].append(state)
        traj['rewards'].append(reward)
        traj['actions'].append(action)
        if args.pred_weight != 0.0:
            if not eval_only and dataset is not None and mapped_x != mapped_y:
                dataset.append(nr_objects, state, action, valid)

    # Dump json file as record of the playing.
    if dump_play and not (args.dump_fail_only and succ):
        array = array[:, 2:].astype('int32').tolist()
        array = [array[:nr_objects], array[nr_objects:]]
        json_str = json.dumps(
            # Let indent=True for an indented view of json files.
            dict(array=array, moves=moves, new_pos=new_pos, policies=policies))
        dump_file = os.path.join(
            args.current_dump_dir,
            '{}_blocks{}.json'.format(play_name, env.unwrapped.nr_blocks))
        with open(dump_file, 'w') as f:
            f.write(json_str)

    length = step
    if args.model == 'dlm':
        model.restorenoise()
    return succ, score, traj, length, optimal