def get_action(self, ob, sample=True, *args, **kwargs):
    self.eval_mode()
    t_ob = {key: torch_float(ob[key], device=cfg.alg.device) for key in ob}
    act_dist_cont, act_dist_disc, val = self.get_act_val(t_ob)
    action_cont = action_from_dist(act_dist_cont, sample=sample)
    action_discrete = action_from_dist(act_dist_disc, sample=sample)
    log_prob_disc = action_log_prob(action_discrete, act_dist_disc)
    log_prob_cont = action_log_prob(action_cont, act_dist_cont)
    entropy_disc = action_entropy(act_dist_disc, log_prob_disc)
    entropy_cont = action_entropy(act_dist_cont, log_prob_cont)
    # The continuous and discrete heads are treated as independent, so the
    # joint log-probability (and entropy) is the sum of the two parts, with
    # the discrete part summed over its action dimensions.
    log_prob = log_prob_cont + torch.sum(log_prob_disc, dim=1)
    entropy = entropy_cont + torch.sum(entropy_disc, dim=1)
    action_info = dict(log_prob=torch_to_np(log_prob),
                       entropy=torch_to_np(entropy),
                       val=torch_to_np(val))
    action = np.concatenate(
        (torch_to_np(action_cont), torch_to_np(action_discrete)), axis=1)
    return action, action_info
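# Minimal standalone sketch (not part of the class above) of why the hybrid
# log-probability is the continuous term plus the per-dimension discrete
# terms: the two heads are modeled as independent distributions. The batch
# size and dimensions below are illustrative assumptions.
import torch
from torch.distributions import Categorical, Independent, Normal

batch, cont_dim, n_disc, n_choices = 4, 3, 2, 5
cont_dist = Independent(Normal(torch.zeros(batch, cont_dim),
                               torch.ones(batch, cont_dim)), 1)
disc_dist = Categorical(logits=torch.zeros(batch, n_disc, n_choices))
a_cont = cont_dist.sample()   # shape: (batch, cont_dim)
a_disc = disc_dist.sample()   # shape: (batch, n_disc)
joint_log_prob = cont_dist.log_prob(a_cont) + disc_dist.log_prob(a_disc).sum(dim=1)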
def update_q(self, obs, actions, next_obs, rewards, dones):
    q1 = self.q1((obs, actions))[0]
    q2 = self.q2((obs, actions))[0]
    with torch.no_grad():
        next_act_dist = self.actor(next_obs)[0]
        next_actions = action_from_dist(next_act_dist, sample=True)
        nlog_prob = action_log_prob(next_actions, next_act_dist).unsqueeze(-1)
        nq1_tgt_val = self.q1_tgt((next_obs, next_actions))[0]
        nq2_tgt_val = self.q2_tgt((next_obs, next_actions))[0]
        # Soft Bellman target: take the minimum of the two target critics and
        # subtract the entropy term (alpha * log-prob of the next action).
        nq_tgt_val = torch.min(nq1_tgt_val, nq2_tgt_val) - self.alpha * nlog_prob
        q_tgt_val = rewards + cfg.alg.rew_discount * (1 - dones) * nq_tgt_val
    loss_q1 = F.mse_loss(q1, q_tgt_val)
    loss_q2 = F.mse_loss(q2, q_tgt_val)
    loss_q = loss_q1 + loss_q2
    self.q_optimizer.zero_grad()
    loss_q.backward()
    grad_norm = clip_grad(self.q_params, cfg.alg.max_grad_norm)
    self.q_optimizer.step()
    q_info = dict(
        q1_loss=loss_q1.item(),
        q2_loss=loss_q2.item(),
        vec_q1_val=torch_to_np(q1),
        vec_q2_val=torch_to_np(q2),
        vec_q_tgt_val=torch_to_np(q_tgt_val),
    )
    q_info['q_grad_norm'] = grad_norm
    return q_info
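# Hedged sketch: a SAC-style critic step is typically followed by a Polyak
# (exponential moving average) update of the target critics. The source above
# does not show how self.q1_tgt / self.q2_tgt are refreshed; this helper and
# the coefficient name `tau` are illustrative assumptions only.
import torch

@torch.no_grad()
def polyak_update(net, tgt_net, tau=0.005):
    # Move each target parameter a small step toward the online parameter.
    for p, p_tgt in zip(net.parameters(), tgt_net.parameters()):
        p_tgt.data.mul_(1.0 - tau).add_(tau * p.data)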
def get_action(self, ob, sample=True, *args, **kwargs):
    self.eval_mode()
    t_ob = torch_float(ob, device=cfg.alg.device)
    act_dist, val = self.get_act_val(t_ob)
    action = action_from_dist(act_dist, sample=sample)
    log_prob = action_log_prob(action, act_dist)
    entropy = action_entropy(act_dist, log_prob)
    action_info = dict(log_prob=torch_to_np(log_prob),
                       entropy=torch_to_np(entropy),
                       val=torch_to_np(val))
    return torch_to_np(action), action_info
def get_action(self, ob, sample=True, hidden_state=None, *args, **kwargs):
    self.eval_mode()
    # Add a time dimension of length 1 so the recurrent network sees
    # [batch, time, ...] input.
    t_ob = torch.from_numpy(ob).float().to(cfg.alg.device).unsqueeze(dim=1)
    act_dist, val, out_hidden_state = self.get_act_val(
        t_ob, hidden_state=hidden_state)
    action = action_from_dist(act_dist, sample=sample)
    log_prob = action_log_prob(action, act_dist)
    entropy = action_entropy(act_dist, log_prob)
    action_info = dict(
        log_prob=torch_to_np(log_prob.squeeze(1)),
        entropy=torch_to_np(entropy.squeeze(1)),
        val=torch_to_np(val.squeeze(1)),
    )
    return torch_to_np(action.squeeze(1)), action_info, out_hidden_state
def optimize(self, data, *args, **kwargs):
    processed_data = self.optim_preprocess(data)
    processed_data['entropy'] = torch.mean(processed_data['entropy'])
    loss_res = self.cal_loss(**processed_data)
    loss, pg_loss, vf_loss, ratio = loss_res
    self.optimizer.zero_grad()
    loss.backward()
    grad_norm = clip_grad(self.all_params, cfg.alg.max_grad_norm)
    self.optimizer.step()
    with torch.no_grad():
        # Diagnostics only: approximate KL between the old and current policy
        # via the 0.5 * E[(log pi_old - log pi)^2] estimator, and the fraction
        # of samples whose probability ratio fell outside the clip range.
        approx_kl = 0.5 * torch.mean(
            torch.pow(
                processed_data['old_log_prob'] - processed_data['log_prob'],
                2))
        clip_frac = np.mean(
            np.abs(torch_to_np(ratio) - 1.0) > cfg.alg.clip_range)
    optim_info = dict(pg_loss=pg_loss.item(),
                      vf_loss=vf_loss.item(),
                      total_loss=loss.item(),
                      entropy=processed_data['entropy'].item(),
                      approx_kl=approx_kl.item(),
                      clip_frac=clip_frac)
    optim_info['grad_norm'] = grad_norm
    return optim_info
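# Hedged sketch of a PPO-style clipped surrogate objective, since cal_loss()
# is not shown above. All names and coefficients (adv, ret, vf_coef, ent_coef)
# are assumptions; the actual loss used by this class may differ.
import torch
import torch.nn.functional as F

def ppo_surrogate(log_prob, old_log_prob, adv, val, ret, entropy,
                  clip_range=0.2, vf_coef=0.5, ent_coef=0.01):
    # Probability ratio between the current and the data-collecting policy.
    ratio = torch.exp(log_prob - old_log_prob)
    # Clipped policy-gradient term.
    pg_loss = -torch.min(
        ratio * adv,
        torch.clamp(ratio, 1 - clip_range, 1 + clip_range) * adv).mean()
    # Value-function regression and entropy bonus.
    vf_loss = F.mse_loss(val, ret)
    loss = pg_loss + vf_coef * vf_loss - ent_coef * entropy
    return loss, pg_loss, vf_loss, ratio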
def get_action(self, ob, sample=True, *args, **kwargs):
    self.eval_mode()
    ob = torch_float(ob, device=cfg.alg.device)
    act_dist = self.actor(ob)[0]
    action = action_from_dist(act_dist, sample=sample)
    action_info = dict()
    return torch_to_np(action), action_info
def get_action(self, ob, sample=True, hidden_state=None, *args, **kwargs):
    self.eval_mode()
    if isinstance(ob, dict):
        t_ob = {
            key: torch_float(ob[key], device=cfg.alg.device)
            for key in ob
        }
    else:
        # Add a time dimension of length 1 for the recurrent network.
        t_ob = torch.from_numpy(ob).float().to(
            cfg.alg.device).unsqueeze(dim=1)
    act_dist, val, out_hidden_state = self.get_act_val(
        t_ob, hidden_state=hidden_state)
    action = action_from_dist(act_dist, sample=sample)
    log_prob = action_log_prob(action, act_dist)
    entropy = action_entropy(act_dist, log_prob)
    in_hidden_state = (torch_to_np(hidden_state)
                       if hidden_state is not None else hidden_state)
    action_info = dict(log_prob=torch_to_np(log_prob.squeeze(1)),
                       entropy=torch_to_np(entropy.squeeze(1)),
                       val=torch_to_np(val.squeeze(1)),
                       in_hidden_state=in_hidden_state)
    return torch_to_np(action.squeeze(1)), action_info, out_hidden_state
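# Hedged usage sketch for the recurrent get_action() above: the hidden state
# it returns is fed back in on the next call and dropped (set to None) when
# all episodes end, so the next rollout starts from a fresh state. `env` and
# `agent` are placeholders, not objects defined in this code.
import numpy as np

ob = env.reset()
hidden_state = None
for _ in range(1000):
    action, action_info, hidden_state = agent.get_action(
        ob, sample=True, hidden_state=hidden_state)
    ob, reward, done, info = env.step(action)
    if np.all(done):
        ob = env.reset()
        hidden_state = None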
def __call__(self,
             time_steps,
             sample=True,
             evaluation=False,
             return_on_done=False,
             render=False,
             render_image=False,
             sleep_time=0,
             reset_first=False,
             reset_kwargs=None,
             action_kwargs=None,
             get_last_val=False):
    traj = Trajectory()
    if reset_kwargs is None:
        reset_kwargs = {}
    if action_kwargs is None:
        action_kwargs = {}
    if evaluation:
        env = self.eval_env
    else:
        env = self.train_env
    # In RL^2, we should always reset at the beginning of a rollout.
    if self.obs is None or reset_first or evaluation:
        self.reset(**reset_kwargs)
    ob = self.obs
    hidden_state = self.hidden_states
    # Some environments reuse the returned ob buffer in place, so deepcopy()
    # before storing it in the trajectory. Only copy when a new ob is
    # generated, so that traj[t].next_ob stays the same instance as
    # traj[t + 1].ob.
    ob = deepcopy(ob)
    if return_on_done:
        all_dones = np.zeros(env.num_envs, dtype=bool)
    else:
        all_dones = None
    done = None
    for t in range(time_steps):
        if render:
            env.render()
            if sleep_time > 0:
                time.sleep(sleep_time)
        if render_image:
            # Get render images at the same time step as ob.
            imgs = deepcopy(env.get_images())
        action, action_info, hidden_state = self.agent.get_action(
            ob, sample=sample, hidden_state=hidden_state, **action_kwargs)
        if self.hidden_state_shape is None:
            self.hidden_state_shape = hidden_state.shape
        next_ob, reward, done, info = env.step(action)
        if render_image:
            for img, inf in zip(imgs, info):
                inf['render_image'] = deepcopy(img)
        true_next_ob, true_done, all_dones = self.get_true_done_next_ob(
            next_ob, done, reward, info, all_dones)
        sd = StepData(
            ob=ob,
            action=action,
            action_info=action_info,
            next_ob=true_next_ob,
            reward=reward,
            done=true_done,
            info=info,
            # `extra` stores the raw done flag so we know whether the
            # environment was auto-reset and hence whether the hidden
            # state needs to be reset as well.
            extra=done,
        )
        ob = next_ob
        traj.add(sd)
        if return_on_done and np.all(all_dones):
            break
        # The order of the next few lines matters; do not exchange them.
        if get_last_val and not evaluation and t == time_steps - 1:
            last_val, _ = self.agent.get_val(traj[-1].next_ob_raw,
                                             hidden_state=hidden_state)
            if last_val is not None:
                traj.add_extra('last_val', torch_to_np(last_val))
            else:
                traj.add_extra('last_val', None)
        hidden_state = self.check_hidden_state(hidden_state, done=done)
    self.obs = ob if not evaluation else None
    self.hidden_states = hidden_state.detach() if not evaluation else None
    return traj
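# Hedged sketch of what check_hidden_state() above presumably does: zero the
# recurrent state of the sub-environments that just finished, so their next
# episode starts from a fresh hidden state. Only the call site appears in the
# source; this implementation and its assumed [num_layers, batch, dim] layout
# are illustrative assumptions.
import torch

def reset_done_hidden_state(hidden_state, done):
    if hidden_state is None or done is None:
        return hidden_state
    done_mask = torch.as_tensor(done, device=hidden_state.device,
                                dtype=torch.bool)
    hidden_state = hidden_state.clone()
    hidden_state[:, done_mask] = 0
    return hidden_state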
def __call__(self,
             time_steps,
             sample=True,
             evaluation=False,
             return_on_done=False,
             render=False,
             render_image=False,
             sleep_time=0,
             reset_first=False,
             reset_kwargs=None,
             action_kwargs=None,
             random_action=False,
             get_last_val=False):
    traj = Trajectory()
    if reset_kwargs is None:
        reset_kwargs = {}
    if action_kwargs is None:
        action_kwargs = {}
    if evaluation:
        env = self.eval_env
    else:
        env = self.train_env
    if self.obs is None or reset_first or evaluation:
        self.reset(env=env, **reset_kwargs)
    ob = self.obs
    # Some environments reuse the returned ob buffer in place, so deepcopy()
    # before storing it in the trajectory. Only copy when a new ob is
    # generated, so that traj[t].next_ob stays the same instance as
    # traj[t + 1].ob.
    ob = deepcopy(ob)
    if return_on_done:
        all_dones = np.zeros(env.num_envs, dtype=bool)
    else:
        all_dones = None
    for t in range(time_steps):
        if render:
            env.render()
            if sleep_time > 0:
                time.sleep(sleep_time)
        if render_image:
            # Get render images at the same time step as ob.
            imgs = get_render_images(env)
        if random_action:
            action = env.random_actions()
            action_info = dict()
        else:
            action, action_info = self.agent.get_action(ob,
                                                        sample=sample,
                                                        **action_kwargs)
        next_ob, reward, done, info = env.step(action)
        if render_image:
            for img, inf in zip(imgs, info):
                inf['render_image'] = deepcopy(img)
        true_next_ob, true_done, all_dones = self.get_true_done_next_ob(
            next_ob, done, reward, info, all_dones, skip_record=evaluation)
        sd = StepData(ob=ob,
                      action=action,
                      action_info=action_info,
                      next_ob=true_next_ob,
                      reward=reward,
                      done=true_done,
                      info=info)
        ob = next_ob
        traj.add(sd)
        if return_on_done and np.all(all_dones):
            break
    if get_last_val and not evaluation:
        last_val = self.agent.get_val(traj[-1].next_ob)
        traj.add_extra('last_val', torch_to_np(last_val))
    self.obs = ob if not evaluation else None
    return traj
def __call__(self,
             time_steps,
             sample=True,
             evaluation=False,
             return_on_done=False,
             render=False,
             render_image=False,
             sleep_time=0,
             reset_kwargs=None,
             action_kwargs=None):
    traj = Trajectory()
    if reset_kwargs is None:
        reset_kwargs = {}
    if action_kwargs is None:
        action_kwargs = {}
    if evaluation:
        env = self.eval_env
    else:
        env = self.train_env
    ob = env.reset(**reset_kwargs)
    # Some environments reuse the returned ob buffer in place, so deepcopy()
    # before storing it in the trajectory. Only copy when a new ob is
    # generated, so that traj[t].next_ob stays the same instance as
    # traj[t + 1].ob.
    ob = deepcopy(ob)
    if return_on_done:
        all_dones = np.zeros(env.num_envs, dtype=bool)
    for t in range(time_steps):
        if render:
            env.render()
            if sleep_time > 0:
                time.sleep(sleep_time)
        if render_image:
            # Get render images at the same time step as ob.
            imgs = deepcopy(env.get_images())
        action, action_info = self.agent.get_action(ob,
                                                    sample=sample,
                                                    **action_kwargs)
        next_ob, reward, done, info = env.step(action)
        next_ob = deepcopy(next_ob)
        if render_image:
            for img, inf in zip(imgs, info):
                inf['render_image'] = deepcopy(img)
        done_idx = np.argwhere(done).flatten()
        if done_idx.size > 0 and return_on_done:
            # The vec env resets a sub-environment automatically when it is
            # done, so the returned next_ob is not the true next observation.
            all_dones[done_idx] = True
        sd = StepData(ob=ob,
                      action=deepcopy(action),
                      action_info=deepcopy(action_info),
                      next_ob=next_ob,
                      reward=deepcopy(reward),
                      done=deepcopy(done),
                      info=deepcopy(info))
        ob = next_ob
        traj.add(sd)
        if return_on_done and np.all(all_dones):
            break
    if not evaluation:
        last_val = self.agent.get_val(traj[-1].next_ob_raw)
        traj.add_extra('last_val', torch_to_np(last_val))
    return traj
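# Hedged sketch: the `last_val` extra stored by the runners above is what a
# GAE-style advantage computation bootstraps from at the rollout boundary.
# This standalone function is illustrative only; the shapes are assumptions
# (rewards/dones/vals are [T, num_envs] arrays, last_val is [num_envs]).
import numpy as np

def gae_advantages(rewards, dones, vals, last_val, gamma=0.99, lam=0.95):
    T = rewards.shape[0]
    advs = np.zeros_like(rewards)
    gae = 0.0
    for t in reversed(range(T)):
        # Bootstrap from last_val at the final step of the rollout.
        next_val = last_val if t == T - 1 else vals[t + 1]
        delta = rewards[t] + gamma * (1 - dones[t]) * next_val - vals[t]
        gae = delta + gamma * lam * (1 - dones[t]) * gae
        advs[t] = gae
    return advs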