def _train_policy(self, metrics: dict, D: ExperienceReplay, epoch: int,
                  global_prior, free_nats):
    losses = []
    # Train a fresh copy of the policy by imitation, then soft-update the live policy.
    policy_ = deepcopy(self.policy.module)
    optimizer = optim.Adam(policy_.parameters(),
                           lr=cfg.learning_rate,
                           eps=cfg.adam_epsilon)
    O, A, _, _ = D.get_last()
    B, S_pos = torch.zeros(O.size(1) + 1, cfg.belief_size), torch.zeros(
        O.size(1) + 1, cfg.state_size)
    with torch.no_grad():
        a_0 = torch.zeros(1, 1, A.size(2))
        b_0 = torch.zeros(1, cfg.belief_size).cuda()
        s_0 = torch.zeros(1, cfg.state_size).cuda()
        # Roll the transition model over the last episode to obtain beliefs and
        # posterior states for every timestep.
        B, _, _, _, S_pos, _, _ = self.wm.t_model(
            s_0,
            torch.cat((a_0, A), dim=1)[:, :-1, :], b_0, self.wm.e_model(O))
        B = torch.cat((b_0.unsqueeze(0), B), 1).squeeze(0)
        S_pos = torch.cat((s_0.unsqueeze(0), S_pos), 1).squeeze(0)
        # Query the expert planner in batches for target actions.
        A_tgt = torch.zeros(B.size(0), A.size(-1))
        for ii in tqdm(list(chunks(list(range(A_tgt.size(0))),
                                   cfg.batch_size)),
                       desc=poem(f"{epoch} Query Expert"),
                       leave=False):
            A_tgt[ii] = self.planner(B[ii], S_pos[ii])
        A_tgt = A_tgt.cuda()
    # Behaviour cloning: regress the policy copy onto the planner's actions.
    for _ in tqdm(range(cfg.collect_interval_plcy),
                  desc=poem(f"{epoch} Policy Train"),
                  leave=False):
        ii = random.sample(range(A_tgt.size(0)), cfg.batch_size)
        A_pred = policy_(B[ii], S_pos[ii])
        loss = F.mse_loss(A_pred, A_tgt[ii], reduction='mean')
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    metrics['im_loss'].append(mean(losses))
    # Blend the freshly trained weights into the live policy.
    soft_update(self.policy.module, policy_, cfg.linear_policy_update)
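# NOTE: `chunks` and `soft_update` used above are repo utilities not shown in this
# file. The definitions below are hedged, minimal sketches of the assumed behaviour
# (split an index list into batch-sized pieces; Polyak-average parameters); the
# repo's actual implementations may differ. Names carry a `_sketch` suffix to mark
# them as illustrative.
def chunks_sketch(lst, n):
    """Yield successive chunks of at most `n` items from `lst` (assumption)."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


def soft_update_sketch(target, source, tau):
    """Polyak update: target <- (1 - tau) * target + tau * source (assumption)."""
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_((1.0 - tau) * t_param.data + tau * s_param.data)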
def _reset_realsense(self, wait=False):
    # Issue a hardware reset to every connected RealSense device.
    ctx = rs.context()
    devices = ctx.query_devices()
    for dev in devices:
        dev.hardware_reset()
    if wait:
        # Give the cameras time to come back up; abort early if the parent
        # process has signalled a shutdown via the queue.
        for _ in tqdm(range(30), desc=poem("Resetting Realsense Camera")):
            time.sleep(1)
            if not self.local_queue.empty():
                sys.exit(0)
def _train_policy(self, metrics: dict, D: ExperienceReplay, epoch: int,
                  global_prior, free_nats):
    self.wm.eval()
    self.policy.train()
    losses = []
    for _ in tqdm(range(cfg.collect_interval),
                  desc=poem(f"{epoch} Policy Interval"),
                  leave=False):
        O, A, _, M = D.sample()
        with torch.no_grad():
            # Infer beliefs and posterior states for the sampled chunk; these
            # serve as starting points for the imagined rollouts below.
            b_0, _, _, _, s_0, _, _ = self.wm.t_model(
                torch.zeros(cfg.batch_size, cfg.state_size), A[:, :-1],
                torch.zeros(cfg.batch_size, cfg.belief_size),
                bottle(self.wm.e_model, (O[:, 1:], )), M[:, :-1])
            b_0 = b_0.view(-1, cfg.belief_size)
            s_0 = s_0.view(cfg.batch_size * (cfg.chunk_size - 1),
                           cfg.state_size)
            m0 = M[:, 1:].reshape(cfg.batch_size * (cfg.chunk_size - 1)).byte()
            # b_0, s_0 = b_0[m0], s_0[m0]
        # Unroll the policy through the learned transition model for
        # `planning_horizon` imagined steps.
        T = cfg.planning_horizon + 1
        B, S = [torch.empty(0)] * T, [torch.empty(0)] * T
        B[0], S[0] = b_0, s_0
        for t in range(T - 1):  # forward actions
            A_t = self.policy(B[t], S[t])
            b_t, s_t, _, _ = self.wm.t_model(S[t], A_t.unsqueeze(dim=1), B[t])
            B[t + 1], S[t + 1] = b_t.squeeze(dim=1), s_t.squeeze(dim=1)
        # Maximise predicted reward over the imagined trajectory.
        loss = -self.wm.r_model(torch.cat(B, dim=0),
                                torch.cat(S, dim=0)).mean()
        if cfg.learning_rate_schedule != 0:
            self._linearly_ramping_lr(self.plcy_optimizer,
                                      cfg.learning_rate_plcy)
        self.plcy_optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.policy.parameters(),
                                 cfg.grad_clip_norm,
                                 norm_type=2)
        self.plcy_optimizer.step()
        losses.append(loss.item())
    metrics['p_loss'].append(mean(losses))
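# NOTE: `bottle` used above (and in `_train_worldmodel`) is the PlaNet-style helper
# that applies a module to a (batch, time, ...) tensor by flattening the first two
# dimensions. The repo's version is not shown here; this is a hedged sketch of the
# assumed behaviour with an illustrative name.
def bottle_sketch(f, x_tuple):
    """Apply `f` over flattened batch/time dims, then restore them (assumption)."""
    sizes = [x.size() for x in x_tuple]
    out = f(*(x.reshape(s[0] * s[1], *s[2:]) for x, s in zip(x_tuple, sizes)))
    return out.reshape(sizes[0][0], sizes[0][1], *out.size()[1:])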
def collect_interval(self, metrics: dict, D: ExperienceReplay, epoch: int,
                     save_loc=''):
    self.wm.eval()
    self.policy.eval()
    frames = []
    with torch.no_grad():
        o, r_tot = torch.tensor(self.env.reset(), dtype=torch.float32), 0
        b, s_post = torch.zeros(1, cfg.belief_size), torch.zeros(
            1, cfg.state_size)
        a = torch.zeros(1, self.env.action_size)
        for t in tqdm(range(ceil(cfg.max_episode_length / cfg.action_repeat)),
                      desc=poem(f"{epoch} Collection"),
                      leave=False):
            # Update belief and posterior state with the latest observation and action.
            b, _, _, _, s_post, _, _ = self.wm.t_model(
                s_post, a.unsqueeze(dim=1), b,
                self.wm.e_model(o.unsqueeze(dim=0)).unsqueeze(dim=0))
            b, s_post = b.squeeze(dim=1), s_post.squeeze(
                dim=1)  # remove time dimension
            # Act with exploration noise, clipped to the valid action range.
            a = torch.clamp(
                self.policy(b, s_post).cpu() +
                self.action_noise * torch.randn_like(a), -1., 1.)
            o_, r, done = self.env.step(
                a.view(self.env.action_size).numpy())
            frames.append(self.env.render())
            D.push(o, a.view(self.env.action_size), r, done)
            r_tot += r
            o = torch.tensor(o_, dtype=torch.float32)
            if done:
                break
    if cfg.action_noise_schedule != 0:
        self._linearly_ramping_an()
    metrics['steps'].append(t if len(metrics['steps']) == 0 else t +
                            metrics['steps'][-1])
    metrics['episodes'].append(epoch)
    metrics['rewards'].append(r_tot)
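# NOTE: `_linearly_ramping_an` called above is not shown in this file. A hedged
# sketch, assuming it linearly anneals the exploration noise towards a configured
# floor over `cfg.action_noise_schedule` episodes; `cfg.action_noise_min` and the
# method name suffix are assumptions for illustration only.
def _linearly_ramping_an_sketch(self):
    step = (cfg.action_noise - cfg.action_noise_min) / cfg.action_noise_schedule
    self.action_noise = max(cfg.action_noise_min, self.action_noise - step)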
def _train_worldmodel(self, metrics: dict, D: ExperienceReplay, epoch: int,
                      global_prior, free_nats):
    losses = []
    for _ in tqdm(range(cfg.collect_interval_worm),
                  desc=poem(f"{epoch} Train Interval"),
                  leave=False):
        # self.optimizer.zero_grad()
        O, A, R, M = D.sample()
        b_0 = torch.zeros(cfg.batch_size, cfg.belief_size)
        s_0 = torch.zeros(cfg.batch_size, cfg.state_size)
        # Y := B, S_pri, MU_pri, STD_pri, S_pos, MU_pos, STD_pos
        Y = self.wm.t_model(s_0, A[:, :-1], b_0,
                            bottle(self.wm.e_model, (O[:, 1:], )), M[:, :-1])
        # Observation, reward and KL losses for the standard ELBO objective.
        o_loss, r_loss, kl_loss = self._reconstruction_loss(
            Y, O, R, free_nats, global_prior)
        if cfg.overshooting_kl_beta != 0:
            kl_loss += self._latent_overshooting(Y, A, M, free_nats)
        if cfg.learning_rate_schedule != 0:
            self._linearly_ramping_lr(self.wm_optimizer)
        self.wm_optimizer.zero_grad()
        (o_loss + r_loss + kl_loss).backward()
        nn.utils.clip_grad_norm_(self.param_list,
                                 cfg.grad_clip_norm,
                                 norm_type=2)
        self.wm_optimizer.step()
        losses.append([o_loss.item(), r_loss.item(), kl_loss.item()])
    o_loss, r_loss, kl_loss = tuple(zip(*losses))
    metrics['o_loss'].append(mean(o_loss))
    metrics['r_loss'].append(mean(r_loss))
    metrics['kl_loss'].append(mean(kl_loss))
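# NOTE: `_linearly_ramping_lr` called above (and in the policy trainer) is not
# shown in this file. A hedged sketch, assuming it ramps each param group's
# learning rate from zero up to the target over `cfg.learning_rate_schedule`
# updates; the method name suffix and default argument are illustrative.
def _linearly_ramping_lr_sketch(self, optimizer, lr=None):
    target = cfg.learning_rate if lr is None else lr
    for group in optimizer.param_groups:
        # Increase by a fixed increment per call, capped at the target rate.
        group['lr'] = min(group['lr'] + target / cfg.learning_rate_schedule,
                          target)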