def run(self):
    for epoch_id in self.epoch_ids:
        reward_buf = 0
        for net_path in self.log_dir_helper.network_paths_at_epoch(epoch_id):
            # Load the checkpoint for this epoch and switch to eval mode
            self.network.load_state_dict(
                torch.load(net_path, map_location=lambda storage, loc: storage)
            )
            self.network.eval()

            internals = listd_to_dlist([self.network.new_internals(self.device)])
            next_obs = dtensor_to_dev(self.env_mgr.reset(), self.device)
            self.env_mgr.render()

            # Roll out a single episode, rendering each frame
            episode_complete = False
            while not episode_complete:
                obs = next_obs
                with torch.no_grad():
                    actions, _, internals = self.actor.act(
                        self.network, obs, internals
                    )
                next_obs, rewards, terminals, infos = self.env_mgr.step(actions)
                self.env_mgr.render()
                next_obs = dtensor_to_dev(next_obs, self.device)

                reward_buf += rewards[0]
                if terminals[0]:
                    episode_complete = True

            print(f"EPOCH_ID: {epoch_id} REWARD: {reward_buf}")
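# The render loop above leans on two small helpers. Below is a minimal sketch of
# what they are assumed to do (the real adept utilities may differ in details):
# listd_to_dlist turns a list of dicts into a dict of lists, and dtensor_to_dev
# moves every tensor of a dict-shaped observation onto a device.
import torch


def listd_to_dlist(list_of_dicts):
    """[{'hx': t0}, {'hx': t1}] -> {'hx': [t0, t1]} (assumed semantics)."""
    keys = list_of_dicts[0].keys()
    return {k: [d[k] for d in list_of_dicts] for k in keys}


def dtensor_to_dev(dict_of_tensors, device):
    """Move each tensor in an observation dict to the target device."""
    return {k: v.to(device) for k, v in dict_of_tensors.items()}


if __name__ == "__main__":
    internals = listd_to_dlist([{"hx": torch.zeros(2)}, {"hx": torch.ones(2)}])
    obs = dtensor_to_dev({"frame": torch.rand(1, 84, 84)}, torch.device("cpu"))
    print(internals["hx"][1], obs["frame"].device)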
def run(self):
    local_step_count = global_step_count = self.initial_step_count
    ep_rewards = torch.zeros(self.nb_env)

    obs = dtensor_to_dev(self.env_mgr.reset(), self.device)
    internals = listd_to_dlist(
        [self.network.new_internals(self.device) for _ in range(self.nb_env)]
    )
    start_time = time()
    while global_step_count < self.nb_step:
        actions, internals = self.agent.act(self.network, obs, internals)
        next_obs, rewards, terminals, infos = self.env_mgr.step(actions)
        next_obs = dtensor_to_dev(next_obs, self.device)

        self.agent.observe(
            obs,
            rewards.to(self.device).float(),
            terminals.to(self.device).float(),
            infos,
        )

        # Reset recurrent internals for environments that terminated
        for i, terminal in enumerate(terminals):
            if terminal:
                for k, v in self.network.new_internals(self.device).items():
                    internals[k][i] = v

        # Perform state updates
        local_step_count += self.nb_env
        global_step_count += self.nb_env * self.world_size
        ep_rewards += rewards.float()
        obs = next_obs

        # Record rewards for episodes that finished on this step
        term_rewards = []
        for i, terminal in enumerate(terminals):
            if terminal:
                for k, v in self.network.new_internals(self.device).items():
                    internals[k][i] = v
                term_rewards.append(ep_rewards[i].item())
                ep_rewards[i].zero_()

        if term_rewards:
            term_reward = np.mean(term_rewards)
            delta_t = time() - start_time
            self.logger.info(
                "RANK: {} "
                "GLOBAL STEP: {} "
                "REWARD: {} "
                "GLOBAL STEP/S: {} "
                "LOCAL STEP/S: {}".format(
                    self.global_rank,
                    global_step_count,
                    term_reward,
                    (global_step_count - self.initial_step_count) / delta_t,
                    (local_step_count - self.initial_step_count) / delta_t,
                )
            )

        # Learn
        if self.agent.is_ready():
            _, _ = self.agent.learn_step(
                self.updater, self.network, next_obs, internals
            )
            self.agent.clear()

            # Detach internals so gradients do not flow across updates
            for k, vs in internals.items():
                internals[k] = [v.detach() for v in vs]
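# The loop above keeps one running reward per environment and reports the mean
# of episodes that ended on the current step; global steps are estimated as the
# local steps scaled by world_size. A self-contained sketch of that bookkeeping
# with fabricated rewards/terminals (nb_env and world_size are stand-ins, not
# values taken from the trainer):
import numpy as np
import torch

nb_env, world_size = 4, 2
local_steps = global_steps = 0
ep_rewards = torch.zeros(nb_env)

for rewards, terminals in [
    (torch.tensor([1.0, 0.0, 2.0, 1.0]), [False, False, True, False]),
    (torch.tensor([0.0, 3.0, 1.0, 0.0]), [True, True, False, False]),
]:
    local_steps += nb_env                # steps taken by this rank
    global_steps += nb_env * world_size  # estimated steps across all ranks
    ep_rewards += rewards

    term_rewards = []
    for i, terminal in enumerate(terminals):
        if terminal:
            term_rewards.append(ep_rewards[i].item())
            ep_rewards[i].zero_()        # start a fresh episode tally
    if term_rewards:
        print(global_steps, np.mean(term_rewards))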
def run(self):
    local_step_count = global_step_count = self.initial_step_count
    next_save = self.init_next_save(self.initial_step_count, self.epoch_len)
    prev_step_t = time()
    ep_rewards = torch.zeros(self.nb_env)

    obs = dtensor_to_dev(self.env_mgr.reset(), self.device)
    internals = listd_to_dlist(
        [self.network.new_internals(self.device) for _ in range(self.nb_env)]
    )
    start_time = time()
    while global_step_count < self.nb_step:
        actions, internals = self.agent.act(self.network, obs, internals)
        next_obs, rewards, terminals, infos = self.env_mgr.step(actions)
        next_obs = dtensor_to_dev(next_obs, self.device)

        self.agent.observe(
            obs,
            rewards.to(self.device).float(),
            terminals.to(self.device).float(),
            infos,
        )

        # Reset recurrent internals for environments that terminated
        for i, terminal in enumerate(terminals):
            if terminal:
                for k, v in self.network.new_internals(self.device).items():
                    internals[k][i] = v

        # Perform state updates
        local_step_count += self.nb_env
        global_step_count += self.nb_env * self.world_size
        ep_rewards += rewards.float()
        obs = next_obs

        # Record and log rewards for episodes that finished on this step
        term_rewards = []
        for i, terminal in enumerate(terminals):
            if terminal:
                for k, v in self.network.new_internals(self.device).items():
                    internals[k][i] = v
                term_rewards.append(ep_rewards[i].item())
                ep_rewards[i].zero_()

        if term_rewards:
            term_reward = np.mean(term_rewards)
            delta_t = time() - start_time
            self.logger.info(
                "RANK: {} "
                "GLOBAL STEP: {} "
                "REWARD: {} "
                "GLOBAL STEP/S: {} "
                "LOCAL STEP/S: {}".format(
                    self.global_rank,
                    global_step_count,
                    term_reward,
                    (global_step_count - self.initial_step_count) / delta_t,
                    (local_step_count - self.initial_step_count) / delta_t,
                )
            )
            self.summary_writer.add_scalar(
                "reward", term_reward, global_step_count
            )

        # Save network and optimizer state every epoch_len steps
        if global_step_count >= next_save:
            self.saver.save_state_dicts(
                self.network, global_step_count, self.optimizer
            )
            next_save += self.epoch_len

        # Learn
        if self.agent.is_ready():
            loss_dict, metric_dict = self.agent.learn_step(
                self.updater, self.network, next_obs, internals
            )
            total_loss = torch.sum(
                torch.stack(tuple(loss for loss in loss_dict.values()))
            )

            self.agent.clear()
            # Detach internals so gradients do not flow across updates
            for k, vs in internals.items():
                internals[k] = [v.detach() for v in vs]

            # write summaries
            cur_step_t = time()
            if cur_step_t - prev_step_t > self.summary_freq:
                self.write_summaries(
                    self.summary_writer,
                    global_step_count,
                    total_loss,
                    loss_dict,
                    metric_dict,
                    self.network.named_parameters(),
                )
                prev_step_t = cur_step_t
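# write_summaries above is the container's own helper; the sketch below is only
# an assumption of what such a helper could look like using
# torch.utils.tensorboard. The function name, tag names, and the parameter
# histograms are illustrative, not the real adept API.
from torch.utils.tensorboard import SummaryWriter


def write_summaries_sketch(writer, step, total_loss, loss_dict, metric_dict, named_params):
    # Scalar summaries for the aggregate and per-component losses
    writer.add_scalar("losses/total_loss", total_loss.item(), step)
    for name, loss in loss_dict.items():
        writer.add_scalar(f"losses/{name}", loss.item(), step)
    # Any extra metrics the agent reported
    for name, metric in metric_dict.items():
        writer.add_scalar(f"metrics/{name}", metric, step)
    # Histograms of network parameters for drift/debugging
    for name, param in named_params:
        writer.add_histogram(f"parameters/{name}", param.detach().cpu(), step)


# usage (illustrative): write_summaries_sketch(SummaryWriter("logs"), step,
#     total_loss, loss_dict, metric_dict, network.named_parameters())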
def run(self):
    local_step_count = global_step_count = self.initial_step_count
    ep_rewards = torch.zeros(self.nb_env)

    obs = dtensor_to_dev(self.env_mgr.reset(), self.device)
    internals = listd_to_dlist([
        self.network.new_internals(self.device) for _ in range(self.nb_env)
    ])
    start_time = time()
    while global_step_count < self.nb_step:
        actions, internals = self.agent.act(self.network, obs, internals)
        next_obs, rewards, terminals, infos = self.env_mgr.step(actions)
        next_obs = dtensor_to_dev(next_obs, self.device)

        self.agent.observe(
            obs,
            rewards.to(self.device).float(),
            terminals.to(self.device).float(),
            infos
        )

        # Reset recurrent internals for environments that terminated
        for i, terminal in enumerate(terminals):
            if terminal:
                for k, v in self.network.new_internals(self.device).items():
                    internals[k][i] = v

        # Perform state updates
        local_step_count += self.nb_env
        global_step_count += self.nb_env * self.world_size
        ep_rewards += rewards.float()
        obs = next_obs

        # Record rewards for episodes that finished on this step
        term_rewards = []
        for i, terminal in enumerate(terminals):
            if terminal:
                for k, v in self.network.new_internals(self.device).items():
                    internals[k][i] = v
                term_rewards.append(ep_rewards[i].item())
                ep_rewards[i].zero_()

        if term_rewards:
            term_reward = np.mean(term_rewards)
            delta_t = time() - start_time
            self.logger.info(
                'RANK: {} '
                'GLOBAL STEP: {} '
                'REWARD: {} '
                'GLOBAL STEP/S: {} '
                'LOCAL STEP/S: {}'.format(
                    self.global_rank,
                    global_step_count,
                    term_reward,
                    (global_step_count - self.initial_step_count) / delta_t,
                    (local_step_count - self.initial_step_count) / delta_t
                )
            )

        # Learn
        if self.agent.is_ready():
            loss_dict, metric_dict = self.agent.compute_loss(
                self.network, next_obs, internals
            )
            total_loss = torch.sum(
                torch.stack(tuple(loss for loss in loss_dict.values()))
            )

            self.optimizer.zero_grad()
            total_loss.backward()

            # Synchronize gradients across ranks before the optimizer step
            dist.barrier()
            handles = []
            for param in self.network.parameters():
                handles.append(dist.all_reduce(param.grad, async_op=True))
            for handle in handles:
                handle.wait()
            # for param in self.network.parameters():
            #     param.grad.mul_(1. / self.world_size)
            self.optimizer.step()

            self.agent.clear()
            # Detach internals so gradients do not flow across updates
            for k, vs in internals.items():
                internals[k] = [v.detach() for v in vs]
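# The all_reduce above sums gradients across ranks; the commented-out rescale is
# what would turn that sum into an average. A minimal, self-contained sketch of
# that synchronization on the gloo backend (the model, data, port, and learning
# rate are placeholders, not values from the container):
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank, world_size):
    dist.init_process_group(
        "gloo",
        init_method="tcp://127.0.0.1:29513",
        rank=rank,
        world_size=world_size,
    )
    torch.manual_seed(0)  # identical initial weights on every rank
    model = torch.nn.Linear(4, 1)
    opt = torch.optim.SGD(model.parameters(), lr=0.1)

    # Each rank computes gradients on its own batch
    data = torch.randn(8, 4, generator=torch.Generator().manual_seed(rank))
    loss = model(data).pow(2).mean()
    opt.zero_grad()
    loss.backward()

    # Sum gradients across ranks asynchronously, then wait on every handle
    handles = [dist.all_reduce(p.grad, async_op=True) for p in model.parameters()]
    for h in handles:
        h.wait()
    # Rescale the summed gradients so the update uses the mean gradient
    for p in model.parameters():
        p.grad.mul_(1.0 / world_size)

    opt.step()
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2)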