def __init__(self, config, policy, target_policy, learner_w_queue, log_dir=''):
    """
    Args:
        config (dict): configuration
    """
    self.config = config
    hidden_dim = config['dense_size']
    value_lr = config['critic_learning_rate']
    policy_lr = config['actor_learning_rate']
    state_dim = config['state_dim']
    action_dim = config['action_dim']
    self.num_train_steps = config['num_steps_train']
    self.device = config['device']
    self.max_steps = config['max_ep_length']
    self.frame_idx = 0
    self.batch_size = config['batch_size']
    self.gamma = config['discount_rate']
    self.tau = config['tau']
    self.log_dir = log_dir
    self.logger = Logger(f"{log_dir}/learner")
    self.learner_w_queue = learner_w_queue

    # Noise process
    self.ou_noise = OUNoise(dim=config["action_dim"], low=config["action_low"],
                            high=config["action_high"])

    # Value and policy nets
    self.value_net = ValueNetwork(state_dim, action_dim, hidden_dim, device=self.device)
    self.policy_net = policy  # PolicyNetwork(state_dim, action_dim, hidden_dim, device=self.device)
    self.target_value_net = copy.deepcopy(self.value_net)
    self.target_policy_net = target_policy  # copy.deepcopy(self.policy_net)

    self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=value_lr)
    self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
    self.value_criterion = nn.MSELoss(reduction='none')
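The learner's training loop is not part of this section; `self.tau` is the coefficient of the soft (Polyak) target-network update that typically follows each gradient step. Below is a minimal sketch, assuming the networks are ordinary `torch.nn.Module`s; `soft_update` is an illustrative helper, not a function defined in this repository.

import torch

def soft_update(target_net, source_net, tau):
    """Polyak-average the source parameters into the target network in place:
    theta_target <- tau * theta_source + (1 - tau) * theta_target."""
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(), source_net.parameters()):
            target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)

After each critic/actor gradient step one would call, e.g., `soft_update(self.target_value_net, self.value_net, self.tau)` and the analogous update for the policy networks.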
def __init__(self, config, policy_net, target_policy_net, learner_w_queue, log_dir=''):
    hidden_dim = config['dense_size']
    state_dim = config['state_dim']
    action_dim = config['action_dim']
    value_lr = config['critic_learning_rate']
    policy_lr = config['actor_learning_rate']
    self.v_min = config['v_min']
    self.v_max = config['v_max']
    self.num_atoms = config['num_atoms']
    self.device = config['device']
    self.max_steps = config['max_ep_length']
    self.num_train_steps = config['num_steps_train']
    self.batch_size = config['batch_size']
    self.tau = config['tau']
    self.gamma = config['discount_rate']
    self.log_dir = log_dir
    self.prioritized_replay = config['replay_memory_prioritized']
    self.learner_w_queue = learner_w_queue
    self.delta_z = (self.v_max - self.v_min) / (self.num_atoms - 1)
    # self.logger = Logger(f"{log_dir}/learner")

    # Noise process
    self.ou_noise = OUNoise(dim=config["action_dim"], low=config["action_low"],
                            high=config["action_high"])

    # Value and policy nets
    self.value_net = ValueNetwork(state_dim, action_dim, hidden_dim, self.v_min, self.v_max,
                                  self.num_atoms, device=self.device)
    self.policy_net = policy_net
    self.target_value_net = ValueNetwork(state_dim, action_dim, hidden_dim, self.v_min, self.v_max,
                                         self.num_atoms, device=self.device)
    self.target_policy_net = target_policy_net

    for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
        target_param.data.copy_(param.data)
    for target_param, param in zip(self.target_policy_net.parameters(), self.policy_net.parameters()):
        target_param.data.copy_(param.data)

    self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=value_lr)
    self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
    self.value_criterion = nn.BCELoss(reduction='none')
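This learner is distributional: the critic outputs probabilities over `num_atoms` fixed value atoms between `v_min` and `v_max`, and `delta_z` is the atom spacing. The training step itself is not shown; the distributional Bellman target generally requires projecting the shifted/scaled target distribution back onto the fixed support (C51-style). A sketch under those assumptions; `project_distribution` is a hypothetical helper, not code from this repository.

import torch

def project_distribution(next_probs, rewards, dones, gamma, v_min, v_max, num_atoms):
    """Project the target distribution of R + gamma * Z' onto the fixed atom support.

    next_probs:     (B, num_atoms) atom probabilities from the target critic
    rewards, dones: (B,) float tensors
    gamma:          scalar discount (or a (B, 1) tensor of per-sample n-step discounts)
    """
    delta_z = (v_max - v_min) / (num_atoms - 1)
    support = torch.linspace(v_min, v_max, num_atoms)

    # Bellman-update each atom location and clamp it to [v_min, v_max]
    tz = (rewards.unsqueeze(1) + (1.0 - dones.unsqueeze(1)) * gamma * support.unsqueeze(0))
    tz = tz.clamp(v_min, v_max)
    b = (tz - v_min) / delta_z              # fractional atom index of each updated atom
    lower, upper = b.floor().long(), b.ceil().long()

    projected = torch.zeros_like(next_probs)
    # Split each atom's probability mass between its two neighbouring support atoms
    projected.scatter_add_(1, lower, next_probs * (upper.float() - b))
    projected.scatter_add_(1, upper, next_probs * (b - lower.float()))
    # If b lands exactly on an atom (lower == upper), both terms above are zero,
    # so give that atom the full mass instead
    projected.scatter_add_(1, lower, next_probs * (upper == lower).float())
    return projected

The projected distribution then serves as the target for the online critic's atom probabilities; with `nn.BCELoss(reduction='none')` as above, the per-sample losses are typically weighted by importance-sampling weights when `replay_memory_prioritized` is enabled and reduced to a scalar before backpropagation.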
class Agent(object):
    def __init__(self, config, policy, global_episode, n_agent=0, agent_type='exploration', log_dir=''):
        print(f"Initializing agent {n_agent}...")
        self.config = config
        self.n_agent = n_agent
        self.agent_type = agent_type
        self.max_steps = config['max_ep_length']
        self.num_episode_save = config['num_episode_save']
        self.global_episode = global_episode
        self.local_episode = 0
        self.log_dir = log_dir

        # Create environment
        self.env_wrapper = create_env_wrapper(config)
        self.ou_noise = OUNoise(dim=config["action_dim"], low=config["action_low"],
                                high=config["action_high"])
        self.ou_noise.reset()

        self.actor = policy
        print("Agent ", n_agent, self.actor.device)

        # Logger
        log_path = f"{log_dir}/agent-{n_agent}"
        self.logger = Logger(log_path)

    def update_actor_learner(self, learner_w_queue, training_on):
        """Update local actor to the actor from learner."""
        if not training_on.value:
            return
        try:
            source = learner_w_queue.get_nowait()
        except queue.Empty:  # learner has not published new weights yet
            return
        target = self.actor
        for target_param, source_param in zip(target.parameters(), source):
            w = torch.tensor(source_param).float()
            target_param.data.copy_(w)
        del source

    def run(self, training_on, replay_queue, learner_w_queue, update_step):
        # Initialise deque buffer to store experiences for N-step returns
        self.exp_buffer = deque()

        best_reward = -float("inf")
        rewards = []
        while training_on.value:
            episode_reward = 0
            num_steps = 0
            self.local_episode += 1
            self.global_episode.value += 1
            self.exp_buffer.clear()

            if self.local_episode % 100 == 0:
                print(f"Agent: {self.n_agent} episode {self.local_episode}")

            ep_start_time = time.time()
            state = self.env_wrapper.reset()
            self.ou_noise.reset()
            done = False
            while not done:
                action = self.actor.get_action(state)
                if self.agent_type == "exploration":
                    action = self.ou_noise.get_action(action, num_steps)
                    action = action.squeeze(0)
                else:
                    action = action.detach().cpu().numpy().flatten()
                next_state, reward, done = self.env_wrapper.step(action)
                episode_reward += reward

                state = self.env_wrapper.normalise_state(state)
                reward = self.env_wrapper.normalise_reward(reward)
                self.exp_buffer.append((state, action, reward))

                # We need at least N steps in the experience buffer before we can compute
                # Bellman rewards and add an N-step experience to replay memory
                if len(self.exp_buffer) >= self.config['n_step_returns']:
                    state_0, action_0, reward_0 = self.exp_buffer.popleft()
                    discounted_reward = reward_0
                    gamma = self.config['discount_rate']
                    for (_, _, r_i) in self.exp_buffer:
                        discounted_reward += r_i * gamma
                        gamma *= self.config['discount_rate']
                    # Only exploration agents fill the replay queue
                    if self.agent_type == "exploration":
                        try:
                            replay_queue.put_nowait([state_0, action_0, discounted_reward,
                                                     next_state, done, gamma])
                        except queue.Full:  # replay queue is full, drop this transition
                            pass

                state = next_state

                if done or num_steps == self.max_steps:
                    # Add the rest of the experiences remaining in the buffer
                    while len(self.exp_buffer) != 0:
                        state_0, action_0, reward_0 = self.exp_buffer.popleft()
                        discounted_reward = reward_0
                        gamma = self.config['discount_rate']
                        for (_, _, r_i) in self.exp_buffer:
                            discounted_reward += r_i * gamma
                            gamma *= self.config['discount_rate']
                        if self.agent_type == "exploration":
                            try:
                                replay_queue.put_nowait([state_0, action_0, discounted_reward,
                                                         next_state, done, gamma])
                            except queue.Full:
                                pass
                    break

                num_steps += 1

            # Log metrics
            step = update_step.value
            self.logger.scalar_summary("agent/reward", episode_reward, step)
            self.logger.scalar_summary("agent/episode_timing", time.time() - ep_start_time, step)

            # Saving agent
            reward_outperformed = episode_reward - best_reward > self.config["save_reward_threshold"]
            time_to_save = self.local_episode % self.num_episode_save == 0
            if self.n_agent == 0 and (time_to_save or reward_outperformed):
                if episode_reward > best_reward:
                    best_reward = episode_reward
                self.save(f"local_episode_{self.local_episode}_reward_{best_reward:4f}")

            rewards.append(episode_reward)
            if self.agent_type == "exploration" and self.local_episode % self.config['update_agent_ep'] == 0:
                self.update_actor_learner(learner_w_queue, training_on)

        empty_torch_queue(replay_queue)
        print(f"Agent {self.n_agent} done.")

    def save(self, checkpoint_name):
        process_dir = f"{self.log_dir}/agent_{self.n_agent}"
        if not os.path.exists(process_dir):
            os.makedirs(process_dir)
        model_fn = f"{process_dir}/{checkpoint_name}.pt"
        torch.save(self.actor, model_fn)

    def save_replay_gif(self, output_dir_name):
        import matplotlib.pyplot as plt
        dir_name = output_dir_name
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)

        state = self.env_wrapper.reset()
        for step in range(self.max_steps):
            action = self.actor.get_action(state)
            action = action.cpu().detach().numpy()
            next_state, reward, done = self.env_wrapper.step(action)
            img = self.env_wrapper.render()
            plt.imsave(fname=f"{dir_name}/{step}.png", arr=img)
            state = next_state
            if done:
                break

        fn = f"{self.config['env']}-{self.config['model']}-{step}.gif"
        make_gif(dir_name, f"{self.log_dir}/{fn}")
        shutil.rmtree(dir_name, ignore_errors=False, onerror=None)
        print("gif saved to ", f"{self.log_dir}/{fn}")
def __init__(self, config, policy_net, target_policy_net, learner_w_queue, log_dir=''):
    hidden_dim = config['dense_size']
    state_dim = config['state_dims']
    action_dim = config['action_dims']
    value_lr = config['critic_learning_rate']
    policy_lr = config['actor_learning_rate']
    self.best_policy_loss = 10000
    self.best_value_loss = 10000
    v_min = config['v_min']
    v_max = config['v_max']
    self.path_weight_value = config['value_weights']
    self.path_weight_policy = config['policy_weights']
    self.run_name = config['run_name']
    num_atoms = config['num_atoms']
    self.counter = 0
    self.device = config['device']
    self.max_steps = config['max_ep_length']
    self.num_train_steps = config['num_steps_train']
    self.batch_size = config['batch_size']
    self.tau = config['tau']
    self.gamma = config['discount_rate']
    self.log_dir = log_dir
    self.prioritized_replay = config['replay_memory_prioritized']
    self.learner_w_queue = learner_w_queue
    self.logger = Logger(f"{log_dir}/learner", name="{}/learner".format(self.run_name),
                         project_name=config["project_name"])
    self.path_weight_run = self.logger.get_log_dir()
    if not os.path.exists(self.path_weight_run):
        os.makedirs(self.path_weight_run)

    # Noise process
    self.ou_noise = OUNoise(dim=config["action_dim"], low=config["action_low"],
                            high=config["action_high"])

    # Value and policy nets
    self.value_net = ValueNetwork(state_dim, action_dim, hidden_dim, v_min, v_max,
                                  num_atoms, device=self.device)
    self.target_value_net = ValueNetwork(state_dim, action_dim, hidden_dim, v_min, v_max,
                                         num_atoms, device=self.device)
    if os.path.exists(config['value_weights_best']):
        self.value_net.load_state_dict(torch.load(config['value_weights_best']))
        self.target_value_net = copy.deepcopy(self.value_net)
    else:
        print("cannot load value_net: {}".format(config['value_weights_best']))

    self.policy_net = policy_net  # PolicyNetwork(state_dim, action_dim, hidden_dim, device=self.device)
    self.target_policy_net = target_policy_net

    for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
        target_param.data.copy_(param.data)
    for target_param, param in zip(self.target_policy_net.parameters(), self.policy_net.parameters()):
        target_param.data.copy_(param.data)

    self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=value_lr)
    self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
    self.value_criterion = nn.BCELoss(reduction='none')
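On the agent side, `update_actor_learner` consumes an iterable of per-parameter arrays from `learner_w_queue` and copies each one into its local actor. The producer side is not shown in this section; below is a sketch of how the learner might publish its policy weights, assuming a `multiprocessing`-style queue. `push_policy_weights` is a hypothetical helper name, not the repository's actual method.

import queue

def push_policy_weights(policy_net, learner_w_queue):
    """Publish the current policy weights as CPU NumPy arrays so agent processes
    can copy them into their local actors (see Agent.update_actor_learner)."""
    params = [p.data.detach().cpu().numpy() for p in policy_net.parameters()]
    try:
        learner_w_queue.put_nowait(params)
    except queue.Full:
        pass  # agents are still consuming the previous snapshot; skip this round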
class Agent(object):
    def __init__(self, config, policy, global_episode, n_agent=0, agent_type='exploration', log_dir=''):
        print(f"Initializing agent {n_agent}...")
        self.config = config
        self.n_agent = n_agent
        self.agent_type = agent_type
        self.max_steps = config['max_ep_length']
        self.num_episode_save = config['num_episode_save']
        self.global_episode = global_episode
        self.local_episode = 0
        self.log_dir = log_dir

        # Create environment
        self.env_wrapper = create_env_wrapper(config)
        self.env_wrapper.env.set_agent(self.n_agent)
        self.ou_noise = OUNoise(dim=config["action_dim"], low=config["action_low"],
                                high=config["action_high"])
        self.ou_noise.reset()

        self.actor = policy

        # Logger
        log_path = f"{log_dir}/agent-{n_agent}"
        self.logger = Logger(log_path)

    def update_actor_learner(self, learner_w_queue):
        """Update local actor to the actor from learner."""
        if learner_w_queue.empty():
            return
        source = learner_w_queue.get()
        target = self.actor
        for target_param, source_param in zip(target.parameters(), source):
            w = torch.tensor(source_param).float()
            target_param.data.copy_(w)

    def run(self, training_on, replay_queue, learner_w_queue, update_step):
        # Initialise deque buffer to store experiences for N-step returns
        self.exp_buffer = deque()

        best_reward = -float("inf")
        rewards = []
        while training_on.value:
            episode_reward = 0
            num_steps = 0
            self.local_episode += 1
            self.global_episode.value += 1
            self.exp_buffer.clear()

            if self.local_episode % 100 == 0:
                print(f"Agent: {self.n_agent} episode {self.local_episode}")

            ep_start_time = time.time()
            print("call reset on agent {}".format(self.n_agent))
            state = self.env_wrapper.reset()
            print(state.shape)
            print("called reset on agent {}".format(self.n_agent))
            self.ou_noise.reset()
            self.env_wrapper.env.resume_simulator()
            done = False
            angle_avg = []
            distance_avg = []
            while not done:
                action = self.actor.get_action(state)
                if self.agent_type == "supervisor":
                    action = self.env_wrapper.env.get_supervised_action()
                elif self.agent_type == "exploration":
                    action = self.ou_noise.get_action(action, num_steps)
                    action = action.squeeze(0)
                else:
                    action = action.detach().cpu().numpy().flatten()
                next_state, reward, done = self.env_wrapper.step(action)
                angle_avg.append(state[0])
                distance_avg.append(math.hypot(state[1], state[2]))
                episode_reward += reward

                state = self.env_wrapper.normalise_state(state)
                reward = self.env_wrapper.normalise_reward(reward)
                self.exp_buffer.append((state, action, reward))

                # We need at least N steps in the experience buffer before we can compute
                # Bellman rewards and add an N-step experience to replay memory
                if len(self.exp_buffer) >= self.config['n_step_returns']:
                    state_0, action_0, reward_0 = self.exp_buffer.popleft()
                    discounted_reward = reward_0
                    gamma = self.config['discount_rate']
                    for (_, _, r_i) in self.exp_buffer:
                        discounted_reward += r_i * gamma
                        gamma *= self.config['discount_rate']
                    if not replay_queue.full():
                        replay_queue.put([state_0, action_0, discounted_reward,
                                          next_state, done, gamma])

                state = next_state

                if done or num_steps == self.max_steps:
                    print("agent {} done steps: {}/{}".format(self.n_agent, num_steps, self.max_steps))
                    # Add the rest of the experiences remaining in the buffer
                    while len(self.exp_buffer) != 0:
                        state_0, action_0, reward_0 = self.exp_buffer.popleft()
                        discounted_reward = reward_0
                        gamma = self.config['discount_rate']
                        for (_, _, r_i) in self.exp_buffer:
                            discounted_reward += r_i * gamma
                            gamma *= self.config['discount_rate']
                        replay_queue.put([state_0, action_0, discounted_reward,
                                          next_state, done, gamma])
                    break

                num_steps += 1

            # Log metrics
            step = update_step.value
            if self.agent_type == "exploitation":
                self.logger.scalar_summary("agent/angle", np.rad2deg(np.mean(angle_avg)), step)
                self.logger.scalar_summary("agent/angle_var", np.rad2deg(np.var(angle_avg)), step)
                self.logger.scalar_summary("agent/distance", np.mean(distance_avg), step)
                self.logger.scalar_summary("agent/distance_var", np.var(distance_avg), step)
                observation_image = self.env_wrapper.env.get_current_observation_image()
                if num_steps == self.max_steps:
                    self.logger.image_summar("agent/observation_end", observation_image, num_steps)
                else:
                    self.logger.image_summar("agent/observation_p_{:2.3f}".format(discounted_reward),
                                             observation_image, num_steps)
            self.logger.scalar_summary("agent/reward", episode_reward, step)
            self.logger.scalar_summary("agent/episode_timing", time.time() - ep_start_time, step)

            # Saving agent
            if self.local_episode % self.num_episode_save == 0 or episode_reward > best_reward:
                if episode_reward > best_reward:
                    best_reward = episode_reward
                self.save(f"local_episode_{self.local_episode}_reward_{best_reward:4f}")
            print("reward is: {} step: {} ".format(episode_reward, step))

            rewards.append(episode_reward)
            if (self.agent_type == "exploration" or self.agent_type == "supervisor") \
                    and self.local_episode % self.config['update_agent_ep'] == 0:
                self.update_actor_learner(learner_w_queue)

        # while not replay_queue.empty():
        #     replay_queue.get()

        # Save replay from the first agent only
        # if self.n_agent == 0:
        #     self.save_replay_gif()
        # print(f"Agent {self.n_agent} done.")

    def save(self, checkpoint_name):
        last_path = f"{self.log_dir}"
        process_dir = f"{self.log_dir}/agent_{self.n_agent}"
        if not os.path.exists(process_dir):
            os.makedirs(process_dir)
        if not os.path.exists(last_path):
            os.makedirs(last_path)
        model_fn = f"{process_dir}/{checkpoint_name}.pt"
        torch.save(self.actor, model_fn)
        model_fn = f"{last_path}/best.pt"
        torch.save(self.actor, model_fn)

    def save_replay_gif(self):
        dir_name = "replay_render"
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)

        state = self.env_wrapper.reset()
        self.env_wrapper.env.resume_simulator()
        for step in range(self.max_steps):
            action = self.actor.get_action(state)
            action = action.cpu().detach().numpy()
            next_state, reward, done = self.env_wrapper.step(action)
            img = self.env_wrapper.render()
            plt.imsave(fname=f"{dir_name}/{step}.png", arr=img)
            state = next_state
            if done:
                break

        fn = f"{self.config['env']}-{self.config['model']}-{step}.gif"
        make_gif(dir_name, f"{self.log_dir}/{fn}")
        shutil.rmtree(dir_name, ignore_errors=False, onerror=None)
        print("gif saved to ", f"{self.log_dir}/{fn}")
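`OUNoise` is instantiated in every class above but not defined in this section. Below is a minimal Ornstein-Uhlenbeck exploration-noise class with the interface used here (`reset()` and `get_action(action, step)`); the default parameter values, the sigma-annealing schedule, and the assumption that `action` is a NumPy array of shape `(dim,)` are all illustrative and may differ from the repository's actual class.

import numpy as np

class OUNoise:
    """Temporally correlated exploration noise for continuous actions (a sketch)."""

    def __init__(self, dim, low, high, mu=0.0, theta=0.15, sigma=0.3,
                 min_sigma=0.3, decay_period=100000):
        self.mu, self.theta = mu, theta
        self.sigma, self.max_sigma, self.min_sigma = sigma, sigma, min_sigma
        self.decay_period = decay_period
        self.dim, self.low, self.high = dim, low, high
        self.reset()

    def reset(self):
        # Restart the OU process at its mean
        self.state = np.ones(self.dim) * self.mu

    def evolve_state(self):
        # dx = theta * (mu - x) dt + sigma * dW, with dt = 1
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.dim)
        self.state = self.state + dx
        return self.state

    def get_action(self, action, t=0):
        # Anneal sigma over the decay period, add the correlated noise,
        # and clip the result to the environment's action bounds
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        noise = self.evolve_state()
        return np.clip(action + noise, self.low, self.high)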