class MaddpgAgent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        self.agents = [
            Agent(state_size=state_size, action_size=action_size, random_seed=random_seed),
            Agent(state_size=state_size, action_size=action_size, random_seed=random_seed)
        ]
        self.seed = random.seed(random_seed)

        # Replay memory (shared by both agents)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def act(self, states, add_noise=True):
        actions = [
            agent.act(state, add_noise)
            for agent, state in zip(self.agents, states)
        ]
        return actions

    def step(self, states, actions, rewards, next_states, dones):
        # Shared replay buffer: add each agent's transition
        for i, _ in enumerate(self.agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i])

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        for agent in self.agents:
            agent.learn(experiences, gamma)

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def save_checkpoint(self):
        for i, agent in enumerate(self.agents):
            agent.save_checkpoint(i)
def test_len(self):
    """Simple test for the buffer length."""
    # buffer_size must be at least 1 for the length checks below
    for b in sample(range(1, 100), 3):
        for bs in sample(range(1, 100), 3):
            rb = ReplayBuffer(buffer_size=b, batch_size=bs)
            # Length at the beginning is 0
            self.assertEqual(len(rb), 0)
            # After adding one element, length is 1
            rb.add(state=1, action=1, reward=1, next_state=1, done=1)
            self.assertEqual(len(rb), 1)
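# The test above, and several of the agents below, assume a ReplayBuffer class that is not
# shown in this section. The following is only a minimal sketch of the interface used here
# (add, sample, __len__); the real implementations differ in constructor signature (some take
# action_size and a seed) and in what sample() returns.
import random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples (illustrative sketch)."""

    def __init__(self, buffer_size, batch_size, seed=0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences and stack them into tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float()
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float()
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float()
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float()
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current number of stored experiences."""
        return len(self.memory)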
class Maddpg():
    """MADDPG Agent: Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize a MADDPG Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        super(Maddpg, self).__init__()

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        # Instantiate multiple agents
        self.agents = [
            Agent(state_size, action_size, random_seed, num_agents)
            for i in range(num_agents)
        ]

        # Instantiate replay memory buffer (shared between agents)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def reset(self):
        """Reset all the agents."""
        for agent in self.agents:
            agent.reset()

    def act(self, states, noise):
        """Return the action to perform for each agent (per its policy)."""
        return [
            agent.act(state, noise)
            for agent, state in zip(self.agents, states)
        ]

    def step(self, states, actions, rewards, next_states, dones, num_current_episode):
        """Save experience in replay memory, and use random samples from the buffer to learn."""
        self.memory.add(encode(states), encode(actions), rewards,
                        encode(next_states), dones)

        # If enough samples are in the replay memory and it is time to update
        if (len(self.memory) > BATCH_SIZE) and (num_current_episode % UPDATE_EVERY_NB_EPISODE == 0):
            # Note: this code only expects 2 agents
            assert len(self.agents) == 2

            # Learn several times in a row within the same episode
            for i in range(MULTIPLE_LEARN_PER_UPDATE):
                # Sample a batch of experience from the replay buffer
                experiences = self.memory.sample()
                # Update agent #0
                self.maddpg_learn(experiences, own_idx=0, other_idx=1)
                # Sample another batch of experience from the replay buffer
                experiences = self.memory.sample()
                # Update agent #1
                self.maddpg_learn(experiences, own_idx=1, other_idx=0)

    def maddpg_learn(self, experiences, own_idx, other_idx, gamma=GAMMA):
        """Update the policy of the MADDPG "own" agent.

        The actors only have access to the agent's own information, whereas the
        critics have access to all agents' information.

        Update policy and value parameters using the given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(states) -> action
            critic_target(all_states, all_actions) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            own_idx (int): index of the agent to update in self.agents
            other_idx (int): index of the other agent in self.agents
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Filter out the agent's OWN states, actions and next_states batch
        own_states = decode(self.state_size, self.num_agents, own_idx, states)
        own_actions = decode(self.action_size, self.num_agents, own_idx, actions)
        own_next_states = decode(self.state_size, self.num_agents, own_idx, next_states)

        # Filter out the OTHER agent's states, actions and next_states batch
        other_states = decode(self.state_size, self.num_agents, other_idx, states)
        other_actions = decode(self.action_size, self.num_agents, other_idx, actions)
        other_next_states = decode(self.state_size, self.num_agents, other_idx, next_states)

        # Concatenate both agents' information (own agent first, other agent second)
        all_states = torch.cat((own_states, other_states), dim=1).to(device)
        all_actions = torch.cat((own_actions, other_actions), dim=1).to(device)
        all_next_states = torch.cat((own_next_states, other_next_states), dim=1).to(device)

        agent = self.agents[own_idx]

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        # (target actions are computed from the next states, per the update rule above)
        all_next_actions = torch.cat((agent.actor_target(own_next_states),
                                      agent.actor_target(other_next_states)),
                                     dim=1).to(device)
        Q_targets_next = agent.critic_target(all_next_states, all_next_actions)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = agent.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_CRITIC_GRADIENT:
            torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 1)
        agent.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        all_actions_pred = torch.cat((agent.actor_local(own_states),
                                      agent.actor_local(other_states).detach()),
                                     dim=1).to(device)
        actor_loss = -agent.critic_local(all_states, all_actions_pred).mean()

        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        agent.soft_update(agent.critic_local, agent.critic_target, TAU)
        agent.soft_update(agent.actor_local, agent.actor_target, TAU)

    def checkpoints(self):
        """Save checkpoints for all agents."""
        for idx, agent in enumerate(self.agents):
            actor_local_filename = 'models/checkpoint_actor_local_' + str(idx) + '.pth'
            critic_local_filename = 'models/checkpoint_critic_local_' + str(idx) + '.pth'
            actor_target_filename = 'models/checkpoint_actor_target_' + str(idx) + '.pth'
            critic_target_filename = 'models/checkpoint_critic_target_' + str(idx) + '.pth'
            torch.save(agent.actor_local.state_dict(), actor_local_filename)
            torch.save(agent.critic_local.state_dict(), critic_local_filename)
            torch.save(agent.actor_target.state_dict(), actor_target_filename)
            torch.save(agent.critic_target.state_dict(), critic_target_filename)
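# The Maddpg class above relies on encode()/decode() helpers that are not defined in this
# section. A plausible sketch, assuming encode() flattens the per-agent arrays into a single
# vector before they are stored in the shared buffer, and decode() slices one agent's portion
# back out of a batch of flattened tensors:
import numpy as np
import torch


def encode(sa):
    """Flatten a per-agent array (num_agents x dim) into a single 1-D vector for storage."""
    return np.array(sa).reshape(1, -1).squeeze()


def decode(size, num_agents, idx, tensor):
    """Extract agent `idx`'s slice of width `size` from a batch of flattened vectors."""
    indices = torch.tensor([idx * size + i for i in range(size)])
    return tensor.index_select(1, indices.to(tensor.device))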
class DDPG():
    """Reinforcement learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.08
        self.exploration_sigma = 0.15
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor (0.99 also tried)
        self.tau = 0.001   # for soft update of target parameters (0.01 also tried)

        # Score tracker and learning parameters
        self.total_reward = None
        self.count = 0
        self.score = 0
        self.best_score = -np.inf
        self.last_state = None

    def reset_episode(self):
        self.total_reward = None
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        if self.total_reward is None:
            self.total_reward = reward
        else:
            self.total_reward += reward
        self.count += 1

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(states)[0]
        # Add some noise for exploration
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        # Track best score
        self.score = self.total_reward / float(self.count) if self.count else -np.inf
        if self.best_score < self.score:
            self.best_score = self.score

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
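# The DDPG agent above (and the PyTorch agents below) construct an OUNoise object whose
# implementation is not shown in this section. A common Ornstein-Uhlenbeck sketch matching the
# (size, mu, theta, sigma) constructor, with reset() and sample(); treat this as illustrative:
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state back to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Advance the process by one step and return the new state as the noise sample."""
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state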
def train_agent(args, param):
    """Train a TD3 or DDPG policy on a gym or robosuite environment."""
    use_gym = False
    args.seed = param
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    pathname = str(args.locexp) + "/" + str(args.env_name) + '-agent-' + str(args.policy)
    pathname += "_batch_size_" + str(args.batch_size)
    pathname += '_update_freq: ' + str(args.target_update_freq) + \
        "num_q_target_" + str(args.num_q_target) + "_seed_" + str(args.seed)
    pathname += "_actor_300_200"
    text = "Star_training target_update_freq: {} num_q_target: {} use device {} ".format(
        args.target_update_freq, args.num_q_target, args.device)
    print(pathname, text)
    write_into_file(pathname, text)
    arg_text = str(args)
    write_into_file(pathname, arg_text)
    tensorboard_name = str(args.locexp) + '/runs/' + pathname
    writer = SummaryWriter(tensorboard_name)

    if use_gym:
        env = gym.make(args.env_name)
        env.seed(args.seed)
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])
        args.max_episode_steps = env._max_episode_steps
    else:
        size = 84
        env = suite.make(
            args.env_name,
            has_renderer=False,
            use_camera_obs=True,
            ignore_done=True,
            has_offscreen_renderer=True,
            camera_height=size,
            camera_width=size,
            render_collision_mesh=False,
            render_visual_mesh=True,
            camera_name='agentview',
            use_object_obs=False,
            camera_depth=True,
            reward_shaping=True,
        )
        state_dim = 200
        print("State dim, ", state_dim)
        action_dim = env.dof
        print("action_dim ", action_dim)
        max_action = 1
        args.max_episode_steps = 200

    if args.policy == "TD3_ad":
        policy = TD31v1(state_dim, action_dim, max_action, args)
    elif args.policy == "DDPG":
        policy = DDPG(state_dim, action_dim, max_action, args)

    file_name = str(args.locexp) + "/pytorch_models/{}".format(args.env_name)
    obs_shape = (3, 84, 84)
    action_shape = (action_dim, )
    print("obs", obs_shape)
    print("act", action_shape)
    replay_buffer = ReplayBuffer(obs_shape, action_shape, int(args.buffer_size), args.device)
    save_env_vid = False
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    t0 = time.time()
    scores_window = deque(maxlen=100)
    episode_reward = 0
    evaluations = []
    tb_update_counter = 0

    while total_timesteps < args.max_timesteps:
        tb_update_counter += 1
        # If the episode is done
        if done:
            episode_num += 1
            scores_window.append(episode_reward)
            average_mean = np.mean(scores_window)
            if tb_update_counter > args.tensorboard_freq:
                print("Write tensorboard")
                tb_update_counter = 0
                writer.add_scalar('Reward', episode_reward, total_timesteps)
                writer.add_scalar('Reward mean ', average_mean, total_timesteps)
                writer.flush()
            # If we are not at the very beginning, we log the episode and train the model
            if total_timesteps != 0:
                text = "Total Timesteps: {} Episode Num: {} ".format(total_timesteps, episode_num)
                text += "Episode steps {} ".format(episode_timesteps)
                text += "Reward: {:.2f} Average Re: {:.2f} Time: {}".format(
                    episode_reward, np.mean(scores_window), time_format(time.time() - t0))
                print(text)
                write_into_file(pathname, text)
                if total_timesteps > args.start_timesteps:
                    policy.train(replay_buffer, writer, 200)
            # We evaluate the episode and we save the policy
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(evaluate_policy(policy, writer, total_timesteps, args, env))
                torch.manual_seed(args.seed)
                np.random.seed(args.seed)
                save_model = file_name + '-{}reward_{:.2f}-agent{}'.format(
                    episode_num, evaluations[-1], args.policy)
                policy.save(save_model)
            # When the episode is done, we reset the state of the environment
            if use_gym:
                obs = env.reset()
            else:
                state = env.reset()
                obs, state_buffer = stacked_frames(state, size, args, policy)
            # Set the Done to False
            done = False
            # Set rewards and episode timesteps to zero
            episode_reward = 0
            episode_timesteps = 0

        # Before args.start_timesteps timesteps, we play random actions
        if total_timesteps < args.start_timesteps:
            if use_gym:
                action = env.action_space.sample()
            else:
                action = np.random.randn(env.dof)
        else:  # After args.start_timesteps timesteps, we switch to the model
            if use_gym:
                action = policy.select_action(np.array(obs))
                # If the explore_noise parameter is not 0, we add noise to the action and we clip it
                if args.expl_noise != 0:
                    action = (action + np.random.normal(
                        0, args.expl_noise, size=env.action_space.shape[0])).clip(
                            env.action_space.low, env.action_space.high)
            else:
                action = (policy.select_action(np.array(obs)) + np.random.normal(
                    0, max_action * args.expl_noise, size=action_dim)).clip(-max_action, max_action)

        if total_timesteps % args.target_update_freq == 0:
            if args.policy == "TD3_ad":
                policy.hardupdate()

        # The agent performs the action in the environment, then reaches the next state and receives the reward
        new_obs, reward, done, _ = env.step(action)
        done = float(done)
        if not use_gym:
            new_obs, state_buffer = create_next_obs(new_obs, size, args, state_buffer, policy)

        # We check if the episode is done
        done_bool = 0 if episode_timesteps + 1 == args.max_episode_steps else float(done)
        if not use_gym:
            if episode_timesteps + 1 == args.max_episode_steps:
                done = True

        # We increase the total reward
        reward = reward * args.reward_scalling
        episode_reward += reward

        # We store the new transition into the Experience Replay memory (ReplayBuffer)
        if args.debug:
            print("add to buffer obs ", obs.shape)
            print("add to buffer new_obs ", new_obs.shape)
        replay_buffer.add(obs, action, reward, new_obs, done, done_bool)

        # We update the state, the episode timestep, the total timesteps,
        # and the timesteps since the evaluation of the policy
        obs = new_obs
        if total_timesteps > args.start_timesteps:
            policy.train(replay_buffer, writer, 0)
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # We add the last policy evaluation to our list of evaluations and we save our model
    evaluations.append(evaluate_policy(policy, writer, total_timesteps, args, env))
class MADDPG():

    def __init__(self, state_size, action_size, n_agents, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents
        self.seed = random.seed(seed)

        # Actor-Critic agents
        self.ActorCriticAgents = [
            Agent(state_size, action_size, n_agents, seed) for _ in range(n_agents)
        ]

        # Replay memory
        self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def OUNoise_reset(self):
        for agent in self.ActorCriticAgents:
            agent.exploration_noise.reset()

    def act(self, state):
        actions = []
        for i, agent in enumerate(self.ActorCriticAgents):
            agent_action = agent.act(state[i])
            actions.append(agent_action[0])
        return np.stack(actions, axis=0)

    def step(self, ep, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > BATCH_SIZE:
            for i in range(self.n_agents):
                self.learn(i)

    def learn(self, agent_index):
        states, actions, rewards, next_states, dones = self.memory.sample()

        # Each agent's target action, computed from the next states
        target_next_actions = torch.from_numpy(np.zeros(shape=actions.shape)).float().to(device)
        for idx, agent in enumerate(self.ActorCriticAgents):
            current_next_states = next_states[:, idx]
            target_next_actions[:, idx, :] = agent.actor_target(current_next_states)
        target_next_actions = torch.reshape(target_next_actions, shape=(BATCH_SIZE, -1))

        current_agent_states = states[:, agent_index, :]
        current_agent_actions = actions[:, agent_index, :]
        current_agent_rewards = torch.reshape(rewards[:, agent_index], shape=(BATCH_SIZE, 1))
        current_agent_dones = torch.reshape(dones[:, agent_index], shape=(BATCH_SIZE, 1))

        # Replace this agent's actions with the actions predicted by its local actor
        action_preds = actions.clone()
        action_preds[:, agent_index, :] = \
            self.ActorCriticAgents[agent_index].actor_local(current_agent_states)
        action_preds = torch.reshape(action_preds, shape=(BATCH_SIZE, -1))

        self.ActorCriticAgents[agent_index].update(
            states, current_agent_states, actions, current_agent_actions,
            target_next_actions, rewards, current_agent_rewards, next_states,
            dones, current_agent_dones, action_preds)

    def save_checkpoint(self):
        for i in range(self.n_agents):
            torch.save(self.ActorCriticAgents[i].actor_local.state_dict(),
                       f'actor_checkpoint{i}.pth')
            torch.save(self.ActorCriticAgents[i].critic_local.state_dict(),
                       f'critic_checkpoint{i}.pth')
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, num_agents, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            num_agents (int): number of agents
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON
        self.num_agents = num_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        # self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, ALPHA, BETA, ANNEAL_OVER)

        # Tensorboard interface
        self.writer = SummaryWriter(comment="-ddpg-no-pri")
        self.tb_tracker = TBMeanTracker(self.writer, batch_size=10)
        self.step_t = 0

    def step(self, state, action, reward, next_state, done, timestamp):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn at defined interval, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestamp % self.num_agents == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)
        self.step_t += 1

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.epsilon * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        # states, actions, rewards, next_states, dones, idxs, weights = experiences
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
        # Update priorities (only used with the prioritized replay buffer)
        # updates = torch.abs(Q_expected - Q_targets).cpu().data.squeeze(1).numpy()
        # self.memory.update_priorities(idxs, updates)
        self.tb_tracker.track("loss_critic", critic_loss.to("cpu"), self.step_t)

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        self.tb_tracker.track("loss_actor", actor_loss.to("cpu"), self.step_t)

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ---------------------------- update noise ---------------------------- #
        self.epsilon -= EPSILON_DECAY
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DQNAgent_Vanila_simple(agent):

    def __init__(self, model, opt, learning=True):
        super().__init__()
        self.memory = ReplayBuffer(3000)
        self.previous_state = None
        self.previous_action = None
        self.previous_legal_actions = None
        self.step = 0
        self.model = model
        self.opt = opt
        self.loss = 0
        self.batch_size = 10
        self.test_q = 0
        self.max_tile = 0
        self.epsilon_schedule = LinearSchedule(1000000, initial_p=0.99, final_p=0.01)
        self.learning = learning

    def should_explore(self):
        self.epsilon = self.epsilon_schedule.value(self.step)
        return random.random() < self.epsilon

    def action(self):
        if self.learning:
            self.step += 1
        legalActions = self.legal_actions(deepcopy(self.gb.board))
        if len(legalActions) == 0:
            print("Warning: no legal actions available")

        board = deepcopy(self.gb.board)
        board = oneHotMap(board)

        if self.learning and self.should_explore():
            q_values = None
            action = random.choice(legalActions)
            choice = self.actions[action]
        else:
            state = torch.from_numpy(board).type(torch.FloatTensor).cuda().view(-1, 17, 4, 4)
            action, q_values = self.predict(state, legalActions)
            choice = self.actions[action]

        if self.learning:
            reward = self.gb.currentReward
            if reward != 0:
                reward = np.log2(reward)
            if (self.previous_state is not None and self.previous_action is not None):
                self.memory.add(self.previous_state, self.previous_action,
                                self.previous_legal_actions, reward, legalActions, board, 0)
            self.previous_state = board
            self.previous_action = action
            self.previous_legal_actions = legalActions

        if self.learning:
            self.update()
        return choice

    def enableLearning(self):
        self.model.train()
        self.learning = True
        self.max_tile = 0
        self.reset()

    def disableLearning(self):
        self.model.eval()
        self.learning = False

    def end_episode(self):
        if not self.learning:
            m = np.max(self.gb.board)
            if m > self.max_tile:
                self.max_tile = m
            return
        board = deepcopy(self.gb.board)
        board = oneHotMap(board)
        self.memory.add(self.previous_state, self.previous_action,
                        self.previous_legal_actions, self.gb.currentReward, [], board, 1)
        self.reset()

    def reset(self):
        self.previous_state = None
        self.previous_action = None
        self.previous_legal_actions = None

    def update(self):
        if self.step < self.batch_size:
            return
        batch = self.memory.sample(self.batch_size)
        (states, actions, legal_actions, reward, next_legal_actions,
         next_states, is_terminal) = batch

        terminal = torch.tensor(is_terminal).type(torch.cuda.FloatTensor)
        reward = torch.tensor(reward).type(torch.cuda.FloatTensor)
        states = torch.from_numpy(states).type(torch.FloatTensor).cuda().view(-1, 17, 4, 4)
        next_states = torch.from_numpy(next_states).type(torch.FloatTensor).cuda().view(-1, 17, 4, 4)

        # Current Q values for the actions that were taken
        _, q_values = self.predict_batch(states)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)
        q_values = q_values[batch_index, actions]

        # Calculate the target: r + gamma * max_a' Q(s', a'), masked to legal actions
        q_actions_next, q_values_next = self.predict_batch(next_states,
                                                           legalActions=next_legal_actions)
        q_max = q_values_next.max(1)[0].detach()
        q_max = (1 - terminal) * q_max
        q_target = reward + 0.99 * q_max

        self.opt.zero_grad()
        loss = self.model.loss_function(q_target, q_values)
        loss.backward()
        self.opt.step()
        self.loss += loss.item() / len(states)

    def predict_batch(self, input, legalActions=None):
        q_values = self.model(input)
        if legalActions is None:
            values, q_actions = q_values.max(1)
        else:
            # Mask out illegal actions with a very low Q value
            q_values_true = torch.full((self.batch_size, 4), -1e8).cuda()
            for i, action in enumerate(legalActions):
                q_values_true[i, action] = q_values[i, action]
            values, q_actions = q_values_true.max(1)
            q_values = q_values_true
        return q_actions, q_values

    def predict(self, input, legalActions):
        q_values = self.model(input)
        for action in range(4):
            if action not in legalActions:
                q_values[0, action] = -1e8
        action = torch.argmax(q_values)
        if int(action.item()) not in legalActions:
            print("Warning: selected an illegal action", legalActions, q_values, action)
        return action.item(), q_values

    def legal_actions(self, copy_gb):
        legalActions = []
        for i in range(4):
            try_gb = gameboard(4, deepcopy(copy_gb))
            changed = try_gb.takeAction(self.actions[i])
            if changed:
                legalActions.append(i)
        return legalActions
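# The 2048 agent above anneals epsilon with a LinearSchedule helper that is not shown in this
# section. A sketch of the usual interface (as in OpenAI Baselines), where value(t) interpolates
# linearly from initial_p to final_p over schedule_timesteps and then stays at final_p:
class LinearSchedule:
    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        """Linearly interpolated value at step t, clamped to final_p once the schedule ends."""
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)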
class MADDPG():
    """Interacts with and learns from the environment."""

    def __init__(self, config):
        """Initialize an Agent object.

        Params
        ======
            config: configuration object holding state_size, action_size, random_seed,
                    learning rates, buffer/batch sizes, and related hyperparameters
        """
        self.state_size = config.state_size
        self.action_size = config.action_size
        self.seed = random.seed(config.random_seed)
        self.config = config
        self.t_step = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(self.state_size, self.action_size, config.random_seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, config.random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.state_size, self.action_size, config.random_seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, config.random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=config.lr_critic,
                                           weight_decay=config.weight_decay)

        # Noise process
        self.noise = OUNoise(self.action_size, config.random_seed)

        # ----------------------- initialize target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)

        # Replay memory (shared between agents or created per agent, depending on config)
        if config.shared_replay_buffer:
            self.memory = config.memory
        else:
            self.memory = config.memory_fn()

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for state, action, reward, next_state, done in zip(states, actions, rewards,
                                                           next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.config.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.config.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.config.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.config.tau)
        self.soft_update(self.actor_local, self.actor_target, self.config.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
def train(args, repeat_opt):
    """Train a TD3 policy on the Unity Reacher environment.

    Args:
        args: parsed hyperparameters and settings
        repeat_opt: seed value used for repeated experiments
    """
    use_gym = False
    # In case of repeated seed experiments
    args.seed = repeat_opt
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    pathname = 'env-' + str(args.env_name) + '_update_freq: ' + str(args.target_update_freq) + \
        "num_q_target_" + str(args.num_q_target) + "_seed_" + str(args.seed)
    text = "Star_training target_update_freq: {} num_q_target: {} use device {} ".format(
        args.target_update_freq, args.num_q_target, args.device)
    print(pathname, text)
    write_into_file('search-' + pathname, text)
    arg_text = str(args)
    write_into_file('search-' + pathname, arg_text)
    tensorboard_name = 'runs/' + pathname
    writer = SummaryWriter(tensorboard_name)

    env = UnityEnvironment(file_name='Reacher_Linux/Reacher.x86_64', no_graphics=True)
    # Get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # Reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # Number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    # Size of each action
    action_dim = brain.vector_action_space_size
    states = env_info.vector_observations
    state_dim = states.shape[1]
    max_action = 1

    policy = TD31v1(state_dim, action_dim, max_action, args)
    replay_buffer = ReplayBuffer()
    save_env_vid = False
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    t0 = time.time()
    scores_window = deque(maxlen=100)
    episode_reward = 0
    evaluations = []
    file_name = "%s_%s_%s" % ("TD3", args.env_name, str(args.seed))
    print("---------------------------------------")
    print("Settings: %s" % (file_name))
    print("---------------------------------------")

    # We start the main loop over the training timesteps
    tb_update_counter = 0
    while total_timesteps < args.max_timesteps:
        tb_update_counter += 1
        # If the episode is done
        if done:
            episode_num += 1
            scores_window.append(episode_reward)
            average_mean = np.mean(scores_window)
            if tb_update_counter > args.tensorboard_freq:
                print("Write tensorboard")
                tb_update_counter = 0
                writer.add_scalar('Reward', episode_reward, total_timesteps)
                writer.add_scalar('Reward mean ', average_mean, total_timesteps)
            # If we are not at the very beginning, we start the training process of the model
            if total_timesteps != 0:
                text = "Total Timesteps: {} Episode Num: {} ".format(total_timesteps, episode_num)
                text += "Episode steps {} ".format(episode_timesteps)
                text += "Reward: {} Average Re: {:.2f} Time: {}".format(
                    episode_reward, np.mean(scores_window), time_format(time.time() - t0))
                print(text)
                write_into_file('search-' + pathname, text)
                policy.train(replay_buffer, writer, episode_timesteps)
            # We evaluate the episode and we save the policy
            if timesteps_since_eval >= args.eval_freq:
                policy.save("%s" % (file_name), directory="./pytorch_models")
                timesteps_since_eval %= args.eval_freq
                # evaluations.append(evaluate_policy(policy, writer, total_timesteps, args, episode_num))
                save_model = file_name + '-{}'.format(episode_num)
                policy.save(save_model, directory="./pytorch_models")
                np.save("./results/%s" % (file_name), evaluations)
            # When the episode is done, we reset the state of the environment
            env_info = env.reset(train_mode=True)[brain_name]
            obs = env_info.vector_observations[0]
            # Set the Done to False
            done = False
            # Set rewards and episode timesteps to zero
            episode_reward = 0
            episode_timesteps = 0

        # Before args.start_timesteps timesteps, we play random actions
        if total_timesteps < args.start_timesteps:
            action = np.random.randn(brain.vector_action_space_size)
        else:
            action = policy.select_action(np.array(obs))
            # If the explore_noise parameter is not 0, we add noise to the action and we clip it
            if args.expl_noise != 0:
                action = (action + np.random.normal(
                    0, args.expl_noise, size=action_dim)).clip(-1, 1)
            else:
                action = (policy.select_action(np.array(obs)) + np.random.normal(
                    0, max_action * args.expl_noise, size=action_dim)).clip(-max_action, max_action)

        if total_timesteps % args.target_update_freq == 0:
            policy.hardupdate()

        # The agent performs the action in the environment, then reaches the next state and receives the reward
        env_info = env.step(action)[brain_name]    # send the action to the environment
        new_obs = env_info.vector_observations[0]  # get the next state
        reward = env_info.rewards[0]               # get the reward
        done = env_info.local_done[0]

        # We check if the episode is done
        done_bool = 0 if episode_timesteps + 1 == args.max_episode_steps else float(done)
        # We increase the total reward
        reward = reward * args.reward_scalling
        episode_reward += reward
        # We store the new transition into the Experience Replay memory (ReplayBuffer)
        replay_buffer.add((obs, new_obs, action, reward, done_bool))
        # We update the state, the episode timestep, the total timesteps,
        # and the timesteps since the evaluation of the policy
        obs = new_obs
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # We add the last policy evaluation to our list of evaluations and we save our model
    if args.save_model:
        policy.save("%s" % (file_name), directory="./pytorch_models")
    np.save("./results/%s" % (file_name), evaluations)
def ddqn_train(model_name, load_model=False, model_filename=None, optimizer_filename=None):
    print("DDQN -- Training")
    env = make('hungry_geese')
    trainer = env.train(['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])

    agent = DDQNAgent(rows=11, columns=11, num_actions=3)
    buffer = ReplayBuffer()
    strategy = EpsilonGreedyStrategy(start=0.5, end=0.0, decay=0.00001)

    if load_model:
        agent.load_model_weights(model_filename)
        agent.load_optimizer_weights(optimizer_filename)

    start_episode = 0
    end_episode = 50000
    epochs = 32
    batch_size = 128

    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []

    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = trainer.reset()
        epsilon = strategy.get_epsilon(episode - start_episode)
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0

        while not done:
            ep_steps += 1
            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_epsilon_greedy_action(state, epsilon)
            direction = get_direction(prev_direction, action)
            next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            buffer.add(state, action, reward, next_state, done)

            obs_dict = next_obs_dict
            prev_direction = direction
            ep_reward += reward

        if len(buffer) >= batch_size:
            for _ in range(epochs):
                states, actions, rewards, next_states, dones = buffer.get_samples(batch_size)
                agent.fit(states, actions, rewards, next_states, dones)

        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) +
              " - STEPS: " + str(ep_steps))

        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)

        if episode % 10 == 0:
            agent.update_target_network()

        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))
            print('Epsilon: ' + str(round(epsilon, 3)))

            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' + str(last_1000_ep_reward_mean))
            print()

        if episode % 1000 == 0:
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                epsilon = 0
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_epsilon_greedy_action(state, epsilon)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()

        if episode % 5000 == 0:
            agent.save_model_weights('models/ddqn_' + model_name + '_' + str(episode) + '.h5')
            agent.save_optimizer_weights('models/ddqn_' + model_name + '_' + str(episode) + '_optimizer.npy')

    agent.save_model_weights('models/ddqn_' + model_name + '_' + str(end_episode) + '.h5')
    agent.save_optimizer_weights('models/ddqn_' + model_name + '_' + str(end_episode) + '_optimizer.npy')

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], training_rewards)
    plt.title('Reward')
    plt.show()

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
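# The training loop above relies on an EpsilonGreedyStrategy helper that is not shown in this
# section. One plausible implementation, assuming an exponential decay from `start` toward `end`
# controlled by `decay` (matching the constructor arguments and get_epsilon() call used above):
import math


class EpsilonGreedyStrategy:
    def __init__(self, start, end, decay):
        self.start = start
        self.end = end
        self.decay = decay

    def get_epsilon(self, current_step):
        """Exponentially decayed exploration rate for the given step/episode index."""
        return self.end + (self.start - self.end) * math.exp(-self.decay * current_step)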
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, network):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            network (str): network variant, e.g. "duel" or "double"
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.network = network

        # Q-Network
        if self.network == "duel":
            self.qnetwork_local = DuelingDQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DuelingDQN(state_size, action_size, seed).to(device)
        else:
            self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, count):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, count)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, count):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        if self.network in ("double", "duel"):
            # Double DQN: select the best next actions with the current (local) Q-network ...
            Q_L = self.qnetwork_local(next_states).detach()
            _, actions_prime = Q_L.max(1)
            # ... then evaluate those actions with the frozen (target) network
            Q_targets_next = self.qnetwork_target(next_states).detach()
            Q_targets_next_s_a_prime = Q_targets_next.gather(1, actions_prime.unsqueeze(1))
        else:
            # Vanilla DQN: max predicted Q values for the next states from the target model
            Q_targets_next_s_a_prime = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next_s_a_prime * (1 - dones))

        # Get expected Q values from the local model for the actions taken
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.smooth_l1_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        # if count >= TARGET_UPDATE:
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
def train(env, env_eval, model, total_steps, view, criterion, optimizer, savedir, param_save_dir):
    try:
        os.mkdir("./params")
        print("Directory params Created")
    except FileExistsError:
        print("Directory params already exists")

    model_dir = "./params/{}".format(param_save_dir)
    try:
        os.mkdir(model_dir)
        print("Directory ", model_dir, " Created")
    except FileExistsError:
        print("Directory ", model_dir, " already exists")

    target_model = VanillaDQNCUDA(n_actions=env.action_space.n).to("cuda")
    memory = ReplayBuffer(MEM_SIZE)

    done = True
    episode = 0
    log_steps = 0
    rewards_history = []

    for step in range(1, total_steps + 1):
        try:
            if step % SAVE_FREQ == 0:
                save_model(model, step, savedir)

            if done:
                if episode > 0:
                    # Push the finished episode's transitions into the replay buffer
                    for i, experience in enumerate(trajectory):
                        obs, action, reward, next_obs, done = experience
                        memory.add(obs, action, reward, next_obs, done)
                    if log_steps >= LOG_EVERY:
                        log_steps = 0
                        episode_steps = step - episode_start_step
                        print("Episode: {} | Steps: {}/{} | Return: {}".format(
                            episode, episode_steps, step, episode_return))
                trajectory = []
                episode_start_step = step
                obs = np.array(env.reset())
                obs = obs.transpose((2, 0, 1))
                episode += 1
                episode_return = 0.0
                epsilon = update_epsilon(step)
            else:
                obs = next_obs

            action = agent_act(env, model, obs, epsilon)
            next_obs, reward, done, _ = env.step(action)
            next_obs = np.array(next_obs)
            next_obs = next_obs.transpose((2, 0, 1))
            episode_return += reward
            trajectory.append((obs, action, reward, next_obs, done))

            if step >= EXPLORE_STEPS and step % UPDATE_EVERY == 0:
                if step % TARGET_UPDATE_EVERY == 0:
                    target_model.load_state_dict(model.state_dict())
                batch = memory.sample(BATCH_SIZE)
                optimize(model, target_model, batch,
                         num_actions=env.action_space.n,
                         criterion=criterion, optimizer=optimizer)

            if step >= EXPLORE_STEPS and step % EVAL_EVERY == 0:
                episode_return_avg = evaluate(env_eval, model, view=view)
                print("Episode: {} | Steps: {} | Evaluation Return Avg: {}".format(
                    episode, step, episode_return_avg))
                rewards_history.append(episode_return_avg)

            log_steps += 1
        except KeyboardInterrupt:
            del trajectory[:]
            del rewards_history[:]
            break

    pickle.dump([rewards_history], open(model_dir + '/' + "model_test_rewards.p", "wb+"))
    env.close()
    env_eval.close()
    torch.cuda.empty_cache()
    return rewards_history
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Agent network
        ## TODO: Initialize your action network here
        "*** YOUR CODE HERE ***"
        self.network = AgentNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(), lr=LR)
        self.network.train()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.0, get_prob=False):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        if get_prob:
            return action_values.cpu().data.numpy()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def discount_rewards(self, rewards, gamma=0.99):
        r = np.array([gamma**i * rewards[i] for i in range(len(rewards))])
        # Reverse the array direction for cumsum and then
        # revert back to the original order
        r = r[::-1].cumsum()[::-1]
        return r - r.mean()

    def learn(self, experiences, gamma=GAMMA):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        rewards = self.discount_rewards(rewards)

        ## TODO: compute and minimize the loss using REINFORCE
        "*** YOUR CODE HERE ***"
        self.optimizer.zero_grad()
        state_tensor = torch.FloatTensor(states)
        reward_tensor = torch.FloatTensor(rewards)
        action_tensor = torch.LongTensor(actions)

        # Calculate loss
        logprob = torch.log(self.network.forward(state_tensor))
        selected_logprobs = reward_tensor * logprob[np.arange(len(action_tensor)), action_tensor]
        loss = -selected_logprobs.mean()

        # Calculate gradients
        loss.backward()
        # Apply gradients
        self.optimizer.step()
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_episodes, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed num_episodes (int): number of training epochs """ self.state_size = state_size self.action_size = action_size self.seed = seed # Q-Network self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.anneal_beta = (1. - BETA) / num_episodes self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, ALPHA, BETA) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.t_learning_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def update_weights(self): self.memory.anneal_beta(self.anneal_beta) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones, idxs, weights = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # update priorities updates = torch.abs(Q_expected - Q_targets).cpu().data.squeeze(1).numpy() self.memory.update_priorities(idxs, updates) # Compute loss loss = F.l1_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() (loss * weights).mean().backward() self.optimizer.step() # ------------------- update target network ------------------- # self.t_learning_step += 1 if self.t_learning_step % UPDATE_TARGET_STEPS == 0: self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            # PyTorch in-place copy: destination.data.copy_(source.data)
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
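# For reference, the prioritized replay mechanics the agent above relies on
# (alpha-exponentiated priorities, importance-sampling weights) can be sketched
# independently of the ReplayBuffer class; this is an illustration, not that class.
import numpy as np

def prioritized_sampling_example(priorities, batch_size, alpha=0.6, beta=0.4):
    """priorities: 1-D numpy array of TD-error priorities p_i.
    Converts them into sampling probabilities p_i^alpha / sum_j p_j^alpha and
    importance-sampling weights (N * P(i))^-beta, normalized by their max."""
    probs = priorities ** alpha
    probs /= probs.sum()
    idxs = np.random.choice(len(priorities), batch_size, p=probs)
    weights = (len(priorities) * probs[idxs]) ** (-beta)
    weights /= weights.max()
    return idxs, weights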
class Agent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, apply_dueling=False, apply_double=False): """ Initialize a Unity agent object. :param state_size: (int) dimension of each state :param action_size: (int) dimension of each action :param seed: (int) random seed """ assert(self._true_xor(apply_dueling, apply_double), "Choose one between dueling networks or DDQN") self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.apply_dueling = apply_dueling self.apply_double = apply_double # Q-Network self.q_net_target = QNetwork(state_size, action_size, seed, apply_dueling=apply_dueling).to(device) self.q_net_local = QNetwork(state_size, action_size, seed, apply_dueling=apply_dueling).to(device) self.opt = optim.Adam(self.q_net_local.parameters(), lr=LR) # Replay memory self.memory_buffer = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 @staticmethod def _true_xor(*args): return sum(args) == 1 def step(self, state, action, reward, next_state, done): """ Save experience in replay memory buffer for future experience replay :param state: The current state of the agent :param action: The action that the agent has taken in given state :param reward: The reward associated with the state action combination :param next_state: The resulting state after taking action in previous state :param done: (bool) Has the terminal state been reached? :return: None """ self.memory_buffer.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_CYCLE if self.t_step == 0: # If enough samples are available in memory, get random subset and learn from it if BATCH_SIZE < len(self.memory_buffer): experiences = self.memory_buffer.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """ Returns actions for given state as per current policy. :param state: (array_like) current state :param eps: (float) epsilon, for epsilon-greedy action selection :return: (int) The index of the action to be taken by the agent """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.q_net_local.eval() with torch.no_grad(): # Do not perform a forward pass in this context action_values = self.q_net_local(state) self.q_net_local.train() # Epsilon-greedy action selection greed_p = random.random() return np.argmax(action_values.cpu().data.numpy()) if greed_p > eps else \ random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """ Update value parameters using given batch of experience tuples. :param experiences: (Tuple[torch.Tensor]) tuple of (s, a, r, s', done) tuples :param gamma: (float) discount factor :return: """ states, actions, rewards, next_states, done_signals = experiences if not self.apply_double: # Get max predicted Q values for the next state of the target model. Q_targets_next = self.q_net_target(next_states).detach().max(1)[0].unsqueeze(1) else: # In the case of Double-DQN, evaluate the best selected action with the target model's set of parameters. 
            # The indices of the best next actions according to the local (online) network
            indices = torch.argmax(self.q_net_local(next_states).detach(), 1)
            # Evaluate those actions with the target network's parameters
            Q_targets_next = self.q_net_target(next_states).detach().gather(1, indices.unsqueeze(1))

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - done_signals))

        # Get expected Q values from the local model (being trained).
        # x.gather(1, actions) selects, along dim 1, the Q-value of the action
        # actually taken in each row of the batch.
        Q_expected = self.q_net_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()

        # Perform target network update
        self.soft_update(self.q_net_local, self.q_net_target, TAU)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """
        Soft update model parameters, given by: θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_model: (PyTorch model) weights will be copied from
        :param target_model: (PyTorch model) weights will be copied to
        :param tau: (float) interpolation parameter
        :return:
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
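# A side-by-side sketch of the two targets the learn() method above can compute,
# on precomputed Q tables of shape [batch, n_actions]; rewards and dones are
# expected as column vectors of shape [batch, 1]. Illustrative only.
import torch

def dqn_vs_double_dqn_targets(q_local_next, q_target_next, rewards, dones, gamma=0.99):
    # Plain DQN: max over the target network's Q-values.
    dqn_target = rewards + gamma * q_target_next.max(1, keepdim=True)[0] * (1 - dones)
    # Double DQN: select the action with the local network, evaluate it with the target network.
    best = q_local_next.argmax(1, keepdim=True)
    ddqn_target = rewards + gamma * q_target_next.gather(1, best) * (1 - dones)
    return dqn_target, ddqn_target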
class DQN_agent(object): def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)): self.env = env self.max_episode_steps = env._max_episode_steps """ beta: The discounted factor of Q-value function (epsilon): The explore or exploit policy epsilon. initial_epsilon: When the 'steps' is 0, the epsilon is initial_epsilon, 1 final_epsilon: After the number of 'steps' reach 'epsilon_decay_steps', The epsilon set to the 'final_epsilon' determinately. epsilon_decay_steps: The epsilon will decrease linearly along with the steps from 0 to 'epsilon_decay_steps'. """ self.beta = hyper_params['beta'] self.initial_epsilon = 1 self.final_epsilon = hyper_params['final_epsilon'] self.epsilon_decay_steps = hyper_params['epsilon_decay_steps'] """ episode: Record training episode steps: Add 1 when predicting an action learning: The trigger of agent learning. It is on while training agent. It is off while testing agent. action_space: The action space of the current environment, e.g 2. """ self.episode = 0 self.steps = 0 self.best_reward = 0 self.learning = True self.action_space = action_space """ input_len The input length of the neural network. It equals to the length of the state vector. output_len: The output length of the neural network. It is equal to the action space. eval_model: The model for predicting action for the agent. target_model: The model for calculating Q-value of next_state to update 'eval_model'. use_target_model: Trigger for turn 'target_model' on/off """ state = env.reset() input_len = len(state) output_len = action_space self.eval_model = DQNModel(input_len, output_len, learning_rate=hyper_params['learning_rate']) self.use_target_model = hyper_params['use_target_model'] if self.use_target_model: self.target_model = DQNModel(input_len, output_len) # memory: Store and sample experience replay. self.memory = ReplayBuffer(hyper_params['memory_size']) """ batch_size: Mini batch size for training model. update_steps: The frequence of traning model model_replace_freq: The frequence of replacing 'target_model' by 'eval_model' """ self.batch_size = hyper_params['batch_size'] self.update_steps = hyper_params['update_steps'] self.model_replace_freq = hyper_params['model_replace_freq'] print("agent initialized") # Linear decrease function for epsilon def linear_decrease(self, initial_value, final_value, curr_steps, final_decay_steps): decay_rate = curr_steps / final_decay_steps if decay_rate > 1: decay_rate = 1 return initial_value - (initial_value - final_value) * decay_rate def explore_or_exploit_policy(self, state): p = uniform(0, 1) # Get decreased epsilon epsilon = self.linear_decrease(self.initial_epsilon, self.final_epsilon, self.steps, self.epsilon_decay_steps) #if(np.random.randint(1000)==4): #print("epsilon",epsilon) if p < epsilon: #return action return randint(0, self.action_space - 1) else: #return action return self.greedy_policy(state) def greedy_policy(self, state): return self.eval_model.predict(state) # This next function will be called in the main RL loop to update the neural network model given a batch of experience # 1) Sample a 'batch_size' batch of experiences from the memory. # 2) Predict the Q-value from the 'eval_model' based on (states, actions) # 3) Predict the Q-value from the 'target_model' base on (next_states), and take the max of each Q-value vector, Q_max # 4) If is_terminal == 1, q_target = reward + discounted factor * Q_max, otherwise, q_target = reward # 5) Call fit() to do the back-propagation for 'eval_model'. 
def update_batch(self): if len(self.memory ) < self.batch_size or self.steps % self.update_steps != 0: return #print("fetching minibatch from replay memory") batch = self.memory.sample(self.batch_size) (states, actions, reward, next_states, is_terminal) = batch states = states next_states = next_states terminal = FloatTensor([1 if t else 0 for t in is_terminal]) reward = FloatTensor(reward) batch_index = torch.arange(self.batch_size, dtype=torch.long) # Current Q Values _, q_values = self.eval_model.predict_batch(states) #q_values = q_values[np.arange(self.batch_size), actions] q_values = q_values[batch_index, actions] # Calculate target if self.use_target_model: #print("target_model.predict") best_actions, q_next = self.target_model.predict_batch(next_states) else: best_actions, q_next = self.eval_model.predict_batch(next_states) q_max = q_next[batch_index, best_actions] terminal = 1 - terminal q_max *= terminal q_target = reward + self.beta * q_max # update model self.eval_model.fit(q_values, q_target) def learn_and_evaluate(self, training_episodes, test_interval): test_number = training_episodes // test_interval all_results = [] for i in range(test_number): # learn self.learn(test_interval) # evaluate avg_reward = self.evaluate() all_results.append(avg_reward) return all_results def learn(self, test_interval): for episode in tqdm(range(test_interval), desc="Training"): state = self.env.reset() done = False steps = 0 while steps < self.max_episode_steps and not done: #INSERT YOUR CODE HERE # add experience from explore-exploit policy to memory action = self.explore_or_exploit_policy(state) next_state, reward, done, info = self.env.step(action) self.memory.add(state, action, reward, next_state, done) # update the model every 'update_steps' of experience self.update_batch() # update the target network (if the target network is being used) every 'model_replace_freq' of experiences if self.use_target_model and (self.steps % self.model_replace_freq == 0): self.target_model.replace(self.eval_model) self.steps += 1 steps += 1 state = next_state def evaluate(self, trials=30): total_reward = 0 for _ in tqdm(range(trials), desc="Evaluating"): state = self.env.reset() done = False steps = 0 while steps < self.max_episode_steps and not done: steps += 1 action = self.greedy_policy(state) state, reward, done, _ = self.env.step(action) total_reward += reward avg_reward = total_reward / trials print(avg_reward) f = open(result_file, "a+") f.write(str(avg_reward) + "\n") f.close() if avg_reward >= self.best_reward: self.best_reward = avg_reward self.save_model() return avg_reward # save model def save_model(self): self.eval_model.save(result_floder + '/best_model.pt') # load model def load_model(self): self.eval_model.load(result_floder + '/best_model.pt')
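# A hedged usage sketch for the DQN_agent above. The hyper-parameter keys match
# the constructor; the values, episode counts, and environment are illustrative only.
hyper_params = {
    'beta': 0.99,
    'final_epsilon': 0.1,
    'epsilon_decay_steps': 100000,
    'learning_rate': 0.0003,
    'use_target_model': True,
    'memory_size': 100000,
    'batch_size': 32,
    'update_steps': 10,
    'model_replace_freq': 2000,
}
# agent = DQN_agent(env, hyper_params)
# results = agent.learn_and_evaluate(training_episodes=10000, test_interval=50)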
def train(self): # initialize memory buffer buffer = ReplayBuffer(int(500000), self.batch_size, self.num_agents, 0) # use keep_awake to keep workspace from disconnecting for episode in range(self.number_of_episodes): env_info = self.env.reset(train_mode=True)[self.brain_name] agent_episode_rewards = [0, 0] for agent in self.maddpg.ddpg_agents: agent.noise.reset() for episode_t in range(self.max_episode_len): states = env_info.vector_observations states_t = to_tensor(states) with torch.no_grad(): action_ts = self.maddpg.act(states_t, noise=self.noise) self.noise *= self.noise_reduction actions = torch.stack(action_ts).numpy() env_info = self.env.step(actions)[self.brain_name] next_states = env_info.vector_observations rewards = env_info.rewards dones = env_info.local_done for i in range(self.num_agents): agent_episode_rewards[i] += rewards[i] full_state = np.concatenate(states) full_next_state = np.concatenate(next_states) buffer.add((states, full_state, actions, rewards, next_states, full_next_state, dones)) # update once after every episode_per_update critic_losses = [] actor_losses = [] if len(buffer) > self.batch_size and episode % self.episode_per_update == 0: for i in range(self.num_agents): samples = buffer.sample() cl, al = self.maddpg.update(samples, i) critic_losses.append(cl) actor_losses.append(al) self.maddpg.update_targets() # soft update the target network towards the actual networks if np.any(dones): # if any of the agents are done break break episode_reward = max(agent_episode_rewards) self.episode_rewards.append(episode_reward) self.last_100_episode_rewards.append(episode_reward) self.avg_rewards.append(np.mean(self.last_100_episode_rewards)) # scores.append(episode_reward) print('\rEpisode {}\tAverage Score: {:.4f}\tScore: {:.4f}'.format(episode, self.avg_rewards[-1], episode_reward), end="") if episode % self.print_period == 0: print('\rEpisode {}\tAverage Score: {:.4f}'.format(episode, self.avg_rewards[-1])) # saving successful model # training ends when the threshold value is reached. if self.avg_rewards[-1] >= self.threshold: save_dict_list = [] for i in range(self.num_agents): save_dict = {'actor_params': self.maddpg.ddpg_agents[i].actor.state_dict(), 'actor_optim_params': self.maddpg.ddpg_agents[i].actor_optimizer.state_dict(), 'critic_params': self.maddpg.ddpg_agents[i].critic.state_dict(), 'critic_optim_params': self.maddpg.ddpg_agents[i].critic_optimizer.state_dict()} save_dict_list.append(save_dict) torch.save(save_dict_list, self.ckpt) raw_score_plotter(self.episode_rewards) plotter('Tennis', len(self.episode_rewards), self.avg_rewards, self.threshold) break
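# The train() loop above calls a to_tensor helper that is not shown here; a
# plausible one-liner consistent with how it is used on the per-agent
# observation array (an assumption, not the original helper) is:
import numpy as np
import torch

def to_tensor(x):
    """Hypothetical helper: convert a numpy observation array to a float tensor."""
    return torch.from_numpy(np.asarray(x)).float()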
class DDPG_Agent: def __init__(self, state_size, action_size, random_seed, actor_hidden=[400, 300], critic_hidden=[400, 300], id=0): super(DDPG_Agent, self).__init__() self.actor_local = Actor(state_size, action_size, random_seed, hidden_layer_param=actor_hidden).to(DEVICE) self.actor_target = Actor(state_size, action_size, random_seed, hidden_layer_param=actor_hidden).to(DEVICE) self.critic_local = Critic(state_size, action_size, random_seed, hidden_layer_param=critic_hidden).to(DEVICE) self.critic_target = Critic( state_size, action_size, random_seed, hidden_layer_param=critic_hidden).to(DEVICE) self.actor_opt = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) self.critic_opt = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC) self.memory = ReplayBuffer(action_size, random_seed) self.seed = random.seed(random_seed) self.id = id print(critic_hidden) print("") print("--- Agent {} Params ---".format(self.id)) print("Going to train on {}".format(DEVICE)) print("Learning Rate:: Actor: {} | Critic: {}".format( LR_ACTOR, LR_CRITIC)) print( "Replay Buffer:: Buffer Size: {} | Sampled Batch size: {}".format( BUFFER_SIZE, BATCH_SIZE)) print("") print("Actor paramaters:: Input: {} | Hidden Layers: {} | Output: {}". format(state_size, actor_hidden, action_size)) print("Critic paramaters:: Input: {} | Hidden Layers: {} | Output: {}". format(state_size, [critic_hidden[0] + action_size, *critic_hidden[1:]], 1)) print(self.actor_local) print(self.critic_local) print("") print("") # def act(self, state): # state = torch.from_numpy(state).float().to(DEVICE) # self.actor_local.eval() # with torch.no_grad(): # actions = self.actor_local(state).cpu().data.numpy() # self.actor_local.train() # return actions def act(self, obs, noise=0.0): obs = obs.to(DEVICE) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(obs) #+ noise*self.noise.noise() return action def step(self, state, action, reward, next_state, done): # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences) def learn(self, experiences): states, actions, rewards, next_states, dones = experiences # --- Teach Critic (with TD) --- # recommended_actions = self.actor_target(next_states) Q_nexts = self.critic_target(next_states, recommended_actions) Q_targets = (rewards + GAMMA * Q_nexts * (1 - dones) ) # This is what we actually got from experience Q_expected = self.critic_local( states, actions ) # This is what we thought the expected return of that state-action is. critic_loss = CRITERION(Q_targets, Q_expected) self.critic_opt.zero_grad() critic_loss.backward() self.critic_opt.step() # --- Teach Actor --- # next_actions = self.actor_local(states) # Here we get the value of each state-actions. # This will be backpropagated to the weights that produced the action in the actor network. # Large values will make weights stronger, smaller values (less expected return for that state-action) weaker actor_loss = -self.critic_local(states, next_actions).mean() self.actor_opt.zero_grad() actor_loss.backward() self.actor_opt.step() # Mix model parameters in both Actor and Critic # self.soft_update(self.actor_local, self.actor_target) self.soft_update(self.critic_local, self.critic_target) def soft_update(self, local, target): """Soft update model parameters. 
        θ_target = TAU*θ_local + (1 - TAU)*θ_target

        Params
        ======
            local: PyTorch model (weights will be copied from)
            target: PyTorch model (weights will be copied to)

        The interpolation parameter is the module-level constant TAU.
        """
        for target_param, local_param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(TAU * local_param.data + (1.0 - TAU) * target_param.data)
class Actor(): def __init__(self, action_size, state_size,buffer_size, batch_size,actor_lr,critic_lr,device,weight_decay, tau,shared_memory,noise, share_memory_flag, seed=0): self.state_size = state_size self.action_size = action_size self.buffer_size = buffer_size self.batch_size = batch_size self.actor_lr = actor_lr self.weight_decay = weight_decay self.device = device self.seed= seed self.actor_loss =[] #self.critic_loss =[] torch.manual_seed(seed) np.random.seed(seed) self.tau = tau self.noise= OUNoise(self.action_size,self.seed) #self.noise = noise self.share_memory_flag = share_memory_flag if self.share_memory_flag: self.memory = shared_memory else: self.memory = ReplayBuffer(action_size, buffer_size, batch_size, self.device) ## Actor self.actor_local = ActorNN(self.state_size,self.action_size).to(self.device) self.actor_target = ActorNN(self.state_size,self.action_size).to(self.device) self.actor_optimizer = Adam(self.actor_local.parameters(), lr = self.actor_lr) ## Critic #self.critic_local = Critic(self.state_size,self.action_size).to(self.device) #self.critic_target = Critic(self.state_size,self.action_size).to(self.device) #self.critic_optimizer = Adam(self.critic_local.parameters(), lr = self.critic_lr, weight_decay=self.weight_decay) # initialize targets same as original networks self.hard_update(self.actor_target, self.actor_local) #self.hard_update(self.critic_target, self.critic_local) def reset(self): self.noise.reset() def act(self, state,noise = True,sd=1e-4): state = torch.from_numpy(state).float().to(self.device) self.actor_local.eval() with torch.no_grad(): #print(state.shape) action = self.actor_local(state).cpu().data.numpy() ##action.cpu().detach().numpy() self.actor_local.train() if noise: #print(type(action)) #action += np.random.normal(loc=0.0, scale=sd, size=action.size) action += self.noise.sample() action = np.clip(action, -1,1).reshape(1,-1) return action def hard_update(self,target, source): """ Copy network parameters from source to target Inputs: target (torch.nn.Module): Net to copy parameters to source (torch.nn.Module): Net whose parameters to copy """ for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data) def step(self, state, action, rewards, next_state, done,GAMMA=1.0): ## As per the description we are not supposed to use discount factor self.memory.add(state, action, rewards, next_state, done)
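# The OUNoise exploration process used by this Actor (and by the DDPG agents
# below) is not shown; a minimal Ornstein-Uhlenbeck sketch with conventional
# mu/theta/sigma defaults (assumed values, not necessarily the original ones):
import numpy as np

class OUNoise:
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.RandomState(seed)
        self.reset()

    def reset(self):
        self.state = self.mu.copy()

    def sample(self):
        # Mean-reverting drift plus Gaussian diffusion
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.randn(*self.state.shape)
        self.state = self.state + dx
        return self.state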
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, number_agents, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed number_agents (int): number of agents """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.number_agents = number_agents # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise processes self.noise = OUNoise((number_agents, action_size), random_seed) #self.noise = GaussianNoise(size=[number_agents,action_size], seed = 0,sigma=2e-1) #self.noise = GeometricBrownianNoise(size=[number_agents,action_size], seed = 0,sigma=2e-1) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done): """Save experiences in replay memory, and use random sample from buffer to learn.""" # We save experience tuples in the memory for each agent. for i in range(self.number_agents): self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i]) # Learn, if enough samples are available in memory (threshold value: BATCH_SIZE) and at learning interval settings if len(self.memory) > BATCH_SIZE: for _ in range(UPDATE_RATE): experiences = self.memory.sample() self.learn(experiences, GAMMA) # def act(self, states, add_noise=True): # """Returns actions for given state as per current policy.""" # # The code has been adapted to implement batch normalization. # actions = np.zeros((self.number_agents, self.action_size)) # self.actor_local.eval() # with torch.no_grad(): # for agent_number, state in enumerate(states): # state = torch.from_numpy(state).float().unsqueeze(0).to(device) # The code has been adapted to implement batch normalization. # action = self.actor_local(state).cpu().data.numpy() # actions[agent_number, :] = action # self.actor_local.train() # if add_noise: # actions += self.noise.sample() # return np.clip(actions, -1, 1) def act(self, states, add_noise=True): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.number_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for agent_number, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_number, :] = action self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, mnoise=True, split_state=True): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.mnoise = mnoise self.split_state = split_state # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # initialize targets same as original networks self.hard_update(self.actor_target, self.actor_local) self.hard_update(self.critic_target, self.critic_local) # Noise process if self.mnoise: self.noise = OUNoise((2, action_size), random_seed) else: self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, states, actions, rewards, next_states, dones, step): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward if self.split_state: for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) else: self.memory.add(states, actions, rewards, next_states, dones) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def hard_update(self, target, source): """ Copy network parameters from source to target Inputs: target (torch.nn.Module): Net to copy parameters to source (torch.nn.Module): Net whose parameters to copy """ for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data)
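# Note: torch.nn.utils.clip_grad_norm (used in learn() above) is deprecated in
# recent PyTorch releases in favor of the in-place clip_grad_norm_. A minimal,
# self-contained demonstration of the current call (layer and data are dummies):
import torch
import torch.nn as nn

layer = nn.Linear(4, 1)
loss = layer(torch.randn(8, 4)).pow(2).mean()
loss.backward()
torch.nn.utils.clip_grad_norm_(layer.parameters(), max_norm=1.0)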
class Agent(): def __init__(self, state_size, action_size, batch_size=128, gamma=0.99, mean_lambda=1e-3, std_lambda=1e-3, z_lambda=0.0): self.state_size = state_size self.action_size = action_size self.batch_size = batch_size self.gamma = gamma self.memory = ReplayBuffer(BUFFERSIZE, self.batch_size) self.mean_lambda = mean_lambda self.std_lambda = std_lambda self.z_lambda = z_lambda self.current_value = Value(state_size).to(device) self.target_value = Value(state_size).to(device) self.softQ = soft_Q(state_size, action_size) self.policy = Policy(state_size, action_size) self.value_optimizer = optim.Adam(self.current_value.parameters(), lr=3e-4) self.soft_q_optimizer = optim.Adam(self.softQ.parameters(), lr=3e-4) self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=3e-4) def act(self, state): #state = torch.from_numpy(np.asarray(state)).float().to(device) action = self.policy.act(state) if self.memory.__len__() > self.batch_size: self.update() return action def add_to_memory(self, state, action, reward, next_state, done): self.memory.add(state, action, reward, next_state, done) def update(self): state, action, reward, next_state, done = self.memory.sample() expected_soft_q_value = self.softQ.forward(state, action) expected_value = self.current_value.forward(state) new_action, log_prob, z, mean, log_std = self.policy.evaluate(state) target_value = self.target_value.forward(next_state) next_soft_q_value = reward + self.gamma * target_value * (1 - done) q_val_mse = F.mse_loss(expected_soft_q_value, next_soft_q_value.detach()) expected_new_q_val = self.softQ.forward(state, new_action) next_value = expected_new_q_val - log_prob val_loss = F.mse_loss(expected_value, next_value.detach()) log_prob_target = expected_new_q_val - expected_value policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean() mean_loss = self.mean_lambda * mean.pow(2).mean() std_loss = self.std_lambda * log_std.pow(2).mean() z_loss = self.z_lambda * z.pow(2).sum(1).mean() policy_loss += mean_loss + std_loss + z_loss self.soft_q_optimizer.zero_grad() q_val_mse.backward() self.soft_q_optimizer.step() self.value_optimizer.zero_grad() val_loss.backward() self.value_optimizer.step() self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() self.soft_update(self.current_value, self.target_value, TAU) def soft_update(self, local_model, target_model, TRANSFER_RATE): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(TRANSFER_RATE * local_param.data + (1.0 - TRANSFER_RATE) * target_param.data)
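# The value-network target used in update() above follows the soft value
# relation V_target(s) = Q(s, a_new) - log_prob(a_new) for a freshly sampled
# action. A toy numeric check on dummy tensors (illustrative only):
import torch

expected_new_q_val = torch.tensor([[1.2], [0.7]])
log_prob = torch.tensor([[-0.3], [-1.1]])
next_value = expected_new_q_val - log_prob   # detached target for the value net
print(next_value)                            # tensor([[1.5000], [1.8000]])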
def train(sess, env, actor, critic, RESTORE): sess.run(tf.global_variables_initializer()) # Initialize random noise generator exploration_noise = OUNoise(env.action_space.n) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay buffER replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED) # Store q values for illustration purposes q_max_array = [] reward_array = [] for i in range(MAX_EPISODES): s = env.reset() ep_reward = 0 ep_ave_max_q = 0 for j in range(MAX_EP_STEPS): # if i % 40 == 0 and i > 1: # env.render() # Begin "Experimentation and Evaluation Phase" # Seleect next experimental action by adding noise to action prescribed by policy a = actor.predict(np.reshape(s, (1, actor.s_dim))) # If in a testing episode, do not add noise # if i%100 is not 49 and i%100 is not 99: noise = exploration_noise.noise() a = a + noise # Take step with experimental action action = np.argmax(a) s2, r, terminal, info = env.step(action) # s2, r, terminal, info = env.step(np.reshape(a.T,newshape=(env.action_space.n,))) # Add transition to replay buffer if not testing episode # if i%100 is not 49 and i%100 is not 99: replay_buffer.add(np.reshape(s, (actor.s_dim, )), np.reshape(a, (actor.a_dim, )), r, terminal, np.reshape(s2, (actor.s_dim, ))) # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > MINIBATCH_SIZE: s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch( MINIBATCH_SIZE) # Find target estimate to use for updating the Q-function # Predict_traget function determines Q-value of next state target_q = critic.predict_target( s2_batch, actor.predict_target(s2_batch)) # Complete target estimate (R(t+1) + Q(s(t+1),a(t+1))) y_i = [] for k in range(MINIBATCH_SIZE): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + GAMMA * target_q[k]) # Perform gradient descent to update critic predicted_q_value, _ = critic.train( s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1))) ep_ave_max_q += np.amax(predicted_q_value, axis=0) # Perform "Learning" phase by moving policy parameters in direction of deterministic policy gradient a_outs = actor.predict(s_batch) grads = critic.action_gradients(s_batch, a_outs) actor.train(s_batch, grads[0]) # Update target networks actor.update_target_network() critic.update_target_network() s = s2 ep_reward += r # If episode is finished, print results if terminal: print('| Reward: %.2i' % int(ep_reward), " | Episode", i, '| Qmax: %.4f' % (ep_ave_max_q / float(j))) q_max_array.append(ep_ave_max_q / float(j)) #reward_array.append(ep_reward) break ep_reward = 0 s = env.reset() for j in range(MAX_EP_STEPS): a = actor.predict(np.reshape(s, (1, actor.s_dim))) # Take step with experimental action action = np.argmax(a) s2, r, terminal, info = env.step(action) ep_reward += r s = s2 if terminal: print('Normal | Reward: %.2i' % int(ep_reward), " | Episode", i) reward_array.append(ep_reward) break # Max Q plot plt.plot(range(1, MAX_EPISODES + 1), q_max_array, 'b-') plt.xlabel('Episode Number') plt.ylabel('Max Q-Value') plt.savefig('Q.png') plt.show() # Reward plot plt.plot(range(1, MAX_EPISODES + 1), reward_array, 'g-') plt.xlabel('Episode Number') plt.ylabel('Reward') plt.savefig('Reward.png') plt.show() save_result([[str(i[0]) for i in q_max_array], [str(i) for i in reward_array]])
class Agent(): def __init__(self, state_space, action_space, memory_size=1000000, batch_size=32, seed=0, q_size=51): self.state_space = state_space self.action_space = action_space self.memory_size = memory_size self.batch_size = batch_size self.seed = seed self.q_size = q_size self.current_model = QDQN(self.state_space, self.action_space, n_quantiles=self.q_size).to(device) self.target_model = QDQN(self.state_space, self.action_space, n_quantiles=self.q_size).to(device) self.optimizer = Adam(self.current_model.parameters(), lr=LR) self.memory = ReplayBuffer(self.action_space, self.memory_size, self.batch_size, self.seed) self.update_every = 0 self.tau = (torch.Tensor( (2 * np.arange(self.current_model.n_quantiles) + 1) / (2.0 * self.current_model.n_quantiles)).view(1, -1)).to(device) def soft_update(self, local_model, target_model, TRANSFER_RATE): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(TRANSFER_RATE * local_param.data + (1.0 - TRANSFER_RATE) * target_param.data) def act(self, state, epsilon): if random.random() <= epsilon: action = random.choice(np.arange(self.action_space)) else: action = self.current_model.act(state).cpu().numpy() #action = self.current_model.act(state, epsilon).cpu().numpy() return action def step(self, state, action, reward, next_state, done): self.memory.add(state, action, reward, next_state, done) self.update_every += 1 if self.update_every % UPDATE_FREQUENCY == 0: if len(self.memory) >= self.batch_size: experience = self.memory.sample() self.learn(experience, GAMMA) def learn(self, experience, gamma): sampled_state, sampled_action, sampled_reward, sampled_next_state, sampled_done = experience #print(self.current_model(sampled_state).shape) #print(self.current_model(sampled_state)[0:self.batch_size, 0: self.action_space]) #print(self.current_model(sampled_state)) #print(self.current_model(sampled_state).shape) #print(sampled_action.shape) #print(sampled_action.expand(self.batch_size, self.q_size)) #print(sampled_action.unsqueeze(1).expand(self.batch_size, 1, self.q_size).shape) action = sampled_action.unsqueeze(1).expand(self.batch_size, 1, self.q_size) #print(self.current_model(sampled_state)) #print(self.current_model(sampled_state).gather(1, action).squeeze(1)) theta = self.current_model(sampled_state).gather(1, action).squeeze(1) #theta = self.current_model(sampled_state).mean(2) z_next = self.target_model(sampled_next_state).detach() #print(z_next) #print(z_next.shape) z_next_max = z_next[np.arange(self.batch_size), z_next.mean(2).max(1)[1]] #print(z_next_max) Ttheta = sampled_reward + GAMMA * (1 - sampled_done) * z_next_max #print(Ttheta) #print(Ttheta.shape) #print(theta.shape) diff = Ttheta.t().unsqueeze(-1) - theta loss = self.huber(diff) * (self.tau - (diff.detach() < 0).float()).abs() loss = loss.mean() self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.current_model, self.target_model, TRANSFER_RATE) def huber(self, x, k=1.0): return torch.where(x.abs() < k, 0.5 * x.pow(2), k * (x.abs() - 0.5 * k))
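# The quantile regression loss inside learn() above is easier to read in
# isolation. This standalone sketch mirrors the same math; tensor shapes are
# noted in the docstring and tau holds the quantile midpoints from __init__.
import torch

def quantile_huber_loss(theta, Ttheta, tau, k=1.0):
    """theta:  [batch, n_quantiles] current quantile estimates
    Ttheta: [batch, n_quantiles] target quantile estimates
    tau:    [1, n_quantiles] quantile midpoints"""
    diff = Ttheta.t().unsqueeze(-1) - theta                    # pairwise target minus current
    huber = torch.where(diff.abs() < k, 0.5 * diff.pow(2), k * (diff.abs() - 0.5 * k))
    loss = huber * (tau - (diff.detach() < 0).float()).abs()   # asymmetric quantile weighting
    return loss.mean()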
class Agent(): def __init__(self, env, memory_size=1000000, batch=128, sigma=0.2, noise_clip=0.5, gamma=0.99, update_frequency=2): self.states = env.observation_space self.state_size = env.observation_space.shape[0] self.actions = env.action_space self.action_size = env.action_space.shape[0] self.sigma = sigma self.noise_clip = noise_clip self.gamma = gamma self.update_frequency = update_frequency self.actor = Actor(self.state_size, self.action_size).to(device) self.critic0 = Critic(self.state_size, self.action_size).to(device) self.critic1 = Critic(self.state_size, self.action_size).to(device) self.target_actor = Actor(self.state_size, self.action_size).to(device) self.target_critic0 = Critic(self.state_size, self.action_size).to(device) self.target_critic1 = Critic(self.state_size, self.action_size).to(device) self.memory = ReplayBuffer(memory_size, batch) self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR) self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR) self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR) self.soft_update(self.actor, self.target_actor, 1) self.soft_update(self.critic0, self.target_critic0, 1) self.soft_update(self.critic1, self.target_critic1, 1) def act(self, state, step, epsilon=True): state = torch.from_numpy(np.asarray(state)).float().to(device) action = self.actor.forward(state) action = action.detach().cpu().numpy() if epsilon: noise = np.random.normal(0, 0.1, action.shape[0]) action += noise return action def update(self, step): state, action, reward, next_state, done = self.memory.sample() next_state_action = self.target_actor(next_state) noise = Normal(torch.zeros(self.action_size), self.sigma).sample() noise = torch.clamp(noise, -self.noise_clip, self.noise_clip).to(device) next_state_action += noise target_Q0 = self.target_critic0(next_state, next_state_action) target_Q1 = self.target_critic1(next_state, next_state_action) target_Q = torch.min(target_Q0, target_Q1) target_value = reward + self.gamma * target_Q * (1.0 - done) expected_Q0 = self.critic0(state, action) expected_Q1 = self.critic1(state, action) critic_0_loss = F.mse_loss(expected_Q0, target_value.detach()) critic_1_loss = F.mse_loss(expected_Q1, target_value.detach()) self.critic0_optimizer.zero_grad() critic_0_loss.backward() self.critic0_optimizer.step() self.critic1_optimizer.zero_grad() critic_1_loss.backward() self.critic1_optimizer.step() if step % self.update_frequency == 0: actor_loss = self.critic0.forward(state, self.actor.forward(state)) actor_loss = -actor_loss.mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() self.soft_update(self.critic0, self.target_critic0, TRANSFER_RATE) self.soft_update(self.critic1, self.target_critic1, TRANSFER_RATE) self.soft_update(self.actor, self.target_actor, TRANSFER_RATE) def soft_update(self, local_model, target_model, tao): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tao * local_param.data + (1.0 - tao) * target_param.data) def add_to_memory(self, state, action, reward, next_state, done): self.memory.add(state, action, reward, next_state, done)
class Christophers_Agent(): def __init__(self, task): # Task (environment) information self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high self.action_range = self.action_high - self.action_low self.w = np.random.normal( size=( self.state_size, self.action_size ), # weights for simple linear policy: state_space x action_space scale=(self.action_range / (2 * self.state_size) )) # start producing actions in a decent range self.actor = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.critic = Critic(self.state_size, self.action_size) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.critic_target = Critic(self.state_size, self.action_size) self.gamma = 0.95 self.tau = 0.001 self.best_w = None self.best_score = -np.inf self.exploration_mu = 0.5 self.exploration_theta = 0.2 self.exploration_sigma = 0.4 self.noise = Noise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) self.buffer_size = 100000 self.batch_size = 32 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) self.best_score = -np.inf self.num_steps = 0 # Episode variables self.reset_episode() def reset_episode(self): if self.get_score() > self.best_score: self.best_score = self.get_score() self.total_reward = 0.0 self.num_steps = 0 self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): self.total_reward += reward self.num_steps += 1 self.memory.add(self.last_state, action, reward, next_state, done) if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) self.last_state = next_state def act(self, state): state = np.reshape(state, [-1, self.state_size]) action = self.actor.model.predict(state)[0] action = list(action + self.noise.sample()) # add some noise for exploration return action def get_score(self): return -np.inf if self.num_steps == 0 else self.total_reward / self.num_steps def learn(self, experiences): states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) done = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) Q_targets = rewards + self.gamma * Q_targets_next * (1 - done) self.critic.model.train_on_batch(x=[states, actions], y=Q_targets) action_gradients = np.reshape( self.critic.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor.train_fn([states, action_gradients, 1]) self.soft_update(self.critic.model, self.critic_target.model) self.soft_update(self.actor.model, self.actor_target.model) def soft_update(self, local_model, target_model): local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights) new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
def lunarworker(wid): import tensorflow as tf import numpy as np import gym import time import os from distagent import DistAgent from memory import ReplayBuffer from util import Linear, scale, RewMonitor, SkipEnv, StackEnv gpus = tf.config.experimental.get_visible_devices("GPU") # Select single gpu depending on wid total_gpus = 2 gpu_nr = wid % total_gpus tf.config.set_visible_devices(gpus[gpu_nr], 'GPU') # Restricts mem to allow multiple tf sessions on one GPU tf.config.experimental.set_memory_growth(gpus[gpu_nr], True) # Train parameters N = int(8e6) eps = Linear(startval=0.1, endval=0.01, exploresteps=int(200e3)) gamma = 0.99 updatefreq = 4 targetfreq = 1000 savefreq = 80000 # Setup env = gym.make("LunarLander-v2") env = RewMonitor(env) env = SkipEnv(env, skip=4) # env = StackEnv(env, n_frames=4) action_len = env.action_space.n agent = DistAgent(action_len, dense=16, supportsize=29, vmin=-7.0, vmax=7.0) mem = ReplayBuffer(size=int(20e3), batchsize=32) # Prefill tf.print("Collecting history...") prefill_end = int(10e3) state = env.reset() buff = [] for t in range(1, prefill_end + 1): action = env.action_space.sample() endstate, rew, done, _ = env.step(action) data = (state, action, scale(rew), gamma, endstate, float(done)) buff.append(data) if done: state = env.reset() else: state = endstate if t % 10000 == 0: tf.print(f"Collected {t} samples.") tf.print("Done.") tf.print("Storing history...") for data in buff: mem.add(data) tf.print("Done.") # Warm up states, _, _, _, _, _, = mem.sample() agent.probvalues(states) agent.t_probvalues(states) agent.update_target() # Initial dispatch tottime = time.time() # Training loop tf.print(f"Worker {wid} learning...") state = env.reset() episode_rewards = [] buff = [] for t in range(1, N + 1): t_eps = tf.constant(eps(t), dtype=tf.float32) action = agent.eps_greedy_action( state=np.reshape(state, [1, 8]).astype(np.float32), epsval=t_eps, )[0].numpy() endstate, rew, done, info = env.step(action) data = (state, action, scale(rew), gamma, endstate, float(done)) buff.append(data) if info["Game Over"]: score = info["Episode Score"] episode_rewards.append(score) state = env.reset() if len(episode_rewards) % 100 == 0: tmptime = time.time() msit = (tmptime - tottime) / t * 1000 ma100 = np.mean(episode_rewards[-111:-1]) epstr = (f"Epsiode: {len(episode_rewards)}, " + f"Step: {t}, " + f"MA100: {ma100}, " + f"AvgSpeed: {msit:4.2f} ms/it") tf.print(epstr) else: state = endstate if t % updatefreq == 0: for data in buff: mem.add(data) buff = [] (states, actions, drews, gexps, endstates, dones) = mem.sample() agent.train(states, actions, drews, gexps, endstates, dones) if t % targetfreq == 0: agent.update_target() if t % savefreq == 0: dir_str = f"lunarmodels/step{t}/" os.makedirs(dir_str, exist_ok=True) file_str = dir_str + "model-id-" + f"{wid}" + ".h5" agent.save(file_str) env.close() tmptime = time.time() tottime = tmptime - tottime msit = tottime / N * 1000 tf.print(f"Learning done in {tottime:6.0f}s using {msit:4.2f} ms/it.") tf.print("Done.")
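# SkipEnv is imported from the local util module and not shown. A common
# action-repeat wrapper consistent with SkipEnv(env, skip=4) looks like this;
# it is an assumption about that wrapper's behaviour, not the original code:
import gym

class SkipEnv(gym.Wrapper):
    def __init__(self, env, skip=4):
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        # Repeat the chosen action `skip` times and accumulate the reward.
        total_reward, done, info = 0.0, False, {}
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info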
class Agent(): def __init__(self, state_size, action_size, action_sigma=0.1, memory_size=1000000, batch=128, sigma=0.2, noise_clip=0.5, gamma=0.99, update_frequency=2, seed=0): ''' TD3 Agent :param state_size: State Dimension :param action_size: Action dimension :param action_sigma: standard deviation of the noise to be added to the action :param memory_size: :param batch: :param sigma: Standard deviation of the noise to be added to the target function (Chapter 5.3 of TD3 Paper) :param noise_clip: How much noise to allow :param gamma: :param update_frequency: :param seed: ''' self.state_size = state_size self.action_size = action_size self.action_sigma = action_sigma self.sigma = sigma self.noise_clip = noise_clip self.gamma = gamma self.update_frequency = update_frequency self.seed = seed self.actor = Actor(self.state_size, self.action_size).to(device) self.critic0 = Critic(self.state_size, self.action_size).to(device) #second Critic as described in the paper # https: // arxiv.org / pdf / 1802.09477.pdf self.critic1 = Critic(self.state_size, self.action_size).to(device) self.target_actor = Actor(self.state_size, self.action_size).to(device) self.target_critic0 = Critic(self.state_size, self.action_size).to(device) # second Critic as described in the paper # https: // arxiv.org / pdf / 1802.09477.pdf self.target_critic1 = Critic(self.state_size, self.action_size).to(device) self.memory = ReplayBuffer(memory_size, batch, seed=seed) self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR) self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR) self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR) self.soft_update(self.actor, self.target_actor, 1) self.soft_update(self.critic0, self.target_critic0, 1) self.soft_update(self.critic1, self.target_critic1, 1) def act(self, state, epsilon=True): state = torch.from_numpy(np.asarray(state)).float().to(device) self.actor.eval() with torch.no_grad(): action = self.actor.forward(state).cpu().data.numpy() self.actor.train() if epsilon: #if we want to inject some noise noise = np.random.normal(0, self.action_sigma, action.shape[0]) action += noise return action def update(self, step): ''' #https: // arxiv.org / pdf / 1802.09477.pdf the function is very similar to typical DDPG algorithm, except for 1) we have 2 critics to update 2) we take the min of the 2 values critics output 3) Has modified Target network with noise injected into it (Chapter 5.3 of the paper) 4) We delay updating the actor by certain steps :param step: how often to update the actor :return: ''' state, action, reward, next_state, done = self.memory.sample() # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models next_state_action = self.target_actor(next_state) #sample a random noise noise = Normal(torch.zeros(self.action_size), self.sigma).sample() noise = torch.clamp(noise, -self.noise_clip, self.noise_clip).to(device) next_state_action += noise target_Q0 = self.target_critic0(next_state, next_state_action) target_Q1 = self.target_critic1(next_state, next_state_action) target_Q = torch.min(target_Q0, target_Q1) target_value = reward + self.gamma * target_Q * (1.0 - done) expected_Q0 = self.critic0(state, action) expected_Q1 = self.critic1(state, action) critic_0_loss = F.mse_loss(expected_Q0, target_value.detach()) critic_1_loss = F.mse_loss(expected_Q1, target_value.detach()) self.critic0_optimizer.zero_grad() critic_0_loss.backward() 
        self.critic0_optimizer.step()

        self.critic1_optimizer.zero_grad()
        critic_1_loss.backward()
        self.critic1_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # As mentioned in the paper, we delay updating the actor network.
        if step % self.update_frequency == 0:
            # Compute actor loss
            actor_loss = self.critic0.forward(state, self.actor.forward(state))
            actor_loss = -actor_loss.mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # ----------------------- update target networks ------------------- #
            self.soft_update(self.critic0, self.target_critic0, TRANSFER_RATE)
            self.soft_update(self.critic1, self.target_critic1, TRANSFER_RATE)
            self.soft_update(self.actor, self.target_actor, TRANSFER_RATE)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def add_to_memory(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
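# A hedged end-to-end usage sketch for the TD3 agent above. The environment,
# episode count, and warm-up threshold are illustrative; the original training
# driver is not shown, and len() support on the ReplayBuffer is assumed.
import gym

env = gym.make('Pendulum-v1')
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0])

for episode in range(200):
    state = env.reset()
    done, step = False, 0
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.add_to_memory(state, action, reward, next_state, done)
        if len(agent.memory) > 128:      # batch size assumed from the constructor default
            agent.update(step)
        state = next_state
        step += 1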