class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed, memory=None,
                 buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE, gamma=GAMMA,
                 tau=TAU, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC,
                 weight_decay=WEIGHT_DECAY, pretrained_actor_weights=None,
                 pretrained_critic_weights=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)

        if pretrained_actor_weights:
            actor_weights = torch.load(pretrained_actor_weights)
            self.actor_local.load_state_dict(actor_weights)
            self.actor_target.load_state_dict(actor_weights)

        if pretrained_critic_weights:
            critic_weights = torch.load(pretrained_critic_weights)
            self.critic_local.load_state_dict(critic_weights)
            self.critic_target.load_state_dict(critic_weights)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        if memory:
            self.memory = memory
        else:
            self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device).unsqueeze(0)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
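# ---------------------------------------------------------------------------
# NOTE: not part of the original listing.  The Agent above relies on OUNoise
# and ReplayBuffer helpers (plus Actor, Critic, and a module-level `device`)
# defined elsewhere in the project.  The classes below are a minimal sketch of
# what those two helpers might look like, assuming the constructor signatures
# used above; they are illustrative, not the project's actual implementation.
# ---------------------------------------------------------------------------
import copy
import random
from collections import deque, namedtuple

import numpy as np
import torch


class OUNoise:
    """Ornstein-Uhlenbeck process: mean-reverting, temporally correlated noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to the mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples (returns torch tensors on `device`)."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences and stack them into tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)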
def main():
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("use_cuda: ", use_cuda)
    print("Device: ", device)

    env = atari_wrapper.make_atari('RiverraidNoFrameskip-v4')
    env = atari_wrapper.wrap_deepmind(env, clip_rewards=False, frame_stack=True, pytorch_img=True)
    action_space = [a for a in range(env.action_space.n)]
    n_action = len(action_space)

    # DQN model and optimizer
    policy_model = DQNModel().to(device)
    target_model = DQNModel().to(device)
    target_model.load_state_dict(policy_model.state_dict())
    optimizer = torch.optim.RMSprop(policy_model.parameters(), lr=lr, alpha=alpha)

    # Initialize the replay buffer with experience from a random policy
    replay_buffer = ReplayBuffer(rep_buf_size)
    while len(replay_buffer) < rep_buf_ini:
        observation = env.reset()
        done = False
        while not done:
            with torch.no_grad():
                t_observation = torch.from_numpy(observation).float().to(device)
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
                action = random.sample(range(len(action_space)), 1)[0]
            next_observation, reward, done, info = env.step(action_space[action])
            replay_buffer.push(observation, action, reward, next_observation, done)
            observation = next_observation
    print('Experience Replay buffer initialized')

    # Use a log file to record performance
    logger = logging.getLogger('dqn_Riverraid')
    logger.setLevel(logging.INFO)
    logger_handler = logging.FileHandler('./dqn_Riverraid.log')
    logger.addHandler(logger_handler)

    # Training loop
    env.reset()
    score = 0
    episode_score = []
    mean_episode_score = []
    episode_true = 0
    num_frames = 0
    episode = 0
    last_100episode_score = deque(maxlen=100)

    while episode < max_episodes:
        observation = env.reset()
        done = False
        while not done:
            # Epsilon-greedy action selection
            with torch.no_grad():
                t_observation = torch.from_numpy(observation).float().to(device) / 255
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
                epsilon = epsilon_by_frame(num_frames)
                if random.random() > epsilon:
                    q_value = policy_model(t_observation)
                    action = q_value.argmax(1).data.cpu().numpy().astype(int)[0]
                else:
                    action = random.sample(range(len(action_space)), 1)[0]

            next_observation, reward, done, info = env.step(action_space[action])
            num_frames += 1
            score += reward
            replay_buffer.push(observation, action, reward, next_observation, done)
            observation = next_observation

            # Update the policy network
            if len(replay_buffer) > batch_size and num_frames % skip_frame == 0:
                observations, actions, rewards, next_observations, dones = replay_buffer.sample(batch_size)

                observations = torch.from_numpy(np.array(observations) / 255).float().to(device)
                actions = torch.from_numpy(np.array(actions).astype(int)).float().to(device)
                actions = actions.view(actions.shape[0], 1)
                rewards = torch.from_numpy(np.array(rewards)).float().to(device)
                rewards = rewards.view(rewards.shape[0], 1)
                next_observations = torch.from_numpy(np.array(next_observations) / 255).float().to(device)
                dones = torch.from_numpy(np.array(dones).astype(int)).float().to(device)
                dones = dones.view(dones.shape[0], 1)

                q_values = policy_model(observations)
                next_q_values = target_model(next_observations)
                q_value = q_values.gather(1, actions.long())
                next_q_value = next_q_values.max(1)[0].unsqueeze(1)
                expected_q_value = rewards + gamma * next_q_value * (1 - dones)

                loss = huber_loss(q_value, expected_q_value)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Soft-update the target network toward the policy network
                for target_param, policy_param in zip(target_model.parameters(),
                                                      policy_model.parameters()):
                    target_param.data.copy_(TAU * policy_param.data + (1 - TAU) * target_param.data)

        episode += 1

        # The wrapped env ends an episode on loss of life; only score a "true"
        # episode once all lives are gone.
        if info['ale.lives'] == 0:
            mean_score = score
            episode_true += 1
            score = 0
            mean_episode_score.append(mean_score)
            last_100episode_score.append(mean_score)
            logger.info('Frame: ' + str(num_frames) + ' / Episode: ' + str(episode_true) +
                        ' / Average Score : ' + str(int(mean_score)) +
                        ' / epsilon: ' + str(float(epsilon)))
            pickle.dump(mean_episode_score, open('./dqn_Riverraid_mean_scores.pickle', 'wb'))
            if episode_true % 50 == 1:
                logger.info('Frame: ' + str(num_frames) + ' / Episode: ' + str(episode_true) +
                            ' / Average Score : ' + str(int(mean_score)) +
                            ' / epsilon: ' + str(float(epsilon)) +
                            ' / last_100episode_score: ' + str(float(np.mean(last_100episode_score))))

        if episode % 50 == 0:
            torch.save(target_model.state_dict(), './dqn_riverraid_target_model_state_dict.pt')
            torch.save(policy_model.state_dict(), './dqn_riverraid_model_state_dict.pt')
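# ---------------------------------------------------------------------------
# NOTE: not part of the original script.  main() assumes module-level helpers
# epsilon_by_frame() and huber_loss() (as well as constants such as lr, alpha,
# rep_buf_size, batch_size, gamma, TAU, skip_frame, and max_episodes) that are
# defined elsewhere.  The sketch below shows one plausible definition of the
# two helpers; the schedule constants here are illustrative assumptions only.
# ---------------------------------------------------------------------------
import math

import torch.nn.functional as F

# Assumed epsilon schedule (not the original project's values)
epsilon_start = 1.0
epsilon_final = 0.02
epsilon_decay = 100000


def epsilon_by_frame(frame_idx):
    """Exponentially anneal epsilon from epsilon_start toward epsilon_final."""
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-frame_idx / epsilon_decay)


def huber_loss(q_value, expected_q_value):
    """Huber (smooth L1) loss between predicted and target Q-values."""
    return F.smooth_l1_loss(q_value, expected_q_value)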
class MADDPG:
    def __init__(self, state_size, action_size, num_agents, random_seed):
        self.agents = [
            DDPG(state_size, action_size, num_agents, random_seed),
            DDPG(state_size, action_size, num_agents, random_seed)
        ]
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.eps = EPS_START
        self.eps_decay = 1 / EPS_EP_END  # set decay rate based on epsilon end target

    def act(self, states, add_noise=True):
        """Returns actions for the given states as per each agent's current policy."""
        actions = [agent.act(state, add_noise) for agent, state in zip(self.agents, states)]
        return actions

    def target_act(self, states):
        """Returns actions for the given states as per each agent's target policy."""
        actions = [agent.target_act(state) for agent, state in zip(self.agents, states)]
        return actions

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        state = np.asanyarray(state)
        action = np.asanyarray(action)
        reward = np.asanyarray(reward)
        next_state = np.asanyarray(next_state)
        done = np.asanyarray(done)
        self.memory.add(state.reshape((1, self.num_agents, -1)),
                        action.reshape((1, self.num_agents, -1)),
                        reward.reshape((1, self.num_agents, -1)),
                        next_state.reshape((1, self.num_agents, -1)),
                        done.reshape((1, self.num_agents, -1)))

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            for ai in range(self.num_agents):
                experiences = self.memory.sample()
                self.learn(experiences, ai, GAMMA)

    def reset(self):
        [agent.reset() for agent in self.agents]

    def learn(self, experiences, ai, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            ai (int): index of the agent being updated
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        agent = self.agents[ai]

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        next_states = next_states.view(1, BATCH_SIZE, self.num_agents, -1)
        actions_next = self.target_act(next_states)
        actions_next = torch.cat(actions_next, dim=1)
        next_states = next_states.view(BATCH_SIZE, -1)
        actions_next = actions_next.view(BATCH_SIZE, -1)
        Q_targets_next = agent.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards[:, ai] + (gamma * Q_targets_next * (1 - dones[:, ai]))
        # Compute critic loss (mean squared error)
        Q_expected = agent.critic_local(states.view(BATCH_SIZE, -1),
                                        actions.view(BATCH_SIZE, -1))
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss.  zero_grad() clears gradients left over from other
        # batches so they do not accumulate, backward() computes derivatives for
        # every parameter with requires_grad=True, and step() applies the update.
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        agent.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss: predict actions for the current states and
        # maximize the critic's Q-value for that prediction (hence the minus sign)
        actions_pred = agent.actor_local(states)
        actor_loss = -agent.critic_local(states.view(BATCH_SIZE, -1),
                                         actions_pred.view(BATCH_SIZE, -1)).mean()
        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(agent.critic_local, agent.critic_target, TAU)
        self.soft_update(agent.actor_local, agent.actor_target, TAU)

        # Update the noise decay parameter
        if self.eps >= EPS_FINAL:
            self.eps -= self.eps_decay
            self.eps = max(self.eps, EPS_FINAL)

        agent.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
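# ---------------------------------------------------------------------------
# NOTE: not part of the original listing.  For context, a training loop driving
# the MADDPG class above might look roughly like the following.  The two-agent
# environment API (env.reset() returning one state per agent, env.step(actions)
# returning per-agent rewards and dones), the sizes (24, 2), and the episode
# limits are illustrative assumptions, not the original project's code.
# ---------------------------------------------------------------------------
import numpy as np

n_episodes = 2000   # assumed
max_t = 1000        # assumed

maddpg = MADDPG(state_size=24, action_size=2, num_agents=2, random_seed=0)
scores = []

for i_episode in range(1, n_episodes + 1):
    states = env.reset()                   # `env` is an assumed two-agent environment
    maddpg.reset()                         # reset each agent's noise process
    episode_scores = np.zeros(2)

    for t in range(max_t):
        actions = maddpg.act(states)       # one action per agent
        next_states, rewards, dones, _ = env.step(actions)
        maddpg.step(states, actions, rewards, next_states, dones)
        states = next_states
        episode_scores += rewards
        if np.any(dones):
            break

    scores.append(np.max(episode_scores))  # e.g. track the better agent's score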
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, model='DQN',
                 buffer_size=int(1e5), batch_size=64, gamma=0.99, tau=1e-3,
                 lr=5e-4, update_every=4, pretrained_model_file=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            model (str): currently supports DQN and DDQN
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr (float): learning rate
            update_every (int): how often to update the network
            pretrained_model_file (str): filepath to .pth file with pretrained model weights
        """
        if model not in ('DQN', 'DDQN'):
            raise ValueError('Current model supports DQN or DDQN')

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.model = model

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        if pretrained_model_file:
            weights = torch.load(pretrained_model_file)
            self.qnetwork_local.load_state_dict(weights)
            self.qnetwork_target.load_state_dict(weights)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        # Initialize time step (for updating every update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        if self.model == 'DQN':
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        elif self.model == 'DDQN':
            argmax_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).gather(1, argmax_actions)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
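# ---------------------------------------------------------------------------
# NOTE: not part of the original listing.  The QNetwork referenced by the
# DQN/DDQN agent above is defined elsewhere; the class below is a minimal
# sketch of a fully connected Q-network compatible with the constructor call
# QNetwork(state_size, action_size, seed).  The hidden-layer sizes are
# illustrative assumptions.
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F


class QNetwork(nn.Module):
    """Simple fully connected network mapping states to per-action values."""

    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)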
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Score tracker and learning parameters
        self.score = 0
        self.best_score = -np.inf

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.1
        self.exploration_theta = 0.3
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 200
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.total_reward += reward
        self.count += 1
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
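# ---------------------------------------------------------------------------
# NOTE: not part of the original listing.  The Keras-based DDPG agents here use
# a ReplayBuffer with the constructor signature (buffer_size, batch_size) whose
# sample() returns a list of namedtuple experiences (matching how learn()
# iterates over them).  The class below is a minimal sketch of such a buffer;
# it is illustrative, not the project's actual implementation.
# ---------------------------------------------------------------------------
import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)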
class DDPG_agent():
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize the agent."""
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random_seed

        # Construct Actor networks
        self.actor_local = Actor(self.state_size, self.action_size, self.seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Construct Critic networks
        self.critic_local = Critic(self.state_size, self.action_size, self.seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, self.seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        # Convert state from a numpy array to a torch tensor
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward for each agent
        for i in range(state.shape[0]):
            self.memory.add(state[i, :], action[i], reward[i], next_state[i, :], done[i])

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def reset(self):
        """Reset the noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
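# ---------------------------------------------------------------------------
# NOTE: not part of the original listing.  DDPG_agent (like the other PyTorch
# agents above) relies on module-level constants and a `device` object defined
# elsewhere.  The block below shows the names it expects; the specific values
# are typical illustrative defaults, not the original project's configuration.
# ---------------------------------------------------------------------------
import torch

BUFFER_SIZE = int(1e6)   # replay buffer size (assumed)
BATCH_SIZE = 128         # minibatch size (assumed)
GAMMA = 0.99             # discount factor (assumed)
TAU = 1e-3               # soft-update interpolation factor (assumed)
LR_ACTOR = 1e-4          # actor learning rate (assumed)
LR_CRITIC = 1e-3         # critic learning rate (assumed)
WEIGHT_DECAY = 0         # L2 weight decay for the critic optimizer (assumed)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")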
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_w = None
        self.best_score = -np.inf
        self.score = -np.inf

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.total_reward += reward
        self.count += 1
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
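# ---------------------------------------------------------------------------
# NOTE: not part of the original listing.  A training loop for the task-based
# DDPG agents above might look roughly like the following.  `Task` (whatever
# environment wrapper provides state_size, action_size, action_low/high,
# reset(), and step()), num_episodes, and the score bookkeeping are
# illustrative assumptions.
# ---------------------------------------------------------------------------
import numpy as np

num_episodes = 1000  # assumed

task = Task()        # assumed environment wrapper
agent = DDPG(task)
best_score = -np.inf

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)   # assumed step() signature
        agent.step(action, reward, next_state, done)
        state = next_state
    best_score = max(best_score, agent.score)
    print("Episode {:4d}  score: {:7.3f}  best: {:7.3f}".format(
        i_episode, agent.score, best_score), end="\r")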
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        # Print debug statements
        self.debug = False

        # Task (environment) information
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Actor (policy) model
        self.actor_lr = 1e-4
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 learning_rate=self.actor_lr)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  learning_rate=self.actor_lr)

        # Critic (value) model
        self.critic_lr = 1e-4
        self.critic_local = Critic(self.state_size, self.action_size, learning_rate=self.critic_lr)
        self.critic_target = Critic(self.state_size, self.action_size, learning_rate=self.critic_lr)

        # Print Actor / Critic NN architectures
        if self.debug:
            self.actor_local.model.summary()
            self.critic_local.model.summary()

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 1.5e-1
        self.exploration_sigma = 2.0e-2
        self.noise = OUNoise(self.action_size, mu=self.exploration_mu,
                             theta=self.exploration_theta, sigma=self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 128
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001   # for soft update of target parameters

        # Score tracker
        self.best_score = -np.inf
        self.total_reward = 0.0
        self.count = 0

        # Episode variables
        self.reset_episode()

    def reset_episode(self):
        score = self.total_reward / float(self.count) if self.count else -np.inf
        if score > self.best_score:
            self.best_score = score

        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1

        # Learn if enough samples are in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target networks
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)
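# ---------------------------------------------------------------------------
# NOTE: not part of the original listing.  The Keras-based DDPG agents above
# call critic.get_action_gradients (dQ/da) and actor.train_fn (apply those
# gradients to the policy), neither of which is shown.  The Actor sketch below
# illustrates the common standalone-Keras backend pattern for train_fn; the
# layer sizes, activations, and optimizer settings are assumptions, and the
# API shown targets the older standalone Keras (keras.backend) releases.
# ---------------------------------------------------------------------------
from keras import backend as K
from keras import layers, models, optimizers


class Actor:
    """Minimal sketch of a Keras Actor exposing the train_fn hook used above."""

    def __init__(self, state_size, action_size, action_low, action_high, learning_rate=1e-4):
        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_range = action_high - action_low
        self.build_model(learning_rate)

    def build_model(self, learning_rate):
        # Policy network: states -> actions scaled to the task's action range
        states = layers.Input(shape=(self.state_size,), name='states')
        net = layers.Dense(units=128, activation='relu')(states)
        net = layers.Dense(units=64, activation='relu')(net)
        raw_actions = layers.Dense(units=self.action_size, activation='sigmoid',
                                   name='raw_actions')(net)
        actions = layers.Lambda(lambda x: (x * self.action_range) + self.action_low,
                                name='actions')(raw_actions)
        self.model = models.Model(inputs=states, outputs=actions)

        # Loss is -Q(s, a), fed in via the critic's action gradients, so that
        # training ascends the critic's value estimate.
        action_gradients = layers.Input(shape=(self.action_size,))
        loss = K.mean(-action_gradients * actions)
        optimizer = optimizers.Adam(lr=learning_rate)
        updates_op = optimizer.get_updates(params=self.model.trainable_weights, loss=loss)
        self.train_fn = K.function(
            inputs=[self.model.input, action_gradients, K.learning_phase()],
            outputs=[],
            updates=updates_op)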