class MADDPG:
    """Multi-agent DDPG wrapper that trains one DDPG agent per player."""

    def __init__(self, state_size, action_size, num_agents, random_seed):
        self.agents = [
            DDPG(state_size, action_size, num_agents, random_seed),
            DDPG(state_size, action_size, num_agents, random_seed)
        ]
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.eps = EPS_START
        self.eps_decay = 1 / EPS_EP_END  # set decay rate based on epsilon end target

    def act(self, states, add_noise=True):
        """Returns actions for the given states as per each agent's current policy."""
        actions = [
            agent.act(state, add_noise)
            for agent, state in zip(self.agents, states)
        ]
        return actions

    def target_act(self, states):
        """Returns actions for the given states as per each agent's target policy."""
        actions = [
            agent.target_act(state)
            for agent, state in zip(self.agents, states)
        ]
        return actions

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        state = np.asanyarray(state)
        action = np.asanyarray(action)
        reward = np.asanyarray(reward)
        next_state = np.asanyarray(next_state)
        done = np.asanyarray(done)
        self.memory.add(state.reshape((1, self.num_agents, -1)),
                        action.reshape((1, self.num_agents, -1)),
                        reward.reshape((1, self.num_agents, -1)),
                        next_state.reshape((1, self.num_agents, -1)),
                        done.reshape((1, self.num_agents, -1)))

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            for ai in range(self.num_agents):
                experiences = self.memory.sample()
                self.learn(experiences, ai, GAMMA)

    def reset(self):
        [agent.reset() for agent in self.agents]

    def learn(self, experiences, ai, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            ai (int): index of the agent being updated
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        agent = self.agents[ai]

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        next_states = next_states.view(1, BATCH_SIZE, self.num_agents, -1)
        actions_next = self.target_act(next_states)
        actions_next = torch.cat(actions_next, dim=1)
        next_states = next_states.view(BATCH_SIZE, -1)
        actions_next = actions_next.view(BATCH_SIZE, -1)
        Q_targets_next = agent.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards[:, ai] + (gamma * Q_targets_next * (1 - dones[:, ai]))
        # Compute critic loss (mean squared error)
        Q_expected = agent.critic_local(states.view(BATCH_SIZE, -1),
                                        actions.view(BATCH_SIZE, -1))
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        # zero_grad because we do not want to accumulate gradients
        # from other batches, so they need to be cleared first
        agent.critic_optimizer.zero_grad()
        # compute derivatives for all variables that have requires_grad=True
        critic_loss.backward()
        # update those variables that have requires_grad=True
        agent.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss:
        # take the current states, predict actions, and
        # maximize the Q value of that prediction (hence the -1 factor)
        actions_pred = agent.actor_local(states)
        # actions_pred = torch.cat(actions_pred, dim=1)
        actor_loss = -agent.critic_local(states.view(BATCH_SIZE, -1),
                                         actions_pred.view(BATCH_SIZE, -1)).mean()
        # Minimize the loss (same zero_grad / backward / step pattern as the critic)
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(agent.critic_local, agent.critic_target, TAU)
        self.soft_update(agent.actor_local, agent.actor_target, TAU)

        # update noise decay parameter
        if self.eps >= EPS_FINAL:
            self.eps -= self.eps_decay
        self.eps = max(self.eps, EPS_FINAL)
        agent.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
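# Hedged usage sketch (not part of the original source): a minimal training loop for the
# MADDPG wrapper above. It assumes a hypothetical multi-agent environment handle `env`
# whose reset()/step() return one row of states/rewards/dones per agent, and that the
# module-level constants (BUFFER_SIZE, BATCH_SIZE, EPS_START, ...) and numpy-as-np are
# already defined, as they are for the classes in this file.
def train_maddpg(env, state_size, action_size, num_agents, n_episodes=2000, seed=0):
    maddpg = MADDPG(state_size, action_size, num_agents, random_seed=seed)
    scores = []
    for episode in range(1, n_episodes + 1):
        states = env.reset()                   # assumed shape: (num_agents, state_size)
        maddpg.reset()                         # reset each agent's exploration noise
        episode_scores = np.zeros(num_agents)
        while True:
            actions = maddpg.act(states)       # one action vector per agent
            next_states, rewards, dones = env.step(actions)
            maddpg.step(states, actions, rewards, next_states, dones)
            episode_scores += rewards
            states = next_states
            if np.any(dones):
                break
        scores.append(np.max(episode_scores))  # track the best agent's score per episode
    return scores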
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed, memory=None,
                 buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE, gamma=GAMMA,
                 tau=TAU, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC,
                 weight_decay=WEIGHT_DECAY, pretrained_actor_weights=None,
                 pretrained_critic_weights=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            memory (ReplayBuffer): optional replay buffer shared with other agents
            pretrained_actor_weights (str): optional path to pretrained actor weights
            pretrained_critic_weights (str): optional path to pretrained critic weights
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)

        # Optionally load pretrained weights into both local and target networks
        if pretrained_actor_weights:
            actor_weights = torch.load(pretrained_actor_weights)
            self.actor_local.load_state_dict(actor_weights)
            self.actor_target.load_state_dict(actor_weights)
        if pretrained_critic_weights:
            critic_weights = torch.load(pretrained_critic_weights)
            self.critic_local.load_state_dict(critic_weights)
            self.critic_target.load_state_dict(critic_weights)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory (optionally shared between agents)
        if memory:
            self.memory = memory
        else:
            self.memory = ReplayBuffer(action_size, self.buffer_size,
                                       self.batch_size, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device).unsqueeze(0)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
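# Hedged usage sketch (not part of the original source): the Agent above accepts an optional
# `memory` argument, so two agents can pool their experience in one replay buffer. The
# ReplayBuffer call mirrors the positional signature used in __init__ above; the state/action
# sizes and seed below are illustrative values, not taken from the original code.
shared_memory = ReplayBuffer(2, BUFFER_SIZE, BATCH_SIZE, 0)
agent_a = Agent(state_size=24, action_size=2, random_seed=0, memory=shared_memory)
agent_b = Agent(state_size=24, action_size=2, random_seed=0, memory=shared_memory)
# Each agent adds its own transitions via step(), but both sample from the shared buffer.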
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, model='DQN',
                 buffer_size=int(1e5), batch_size=64, gamma=0.99, tau=1e-3,
                 lr=5e-4, update_every=4, pretrained_model_file=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            model (str): currently supports DQN and DDQN
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr (float): learning rate
            update_every (int): how often to update the network
            pretrained_model_file (str): filepath to .pth file with pretrained model weights
        """
        if model not in ('DQN', 'DDQN'):
            raise ValueError('Current model supports DQN or DDQN')

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.model = model

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        if pretrained_model_file:
            weights = torch.load(pretrained_model_file)
            self.qnetwork_local.load_state_dict(weights)
            self.qnetwork_target.load_state_dict(weights)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        # Initialize time step (for updating every update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        if self.model == 'DQN':
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        if self.model == 'DDQN':
            # Double DQN: select next actions with the local network, evaluate them with the target network
            argmax_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).gather(1, argmax_actions)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
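# Hedged usage sketch (not part of the original source): a minimal epsilon-greedy training
# loop for the DQN/DDQN Agent above. It assumes a gym-style environment handle `env` with a
# discrete action space whose step() returns (next_state, reward, done, info); the epsilon
# schedule values are illustrative defaults, not taken from the original code.
def train_dqn(env, agent, n_episodes=2000, max_t=1000,
              eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    scores = []
    eps = eps_start
    for episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)  # decay exploration after each episode
    return scores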
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Score tracker and learning parameters
        self.score = 0
        self.best_score = -np.inf

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.1
        self.exploration_theta = 0.3
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 200
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.total_reward += reward
        self.count += 1
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
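# Hedged usage sketch (not part of the original source): an episode loop for the task-based
# Keras DDPG agent above, assuming `task` exposes reset() and step(action) returning
# (next_state, reward, done), as implied by reset_episode() and step() in the class.
def run_episodes(task, num_episodes=500):
    agent = DDPG(task)
    for episode in range(1, num_episodes + 1):
        state = agent.reset_episode()
        while True:
            action = agent.act(state)
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)
            state = next_state
            if done:
                break
    return agent.best_score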
class DDPG_agent():

    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize the agent."""
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random_seed

        # Construct Actor networks
        self.actor_local = Actor(self.state_size, self.action_size, self.seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Construct Critic networks
        self.critic_local = Critic(self.state_size, self.action_size, self.seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, self.seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        # convert state from numpy to a pytorch tensor
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward for each agent
        for i in range(state.shape[0]):
            self.memory.add(state[i, :], action[i], reward[i], next_state[i, :], done[i])

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def reset(self):
        """Reset the noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute and minimize the critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute and minimize the actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_w = None
        self.best_score = -np.inf
        self.score = -np.inf

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.total_reward += reward
        self.count += 1
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        # Print debug statements
        self.debug = False

        # Task (environment) information
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Actor (policy) model
        self.actor_lr = 1e-4
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 learning_rate=self.actor_lr)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  learning_rate=self.actor_lr)

        # Critic (value) model
        self.critic_lr = 1e-4
        self.critic_local = Critic(self.state_size, self.action_size,
                                   learning_rate=self.critic_lr)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    learning_rate=self.critic_lr)

        # Print Actor / Critic NN architectures
        if self.debug:
            self.actor_local.model.summary()
            self.critic_local.model.summary()

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 1.5e-1
        self.exploration_sigma = 2.0e-2
        self.noise = OUNoise(self.action_size, mu=self.exploration_mu,
                             theta=self.exploration_theta,
                             sigma=self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 128
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001   # for soft update of target parameters

        # Score tracker
        self.best_score = -np.inf
        self.total_reward = 0.0
        self.count = 0

        # Episode variables
        self.reset_episode()

    def reset_episode(self):
        score = self.total_reward / float(self.count) if self.count else -np.inf
        if score > self.best_score:
            self.best_score = score

        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1

        # Learn if enough samples are in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # Add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next state actions and Q values from target networks
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)