class Agent(object): """Implements an agent that follows DDPG algorithm.""" def __init__(self, state_shape, num_actions, action_scale=2.0, discount=0.99, tau=0.01, actor_lrate=0.001, critic_lrate=0.01, l2_decay=1e-3, batch_size=64, q_update_iter=1, capacity=1000000): if not isinstance(state_shape, tuple): raise AssertionError('state_shape must be of type <tuple>.') elif len(state_shape) == 0: raise AssertionError('No state space dimensions provided.') elif num_actions == 0: raise ValueError('Number of actions must be > 0.') elif capacity < batch_size: raise ValueError('Replay capacity must be > batch_size.') self.batch_size = batch_size self.q_update_iter = q_update_iter self.replay_buffer = ReplayBuffer(capacity, state_shape, num_actions) self.actor = Actor(state_shape, num_actions, action_scale, actor_lrate, tau) self.critic = Critic(state_shape, num_actions, discount, critic_lrate, tau, l2_decay) self.step = 0 def choose_action(self, state): """Returns an action for the agent to perform in the environment.""" return self.actor.predict(state).flatten() def update_buffer(self, s0, a, r, s1, terminal): """Updates memory replay buffer with new experience.""" self.replay_buffer.update(s0, a, r, s1, terminal) def update_policy(self): """Updates Q-networks using replay memory data + performing SGD""" mb = self.replay_buffer.sample(self.batch_size) # To update the critic, we need a prediction from target policy target_a = self.actor.predict_target(mb[3]) self.critic.train_fn(mb[0], mb[1], mb[3], target_a, mb[2], mb[4]) # Updating the actor requires gradients from critic action = self.actor.predict(mb[0]) grads = self.critic.get_action_grads(mb[0], action) self.actor.train_fn(mb[0], grads) # Every few steps in an episode we update target network weights if self.step == self.q_update_iter: self.actor.update_target() self.critic.update_target() self.step = self.step + 1 if self.step != self.q_update_iter else 0
class Agent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, random_seed, device, lr_actor, lr_critic, weight_decay_critic, batch_size, buffer_size, gamma, tau, update_every, n_updates, eps_start, eps_end, eps_decay): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) self.t_step = 0 self.device = device self.lr_actor = lr_actor self.lr_critic = lr_critic self.weight_decay_critic = weight_decay_critic self.batch_size = batch_size self.buffer_size = buffer_size self.gamma = gamma self.tau = tau self.update_every = update_every self.n_updates = n_updates self.eps = eps_start self.eps_end = eps_end self.eps_decay = eps_decay # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(self.device) self.actor_target = Actor(state_size, action_size, random_seed).to(self.device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(self.device) self.critic_target = Critic(state_size, action_size, random_seed).to(self.device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic) # Noise process self.noise = OUNoise((num_agents, action_size), random_seed) # Replay memory self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, random_seed, self.device) def step(self, state, action, reward, next_state, done, agent_number): """Save experience in replay memory, and use random sample from buffer to learn.""" self.t_step += 1 # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory and at interval settings if len(self.memory) > self.batch_size: if self.t_step % self.update_every == 0: for _ in range(self.n_updates): experiences = self.memory.sample() self.learn(experiences, self.gamma, agent_number) def act(self, states, add_noise): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(self.device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for agent_num, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_num, :] = action self.actor_local.train() if add_noise: actions += self.eps * self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma, agent_number): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) if agent_number == 0: actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1) else: actions_next = torch.cat((actions[:, :2], actions_next), dim=1) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) if agent_number == 0: actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1) else: actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) # Update epsilon noise value self.eps = max(self.eps_end, self.eps_decay*self.eps) # self.eps = self.eps - (1/self.eps_decay) # if self.eps < self.eps_end: # self.eps = self.eps_end def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
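# The OUNoise class constructed above as OUNoise((num_agents, action_size),
# random_seed) is not shown. This is a standard Ornstein-Uhlenbeck sketch with
# assumed defaults (mu=0.0, theta=0.15, sigma=0.2); the original may differ.
import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to the mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + \
            self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state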
def train(env, estimator, target_network, num_episodes=1000,
          replay_memory_size=500000, frame_history_len=4, save_every=10,
          update_every=1000, discount=0.99, epsilon_start=1.0, epsilon_end=0.1,
          epsilon_decay_steps=50000, batch_size=32, record_every=50):
    """
    Deep Q-learning algorithm.

    :param env: openAI gym environment
    :param estimator: estimator model for predicting values
    :param target_network: target network; periodically synced from the estimator
    :param num_episodes: number of episodes to run
    :param replay_memory_size: size of replay memory
    :param frame_history_len: number of frames stacked per observation
    :param save_every: save a checkpoint every N episodes
    :param update_every: copy params from estimator into target estimator after this many steps
    :param discount: discount factor
    :param epsilon_start: starting epsilon value
    :param epsilon_end: ending epsilon value
    :param epsilon_decay_steps: number of steps over which epsilon is annealed
    :param batch_size: size of sampled training mini-batches
    :param record_every: record a video every N episodes
    :return:
    """
    # Load previous state here
    replay_memory = ReplayBuffer(replay_memory_size, frame_history_len)

    # epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    loss_func = nn.SmoothL1Loss()
    optimizer = torch.optim.Adam(estimator.parameters())

    policy = make_epsilon_greedy_policy(estimator, len(VALID_ACTIONS))

    env = Monitor(env, directory="./monitor", resume=True,
                  video_callable=lambda count: count % record_every == 0)

    total_t = 0
    pbar = tqdm(range(num_episodes))
    pbar.set_description("ep: %d, er: %.2f, et: %d, tt: %d, exp_size: %d"
                         % (0, 0.0, 0, 0, 0))

    for ep in pbar:
        state = env.reset()           # 210 x 160 x 4
        state = process_state(state)  # 94 x 94 x 3

        episode_loss = 0
        episode_reward = 0
        episode_t = 0

        for t in itertools.count():
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            last_idx = replay_memory.store_frame(state)
            recent_observations = replay_memory.encode_recent_observation()

            action_dist = policy(recent_observations, epsilon)
            action_dist = action_dist.squeeze(0).numpy()
            action = np.random.choice(np.arange(len(action_dist)), p=action_dist)

            next_state, reward, done, _ = env.step(action)
            reward = max(-1.0, min(reward, 1.0))  # clip reward to [-1, 1]
            episode_reward += reward

            replay_memory.store_effect(last_idx, action, reward, done)
            next_state = process_state(next_state)
            state = next_state

            if replay_memory.can_sample(batch_size):
                obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = \
                    replay_memory.sample(batch_size)

                # Frames are stored as uint8: normalize observations, not actions
                obs_batch = torch.from_numpy(obs_batch).float().to(device) / 255.0
                act_batch = torch.from_numpy(act_batch).long().to(device)
                rew_batch = torch.from_numpy(rew_batch).to(device)
                next_obs_batch = torch.from_numpy(next_obs_batch).float().to(device) / 255.0
                not_done_mask = torch.from_numpy(1 - done_mask).float().to(device)

                state_values = estimator(obs_batch)  # b x VALID_ACTIONS
                state_action_values = torch.gather(state_values, 1,
                                                   act_batch.unsqueeze(1))  # b x 1

                next_state_values_max = target_network(next_obs_batch).detach().max(dim=1)[0]
                next_state_values = not_done_mask * next_state_values_max
                expected_q_value = (next_state_values * discount) + rew_batch

                # bellman_error = expected_q_value - state_action_values.squeeze(1)
                # clipped_bellman_error = bellman_error.clamp(-1, 1)
                # d_error = clipped_bellman_error * -1.0
                loss = loss_func(state_action_values, expected_q_value.unsqueeze(1))
                episode_loss += loss.item()

                # state_action_values.backward(d_error.data.unsqueeze(1))
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if done:
                break

            total_t += 1
            episode_t = t

            pbar.set_description(
                "ep: %d, el: %.5f, er: %.2f, et: %d, tt: %d, exp_size: %d"
                % (ep, episode_loss, episode_reward, episode_t, total_t,
                   replay_memory.num_in_buffer))

            if total_t % update_every == 0:
                copy_model_params(estimator, target_network)

        # save checkpoint
        if ep % save_every == 0:
            torch.save(estimator.state_dict(), './checkpoints/checkpoint.pt')

    env.close()
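# make_epsilon_greedy_policy() and copy_model_params() are referenced above but
# not shown. The sketches below are assumptions based on how they are called:
# the policy returns a (1, nA) tensor of action probabilities given an encoded
# observation and an epsilon, and the copy helper hard-syncs the target network.
# The exact preprocessing (the /255.0 normalization and `device` placement)
# must match whatever the estimator expects; it is assumed here.
import torch


def make_epsilon_greedy_policy(estimator, nA):
    def policy_fn(observation, epsilon):
        with torch.no_grad():
            obs = torch.from_numpy(observation).float().unsqueeze(0).to(device) / 255.0
            q_values = estimator(obs).cpu()
        # Epsilon mass spread uniformly, remainder on the greedy action
        probs = torch.ones(1, nA) * epsilon / nA
        best_action = q_values.argmax(dim=1)
        probs[0, best_action] += 1.0 - epsilon
        return probs
    return policy_fn


def copy_model_params(source, target):
    """Hard-copy parameters from the online network into the target network."""
    target.load_state_dict(source.state_dict())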
def challenger_round():
    challengers = []
    leaders = []
    leader_checkpoints = os.listdir(LEADER_DIR)
    # Need to share the same schedule with all challengers, so they all anneal
    # at same rate
    epsilon_schedule = LinearSchedule(EPS_START, EPS_END, TRAIN_FRAMES)
    for i in xrange(NUM_LEADERS):
        challenger = try_gpu(
            DQNAgent(6, epsilon_schedule, OBSERVATION_MODE,
                     lr=LR, max_grad_norm=GRAD_CLIP_NORM))
        if i < len(leader_checkpoints):
            leader = try_gpu(
                DQNAgent(6, LinearSchedule(0.1, 0.1, 500000), OBSERVATION_MODE))
            leader_path = os.path.join(LEADER_DIR, leader_checkpoints[i])
            print "LOADING CHECKPOINT: {}".format(leader_path)
            challenger.load_state_dict(
                torch.load(leader_path, map_location=lambda storage, loc: storage))
            leader.load_state_dict(
                torch.load(leader_path, map_location=lambda storage, loc: storage))
        else:
            leader = RandomAgent(6)
            print "INITIALIZING NEW CHALLENGER AND LEADER"
        challengers.append(challenger)
        leaders.append(leader)

    if CHALLENGER_DIR is not None:
        challengers = []
        # Load in all of the leaders
        for checkpoint in os.listdir(CHALLENGER_DIR):
            path = os.path.join(CHALLENGER_DIR, checkpoint)
            print "LOADING FROM CHALLENGER_DIR: {}".format(path)
            challenger = try_gpu(
                DQNAgent(6, LinearSchedule(0.05, 0.05, 1),
                         CHALLENGER_OBSERVATION_MODE, lr=LR,
                         max_grad_norm=GRAD_CLIP_NORM, name=checkpoint))
            challenger.load_state_dict(
                torch.load(path, map_location=lambda storage, loc: storage))
            challengers.append(challenger)

    challenger = EnsembleDQNAgent(challengers)
    leader = EnsembleDQNAgent(leaders)
    if OPPONENT is not None or HUMAN:
        leader = NoOpAgent()
    replay_buffer = ReplayBuffer(1000000)
    rewards = collections.deque(maxlen=1000)
    frames = 0    # number of training frames seen
    episodes = 0  # number of training episodes that have been played

    with tqdm(total=TRAIN_FRAMES) as progress:
        # Each loop completes a single episode
        while frames < TRAIN_FRAMES:
            states = env.reset()
            challenger.reset()
            leader.reset()
            episode_reward = 0.
            episode_frames = 0
            # Each loop completes a single step, duplicates _evaluate() to
            # update at the appropriate frame #s
            for _ in xrange(MAX_EPISODE_LENGTH):
                frames += 1
                episode_frames += 1
                action1 = challenger.act(states[0])
                action2 = leader.act(states[1])
                next_states, reward, done = env.step(action1, action2)
                episode_reward += reward

                # NOTE: state and next_state are LazyFrames and must be
                # converted to np.arrays
                replay_buffer.add(
                    Experience(states[0], action1._action_index, reward,
                               next_states[0], done))
                states = next_states

                if len(replay_buffer) > 50000 and frames % 4 == 0:
                    experiences = replay_buffer.sample(32)
                    challenger.update_from_experiences(experiences)

                if frames % 10000 == 0:
                    challenger.sync_target()

                if frames % SAVE_FREQ == 0:
                    # TODO: Don't access internals
                    for agent in challenger._agents:
                        path = os.path.join(
                            LEADER_DIR, agent.name + "-{}".format(frames))
                        print "SAVING CHECKPOINT TO: {}".format(path)
                        torch.save(agent.state_dict(), path)
                    #path = os.path.join(
                    #    LEADER_DIR, challenger.name + "-{}".format(frames))
                    #torch.save(challenger.state_dict(), path)

                if frames >= TRAIN_FRAMES:
                    break

                if done:
                    break

            if episodes % 300 == 0:
                print "Evaluation: {}".format(
                    evaluate(challenger, leader, EPISODES_EVALUATE_TRAIN))
                print "Episode reward: {}".format(episode_reward)

            episodes += 1
            rewards.append(episode_reward)
            stats = challenger.stats
            stats["Avg Episode Reward"] = float(sum(rewards)) / len(rewards)
            stats["Num Episodes"] = episodes
            stats["Replay Buffer Size"] = len(replay_buffer)
            progress.set_postfix(stats, refresh=False)
            progress.update(episode_frames)
            episode_frames = 0
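# LinearSchedule, used above to anneal epsilon from EPS_START to EPS_END over
# TRAIN_FRAMES steps, is not shown. A minimal sketch follows; the interface
# (a value(step) accessor) is an assumption and the original class may expose
# something different to DQNAgent.
class LinearSchedule(object):
    def __init__(self, start, end, total_steps):
        self.start = start
        self.end = end
        self.total_steps = total_steps

    def value(self, step):
        """Linearly interpolate from start to end, then hold at end."""
        fraction = min(float(step) / max(1, self.total_steps), 1.0)
        return self.start + fraction * (self.end - self.start)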
class DDPG:
    def __init__(self, config):
        self.config = config
        self.state_size = config.state_size
        self.action_size = config.action_size

        self.actor_local = Actor(self.state_size, self.action_size, 2).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, 2).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config.LR_ACTOR)

        self.critic_local = Critic(self.state_size, self.action_size, 2).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, 2).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=config.LR_CRITIC,
        )

        self.memory = ReplayBuffer(config.random_seed, config.BUFFER_SIZE)
        self.noise = OUNoise(self.action_size, config.random_seed)
        self.t_step = 0

        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)

    def step(self, states, actions, rewards, next_states, dones):
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % self.config.UPDATE_EVERY
        if len(self.memory) > self.config.BATCH_SIZE and (self.t_step == 0):
            for i in range(self.config.EPOCH):
                experiences = self.memory.sample(self.config.BATCH_SIZE)
                self.learn(experiences)

    def reset(self):
        self.noise.reset()

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        Q_targets_next = self.critic_target(next_states, self.actor_target(next_states))
        Q_targets = rewards + (self.config.GAMMA * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        actor_loss = -self.critic_local(states, self.actor_local(states)).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.config.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.config.TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
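# The DDPG class above pulls every hyperparameter from a config object. A
# minimal compatible config is sketched below; the attribute names are taken
# from how the class reads them, but the values (and the use of
# SimpleNamespace) are illustrative assumptions, not the original setup.
from types import SimpleNamespace

config = SimpleNamespace(
    state_size=33,        # assumed environment dimensions
    action_size=4,
    random_seed=0,
    LR_ACTOR=1e-4,
    LR_CRITIC=1e-3,
    BUFFER_SIZE=int(1e6),
    BATCH_SIZE=128,
    UPDATE_EVERY=20,      # learn every N calls to step()
    EPOCH=10,             # gradient updates per learning phase
    GAMMA=0.99,
    TAU=1e-3,
)

agent = DDPG(config)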
class Agent(object): """DDPG Agent that interacts and learns from the environment.""" def __init__(self, state_size, action_size, device, actor_args={}, critic_args={}): """Initializes the DQN agent. Args: state_size (int): Dimension of each state action_size (int): Dimension of each action device (torch.device): Device to use for calculations actor_args (dict): Arguments describing the actor network critic_args (dict): Arguments describing the critic network """ self.state_size = state_size """Dimension of each state""" self.action_size = action_size """Dimension of each action""" self.device = device """Device to use for calculations""" self.t_step = 0 """Timestep between training updates""" # Parameters # Actor network self.actor_local = Actor(state_size, action_size, **actor_args).to(device) self.actor_target = Actor(state_size, action_size, **actor_args).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic network self.critic_local = Critic(state_size, action_size, **critic_args).to(device) self.critic_target = Critic(state_size, action_size, **critic_args).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process for exploration self.noise = OUNoise(action_size, sigma=NOISE_SD) # Replay memory self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.device) def reset(self): """Reset state of agent.""" self.noise.reset() def save_weights(self, path): """Save local network weights. Args: path (string): File to save to""" torch.save( { 'actor_local': self.actor_local.state_dict(), 'actor_target': self.actor_target.state_dict(), 'critic_local': self.critic_local.state_dict(), 'critic_target': self.critic_target.state_dict() }, path) def load_weights(self, path): """Load local network weights. Args: path (string): File to load weights from""" checkpoint = torch.load(path) self.actor_local.load_state_dict(checkpoint['actor_local']) self.actor_target.load_state_dict(checkpoint['actor_target']) self.critic_local.load_state_dict(checkpoint['critic_local']) self.critic_target.load_state_dict(checkpoint['critic_target']) def act(self, state, add_noise=True): """Returns action for given state according to the current policy Args: state (np.ndarray): Current state Returns: action (np.ndarray): Action tuple """ state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) # Temporarily set evaluation mode (no dropout &c) & turn off autograd self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().detach().numpy() # Resume training mode self.actor_local.train() # Add noise if exploring if add_noise: action += self.noise.sample() # The noise might take us out of range action = np.clip(action, -1, 1) return action def step(self, state, action, reward, next_state, done): """Save experience and learn if due. 
Args: state (Tensor): Current state action (int): Chosen action reward (float): Resulting reward next_state (Tensor): State after action done (bool): True if terminal state """ self.memory.add(state, action, reward, next_state, done) # Learn as soon as we have enough stored experiences self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0 and len(self.memory) > BATCH_SIZE: for _ in range(NUM_UPDATES): experiences = self.memory.sample() self.learn(experiences) def learn(self, experiences): """Learn from batch of experiences.""" states, actions, rewards, next_states, dones = experiences # region Update Critic actions_next = self.actor_target(next_states) q_targets_next = self.critic_target(next_states, actions_next) q_targets = rewards + (GAMMA * q_targets_next * (1 - dones)) q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(q_expected, q_targets) # Minimize loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1.0) self.critic_optimizer.step() # endregion # region Update Actor actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # endregion # Update target networks soft_update(self.critic_local, self.critic_target, TAU) soft_update(self.actor_local, self.actor_target, TAU)
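# Unlike the other agents in this section, the one above calls soft_update()
# as a module-level helper rather than a method. The sketch below is consistent
# with the call sites soft_update(local, target, TAU); the original helper may
# live in a separate utilities module.
def soft_update(local_model, target_model, tau):
    """Blend target parameters towards local ones: θ_target ← τ·θ_local + (1-τ)·θ_target."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)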
class Agent(object): """DQN Agent that interacts and learns from the environment.""" def __init__(self, state_size, action_size, device, replay_buffer_size=int(1e5), batch_size=64, discount_factor=0.99, soft_update=1e-3, learning_rate=5e-4, update_every=4, **kwargs): """Initializes the DQN agent. Args: state_size (int): Dimension of each state action_size (int): Dimension of each action device (torch.device): Device to use for calculations replay_buffer_size (int): Size of replay buffer batch_size (int): Size of experience batches during training discount_factor (float): Discount factor (gamma) soft_update (float): Soft update coefficient (tau) learning_rate (float): Learning rate (alpha) update_every (int): Steps between updating the network **kwargs: Arguments describing the QNetwork """ self.state_size = state_size """Dimension of each state""" self.action_size = action_size """Dimension of each action""" self.device = device """Device to use for calculations""" # Parameters self.batch_size = batch_size """Size of experience batches during training""" self.discount_factor = discount_factor """Discount factor (gamma)""" self.soft_update = soft_update """Soft update coefficient (tau)""" self.update_every = update_every """Steps between updating the network""" # Q Networks self.target_network = QNetwork(state_size, action_size, **kwargs) \ .to(device) """Target Q-Network""" self.local_network = QNetwork(state_size, action_size, **kwargs) \ .to(device) """Local Q-Network""" self.optimizer = optim.Adam(self.local_network.parameters(), lr=learning_rate) """Optimizer used when training the Q-network.""" # Memory self.memory = ReplayBuffer(replay_buffer_size, batch_size, device) # Time step self.t_step = 0 """Current time step""" def save_weights(self, path): """Save local network weights. Args: path (string): File to save to""" self.local_network.save_weights(path) def load_weights(self, path): """Load local network weights. Args: path (string): File to load weights from""" self.local_network.load_weights(path) def act(self, state, eps=0.): """Returns action for given state according to the current policy Args: state (np.ndarray): Current state eps (float): Probability of selecting random action (epsilon) Returns: int: Epsilon-greedily selected action """ state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) # Temporarily set evaluation mode (no dropout &c) & turn off autograd self.local_network.eval() with torch.no_grad(): action_values = self.local_network(state) self.local_network.train() # Select action epsilon-greedily if random.random() > eps: return np.argmax(action_values.cpu().detach().numpy()) else: return random.choice(np.arange(self.action_size)) def step(self, state, action, reward, next_state, done): """Save experience and learn if due. 
Args: state (Tensor): Current state action (int): Chosen action reward (float): Resulting reward next_state (Tensor): State after action done (bool): True if terminal state """ self.memory.add(state, action, reward, next_state, done) # Learn if at update_every steps self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # Check that we have enough stored experiences if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) def learn(self, experiences): """Update Q-network using given experiences Args: experiences (Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: SARS'+done tuple """ states, actions, rewards, next_states, dones = experiences # Predicted Q values from target model for next states # (NB. torch.max returns tuple (max, argmax) q_target_next = self.target_network(next_states).max(dim=1, keepdim=True)[0] # Computed target Q values for current state q_target = rewards + self.discount_factor * q_target_next * (1 - dones) # Predicted Q values from local model for current state q_local = self.local_network(states).gather(dim=1, index=actions) loss = F.mse_loss(q_local, q_target) # Update local network weights self.optimizer.zero_grad() loss.backward() self.optimizer.step() # Update target network soft_update(self.local_network, self.target_network, self.soft_update)
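# A typical driver loop for the DQN agent above against a Gym-style environment
# (old 4-tuple step API). The environment handle, episode counts, and epsilon
# schedule below are illustrative placeholders, not taken from the original code.
def train_dqn(env, agent, n_episodes=2000, max_t=1000,
              eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    scores = []
    eps = eps_start
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)  # decay exploration per episode
    return scores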
# Add to buffer.
instruction_data_cuda = [
    torch.tensor(t, dtype=torch.float, device=device)
    for t in instruction_data
]
replay_buffer.append(instruction_data_cuda)

# Check for minimum replay size.
if len(replay_buffer) < REPLAY_MIN:
    print('Waiting for minimum buffer size ... {}/{}'.format(
        len(replay_buffer), REPLAY_MIN))
    continue

# Sample training mini-batch.
sampled_evaluations = replay_buffer.sample(REPLAY_SAMPLE_SIZE)
sampled_contexts = torch.stack([t[0] for t in sampled_evaluations])
sampled_states = torch.stack([t[1] for t in sampled_evaluations])
sampled_params = torch.stack([t[2] for t in sampled_evaluations])
sampled_values = torch.stack([t[3] for t in sampled_evaluations])

# Update critic.
critic_loss = torch.distributions.Normal(
    *critic_model(sampled_contexts, sampled_states, sampled_params)) \
    .log_prob(sampled_values).mean(dim=-1)
critic_model_optimizer.zero_grad()
gen_model_optimizer.zero_grad()
(-critic_loss).backward()
torch.nn.utils.clip_grad_norm_(critic_model.parameters(), 1.0)
critic_model_optimizer.step()

# Update params model.
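# The replay_buffer used above supports append(), len(), and sample(n). A plain
# deque lacks sample(), so a small wrapper like the one below is assumed; the
# original implementation (and its sampling semantics) may differ.
import random
from collections import deque


class SimpleReplayBuffer:
    def __init__(self, capacity):
        self._storage = deque(maxlen=capacity)

    def append(self, item):
        self._storage.append(item)

    def __len__(self):
        return len(self._storage)

    def sample(self, n):
        """Uniformly sample n stored items (with replacement if n > len)."""
        items = list(self._storage)
        if n <= len(items):
            return random.sample(items, n)
        return [random.choice(items) for _ in range(n)]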
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 #0 self.exploration_theta = 0.15 #0.15 self.exploration_sigma = 0.2 #0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.01 # for soft update of target parameters # Score tracker and learning parameters self.best_score = -np.inf def reset_episode(self): self.total_reward = 0.0 self.count = 0 self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Save reward self.total_reward += reward self.count += 1 # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state if done: # Keeping track of the score self.score = self.total_reward / float( self.count) if self.count else 0.0 if self.score > self.best_score: self.best_score = self.score def act(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
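# learn() above depends on critic_local.get_action_gradients and
# actor_local.train_fn, which are usually built with the old standalone Keras
# backend API inside the Actor/Critic classes. The helpers below sketch that
# common pattern under the assumption of Keras 2.x; the function names,
# arguments, and wiring are illustrative, not the original code.
from keras import backend as K
from keras import layers, optimizers


def build_action_gradients_fn(critic_model, q_values):
    """Return a backend function computing dQ/da, matching the call
    critic_local.get_action_gradients([states, actions, learning_phase])."""
    # critic_model.input is assumed to be [states_input, actions_input]
    action_gradients = K.gradients(q_values, critic_model.input[1])
    return K.function(
        inputs=[*critic_model.input, K.learning_phase()],
        outputs=action_gradients)


def build_actor_train_fn(actor_model, actions_output, action_size, lr=1e-4):
    """Return a backend function that ascends the critic's Q estimate, matching
    actor_local.train_fn([states, action_gradients, learning_phase])."""
    action_gradients = layers.Input(shape=(action_size,))
    # Ascend dQ/da by descending its negative
    loss = K.mean(-action_gradients * actions_output)
    optimizer = optimizers.Adam(lr=lr)
    updates_op = optimizer.get_updates(
        params=actor_model.trainable_weights, loss=loss)
    return K.function(
        inputs=[actor_model.input, action_gradients, K.learning_phase()],
        outputs=[],
        updates=updates_op)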