def dqnTrain(self, double=True):
    step = 0
    memory = ReplayMemory(self.MEMORY_CAPACITY_N)
    eval_Qnetwork = QNetwork(self.env.action_space.n, self.LEARNING_RATE)
    target_Qnetwork = QNetwork(self.env.action_space.n, self.LEARNING_RATE)
    eval_Qnetwork.set_weights(self.Qnetwork.get_weights())
    target_Qnetwork.set_weights(eval_Qnetwork.get_weights())
    reward_list = self.reward_list
    time_start = time.time()
    for episode in range(1, self.episode_M + 1):
        episode_reward = 0
        state = self.env.reset()
        while True:
            step += 1
            action = self.selectAction(eval_Qnetwork, state)
            next_state, reward, done, _ = self.env.step(action)
            episode_reward += reward
            memory.add((state, action, reward, next_state, done))
            state = next_state
            if len(memory) > self.BATCH_SIZE:
                sample_batch = memory.sample(self.BATCH_SIZE)
                self.updateQNetwork(eval_Qnetwork, target_Qnetwork, sample_batch, double)
            # self.EPS = self.EPS*self.EPS_DECAY if self.EPS > self.EPS_MIN else self.EPS_MIN
            eps_fraction = min(float(step) / self.schedule_timesteps, self.eps_init)
            self.eps = self.eps_init + eps_fraction * (self.eps_final - self.eps_init)
            if step % self.TARGET_UPDATE_C == 0:
                target_Qnetwork.set_weights(eval_Qnetwork.get_weights())
            if done:
                break
        reward_list.append(episode_reward)
        print("episode: {}, reward: {}, tot_step: {}, {}min. eps: {}".format(
            episode, episode_reward, step, (time.time() - time_start) / 60, self.eps))
        if episode % 5 == 0:
            print("episode {}. recent 5 episode_reward:{}. using {} min. total step: {}. ".format(
                episode, self.reward_list[-5:], (time.time() - time_start) / 60, step))
        if episode % 50 == 0:
            self.save(target_Qnetwork, reward_list)
    self.Qnetwork.set_weights(target_Qnetwork.get_weights())
    self.reward_list = reward_list
    return target_Qnetwork, reward_list
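
# updateQNetwork is not shown above; when double=True it presumably forms a Double DQN
# target: the online (eval) network selects the next action and the target network
# evaluates it. A minimal NumPy sketch of that target, with illustrative names:
import numpy as np

def double_dqn_targets(q_eval_next, q_target_next, rewards, dones, gamma):
    best_actions = np.argmax(q_eval_next, axis=1)       # action selection with the online net
    rows = np.arange(len(best_actions))
    next_values = q_target_next[rows, best_actions]     # action evaluation with the target net
    return rewards + gamma * next_values * (1.0 - dones)
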
class Agent(AgentConfig, EnvConfig):
    def __init__(self):
        self.env = gym.make(self.env_name)
        self.action_size = self.env.action_space.n  # 2 for cartpole
        self.memory = ReplayMemory(memory_size=self.memory_size,
                                   action_size=self.action_size,
                                   per=self.per)
        if self.train_cartpole:
            self.policy_network = MlpPolicy(action_size=self.action_size).to(device)
            self.target_network = MlpPolicy(action_size=self.action_size).to(device)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=self.learning_rate)
        self.loss = 0
        self.criterion = nn.MSELoss()

    def new_random_game(self):
        self.env.reset()
        action = self.env.action_space.sample()
        screen, reward, terminal, info = self.env.step(action)
        return screen, reward, action, terminal

    def train(self):
        episode = 0
        step = 0
        reward_history = []
        if not os.path.exists("./GIF/"):
            os.makedirs("./GIF/")

        # A new episode
        while step < self.max_step:
            start_step = step
            episode += 1
            episode_length = 0
            total_episode_reward = 0
            frames_for_gif = []
            self.gif = True if episode % self.gif_every == 0 else False

            # Get initial state
            state, reward, action, terminal = self.new_random_game()
            current_state = state
            # current_state = np.stack((state, state, state, state))

            # A step in an episode
            while episode_length < self.max_episode_length:
                step += 1
                episode_length += 1

                # Choose action
                action = random.randrange(self.action_size) if np.random.rand() < self.epsilon else \
                    torch.argmax(self.policy_network(torch.FloatTensor(current_state).to(device))).item()
                # print(current_state)
                # print(self.policy_network(torch.FloatTensor(current_state).to(device)))

                # Act
                state, reward, terminal, _ = self.env.step(action)
                new_state = state
                # new_state = np.concatenate((current_state[1:], [state]))
                reward = -1 if terminal else reward
                if self.gif:
                    frames_for_gif.append(new_state)

                self.memory.add(current_state, reward, action, terminal, new_state)
                current_state = new_state
                total_episode_reward += reward
                self.epsilon_decay()

                if step > self.start_learning and step % self.train_freq == 0:
                    self.minibatch_learning()

                if terminal:
                    last_episode_reward = total_episode_reward
                    last_episode_length = step - start_step
                    reward_history.append(last_episode_reward)
                    print('episode: %.2f, total step: %.2f, last_episode length: %.2f, last_episode_reward: %.2f, '
                          'loss: %.4f, eps = %.2f' % (episode, step, last_episode_length,
                                                      last_episode_reward, self.loss, self.epsilon))
                    self.env.reset()
                    if self.gif:
                        generate_gif(last_episode_length, frames_for_gif, total_episode_reward, "./GIF/", episode)
                    break

            if episode % self.reset_step == 0:
                self.target_network.load_state_dict(self.policy_network.state_dict())
            if episode % self.plot_every == 0:
                plot_graph(reward_history)
            # self.env.render()
        self.env.close()

    def minibatch_learning(self):
        state_batch, reward_batch, action_batch, terminal_batch, next_state_batch = self.memory.sample(self.batch_size)
        y_batch = torch.FloatTensor()
        for i in range(self.batch_size):
            if terminal_batch[i]:
                y_batch = torch.cat((y_batch, torch.FloatTensor([reward_batch[i]])), 0)
            else:
                next_state_q = torch.max(self.target_network(torch.FloatTensor(next_state_batch[i]).to(device)))
                y = torch.FloatTensor([reward_batch[i] + self.gamma * next_state_q])
                y_batch = torch.cat((y_batch, y), 0)
        current_state_q = torch.max(self.policy_network(torch.FloatTensor(state_batch).to(device)), dim=1)[0]
        self.loss = self.criterion(current_state_q, y_batch).mean()
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()

    def epsilon_decay(self):
        self.epsilon *= self.epsilon_decay_rate
        self.epsilon = max(self.epsilon, self.epsilon_minimum)
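
# The per-sample loop in minibatch_learning can be vectorized. Note it also takes the row
# maximum of the current Q-values; the standard DQN loss instead gathers Q(s, a) for the
# action actually taken. A hedged sketch of that vectorized, gather-based variant
# (names are illustrative; batches are assumed to be tensors on the same device):
import torch

def dqn_loss(policy_net, target_net, states, actions, rewards, next_states, terminals, gamma):
    with torch.no_grad():
        next_q = target_net(next_states).max(dim=1)[0]
        targets = rewards + gamma * next_q * (1.0 - terminals.float())
    current_q = policy_net(states).gather(1, actions.long().unsqueeze(1)).squeeze(1)
    return torch.nn.functional.mse_loss(current_q, targets)
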
class Agent:
    def __init__(self,
                 action_spec: dm_env.specs.DiscreteArray,
                 observation_spec: dm_env.specs.Array,
                 device: torch.device,
                 settings: dict) -> None:
        """
        Initializes the agent, constructs the qnet and the q_target, initializes the optimizer and ReplayMemory.

        Args:
            action_spec(dm_env.specs.DiscreteArray): description of the action space of the environment
            observation_spec(dm_env.specs.Array): description of observations from the environment
            device(str): "gpu" or "cpu"
            settings(dict): dictionary with settings
        """
        self.device = device
        action_size = action_spec.num_values
        state_size = np.prod(observation_spec.shape)
        self.action_size = action_size
        self.state_size = state_size
        self.batch_size = settings['batch_size']
        self.noisy_nets = settings['qnet_settings']['noisy_nets']
        self.distributional = settings["qnet_settings"]["distributional"]

        if self.distributional:
            # Currently the distributional agent always uses Dueling DQN
            self.qnet = DistributionalDuelDQN(state_size, action_size, settings['qnet_settings'], device).to(device)
            self.q_target = DistributionalDuelDQN(state_size, action_size, settings['qnet_settings'], device).to(device)
            vmin, vmax = settings["qnet_settings"]["vmin"], settings["qnet_settings"]["vmax"]
            number_atoms = settings["qnet_settings"]["number_atoms"]
            self.distribution_updater = DistributionUpdater(vmin, vmax, number_atoms)
        else:
            if settings["duelling_dqn"]:
                self.qnet = DuelDQN(state_size, action_size, settings['qnet_settings']).to(device)
                self.q_target = DuelDQN(state_size, action_size, settings['qnet_settings']).to(device)
            else:
                self.qnet = Dqn(state_size, action_size, settings['qnet_settings']).to(device)
                self.q_target = Dqn(state_size, action_size, settings['qnet_settings']).to(device)

        self.q_target.load_state_dict(self.qnet.state_dict())
        self.optimizer = optim.Adam(self.qnet.parameters(), lr=settings['lr'])

        self.epsilon = settings["epsilon_start"]
        self.decay = settings["epsilon_decay"]
        self.epsilon_min = settings["epsilon_min"]
        self.gamma = settings['gamma']

        self.start_optimization = settings["start_optimization"]
        self.update_qnet_every = settings["update_qnet_every"]
        self.update_target_every = settings["update_target_every"]
        self.number_steps = 0
        self.ddqn = settings["ddqn"]

        # Initialize replay memory
        self.prioritized_replay = settings["prioritized_buffer"]
        if self.prioritized_replay:
            self.memory = PrioritizedReplayMemory(device, settings["buffer_size"], self.gamma, settings["n_steps"],
                                                  settings["alpha"], settings["beta0"], settings["beta_increment"])
        else:
            self.memory = ReplayMemory(device, settings["buffer_size"], self.gamma, settings["n_steps"])
        return

    def policy(self, timestep: dm_env.TimeStep) -> int:
        """
        Returns an action following an epsilon-greedy policy.

        Args:
            timestep(dm_env.TimeStep): An observation from the environment

        Returns:
            int: The chosen action.
        """
        observation = np.array(timestep.observation).flatten()
        observation = torch.from_numpy(observation).float().to(self.device)
        self.number_steps += 1

        if not self.noisy_nets:
            self.update_epsilon()

        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_size)
        else:
            return int(self.qnet.get_max_action(observation))

    def update_epsilon(self) -> None:
        """
        Decays epsilon until self.epsilon_min

        Returns:
            None
        """
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.decay

    @staticmethod
    def calc_loss(q_observed: torch.Tensor,
                  q_target: torch.Tensor,
                  weights: torch.Tensor) -> typing.Tuple[torch.Tensor, np.float64]:
        """
        Returns the mean weighted MSE loss and the loss for each sample

        Args:
            q_observed(torch.Tensor): calculated q_value
            q_target(torch.Tensor): target q-value
            weights: weights of the batch samples

        Returns:
            tuple(torch.Tensor, np.float64): mean squared error loss, loss for each individual sample
        """
        losses = functional.mse_loss(q_observed, q_target, reduction='none')
        loss = (weights * losses).sum() / weights.sum()
        return loss, losses.cpu().detach().numpy() + 1e-8

    @staticmethod
    def calc_distributional_loss(dist: torch.Tensor,
                                 proj_dist: torch.Tensor,
                                 weights: torch.Tensor,
                                 ) -> typing.Tuple[torch.Tensor, np.float64]:
        """
        Calculates the distributional loss metric.

        Args:
            dist(torch.Tensor): The observed distribution
            proj_dist: The projected target distribution
            weights: weights of the batch samples

        Returns:
            tuple(torch.Tensor, np.float64): cross-entropy loss, loss for each individual sample
        """
        losses = -functional.log_softmax(dist, dim=1) * proj_dist
        losses = weights * losses.sum(dim=1)
        return losses.mean(), losses.cpu().detach().numpy() + 1e-8

    def update(self, step: dm_env.TimeStep, action: int, next_step: dm_env.TimeStep) -> None:
        """
        Adds experience to the replay memory, performs an optimization_step and updates the q_target neural network.

        Args:
            step(dm_env.TimeStep): Current observation from the environment
            action(int): The action that was performed by the agent.
            next_step(dm_env.TimeStep): Next observation from the environment

        Returns:
            None
        """
        observation = np.array(step.observation).flatten()
        next_observation = np.array(next_step.observation).flatten()
        done = next_step.last()
        exp = Experience(observation, action, next_step.reward, next_step.discount, next_observation, 0, done)
        self.memory.add(exp)

        if self.memory.number_samples() < self.start_optimization:
            return

        if self.number_steps % self.update_qnet_every == 0:
            s0, a0, n_step_reward, discount, s1, _, dones, indices, weights = self.memory.sample_batch(self.batch_size)
            if not self.distributional:
                self.optimization_step(s0, a0, n_step_reward, discount, s1, indices, weights)
            else:
                self.distributional_optimization_step(s0, a0, n_step_reward, discount, s1, dones, indices, weights)

        if self.number_steps % self.update_target_every == 0:
            self.q_target.load_state_dict(self.qnet.state_dict())
        return

    def optimization_step(self,
                          s0: torch.Tensor,
                          a0: torch.Tensor,
                          n_step_reward: torch.Tensor,
                          discount: torch.Tensor,
                          s1: torch.Tensor,
                          indices: typing.Optional[torch.Tensor],
                          weights: typing.Optional[torch.Tensor]) -> None:
        """
        Calculates the Bellman update and updates the qnet.

        Args:
            s0(torch.Tensor): current state
            a0(torch.Tensor): current action
            n_step_reward(torch.Tensor): n-step reward
            discount(torch.Tensor): discount factor
            s1(torch.Tensor): next state
            indices(torch.Tensor): batch indices, needed for prioritized replay. Not used yet.
            weights(torch.Tensor): weights needed for prioritized replay

        Returns:
            None
        """
        with torch.no_grad():
            if self.noisy_nets:
                self.q_target.reset_noise()
                self.qnet.reset_noise()

            # Calculating the target values
            next_q_vals = self.q_target(s1)
            if self.ddqn:
                a1 = torch.argmax(self.qnet(s1), dim=1).unsqueeze(-1)
                next_q_val = next_q_vals.gather(1, a1).squeeze()
            else:
                next_q_val = torch.max(next_q_vals, dim=1).values
            q_target = n_step_reward.squeeze() + self.gamma * discount.squeeze() * next_q_val

        # Getting the observed q-values
        if self.noisy_nets:
            self.qnet.reset_noise()
        q_observed = self.qnet(s0).gather(1, a0.long()).squeeze()

        # Calculating the losses
        if not self.prioritized_replay:
            weights = torch.ones(self.batch_size)
        critic_loss, batch_loss = self.calc_loss(q_observed, q_target, weights)

        # Backpropagation of the gradients
        self.optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnet.parameters(), 5)
        self.optimizer.step()

        # Update replay memory
        self.memory.update_priorities(indices, batch_loss)
        return

    def distributional_optimization_step(self,
                                         s0: torch.Tensor,
                                         a0: torch.Tensor,
                                         n_step_reward: torch.Tensor,
                                         discount: torch.Tensor,
                                         s1: torch.Tensor,
                                         dones: torch.Tensor,
                                         indices: typing.Optional[torch.Tensor],
                                         weights: typing.Optional[torch.Tensor]) -> None:
        """
        Calculates the Bellman update and updates the qnet for the distributional agent.

        Args:
            s0(torch.Tensor): current state
            a0(torch.Tensor): current action
            n_step_reward(torch.Tensor): n-step reward
            discount(torch.Tensor): discount factor
            s1(torch.Tensor): next state
            dones(torch.Tensor): done
            indices(torch.Tensor): batch indices, needed for prioritized replay. Not used yet.
            weights(torch.Tensor): weights needed for prioritized replay

        Returns:
            None
        """
        with torch.no_grad():
            gamma = self.gamma * discount
            if self.noisy_nets:
                self.q_target.reset_noise()
                self.qnet.reset_noise()

            # Calculating the target distributions
            next_dists, next_q_vals = self.q_target.calc(s1)
            if self.ddqn:
                a1 = self.qnet.get_max_action(s1)
            else:
                # argmax (not max) so the distributions can be indexed by the greedy action
                a1 = torch.argmax(next_q_vals, dim=1)
            distributions = next_dists[range(self.batch_size), a1]
            distributions = functional.softmax(distributions, dim=1)
            q_target = self.distribution_updater.update_distribution(
                distributions.cpu().detach().numpy(),
                n_step_reward.cpu().detach().numpy(),
                dones.cpu().detach().numpy(),
                gamma.cpu().detach().numpy())
            q_target = torch.tensor(q_target).to(self.device)

        # Getting the observed q-value distributions
        if self.noisy_nets:
            self.qnet.reset_noise()
        q_observed = self.qnet(s0)
        q_observed = q_observed[range(self.batch_size), a0.squeeze().long()]

        # Calculating the losses
        if not self.prioritized_replay:
            weights = torch.ones(self.batch_size)
        critic_loss, batch_loss = self.calc_distributional_loss(q_observed, q_target, weights)

        # Backpropagation of the gradients
        self.optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnet.parameters(), 5)
        self.optimizer.step()

        # Update replay memory
        self.memory.update_priorities(indices, batch_loss)
        return
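
# DistributionUpdater.update_distribution is not shown; distributional (C51-style) agents
# usually implement it as the categorical projection of r + gamma * z onto the fixed atom
# support. A minimal NumPy sketch under that assumption (names are illustrative):
import numpy as np

def project_distribution(next_probs, rewards, dones, gammas, vmin, vmax, n_atoms):
    delta_z = (vmax - vmin) / (n_atoms - 1)
    support = np.linspace(vmin, vmax, n_atoms)
    projected = np.zeros_like(next_probs)
    for i in range(next_probs.shape[0]):
        # Bellman-shift the support, then clip it back into [vmin, vmax]
        tz = np.clip(rewards[i] + (1.0 - dones[i]) * gammas[i] * support, vmin, vmax)
        b = (tz - vmin) / delta_z
        lower, upper = np.floor(b).astype(int), np.ceil(b).astype(int)
        for j in range(n_atoms):
            if lower[j] == upper[j]:
                projected[i, lower[j]] += next_probs[i, j]
            else:
                projected[i, lower[j]] += next_probs[i, j] * (upper[j] - b[j])
                projected[i, upper[j]] += next_probs[i, j] * (b[j] - lower[j])
    return projected
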
class Agent():
    def __init__(self, args, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.per = args.per
        self.dueling = args.dueling
        self.buffer_size = args.buffer_size
        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.tau = args.tau
        self.lr = args.learning_rate
        self.update_freq = args.update_every

        # Q-Network
        if self.dueling:
            self.local_qnet = DuelingQNet(state_size, action_size, seed).to(device)
            self.target_qnet = DuelingQNet(state_size, action_size, seed).to(device)
        else:
            self.local_qnet = QNet(state_size, action_size, seed).to(device)
            self.target_qnet = QNet(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.local_qnet.parameters(), lr=self.lr)

        # Replay Memory
        if self.per:
            self.memory = PrioritizedReplayMemory(args, self.buffer_size)
        else:
            self.memory = ReplayMemory(action_size, self.buffer_size, self.batch_size, seed)
        self.t_step = 0  # init time step for updating every UPDATE_EVERY steps

    def step(self, state, action, reward, next_state, done):
        # Save experience to replay memory.
        if self.per:
            self.memory.append(state, action, reward, next_state, done)
        else:
            self.memory.add(state, action, reward, next_state, done)
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_freq
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                if self.dueling:
                    self.learn_DDQN(self.gamma)
                else:
                    self.learn(self.gamma)

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.local_qnet.eval()
        with torch.no_grad():
            action_values = self.local_qnet(state)
        self.local_qnet.train()
        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, gamma):
        if self.per:
            idxs, states, actions, rewards, next_states, dones, weights = self.memory.sample(self.batch_size)
        else:
            states, actions, rewards, next_states, dones = self.memory.sample()
        # Get max predicted Q values for next states from target model
        Q_targets_next = self.target_qnet(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.local_qnet(states).gather(1, actions)
        # Compute loss - element-wise mean squared error
        # Now loss is a Tensor of shape (1,)
        # loss.item() gets the scalar value held in the loss.
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize loss
        self.optimizer.zero_grad()
        if self.per:
            (weights * loss).mean().backward()  # Backpropagate importance-weighted minibatch loss
        else:
            loss.backward()
        self.optimizer.step()
        if self.per:
            errors = np.abs((Q_expected - Q_targets).detach().cpu().numpy())
            self.memory.update_priorities(idxs, errors)
        # Update target network
        self.soft_update(self.local_qnet, self.target_qnet, self.tau)

    def learn_DDQN(self, gamma):
        if self.per:
            idxs, states, actions, rewards, next_states, dones, weights = self.memory.sample(self.batch_size)
        else:
            states, actions, rewards, next_states, dones = self.memory.sample()
        # Get index of maximum value for next state from Q_expected
        Q_argmax = self.local_qnet(next_states).detach()
        _, a_prime = Q_argmax.max(1)
        # Get max predicted Q values for next states from target model
        Q_targets_next = self.target_qnet(next_states).detach().gather(1, a_prime.unsqueeze(1))
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.local_qnet(states).gather(1, actions)
        # Compute loss
        # Now loss is a Tensor of shape (1,)
        # loss.item() gets the scalar value held in the loss.
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize loss
        self.optimizer.zero_grad()
        if self.per:
            (weights * loss).mean().backward()  # Backpropagate importance-weighted minibatch loss
        else:
            loss.backward()
        self.optimizer.step()
        if self.per:
            errors = np.abs((Q_expected - Q_targets).detach().cpu().numpy())
            self.memory.update_priorities(idxs, errors)
        # Update target network
        self.soft_update(self.local_qnet, self.target_qnet, self.tau)

    def soft_update(self, local_model, target_model, tau):
        # θ_target = τ*θ_local + (1 - τ)*θ_target
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
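
# F.mse_loss reduces to a scalar by default, so the PER branch above multiplies the
# importance-sampling weights into an already-averaged loss. The more common formulation
# keeps per-sample TD errors, weights them, and reuses their magnitudes as new priorities.
# A hedged sketch of that variant (illustrative names, not this repository's API):
import torch

def weighted_td_loss(q_expected, q_targets, is_weights, eps=1e-6):
    td_errors = q_expected - q_targets
    loss = (is_weights * td_errors.pow(2)).mean()                   # importance-weighted MSE
    new_priorities = td_errors.detach().abs().cpu().numpy() + eps   # |TD error| as priority
    return loss, new_priorities
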
class DDQAgent(DQAgent):
    """ Double DeepQ Agent with q_network and target network """

    def __init__(self, q_network, target_network, environment, name='ddqn'):
        self.q_network = q_network
        self.target_network = target_network
        self.replay_memory = None
        self.environment = environment
        # book keeping
        self.name = name
        self.current_step = 0
        self.save_path = os.path.join('checkpoints', name + '.pkl')
        self.logdir = os.path.join('runs', name)

    def learn(self, num_steps, batch_size=32, capacity=500000, lr=2.5e-4,
              epsilon_max=0.9, epsilon_min=0.05, decay_rate=1e-5,
              checkpoint_interval=50000, initial_memory=50000,
              sync_interval=1000, gamma=0.99):
        cudnn.benchmark = True
        self.replay_memory = ReplayMemory(capacity)
        if len(self.replay_memory) < initial_memory:
            print('populating replay memory...')
            self.prime_replay_memory(initial_memory)
        writer = SummaryWriter(self.logdir)
        optimizer = Adam(self.q_network.parameters(), lr=lr)
        criterion = nn.SmoothL1Loss()
        steps = 0
        pbar = tqdm(total=num_steps)
        while steps <= num_steps:
            state = self.environment.reset()
            total_reward = 0
            while True:
                epsilon = self.calculate_epsilon(epsilon_max, epsilon_min, decay_rate)
                action = self.select_action(state, epsilon)  # select an action
                next_state, reward, done, info = self.environment.step(action)  # carry out action/observe reward
                self.replay_memory.add(state, action, reward, next_state, done)

                # prepare batch
                states, actions, rewards, next_states, done_mask = self.replay_memory.sample(batch_size)
                states = Variable(states).cuda()
                next_states = Variable(next_states).cuda()
                rewards = Variable(rewards).cuda()
                done_mask = Variable(done_mask).cuda()

                # select only Q values for actions we took
                q_values = self.q_network(states)[range(len(actions)), actions]
                target_actions = self.q_network(next_states).max(dim=1)[1]
                next_q_values = self.target_network(next_states)[range(len(target_actions)),
                                                                 target_actions].detach() * done_mask
                # calculate targets = rewards + (gamma * next_Q_values)
                targets = rewards + (gamma * next_q_values)

                loss = criterion(q_values, targets)
                optimizer.zero_grad()
                loss.backward()
                # gradient clipping
                for param in self.q_network.parameters():
                    param.grad.data.clamp_(-1, 1)
                optimizer.step()

                writer.add_scalar('epsilon', epsilon, self.current_step)
                steps += 1
                total_reward += reward
                self.current_step += 1
                state = next_state  # move to next state

                if steps % sync_interval == 0:
                    dqn_params = self.q_network.state_dict()
                    self.target_network.load_state_dict(dqn_params)
                if steps % checkpoint_interval == 0:
                    self.save_checkpoint()
                pbar.update()
                if done:
                    writer.add_scalar('reward', total_reward, self.current_step)
                    pbar.set_description("last episode reward: {}".format(total_reward))
                    break
        self.environment.close()
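
# The only difference from the plain DQAgent below is the target: Double DQN selects the
# next action with the online network but evaluates it with the target network. Side by
# side (done_mask is 0 for terminal transitions, as in the code above):
import torch

def dqn_and_double_dqn_targets(q_net, target_net, next_states, rewards, done_mask, gamma):
    with torch.no_grad():
        dqn_next = target_net(next_states).max(dim=1)[0]                           # vanilla DQN
        a1 = q_net(next_states).argmax(dim=1)                                      # Double DQN: select online...
        ddqn_next = target_net(next_states).gather(1, a1.unsqueeze(1)).squeeze(1)  # ...evaluate with target
    return rewards + gamma * dqn_next * done_mask, rewards + gamma * ddqn_next * done_mask
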
class DQAgent:
    """
    DeepQ Agent without bells and whistles.
    Uses single Q network and replay memory to interact with environment.
    """

    def __init__(self, q_network, environment, name='ddqn'):
        self.q_network = q_network
        self.environment = environment
        self.replay_memory = None
        # book keeping
        self.name = name
        self.current_step = 0
        self.save_path = os.path.join('checkpoints', name + '.pkl')
        self.logdir = os.path.join('runs', name)

    def calculate_epsilon(self, epsilon_max, epsilon_min, decay_rate):
        """ calculates epsilon value given steps done and speed of decay """
        epsilon = epsilon_min + (epsilon_max - epsilon_min) * \
            math.exp(-decay_rate * self.current_step)
        return epsilon

    def select_action(self, state, epsilon):
        """
        epsilon greedy policy. selects the action with the maximum predicted Q value,
        otherwise selects a random action with epsilon probability.

        Args:
            state: current state of the environment (4 stack of image frames)
            epsilon: probability of random action (1.0 - 0.0)

        Returns:
            action
        """
        if epsilon > random.random():
            return self.environment.action_space.sample()
        state = Variable(process_state(state), volatile=True).cuda()
        return int(self.q_network(state).data.max(1)[1])

    def learn(self, num_steps, batch_size=32, capacity=500000, lr=2.5e-4,
              epsilon_max=0.9, epsilon_min=0.05, decay_rate=1e-5,
              checkpoint_interval=50000, initial_memory=50000, gamma=0.99):
        cudnn.benchmark = True
        self.replay_memory = ReplayMemory(capacity)
        if len(self.replay_memory) < initial_memory:
            print('populating replay memory...')
            self.prime_replay_memory(initial_memory)
        writer = SummaryWriter(self.logdir)
        optimizer = Adam(self.q_network.parameters(), lr=lr)
        criterion = nn.SmoothL1Loss()
        steps = 0
        pbar = tqdm(total=num_steps)
        while steps <= num_steps:
            state = self.environment.reset()
            total_reward = 0
            while True:
                epsilon = self.calculate_epsilon(epsilon_max, epsilon_min, decay_rate)
                action = self.select_action(state, epsilon)  # select an action
                next_state, reward, done, info = self.environment.step(action)  # carry out action/observe reward
                self.replay_memory.add(state, action, reward, next_state, done)

                # prepare batch
                states, actions, rewards, next_states, done_mask = self.replay_memory.sample(batch_size)
                states = Variable(states).cuda()
                next_states = Variable(next_states).cuda()
                rewards = Variable(rewards).cuda()
                done_mask = Variable(done_mask).cuda()

                # select only Q values for actions we took
                q_values = self.q_network(states)[range(len(actions)), actions]
                # find next Q values and set Q values for done states to 0
                next_q_values = self.q_network(next_states).max(dim=1)[0].detach() * done_mask
                # calculate targets = rewards + (gamma * next_Q_values)
                targets = rewards + (gamma * next_q_values)

                loss = criterion(q_values, targets)
                optimizer.zero_grad()
                loss.backward()
                # gradient clipping
                for param in self.q_network.parameters():
                    param.grad.data.clamp_(-1, 1)
                optimizer.step()

                writer.add_scalar('epsilon', epsilon, self.current_step)
                steps += 1
                total_reward += reward
                self.current_step += 1
                state = next_state  # move to next state

                if steps % checkpoint_interval == 0:
                    self.save_checkpoint()
                pbar.update()
                if done:
                    writer.add_scalar('reward', total_reward, self.current_step)
                    pbar.set_description("last episode reward: {}".format(total_reward))
                    break
        self.environment.close()

    def play(self, num_episodes, epsilon=0.05, render=True):
        for _ in tqdm(range(num_episodes)):
            total_reward = 0
            state = self.environment.reset()
            while True:
                if render:
                    self.environment.render()
                action = self.select_action(state, epsilon)  # select an action
                next_state, reward, done, info = self.environment.step(action)  # carry out action/observe reward
                total_reward += reward
                state = next_state  # move to next state
                if done:
                    break
        self.environment.close()

    def prime_replay_memory(self, steps):
        """ populates replay memory with transitions generated by random actions """
        while len(self.replay_memory) <= steps:
            state = self.environment.reset()
            while True:
                action = self.environment.action_space.sample()
                next_state, reward, done, info = self.environment.step(action)  # carry out action/observe reward
                self.replay_memory.add(state, action, reward, next_state, done)
                state = next_state  # move to next state
                if done:
                    break

    def load_agent(self, name):
        checkpoint_path = os.path.join('checkpoints', name + '.pkl')
        checkpoint = torch.load(checkpoint_path)
        self.q_network.load_state_dict(checkpoint['weights'])
        self.current_step = checkpoint['current_step']

    def save_checkpoint(self):
        checkpoint = dict(weights=self.q_network.state_dict(), current_step=self.current_step)
        torch.save(checkpoint, self.save_path)
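
# Quick check of the exponential schedule in calculate_epsilon,
# epsilon(t) = eps_min + (eps_max - eps_min) * exp(-decay_rate * t), with the defaults above:
import math

eps_max, eps_min, decay_rate = 0.9, 0.05, 1e-5
for t in (0, 50_000, 200_000, 500_000):
    eps = eps_min + (eps_max - eps_min) * math.exp(-decay_rate * t)
    print(f"step {t:>7}: epsilon = {eps:.3f}")
# Roughly: 0.90 at step 0, ~0.57 after 50k steps, ~0.17 after 200k, ~0.06 after 500k.
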
class Agent():
    def __init__(self, config, session, num_actions):
        self.config = config
        self.sess = session
        self.num_actions = num_actions
        self.gamma = config['gamma']
        self.learning_rate = config['learning_rate']
        self.exp_replay = ReplayMemory(self.config)
        self.game_state = np.zeros((1, config['screen_width'], config['screen_height'],
                                    config['history_length']), dtype=np.uint8)
        self.update_thread = threading.Thread(target=lambda: 0)
        self.update_thread.start()
        self.step_count = 0
        self.episode = 0
        self.isTesting = False
        self.reset_game()
        self.timeout_option = tf.RunOptions(timeout_in_ms=5000)

        # build the net
        with tf.device(config['device']):
            # Create all variables
            self.state_ph = tf.placeholder(tf.float32,
                                           [None, config['screen_width'], config['screen_height'],
                                            config['history_length']], name='state_ph')
            self.stateT_ph = tf.placeholder(tf.float32,
                                            [None, config['screen_width'], config['screen_height'],
                                             config['history_length']], name='stateT_ph')
            self.action_ph = tf.placeholder(tf.int64, [None], name='action_ph')
            self.reward_ph = tf.placeholder(tf.float32, [None], name='reward_ph')
            self.terminal_ph = tf.placeholder(tf.float32, [None], name='terminal_ph')

            # Define training network
            with tf.variable_scope('Q') as scope:
                self.Q = self.Q_network(self.state_ph, config, 'Normal')
                # *** Double Q-Learning ***
                scope.reuse_variables()
                self.DoubleQT = self.Q_network(self.stateT_ph, config, 'DoubleQ')

            # Define Target network
            with tf.variable_scope('QT'):
                self.QT = self.Q_network(self.stateT_ph, config, 'Target')

            # Define training operation
            self.train_op = self.train_op(self.Q, self.QT, self.action_ph, self.reward_ph,
                                          self.terminal_ph, config, 'Normal')

            # Define operation to copy parameters from training to target net.
            with tf.variable_scope('Copy_parameters'):
                self.sync_QT_op = []
                for W_pair in zip(tf.get_collection('Target_weights'), tf.get_collection('Normal_weights')):
                    self.sync_QT_op.append(W_pair[0].assign(W_pair[1]))

            # Define the summary ops
            self.Q_summary_op = tf.merge_summary(tf.get_collection('Normal_summaries'))
            self.summary_writter = tf.train.SummaryWriter(config['log_dir'], self.sess.graph, flush_secs=20)

    def update(self):
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch, _ = \
            self.exp_replay.sample_transition_batch()
        feed_dict = {self.state_ph: state_batch,
                     self.stateT_ph: next_state_batch,
                     self.action_ph: action_batch,
                     self.reward_ph: reward_batch,
                     self.terminal_ph: terminal_batch}
        if self.step_count % self.config['update_summary_rate'] == 0:
            _, Q_summary_str = self.sess.run([self.train_op, self.Q_summary_op],
                                             feed_dict, options=self.timeout_option)
            self.summary_writter.add_summary(Q_summary_str, self.step_count)
        else:
            _ = self.sess.run(self.train_op, feed_dict, options=self.timeout_option)
        if self.step_count % self.config['sync_rate'] == 0:
            self.sess.run(self.sync_QT_op)

    def Q_network(self, input_state, config, Collection=None):
        conv_stack_shape = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]
        head = tf.div(input_state, 256., name='normalized_input')
        head = cops.conv_stack(head, conv_stack_shape, Collection)
        head = cops.flatten(head)
        head = cops.add_relu_layer(head, size=512, Collection=Collection)
        Q = cops.add_linear_layer(head, self.num_actions, Collection, layer_name="Q")
        return Q

    def train_op(self, Q, QT, action, reward, terminal, config, Collection):
        with tf.name_scope('Loss'):
            action_one_hot = tf.one_hot(action, self.num_actions, 1., 0., name='action_one_hot')
            acted_Q = tf.reduce_sum(Q * action_one_hot, reduction_indices=1, name='DQN_acted')
            # *** Double Q-Learning ***
            target_action = tf.argmax(self.DoubleQT, dimension=1)
            target_action_one_hot = tf.one_hot(target_action, self.num_actions, 1., 0.,
                                               name='target_action_one_hot')
            DoubleQT_acted = tf.reduce_sum(self.QT * target_action_one_hot,
                                           reduction_indices=1, name='DoubleQT')
            Y = reward + self.gamma * DoubleQT_acted * (1 - terminal)
            # *** Double Q-Learning ***
            Y = tf.stop_gradient(Y)
            loss_batch = cops.clipped_l2(Y, acted_Q)
            loss = tf.reduce_sum(loss_batch, name='loss')

            tf.scalar_summary('losses/loss', loss, collections=[Collection + '_summaries'])
            tf.scalar_summary('losses/loss_0', loss_batch[0], collections=[Collection + '_summaries'])
            tf.scalar_summary('losses/loss_max', tf.reduce_max(loss_batch), collections=[Collection + '_summaries'])
            tf.scalar_summary('main/Y_0', Y[0], collections=[Collection + '_summaries'])
            tf.scalar_summary('main/Y_max', tf.reduce_max(Y), collections=[Collection + '_summaries'])
            tf.scalar_summary('main/acted_Q_0', acted_Q[0], collections=[Collection + '_summaries'])
            tf.scalar_summary('main/acted_Q_max', tf.reduce_max(acted_Q), collections=[Collection + '_summaries'])
            tf.scalar_summary('main/reward_max', tf.reduce_max(reward), collections=[Collection + '_summaries'])
        train_op, grads = cops.graves_rmsprop_optimizer(loss, self.learning_rate, 0.95, 0.01, 1)
        return train_op

    def testing(self, t=True):
        self.isTesting = t

    def reset_game(self):
        self.episode_begining = True
        self.game_state.fill(0)

    def epsilon(self):
        if self.step_count < self.config['exploration_steps']:
            return self.config['ep_start'] - \
                ((self.config['ep_start'] - self.config['ep_min']) /
                 self.config['exploration_steps']) * self.step_count
        else:
            return self.config['ep_min']

    def e_greedy_action(self, epsilon):
        if np.random.uniform() < epsilon:
            action = random.randint(0, self.num_actions - 1)
        else:
            action = np.argmax(self.sess.run(self.Q, feed_dict={self.state_ph: self.game_state})[0])
        return action

    def done(self):
        if not self.isTesting:
            self.exp_replay.add(self.game_state[:, :, :, -1], self.game_action, self.game_reward, True)
        self.reset_game()

    def observe(self, x, r):
        self.game_reward = r
        x_ = cv2.resize(x, (self.config['screen_width'], self.config['screen_height']))
        x_ = cv2.cvtColor(x_, cv2.COLOR_RGB2GRAY)
        self.game_state = np.roll(self.game_state, -1, axis=3)
        self.game_state[0, :, :, -1] = x_

    def step(self, x, r):
        r = max(self.config['min_reward'], min(self.config['max_reward'], r))
        if not self.isTesting:
            if not self.episode_begining:
                self.exp_replay.add(self.game_state[:, :, :, -1], self.game_action, self.game_reward, False)
            else:
                # add the reset buffer
                for i in range(self.config['history_length'] - 1):
                    self.exp_replay.add(self.game_state[:, :, :, i], 0, 0, False)
                self.episode_begining = False
            self.observe(x, r)
            self.game_action = self.e_greedy_action(self.epsilon())
            if self.step_count > self.config['steps_before_training']:
                self.update_thread.join()
                self.update_thread = threading.Thread(target=self.update)
                self.update_thread.start()
            self.step_count += 1
        else:
            self.observe(x, r)
            self.game_action = self.e_greedy_action(0.01)
        return self.game_action
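
# cops.clipped_l2 is not shown; DQN implementations of this era typically use a Huber-style
# loss that is quadratic for small errors and linear beyond a clip threshold, which bounds
# the gradient magnitude. A NumPy sketch of that assumed behaviour (illustrative only):
import numpy as np

def clipped_l2(y, q, grad_clip=1.0):
    err = np.abs(y - q)
    quadratic = np.minimum(err, grad_clip)
    linear = err - quadratic
    return 0.5 * quadratic ** 2 + grad_clip * linear
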
class DeepQ_agent:
    """
    Represents the DQN agent.
    """

    def __init__(self, env, hidden_units=None, network_LR=0.01, batch_size=1024, update_every=5, gamma=0.95):
        """
        Creates a DQN agent.

        :param env: game environment.
        :type env: Class Snake_Env().
        :param hidden_units: number of neurons in each layer.
        :type hidden_units: tuple with dimension (1, 3).
        :param network_LR: learning rate of the action-value neural network.
        :type network_LR: float.
        :param batch_size: size of the minibatch taken from the replay buffer.
        :type batch_size: int.
        :param update_every: number of iterations for updating the target qnetwork.
        :type update_every: int
        :param gamma: discount factor.
        :type gamma: float.
        """
        self.env = env
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.NETWORK_LR = network_LR
        self.MEMORY_CAPACITY = int(1e5)
        self.ACTION_SIZE = env.ACTION_SPACE
        self.HIDDEN_UNITS = hidden_units
        self.UPDATE_EVERY = update_every

        self.qnetwork_local = QNetwork(input_shape=self.env.STATE_SPACE,
                                       hidden_units=self.HIDDEN_UNITS,
                                       output_size=self.ACTION_SIZE,
                                       learning_rate=self.NETWORK_LR)
        self.qnetwork_target = QNetwork(input_shape=self.env.STATE_SPACE,
                                        hidden_units=self.HIDDEN_UNITS,
                                        output_size=self.ACTION_SIZE,
                                        learning_rate=self.NETWORK_LR)
        self.memory = ReplayMemory(self.MEMORY_CAPACITY, self.BATCH_SIZE)

        # Temp variable
        self.t = 0

    def learn(self):
        """
        Learn from memorized experience.
        """
        if self.memory.__len__() > self.BATCH_SIZE:
            states, actions, rewards, next_states, dones = self.memory.sample(self.env.STATE_SPACE)

            # Calculating action-values using local network
            target = self.qnetwork_local.predict(states, self.BATCH_SIZE)
            # Future action-values using target network
            target_val = self.qnetwork_target.predict(next_states, self.BATCH_SIZE)
            # Future action-values using local network
            target_next = self.qnetwork_local.predict(next_states, self.BATCH_SIZE)

            max_action_values = np.argmax(target_next, axis=1)  # action selection
            for i in range(self.BATCH_SIZE):
                if dones[i]:
                    target[i][actions[i]] = rewards[i]
                else:
                    # action evaluation
                    target[i][actions[i]] = rewards[i] + self.GAMMA * target_val[i][max_action_values[i]]

            self.qnetwork_local.train(states, target, batch_size=self.BATCH_SIZE)

            if self.t == self.UPDATE_EVERY:
                self.update_target_weights()
                self.t = 0
            else:
                self.t += 1

    def act(self, state, epsilon=0.0):
        """
        Chooses an action using an epsilon-greedy policy.

        :param state: current state.
        :type state: NumPy array with dimension (1, 18).
        :param epsilon: epsilon used in epsilon-greedy policy.
        :type epsilon: float
        :return action: action chosen by the agent.
        :rtype: int
        """
        state = state.reshape((1,) + state.shape)
        action_values = self.qnetwork_local.predict(state)  # returns a vector of size = self.ACTION_SIZE
        if random() > epsilon:
            action = np.argmax(action_values)  # choose best action - Exploitation
        else:
            action = randint(0, self.ACTION_SIZE - 1)  # choose random action - Exploration
        return action

    def add_experience(self, state, action, reward, next_state, done):
        """
        Add experience to agent's memory.
        """
        self.memory.add(state, action, reward, next_state, done)

    def update_target_weights(self):
        """
        Updates values of the Target network.
        """
        self.qnetwork_target.model.set_weights(self.qnetwork_local.model.get_weights())
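
# The per-sample loop in learn() is the Double DQN update: the local network selects the
# next action (argmax), the target network evaluates it. A vectorized NumPy sketch of the
# same target construction (illustrative names):
import numpy as np

def build_targets(q_local_s, q_local_s1, q_target_s1, actions, rewards, dones, gamma):
    targets = q_local_s.copy()
    best_actions = np.argmax(q_local_s1, axis=1)                      # selection with the local net
    rows = np.arange(len(actions))
    bootstrapped = rewards + gamma * q_target_s1[rows, best_actions]  # evaluation with the target net
    targets[rows, actions] = np.where(dones, rewards, bootstrapped)
    return targets
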
class T3DAgent:
    def __init__(self, env, brain, brain_name, device, settings):
        self.env = env
        self.brain_name = brain_name
        self.device = device
        action_size = brain.vector_action_space_size
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        state_size = states.shape[1]
        self.action_size = action_size
        self.state_size = state_size
        self.batch_size = settings['batch_size']

        # Initialize actor local and target networks
        self.actor_local = Actor(state_size, action_size, settings['actor_settings']).to(device)
        self.actor_target = Actor(state_size, action_size, settings['actor_settings']).to(device)
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=settings['lr_actor'])

        # Initialize critic networks
        self.critic_local = Critic(state_size, action_size, settings['critic_settings']).to(device)
        self.critic_target = Critic(state_size, action_size, settings['critic_settings']).to(device)
        self.critic_target.load_state_dict(self.critic_local.state_dict())
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=settings['lr_critic'])

        # Save some of the settings into class member variables
        self.pretrain_steps = settings['pretrain_steps']
        self.gamma = settings['gamma']
        self.tau = settings['tau']
        self.action_noise = settings['action_noise']
        self.action_clip = settings['action_clip']
        self.target_action_noise = settings['target_action_noise']
        self.target_noise_clip = settings['target_noise_clip']
        self.optimize_every = settings['optimize_critic_every']

        # Initialize replay memory and episode generator
        self.memory = ReplayMemory(device, settings['buffer_size'])
        self.generator = self.play_episode()
        self.number_steps = 0
        return

    def get_action_noise(self):
        return self.action_noise

    def set_action_noise(self, std):
        self.action_noise = std
        return

    def pretrain(self):
        # The idea of using a pretrain phase before starting regular episodes
        # is from https://github.com/whiterabbitobj/Continuous_Control/
        print("Random sampling of " + str(self.pretrain_steps) + " steps")
        env = self.env
        brain_name = self.brain_name
        env_info = env.reset(train_mode=True)[brain_name]
        number_agents = env_info.vector_observations.shape[0]
        for _ in range(self.pretrain_steps):
            actions = []
            states = env_info.vector_observations
            for _ in range(number_agents):
                actions.append(np.random.uniform(-1, 1, self.action_size))
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                self.memory.add(Experience(state, action, reward, next_state, done))
            if np.any(dones):
                env_info = env.reset(train_mode=True)[brain_name]

    def play_episode(self, train_mode=True):
        # The idea of generating episodes in an "experience generator" is from
        # "Deep Reinforcement Learning Hands-On" by Maxim Lapan
        print("Starting episode generator")
        # Initialize the environment
        env = self.env
        brain_name = self.brain_name
        env_info = env.reset(train_mode=train_mode)[brain_name]
        # Initialize episode_rewards and get the first state
        episode_rewards = []
        # Run episode step by step
        while True:
            states = env_info.vector_observations
            with torch.no_grad():
                actions = self.actor_local.forward(
                    torch.from_numpy(states).type(torch.FloatTensor).to(self.device)).cpu().detach().numpy()
            actions += self.action_noise * np.random.normal(size=actions.shape)
            actions = np.clip(actions, -self.action_clip, self.action_clip)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            episode_rewards.append(rewards)
            for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                self.memory.add(Experience(state, action, reward, next_state, done))
            if np.any(dones):
                agent_reward = np.sum(episode_rewards, axis=0)
                std_reward = np.std(agent_reward)
                mean_reward = np.mean(agent_reward)
                episode_rewards = []
                env_info = env.reset(train_mode=True)[brain_name]
                yield mean_reward, std_reward
            else:
                yield -1, -1

    def take_step(self, train_mode=True):
        return next(self.generator, train_mode)

    def learn(self):
        self.number_steps += 1
        if self.memory.number_samples() <= self.batch_size:
            return
        # states, actions, rewards, next states, done
        s0, a0, r, s1, d = self.memory.sample_batch(self.batch_size)
        critic_loss_a, critic_loss_b = self.optimize_critic(s0, a0, r, s1, d)
        actor_loss = self.optimize_actor(s0)
        return actor_loss, critic_loss_a, critic_loss_b

    def optimize_actor(self, s0):
        # Calc policy loss
        if self.number_steps % self.optimize_every == 0:
            a0_pred = self.actor_local(s0)
            actor_loss = -self.critic_local.get_qa(s0, a0_pred).mean()
            # Update actor nn
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            # slow update
            self.slow_update(self.tau)
            return -actor_loss.cpu().detach().numpy()
        return 0

    def optimize_critic(self, s0, a0, r, s1, d):
        # The ideas of adding noise to the next action a1, as well as the critic loss that takes q1_expected and
        # q2_expected as arguments at the same time, are from the implementation of the authors of the TD3 manuscript
        # at https://github.com/sfujim/TD3/
        with torch.no_grad():
            # calc critic loss
            noise = torch.randn_like(a0).to(self.device)
            noise = noise * torch.tensor(self.target_action_noise).expand_as(noise).to(self.device)
            noise = noise.clamp(-self.target_noise_clip, self.target_noise_clip)
            a1 = (self.actor_target(s1) + noise).clamp(-self.action_clip, self.action_clip)
            qa_target, qb_target = self.critic_target(s1, a1)
            q_target = torch.min(qa_target, qb_target)
            q_target = r + self.gamma * (1.0 - d) * q_target
        qa_expected, qb_expected = self.critic_local(s0, a0)
        critic_loss_a = functional.mse_loss(qa_expected, q_target)
        critic_loss_b = functional.mse_loss(qb_expected, q_target)
        critic_loss = critic_loss_a + critic_loss_b
        # Update critic nn
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
        return critic_loss_a.cpu().detach().numpy(), critic_loss_b.cpu().detach().numpy()

    def slow_update(self, tau):
        for target_par, local_par in zip(self.actor_target.parameters(), self.actor_local.parameters()):
            target_par.data.copy_(tau * local_par.data + (1.0 - tau) * target_par.data)
        for target_par, local_par in zip(self.critic_target.parameters(), self.critic_local.parameters()):
            target_par.data.copy_(tau * local_par.data + (1.0 - tau) * target_par.data)
        return

    def load_nets(self, actor_file_path, critic_file_path):
        self.actor_local.load_state_dict(torch.load(actor_file_path))
        self.actor_local.eval()
        self.critic_local.load_state_dict(torch.load(critic_file_path))
        self.critic_local.eval()
        return

    def save_nets(self, model_save_path):
        actor_path = model_save_path + "_actor_net.pt"
        torch.save(self.actor_local.state_dict(), actor_path)
        critic_path = model_save_path + "_critic_net.pt"
        torch.save(self.critic_local.state_dict(), critic_path)
        return
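
# The target-policy smoothing used inside optimize_critic, isolated for reference: clipped
# Gaussian noise is added to the target actor's action before the clipped double-Q minimum
# is taken. A minimal sketch (names are illustrative):
import torch

def smoothed_target_action(actor_target, next_states, noise_std, noise_clip, action_clip):
    with torch.no_grad():
        a1 = actor_target(next_states)
        noise = (torch.randn_like(a1) * noise_std).clamp(-noise_clip, noise_clip)
        return (a1 + noise).clamp(-action_clip, action_clip)
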
class MADDPGAgent(Agent):
    def __init__(self, index, name, env, actor, critic, params):
        self.index = index
        self.name = name
        self.env = env
        self.actor = actor.to(DEVICE)
        self.critic = critic.to(DEVICE)
        self.actor_target = actor.clone().to(DEVICE)
        self.critic_target = critic.clone().to(DEVICE)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=params.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=params.lr_critic)
        self.memory = ReplayMemory(params.memory_size, params.max_episode_len,
                                   self.actor.n_outputs, self.actor.n_inputs)
        self.mse = torch.nn.MSELoss()

        # params
        self.batch_size = params.batch_size
        self.tau = params.tau
        self.gamma = params.gamma
        self.clip_grads = True

        # flags
        # local obs/actions means only the obs/actions of this agent are available
        # if obs and actions are local this is equivalent to DDPG
        self.local_obs = params.local_obs
        self.local_actions = params.local_actions or params.local_obs

        # agent modeling
        self.use_agent_models = params.use_agent_models
        self.agent_models = {}
        self.model_optims = {}
        self.model_lr = params.modeling_lr
        self.entropy_weight = 1e-3
        self.max_past = params.max_past
        self.modeling_train_steps = params.modeling_train_steps
        self.modeling_batch_size = params.modeling_batch_size
        self.model_class = Actor

        # action and observation noise
        self.obfuscate_others = (params.sigma_noise is not None) or (params.temp_noise is not None)
        self.sigma_noise = params.sigma_noise
        self.temp_noise = params.temp_noise

    def init_agent_models(self, agents):
        for agent in agents:
            if agent is self:
                continue
            agent_model = self.model_class.from_actor(agent.actor).to(DEVICE)
            self.agent_models[agent.index] = agent_model
            optim = torch.optim.Adam(agent_model.parameters(), lr=self.model_lr)
            self.model_optims[agent.index] = optim

    def update_params(self, target, source):
        zipped = zip(target.parameters(), source.parameters())
        for target_param, source_param in zipped:
            updated_param = target_param.data * (1.0 - self.tau) + \
                source_param.data * self.tau
            target_param.data.copy_(updated_param)

    def act(self, obs, explore=True):
        obs = torch.tensor(obs, dtype=torch.float, requires_grad=False).to(DEVICE)
        actions = self.actor.select_action(obs, explore=explore).detach()
        return actions.to('cpu').numpy()

    def experience(self, episode_count, obs, action, reward, new_obs, done):
        self.memory.add(episode_count, obs, action, reward, new_obs, float(done))

    def train_actor(self, batch):
        ### forward pass ###
        pred_actions = self.actor.select_action(batch.observations[self.index])
        actions = list(batch.actions)
        actions[self.index] = pred_actions
        q_obs = [batch.observations[self.index]] if self.local_obs else batch.observations
        q_actions = [actions[self.index]] if self.local_actions else actions
        pred_q = self.critic(q_obs, q_actions)

        ### backward pass ###
        p_reg = torch.mean(self.actor.forward(batch.observations[self.index]) ** 2)
        loss = -pred_q.mean() + 1e-3 * p_reg
        self.actor_optim.zero_grad()
        loss.backward()
        if self.clip_grads:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5)
        self.actor_optim.step()
        return loss

    def train_critic(self, batch, agents):
        """Train critic with TD-target."""
        ### forward pass ###
        # (a_1', ..., a_n') = (mu'_1(o_1'), ..., mu'_n(o_n'))
        self_obs = batch.next_observations[self.index]
        self_action = self.actor_target.select_action(self_obs).detach()
        if self.local_actions:
            pred_next_actions = [self_action]
        elif self.use_agent_models:
            pred_next_actions = [
                m.select_action(batch.next_observations[idx]).detach()
                for idx, m in self.agent_models.items()
            ]
            pred_next_actions.insert(self.index, self_action)
        else:
            pred_next_actions = [
                a.actor_target.select_action(o).detach()
                for o, a in zip(batch.next_observations, agents)
            ]
        q_next_obs = [batch.next_observations[self.index]] if self.local_obs else batch.next_observations
        q_next = self.critic_target(q_next_obs, pred_next_actions)
        reward = batch.rewards[self.index]
        done = batch.dones[self.index]
        # if not done: y = r + gamma * Q(o_1, ..., o_n, a_1', ..., a_n')
        # if done:     y = r
        q_target = reward + (1.0 - done) * self.gamma * q_next

        ### backward pass ###
        # loss(params) = mse(y, Q(o_1, ..., o_n, a_1, ..., a_n))
        q_obs = [batch.observations[self.index]] if self.local_obs else batch.observations
        q_actions = [batch.actions[self.index]] if self.local_actions else batch.actions
        loss = self.mse(self.critic(q_obs, q_actions), q_target.detach())
        self.critic_optim.zero_grad()
        loss.backward()
        if self.clip_grads:
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
        self.critic_optim.step()
        return loss

    def train_models(self, batch, agents):
        for idx, model in self.agent_models.items():
            obs = batch.observations[idx]
            actions = batch.actions[idx]
            distributions = model.prob_dists(obs)
            split_actions = torch.split(actions, agents[idx].actor.action_split, dim=-1)
            self.model_optims[idx].zero_grad()
            losses = torch.zeros(len(distributions))
            for i, (actions, dist) in enumerate(zip(split_actions, distributions)):
                entropy = dist.base_dist._categorical.entropy()
                loss = (dist.log_prob(actions).mean() + self.entropy_weight * entropy).mean()
                losses[i] = loss
            loss = -torch.mean(losses)
            loss.backward()
            self.model_optims[idx].step()
        return loss

    def compare_models(self, agents, batch):
        kls = []
        for idx, model in self.agent_models.items():
            kls.append([])
            obs = batch.observations[idx]
            modelled_distributions = model.prob_dists(obs)
            agent_distributions = agents[idx].actor.prob_dists(obs)
            for model_dist, agent_dist in zip(modelled_distributions, agent_distributions):
                kl_div = torch.distributions.kl.kl_divergence(agent_dist, model_dist).data
                kls[-1].append(kl_div.mean())
        return zip(self.agent_models.keys(), kls)

    def add_noise_(self, batch):
        for i in range(len(batch.actions)):
            if i == self.index:
                continue
            # get observations and actions for agent i
            obs = batch.observations[i]
            actions = batch.actions[i]
            # create noise tensors, same shape and on same device
            if self.sigma_noise is not None:
                obs = obs + torch.randn_like(obs) * self.sigma_noise
            if self.temp_noise is not None:
                temp = torch.tensor(self.temp_noise, dtype=torch.float, device=actions.device)
                # avoid zero probs which lead to nan samples
                probs = actions + 1e-45
                actions = RelaxedOneHotCategorical(temp, probs=probs).sample()
            # add noise
            batch.observations[i] = obs
            batch.actions[i] = actions

    def update(self, agents):
        # collect transition memories from all agents
        memories = [a.memory for a in agents]
        # train model networks
        if self.use_agent_models:
            model_losses = []
            for _ in range(self.modeling_train_steps):
                batch = self.memory.sample_transitions_from(memories, self.modeling_batch_size,
                                                            max_past=self.max_past)
                if self.obfuscate_others:
                    self.add_noise_(batch)
                model_losses.append(self.train_models(batch, agents).data)
            model_loss = np.mean(model_losses)
            model_kls = self.compare_models(agents, batch)
        else:
            model_loss = None
            model_kls = None
        # sample minibatch
        batch = self.memory.sample_transitions_from(memories, self.batch_size)
        if self.obfuscate_others:
            self.add_noise_(batch)
        # train actor and critic network
        actor_loss = self.train_actor(batch)
        critic_loss = self.train_critic(batch, agents)
        # update target network params
        self.update_params(self.actor_target, self.actor)
        self.update_params(self.critic_target, self.critic)
        return actor_loss, critic_loss, model_loss, model_kls

    def get_state(self):
        if self.agent_models:
            models = {i: m.state_dict() for i, m in self.agent_models.items()}
            optims = {i: o.state_dict() for i, o in self.model_optims.items()}
            model_pair = (models, optims)
        else:
            model_pair = None
        return {
            'actor': self.actor.state_dict(),
            'actor_target': self.actor_target.state_dict(),
            'actor_optim': self.actor_optim.state_dict(),
            'critic': self.critic.state_dict(),
            'critic_target': self.critic_target.state_dict(),
            'critic_optim': self.critic_optim.state_dict(),
        }, model_pair

    def load_state(self, state):
        for key, value in state['state_dicts'].items():
            getattr(self, key).load_state_dict(value)
        if 'models' in state:
            models, optims = state['models']
            for i, m in models.items():
                self.agent_models[i].load_state_dict(m)
            for i, o in optims.items():
                self.model_optims[i].load_state_dict(o)
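
# Small illustration of the action obfuscation in add_noise_: other agents' stored one-hot
# actions are resampled from a RelaxedOneHotCategorical whose probabilities are the stored
# action vector, so low temperatures stay close to the original action.
import torch
from torch.distributions import RelaxedOneHotCategorical

stored_action = torch.tensor([0.05, 0.90, 0.05])
temperature = torch.tensor(0.5)
noisy_action = RelaxedOneHotCategorical(temperature, probs=stored_action + 1e-45).sample()
print(noisy_action)  # a soft one-hot vector that sums to 1
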
class Agent:
    def __init__(self,
                 environment,
                 optimizer,
                 memory_length,
                 dueling=True,
                 loss='mse',
                 noisy_net=False,
                 egreedy=False,
                 save_memory=None,
                 save_weights=None,
                 verbose_action=False,
                 ):
        self.environment = environment
        self._optimizer = optimizer
        self._loss = loss
        self.dueling = dueling
        self.egreedy = egreedy
        self.noisy_net = noisy_net

        # Initialize discount and exploration rate, etc
        self.total_steps = 0
        self.gamma = 0.99
        self.epsilon = 1
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.00005
        self.tau = 0.05
        self.pretraining_steps = 0

        # Build networks
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.align_target_model(how='hard')

        self.memory = ReplayMemory(memory_length)
        self.save_weights_fp = save_weights
        self.save_memory_fp = save_memory
        self.start_time = datetime.datetime.now()
        self.verbose_action = verbose_action

    def load_memory(self, fp):
        with open(fp, 'rb') as f:
            self.memory.load_memory(pickle.load(f))
            print(f'loading {self.memory.length} memories...')

    def save_memory(self, fp):
        if fp:
            with open(fp, 'wb') as f:
                print('saving replay memory...')
                pickle.dump(self.memory.get_memory(), f)

    def load_weights(self, weights_fp):
        if weights_fp:
            print('loading weights...')
            self.q_network.load_weights(weights_fp)
            self.align_target_model(how='hard')

    def save_weights(self, weights_fp):
        if weights_fp:
            self.q_network.save_weights(weights_fp)

    def set_epsilon_decay_schedule(self, epsilon, epsilon_min, annealed_steps):
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = math.log(self.epsilon / self.epsilon_min) / annealed_steps

    def set_beta_schedule(self, beta_start, beta_max, annealed_samplings):
        self.memory.beta = beta_start
        self.memory.beta_max = beta_max
        self.memory.beta_increment_per_sampling = (self.memory.beta_max - self.memory.beta) / annealed_samplings

    def predict(self, state, use_target=False):
        if use_target:
            return self.target_network.predict(state)
        else:
            return self.q_network.predict(state)

    def _decay_epsilon(self):
        self.epsilon = self.epsilon * np.exp(-self.epsilon_decay)

    def store(self, state, action, reward, next_state, terminated):
        self.memory.add((state, action, reward, next_state, terminated))
        self.total_steps += 1
        if not self.egreedy:
            if (self.epsilon > self.epsilon_min) and (self.memory.length > self.pretraining_steps):
                self._decay_epsilon()

    def batch_store(self, batch_load):
        batch_load[-2][2] = -0.1  # custom reward altering
        for row in batch_load:
            self.store(*row)

    def _build_compile_model(self):
        inputs = tf.keras.layers.Input(shape=(32, 290, 4))
        conv1 = tf.keras.layers.Conv2D(32, (8, 8), strides=4, padding='same', activation='relu')(inputs)
        conv2 = tf.keras.layers.Conv2D(64, (4, 4), strides=2, padding='same', activation='relu')(conv1)
        conv3 = tf.keras.layers.Conv2D(64, (3, 3), strides=1, padding='same', activation='relu')(conv2)
        conv3 = tf.keras.layers.Flatten()(conv3)

        if self.noisy_net:
            advt = NoisyNetDense(256, activation='relu')(conv3)
            final = NoisyNetDense(2)(advt)
        else:
            advt = tf.keras.layers.Dense(256, activation='relu')(conv3)
            final = tf.keras.layers.Dense(2)(advt)

        if self.dueling:
            if self.noisy_net:
                value = NoisyNetDense(256, activation='relu')(conv3)
                value = NoisyNetDense(1)(value)
            else:
                value = tf.keras.layers.Dense(256, activation='relu')(conv3)
                value = tf.keras.layers.Dense(1)(value)
            advt = tf.keras.layers.Lambda(lambda x: x - tf.reduce_mean(x, axis=1, keepdims=True))(final)
            final = tf.keras.layers.Add()([value, advt])

        model = tf.keras.models.Model(inputs=inputs, outputs=final)
        model.compile(optimizer=self._optimizer, loss=self._loss, metrics=['accuracy'])
        return model

    def align_target_model(self, how):
        assert how in ('hard', 'soft'), '"how" must be either "hard" or "soft"'
        if how == 'hard':
            self.target_network.set_weights(self.q_network.get_weights())
        elif how == 'soft':
            for t, e in zip(self.target_network.trainable_variables, self.q_network.trainable_variables):
                t.assign(t * (1 - self.tau) + (e * self.tau))

    def choose_action(self, state):
        if not self.egreedy:
            if np.random.rand() <= self.epsilon:
                action = self.environment.action_space.sample()
                if self.verbose_action:
                    print(f'action: {action}, q: random')
                return action
        q_values = self.predict(state, use_target=False)
        action = np.argmax(q_values[0])
        if self.verbose_action:
            print(f'action: {action}, q: {q_values}')
        return action

    def train(self, batch, is_weights):
        td_errors = np.zeros(len(batch))
        states = np.zeros((len(batch), 32, 290, 4))
        targets = np.zeros((len(batch), 2))
        for i, (state, action, reward, next_state, terminated) in enumerate(batch):
            target, td_error = self._get_target(state, action, reward, next_state, terminated)
            states[i] = state.reshape(32, 290, 4)
            targets[i] = target
            td_errors[i] = td_error
        self.q_network.fit(states, targets, sample_weight=is_weights, batch_size=32, epochs=1, verbose=0)
        self.align_target_model(how='soft')
        return td_errors

    def replay(self, batch_size, epoch_steps=None):
        num_batches = 1
        if epoch_steps:
            num_batches = int(np.max([np.floor(epoch_steps / 4), 1]))
        bar = progressbar.ProgressBar(maxval=num_batches,
                                      widgets=[f'training - ', progressbar.widgets.Counter(),
                                               f'/{num_batches} ', progressbar.Bar('=', '[', ']'),
                                               ' ', progressbar.Percentage()])
        bar.start()
        for i in range(num_batches):
            leaf_idx, batch, is_weights = self.memory.get_batch(batch_size)  # prioritized experience replay
            td_errors = self.train(batch, is_weights)
            self.memory.update_sum_tree(leaf_idx, td_errors)
            bar.update(i + 1)
        bar.finish()
        self.save_weights(self.save_weights_fp)

    def _get_target(self, state, action, reward, next_state, terminated):
        target = self.predict(state, use_target=False)
        prev_target = target[0][action]
        if terminated:
            target[0][action] = reward
        else:
            a = np.argmax(self.predict(next_state, use_target=False)[0])
            target[0][action] = reward + (self.gamma * self.predict(next_state, use_target=True)[0][a])  # double Q Network
        td_error = abs(prev_target - target[0][action])
        return target, td_error
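
# The dueling head built in _build_compile_model combines a scalar state value with
# mean-centred advantages: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).
import numpy as np

def dueling_q(value, advantage):
    return value + (advantage - advantage.mean(axis=1, keepdims=True))

# Example: V = 1.0 and advantages [0.2, -0.2] give Q = [1.2, 0.8].
print(dueling_q(np.array([[1.0]]), np.array([[0.2, -0.2]])))
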
class DDPG:
    def __init__(self, env, actor_model, critic_model, memory=10000, batch_size=64,
                 gamma=0.99, tau=0.001, actor_lr=1e-4, critic_lr=1e-3, critic_decay=1e-2,
                 ou_theta=0.15, ou_sigma=0.2, render=None, evaluate=None, save_path=None,
                 save_every=10, render_every=10, train_per_step=True):
        self.env = env
        self.actor = actor_model
        self.actor_target = actor_model.clone()
        self.critic = critic_model
        self.critic_target = critic_model.clone()
        if use_cuda:
            for net in [self.actor, self.actor_target, self.critic, self.critic_target]:
                net.cuda()
        self.memory = ReplayMemory(memory)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.random_process = OrnsteinUhlenbeckProcess(
            env.action_space.shape[0], theta=ou_theta, sigma=ou_sigma)
        self.optim_critic = optim.Adam(self.critic.parameters(), lr=critic_lr, weight_decay=critic_decay)
        self.optim_actor = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.render = render
        self.render_every = render_every
        self.evaluate = evaluate
        self.save_path = save_path
        self.save_every = save_every
        self.train_per_step = train_per_step

    def update(self, target, source):
        zipped = zip(target.parameters(), source.parameters())
        for target_param, source_param in zipped:
            updated_param = target_param.data * (1 - self.tau) + source_param.data * self.tau
            target_param.data.copy_(updated_param)

    def train_models(self):
        if len(self.memory) < self.batch_size:
            return None, None
        mini_batch = self.memory.sample_batch(self.batch_size)
        critic_loss = self.train_critic(mini_batch)
        actor_loss = self.train_actor(mini_batch)
        self.update(self.actor_target, self.actor)
        self.update(self.critic_target, self.critic)
        return critic_loss.data[0], actor_loss.data[0]

    def mse(self, inputs, targets):
        return torch.mean((inputs - targets) ** 2)

    def train_critic(self, batch):
        # forward pass
        pred_actions = self.actor_target(batch.next_states)
        target_q = batch.rewards + batch.done * self.critic_target(
            [batch.next_states, pred_actions]) * self.gamma
        pred_q = self.critic([batch.states, batch.actions])
        # backward pass
        loss = self.mse(pred_q, target_q)
        self.optim_critic.zero_grad()
        loss.backward(retain_graph=True)
        for param in self.critic.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optim_critic.step()
        return loss

    def train_actor(self, batch):
        # forward pass
        pred_mu = self.actor(batch.states)
        pred_q = self.critic([batch.states, pred_mu])
        # backward pass
        loss = -pred_q.mean()
        self.optim_actor.zero_grad()
        loss.backward()
        # for param in self.actor.parameters():
        #     param.grad.data.clamp_(-1, 1)
        self.optim_actor.step()
        return loss

    def prep_state(self, s):
        return Variable(torch.from_numpy(s).float().unsqueeze(0))

    def select_action(self, state, exploration=True):
        if use_cuda:
            state = state.cuda()
        self.actor.eval()
        action = self.actor(state)
        self.actor.train()
        if exploration:
            noise = Variable(torch.from_numpy(self.random_process.sample()).float())
            if use_cuda:
                noise = noise.cuda()
            action = action + noise
        return action

    def step(self, action):
        next_state, reward, done, _ = self.env.step(action.data.cpu().numpy()[0])
        next_state = self.prep_state(next_state)
        reward = FloatTensor([reward])
        return next_state, reward, done

    def warmup(self, num_steps):
        overall_step = 0
        while overall_step <= num_steps:
            done = False
            state = self.prep_state(self.env.reset())
            self.random_process.reset()
            while not done:
                overall_step += 1
                action = self.select_action(state)
                next_state, reward, done = self.step(action)
                self.memory.add(state, action, reward, next_state, done)
                state = next_state

    def train(self, num_steps):
        running_reward = None
        reward_sums = []
        losses = []
        overall_step = 0
        episode_number = 0
        while overall_step <= num_steps:
            episode_number += 1
            done = False
            state = self.prep_state(self.env.reset())
            reward_sum = 0
            self.random_process.reset()
            while not done:
                overall_step += 1
                action = self.select_action(state)
                next_state, reward, done = self.step(action)
                self.memory.add(state, action, reward, next_state, done)
                state = next_state
                reward_sum += reward[0]
                if self.train_per_step:
                    losses.append(self.train_models())
            if not self.train_per_step:
                losses.append(self.train_models())
            render_this_episode = self.render and (episode_number % self.render_every == 0)
            evaluation_reward = self.run(render=render_this_episode)
            reward_sums.append((reward_sum, evaluation_reward))
            if self.save_path is not None and (episode_number % self.save_every == 0):
                self.save_models(self.save_path)
                self.save_results(self.save_path, losses, reward_sums)
            running_reward = reward_sum if running_reward is None else \
                running_reward * 0.99 + reward_sum * 0.01
            print('episode: {} steps: {} running train reward: {:.4f} eval reward: {:.4f}'.format(
                episode_number, overall_step, running_reward, evaluation_reward))
        if self.save_path is not None:
            self.save_models(self.save_path)
            self.save_results(self.save_path, losses, reward_sums)
        return reward_sums, losses

    def run(self, render=True):
        state = self.env.reset()
        done = False
        reward_sum = 0
        while not done:
            if render:
                self.env.render()
            action = self.select_action(self.prep_state(state), exploration=False)
            state, reward, done, _ = self.env.step(action.data.cpu().numpy()[0])
            reward_sum += reward
        return reward_sum

    def save_models(self, path):
        self.actor.save(path)
        self.critic.save(path)

    def save_results(self, path, losses, rewards):
        losses = np.array([l for l in losses if l[0] is not None])
        rewards = np.array(rewards)
        np.savetxt(os.path.join(path, 'losses.csv'), losses, delimiter=',',
                   header='critic,actor', comments='')
        np.savetxt(os.path.join(path, 'rewards.csv'), rewards, delimiter=',',
                   header='train,evaluation', comments='')
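# --- Usage sketch (added for illustration, not part of the original source) ---
# Wires the DDPG class above to a continuous-control Gym task. The make_actor /
# make_critic factories are hypothetical: the class only assumes the models expose
# clone(), save(), the usual nn.Module interface, and are callable on a state batch
# (actor) or a [state, action] pair (critic).
def example_ddpg_run(make_actor, make_critic, save_path='./ddpg_out'):
    import gym
    env = gym.make('Pendulum-v0')
    actor = make_actor(env.observation_space.shape[0], env.action_space.shape[0])
    critic = make_critic(env.observation_space.shape[0], env.action_space.shape[0])
    agent = DDPG(env, actor, critic, render=False, save_path=save_path)
    agent.warmup(num_steps=1000)          # fill the replay memory before any gradient updates
    return agent.train(num_steps=50000)   # returns (reward_sums, losses)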
class Agent:
    def __init__(self, env, sess, horizon, epsilon, learning_rate_policy,
                 learning_rate_value, gamma, lam, logger):
        self.env = env
        self.sess = sess
        self.horizon = horizon
        self.epsilon = epsilon
        self.learning_rate_policy = learning_rate_policy
        self.learning_rate_value = learning_rate_value
        self.gamma = gamma
        self.lam = lam
        self.logger = logger
        self.observation_space = env.observation_space.shape[0]
        self.action_space = env.action_space.shape[0]
        self.policy = Policy(self.observation_space, self.action_space,
                             self.epsilon, self.learning_rate_policy)
        self.value_function = Value_function(self.observation_space, self.learning_rate_value)
        self.replay_memory = ReplayMemory(self.horizon, self.observation_space, self.action_space)

    def learn(self):
        """Training loop that runs until stopped externally."""
        while True:
            # Fill replay memory with one trajectory
            self.run_trajectory()
            adv, vtarget = self.gae()
            self.sess.run(self.policy.network.copy_to(self.policy.network_old))

            # Train policy and value function on minibatches
            bg = BatchGenerator((self.replay_memory.observations,
                                 self.replay_memory.actions, adv), 1000)
            for _ in range(20):
                for ms, ma, madv in bg.iterate_once():
                    self.sess.run(self.policy.optimizer, {
                        self.policy.network.input_pl: ms,
                        self.policy.network_old.input_pl: ms,
                        self.policy.action_pl: ma,
                        self.policy.adv_pl: madv
                    })

            bg = BatchGenerator((self.replay_memory.observations, vtarget), 250)
            for _ in range(10):
                for ms, mvpred in bg.iterate_once():
                    self.sess.run(self.value_function.optimizer, {
                        self.value_function.network.input_pl: ms,
                        self.value_function.value_pl: mvpred
                    })

    def run_trajectory(self):
        """Runs one trajectory and fills the replay memory.

        Returns:
            Nothing; the data is stored in the replay memory for later use.
        """
        self.replay_memory.clear()
        observation = self.env.reset()
        episode_reward = 0
        for _ in range(self.horizon):
            observation = np.array([observation])
            action = self.sess.run(self.policy.network.sample,
                                   {self.policy.network.input_pl: observation})[0]
            new_observation, reward, done, info = self.env.step(action)
            episode_reward += reward
            self.replay_memory.add(observation, action, reward, new_observation, done)
            if done:
                # Log episode reward and reset
                self.logger.add_reward(episode_reward)
                episode_reward = 0
                observation = self.env.reset()
            else:
                observation = new_observation

    def gae(self):
        """Computes the generalized advantage estimate (GAE) from the replay memory.

        Returns:
            gae: normalized generalized advantage estimates
            vtarget: targets for the value function update
        """
        v = self.sess.run(self.value_function.network.predict, {
            self.value_function.network.input_pl: self.replay_memory.observations
        })
        v1 = self.sess.run(self.value_function.network.predict, {
            self.value_function.network.input_pl: self.replay_memory.new_observations
        })
        tds = self.replay_memory.rewards + self.gamma * v1 * (1 - self.replay_memory.done) - v
        gae = scipy.signal.lfilter([1.0], [1.0, -self.gamma * self.lam], tds[::-1])[::-1]
        vtarget = gae + v
        gae = (gae - gae.mean()) / (gae.std() + 1e-6)
        return gae, vtarget
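# --- Clarifying sketch (added; an assumed-equivalent reference, not from the source) ---
# The scipy.signal.lfilter call in gae() evaluates the backward recursion
#   A_t = delta_t + gamma * lam * A_{t+1}
# over the reversed TD residuals. The loop below computes the same quantity explicitly
# (and, like the lfilter version, does not reset the accumulator at episode boundaries
# within the buffer).
def gae_reference(tds, gamma, lam):
    import numpy as np
    advantages = np.zeros_like(tds, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(tds))):
        running = tds[t] + gamma * lam * running
        advantages[t] = running
    return advantages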
class DeepQ_agent:
    def __init__(self, env, hidden_units=None, network_LR=0.001, batch_size=64,
                 update_every=4, gamma=1.0):
        self.env = env
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.NETWORK_LR = network_LR
        self.MEMORY_CAPACITY = int(1e5)  # this is pythonic
        self.nA = env.ACTION_SPACE  # number of actions the agent can perform
        self.HIDDEN_UNITS = hidden_units
        self.UPDATE_EVERY = update_every

        # let's give it some brains
        self.qnetwork_local = QNetwork(input_shape=self.env.STATE_SPACE,
                                       hidden_units=self.HIDDEN_UNITS,
                                       output_size=self.nA,
                                       learning_rate=self.NETWORK_LR)
        print(self.qnetwork_local.model.summary())

        # I think of the target network as the PC:
        # where our agent stores all the concrete and important stuff
        self.qnetwork_target = QNetwork(input_shape=self.env.STATE_SPACE,
                                        hidden_units=self.HIDDEN_UNITS,
                                        output_size=self.nA,
                                        learning_rate=self.NETWORK_LR)

        # and the memory, of course
        self.memory = ReplayMemory(self.MEMORY_CAPACITY, self.BATCH_SIZE)

        # handy temp variable
        self.t = 0

    # ----------------------Learn from experience-----------------------------------#
    def learn(self):
        '''Sample a minibatch and run a Double DQN update.'''
        if len(self.memory) > self.BATCH_SIZE:
            states, actions, rewards, next_states, dones = self.memory.sample(self.env.STATE_SPACE)

            # calculating action-values using the local network
            target = self.qnetwork_local.predict(states, self.BATCH_SIZE)
            # future action-values using the target network
            target_val = self.qnetwork_target.predict(next_states, self.BATCH_SIZE)
            # future action-values using the local network
            target_next = self.qnetwork_local.predict(next_states, self.BATCH_SIZE)

            # The main point of Double DQN is that the action is selected by the local
            # network while the update is from the target network
            max_action_values = np.argmax(target_next, axis=1)  # action selection

            for i in range(self.BATCH_SIZE):
                if dones[i]:
                    target[i][actions[i]] = rewards[i]
                else:
                    target[i][actions[i]] = rewards[i] + self.GAMMA * target_val[i][max_action_values[i]]  # action evaluation

            self.qnetwork_local.train(states, target, batch_size=self.BATCH_SIZE)

            if self.t == self.UPDATE_EVERY:
                self.update_target_weights()
                self.t = 0
            else:
                self.t += 1

    # -----------------------Time to act-----------------------------------------------#
    def act(self, state, epsilon=0):  # set to NO exploration by default
        state = state.reshape((1,) + state.shape)
        action_values = self.qnetwork_local.predict(state)  # returns a vector of size self.nA
        if random.random() > epsilon:
            action = np.argmax(action_values)  # choose the best action - Exploitation
        else:
            action = random.randint(0, self.nA - 1)  # choose a random action - Exploration
        return action

    # -----------------------------Add experience to the agent's memory------------------------#
    def add_experience(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

    # ----------------------Update the target network weights----------------------------#
    def update_target_weights(self):
        # for now this is a hard update, but a soft update would also work
        self.qnetwork_target.model.set_weights(self.qnetwork_local.model.get_weights())

    # ---------------------helpful save function-------------------------------------#
    def save(self, model_num, directory):
        self.qnetwork_local.model.save(
            f'{directory}/snake_dqn_{model_num}_{time.asctime()}.h5')
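# --- Usage sketch (added for illustration, not part of the original source) ---
# A minimal training loop for DeepQ_agent. It assumes a custom snake-style environment
# exposing STATE_SPACE, ACTION_SPACE, reset() and a step() returning (next_state,
# reward, done), as implied by the class above; the exact step() signature and the
# epsilon schedule below are assumptions.
def example_deepq_training(env, num_episodes=500):
    agent = DeepQ_agent(env, hidden_units=(64, 64), network_LR=0.001,
                        batch_size=64, update_every=4, gamma=0.99)
    epsilon, epsilon_min, epsilon_decay = 1.0, 0.01, 0.995
    for episode in range(1, num_episodes + 1):
        state = env.reset()
        done = False
        while not done:
            action = agent.act(state, epsilon)
            next_state, reward, done = env.step(action)
            agent.add_experience(state, action, reward, next_state, done)
            agent.learn()  # trains once the replay memory holds more than one batch
            state = next_state
        epsilon = max(epsilon * epsilon_decay, epsilon_min)
        if episode % 100 == 0:
            agent.save(model_num=episode, directory='.')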