def test_observation_zeroing(self):
    """ Tests zeroing out of frames not from current episode """
    obs_shape = (84, 84, 1)
    er = ExperienceReplay(5, obs_shape)
    for terminal_idx in range(5):
        obs_ = []
        obs_next_ = []
        for i in range(1, 6):
            partial_obs = np.ones(obs_shape) * i
            terminal = 1 if i == terminal_idx else 0
            er.append(partial_obs, 0, 0, terminal)
            if i <= terminal_idx:
                partial_obs *= 0
            if i < 5:
                obs_.append(partial_obs)
            if i > 1:
                obs_next_.append(partial_obs)
        obs_ = np.transpose(np.array(obs_), (3, 1, 2, 0))
        obs_next_ = np.transpose(np.array(obs_next_), (3, 1, 2, 0))
        batch = er.sample(1)
        obs, rewards, actions, obs_next, terminals = batch
        assert np.array_equal(obs_, obs)
        assert np.array_equal(obs_next_, obs_next)
def test_sampling(self):
    """ Tests sampling of rewards, actions and terminals """
    obs_shape = (84, 84, 1)
    er = ExperienceReplay(5, obs_shape)
    for i in range(1, 6):
        partial_obs = np.ones(obs_shape) * i
        er.append(partial_obs, 1, 1, 0)
    batch = er.sample(1)
    _, rewards, actions, _, terminals = batch
    assert np.array_equal(rewards, np.array([1]))
    assert np.array_equal(actions, np.array([1]))
    assert np.array_equal(terminals, np.array([0]))
def test_observation_construction(self):
    """ Tests observation construction from partial observations """
    obs_shape = (84, 84, 1)
    er = ExperienceReplay(5, obs_shape)
    obs_ = []
    obs_next_ = []
    for i in range(1, 6):
        partial_obs = np.ones(obs_shape) * i
        if i < 5:
            obs_.append(partial_obs)
        if i > 1:
            obs_next_.append(partial_obs)
        er.append(partial_obs, 0, 0, 0)
    obs_ = np.transpose(np.array(obs_), (3, 1, 2, 0))
    obs_next_ = np.transpose(np.array(obs_next_), (3, 1, 2, 0))
    batch = er.sample(1)
    obs, rewards, actions, obs_next, terminals = batch
    assert np.array_equal(obs_, obs)
    assert np.array_equal(obs_next_, obs_next)
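# The three tests above assume a frame-stacking ExperienceReplay that stores single
# (84, 84, 1) frames and reconstructs 4-frame stacked observations on sampling,
# zeroing out frames that belong to an earlier, already-terminated episode.
# Below is a minimal sketch of such a buffer. All names are illustrative, and the
# exact boundary conventions (which frames get zeroed, which transition sample()
# returns) may differ from the implementation the tests exercise.
import numpy as np


class FrameStackReplay:
    def __init__(self, capacity, obs_shape, history_len=4):
        self.capacity = capacity
        self.history_len = history_len
        self.frames = np.zeros((capacity,) + obs_shape, dtype=np.float32)
        self.actions = np.zeros(capacity, dtype=np.int32)
        self.rewards = np.zeros(capacity, dtype=np.float32)
        self.terminals = np.zeros(capacity, dtype=np.int32)
        self.cursor = 0
        self.size = 0

    def append(self, frame, action, reward, terminal):
        # Store a single frame plus the transition data that produced it.
        self.frames[self.cursor] = frame
        self.actions[self.cursor] = action
        self.rewards[self.cursor] = reward
        self.terminals[self.cursor] = terminal
        self.cursor = (self.cursor + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def _stack(self, end_idx):
        # Build an (H, W, history_len) observation ending at end_idx, zeroing any
        # frame at or before a terminal that occurs earlier in the window.
        idxs = [(end_idx - k) % self.capacity for k in reversed(range(self.history_len))]
        frames = [self.frames[i].copy() for i in idxs]
        for k in range(self.history_len - 2, -1, -1):
            if self.terminals[idxs[k]]:
                for j in range(k + 1):
                    frames[j] = np.zeros_like(frames[j])
                break
        return np.concatenate(frames, axis=-1)

    def sample(self, batch_size):
        # Sample transitions whose full history and successor frame are stored
        # (wrap-around corner cases are ignored for brevity).
        idxs = np.random.randint(self.history_len - 1, self.size - 1, size=batch_size)
        obs = np.stack([self._stack(i) for i in idxs])
        obs_next = np.stack([self._stack(i + 1) for i in idxs])
        return obs, self.rewards[idxs + 1], self.actions[idxs + 1], obs_next, self.terminals[idxs + 1]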
class DQN:
    def __init__(self):
        self.batch_size = 64               # How many experiences to use for each training step
        self.train_frequency = 5           # How often you update the network
        self.num_epochs = 20               # How many epochs to train when updating the network
        self.y = 0.99                      # Discount factor
        self.prob_random_start = 0.6       # Starting chance of random action
        self.prob_random_end = 0.1         # Ending chance of random action
        self.annealing_steps = 1000.       # Steps of training to reduce from start_e -> end_e
        self.max_num_episodes = 10000      # Max number of episodes allowed to train the game
        self.min_pre_train_episodes = 100  # Number of episodes played with random actions before training starts
        self.max_num_step = 50             # Maximum allowed episode length
        self.goal = 15                     # Mean reward we want to achieve while playing a game

        # Set env
        self.env = gameEnv(partial=False, size=5)

        # Reset everything from the Keras session
        K.clear_session()

        # Setup our Q-networks
        self.main_qn = Qnetwork()
        self.target_qn = Qnetwork()

        # Setup our experience replay
        self.experience_replay = ExperienceReplay()

    def update_target_graph(self):
        updated_weights = np.array(self.main_qn.model.get_weights())
        self.target_qn.model.set_weights(updated_weights)

    def choose_action(self, state, prob_random, num_episode):
        if np.random.rand() < prob_random or \
                num_episode < self.min_pre_train_episodes:
            # Act randomly based on prob_random or if we
            # have not accumulated enough pre-train episodes
            action = np.random.randint(self.env.actions)
        else:
            # Decide what action to take from the Q network.
            # First add a batch dimension to match the network's expected input shape.
            state = np.expand_dims(state, axis=0)
            action = np.argmax(self.main_qn.model.predict(state))
        return action

    def run_one_episode(self, num_episode, prob_random):
        # Create an experience buffer for the current episode
        experiences_episode = []

        # Get the game state from the environment
        state = self.env.reset()

        done = False  # Game is complete
        cur_step = 0  # Running count of steps taken in the episode

        while cur_step < self.max_num_step and not done:
            cur_step += 1
            action = self.choose_action(state=state,
                                        prob_random=prob_random,
                                        num_episode=num_episode)

            # Take the action and retrieve the next state, reward and done
            next_state, reward, done = self.env.step(action)

            # Setup the experience to be stored in the episode buffer
            experience = [state, action, reward, next_state, done]

            # Store the experience in the episode buffer
            experiences_episode.append(experience)

            # Update the state
            state = next_state

        return experiences_episode

    def generate_target_q(self, train_state, train_action, train_reward,
                          train_next_state, train_done):
        # Our predictions (actions to take) from the main Q network
        target_q = self.main_qn.model.predict(train_state)

        # Tells us whether the game is over or not.
        # We will multiply our rewards by this value
        # to ensure we don't train on the last move.
        train_gameover = train_done == 0

        # Q value of the next state based on action
        target_q_next_state = self.target_qn.model.predict(train_next_state)
        train_next_state_values = np.max(target_q_next_state[range(self.batch_size)], axis=1)

        # Reward from the action chosen in the train batch
        actual_reward = train_reward + (self.y * train_next_state_values * train_gameover)
        target_q[range(self.batch_size), train_action] = actual_reward
        return target_q

    def train_one_step(self):
        # Train batch is [[state, action, reward, next_state, done], ...]
        train_batch = self.experience_replay.sample(self.batch_size)

        # Separate the batch into a numpy array for each component
        train_state = np.array([x[0] for x in train_batch])
        train_action = np.array([x[1] for x in train_batch])
        train_reward = np.array([x[2] for x in train_batch])
        train_next_state = np.array([x[3] for x in train_batch])
        train_done = np.array([x[4] for x in train_batch])

        # Generate target Q
        target_q = self.generate_target_q(train_state=train_state,
                                          train_action=train_action,
                                          train_reward=train_reward,
                                          train_next_state=train_next_state,
                                          train_done=train_done)

        # Train the main model
        loss = self.main_qn.model.train_on_batch(train_state, target_q)
        return loss

    def train(self):
        # Make the networks equal
        self.update_target_graph()

        # We'll begin by acting completely randomly. As we gain experience and improve,
        # we will reduce the probability of acting randomly and instead
        # take the actions that our Q network suggests.
        prob_random = self.prob_random_start
        prob_random_drop = (self.prob_random_start - self.prob_random_end) / self.annealing_steps

        # Init variables
        num_steps = []    # Tracks number of steps per episode
        rewards = []      # Tracks rewards per episode
        print_every = 50  # How often to print status
        losses = [0]      # Tracks training losses
        num_episode = 0

        while True:
            # Run one episode
            experiences_episode = self.run_one_episode(num_episode, prob_random)

            # Save the episode in the replay buffer
            self.experience_replay.add(experiences_episode)

            # Once we have played enough episodes, start training
            if num_episode > self.min_pre_train_episodes:
                # Drop the probability of a random action until we reach prob_random_end
                if prob_random > self.prob_random_end:
                    prob_random -= prob_random_drop

                # Every train_frequency episodes, train the model
                if num_episode % self.train_frequency == 0:
                    for num_epoch in range(self.num_epochs):
                        loss = self.train_one_step()
                        losses.append(loss)

                    # Update the target model with values from the main model
                    self.update_target_graph()

            # Increment the episode
            num_episode += 1
            num_steps.append(len(experiences_episode))
            rewards.append(sum([e[2] for e in experiences_episode]))

            # Print info
            if num_episode % print_every == 0:
                # datetime object containing current date and time
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print("{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}".format(
                    dt_string, num_episode, np.mean(rewards[-print_every:]), prob_random, mean_loss))

            # Stop condition
            if np.mean(rewards[-print_every:]) >= self.goal:
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print("{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}".format(
                    dt_string, num_episode, np.mean(rewards[-print_every:]), prob_random, mean_loss))
                print("Training complete because we reached the goal reward.")
                break
            if num_episode > self.max_num_episodes:
                print("Training stopped because we reached the max number of episodes.")
                break
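# The class above only calls ExperienceReplay() with add(list_of_transitions) and
# sample(batch_size) returning a flat list of [state, action, reward, next_state, done]
# entries. A minimal sketch of such a buffer is given below, assuming simple uniform
# sampling from a deque; the buffer size is illustrative, not taken from the original.
from collections import deque
import random


class ExperienceReplay:
    def __init__(self, buffer_size=20000):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, experiences):
        # `experiences` is the list of [state, action, reward, next_state, done]
        # transitions collected during one episode.
        self.buffer.extend(experiences)

    def sample(self, batch_size):
        # Uniformly sample individual transitions across all stored episodes.
        return random.sample(self.buffer, batch_size)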
class DQN:
    """ Implementation of the deep Q-learning algorithm """

    def __init__(self):
        self.prob_random = 1.0         # Probability of playing a random action
        self.y = .99                   # Discount factor
        self.batch_size = 64           # How many experiences to use for each training step
        self.prob_random_end = .01     # Ending chance of random action
        self.prob_random_decay = .996  # Multiplicative decay of prob_random
        self.max_episode = 300         # Max number of episodes allowed to train the game
        self.expected_goal = 200       # Expected goal

        self.dnn = DNN()
        self.env = gym.make('CartPole-v0')
        self.memory = ExperienceReplay(buffer_size=10000)
        self.metadata = []  # we will store episode info (score, etc.) here

    def choose_action(self, state, prob_random):
        if np.random.rand() <= prob_random:
            action = np.random.randint(self.env.action_space.n)
        else:
            action = np.argmax(self.dnn.model.predict(state))
        return action

    def run_one_step(self, state):
        action = self.choose_action(state, self.prob_random)
        next_state, reward, done, _ = self.env.step(action)
        next_state = np.expand_dims(next_state, axis=0)
        return state, action, reward, next_state, done

    def generate_target_q(self, train_state, train_action, train_reward,
                          train_next_state, train_done):
        # Our predictions (actions to take) from the main Q network
        target_q = self.dnn.model.predict(train_state)

        # Tells us whether the game is over or not.
        # We will multiply our rewards by this value
        # to ensure we don't train on the last move.
        train_gameover = train_done == 0

        # Q value of the next state based on action
        target_q_next_state = self.dnn.model.predict(train_next_state)
        train_next_state_values = np.max(target_q_next_state[range(self.batch_size)], axis=1)

        # Reward from the action chosen in the train batch
        actual_reward = train_reward + (self.y * train_next_state_values * train_gameover)
        target_q[range(self.batch_size), train_action] = actual_reward
        return target_q

    def train_one_step(self):
        batch_data = self.memory.sample(self.batch_size)
        train_state = np.array([i[0] for i in batch_data])
        train_action = np.array([i[1] for i in batch_data])
        train_reward = np.array([i[2] for i in batch_data])
        train_next_state = np.array([i[3] for i in batch_data])
        train_done = np.array([i[4] for i in batch_data])

        # These lines remove the useless extra dimension of the matrices
        train_state = np.squeeze(train_state)
        train_next_state = np.squeeze(train_next_state)

        # Generate target Q
        target_q = self.generate_target_q(train_state=train_state,
                                          train_action=train_action,
                                          train_reward=train_reward,
                                          train_next_state=train_next_state,
                                          train_done=train_done)

        loss = self.dnn.model.train_on_batch(train_state, target_q)
        return loss

    def train(self):
        scores = []
        for e in range(self.max_episode):
            # Init new episode
            state = self.env.reset()
            state = np.expand_dims(state, axis=0)
            episode_score = 0

            while True:
                state, action, reward, next_state, done = self.run_one_step(state)
                self.memory.add(experiences=[[state, action, reward, next_state, done]])
                episode_score += reward
                state = next_state

                if len(self.memory.buffer) > self.batch_size:
                    self.train_one_step()
                    if self.prob_random > self.prob_random_end:
                        self.prob_random *= self.prob_random_decay

                if done:
                    now = datetime.now()
                    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                    self.metadata.append([now, e, episode_score, self.prob_random])
                    print("{} - episode: {}/{}, score: {:.1f} - prob_random {:.3f}".format(
                        dt_string, e, self.max_episode, episode_score, self.prob_random))
                    break

            scores.append(episode_score)

            # Average score over the last 10 episodes
            means_last_10_scores = np.mean(scores[-10:])
            if means_last_10_scores >= self.expected_goal:
                print('\n Task Completed! \n')
                break
            print("Average over last 10 episodes: {0:.2f} \n".format(means_last_10_scores))

        print("Maximum number of episodes played: %d" % self.max_episode)
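# The CartPole agent above only relies on self.dnn.model exposing predict() and
# train_on_batch(). A minimal sketch of such a DNN wrapper is shown below; the layer
# sizes, optimizer and learning rate are assumptions, not taken from the original snippet.
from tensorflow.keras import layers, models, optimizers


class DNN:
    def __init__(self, state_size=4, action_size=2, learning_rate=1e-3):
        # Small MLP mapping a CartPole state to one Q-value per action.
        self.model = models.Sequential([
            layers.Dense(24, activation="relu", input_shape=(state_size,)),
            layers.Dense(24, activation="relu"),
            layers.Dense(action_size, activation="linear"),
        ])
        self.model.compile(loss="mse", optimizer=optimizers.Adam(learning_rate))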
logger.log("stats/max_tile", np.max(g.state), i)
logger.log("stats/best_score", best_score, i)
logger.log("settings/epsilon", epsilon, i)
logger.log("settings/num_random_moves", num_random_moves, i)
logger.log("settings/perc_random_moves", num_random_moves / num_moves, i)
logger.log("settings/experience", len(replay), i)

reward = 0
replay.add((state, action_onehot, reward, np.zeros(state.shape)))

if i > OBSERVE:
    batch = replay.sample(batch_size=32)
    states = []
    actions = []
    rewards = []
    next_states = []
    for e, b in enumerate(batch):
        states.append(b[0])
        actions.append(b[1])
        rewards.append(b[2])
        next_states.append(b[3])
    states = np.array(states)
    actions = np.array(actions)
    rewards = np.array(rewards)
    next_states = np.array(next_states)
# Interleave planning and learning steps
print("\nInterleaving planning and learning steps.", flush=True)
actor.reset()
steps_cnt = 0
episode_steps = 0
episodes_cnt = 0
while episodes_cnt < n_episodes:
    r, episode_done = planning_step(actor=actor,
                                    planner=planner,
                                    dataset=experience_replay,
                                    policy_fn=network_policy,
                                    tree_budget=tree_budget,
                                    cache_subtree=cache_subtree,
                                    discount_factor=discount_factor)

    # Learning step
    batch = experience_replay.sample(batch_size)
    loss, _ = learner.train_step(tf.constant(batch["observations"], dtype=tf.float32),
                                 tf.constant(batch["target_policy"], dtype=tf.float32))
    steps_cnt += 1
    episode_steps += 1

    print("\n".join(" ".join(row) for row in env.unwrapped.get_char_matrix(actor.tree.root.data["s"])),
          "Reward:", r,
          "Simulator steps:", actor.nodes_generated,
          "Planning steps:", steps_cnt,
          "Loss:", loss.numpy(), "\n")

    if episode_done:
        print("Problem solved in %i steps (min 13 steps)." % episode_steps)
        actor.reset()
        episodes_cnt += 1
        episode_steps = 0
        if episodes_cnt < n_episodes:
            print("\n------- New episode -------")
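# The loop above distills the planner's target policy into the policy network via
# learner.train_step(observations, target_policy). Below is a minimal sketch of what
# such a train step could look like (cross-entropy between the network policy and the
# tree's target policy); the Learner class and its optimizer settings are assumptions,
# not taken from the original snippet.
import tensorflow as tf


class Learner:
    def __init__(self, model, learning_rate=1e-4):
        self.model = model
        self.optimizer = tf.keras.optimizers.Adam(learning_rate)

    @tf.function
    def train_step(self, observations, target_policy):
        with tf.GradientTape() as tape:
            logits = self.model(observations)
            # Cross-entropy between the tree's target policy and the network policy
            loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=target_policy, logits=logits))
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss, logits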
# Get the next state after taking the action
newstate, reward, done, _ = env.step(action)
if newstate == []:
    print("Terminate")
    break
replay_ep.add(np.reshape(np.array([state, action, reward, done, newstate]), [1, 5]))

# Train
if config.total_step > config.args.num_pretrain_step:
    if epsilon > config.args.end_epsilon:
        epsilon -= epsilon_decay
    if config.total_step % config.args.online_update_freq == 0:
        train_batch = replay.sample(config.args.batch_size)
        loss = qnet.learn_on_minibatch(train_batch, config.args.gamma)
        sys.stdout.write("\rTrain step at {}th step | loss {} | epsilon {}".format(
            config.total_step, loss, epsilon))
        sys.stdout.flush()
    if config.total_step % config.args.target_update_freq == 0:
        # print("Update target net")
        qnet.update_target_model(config.args.tau)

config.total_step += 1
total_reward += reward
state = newstate
if done:
class Agent:
    def __init__(self, s_size, a_size, seed):
        """
        Parameters:
            s_size (int): dimension of each state
            a_size (int): dimension of each action
            seed (int): random seed
        """
        self.s_size = s_size
        self.a_size = a_size
        self.seed = random.seed(seed)

        # Initialize both Q-networks
        self.local_dqn = Model(s_size, a_size, seed).to(device)
        self.target_dqn = Model(s_size, a_size, seed).to(device)
        self.optimizer = optim.Adam(self.local_dqn.parameters(), lr=c.LEARNING_RATE)

        # Initialize experience replay buffer
        self.buffer = ExperienceReplay(a_size, c.REPLAY_BUFFER_SIZE, c.BATCH_SIZE, seed)

        # Time step counter used for updating as per UPDATE_FREQUENCY
        self.t_step = 0

    def step(self, s, a, r, s_next, done, transfer_method):
        # Add experience to the buffer
        self.buffer.add(s, a, r, s_next, done)

        # Learn every UPDATE_FREQUENCY time steps
        self.t_step = (self.t_step + 1) % c.UPDATE_FREQUENCY
        if self.t_step == 0:
            # Get random experiences to learn from
            if len(self.buffer) > c.BATCH_SIZE:
                es = self.buffer.sample()
                self.learn(es, transfer_method, c.GAMMA)

    def act(self, state, transfer_method, eps=0.):
        """Returns actions for the given state as per the current policy.

        Parameters:
            state (array_like): current state
            transfer_method (int): 0 if pre-trained weights are to be used, another int otherwise
            eps (float): epsilon, for exploration
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.local_dqn.eval()
        with torch.no_grad():
            a_values = self.local_dqn(state, transfer_method)
        self.local_dqn.train()

        # Epsilon-greedy selection: pick the greedy action with probability 1 - eps,
        # otherwise pick a random action
        if random.random() > eps:
            return np.argmax(a_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.a_size))

    def learn(self, es, transfer_method, gamma):
        """Update parameters based on experiences.

        Parameters:
            es (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tensors
            gamma (float): discount factor
        """
        s_, a_, r_, s_next_, d_ = es

        # Max predicted Q-values for the next states (from the target network)
        target_Q_next = self.target_dqn(s_next_, transfer_method).detach().max(1)[0].unsqueeze(1)

        # Target Q-values
        target_Q = r_ + (gamma * target_Q_next * (1 - d_))

        # Expected Q-values (from the local network)
        expected_Q = self.local_dqn(s_, transfer_method).gather(1, a_)

        loss = F.mse_loss(expected_Q, target_Q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        update(self.local_dqn, self.target_dqn, c.TAU)
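# The Agent above finishes learn() with a soft update of the target network via
# update(local, target, TAU). A minimal sketch of that Polyak-style helper is shown
# below, assuming c.TAU is the usual interpolation factor; the original helper is not
# part of this snippet.
def update(local_model, target_model, tau):
    """Soft update: theta_target = tau * theta_local + (1 - tau) * theta_target."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)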
class DQN(object):
    """ OOP for a Deep Q-Network (DQN). """

    def __init__(self, game, memory_size=100000, batch_size=1, epsilon_init=1.0,
                 alpha_init=.00025, anneal_alpha=True, anneal_epsilon=True,
                 batch_size_incr=0):
        self.memories = ExperienceReplay(memory_size)
        self.nnet = LeNet(game.state_shape, dim_out=game.n_actions, batch_size=1,
                          fc_dim=500, nkerns=[16, 32], filter_dims=[2, 2],
                          out_type='linear')
        self.trainer = single_batch_trainer(self.nnet)
        self.game = game
        self.n_episodes = 0
        self.avg_rewards = []
        self.avg_action_vals = []
        self.alpha = alpha_init
        self.epsilon = epsilon_init
        self.anneal_ep = anneal_epsilon
        self.anneal_lr = anneal_alpha
        self.batch_size = batch_size
        self.batch_size_incr = batch_size_incr
        self._pct_invalids = []
        self._costs = []

    def train(self, n_episodes=3, max_iter=500):
        g = self.game
        g.reset()

        # Set anneal rates for epsilon and the learning rate.
        ep_anneal_rate = 0
        if self.anneal_ep:
            ep_anneal_rate = float(self.epsilon) / n_episodes
        alpha_anneal_rate = 0
        if self.anneal_lr:
            alpha_anneal_rate = float(self.alpha) / n_episodes

        for e_idx in range(n_episodes):
            s = g.get_state()
            print "Episode: %d, Exploration Rate: %f, Learning Rate: %f" % (e_idx, self.epsilon, self.alpha)

            while not g.is_terminal() and not self.game._num_moves >= max_iter and not self.game.iter_ctr >= 200:
                # Epsilon-greedy action selection
                if np.random.binomial(1, self.epsilon):
                    a_idx = np.random.randint(self.game.n_actions)
                else:
                    values = self.nnet.outputter(s.reshape(self.nnet.image_shape))
                    a_idx = np.argmax(values[0])

                r = g.take_action(a_idx)
                stp1 = g.get_state()

                # Reshape states into the shape expected by the convnet.
                self.memories.insert(Memory(
                    s.transpose(2, 0, 1).reshape(self.nnet.image_shape),
                    a_idx,
                    r,
                    stp1.transpose(2, 0, 1).reshape(self.nnet.image_shape)
                ))
                s = stp1

                # TEST CLOOJ
                if self.game.iter_ctr % 200 == 0:
                    print "move_n: %d, action: %d, reward: %f, status: %d" % (
                        self.game.iter_ctr, a_idx, r, self.game._STATUS)

                # Minibatch update.
                if e_idx > 0:
                    costs = []  # local for this iteration
                    # Random (state, action, reward, next_state) sample from memory replay.
                    data = self.memories.sample(self.batch_size)
                    # Convert each sampled memory into a (training data, label) pair.
                    data = [m.target_pair(self.nnet) for m in data]
                    for i in range(self.batch_size):
                        d = data[i]
                        costs.append(self.trainer(d[0], d[1], self.alpha))  # call trainer func
                    self._costs.append(np.mean(costs))

            # print "Game %d ends in %d iterations with status %d, reward %d." % (e_idx, self.game.iter_ctr, self.game._STATUS, r)

            # Compute percent of invalid actions.
            n_moves = g.iter_ctr
            rs = g.episode_rewards
            n_invalid = len(np.where(rs == np.array([-.02 for _ in range(len(rs))]))[0])
            pct_invalid = float(n_invalid) / n_moves
            self._pct_invalids.append(pct_invalid)
            print "Pct Invalid: %f" % pct_invalid

            g.reset()
            self.epsilon -= ep_anneal_rate
            self.batch_size += self.batch_size_incr
            if e_idx > 0:
                self.alpha -= alpha_anneal_rate
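# The minibatch update above relies on Memory.target_pair(nnet) to turn a stored
# transition into an (input, target) pair for the single-sample trainer. A rough
# sketch of that idea is given below; this Memory class is hypothetical (the stored
# transitions above carry no terminal flag and no discount, so gamma is an assumed
# default), and the real implementation may differ.
class Memory(object):
    def __init__(self, s, a, r, s_next, gamma=0.99):
        self.s, self.a, self.r, self.s_next, self.gamma = s, a, r, s_next, gamma

    def target_pair(self, nnet):
        # Q-learning target: copy the current Q-values and overwrite the entry
        # for the taken action with r + gamma * max_a' Q(s', a').
        target = nnet.outputter(self.s)[0].copy()
        next_q = nnet.outputter(self.s_next)[0]
        target[self.a] = self.r + self.gamma * next_q.max()
        return self.s, target.reshape(1, -1)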
class DQN:
    def __init__(self):
        self.batch_size = 64               # How many experiences to use for each training step
        self.train_frequency = 5           # How often you update the network
        self.num_epochs = 20               # How many epochs to train when updating the network
        self.y = 0.99                      # Discount factor
        self.prob_random_start = 0.6       # Starting chance of random action
        self.prob_random_end = 0.1         # Ending chance of random action
        self.annealing_steps = 1000.       # Steps of training to reduce from start_e -> end_e
        self.max_num_episodes = 10000      # Max number of episodes allowed to train the game
        self.min_pre_train_episodes = 100  # Number of episodes played with random actions before training starts
        self.max_num_step = 50             # Maximum allowed episode length
        self.goal = 15                     # Mean reward we want to achieve while playing a game

        # Set env
        self.env = gameEnv(partial=False, size=5)

        # Reset everything from the Keras session
        K.clear_session()

        # Setup our Q-networks
        self.main_qn = Qnetwork()
        self.target_qn = Qnetwork()

        # Setup our experience replay
        self.experience_replay = ExperienceReplay()

    def update_target_graph(self):
        # TODO
        return

    def choose_action(self, state, prob_random, num_episode):
        # TODO
        return action

    def run_one_episode(self, num_episode, prob_random):
        # TODO
        return experiences_episode

    def generate_target_q(self, train_state, train_action, train_reward, train_next_state, train_done):
        # TODO
        return target_q

    def train_one_step(self):
        # Train batch is [[state, action, reward, next_state, done], ...]
        train_batch = self.experience_replay.sample(self.batch_size)

        # Separate the batch into a numpy array for each component
        train_state = np.array([x[0] for x in train_batch])
        train_action = np.array([x[1] for x in train_batch])
        train_reward = np.array([x[2] for x in train_batch])
        train_next_state = np.array([x[3] for x in train_batch])
        train_done = np.array([x[4] for x in train_batch])

        # Generate target Q
        target_q = self.generate_target_q(
            train_state=train_state,
            train_action=train_action,
            train_reward=train_reward,
            train_next_state=train_next_state,
            train_done=train_done
        )

        # Train the main model
        loss = self.main_qn.model.train_on_batch(train_state, target_q)
        return loss

    def train(self):
        # Make the networks equal
        self.update_target_graph()

        # We'll begin by acting completely randomly. As we gain experience and improve,
        # we will reduce the probability of acting randomly and instead
        # take the actions that our Q network suggests.
        prob_random = self.prob_random_start
        prob_random_drop = (self.prob_random_start - self.prob_random_end) / self.annealing_steps

        # Init variables
        num_steps = []    # Tracks number of steps per episode
        rewards = []      # Tracks rewards per episode
        print_every = 50  # How often to print status
        losses = [0]      # Tracks training losses
        num_episode = 0

        while True:
            # Run one episode
            experiences_episode = self.run_one_episode(num_episode, prob_random)

            # Save the episode in the replay buffer
            self.experience_replay.add(experiences_episode)

            # Once we have played enough episodes, start training
            if num_episode > self.min_pre_train_episodes:
                # Drop the probability of a random action until we reach prob_random_end
                if prob_random > self.prob_random_end:
                    prob_random -= prob_random_drop

                # Every train_frequency episodes, train the model
                if num_episode % self.train_frequency == 0:
                    for num_epoch in range(self.num_epochs):
                        loss = self.train_one_step()
                        losses.append(loss)

                    # Update the target model with values from the main model
                    self.update_target_graph()

            # Increment the episode
            num_episode += 1
            num_steps.append(len(experiences_episode))
            rewards.append(sum([e[2] for e in experiences_episode]))

            # Print info
            if num_episode % print_every == 0:
                # datetime object containing current date and time
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print("{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}".format(
                    dt_string, num_episode, np.mean(rewards[-print_every:]), prob_random, mean_loss))

            # Stop condition
            if np.mean(rewards[-print_every:]) >= self.goal:
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print("{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}".format(
                    dt_string, num_episode, np.mean(rewards[-print_every:]), prob_random, mean_loss))
                print("Training complete because we reached goal rewards.")
                break
            if num_episode > self.max_num_episodes:
                print("Training stopped because we reached the max number of episodes.")
                break
def main(_):
    # Reproducibility
    tf.reset_default_graph()
    np.random.seed(cfg.random_seed)
    tf.set_random_seed(cfg.random_seed)

    # Logging
    summary_writer = tf.summary.FileWriter(cfg.log_dir)
    if not cfg.evaluate and not tf.gfile.Exists(cfg.save_dir):
        tf.gfile.MakeDirs(cfg.save_dir)
    else:
        assert tf.gfile.Exists(cfg.save_dir)  # TODO handle this

    episode_results_path = os.path.join(cfg.log_dir, "episodeResults.csv")
    episode_results = tf.gfile.GFile(episode_results_path, "w")
    episode_results.write("model_freq={},save_dir={}\n".format(cfg.model_freq, cfg.save_dir))
    episode_results.write("episode,reward,steps\n")
    episode_results.flush()

    # Setup ALE and DQN graph
    obs_shape = (84, 84, 1)
    input_height, input_width, _ = obs_shape
    dqn = DQN(input_height, input_width, cfg.num_actions)

    # Global step
    global_step = tf.train.get_or_create_global_step()
    increment_step = tf.assign_add(global_step, 1)

    # Save all variables
    vars_to_save = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="agent/q")
    vars_to_save.append(global_step)
    saver = tf.train.Saver(var_list=vars_to_save)

    # Handle loading specific variables
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess = tf.Session(config=sess_config)
    restore_or_initialize_weights(sess, dqn, saver)
    sess.run(dqn.copy_to_target)

    if cfg.evaluate:
        # In evaluation mode the saver is no longer needed
        saver = None

    # ##### Restoring AEs ########
    if not cfg.evaluate:
        vaes = create_generative_models(sess)
        image_summaries = []
        image_summaries_ph = tf.placeholder(tf.float32, shape=(4, 84, 84, 4),
                                            name="image_summaries_placeholder")
        for i in range(4):
            for j in range(4):
                image_summaries.append(
                    tf.summary.image("VAE_OUT_{}_{}".format(i, j),
                                     tf.reshape(image_summaries_ph[i, :, :, j], (1, 84, 84, 1))))
    # ############################

    if not cfg.evaluate:
        summary_writer.add_graph(tf.get_default_graph())
        summary_writer.add_graph(vaes[0].graph)
        summary_writer.add_graph(vaes[1].graph)
        summary_writer.add_graph(vaes[2].graph)
        summary_writer.flush()

    # Initialize ALE
    postprocess_frame = lambda frame: sess.run(dqn.process_frame, feed_dict={dqn.image: frame})
    env = AtariEnvironment(obs_shape, postprocess_frame)

    # Replay buffer
    if not cfg.evaluate:
        replay_buffer = ExperienceReplay(cfg.replay_buffer_size, obs_shape)

    # Perform a random policy to get some training data
    with tqdm(total=cfg.seed_frames, disable=cfg.disable_progress or cfg.evaluate) as pbar:
        seed_steps = 0
        while seed_steps * cfg.frame_skip < cfg.seed_frames and not cfg.evaluate:
            action = np.random.randint(cfg.num_actions)
            reward, next_state, terminal = env.act(action)
            seed_steps += 1

            replay_buffer.append(next_state[:, :, -1, np.newaxis], action, reward, terminal)

            if terminal:
                pbar.update(env.episode_frames)
                env.reset(inc_episode_count=False)

    if cfg.evaluate:
        assert cfg.max_episode_count > 0
    else:
        assert len(replay_buffer) >= cfg.seed_frames // cfg.frame_skip

    # Main training loop
    steps = tf.train.global_step(sess, global_step)
    env.reset(inc_episode_count=False)
    terminal = False

    total = cfg.max_episode_count if cfg.evaluate else cfg.num_frames
    with tqdm(total=total, disable=cfg.disable_progress) as pbar:
        # Loop while we haven't observed our max frame number.
        # If we are at our max frame number we will finish the current episode.
        while (not (
                # We must be evaluating or have observed the last frame,
                # as well as be terminal,
                # as well as have seen the maximum episode number
                (steps * cfg.frame_skip > cfg.num_frames or cfg.evaluate)
                and terminal
                and env.episode_count >= cfg.max_episode_count)):

            # Epsilon-greedy policy with epsilon annealing
            if not cfg.evaluate and steps * cfg.frame_skip < cfg.eps_anneal_over:
                # Only compute the epsilon step while we're still annealing epsilon
                epsilon = cfg.eps_initial - steps * ((cfg.eps_initial - cfg.eps_final) / cfg.eps_anneal_over)
            else:
                epsilon = cfg.eps_final

            # Epsilon-greedy action selection
            if np.random.uniform() < epsilon:
                action = np.random.randint(0, cfg.num_actions)
            else:
                action = sess.run(dqn.action, feed_dict={dqn.S: [env.state]})

            # Perform environment step
            steps = sess.run(increment_step)
            reward, next_state, terminal = env.act(action)

            if not cfg.evaluate:
                replay_buffer.append(next_state[:, :, -1, np.newaxis], action, reward, terminal)

                # Sample and do gradient updates
                if steps % cfg.learning_freq == 0:
                    placeholders = [
                        dqn.S,
                        dqn.actions,
                        dqn.rewards,
                        dqn.S_p,
                        dqn.terminals,
                    ]
                    batch = replay_buffer.sample(cfg.batch_size)
                    train_op = [dqn.train]
                    if steps % (cfg.learning_freq * cfg.model_freq) == 0:
                        experience_batch = batch
                        batch = imagined_batch(vaes, batch[1])
                        if steps / (cfg.learning_freq * cfg.model_freq) < 10:
                            placeholders.append(image_summaries_ph)
                            batch = list(batch)
                            batch.append(batch[0][np.random.randint(0, 32, size=4), :, :, :])
                            train_op.extend(image_summaries)
                    if steps % cfg.log_summary_every == 0:
                        train_op.append(dqn.summary)
                    result = sess.run(
                        train_op,
                        feed_dict=dict(zip(placeholders, batch)),
                    )
                    if len(result) > 1:
                        for i in range(1, len(result)):
                            summary_writer.add_summary(result[i], global_step=steps)

                if steps % cfg.target_update_every == 0:
                    sess.run([dqn.copy_to_target])

                if steps % cfg.model_chkpt_every == 0:
                    saver.save(sess, "%s/model_epoch_%04d" % (cfg.save_dir, steps))

            if terminal:
                episode_results.write("%d,%d,%d\n" % (env.episode_count, env.episode_reward, env.episode_frames))
                episode_results.flush()

                # Log episode summaries to TensorBoard
                add_simple_summary(summary_writer, "episode/reward", env.episode_reward, env.episode_count)
                add_simple_summary(summary_writer, "episode/frames", env.episode_frames, env.episode_count)

                pbar.update(env.episode_frames if not cfg.evaluate else 1)
                env.reset()

    episode_results.close()
    tf.logging.info("Finished %d %s" % (
        cfg.max_episode_count if cfg.evaluate else cfg.num_frames,
        "episodes" if cfg.evaluate else "frames",
    ))
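# add_simple_summary() above is a small helper for writing scalar episode stats to
# TensorBoard. A minimal TF1-style sketch is shown below; the original helper is not
# included in this snippet.
def add_simple_summary(summary_writer, tag, value, step):
    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
    summary_writer.add_summary(summary, global_step=step)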
class DQN_agent():
    def __init__(self):
        self.eps = 0.1
        self.env = GridEnv(3)
        self.batch_size = 20

        if prioritized_replay and replay_type == "proportional":
            self.replay = ProportionalReplay(max_buffer_size, prioritized_replay_alpha)
        elif prioritized_replay and replay_type == "ranked":
            N_list = [self.batch_size] + [int(x) for x in np.linspace(100, max_buffer_size, 5)]
            save_quantiles(N_list=N_list, k=self.batch_size, alpha=prioritized_replay_alpha)
            self.replay = RankBasedReplay(max_buffer_size, prioritized_replay_alpha)
        else:
            self.replay = ExperienceReplay(max_buffer_size)  # passing size of buffer

        # Define graph
        self.inputs = tf.placeholder(tf.float32, shape=(None, self.env.state_size))
        self.target_values = tf.placeholder(tf.float32, shape=(None,))
        self.actions = tf.placeholder(tf.int32, shape=(None,))
        # Importance sampling weights for prioritized replay
        self.is_weights = tf.placeholder(tf.float32, shape=(None,))

        self.Q_out_op, self.Q_update_op, self.td_error_op = self.build_graph()  # build main network
        self.target_Q_out_op, _, _ = self.build_graph('target')  # build identical target network

        self.init_op = tf.global_variables_initializer()
        self.sess = tf.Session()

    def build_graph(self, scope='main'):
        with tf.variable_scope(scope):
            h = tf.layers.dense(self.inputs, 16, activation=tf.nn.relu, name="h")
            outputs = tf.layers.dense(h, self.env.num_actions, activation=tf.nn.softmax, name="outputs")

            # Everything below has the same shape (batch_size, num_actions);
            # the error is nonzero only for the selected actions.
            action_mask = tf.one_hot(self.actions, self.env.num_actions, on_value=True, off_value=False)
            targets = tf.tile(tf.expand_dims(self.target_values, 1), [1, self.env.num_actions])
            # Takes the target value where the mask is true,
            # and the network output otherwise.
            target_outputs = tf.where(action_mask, targets, outputs)
            td_error = target_outputs - outputs  # only one element in each row is non-zero
            # Importance sampling weights (all 1s when not using prioritized replay)
            weights = tf.tile(tf.expand_dims(self.is_weights, 1), [1, self.env.num_actions])
            weighted_td_error = weights * td_error  # element-wise multiplication
            loss = tf.reduce_sum(tf.square(weighted_td_error))
            update = tf.train.AdamOptimizer().minimize(loss)
        return outputs, update, td_error

    def train(self):
        steps_per_ep = np.zeros(episodes)
        for episode in range(episodes):
            print(episode)
            self.env.reset()
            state = self.env.state
            done = False
            num_steps = 0
            while not done:
                num_steps += 1
                action = self.get_eps_action(state, self.eps)
                next_state, reward, done, _ = self.env.step(action)
                self.replay.add((state, action, reward, next_state, done))  # store in experience replay

                # Sample from experience replay and do a Q-learning update
                if prioritized_replay:
                    # Linear annealing schedule for the importance sampling weights
                    beta = beta0 + episode * (1 - beta0) / episodes
                    states, actions, rewards, next_states, dones, weights, indices = \
                        self.replay.sample(self.batch_size, beta)
                    self.net_update(states, actions, rewards, next_states, dones, weights, indices)
                else:
                    states, actions, rewards, next_states, dones = self.replay.sample(self.batch_size)
                    self.net_update(states, actions, rewards, next_states, dones)

                # Slowly update the target network
                if num_steps % update_every == 0:
                    self.target_net_update()

                # Sort the max heap periodically
                if num_steps % sort_every == 0:
                    if prioritized_replay and replay_type == "ranked":
                        self.replay.sort()

                state = next_state

            steps_per_ep[episode] = num_steps
        return steps_per_ep

    # From https://tomaxent.com/2017/07/09/Using-Tensorflow-and-Deep-Q-Network-Double-DQN-to-Play-Breakout/
    def target_net_update(self):
        # Get sorted lists of parameters in each of the networks
        main_params = [t for t in tf.trainable_variables() if t.name.startswith("main")]
        main_params = sorted(main_params, key=lambda v: v.name)
        target_params = [t for t in tf.trainable_variables() if t.name.startswith("target")]
        target_params = sorted(target_params, key=lambda v: v.name)

        update_ops = []
        for main_v, target_v in zip(main_params, target_params):
            op = target_v.assign(main_v)
            update_ops.append(op)
        self.sess.run(update_ops)

    # Minibatch Q-learning
    def net_update(self, states, actions, rewards, next_states, dones, weights=None, indices=None):
        not_dones = np.logical_not(dones)

        # Create a shape (batch_size,) array of target values
        target_values = rewards.astype(float)           # np.array of shape (batch_size,)
        next_inputs = next_states[not_dones]             # np.array of shape (#not done, state_size)
        next_Qs = self.sess.run(self.Q_out_op, {self.inputs: next_inputs})  # shape (#not done, num_actions)
        max_Qs = np.max(next_Qs, axis=1)                 # np.array of shape (#not done,)
        target_values[not_dones] += gamma * max_Qs

        # If not using prioritized replay, all weights are 1
        if weights is None:
            weights = np.ones(self.batch_size)

        # Compute gradients and update parameters
        _, td_error = self.sess.run([self.Q_update_op, self.td_error_op],
                                    {self.inputs: states, self.target_values: target_values,
                                     self.actions: actions, self.is_weights: weights})

        # Update prioritized replay priorities
        if indices is not None:
            td_error = td_error.ravel()[np.flatnonzero(td_error)]  # shape (batch_size,)
            # Add a small constant to prevent never sampling zero-error transitions
            self.replay.update_priorities(indices, np.abs(td_error) + 1e-3)

    # Returns an eps-greedy action with respect to Q
    def get_eps_action(self, state, eps):
        if self.env.np_random.uniform() < eps:
            action = self.env.sample()
        else:
            Q = self.sess.run(self.Q_out_op, {self.inputs: np.array([state])})
            max_actions = np.where(np.ravel(Q) == Q.max())[0]
            action = self.env.np_random.choice(max_actions)  # select among argmax actions randomly
        return action
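# For reference, a minimal sketch of how a proportional prioritized replay (as used
# above when replay_type == "proportional") turns priorities into sampling
# probabilities and importance-sampling weights. This function is illustrative only;
# ProportionalReplay's actual implementation (e.g. a sum tree) is not shown here.
def sample_proportional(priorities, batch_size, alpha, beta):
    priorities = np.asarray(priorities, dtype=np.float64)
    probs = priorities ** alpha
    probs /= probs.sum()  # P(i) = p_i^alpha / sum_k p_k^alpha
    indices = np.random.choice(len(priorities), size=batch_size, p=probs)
    # Importance-sampling weights w_i = (N * P(i))^-beta, normalized by the max weight
    weights = (len(priorities) * probs[indices]) ** (-beta)
    weights /= weights.max()
    return indices, weights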