def __init__(self, params):
    """Set up the environment, Q-networks and bookkeeping for the agent.

    Args:
        params: Configuration object. Attributes read here are
            ``input_height``, ``input_width``, ``mines_min``, ``mines_max``,
            ``show_game``, ``reward_recent_update``, ``is_train``,
            ``replay_capacity``, ``history_length``, ``nchannels``,
            ``batch_size``, ``game``, ``memory_checkpoint``,
            ``restore_memory``, ``output_dir`` and ``interval_summary``.

    Side effects:
        Creates ``<output_dir>/checkpoints_<game>`` if it does not exist.
    """
    # The parameters collected for the agent.
    self.params = params

    # Load the environment.
    self.game = MinesweeperEnvironment(
        self.params.input_height,
        self.params.input_width,
        self.params.mines_min,
        self.params.mines_max,
        self.params.show_game,
        self.params.reward_recent_update,
    )

    # Q-network that is trained and used for action selection.
    self.dqn_train = DeepQNetwork(
        params=self.params,
        num_actions=self.game.num_actions,
        network_name="qnetwork-train",
        trainable=True,
    )

    # The target network and the replay memory are only needed for training.
    if self.params.is_train:
        # Q-network for predicting target Q-values.
        self.dqn_target = DeepQNetwork(
            params=self.params,
            num_actions=self.game.num_actions,
            network_name="qnetwork-target",
            trainable=False,
        )

        # Replay memory for storing experience to sample batches from.
        self.replay_mem = ReplayMemory(
            self.params.replay_capacity,
            self.params.history_length,
            self.params.nchannels,
            self.params.batch_size,
            self.params.input_height,
            self.params.input_width,
            self.params.game,
            self.params.memory_checkpoint,
            self.params.restore_memory,
            self.params.output_dir,
        )

    # Small structure for storing the most recent screens.
    self.history = ScreenHistory(self.params)

    # Checkpoint directory. TensorFlow assumes this directory already exists,
    # so create it up front. exist_ok=True avoids the check-then-create race
    # when several runs share one output directory.
    self.checkpoint_dir = os.path.abspath(
        os.path.join(self.params.output_dir,
                     "checkpoints_" + self.params.game))
    self.checkpoint_prefix = os.path.join(self.checkpoint_dir, "model")
    os.makedirs(self.checkpoint_dir, exist_ok=True)

    self.train_iteration = 0
    # Count per action (greedy selections only).
    self.count_actions = np.zeros(self.game.num_actions)
    self.count_act_random = 0  # Count of random actions
    self.count_act_greedy = 0  # Count of greedy actions
    self.win_rate = 0.0  # Win rate (percent) of the latest evaluation

    # Fixed-length histories of Q-values and losses for running averages.
    self.qvalues_hist = collections.deque(
        [0] * self.params.interval_summary,
        maxlen=self.params.interval_summary)
    self.loss_hist = collections.deque(
        [10] * self.params.interval_summary,
        maxlen=self.params.interval_summary)

    self.epsilon = 0
def train(params):
    """Run the DQN loop on an Atari game, training when ``params.is_train``.

    The loop plays ``params.num_steps`` environment steps with an
    epsilon-greedy policy. When ``params.is_train`` is set it stores
    transitions in a replay memory and, once ``step > params.train_start``,
    optimises the train network every ``params.train_freq`` steps. After each
    optimisation it may sync the target network, run an evaluation, write
    TensorBoard summaries, and save a checkpoint, depending on the
    corresponding intervals in ``params``.

    Args:
        params: Configuration object. Attributes read here include
            ``random_start_wait``, ``show_game``, ``replay_capacity``,
            ``batch_size``, ``output_dir``, ``interval_summary``,
            ``replay_mem_dump``, ``is_train``, ``history_length``,
            ``model_file``, ``num_steps``, ``train_start``, ``train_freq``,
            ``epsilon_step``, ``network_update_rate``, ``eval_frequency``,
            ``eval_steps``, ``eval_epsilon`` and ``interval_checkpoint``.

    Returns:
        None. Progress is printed, and summaries and checkpoints are written
        below ``params.output_dir``.
    """

    def _timestamp():
        """Return the current local time formatted for log lines."""
        return datetime.now().strftime("%Y-%m-%d %H:%M")

    # Load Atari rom and prepare the environment.
    atari = GymEnvironment(params.random_start_wait, params.show_game)

    # Two Q-value networks: one being trained, one for target prediction.
    dqn_train = DeepQNetwork(
        params=params,
        num_actions=atari.num_actions,
        network_name="qnetwork-train",
        trainable=True,
    )
    dqn_target = DeepQNetwork(
        params=params,
        num_actions=atari.num_actions,
        network_name="qnetwork-target",
        trainable=False,
    )

    # Replay memory for storing experience to sample batches from.
    replay_mem = ReplayMemory(params.replay_capacity, params.batch_size)

    # Small structure for storing the most recent screens.
    history = ScreenHistory(params)

    # Checkpoint directory. TensorFlow assumes it already exists, so create
    # it; exist_ok=True avoids the check-then-create race.
    checkpoint_dir = os.path.abspath(
        os.path.join(params.output_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    os.makedirs(checkpoint_dir, exist_ok=True)

    train_step = 0
    count_actions = np.zeros(atari.num_actions)  # Per action (greedy only)
    count_act_random = 0  # Count of random actions
    count_act_greedy = 0  # Count of greedy actions

    # Histories of Q-values and loss for running averages.
    qvalues_hist = collections.deque(
        [0] * params.interval_summary, maxlen=params.interval_summary)
    loss_hist = collections.deque(
        [10] * params.interval_summary, maxlen=params.interval_summary)

    # Time measurements (rolling windows over the last 10 training steps).
    dt_batch_gen = collections.deque([0] * 10, maxlen=10)
    dt_optimization = collections.deque([0] * 10, maxlen=10)
    dt_train_total = collections.deque([0] * 10, maxlen=10)

    # Optionally load a pre-initialized replay memory from disk.
    if params.replay_mem_dump is not None and params.is_train:
        print("Loading pre-initialized replay memory from HDF5 file.")
        replay_mem.load(params.replay_mem_dump)

    # Start a new game and fill the history with its first screen.
    reward, screen, is_terminal = atari.new_random_game()
    for _ in range(params.history_length):
        history.add(screen)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        # Same initializer the QAgent class in this file uses;
        # tf.initialize_all_variables() is its deprecated predecessor.
        sess.run(tf.global_variables_initializer())

        # Only save trainable variables and the global step to disk.
        tf_vars_to_save = tf.trainable_variables() + [dqn_train.global_step]
        saver = tf.train.Saver(tf_vars_to_save, max_to_keep=40)

        if params.model_file is not None:
            # Load pre-trained model from disk and resume its step counter.
            saver.restore(sess, params.model_file)
            train_step, learning_rate = sess.run(
                [dqn_train.global_step, dqn_train.learning_rate])
            print("Restarted training from model file. Step = %06i, Learning Rate = %.5f" % (train_step, learning_rate))

        # Initialize summary writer.
        dqn_train.build_summary_writer(sess)

        # Start the target network with the same weights as the train network.
        update_target_network(sess, "qnetwork-train", "qnetwork-target")

        for step in range(params.num_steps):
            replay_mem_size = replay_mem.num_examples()
            filling_replay = (params.is_train
                              and replay_mem_size < params.train_start)
            if filling_replay and step % 1000 == 0:
                print("Initializing replay memory %i/%i" % (step, params.train_start))

            # Epsilon-greedy exploration: epsilon decays linearly from 1.0
            # with the number of training steps, down to a floor of 0.1.
            # Outside training a fixed epsilon of 0.05 is used.
            if params.is_train:
                epsilon = max(0.1, 1.0 - float(train_step * params.train_freq) / float(params.epsilon_step))
            else:
                epsilon = 0.05

            ################################################################
            ####################### SELECT A MOVE ##########################
            ################################################################

            # Act randomly with probability epsilon, and always while the
            # replay memory is still being filled.
            do_random_action = random.random() < epsilon
            if do_random_action or filling_replay:
                action_id = random.randrange(atari.num_actions)
                count_act_random += 1
            else:
                # Feed the last screens through the network to get Q-values
                # and pick the action with the highest one.
                feed_dict = {dqn_train.pl_screens: history.get()}
                qvalues = sess.run(dqn_train.qvalues, feed_dict=feed_dict)
                qvalue_max = np.max(qvalues[0])
                action_id = np.argmax(qvalues[0])
                count_act_greedy += 1
                count_actions[action_id] += 1
                qvalues_hist.append(qvalue_max)

            ################################################################
            ####################### PLAY THE MOVE ##########################
            ################################################################

            # Play the selected action (the environment presumably repeats it
            # for several frames -- confirm in GymEnvironment.act).
            cumulative_reward, screen, is_terminal = atari.act(action_id)

            # Clip the reward to [-1, 1].
            cumulative_reward = min(+1.0, max(-1.0, cumulative_reward))

            # Add the screen to the short-term history.
            history.add(screen)

            # Add the experience to the replay memory.
            if params.is_train:
                replay_mem.add(action_id, cumulative_reward, screen, is_terminal)

            # On game over, start a new game.
            if is_terminal:
                reward, screen, is_terminal = atari.new_random_game()
                # Only record experience while training, like the add above.
                if params.is_train:
                    replay_mem.add(0, reward, screen, is_terminal)
                history.add(screen)

            ################################################################
            ###################### TRAINING MODEL ##########################
            ################################################################

            if not (params.is_train
                    and step > params.train_start
                    and step % params.train_freq == 0):
                continue

            t1 = time.time()

            # Prepare batch and train the network.
            # TODO: set actions with terminal == 1 to reward = -1 ??
            screens_in, actions, rewards, screens_out, terminals = replay_mem.sample_batch()
            dt_batch_gen.append(time.time() - t1)

            t2 = time.time()

            # Compute target Q-values with the fixed target network. The
            # forward pass is performed on the output (next) screens.
            qvalues_target = sess.run(
                dqn_target.qvalues,
                feed_dict={dqn_target.pl_screens: screens_out},
            )

            # Inputs for the trainable Q-network. This feed_dict is also
            # reused for the summary ops below.
            feed_dict = {
                dqn_train.pl_screens: screens_in,
                dqn_train.pl_actions: actions,
                dqn_train.pl_rewards: rewards,
                dqn_train.pl_terminals: terminals,
                dqn_train.pl_qtargets: np.max(qvalues_target, axis=1),
            }

            # Actual training operation.
            _, loss, train_step = sess.run(
                [dqn_train.train_op, dqn_train.loss, dqn_train.global_step],
                feed_dict=feed_dict,
            )

            t3 = time.time()
            dt_optimization.append(t3 - t2)
            dt_train_total.append(t3 - t1)

            # Running average of the loss.
            loss_hist.append(loss)

            # Report (but do not abort on) a NaN loss.
            if np.isnan(loss):
                print("[%s] Training failed with loss = NaN." % _timestamp())

            # Periodically sync the target network with the train network.
            if train_step % params.network_update_rate == 0:
                print("[%s] Updating target network." % _timestamp())
                update_target_network(sess, "qnetwork-train", "qnetwork-target")

            ################################################################
            ####################### MODEL EVALUATION #######################
            ################################################################

            # NOTE(review): evaluation plays on the same environment and
            # history objects as training, so training resumes from the
            # state evaluation leaves behind.
            if params.is_train and train_step % params.eval_frequency == 0:
                eval_total_reward = 0
                eval_num_episodes = 0
                eval_num_rewards = 0
                eval_episode_max_reward = 0
                eval_episode_reward = 0
                eval_actions = np.zeros(atari.num_actions)

                # Initialize a new game without random start moves and fill
                # the whole history with its first screen.
                reward, screen, terminal = atari.new_game()
                for _ in range(params.history_length):
                    history.add(screen)

                for eval_step in range(params.eval_steps):
                    if random.random() < params.eval_epsilon:
                        # Random action.
                        action_id = random.randrange(atari.num_actions)
                    else:
                        # Greedy action from the current Q-values.
                        feed_dict_eval = {dqn_train.pl_screens: history.get()}
                        qvalues = sess.run(dqn_train.qvalues, feed_dict=feed_dict_eval)
                        action_id = np.argmax(qvalues[0])

                    # Keep track of how many of each action is performed.
                    eval_actions[action_id] += 1

                    # Perform the action.
                    reward, screen, terminal = atari.act(action_id)
                    history.add(screen)
                    eval_episode_reward += reward

                    if reward > 0:
                        eval_num_rewards += 1

                    if terminal:
                        eval_total_reward += eval_episode_reward
                        eval_episode_max_reward = max(eval_episode_reward, eval_episode_max_reward)
                        eval_episode_reward = 0
                        eval_num_episodes += 1
                        reward, screen, terminal = atari.new_game()
                        for _ in range(params.history_length):
                            history.add(screen)

                # Avoid dividing by zero when no evaluation step was played.
                eval_action_total = np.sum(eval_actions)
                if eval_action_total == 0:
                    eval_action_total = 1

                # Send statistics about the evaluation to TensorBoard.
                eval_update_ops = [
                    dqn_train.eval_rewards.assign(eval_total_reward),
                    dqn_train.eval_num_rewards.assign(eval_num_rewards),
                    dqn_train.eval_max_reward.assign(eval_episode_max_reward),
                    dqn_train.eval_num_episodes.assign(eval_num_episodes),
                    dqn_train.eval_actions.assign(eval_actions / eval_action_total),
                ]
                sess.run(eval_update_ops)
                summaries = sess.run(dqn_train.eval_summary_op, feed_dict=feed_dict)
                dqn_train.train_summary_writer.add_summary(summaries, train_step)

                print("[%s] Evaluation Summary" % _timestamp())
                print(" Total Reward: %i" % eval_total_reward)
                print(" Max Reward per Episode: %i" % eval_episode_max_reward)
                print(" Num Episodes: %i" % eval_num_episodes)
                print(" Num Rewards: %i" % eval_num_rewards)

            ################################################################
            ###################### PRINTING / SAVING #######################
            ################################################################

            # Write a training summary to disk.
            if params.is_train and train_step % params.interval_summary == 0:
                avg_dt_batch_gen = sum(dt_batch_gen) / float(len(dt_batch_gen))
                avg_dt_optimization = sum(dt_optimization) / float(len(dt_optimization))
                avg_dt_total = sum(dt_train_total) / float(len(dt_train_total))

                # Send statistics about the environment to TensorBoard.
                update_game_stats_ops = [
                    dqn_train.avg_reward_per_game.assign(atari.avg_reward_per_episode()),
                    dqn_train.max_reward_per_game.assign(atari.max_reward_per_episode),
                    dqn_train.avg_moves_per_game.assign(atari.avg_steps_per_episode()),
                    dqn_train.total_reward_replay.assign(replay_mem.total_reward()),
                    dqn_train.num_games_played.assign(atari.episode_number),
                    dqn_train.actions_random.assign(count_act_random),
                    dqn_train.actions_greedy.assign(count_act_greedy),
                    dqn_train.runtime_batch.assign(avg_dt_batch_gen),
                    dqn_train.runtime_train.assign(avg_dt_optimization),
                    dqn_train.runtime_total.assign(avg_dt_total),
                    dqn_train.samples_per_second.assign((1.0 / avg_dt_total) * params.batch_size),
                ]
                sess.run(update_game_stats_ops)

                # Build and save summaries.
                summaries = sess.run(dqn_train.train_summary_op, feed_dict=feed_dict)
                dqn_train.train_summary_writer.add_summary(summaries, train_step)

                avg_qvalue = sum(qvalues_hist) / float(len(qvalues_hist))
                avg_loss = sum(loss_hist) / float(len(loss_hist))

                format_str = "[%s] Step %06i, ReplayMemory = %i, Epsilon = %.4f, "\
                    "Episodes = %i, Avg.Reward = %.2f, Max.Reward = %.2f, Avg.QValue = %.4f, Avg.Loss = %.6f"
                print(format_str % (_timestamp(), train_step,
                                    replay_mem.num_examples(), epsilon,
                                    atari.episode_number,
                                    atari.avg_reward_per_episode(),
                                    atari.max_reward_per_episode,
                                    avg_qvalue, avg_loss))

            # Write model checkpoint to disk.
            if params.is_train and train_step % params.interval_checkpoint == 0:
                saver.save(sess, checkpoint_prefix, global_step=train_step)
                print("[%s] Saving TensorFlow model checkpoint to disk." % _timestamp())

                # TODO: dumping the replay memory to
                # <output_dir>/replay_memory.hdf5 alongside the checkpoint is
                # disabled until replay_mem.save is fixed.

                # np.sum replaces reduce(), which is not a builtin on
                # Python 3.
                sum_actions = float(np.sum(count_actions))
                action_parts = []
                for action_id, action_count in enumerate(count_actions):
                    action_perc = action_count / sum_actions if sum_actions != 0 else 0
                    action_parts.append(
                        "<%i, %s, %i, %.2f> " %
                        (action_id, atari.action_to_string(action_id),
                         action_count, action_perc))
                action_str = "".join(action_parts)

                format_str = "[%s] Q-Network Actions Summary: NumRandom: %i, NumGreedy: %i, %s"
                print(format_str % (_timestamp(), count_act_random,
                                    count_act_greedy, action_str))

    print("Finished training Q-network.")
class QAgent: """An environment class for open ai gym atari games using the screen. Attributes: _display : bool Display the game visually _screen (:obj: 'array', :obj: 'float') : The screen output (rgb) _reward (float) : amount of reward achieved by the previous action. The scale varies between environments, but the goal is always to increase your total reward. _done (bool) : Whether it's time to reset the environment again. Most (but not all) tasks are divided up into well-defined episodes, and done being True indicates the episode has terminated. _random_start (int) : How long we let the agent take random actions in a new game. screen_width (int) : The width of the screen after resizing. screen_height (int) : The height of the screen after resizing. _action_repeat (int) : The number of time-steps an action is repeated. env (:obj:) : The open ai gym environment object """ def __init__(self, params): self.params = params # These are the parameters collected for the agent. # Load environmnet self.game = MinesweeperEnvironment(self.params.input_height, self.params.input_width, self.params.mines_min, self.params.mines_max, self.params.show_game, self.params.reward_recent_update) # Initialize two Q-Value Networks # Q-network for training. 
self.dqn_train = DeepQNetwork(params=self.params, num_actions=self.game.num_actions, network_name="qnetwork-train", trainable=True) if self.params.is_train: # Q-Network for predicting target Q-values self.dqn_target = DeepQNetwork(params=self.params, num_actions=self.game.num_actions, network_name="qnetwork-target", trainable=False) # Initialize replay memory for storing experience to sample batches from self.replay_mem = ReplayMemory( self.params.replay_capacity, self.params.history_length, self.params.nchannels, self.params.batch_size, self.params.input_height, self.params.input_width, self.params.game, self.params.memory_checkpoint, self.params.restore_memory, self.params.output_dir) # Small structure for storing the last four screens self.history = ScreenHistory(self.params) # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it self.checkpoint_dir = os.path.abspath( os.path.join(self.params.output_dir, "checkpoints_" + self.params.game)) self.checkpoint_prefix = os.path.join(self.checkpoint_dir, "model") if not os.path.exists(self.checkpoint_dir): os.makedirs(self.checkpoint_dir) self.train_iteration = 0 self.count_actions = np.zeros( self.game.num_actions) # Count per action (only greedy) self.count_act_random = 0 # Count of random actions self.count_act_greedy = 0 # Count of greedy actions self.win_rate = 0.0 # For atari # Histories of qvalues and loss for running average self.qvalues_hist = collections.deque( [0] * self.params.interval_summary, maxlen=self.params.interval_summary) self.loss_hist = collections.deque([10] * self.params.interval_summary, maxlen=self.params.interval_summary) self.epsilon = 0 def fit(self): screen, reward, is_done = self.game.new_game() for _ in range(self.params.history_length): self.history.add(screen) # Initialize the TensorFlow session gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=self.params.gpu_memory) with tf.Session(config=tf.ConfigProto( gpu_options=gpu_options)) 
as sess: # Initialize the TensorFlow session init = tf.global_variables_initializer() sess.run(init) # Only save trainable variables and the global iteration to disk tf_vars_to_save = tf.trainable_variables() + [ self.dqn_train.global_iteration ] saver = tf.train.Saver(tf_vars_to_save, max_to_keep=200) if self.params.model_file is not None: # Load pre-trained model from disk model_path = os.path.join(self.checkpoint_dir, self.params.model_file) saver.restore(sess, model_path) self.train_iteration, learning_rate = sess.run([ self.dqn_train.global_iteration, self.dqn_train.learning_rate ]) print( "Restarted training from model file. iteration = %06i, Learning Rate = %.5f" % (self.train_iteration, learning_rate)) # Initialize summary writer self.dqn_train.build_summary_writer(sess) # Initialize the target Q-Network fixed with the same weights update_target_network(sess, "qnetwork-train", "qnetwork-target") for iteration in range( self.params.num_iterations ): # Iteration is also how many times we added to replay # self.train_iteration is the true train iteration self._sel_move(sess, iteration) self._train(sess, iteration, saver) print("Finished training Q-network.") def _sel_move(self, sess, iteration): if self.params.is_train: replay_mem_size = self.replay_mem.num_examples() if replay_mem_size < self.params.train_start and iteration % 1000 == 0: print("Initializing replay memory %i/%i" % (iteration, self.params.train_start)) # self.epsilon Greedy Exploration: with the probability of self.epsilon # choose a random action, otherwise go greedy with the action # having the maximal Q-value. 
Note the minimum episolon of 0.1 if self.params.is_train: self.epsilon = max( self.params.min_epsilon, 1.0 - float(self.train_iteration * self.params.train_freq) / float(self.params.epsilon_step)) else: self.epsilon = self.params.eval_epsilon ################################################################ ####################### SELECT A MOVE ########################## ################################################################ # Either choose a random action or predict the action using the Q-network do_random_action = (random.random() < self.epsilon) if do_random_action or (self.params.is_train and replay_mem_size < self.params.train_start): action_id = random.randrange(self.game.num_actions) self.count_act_random += 1 else: # Get the last screens from the self.history and perform # feed-forward through the network to compute Q-values feed_dict = {self.dqn_train.pl_screens: self.history.get()} qvalues = sess.run(self.dqn_train.qvalues, feed_dict=feed_dict) # Choose the best action based on the approximated Q-values qvalue_max = np.max(qvalues[0]) action_id = np.argmax(qvalues[0]) self.count_act_greedy += 1 self.count_actions[action_id] += 1 self.qvalues_hist.append(qvalue_max) self._move(action_id) def _move(self, action_id): ################################################################ ####################### PLAY THE MOVE ########################## ################################################################ # Play the selected action (either random or predicted) on the self.game game # Note that the action is performed for k = 4 frames (frame skipping) screen, cumulative_reward, is_done = self.game.act(action_id) # Perform reward clipping and add the example to the replay memory # This is done with Huber loss now #cumulative_reward = min(+1.0, max(-1.0, cumulative_reward)) # Add the screen to short term self.history and replay memory self.history.add(screen) # Add experience to replay memory if self.params.is_train: self.replay_mem.add(action_id, 
cumulative_reward, screen, is_done) # Check if we are game over, and if yes, initialize a new game if is_done: screen, reward, is_done = self.game.new_game() if self.params.is_train: self.replay_mem.add(0, reward, screen, is_done) self.history.add(screen) def _train(self, sess, iteration, saver): ################################################################ ###################### TRAINING MODEL ########################## ################################################################ if self.params.is_train and iteration > self.params.train_start and iteration % self.params.train_freq == 0: screens, actions, rewards, screens_1, dones = self.replay_mem.sample_batch( ) # Below, we perform the Double-DQN update. # First, we need to determine the best actions # in the train network qvalues_train = sess.run( self.dqn_train.qvalues, feed_dict={self.dqn_train.pl_screens: screens_1}) # Find the best actions for each using the train network # which will be used with the q-values form the target network actions_target = np.argmax(qvalues_train, 1) # We use this to evalute the q-value for some state # Now,we get the q-values for all actions given the states # We then later sort out the q-values from the target network # using the best actions from the train network qvalues_target = sess.run( self.dqn_target.qvalues, feed_dict={self.dqn_target.pl_screens: screens_1}) # Inputs for trainable Q-network feed_dict = { self.dqn_train.pl_screens: screens, self.dqn_train.pl_actions: actions, self.dqn_train.pl_rewards: rewards, self.dqn_train.pl_dones: dones, #self.dqn_train.pl_qtargets : np.max(qvalues_target, axis=1), self.dqn_train.pl_qtargets: qvalues_target, self.dqn_train.pl_actions_target: actions_target, } # Actual training operation _, loss, self.train_iteration = sess.run([ self.dqn_train.train_op, self.dqn_train.loss, self.dqn_train.global_iteration ], feed_dict=feed_dict) # Running average of the loss self.loss_hist.append(loss) # Check if the returned loss is not NaN 
if np.isnan(loss): print("[%s] Training failed with loss = NaN." % datetime.now().strftime("%Y-%m-%d %H:%M")) # Once every n = 10000 frames update the Q-network for predicting targets if self.train_iteration % self.params.network_update_rate == 0: print("[%s] Updating target network." % datetime.now().strftime("%Y-%m-%d %H:%M")) update_target_network(sess, "qnetwork-train", "qnetwork-target") self._evaluate(sess, feed_dict) self._print_save(sess, feed_dict, saver) def _evaluate(self, sess, feed_dict): ################################################################ ####################### MODEL EVALUATION ####################### ################################################################ if self.params.is_train and self.train_iteration % self.params.eval_frequency == 0 or self.train_iteration == 0: eval_total_reward = 0 eval_num_episodes = 0 eval_num_wins = 0 eval_num_rewards = 0 eval_episode_max_reward = 0 eval_episode_reward = 0 eval_actions = np.zeros(self.game.num_actions) # We store all of these parameters temporarily so this evaluation does not # affect model evaluation tmp_episode_step = self.game._episode_step tmp_episode_number = self.game._episode_number tmp_episode_reward = self.game._episode_reward tmp_max_reward_episode = self.game._max_reward_episode tmp_global_step = self.game._global_step tmp_global_reward = self.game._global_reward tmp_recent_reward = self.game._recent_reward tmp_recent_episode_number = self.game._recent_episode_number tmp_recent_games_won = self.game._recent_games_won tmp_games_won = self.game._games_won tmp_reward_recent_update = self.game.reward_recent_update prev_action_id = -1 prev_episode_num = -1 # Just has to be different intially than prev action_id = -1 eval_num_episodes = 0 # Initialize new game without random start moves screen, reward, done = self.game.new_game() for _ in range(self.params.history_length): self.history.add(screen) #for eval_iterations in range(self.params.eval_iterations): while eval_num_episodes 
< self.params.eval_iterations: # Play eval_iterations games prev_action_id = action_id # if random.random() < self.params.eval_epsilon: # # Random action # action_id = random.randrange(self.game.num_actions) #else: # Greedy action # Get the last screens from the self.history and perform # feed-forward through the network to compute Q-values feed_dict_eval = { self.dqn_train.pl_screens: self.history.get() } qvalues = sess.run(self.dqn_train.qvalues, feed_dict=feed_dict_eval) # Choose the best action based on the approximated Q-values qvalue_max = np.max(qvalues[0]) action_id = np.argmax(qvalues[0]) # Skip this action if we are in the same game if prev_action_id == action_id and prev_episode_num == eval_num_episodes: action_id = random.randrange(self.game.num_actions) prev_episode_num = eval_num_episodes # Keep track of how many of each action is performed eval_actions[action_id] += 1 # Perform the action screen, reward, done = self.game.act(action_id) self.history.add(screen) eval_episode_reward += reward if reward > 0: eval_num_rewards += 1 if reward == self.game.env.rewards["win"]: eval_num_wins += 1 if done: # Note max reward is from playin gthe games eval_total_reward += eval_episode_reward eval_episode_max_reward = max(eval_episode_reward, eval_episode_max_reward) eval_episode_reward = 0 eval_num_episodes += 1 screen, reward, done = self.game.new_game() for _ in range(self.params.history_length): self.history.add(screen) # Send statistics about the environment to TensorBoard eval_update_ops = [ self.dqn_train.eval_rewards.assign(eval_total_reward), self.dqn_train.eval_win_rate.assign( (eval_num_wins / eval_num_episodes) * 100), self.dqn_train.eval_num_rewards.assign(eval_num_rewards), self.dqn_train.eval_max_reward.assign(eval_episode_max_reward), self.dqn_train.eval_num_episodes.assign(eval_num_episodes), self.dqn_train.eval_actions.assign(eval_actions / np.sum(eval_actions)) ] sess.run(eval_update_ops) summaries = sess.run(self.dqn_train.eval_summary_op, 
feed_dict=feed_dict) self.dqn_train.train_summary_writer.add_summary( summaries, self.train_iteration) print("[%s] Evaluation Summary" % datetime.now().strftime("%Y-%m-%d %H:%M")) print(" Total Reward: %i" % eval_total_reward) print(" Max Reward per Episode: %i" % eval_episode_max_reward) print(" Num Episodes: %i" % eval_num_episodes) print(" Num Rewards: %i" % eval_num_rewards) print(" Win Rate: %.1f" % ((eval_num_wins / eval_num_episodes) * 100)) self.win_rate = (eval_num_wins / eval_num_episodes) * 100 self.game._episode_step = tmp_episode_step self.game._episode_number = tmp_episode_number self.game._episode_reward = tmp_episode_reward self.game._max_reward_episode = tmp_max_reward_episode self.game._global_step = tmp_global_step self.game._global_reward = tmp_global_reward self.game._recent_reward = tmp_recent_reward self.game._recent_episode_number = tmp_recent_episode_number self.game._recent_games_won = tmp_recent_games_won self.game._games_won = tmp_games_won self.game.reward_recent_update = tmp_reward_recent_update def _print_save(self, sess, feed_dict, saver): ################################################################ ###################### PRINTING / SAVING ####################### ################################################################ # Write a training summary to disk # This is what controls how often we write to disk if self.params.is_train and self.train_iteration % self.params.interval_summary == 0: # Send statistics about the environment to TensorBoard update_game_stats_ops = [ self.dqn_train.avg_reward_per_game.assign( self.game.avg_reward_per_episode()), self.dqn_train.max_reward_per_game.assign( self.game.max_reward_per_episode), self.dqn_train.avg_moves_per_game.assign( self.game.avg_steps_per_episode()), self.dqn_train.total_reward_replay.assign( self.replay_mem.total_reward()), self.dqn_train.num_games_played.assign( self.game.episode_number), self.dqn_train.moves.assign(self.game.global_step), 
self.dqn_train.actions_random.assign(self.count_act_random), self.dqn_train.actions_greedy.assign(self.count_act_greedy), ] sess.run(update_game_stats_ops) # Build and save summaries summaries = sess.run(self.dqn_train.train_summary_op, feed_dict=feed_dict) # Here we set train_iteration on x-axis self.dqn_train.train_summary_writer.add_summary( summaries, self.train_iteration) # Here we set number of moves on x-axis #self.dqn_train.train_summary_writer.add_summary(summaries, self.game.global_step) avg_qvalue = avg_loss = 0 for i in range(len(self.qvalues_hist)): avg_qvalue += self.qvalues_hist[i] avg_loss += self.loss_hist[i] avg_qvalue /= float(len(self.qvalues_hist)) avg_loss /= float(len(self.loss_hist)) learning_rate = sess.run(self.dqn_train.learning_rate) format_str = "[%s] It. %06i, Replay = %i, epsilon = %.4f, "\ "Episodes = %i, Steps = %i, Avg.R = %.3f, "\ "Max.R = %.3f, Win = %.1f, Avg.Q = %.4f, Avg.Loss = %.6f, lr = %.6f" print(format_str % (datetime.now().strftime("%Y-%m-%d %H:%M"), self.train_iteration, self.replay_mem.num_examples(), self.epsilon, self.game.episode_number, self.game.global_step, self.game.avg_reward_per_episode(), self.game.max_reward_per_episode, self.win_rate, avg_qvalue, avg_loss, learning_rate)) # Write model checkpoint to disk if self.params.is_train and self.train_iteration % self.params.interval_checkpoint == 0: path = saver.save(sess, self.checkpoint_prefix, global_step=self.train_iteration) print("[%s] Saving TensorFlow model checkpoint to disk." 
% datetime.now().strftime("%Y-%m-%d %H:%M")) sum_actions = float(reduce(lambda x, y: x + y, self.count_actions)) action_str = "" for action_id, action_count in enumerate(self.count_actions): action_perc = action_count / sum_actions if not sum_actions == 0 else 0 action_str += "<%i, %s, %i, %.2f> " % \ (action_id, self.game.action_to_string(action_id), action_count, action_perc) format_str = "[%s] Q-Network Actions Summary: NumRandom: %i, NumGreedy: %i, %s" print(format_str % (datetime.now().strftime("%Y-%m-%d %H:%M"), self.count_act_random, self.count_act_greedy, action_str)) def play_mine(self): # Initialize a new game and store the screens in the self.history screen, reward, is_done = self.game.new_game() for _ in range(self.params.history_length): self.history.add(screen) # Initialize the TensorFlow session gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=self.params.gpu_memory) with tf.Session(config=tf.ConfigProto( gpu_options=gpu_options)) as sess: # Initialize the TensorFlow session init = tf.global_variables_initializer() sess.run(init) # Only save trainable variables and the global iteration to disk tf_vars_to_save = tf.trainable_variables() + [ self.dqn_train.global_iteration ] saver = tf.train.Saver(tf_vars_to_save, max_to_keep=200) if self.params.model_file is not None: # Load pre-trained model from disk model_path = os.path.join(self.checkpoint_dir, self.params.model_file) saver.restore(sess, model_path) while self.game.episode_number < self.params.num_games: if self.params.show_game: inp = input("Enter input (ROW,COL)") self._sel_move(sess, 0) print(self.game.episode_number) print(self.game.win_rate) def evaluate_mine(self): # Initialize a new game and store the screens in the self.history screen, reward, is_done = self.game.new_game() for _ in range(self.params.history_length): self.history.add(screen) # Initialize the TensorFlow session gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=self.params.gpu_memory) with 
def evaluate_mine(self, first_checkpoint=680000, last_checkpoint=800000,
                  checkpoint_step=20000):
    """Evaluate a range of saved checkpoints and report the best one.

    Restores ``model-<iteration>`` from ``self.checkpoint_dir`` for every
    iteration from ``first_checkpoint`` to ``last_checkpoint`` (inclusive)
    in increments of ``checkpoint_step``, plays ``params.eval_iterations``
    games with each restored network, prints its win rate, and finally
    prints the iteration whose win rate was highest (the earliest one on
    ties). The defaults reproduce the range that used to be hard-coded.

    Args:
        first_checkpoint: Iteration number of the first checkpoint to load.
        last_checkpoint: Iteration number of the last checkpoint to load.
        checkpoint_step: Spacing between consecutive checkpoint
            iterations; must be positive.

    Raises:
        ValueError: If ``checkpoint_step`` is not positive (the loop over
            checkpoints would otherwise never terminate).
    """
    if checkpoint_step <= 0:
        raise ValueError("checkpoint_step must be positive, got %r" %
                         (checkpoint_step,))

    # Initialize a new game and store the screens in self.history.
    screen, _, _ = self.game.new_game()
    for _ in range(self.params.history_length):
        self.history.add(screen)

    # Cap the fraction of GPU memory this process may claim.
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=self.params.gpu_memory)
    with tf.Session(config=tf.ConfigProto(
            gpu_options=gpu_options)) as sess:
        best_model = first_checkpoint
        best_win_rate = 0

        sess.run(tf.global_variables_initializer())

        # The saver only covers the trainable variables plus the global
        # iteration counter.
        tf_vars_to_save = tf.trainable_variables() + [
            self.dqn_train.global_iteration
        ]
        saver = tf.train.Saver(tf_vars_to_save, max_to_keep=200)

        current_name = first_checkpoint
        while current_name <= last_checkpoint:
            print("Restoring: ", current_name)
            model_path = os.path.join(self.checkpoint_dir,
                                      'model-' + str(current_name))
            saver.restore(sess, model_path)

            current_win_rate = self._evaluate_restored_model(
                sess, self.params.eval_iterations)
            print(" Win Rate: %.2f" % (current_win_rate))

            # Strict comparison: on a tie the earlier checkpoint is kept.
            if current_win_rate > best_win_rate:
                best_win_rate = current_win_rate
                best_model = current_name
            current_name = current_name + checkpoint_step

        print("Best model is: ", best_model)


def _evaluate_restored_model(self, sess, num_episodes):
    """Play ``num_episodes`` greedy games and return the win rate in percent.

    Uses the variables currently loaded in ``sess``. Returns ``0.0`` when
    no episode was played, so the caller never divides by zero.
    """
    # Start a fresh game and fill the screen history with its first screen.
    screen, _, _ = self.game.new_game()
    for _ in range(self.params.history_length):
        self.history.add(screen)

    action_id = -1
    # Just has to differ from the first episode index initially.
    prev_episode_num = -1
    episodes_done = 0
    wins = 0

    while episodes_done < num_episodes:
        prev_action_id = action_id
        qvalues = sess.run(
            self.dqn_train.qvalues,
            feed_dict={self.dqn_train.pl_screens: self.history.get()})

        # Choose the best action based on the approximated Q-values.
        action_id = np.argmax(qvalues[0])

        # If the greedy choice repeats the previous action within the same
        # episode, substitute a random action instead.
        # NOTE(review): presumably this keeps the agent from re-selecting
        # the same cell forever -- confirm against the environment.
        if (prev_action_id == action_id
                and prev_episode_num == episodes_done):
            action_id = random.randrange(self.game.num_actions)
        prev_episode_num = episodes_done

        # Perform the action.
        screen, reward, done = self.game.act(action_id)
        self.history.add(screen)

        if reward == self.game.env.rewards["win"]:
            wins += 1

        if done:
            episodes_done += 1
            screen, _, _ = self.game.new_game()
            for _ in range(self.params.history_length):
                self.history.add(screen)

    if episodes_done == 0:
        return 0.0
    # float() forces true division even under Python 2 integer semantics.
    return (float(wins) / episodes_done) * 100