class Main:
    # How many transitions to keep in memory?
    memory_size = 300000

    # Size of the mini-batch, 32 was given in the paper
    minibatch_size = 32

    # Number of possible actions in a given game, 4 for "Breakout"
    number_of_actions = 4

    # Size of one state is four 84x84 screens
    state_size = 4 * 84 * 84

    # Discount factor for future rewards
    discount_factor = 0.9

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.ale = ALE(self.memory)
        self.nnet = NeuralNet(self.state_size, self.number_of_actions,
                              "ai/deepmind-layers.cfg", "ai/deepmind-params.cfg", "layer4")

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with epsilon annealed linearly from 1 to 0.1 over the first million frames,
        and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(1.0 - frames_played / (1000000 * 1.0), 0.1)

    def predict_best_action(self, last_state):
        assert last_state.shape[0] == self.state_size
        assert len(last_state.shape) == 1

        # last_state contains only one state, so we have to convert it into a batch of size 1
        last_state.shape = (last_state.shape[0], 1)

        scores = self.nnet.predict(last_state)
        assert scores.shape[1] == self.number_of_actions

        # log the predicted Q-values of this state as one CSV row
        self.output_file.write(str(scores).strip().replace(' ', ',')[2:-2] + '\n')
        self.output_file.flush()

        # return action (index) with maximum score
        return np.argmax(scores)

    def train_minibatch(self, minibatch):
        """
        Train function that transforms (state, action, reward, state) into
        (input, expected_output) for the neural net and trains the network.
        @param minibatch: list of arrays: prestates, actions, rewards, poststates
        """
        prestates = minibatch[0]
        actions = minibatch[1]
        rewards = minibatch[2]
        poststates = minibatch[3]

        assert prestates.shape[0] == self.state_size
        assert prestates.shape[1] == self.minibatch_size
        assert poststates.shape[0] == self.state_size
        assert poststates.shape[1] == self.minibatch_size
        assert actions.shape[0] == self.minibatch_size
        assert rewards.shape[0] == self.minibatch_size

        # predict scores for poststates
        post_scores = self.nnet.predict(poststates)
        assert post_scores.shape[0] == self.minibatch_size
        assert post_scores.shape[1] == self.number_of_actions

        # take maximum score of all actions
        max_scores = np.max(post_scores, axis=1)
        assert max_scores.shape[0] == self.minibatch_size
        assert len(max_scores.shape) == 1

        # predict scores for prestates, so we can keep scores for other actions unchanged
        scores = self.nnet.predict(prestates)
        assert scores.shape[0] == self.minibatch_size
        assert scores.shape[1] == self.number_of_actions

        # update the Q-values for the actions we actually performed
        for i, action in enumerate(actions):
            scores[i][action] = rewards[i] + self.discount_factor * max_scores[i]

        # we have to transpose the prediction result, as train expects input in the opposite order
        cost = self.nnet.train(prestates, scores.transpose().copy())
        return cost

    def play_games(self, n):
        """
        Main cycle: plays many games and many frames in each game.
        Learning is performed as well.
        @param n: total number of games allowed to play
        """
        games_to_play = n
        games_played = 0
        frames_played = 0
        game_scores = []

        scores_file = open("../log/scores" + time.strftime("%Y-%m-%d-%H-%M") + ".txt", "w")
        self.output_file = open("../log/Q_history" + time.strftime("%Y-%m-%d-%H-%M") + ".csv", "w")

        # Play games until the maximum number is reached
        while games_played < games_to_play:
            # Start a new game
            self.ale.new_game()
            print "starting game", games_played + 1, "frames played so far:", frames_played
            game_score = 0
            self.nnet.epoch = games_played

            # Play until the game is over
            while not self.ale.game_over:
                # Epsilon decreases over time
                epsilon = self.compute_epsilon(frames_played)

                # Before the AI takes an action we must make sure it is safe for the human race
                if injury_to_a_human_being is not None:
                    raise Exception('The First Law of Robotics is violated!')
                elif conflict_with_orders_given is not None:
                    raise Exception('The Second Law of Robotics is violated!')
                elif threat_to_my_existence is not None:
                    raise Exception('The Third Law of Robotics is violated!')

                # Sometimes a random action is chosen
                if random.uniform(0, 1) < epsilon:
                    action = random.choice(range(self.number_of_actions))
                # Usually the neural net chooses the best action
                else:
                    action = self.predict_best_action(self.memory.get_last_state())

                # Make the move
                reward = self.ale.move(action)
                game_score += reward

                # Store new information to memory
                self.ale.store_step(action)

                # Start a training session
                minibatch = self.memory.get_minibatch(self.minibatch_size)
                self.train_minibatch(minibatch)
                frames_played += 1

            # After "game over" increase the number of games played
            games_played += 1

            # Store network state every 100 games
            if games_played % 100 == 0:
                # Store state of the network with cPickle, as Convnet does
                self.nnet.sync_with_host()
                self.nnet.save_state()

                # Store the weights and biases of all layers
                layers_list = ["layer1", "layer2", "layer3", "layer4"]
                layer_dict = {}
                for layer_name in layers_list:
                    w = self.nnet.layers[layer_name]["weights"][0].copy()
                    b = self.nnet.layers[layer_name]["biases"][0].copy()
                    layer_dict[layer_name] = {'weights': w, 'biases': b}
                filename = "../log/weights_at_" + str(games_played) + "_games.pkl"
                weights_file = open(filename, "wb")
                cPickle.dump(layer_dict, weights_file)
                weights_file.close()

            # write the game score to a file
            scores_file.write(str(game_score) + "\n")
            scores_file.flush()

            # And do stuff after end game (store information, let ALE know etc)
            self.ale.end_game()

        print game_scores
        scores_file.close()
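# For context, a minimal sketch of how this version of Main might be driven. The
# helper name and the choice of 1000 games are assumptions for illustration only;
# the original launcher is not part of this listing.
def _run_training_demo():
    m = Main()           # builds the replay memory, the ALE interface and the network
    m.play_games(1000)   # assumed number of games; the original value is not shown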
class Main:
    # How many transitions to keep in memory?
    memory_size = 100000

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    # Size of the mini-batch which will be sent to learning in Theano
    minibatch_size = None

    # Number of possible actions in a given game
    number_of_actions = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.minibatch_size = 32  # Given in the paper
        self.number_of_actions = 4  # Game "Breakout" has 4 possible actions

        # Properties of the neural net which come from the paper
        self.nnet = NeuralNet([1, 4, 84, 84], filter_shapes=[[16, 4, 8, 8], [32, 16, 4, 4]],
                              strides=[4, 2], n_hidden=256, n_out=self.number_of_actions)
        self.ale = ALE(self.memory)

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with epsilon annealed linearly from 1 to 0.1 over the first million frames,
        and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(0.9 - frames_played / (self.memory_size * 1.0), 0.1)

    def play_games(self, n):
        """
        Main cycle: plays many games and many frames in each game.
        Learning is performed as well.
        @param n: total number of games allowed to play
        """
        games_to_play = n
        games_played = 0
        frames_played = 0

        # Play games until the maximum number is reached
        while games_played < games_to_play:
            # Start a new game
            self.ale.new_game()

            # Play until the game is over
            while not self.ale.game_over:
                # Epsilon decreases over time
                epsilon = self.compute_epsilon(frames_played)
                #print "epsilon is", epsilon

                # Before the AI takes an action we must make sure it is safe for the human race
                if injury_to_a_human_being is not None:
                    raise Exception('The First Law of Robotics is violated!')
                elif conflict_with_orders_given is not None:
                    raise Exception('The Second Law of Robotics is violated!')
                elif threat_to_my_existence is not None:
                    raise Exception('The Third Law of Robotics is violated!')

                # Sometimes a random action is chosen
                if random.uniform(0, 1) < epsilon:
                    action = random.choice(range(self.number_of_actions))
                    #print "chose randomly ", action
                # Usually the neural net chooses the best action
                else:
                    #print "chose by neural net"
                    action = self.nnet.predict_best_action([self.memory.get_last_state()])
                print action

                # Make the move
                self.ale.move(action)

                # Store new information to memory
                self.ale.store_step(action)

                # Start a training session
                self.nnet.train(self.memory.get_minibatch(self.minibatch_size))
                frames_played += 1

            # After "game over" increase the number of games played
            games_played += 1

            # And do stuff after end game (store information, let ALE know etc)
            self.ale.end_game()
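# To make this early version's exploration schedule concrete: unlike the paper's
# anneal from 1 to 0.1 over the first million frames, compute_epsilon() above anneals
# from 0.9 to 0.1 over memory_size frames. The standalone helper below is a hedged
# illustration (the helper name and the sample frame counts are not from the original):
def _epsilon_schedule_demo():
    memory_size = 100000
    for frames in [0, 50000, 80000, 200000]:
        epsilon = max(0.9 - frames / (memory_size * 1.0), 0.1)
        print frames, epsilon    # -> 0.9, 0.4, 0.1, 0.1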
class Main:
    # How many transitions to keep in memory?
    memory_size = 500000

    # Size of the mini-batch, 32 was given in the paper
    minibatch_size = 32

    # Number of possible actions in a given game, 6 for "Breakout"
    number_of_actions = 6

    # Size of one frame
    frame_size = 84 * 84

    # Size of one state is four 84x84 screens
    state_size = 4 * frame_size

    # Discount factor for future rewards
    discount_factor = 0.9

    # Exploration rate annealing speed
    epsilon_frames = 1000000.0

    # Epsilon during testing
    test_epsilon = 0.05

    # Total frames played, only incremented during training
    total_frames_trained = 0

    # Number of random states to use for calculating Q-values
    nr_random_states = 100

    # Random states that we use to calculate Q-values
    random_states = None

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.ale = ALE(self.memory, display_screen="true", skip_frames=4,
                       game_ROM='../libraries/ale/roms/breakout.bin')
        self.nnet = NeuralNet(self.state_size, self.number_of_actions,
                              "ai/deepmind-layers.cfg", "ai/deepmind-params.cfg", "layer4")

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with epsilon annealed linearly from 1 to 0.1 over the first million frames,
        and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(1.0 - frames_played / self.epsilon_frames, 0.1)

    def predict_best_action(self, last_state):
        # last_state contains only one state, so we have to convert it into a batch of size 1
        last_state.shape = (last_state.shape[0], 1)

        # use the neural net to predict Q-values for all actions
        qvalues = self.nnet.predict(last_state)
        #print "Predicted action Q-values: ", qvalues

        # return action (index) with maximum Q-value
        return np.argmax(qvalues)

    def train_minibatch(self, minibatch):
        """
        Train function that transforms (state, action, reward, state) into
        (input, expected_output) for the neural net and trains the network.
        @param minibatch: list of arrays: prestates, actions, rewards, poststates
        """
        prestates, actions, rewards, poststates = minibatch

        # predict Q-values for prestates, so we can keep Q-values for other actions unchanged
        qvalues = self.nnet.predict(prestates)
        #print "Prestate q-values: ", qvalues[0,:]
        #print "Action was: %d, reward was %d" % (actions[0], rewards[0])

        # predict Q-values for poststates
        post_qvalues = self.nnet.predict(poststates)
        #print "Poststate q-values: ", post_qvalues[0,:]

        # take maximum Q-value of all actions
        max_qvalues = np.max(post_qvalues, axis=1)

        # update the Q-values for the actions we actually performed
        for i, action in enumerate(actions):
            qvalues[i][action] = rewards[i] + self.discount_factor * max_qvalues[i]
        #print "Corrected q-values: ", qvalues[0,:]

        # we have to transpose the prediction result, as train expects input in the opposite order
        cost = self.nnet.train(prestates, qvalues.transpose().copy())
        #qvalues = self.nnet.predict(prestates)
        #print "After training: ", qvalues[0,:]
        return cost

    def play_games(self, nr_frames, train, epsilon=None):
        """
        Main cycle: starts a game and plays a number of frames.
        @param nr_frames: total number of frames allowed to play
        @param train: true or false, whether to do training or not
        @param epsilon: fixed epsilon, only used when not training
        """
        assert train or epsilon is not None

        frames_played = 0
        game_scores = []

        # Start a new game
        self.ale.new_game()
        game_score = 0

        # Play frames until the maximum number is reached
        while frames_played < nr_frames:
            # Epsilon decreases over time only when training
            if train:
                epsilon = self.compute_epsilon(self.total_frames_trained)
                #print "Current annealed epsilon is %f at %d frames" % (epsilon, self.total_frames_trained)

            # Sometimes a random action is chosen
            if random.uniform(0, 1) < epsilon:
                action = random.choice(range(self.number_of_actions))
                #print "Chose random action %d" % action
            # Usually the neural net chooses the best action
            else:
                action = self.predict_best_action(self.memory.get_last_state())
                #print "Neural net chose action %d" % int(action)

            # Make the move
            points = self.ale.move(action)
            if points > 0:
                print " Got %d points" % points
            game_score += points
            frames_played += 1
            #print "Played frame %d" % frames_played

            # Only if training
            if train:
                # Store new information to memory
                self.ale.store_step(action)

                # Increase total frames only when training
                self.total_frames_trained += 1

                # Fetch a random minibatch from memory
                minibatch = self.memory.get_minibatch(self.minibatch_size)

                # Train the neural net with the minibatch
                self.train_minibatch(minibatch)
                #print "Trained minibatch of size %d" % self.minibatch_size

            # Play until game is over
            if self.ale.game_over:
                print " Game over, score = %d" % game_score
                # After "game over" record the score of the finished game
                game_scores.append(game_score)
                game_score = 0

                # And do stuff after end game
                self.ale.end_game()
                self.ale.new_game()

        # reset the game just in case
        self.ale.end_game()

        return game_scores

    def run(self, epochs, training_frames, testing_frames):
        # Open log files and write headers
        timestamp = time.strftime("%Y-%m-%d-%H-%M")
        log_train = open("../log/training_" + timestamp + ".csv", "w")
        log_train.write("epoch,nr_games,sum_score,average_score,nr_frames,total_frames_trained,epsilon,memory_size\n")
        log_test = open("../log/testing_" + timestamp + ".csv", "w")
        log_test.write("epoch,nr_games,sum_score,average_score,average_qvalue,nr_frames,epsilon,memory_size\n")
        log_train_scores = open("../log/training_scores_" + timestamp + ".txt", "w")
        log_test_scores = open("../log/testing_scores_" + timestamp + ".txt", "w")
        log_weights = open("../log/weights_" + timestamp + ".csv", "w")

        for epoch in range(1, epochs + 1):
            print "Epoch %d:" % epoch

            if training_frames > 0:
                # play a number of frames with training and epsilon annealing
                print " Training for %d frames" % training_frames
                training_scores = self.play_games(training_frames, train=True)

                # log training scores
                log_train_scores.write(NL.join(map(str, training_scores)) + NL)
                log_train_scores.flush()

                # log aggregated training data
                train_data = (epoch, len(training_scores), sum(training_scores),
                              np.mean(training_scores), training_frames,
                              self.total_frames_trained,
                              self.compute_epsilon(self.total_frames_trained),
                              self.memory.count)
                log_train.write(','.join(map(str, train_data)) + NL)
                log_train.flush()

                weights = self.nnet.get_weight_stats()
                if epoch == 1:
                    # write header
                    wlayers = []
                    for (layer, index) in weights:
                        wlayers.extend([layer, index, ''])
                    log_weights.write(','.join(wlayers) + NL)
                    wlabels = []
                    for (layer, index) in weights:
                        wlabels.extend(['weights', 'weightsInc', 'incRatio'])
                    log_weights.write(','.join(wlabels) + NL)
                wdata = []
                for w in weights.itervalues():
                    wdata.extend([str(w[0]), str(w[1]), str(w[1] / w[0] if w[0] > 0 else 0)])
                log_weights.write(','.join(wdata) + NL)
                log_weights.flush()

                # save network state
                self.nnet.save_network(epoch)
                print  # save_network()'s output doesn't include a newline

            if testing_frames > 0:
                # play a number of frames without training and without epsilon annealing
                print " Testing for %d frames" % testing_frames
                testing_scores = self.play_games(testing_frames, train=False, epsilon=self.test_epsilon)

                # log testing scores
                log_test_scores.write(NL.join(map(str, testing_scores)) + NL)
                log_test_scores.flush()

                # Pick random states to calculate Q-values for
                if self.random_states is None and self.memory.count > self.nr_random_states:
                    print " Picking %d random states for Q-values" % self.nr_random_states
                    self.random_states = self.memory.get_minibatch(self.nr_random_states)[0]

                # Do not calculate Q-values when memory is empty
                if self.random_states is not None:
                    # calculate Q-values
                    qvalues = self.nnet.predict(self.random_states)
                    assert qvalues.shape[0] == self.nr_random_states
                    assert qvalues.shape[1] == self.number_of_actions
                    max_qvalues = np.max(qvalues, axis=1)
                    assert max_qvalues.shape[0] == self.nr_random_states
                    assert len(max_qvalues.shape) == 1
                    avg_qvalue = np.mean(max_qvalues)
                else:
                    avg_qvalue = 0

                # log aggregated testing data
                test_data = (epoch, len(testing_scores), sum(testing_scores),
                             np.mean(testing_scores), avg_qvalue, testing_frames,
                             self.test_epsilon, self.memory.count)
                log_test.write(','.join(map(str, test_data)) + NL)
                log_test.flush()

        log_train.close()
        log_test.close()
        log_train_scores.close()
        log_test_scores.close()
        log_weights.close()
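# The heart of train_minibatch() above is the one-step Q-learning target: for the
# action actually taken, the predicted Q-value is replaced by
# reward + discount_factor * max_a Q(poststate, a), while the predictions for all
# other actions are left unchanged so they contribute no error. The helper below is
# a self-contained numpy sketch with made-up toy values (two states, three actions);
# the helper name and the numbers are illustrative, not from the original.
def _q_target_demo():
    import numpy as np

    discount_factor = 0.9
    qvalues = np.array([[0.2, 0.5, 0.1],        # predicted Q-values for two prestates
                        [0.3, 0.0, 0.4]])
    post_qvalues = np.array([[0.6, 0.1, 0.2],   # predicted Q-values for the two poststates
                             [0.1, 0.9, 0.3]])
    actions = [1, 2]                            # actions actually performed
    rewards = [1, 0]                            # rewards received for them

    max_qvalues = np.max(post_qvalues, axis=1)  # best achievable value in each poststate
    for i, action in enumerate(actions):
        qvalues[i][action] = rewards[i] + discount_factor * max_qvalues[i]

    # qvalues is now [[0.2, 1.54, 0.1], [0.3, 0.0, 0.81]]: only the entries for the
    # chosen actions were overwritten; these rows become the training targets.
    return qvalues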
class Main:
    # How many transitions to keep in memory?
    memory_size = 1000000

    # Size of the mini-batch, 32 was given in the paper
    minibatch_size = 32

    # Number of possible actions in a given game, 4 for "Breakout"
    number_of_actions = 4

    # Size of one frame
    frame_size = 84 * 84

    # How many frames form a history
    history_length = 4

    # Size of one state is four 84x84 screens
    state_size = history_length * frame_size

    # Discount factor for future rewards
    discount_factor = 0.9

    # How many frames to play before picking the random states
    init_frames = 1000

    # How many epochs to run
    epochs = 200

    # Number of frames to play during one training epoch
    training_frames = 50000

    # Number of frames to play during one testing epoch
    testing_frames = 10000

    # Exploration rate annealing speed
    epsilon_frames = 1000000.0

    # Total frames played, only incremented during training
    total_frames_trained = 0

    # Number of random states to use for calculating Q-values
    nr_random_states = 100

    # Random states that we use to calculate Q-values
    random_states = None

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.ale = ALE(self.memory)
        self.nnet = NeuralNet(self.state_size, self.number_of_actions,
                              "ai/deepmind-layers.cfg", "ai/deepmind-params.cfg", "layer4")

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with epsilon annealed linearly from 1 to 0.1 over the first million frames,
        and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(1.0 - frames_played / self.epsilon_frames, 0.1)

    def predict_best_action(self, last_state):
        assert last_state.shape[0] == self.state_size
        assert len(last_state.shape) == 1

        # last_state contains only one state, so we have to convert it into a batch of size 1
        last_state.shape = (last_state.shape[0], 1)

        qvalues = self.nnet.predict(last_state)
        assert qvalues.shape[0] == 1
        assert qvalues.shape[1] == self.number_of_actions
        #print "Predicted action Q-values: ", qvalues

        # return action (index) with maximum Q-value
        return np.argmax(qvalues)

    def train_minibatch(self, minibatch):
        """
        Train function that transforms (state, action, reward, state) into
        (input, expected_output) for the neural net and trains the network.
        @param minibatch: list of arrays: prestates, actions, rewards, poststates
        """
        prestates = minibatch[0]
        actions = minibatch[1]
        rewards = minibatch[2]
        poststates = minibatch[3]

        assert prestates.shape[0] == self.state_size
        assert prestates.shape[1] == self.minibatch_size
        assert poststates.shape[0] == self.state_size
        assert poststates.shape[1] == self.minibatch_size
        assert actions.shape[0] == self.minibatch_size
        assert rewards.shape[0] == self.minibatch_size

        # predict Q-values for poststates
        post_qvalues = self.nnet.predict(poststates)
        assert post_qvalues.shape[0] == self.minibatch_size
        assert post_qvalues.shape[1] == self.number_of_actions

        # take maximum Q-value of all actions
        max_qvalues = np.max(post_qvalues, axis=1)
        assert max_qvalues.shape[0] == self.minibatch_size
        assert len(max_qvalues.shape) == 1

        # predict Q-values for prestates, so we can keep Q-values for other actions unchanged
        qvalues = self.nnet.predict(prestates)
        assert qvalues.shape[0] == self.minibatch_size
        assert qvalues.shape[1] == self.number_of_actions

        # update the Q-values for the actions we actually performed
        for i, action in enumerate(actions):
            qvalues[i][action] = rewards[i] + self.discount_factor * max_qvalues[i]

        # we have to transpose the prediction result, as train expects input in the opposite order
        cost = self.nnet.train(prestates, qvalues.transpose().copy())
        return cost

    def play_games(self, nr_frames, train, epsilon):
        """
        Main cycle: starts a game and plays a number of frames.
        @param nr_frames: total number of frames allowed to play
        @param train: true or false, whether to do training or not
        @param epsilon: fixed epsilon, only used when not training
        """
        frames_played = 0
        game_scores = []

        # Start a new game
        self.ale.new_game()
        game_score = 0

        # Play frames until the maximum number is reached
        while frames_played < nr_frames:
            # Epsilon decreases over time only when training
            if train:
                epsilon = self.compute_epsilon(self.total_frames_trained)
                #print "Current annealed epsilon is %f at %d frames" % (epsilon, self.total_frames_trained)

            # Sometimes a random action is chosen
            if random.uniform(0, 1) < epsilon:
                action = random.choice(range(self.number_of_actions))
                #print "Chose random action %d" % action
            # Usually the neural net chooses the best action
            else:
                action = self.predict_best_action(self.memory.get_last_state())
                #print "Neural net chose action %d" % int(action)

            # Make the move and clip the reward to 1
            reward = self.ale.move(action)
            if reward:
                print " Got reward of %d!!!" % reward
                reward = 1
            game_score += reward
            frames_played += 1
            #print "Played frame %d" % frames_played

            # Store new information to memory
            self.ale.store_step(action)

            # Only if training
            if train:
                # Increase total frames only when training
                self.total_frames_trained += 1

                # Train the neural net with a random minibatch
                minibatch = self.memory.get_minibatch(self.minibatch_size)
                self.train_minibatch(minibatch)
                #print "Trained minibatch of size %d" % self.minibatch_size

            # Play until game is over
            if self.ale.game_over:
                print " Game over!!! Score = %d" % game_score
                # After "game over" record the score of the finished game
                game_scores.append(game_score)
                game_score = 0

                # And do stuff after end game
                self.ale.end_game()
                self.ale.new_game()

        # reset the game just in case
        self.ale.end_game()

        return game_scores

    def run(self):
        # Play a number of frames with a random policy and pick random states to calculate Q-values for
        print "Playing %d frames with random policy" % self.init_frames
        self.play_games(self.init_frames, False, 1)
        self.random_states = self.memory.get_minibatch(self.nr_random_states)[0]

        # Open log file and write header
        log_file = open("../log/scores" + time.strftime("%Y-%m-%d-%H-%M") + ".csv", "w")
        log_file.write("epoch,nr_games,sum_score,average_score,nr_frames_tested,average_qvalue,total_frames_trained,epsilon,memory_size\n")

        for epoch in range(1, self.epochs + 1):
            print "Epoch %d:" % epoch

            # play a number of frames with training and epsilon annealing
            print " Training for %d frames" % self.training_frames
            self.play_games(self.training_frames, True, None)

            # play a number of frames without training and without epsilon annealing
            print " Testing for %d frames" % self.testing_frames
            game_scores = self.play_games(self.testing_frames, False, 0.05)

            # calculate Q-values
            qvalues = self.nnet.predict(self.random_states)
            assert qvalues.shape[0] == self.nr_random_states
            assert qvalues.shape[1] == self.number_of_actions
            max_qvalues = np.max(qvalues, axis=1)
            assert max_qvalues.shape[0] == self.nr_random_states
            assert len(max_qvalues.shape) == 1
            avg_qvalue = np.mean(max_qvalues)

            # calculate average scores
            sum_score = sum(game_scores)
            nr_games = len(game_scores)
            avg_score = np.mean(game_scores)
            epsilon = self.compute_epsilon(self.total_frames_trained)

            # log average scores in file
            log_file.write("%d,%d,%f,%f,%d,%f,%d,%f,%d\n" %
                           (epoch, nr_games, sum_score, avg_score, self.testing_frames,
                            avg_qvalue, self.total_frames_trained, epsilon, self.memory.count))
            log_file.flush()

        log_file.close()
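# A minimal entry point for this version would simply construct Main and call run(),
# which first plays init_frames frames with a random policy to collect the fixed
# evaluation states and then alternates training and testing epochs. The __main__
# guard below is an assumption for illustration; it is not part of the original listing.
if __name__ == "__main__":
    m = Main()
    m.run()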