def playByPolicy(Q, maxPerEpisode):
    tetrominos = createTetrominos()
    board = Board(5, 3)
    board.printBoard()
    col = 0
    rot = 0

    for j in range(maxPerEpisode):
        tetromino = util.randChoice(tetrominos)

        # Moves come in the format [columnIndex, rotationIndex]
        possibleMoves = tetromino.getPossibleMoves(board)

        # Game over condition
        if len(possibleMoves) == 0:
            print("GAME OVER")
            print("Lines cleared: ", board.linesCleared)
            return

        s = util.strState(board.board, tetromino.shape)

        # Check if Q(s, :) exists; follow the policy if it does,
        # otherwise move randomly
        if s in Q:
            [col, rot] = util.epsilonGreedy(Q[s], -1, possibleMoves)
        else:
            [col, rot] = util.randChoice(possibleMoves)

        tetromino.printShape(rot)

        # Perform action and collect reward
        r = board.act(tetromino, col, rot)
        board.printBoard()

    print("Maximum number of moves reached: ", maxPerEpisode)
    print("Lines cleared: ", board.linesCleared)
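# util.strState is referenced above but not defined in this section. A
# hypothetical sketch: serialise the board cells and the current piece's
# shape into a hashable string, used as the key into the Q dictionary.
# The project's real helper may differ.
def strState(boardArr, shape):
    return (str(np.asarray(boardArr).ravel().tolist())
            + str(np.asarray(shape).ravel().tolist()))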
def learn(nrows, ncols, maxPerEpisode, batchSize, nGames):
    # Tetris initialisations
    tetrominos = createTetrominos()
    board = Board(nrows, ncols)
    board.reset()
    avgs = []

    tShape = tetrominos[0].paddedRotations[0]
    tShapeRows, tShapeCols = tShape.shape[0], tShape.shape[1]
    inputLayerDim = [None, board.nrows + tShapeRows, board.ncols, 1]
    actionsDim = board.ncols * 4

    tf.reset_default_graph()  # Clear the Tensorflow graph.

    myAgent = agent(lr=1e-2, s_size=inputLayerDim,
                    a_size=actionsDim, h_size=32)  # Load the agent.

    total_episodes = nGames  # Total number of episodes to train the agent on.
    max_ep = maxPerEpisode
    update_frequency = batchSize

    init = tf.global_variables_initializer()

    # Launch the tensorflow graph
    with tf.Session() as sess:
        sess.run(init)
        i = 0
        total_reward = []
        total_length = []

        gradBuffer = sess.run(tf.trainable_variables())
        for ix, grad in enumerate(gradBuffer):
            gradBuffer[ix] = grad * 0

        while i < total_episodes:
            board.reset()
            tetromino = util.randChoice(tetrominos)
            s = util.cnnState(board, tetromino.paddedRotations[0])
            running_reward = 0
            ep_history = []

            for j in range(max_ep):
                if j == max_ep - 1:
                    print("reached maximum at episode ", i, " with ", running_reward)

                possibleMoves = tetromino.getPossibleMoves(board)
                d = (len(possibleMoves) == 0)

                if d:
                    # Episode over: update the network.
                    ep_history = np.array(ep_history)
                    ep_history[:, 2] = discount_rewards(ep_history[:, 2])
                    feed_dict = {
                        myAgent.reward_holder: ep_history[:, 2],
                        myAgent.action_holder: ep_history[:, 1],
                        myAgent.state_in: np.vstack(ep_history[:, 0])
                    }
                    grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
                    for idx, grad in enumerate(grads):
                        gradBuffer[idx] += grad

                    if i % update_frequency == 0 and i != 0:
                        feed_dict = dict(zip(myAgent.gradient_holders, gradBuffer))
                        _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)
                        for ix, grad in enumerate(gradBuffer):
                            gradBuffer[ix] = grad * 0

                    total_reward.append(running_reward)
                    total_length.append(j)
                    break

                bool_moves = [(x in possibleMoves) for x in range(actionsDim)]

                # Probabilistically pick an action given our network outputs,
                # with invalid moves masked out and the rest renormalised.
                o, a_dist = sess.run(
                    [myAgent.output, myAgent.valid_moves],
                    feed_dict={myAgent.state_in: s, myAgent.p: [bool_moves]})
                softmax_a_dist = [a_dist[0] / sum(a_dist[0])]
                a = np.random.choice(actionsDim, p=softmax_a_dist[0])

                rot, col = divmod(a, board.ncols)
                r = board.act(tetromino, col, rot)

                # Random Tetromino for next state
                nextTetromino = util.randChoice(tetrominos)
                s1 = util.cnnState(board, nextTetromino.paddedRotations[0])
                ep_history.append([s, a, r, s1])
                s = s1
                tetromino = nextTetromino
                running_reward += r  # Update our running tally of scores.

            if i % 100 == 0:
                current_avg = np.mean(total_reward[-100:])
                print(i, ' : ', current_avg)
                avgs.append(current_avg)
            i += 1

    return avgs
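# The episode update above calls discount_rewards, which is not defined in
# this section. A minimal sketch, assuming the conventional backwards
# discounting with a module-level default for gamma (the project's own
# helper may differ):
def discount_rewards(r, gamma=0.99):
    """Backwards-discounted returns: out[t] = sum_k gamma**k * r[t + k]."""
    discounted = np.zeros_like(r, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(r))):
        running = running * gamma + r[t]
        discounted[t] = running
    return discounted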
def learn(epsilon, gamma, alpha, nGames, isRand, getAvgs):
    Q = {}
    tetrominos = createTetrominos()
    board = Board(5, 3)
    totalLinesCleared = 0
    col = 0
    rot = 0
    avgs = []

    for i in range(nGames):
        board.reset()
        tetromino = util.randChoice(tetrominos)

        while True:
            # Moves are flat action indices; divmod(move, ncols)
            # recovers (rotationIndex, columnIndex)
            possibleMoves = tetromino.getPossibleMoves(board)

            # Game over condition
            if len(possibleMoves) == 0:
                break

            if isRand:
                [rot, col] = divmod(util.randChoice(possibleMoves), board.ncols)
            else:
                s = util.strState(board.board, tetromino.shape)

                # Check if Q(s, :) exists, create if not
                if s not in Q:
                    Q[s] = np.zeros((board.ncols, len(tetromino.rotations)))
                    [rot, col] = divmod(util.randChoice(possibleMoves),
                                        board.ncols)
                else:
                    [rot, col] = divmod(
                        util.epsilonGreedy(Q[s], epsilon, possibleMoves),
                        board.ncols)

            # Perform action and collect reward
            r = board.act(tetromino, col, rot)

            # Random Tetromino for next state
            nextTetromino = util.randChoice(tetrominos)

            if not isRand:
                s1 = util.strState(board.board, nextTetromino.shape)

                # Check if Q(s1, :) exists, create if not
                if s1 not in Q:
                    Q[s1] = np.zeros((board.ncols, len(nextTetromino.rotations)))

                # Q-learning value function update
                Q[s][col][rot] += alpha * (
                    r + gamma * np.amax(Q[s1]) - Q[s][col][rot])

            tetromino = nextTetromino

        totalLinesCleared += board.linesCleared
        if (i + 1) % 10 == 0:
            avgs.append(totalLinesCleared / (i + 1))

    avg = totalLinesCleared / nGames
    avgs.append(avg)

    if getAvgs:
        return avgs
    else:
        return Q
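# A hedged usage sketch tying the tabular learner above to playByPolicy:
# train a Q-table, then play greedily from it. The parameter values below
# are illustrative assumptions, not the project's tuned settings.
#
#   Q = learn(epsilon=0.1, gamma=0.9, alpha=0.5, nGames=10000,
#             isRand=False, getAvgs=False)
#   playByPolicy(Q, maxPerEpisode=200)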
def learn(epsilon, gamma, alpha, nGames, nRows, nCols):
    print(epsilon, gamma, alpha, nGames)
    tetrominos = createTetrominos()
    board = Board(nRows, nCols)
    tShapeRows, tShapeCols = tuple(
        map(operator.add, tetrominos[0].shape.shape, (1, 1)))
    inputLayerDim = (board.nrows * board.ncols) + (tShapeRows * tShapeCols)
    actionsDim = board.ncols * 4

    # Tensorflow network initialisation
    tf.reset_default_graph()

    # These lines establish the feed-forward part of the network used
    # to choose actions
    inputs1 = tf.placeholder(
        shape=[None, board.nrows + tShapeRows, board.ncols, 1],
        dtype=tf.float32)
    conv1 = tf.layers.conv2d(inputs=inputs1, filters=16, kernel_size=[2, 2])
    conv2 = tf.layers.conv2d(inputs=conv1, filters=32, kernel_size=[2, 2])
    flatten_layer = tf.contrib.layers.flatten(conv2)
    dense_connected_layer = tf.contrib.layers.fully_connected(
        flatten_layer,
        256,
        activation_fn=tf.nn.relu,
        weights_initializer=tf.contrib.layers.xavier_initializer(),
        biases_initializer=None)
    output_layer = tf.contrib.layers.fully_connected(
        dense_connected_layer,
        actionsDim,
        activation_fn=None,
        weights_initializer=tf.contrib.layers.xavier_initializer(),
        biases_initializer=None)

    # Mask invalid moves with a large negative value before the argmax
    p = tf.placeholder(tf.bool, [1, actionsDim])
    invalidMoves = tf.constant(-1000., shape=[1, actionsDim])
    validMoves = tf.where(p, output_layer, invalidMoves)
    predict = tf.argmax(validMoves, 1)

    # Below we obtain the loss by taking the sum of squares difference between
    # the target and prediction Q values.
    nextQ = tf.placeholder(shape=[1, actionsDim], dtype=tf.float32)
    loss = tf.reduce_sum(tf.square(nextQ - output_layer))
    trainer = tf.train.AdamOptimizer(learning_rate=0.0025)
    updateModel = trainer.minimize(loss)

    init = tf.global_variables_initializer()

    totalLinesCleared = 0
    col = 0
    rot = 0
    avgs = []

    with tf.Session() as sess:
        sess.run(init)
        for i in range(nGames):
            print(i)
            board.reset()
            tetromino = util.randChoice(tetrominos)

            while True:
                possibleMoves = tetromino.getPossibleMoves(board)

                # Game over condition
                if len(possibleMoves) == 0:
                    break

                # Greedy action from the network (invalid moves masked),
                # with epsilon-greedy exploration over the valid moves.
                boolMoves = [(x in possibleMoves) for x in range(actionsDim)]
                s = util.cnnState(board, tetromino.paddedRotations[0])
                a, allQ = sess.run(
                    [predict, output_layer],
                    feed_dict={inputs1: s, p: [boolMoves]})
                a = a[0]
                if np.random.rand(1) < epsilon:
                    a = util.randChoice(possibleMoves)
                rot, col = divmod(a, board.ncols)

                # Perform action and collect reward
                r = board.act(tetromino, col, rot)

                # Random Tetromino for next state
                nextTetromino = util.randChoice(tetrominos)
                s1 = util.cnnState(board, nextTetromino.paddedRotations[0])
                Q1 = sess.run(output_layer, feed_dict={inputs1: s1})

                # Obtain maxQ' and set our target value for the chosen action,
                # discounting by gamma.
                maxQ1 = np.max(Q1)
                targetQ = allQ
                targetQ[0, a] = r + gamma * maxQ1

                # Train our network using target and predicted Q values
                sess.run(updateModel, feed_dict={inputs1: s, nextQ: targetQ})

                tetromino = nextTetromino

            totalLinesCleared += board.linesCleared
            if (i + 1) % 100 == 0:
                avgs.append(totalLinesCleared / 100)
                totalLinesCleared = 0

    return avgs
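# util.cnnState is used above but not shown in this section. A hypothetical
# sketch consistent with the [None, nrows + tShapeRows, ncols, 1] input
# shape: stack the padded piece above the board as one single-channel
# image. The stacking order and padding are assumptions; the project's
# real helper may differ.
def cnnState(board, paddedShape):
    stacked = np.vstack([np.asarray(paddedShape), np.asarray(board.board)])
    return stacked.reshape(1, stacked.shape[0], stacked.shape[1], 1)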
def work(self, max_episode_length, gamma, global_AC, sess, coord, saveFreq):
    episode_count = sess.run(self.global_episodes)
    total_steps = 0
    actions_list = np.arange(self.a_size)
    print("Starting worker " + str(self.number))
    tetrominos = createTetrominos()
    n_tetrominos = len(tetrominos) - 1

    with sess.as_default(), sess.graph.as_default():
        while not coord.should_stop():
            sess.run(self.update_local_ops)
            episode_buffer = []
            episode_values = []
            episode_frames = []
            episode_reward = 0
            episode_step_count = 0
            d = False

            self.board.reset()
            tetromino_idx = randint(0, n_tetrominos)
            tetromino_AC = np.reshape(tetromino_idx, (1, 1))
            tetromino = tetrominos[tetromino_idx]
            possibleMoves = tetromino.getPossibleMoves(self.board)
            s = util.a3cState(self.board)
            episode_frames.append(s)

            while True:
                # Take an action using probabilities from the policy network output.
                a_dist, v = sess.run(
                    [self.local_AC.policy, self.local_AC.value],
                    feed_dict={
                        self.local_AC.imageIn: s,
                        self.local_AC.tetromino: tetromino_AC
                    })

                # Zero out the probability of invalid moves before sampling.
                valid_moves = [
                    x if i in possibleMoves else 0.
                    for i, x in enumerate(a_dist[0])
                ]

                sum_v = sum(valid_moves)
                if sum_v == 0:
                    # The policy assigns zero mass to every valid move;
                    # fall back to a uniformly random valid action.
                    a = util.randChoice(possibleMoves)
                else:
                    softmax_a_dist = [valid_moves / sum_v]
                    a = np.random.choice(actions_list, p=softmax_a_dist[0])

                rot, col = divmod(a, self.board.ncols)
                r = self.board.act(tetromino, col, rot)

                nextTetrominoIdx = randint(0, n_tetrominos)
                nextTetromino = tetrominos[nextTetrominoIdx]
                nextTetromino_AC = np.reshape(nextTetrominoIdx, (1, 1))
                s1 = util.a3cState(self.board)

                possibleMoves = nextTetromino.getPossibleMoves(self.board)
                d = (len(possibleMoves) == 0)

                episode_frames.append(s1)
                episode_buffer.append([s, a, r, s1, d, v[0, 0], tetromino_AC])
                episode_values.append(v[0, 0])

                tetromino = nextTetromino
                tetromino_idx = nextTetrominoIdx
                tetromino_AC = nextTetromino_AC
                episode_reward += r
                s = s1
                total_steps += 1
                episode_step_count += 1

                # If the episode hasn't ended but the experience buffer is full,
                # make an update step using that experience rollout.
                if len(episode_buffer) == 30 and not d:
                    # Since we don't know what the true final return is, we
                    # "bootstrap" from our current value estimation.
                    v1 = sess.run(
                        self.local_AC.value,
                        feed_dict={
                            self.local_AC.imageIn: s,
                            self.local_AC.tetromino: tetromino_AC
                        })[0, 0]
                    v_l, p_l, e_l, g_n, v_n = self.train(
                        global_AC, episode_buffer, sess, gamma, v1)
                    episode_buffer = []
                    sess.run(self.update_local_ops)

                if episode_step_count >= max_episode_length - 1:
                    print("reached max")
                    break
                elif d:
                    break

            self.episode_rewards.append(episode_reward)
            self.episode_lengths.append(episode_step_count)
            self.episode_mean_values.append(np.mean(episode_values))

            # Update the network using the experience buffer at the end of the episode.
            if len(episode_buffer) != 0:
                v_l, p_l, e_l, g_n, v_n = self.train(
                    global_AC, episode_buffer, sess, gamma, 0.0)

            # Periodically report summary statistics.
            if episode_count % saveFreq == 0 and episode_count != 0:
                mean_reward = np.mean(self.episode_rewards[-saveFreq:])
                mean_length = np.mean(self.episode_lengths[-saveFreq:])
                mean_value = np.mean(self.episode_mean_values[-saveFreq:])
                print(mean_reward)

            if self.name == 'worker_0':
                sess.run(self.increment)
            episode_count += 1
def learn(epsilon, gamma, alpha, nGames, getAvgs):
    tetrominos = createTetrominos()
    board = Board(5, 3)
    tShapeRows, tShapeCols = tuple(
        map(operator.add, tetrominos[0].shape.shape, (1, 1)))
    inputLayerDim = (board.nrows * board.ncols) + (tShapeRows * tShapeCols)
    actionsDim = board.ncols * 4

    # Tensorflow network initialisation
    tf.reset_default_graph()

    # These lines establish the feed-forward part of the network used
    # to choose actions
    inputs1 = tf.placeholder(shape=[1, inputLayerDim], dtype=tf.float32)
    W = tf.Variable(tf.zeros([inputLayerDim, actionsDim]))
    p = tf.placeholder(tf.bool, [1, actionsDim])
    Qout = tf.matmul(inputs1, W)

    # Replace the Q-values of invalid moves by -100 before the argmax
    invalidMoves = tf.constant(-100., shape=[1, actionsDim])
    validMoves = tf.where(p, Qout, invalidMoves)
    predict = tf.argmax(validMoves, 1)

    # Below we obtain the loss by taking the sum of squares difference between
    # the target and prediction Q values.
    nextQ = tf.placeholder(shape=[1, actionsDim], dtype=tf.float32)
    loss = tf.reduce_sum(tf.square(nextQ - Qout))
    trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
    updateModel = trainer.minimize(loss)

    init = tf.global_variables_initializer()

    totalLinesCleared = 0
    col = 0
    rot = 0
    avgs = []

    with tf.Session() as sess:
        sess.run(init)
        for i in range(nGames):
            print(i)
            board.reset()
            tetromino = util.randChoice(tetrominos)

            while True:
                possibleMoves = tetromino.getPossibleMoves(board)

                # Game over condition
                if len(possibleMoves) == 0:
                    break

                # Greedy action from the network (invalid moves masked),
                # with epsilon-greedy exploration over the valid moves.
                boolMoves = [(x in possibleMoves) for x in range(actionsDim)]
                s = util.networkState(board.board, tetromino.paddedRotations[0])
                a, allQ = sess.run(
                    [predict, Qout],
                    feed_dict={inputs1: s, p: [boolMoves]})
                a = a[0]
                if np.random.rand(1) < epsilon:
                    a = util.randChoice(possibleMoves)
                rot, col = divmod(a, board.ncols)

                # Perform action and collect reward
                r = board.act(tetromino, col, rot)

                # Random Tetromino for next state
                nextTetromino = util.randChoice(tetrominos)
                s1 = util.networkState(board.board,
                                       nextTetromino.paddedRotations[0])
                Q1 = sess.run(Qout, feed_dict={inputs1: s1})

                # Obtain maxQ' and set our target value for the chosen action,
                # discounting by gamma.
                maxQ1 = np.max(Q1)
                targetQ = allQ
                targetQ[0, a] = r + gamma * maxQ1

                # Train our network using target and predicted Q values
                sess.run(updateModel, feed_dict={inputs1: s, nextQ: targetQ})

                tetromino = nextTetromino

            totalLinesCleared += board.linesCleared
            if (i + 1) % 10 == 0:
                avgs.append(totalLinesCleared / (i + 1))

    avg = totalLinesCleared / nGames
    avgs.append(avg)
    return avgs
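# util.networkState is likewise not shown here. A hypothetical sketch
# matching inputLayerDim = nrows * ncols + tShapeRows * tShapeCols: flatten
# the board and the padded piece into a single row vector for the linear
# network. The project's real helper may differ.
def networkState(boardArr, paddedShape):
    flat = np.concatenate([np.asarray(boardArr).ravel(),
                           np.asarray(paddedShape).ravel()])
    return flat.reshape(1, -1)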
def learn(epsilon, gamma, alpha, nGames, getAvgs):
    tetrominos = createTetrominos()
    board = Board(5, 3)
    board.reset()
    tShapeRows, tShapeCols = tuple(
        map(operator.add, tetrominos[0].shape.shape, (1, 1)))
    inputLayerDim = (board.nrows * board.ncols) + (tShapeRows * tShapeCols)
    actionsDim = board.ncols * 4

    # Policy network: one hidden layer of H units, sigmoid outputs.
    # H (hidden layer size) and batch_size are assumed to be module-level
    # hyperparameters defined elsewhere in this file.
    input_layer = tf.placeholder(shape=[None, inputLayerDim], dtype=tf.float32)
    hidden_layer = slim.fully_connected(
        input_layer,
        H,
        activation_fn=tf.nn.relu,
        weights_initializer=tf.contrib.layers.xavier_initializer(),
        biases_initializer=None)
    output_layer = slim.fully_connected(
        hidden_layer,
        actionsDim,
        activation_fn=tf.nn.sigmoid,
        weights_initializer=tf.contrib.layers.xavier_initializer(),
        biases_initializer=None)

    actions = tf.placeholder(shape=[None], dtype=tf.int32)
    rewards = tf.placeholder(shape=[None, 1], dtype=tf.float32)
    actions_onehot = tf.one_hot(actions, actionsDim)
    responsible_outputs = tf.reduce_sum(output_layer * actions_onehot, [1])
    # Flatten rewards ([None, 1]) so the elementwise product with
    # responsible_outputs ([None]) doesn't broadcast to a matrix.
    loss = -tf.reduce_mean(
        tf.log(responsible_outputs) * tf.reshape(rewards, [-1]))

    # Zero out the outputs of invalid moves before picking an action
    p = tf.placeholder(tf.bool, [1, actionsDim])
    invalidMoves = tf.constant(0., shape=[1, actionsDim])
    validMoves = tf.where(p, output_layer, invalidMoves)
    predict = tf.argmax(validMoves, 1)

    # Placeholders for externally accumulated gradients, applied in batches.
    w_variables = tf.trainable_variables()
    gradients = []
    for indx, w in enumerate(w_variables):
        w_holder_var = tf.placeholder(tf.float32, name="w_" + str(indx))
        gradients.append(w_holder_var)

    all_gradients = tf.gradients(loss, tf.trainable_variables())
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-2)
    apply_grads = optimizer.apply_gradients(zip(gradients, w_variables))

    totalLinesCleared = 0
    col = 0
    rot = 0
    avgs = []
    prev_x = None  # used in computing the difference frame
    xs, hs, dlogps, drs, all_game_scores = [], [], [], [], []
    running_reward = None
    reward_sum = 0
    episode_number = 0

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        grad_buffer = sess.run(tf.trainable_variables())
        for indx, grad in enumerate(grad_buffer):
            grad_buffer[indx] = grad * 0

        while episode_number < 100:
            tetromino = util.randChoice(tetrominos)
            possibleMoves = tetromino.getPossibleMoves(board)

            if len(possibleMoves) > 0:
                boolMoves = [(x in possibleMoves) for x in range(actionsDim)]
                cur_x = util.pgState(board.board, tetromino.paddedRotations[0])
                x = cur_x - prev_x if prev_x is not None else np.zeros((8, 3))
                prev_x = cur_x

                a, o, h = sess.run(
                    [predict, validMoves, hidden_layer],
                    feed_dict={
                        input_layer: np.reshape(x, (1, inputLayerDim)),
                        p: [boolMoves]
                    })
                # Sample an action index in proportion to the masked outputs.
                a = np.random.choice(actionsDim, 1, p=o[0] / sum(o[0]))[0]
                rot, col = divmod(a, board.ncols)

                xs.append(np.reshape(x, (1, inputLayerDim)))  # observation
                hs.append(h)
                dlogps.append(a)

                # Perform action and collect reward
                r = board.act(tetromino, col, rot)
                reward_sum += r
                # Record the reward after acting so it pairs with this action
                drs.append(r)
            else:
                # Game over: close out the episode.
                episode_number += 1

                # Stack together all inputs, hidden states, action indices,
                # and rewards for this episode.
                epx = np.vstack(xs)
                eph = np.vstack(hs)
                epdlogp = np.vstack(dlogps)
                epr = np.vstack(drs)
                xs, hs, dlogps, drs = [], [], [], []  # reset array memory

                # Compute the discounted reward backwards through time, then
                # standardise it (helps control gradient estimator variance).
                discounted_epr = discount_rewards(epr, gamma)
                discounted_epr -= np.mean(discounted_epr)
                discounted_epr /= np.std(discounted_epr)

                grads = sess.run(
                    all_gradients,
                    feed_dict={
                        input_layer: epx,
                        rewards: discounted_epr,
                        actions: epdlogp.ravel()
                    })
                for indx, grad in enumerate(grads):
                    grad_buffer[indx] += grad

                # Apply the accumulated gradients every batch_size episodes
                if episode_number % batch_size == 0:
                    print("updating weights of the network")
                    feed_dict = dict(zip(gradients, grad_buffer))
                    sess.run(apply_grads, feed_dict=feed_dict)
                    for indx, grad in enumerate(grad_buffer):
                        grad_buffer[indx] = grad * 0  # reset batch gradient buffer

                # Book-keeping
                running_reward = (reward_sum if running_reward is None
                                  else running_reward * 0.99 + reward_sum * 0.01)
                print('resetting env. episode reward %f. running mean: %f'
                      % (reward_sum, running_reward))
                all_game_scores.append(reward_sum)
                avgs.append(running_reward)
                reward_sum = 0
                board.reset()  # reset env
                prev_x = None

    return avgs