Example #1
def playByPolicy(Q, maxPerEpisode):
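    """Play Tetris on a 5x3 board by following a learned Q-table.

    Actions are taken via util.epsilonGreedy with epsilon = -1 (i.e. effectively
    greedily) when the current state exists in Q, and at random otherwise,
    for at most maxPerEpisode moves or until the game is over.
    """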
    tetrominos = createTetrominos()
    board = Board(5, 3)
    board.printBoard()
    totalLinesCleared = 0
    col = 0
    rot = 0

    for j in range(maxPerEpisode):
        tetromino = util.randChoice(tetrominos)

        # Moves come in the format [columnIndex, rotationIndex]
        possibleMoves = tetromino.getPossibleMoves(board)

        # Game over condition
        if len(possibleMoves) == 0:
            print("GAME OVER")
            print("Lines cleared: ", board.linesCleared)
            return

        s = util.strState(board.board, tetromino.shape)
        # Check if Q(s, :) exists, use policy if it does
        if s in Q:
            [col, rot] = util.epsilonGreedy(Q[s], -1, possibleMoves)
        else:
            [col, rot] = util.randChoice(possibleMoves)

        tetromino.printShape(rot)

        # Perform action and collect reward
        r = board.act(tetromino, col, rot)
        board.printBoard()

    print("Maximum number of moves reached: ", maxPerEpisode)
    print("Lines cleared: ", board.linesCleared)
Example #2
def train(nrows, ncols, max_episode_length, saveFreq):
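    """Train an A3C-style agent: one global AC_Network plus one Worker (with its own
    Board) per available CPU core, each running episodes in a separate thread.

    Note that max_episode_length is overridden to 10000 below, and a threading.Timer
    calls stop_training on the Coordinator after 3000 seconds.
    """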
    max_episode_length = 10000
    gamma = .99 # discount rate for advantage estimation and reward discounting

    # Tetris initialisations
    tetrominos = createTetrominos()
    board = Board(nrows, ncols)

    # t = tetrominos[0].paddedRotations[0]

    # t_rows, t_cols = t.shape[0], t.shape[1]
    s_size = [
      None,
      board.nrows,
      board.ncols,
      1
    ]

    a_size = board.ncols * 4

    tf.reset_default_graph()

    global_episodes = tf.Variable(0, dtype=tf.int32, name='global_episodes', trainable=False)
    trainer = tf.train.RMSPropOptimizer(learning_rate=1e-3, decay=0.99, epsilon=0.1)
    master_network = AC_Network(s_size, a_size, 'global', None)  # Generate the global network
    # num_workers = 1
    num_workers = multiprocessing.cpu_count()  # Set the number of workers to the number of available CPU threads
    workers = []
    # Create worker classes
    for i in range(num_workers):
        board = Board(nrows, ncols)
        workers.append(Worker(i,s_size,a_size,trainer,global_episodes, board))

    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        main_timer = threading.Timer(3000, stop_training, args=(coord,))
        main_timer.start()
        sess.run(tf.global_variables_initializer())

        # This is where the asynchronous magic happens.
        # Start the "work" process for each worker in a separate thread.
        worker_threads = []
        for worker in workers:
            # Bind worker as a default argument so each thread calls work() on its own Worker
            worker_work = lambda worker=worker: worker.work(max_episode_length, gamma, master_network, sess, coord, saveFreq)
            t = threading.Thread(target=worker_work)
            t.start()
            worker_threads.append(t)
        gs = 0
        while not coord.should_stop():
            # s = time()
            # sleep(10)
            gs1 = sess.run(global_episodes)
            # print("Episodes", gs1, 'one for ', (time()-s)/(gs1-gs))
            gs = gs1
        coord.join(worker_threads)
Example #3
def learn(nrows, ncols, maxPerEpisode, batchSize, nGames):
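    """Train a policy-gradient agent on Tetris.

    States are CNN-style board + tetromino tensors, invalid moves are masked out of
    the network output, and the accumulated gradients are applied every batchSize
    (update_frequency) episodes. Returns the 100-episode average rewards.
    """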
    # Tetris initialisations
    tetrominos = createTetrominos()
    board = Board(nrows, ncols)
    board.reset()
    avgs = []

    # tShapeRows, tShapeCols = tuple(map(operator.add, tetrominos[0].shape.shape, (1, 1)))
    tShape = tetrominos[0].paddedRotations[0]
    tShapeRows, tShapeCols = tShape.shape[0], tShape.shape[1]
    inputLayerDim = [None, board.nrows + tShapeRows, board.ncols, 1]
    # (board.nrows * board.ncols) + (tShapeRows * tShapeCols)
    actionsDim = board.ncols * 4

    tf.reset_default_graph()  #Clear the Tensorflow graph.

    myAgent = agent(lr=1e-2,
                    s_size=inputLayerDim,
                    a_size=actionsDim,
                    h_size=32)  #Load the agent.

    total_episodes = nGames  #Set total number of episodes to train agent on.
    max_ep = maxPerEpisode
    update_frequency = batchSize

    init = tf.global_variables_initializer()

    # Launch the tensorflow graph
    with tf.Session() as sess:
        sess.run(init)
        i = 0
        total_reward = []
        total_length = []

        gradBuffer = sess.run(tf.trainable_variables())
        for ix, grad in enumerate(gradBuffer):
            gradBuffer[ix] = grad * 0

        while i < total_episodes:
            board.reset()
            tetromino = util.randChoice(tetrominos)
            s = util.cnnState(board, tetromino.paddedRotations[0])
            running_reward = 0
            ep_history = []

            for j in range(max_ep):
                if j == max_ep - 1:
                    print("reached maximum at episode ", i, " with ",
                          running_reward)
                # if i % 500 == 0:
                #   board.printBoard()
                possibleMoves = tetromino.getPossibleMoves(board)
                d = (len(possibleMoves) == 0)

                if d == True:
                    #Update the network.
                    ep_history = np.array(ep_history)
                    ep_history[:, 2] = discount_rewards(ep_history[:, 2])
                    feed_dict = {
                        myAgent.reward_holder: ep_history[:, 2],
                        myAgent.action_holder: ep_history[:, 1],
                        myAgent.state_in: np.vstack(ep_history[:, 0])
                    }
                    grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
                    for idx, grad in enumerate(grads):
                        gradBuffer[idx] += grad

                    # print(i, i % update_frequency)
                    if i % update_frequency == 0 and i != 0:
                        # print("Updating network at episode ", i)
                        feed_dict = dict(
                            zip(myAgent.gradient_holders, gradBuffer))
                        _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)
                        for ix, grad in enumerate(gradBuffer):
                            gradBuffer[ix] = grad * 0

                    total_reward.append(running_reward)
                    total_length.append(j)
                    break

                bool_moves = [(x in possibleMoves) for x in range(actionsDim)]

                # Probabilistically pick an action given our network outputs.
                o, a_dist = sess.run([myAgent.output, myAgent.valid_moves],
                                     feed_dict={
                                         myAgent.state_in: s,
                                         myAgent.p: [bool_moves]
                                     })
                softmax_a_dist = [a_dist[0] / sum(a_dist[0])]

                # print(o)
                # print(a_dist)

                # print()
                # Sample the action index directly from the masked, renormalised distribution
                a = np.random.choice(len(softmax_a_dist[0]), p=softmax_a_dist[0])
                # if i % 500 == 0:
                #     tetromino.printShape(0)
                # print(softmax_a_dist)
                # print(a)

                rot, col = divmod(a, board.ncols)
                r = board.act(tetromino, col, rot)

                # Random Tetromino for next state
                nextTetromino = util.randChoice(tetrominos)
                s1 = util.cnnState(board, nextTetromino.paddedRotations[0])

                ep_history.append([s, a, r, s1])
                s = s1
                tetromino = nextTetromino

                running_reward += r

                #Update our running tally of scores.
            if i % 100 == 0:
                current_avg = np.mean(total_reward[-100:])
                print(i, ' : ', current_avg)
                avgs.append(current_avg)
            i += 1
    return avgs
Example #4
def learn(epsilon, gamma, alpha, nGames, isRand, getAvgs):
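    """Tabular Q-learning on a 5x3 Tetris board.

    Q maps a string-encoded (board, tetromino) state to an (ncols x nRotations)
    array of action values; with isRand=True the agent instead plays randomly as
    a baseline. Returns running averages of lines cleared if getAvgs is True,
    otherwise the learned Q-table.
    """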
    Q = {}
    tetrominos = createTetrominos()
    board = Board(5, 3)
    # board.printBoard()
    totalLinesCleared = 0
    col = 0
    rot = 0
    avgs = []
    for i in range(nGames):
        board.reset()
        tetromino = util.randChoice(tetrominos)

        while (True):
            # Moves come in the format [columnIndex, rotationIndex]
            possibleMoves = tetromino.getPossibleMoves(board)

            # Game over condition
            if len(possibleMoves) == 0:
                break

            if isRand:
                [rot, col] = divmod(util.randChoice(possibleMoves),
                                    board.ncols)
            else:
                s = util.strState(board.board, tetromino.shape)

                # Check if Q(s, :) exists, create if not
                if s not in Q:
                    Q[s] = np.zeros((board.ncols, len(tetromino.rotations)))
                    [rot, col] = divmod(util.randChoice(possibleMoves),
                                        board.ncols)
                else:
                    [rot, col] = divmod(
                        util.epsilonGreedy(Q[s], epsilon, possibleMoves),
                        board.ncols)

            # Perform action and collect reward
            r = board.act(tetromino, col, rot)

            # Random Tetromino for next state
            nextTetromino = util.randChoice(tetrominos)

            if not isRand:
                s1 = util.strState(board.board, nextTetromino.shape)

                # Check if Q(s1, :) exists, create if not
                if s1 not in Q:
                    Q[s1] = np.zeros(
                        (board.ncols, len(nextTetromino.rotations)))

                # Q-learning value function update
                Q[s][col][rot] = Q[s][col][rot] + alpha * (
                    r + gamma * np.amax(Q[s1]) - Q[s][col][rot])

            tetromino = nextTetromino

        totalLinesCleared += board.linesCleared

        if (i + 1) % 10 == 0:
            avgs.append(totalLinesCleared / (i + 1))

        # print("Lines cleared: ", board.linesCleared)
    avg = totalLinesCleared / nGames
    avgs.append(avg)
    # print("Average lines cleared:", avg)
    if getAvgs:
        return avgs
    else:
        return Q
Example #5
def learn(epsilon, gamma, alpha, nGames, nRows, nCols):
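    """Deep Q-learning with a small convolutional network.

    The board plus the current tetromino is passed through two conv layers and two
    fully connected layers; Q-values of invalid moves are masked with a large
    negative constant before the argmax. Returns the average number of lines
    cleared per block of 100 games.
    """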
    print(epsilon, gamma, alpha, nGames)
    tetrominos = createTetrominos()
    board = Board(nRows, nCols)
    tShapeRows, tShapeCols = tuple(
        map(operator.add, tetrominos[0].shape.shape, (1, 1)))
    inputLayerDim = (board.nrows * board.ncols) + (tShapeRows * tShapeCols)
    actionsDim = board.ncols * 4

    # Tensorflow network initialisation
    tf.reset_default_graph()

    # These lines establish the feed-forward part of the network used
    # to choose actions
    inputs1 = tf.placeholder(
        shape=[None, board.nrows + tShapeRows, board.ncols, 1],
        dtype=tf.float32)
    conv1 = tf.layers.conv2d(inputs=inputs1, filters=16, kernel_size=[2, 2])
    conv2 = tf.layers.conv2d(inputs=conv1, filters=32, kernel_size=[2, 2])
    flatten_layer = tf.contrib.layers.flatten(conv2)
    dense_connected_layer = tf.contrib.layers.fully_connected(
        flatten_layer,
        256,
        activation_fn=tf.nn.relu,
        weights_initializer=tf.contrib.layers.xavier_initializer(),
        biases_initializer=None)
    output_layer = tf.contrib.layers.fully_connected(
        dense_connected_layer,
        actionsDim,
        activation_fn=None,
        weights_initializer=tf.contrib.layers.xavier_initializer(),
        biases_initializer=None)

    # W = tf.Variable(tf.zeros([inputLayerDim, actionsDim]))
    p = tf.placeholder(tf.bool, [1, actionsDim])
    # Qout = tf.matmul(inputs1,W)
    invalidMoves = tf.constant(-1000., shape=[1, actionsDim])
    validMoves = tf.where(
        p, output_layer, invalidMoves)  # Replace Q-values of invalid moves with -1000
    predict = tf.argmax(validMoves, 1)

    # Below we obtain the loss by taking the sum of squares difference between
    # the target and prediction Q values.
    nextQ = tf.placeholder(shape=[1, actionsDim], dtype=tf.float32)
    loss = tf.reduce_sum(tf.square(nextQ - output_layer))
    trainer = tf.train.AdamOptimizer(learning_rate=0.0025)
    updateModel = trainer.minimize(loss)

    init = tf.global_variables_initializer()

    # create lists to contain total rewards and steps per episode
    jList = []
    rList = []

    totalLinesCleared = 0
    col = 0
    rot = 0
    avgs = []
    s = []
    a = []
    allQ = []

    with tf.Session() as sess:
        sess.run(init)
        for i in range(nGames):
            print(i)
            board.reset()
            tetromino = util.randChoice(tetrominos)

            while (True):
                # Moves come in the format [columnIndex, rotationIndex]
                possibleMoves = tetromino.getPossibleMoves(board)

                # Game over condition
                if len(possibleMoves) == 0:
                    break

                # Always evaluate the network so that s and allQ refer to the current
                # state when the Q-target update below is computed
                boolMoves = [(x in possibleMoves)
                             for x in range(actionsDim)]
                s = util.cnnState(board, tetromino.paddedRotations[0])
                a, allQ = sess.run([predict, output_layer],
                                   feed_dict={
                                       inputs1: s,
                                       p: [boolMoves]
                                   })
                a = a[0]

                # Epsilon-greedy exploration: occasionally override with a random valid move
                if np.random.rand(1) < epsilon:
                    a = util.randChoice(possibleMoves)

                rot, col = divmod(a, board.ncols)

                # Perform action and collect reward
                r = board.act(tetromino, col, rot)

                # Random Tetromino for next state
                nextTetromino = util.randChoice(tetrominos)
                s1 = util.cnnState(board, nextTetromino.paddedRotations[0])

                Q1 = sess.run(output_layer, feed_dict={inputs1: s1})
                #Obtain maxQ' and set our target value for chosen action.
                maxQ1 = np.max(Q1)
                targetQ = allQ
                targetQ[0, a] = r + gamma * maxQ1  # discount the bootstrapped value with gamma
                #Train our network using target and predicted Q values
                _ = sess.run([updateModel],
                             feed_dict={
                                 inputs1: s,
                                 nextQ: targetQ
                             })

                tetromino = nextTetromino

            totalLinesCleared += board.linesCleared

            if (i + 1) % 100 == 0:
                avgs.append(totalLinesCleared / 100)
                totalLinesCleared = 0

        # print("Lines cleared: ", board.linesCleared)
    # avg = totalLinesCleared/nGames
    # avgs.append(avg)
    # print("Average lines cleared:", avg)
    return avgs
Example #6
    def work(self, max_episode_length, gamma, global_AC, sess, coord,
             saveFreq):
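        """A3C worker loop: play episodes on this worker's board, sample actions from
        the local policy (restricted to valid moves), and push gradient updates to the
        global network every 30 steps and again at the end of each episode."""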
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        actions_list = np.arange(self.a_size)
        print("Starting worker " + str(self.number))
        tetrominos = createTetrominos()
        n_tetrominos = len(tetrominos) - 1

        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():

                sess.run(self.update_local_ops)
                episode_buffer = []
                episode_values = []
                episode_frames = []
                episode_reward = 0
                episode_step_count = 0
                d = False

                self.board.reset()
                tetromino_idx = randint(0, n_tetrominos)
                tetromino_AC = np.reshape(tetromino_idx, (1, 1))
                tetromino = tetrominos[tetromino_idx]
                possibleMoves = tetromino.getPossibleMoves(self.board)
                s = util.a3cState(self.board)
                # rnn_state = self.local_AC.state_init
                # self.batch_rnn_state = rnn_state
                episode_frames.append(s)

                while True:
                    # tetromino.printShape(0)
                    # self.board.printBoard()
                    # print(possibleMoves)

                    # bool_moves = [(x in possibleMoves) for x in range(self.a_size)]
                    #Take an action using probabilities from policy network output.
                    a_dist, v = sess.run(
                        [self.local_AC.policy, self.local_AC.value],
                        feed_dict={
                            self.local_AC.imageIn: s,
                            self.local_AC.tetromino: tetromino_AC
                        })
                    #  self.local_AC.state_in[0]:rnn_state[0],
                    #  self.local_AC.state_in[1]:rnn_state[1]})

                    valid_moves = [
                        x if i in possibleMoves else 0.
                        for i, x in enumerate(a_dist[0])
                    ]
                    # import pdb; pdb.set_trace()
                    # if episode_count % 100 == 0:
                    #   print(a_dist[0])
                    # print(a_dist[0])
                    # print(valid_moves)
                    sum_v = sum(valid_moves)
                    if sum_v == 0:
                        a = util.randChoice(possibleMoves)
                    #   tetromino.printShape(0)
                    #   self.board.printBoard()
                    #   print(possibleMoves)
                    #   print(a_dist[0])
                    #   print(valid_moves)
                    #   print("err: invalid moves. ending game")
                    else:
                        softmax_a_dist = [valid_moves / sum_v]
                        a = np.random.choice(actions_list, p=softmax_a_dist[0])
                    # print(softmax_a_dist)
                    # print(a)
                    rot, col = divmod(a, self.board.ncols)
                    # print(rot, col)
                    r = self.board.act(tetromino, col, rot)

                    nextTetrominoIdx = randint(0, n_tetrominos)
                    nextTetromino = tetrominos[nextTetrominoIdx]
                    nextTetromino_AC = np.reshape(nextTetrominoIdx, (1, 1))
                    s1 = util.a3cState(self.board)

                    possibleMoves = nextTetromino.getPossibleMoves(self.board)
                    d = (len(possibleMoves) == 0)

                    episode_frames.append(s1)

                    episode_buffer.append(
                        [s, a, r, s1, d, v[0, 0], tetromino_AC])
                    episode_values.append(v[0, 0])

                    tetromino = nextTetromino
                    tetromino_idx = nextTetrominoIdx
                    tetromino_AC = nextTetromino_AC

                    episode_reward += r
                    s = s1
                    total_steps += 1
                    episode_step_count += 1

                    # If the episode hasn't ended, but the experience buffer is full, then we
                    # make an update step using that experience rollout.
                    if len(episode_buffer) == 30 and d != True:
                        # print("HERE")
                        # Since we don't know what the true final return is, we "bootstrap" from our current
                        # value estimation.
                        v1 = sess.run(self.local_AC.value,
                                      feed_dict={
                                          self.local_AC.imageIn: s,
                                          self.local_AC.tetromino: tetromino_AC
                                      })[0, 0]
                        v_l, p_l, e_l, g_n, v_n = self.train(
                            global_AC, episode_buffer, sess, gamma, v1)
                        episode_buffer = []
                        sess.run(self.update_local_ops)
                    if episode_step_count >= max_episode_length - 1:
                        print("reached max")
                        break
                    elif d == True:
                        break

                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                # Update the network using the experience buffer at the end of the episode.
                if len(episode_buffer) != 0:
                    v_l, p_l, e_l, g_n, v_n = self.train(
                        global_AC, episode_buffer, sess, gamma, 0.0)

                # Periodically report summary statistics over the last saveFreq episodes.
                if episode_count % saveFreq == 0 and episode_count != 0:

                    mean_reward = np.mean(self.episode_rewards[-saveFreq:])
                    mean_length = np.mean(self.episode_lengths[-saveFreq:])
                    mean_value = np.mean(self.episode_mean_values[-saveFreq:])
                    print(mean_reward)

                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1
Example #7
def learn(epsilon, gamma, alpha, nGames, getAvgs):
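    """Q-learning with a single linear layer (Qout = inputs1 . W) on a 5x3 board.

    Q-values of invalid moves are masked with -100 before the argmax, and W is
    updated by gradient descent on the squared Bellman error. Returns running
    averages of lines cleared.
    """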
    tetrominos = createTetrominos()
    board = Board(5, 3)
    tShapeRows, tShapeCols = tuple(
        map(operator.add, tetrominos[0].shape.shape, (1, 1)))
    inputLayerDim = (board.nrows * board.ncols) + (tShapeRows * tShapeCols)
    actionsDim = board.ncols * 4

    # Tensorflow network initialisation
    tf.reset_default_graph()

    # These lines establish the feed-forward part of the network used
    # to choose actions
    inputs1 = tf.placeholder(shape=[1, inputLayerDim], dtype=tf.float32)
    W = tf.Variable(tf.zeros([inputLayerDim, actionsDim]))
    p = tf.placeholder(tf.bool, [1, actionsDim])
    Qout = tf.matmul(inputs1, W)
    invalidMoves = tf.constant(-100., shape=[1, actionsDim])
    validMoves = tf.where(
        p, Qout, invalidMoves)  # Replace invalid moves in Qout by -100
    predict = tf.argmax(validMoves, 1)

    # Below we obtain the loss by taking the sum of squares difference between
    # the target and prediction Q values.
    nextQ = tf.placeholder(shape=[1, actionsDim], dtype=tf.float32)
    loss = tf.reduce_sum(tf.square(nextQ - Qout))
    trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
    updateModel = trainer.minimize(loss)

    init = tf.global_variables_initializer()

    # create lists to contain total rewards and steps per episode
    jList = []
    rList = []

    totalLinesCleared = 0
    col = 0
    rot = 0
    avgs = []
    s = []
    a = []
    allQ = []

    with tf.Session() as sess:
        sess.run(init)
        for i in range(nGames):
            print(i)
            board.reset()
            tetromino = util.randChoice(tetrominos)

            while (True):
                # Moves come in the format [columnIndex, rotationIndex]
                possibleMoves = tetromino.getPossibleMoves(board)

                # Game over condition
                if len(possibleMoves) == 0:
                    break

                # Always evaluate the network so that s and allQ refer to the current
                # state when the Q-target update below is computed
                boolMoves = [(x in possibleMoves)
                             for x in range(actionsDim)]
                s = util.networkState(board.board,
                                      tetromino.paddedRotations[0])
                a, allQ = sess.run([predict, Qout],
                                   feed_dict={
                                       inputs1: s,
                                       p: [boolMoves]
                                   })
                a = a[0]

                # Epsilon-greedy exploration: occasionally override with a random valid move
                if np.random.rand(1) < epsilon:
                    a = util.randChoice(possibleMoves)

                rot, col = divmod(a, board.ncols)

                # Perform action and collect reward
                r = board.act(tetromino, col, rot)

                # Random Tetromino for next state
                nextTetromino = util.randChoice(tetrominos)
                s1 = util.networkState(board.board,
                                       nextTetromino.paddedRotations[0])

                Q1 = sess.run(Qout, feed_dict={inputs1: s1})
                #Obtain maxQ' and set our target value for chosen action.
                maxQ1 = np.max(Q1)
                targetQ = allQ
                targetQ[0, a] = r + gamma * maxQ1  # discount the bootstrapped value with gamma

                #Train our network using target and predicted Q values
                _, W1 = sess.run([updateModel, W],
                                 feed_dict={
                                     inputs1: s,
                                     nextQ: targetQ
                                 })

                tetromino = nextTetromino

            totalLinesCleared += board.linesCleared

            if (i + 1) % 10 == 0:
                avgs.append(totalLinesCleared / (i + 1))

        # print("Lines cleared: ", board.linesCleared)
    avg = totalLinesCleared / nGames
    avgs.append(avg)
    # print("Average lines cleared:", avg)
    return avgs
Example #8
def learn(epsilon, gamma, alpha, nGames, getAvgs):
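    """Policy-gradient learner that accumulates gradients manually and applies them
    every batch_size episodes, using difference frames of the board as input.

    Assumes H (hidden layer size), batch_size and discount_rewards() are defined at
    module level. Returns the running mean of the per-episode rewards.
    """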
    tetrominos = createTetrominos()
    board = Board(5, 3)
    board.reset()
    tShapeRows, tShapeCols = tuple(
        map(operator.add, tetrominos[0].shape.shape, (1, 1)))
    inputLayerDim = (board.nrows * board.ncols) + (tShapeRows * tShapeCols)
    actionsDim = board.ncols * 4

    input_layer = tf.placeholder(shape=[None, inputLayerDim], dtype=tf.float32)
    hidden_layer = slim.fully_connected(
        input_layer,
        H,
        activation_fn=tf.nn.relu,
        weights_initializer=tf.contrib.layers.xavier_initializer(),
        biases_initializer=None)
    output_layer = slim.fully_connected(
        hidden_layer,
        actionsDim,
        activation_fn=tf.nn.sigmoid,
        weights_initializer=tf.contrib.layers.xavier_initializer(),
        biases_initializer=None)

    actions = tf.placeholder(shape=[None], dtype=tf.int32)
    rewards = tf.placeholder(shape=[None, 1], dtype=tf.float32)

    actions_onehot = tf.one_hot(actions, actionsDim)
    responsible_outputs = tf.reduce_sum(output_layer * actions_onehot, [1])

    # Flatten rewards so the product stays element-wise ([None] * [None, 1] would broadcast to a matrix)
    loss = -tf.reduce_mean(tf.log(responsible_outputs) * tf.reshape(rewards, [-1]))

    p = tf.placeholder(tf.bool, [1, actionsDim])
    # Qout = tf.matmul(inputs1,W)
    invalidMoves = tf.constant(0., shape=[1, actionsDim])
    validMoves = tf.where(
        p, output_layer, invalidMoves)  # Zero out the outputs of invalid moves
    predict = tf.argmax(validMoves, 1)

    w_variables = tf.trainable_variables()
    gradients = []
    for indx, w in enumerate(w_variables):
        w_holder_var = tf.placeholder(tf.float32, name="w_" + str(indx))
        gradients.append(w_holder_var)

    all_gradients = tf.gradients(loss, tf.trainable_variables())
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-2)
    apply_grads = optimizer.apply_gradients(zip(gradients, w_variables))

    totalLinesCleared = 0
    col = 0
    rot = 0
    avgs, s, a, allQ, h = [], [], [], [], []
    prev_x = None  # used in computing the difference frame
    xs, hs, dlogps, drs, all_game_scores = [], [], [], [], []
    running_reward = None
    reward_sum = 0
    episode_number = 0

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        grad_buffer = sess.run(tf.trainable_variables())
        for indx, grad in enumerate(grad_buffer):
            grad_buffer[indx] = grad * 0

        while episode_number < 100:
            tetromino = util.randChoice(tetrominos)
            #   tetromino.printShape(0)
            possibleMoves = tetromino.getPossibleMoves(board)

            # Game over condition
            if len(possibleMoves) > 0:
                boolMoves = [(x in possibleMoves) for x in range(actionsDim)]

                cur_x = util.pgState(board.board, tetromino.paddedRotations[0])
                x = cur_x - prev_x if prev_x is not None else np.zeros((8, 3))
                prev_x = cur_x

                a, o, h = sess.run([predict, validMoves, hidden_layer],
                                   feed_dict={
                                       input_layer:
                                       np.reshape(x, (1, inputLayerDim)),
                                       p: [boolMoves]
                                   })
                a = np.random.choice(actionsDim, 1, p=o[0] / sum(o[0]))[0]
                # print(a)

                rot, col = divmod(a, board.ncols)
                xs.append(np.reshape(x, (1, inputLayerDim)))  # observation
                hs.append(h)
                dlogps.append(a)
                # dlogps.append(onehot(actionsDim, a))

                # Perform action and collect reward
                r = board.act(tetromino, col, rot)
                # board.printBoard()
                reward_sum += r
                drs.append(r)  # record reward (must be done after board.act() so it matches the action just taken)

                # Random Tetromino for next state
                nextTetromino = util.randChoice(tetrominos)
                s1 = util.pgState(board.board,
                                  nextTetromino.paddedRotations[0])
            else:
                episode_number += 1
                # stack together all inputs, hidden states, action gradients, and rewards for this episode
                epx = np.vstack(xs)
                eph = np.vstack(hs)
                epdlogp = np.vstack(dlogps)
                epr = np.vstack(drs)
                xs, hs, dlogps, drs = [], [], [], []  # reset array memory
                # compute the discounted reward backwards through time
                discounted_epr = discount_rewards(epr, gamma)
                # standardize the rewards to be unit normal (helps control the gradient estimator variance)
                discounted_epr = discounted_epr - np.mean(discounted_epr)
                discounted_epr = discounted_epr / np.std(discounted_epr)

                grads = sess.run(all_gradients,
                                 feed_dict={
                                     input_layer: epx,
                                     rewards: discounted_epr,
                                     actions: epdlogp.ravel()
                                 })
                for indx, grad in enumerate(grads):
                    grad_buffer[indx] += grad

                # perform rmsprop parameter update every batch_size episodes
                if episode_number % batch_size == 0:
                    print("updating weights of the network")
                    feed_dict = dict(zip(gradients, grad_buffer))
                    x = sess.run(apply_grads, feed_dict=feed_dict)
                    print('HERE')
                    print(x)
                    for indx, grad in enumerate(grad_buffer):
                        grad_buffer[
                            indx] = grad * 0  # reset batch gradient buffer

                # boring book-keeping
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                print('resetting env. episode reward %f. running mean: %f' %
                      (reward_sum, running_reward))
                #        if episode_number % 100 == 0: pickle.dump(model, open('save.p', 'wb'))
                all_game_scores.append(reward_sum)
                avgs.append(running_reward)
                reward_sum = 0
                board.reset()  # reset env
                prev_x = None
                # print(all_game_scores)
    return avgs