# NOTE(review): this chunk begins mid-expression — the assignment that opens
# this tf.multiply(...) call (presumably the policy-gradient loss, e.g.
# loss_f = -tf.reduce_mean(...)) lies before the visible source; the trailing
# extra closing parenthesis belongs to that off-screen opener. Also note
# tf.log of a probability that can reach 0 yields -inf — TODO confirm the
# off-screen code clamps or adds an epsilon.
tf.multiply(tf.log(tf.gather_nd(l_out, actions_pl)), advantages_pl))

# Adam optimizer with non-default decay rates; the learning rate is fed in at
# run time via a placeholder so it can be scheduled from the training loop.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate_pl, beta1=0.8, beta2=0.92)
train_f = optimizer.minimize(loss_f)

saver = tf.train.Saver()  # we use this later to save the model

# Test forward pass: build a small 6x6 board with 7 mines and run one
# inference to sanity-check the graph before training.
env = Minesweeper(display=False, ROWS=6, COLS=6, MINES=7, OUT="CONDENSED",
                  rewards={"win": 1, "loss": -1, "progress": 0.9,
                           "noprogress": -0.3, "YOLO": -0.3})
state = env.stateConverter(env.get_state()).flatten()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # l_out is the network's action-probability output; feed a single state.
    action_probabilities = sess.run(fetches=l_out, feed_dict={states_pl: [state]})
print(action_probabilities)

# helper functions
# Board configuration taken from the command-line arguments parsed earlier
# (the argparse setup lies outside this chunk).
rows = args.size
cols = args.size
# NOTE(review): attribute is `args.mine` (singular) — confirm against the
# argparse definition; `args.mines` would be the more likely spelling.
mines = args.mine
FULL = False  # presumably selects the richer 10-channel state encoding — TODO confirm
rewards = {"win": 0.9, "loss": -1, "progress": 0.9, "noprogress": -0.3,
           "YOLO": -0.3}
env = Minesweeper(display=False, FULL=FULL, ROWS=rows, COLS=cols, MINES=mines,
                  rewards=rewards)

# Input width: 10 values per cell under the FULL encoding, otherwise 2.
n_inputs = rows * cols * 10 if FULL else rows * cols * 2
n_hidden = [rows * cols * 10, 250, 250, 250, 250]
n_outputs = rows * cols  # one output per board cell

# Model
model = Sequential()
# NOTE(review): this model.add(...) call is truncated here — the remaining
# keyword arguments and closing parentheses continue beyond the visible source.
model.add(
    Dense(
        input_shape=(1, n_inputs),
        units=n_hidden[0],
        activation='relu',
        kernel_initializer='glorot_uniform',
# CLI flag: when supplied, render the game so a human can step the agent.
parser.add_argument(
    "-d", "--display",
    help="run with this argument to display the game and see the agent play by pressing enter")
args = parser.parse_args()

if args.display:
    display = True
# NOTE(review): `display` is only assigned when the flag is given; unless it
# is initialised earlier (outside this chunk), the Minesweeper(...) call below
# raises NameError when --display is absent — confirm and default it to False.

env = Minesweeper(display=display, ROWS=6, COLS=6, MINES=6, OUT="FULL",
                  rewards={"win": 1, "loss": -1, "progress": 0.9,
                           "noprogress": -0.3, "YOLO": -0.4})

with tf.Session() as sess:
    # Restore Model
    saver = tf.train.Saver()
    # NOTE(review): `model` is used as both the directory and the file stem of
    # the checkpoint path — presumably a string set before this chunk; verify.
    saver.restore(sess, "{}/{}.ckpt".format(model, model))

    # Initialize test parameters
    games = 0
    moves = 0
    stuck = 0