Example #1
        # Alternate the order of the two actors between games
        if i % 2 == 0:
            data_dict = run_game(ttt, optimal_actor, mcts_actor)
        else:
            data_dict = run_game(ttt, mcts_actor, optimal_actor)

        # Map the final game status to a value target from player 1's perspective
        result = data_dict["game_status"]
        if result == GameBase.Status.PLAYER1_WIN:
            value = 1
        elif result == GameBase.Status.PLAYER2_WIN:
            value = -1
        else:
            value = 0

        # Pair every board state seen in the game (first 9 entries, reshaped to 3x3x1)
        # with the final outcome, and store the batch in the replay buffer
        states = np.array([x[:9].reshape(3, 3, 1) for x in data_dict["states"]])
        values = np.array([[value] for _ in data_dict["states"]])
        data_batch = {"states": states, "values": values}
        replay_buffer.add_data(data_batch)

        # After a short warm-up of games, train on a random minibatch from the buffer
        if i > 5:
            batch_size = 32
            sampled_data = replay_buffer.sample(batch_size)

            x = np.array(sampled_data["states"], dtype=np.float32)
            y = np.array(sampled_data["values"], dtype=np.float32)

            # One gradient step on the mean squared error between predicted and target values
            with tf.GradientTape() as tape:
                predicted_values = value_network(x)
                loss = tf.reduce_mean(tf.square(predicted_values - y))
            grads = tape.gradient(loss, value_network.trainable_variables)
            optimizer.apply_gradients(zip(grads, value_network.trainable_variables))
            print("{} -- Loss: {}".format(i, loss.numpy()))