# NOTE(review): this chunk starts mid-loop — the `if` matching the `else:`
# below (presumably alternating which actor moves first each episode —
# confirm against the enclosing code) and the `for i in ...` loop that
# defines `i` are outside this view.
    data_dict = run_game(ttt, optimal_actor, mcts_actor)
else:
    data_dict = run_game(ttt, mcts_actor, optimal_actor)

# Map the terminal game status to a scalar training target:
# +1 for a player-1 win, -1 for a player-2 win, 0 otherwise (e.g. a draw).
result = data_dict["game_status"]
if result == GameBase.Status.PLAYER1_WIN:
    value = 1
elif result == GameBase.Status.PLAYER2_WIN:
    value = -1
else:
    value = 0

# Reshape each recorded state into a 3x3x1 board tensor — the first 9
# entries of the state vector (assumes a tic-tac-toe board layout given
# `ttt`; TODO confirm what x[9:] holds) — and label every state of the
# game with the same final outcome `value`.
states = np.array([x[:9].reshape(3, 3, 1) for x in data_dict["states"]])
values = np.array([[value] for _ in data_dict["states"]])
data_batch = {"states": states, "values": values}
replay_buffer.add_data(data_batch)

# After the first few episodes have filled the replay buffer (i > 5),
# take one gradient step on the value network: sample a minibatch,
# compute MSE between predictions and outcome targets, apply gradients.
if i > 5:
    batch_size = 32
    sampled_data = replay_buffer.sample(batch_size)
    x = np.array(sampled_data["states"], dtype=np.float32)
    y = np.array(sampled_data["values"], dtype=np.float32)
    with tf.GradientTape() as tape:
        # NOTE(review): reassigning `values` here shadows the target
        # array built above; harmless since `data_batch` was already
        # stored, but a distinct name (e.g. `preds`) would be clearer.
        values = value_network(x)
        loss = tf.reduce_mean(tf.square(values - y))
    grads = tape.gradient(loss, value_network.trainable_variables)
    optimizer.apply_gradients(zip(grads, value_network.trainable_variables))
    print("{} -- Loss: {}".format(i, loss))