import time
import typing
from copy import copy

from tqdm import tqdm

# Project-local names are assumed to be importable from this package:
# Environment, UCTNode, Critic, Actor, plot_learning, and the config dicts
# env_cfg, critic_cfg, actor_cfg, training_cfg.


def mcts(simulation_time: float, env: Environment, root_node: typing.Optional[UCTNode] = None) -> UCTNode:
    """Run Monte Carlo Tree Search from the environment's current state for
    `simulation_time` seconds and return the (expanded) root node."""
    start_time = time.time()
    # Reuse a previously built subtree if one is supplied; otherwise start a
    # fresh tree rooted at the environment's current state.
    if root_node is None:
        root_node = UCTNode(
            state=env.get_state(),
            active_player=env.get_active_player(),
            action=None,
            parent=None,
            num_actions=env.get_num_actions(),
            valid_actions=env.get_valid_actions(),
        )
    while time.time() - start_time < simulation_time:
        # Selection: descend the tree until a leaf or a terminal state is reached.
        leaf_node, winner = root_node.select(env)
        if winner is not None:
            # Terminal state reached during selection: backpropagate the
            # result directly, no expansion or rollout needed.
            leaf_node.backup(winner)
            continue
        # Expansion, simulation (random rollout), and backpropagation.
        leaf_node.expand()
        winner = leaf_node.simulate(env)
        leaf_node.backup(winner)
    return root_node
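# Illustrative sketch only, not part of the project's API: the select() call
# above typically descends the tree by repeatedly picking the child that
# maximizes a UCT score of the form below. The function name `uct_score` and
# the exploration constant `c` are hypothetical, chosen for illustration.
import math


def uct_score(total_value: float, visits: int, parent_visits: int, c: float = 1.41) -> float:
    """UCT = average value (exploitation) + confidence bound (exploration)."""
    if visits == 0:
        # Unvisited children are preferred unconditionally.
        return float("inf")
    return total_value / visits + c * math.sqrt(math.log(parent_visits) / visits)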
def main():
    """
    Sets the parameters for the Environment, Critic, and Actor according to
    the imported config file.

    Creates an environment where a predefined number of episodes can be
    performed. Instantiates an actor to keep track of the policy, and a
    critic to keep track of the value at each state.

    Runs a predefined number of episodes, creating a new board for each
    episode. For each episode, the actor and the critic are updated according
    to the Actor-Critic model. Finally, epsilon is set to zero, and the
    environment plays a game with the updated policy.
    """
    env = Environment(env_cfg)
    granularity = env_cfg["granularity"]
    critic = Critic(critic_cfg, granularity)
    actor = Actor(actor_cfg)

    episodes = training_cfg["number_of_episodes"]
    visualize_episodes = training_cfg["visualize_episodes"]
    steps_per_episode = []

    for episode in tqdm(range(episodes), desc=f"Playing {episodes} episodes", colour='#39ff14'):
        env.new_simulation()
        path = []
        positions = []
        # Reset eligibility traces at the start of each episode.
        critic.reset_eli_dict()
        actor.reset_eli_dict()

        while not env.reached_top() and not env.reached_max_steps():
            env.update_steps()
            # Copy the state so the stored value is not mutated when the
            # environment advances.
            current_state = copy(env.get_state())
            legal_actions = env.get_actions()
            action = actor.get_action(state=current_state, legal_actions=legal_actions)
            path.append((str(current_state), str(action)))

            reward = env.perform_action(action=action)
            td_err = critic.compute_td_err(
                current_state=current_state, next_state=env.get_state(), reward=reward
            )

            # Previous states on the path are updated as well during the call
            # to train() by eligibility traces.
            critic.train(state=current_state, td_error=td_err)
            critic.update_eligs()

            # Update actor beliefs on SAPs (state-action pairs) for all pairs
            # seen thus far in the episode, most recent first.
            for i, sap in enumerate(reversed(path)):
                actor.update_eli_dict(state=str(sap[0]), action=str(sap[1]), i=i)
                actor.update_policy_dict(state=str(sap[0]), action=str(sap[1]), td_err=td_err)

            positions.append(env.get_position())

        print("steps used in this episode", env.steps)
        if episode in visualize_episodes:
            env.visualize_landscape(positions)
        steps_per_episode.append(env.steps)

    plot_learning(steps_per_episode)

    # Enable history tracking to visualize the final simulation.
    env.new_simulation()
    print(f"Actor final epsilon: {actor.epsilon}")
    actor.epsilon = 0  # Set exploration to 0: act greedily on the learned policy.
    print("Attempting final simulation to show you how smart I am now")
    while not env.reached_top() and not env.reached_max_steps():
        current_state = env.get_state()
        legal_actions = env.get_actions()
        action = actor.get_action(current_state, legal_actions)
        env.perform_action(action)
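# Assumed entry point: run the training loop when this module is executed
# directly (adjust if the project invokes main() through a different runner).
if __name__ == "__main__":
    main()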