# Play from the command line against the trained agent
# in an actual (non-RLlib-wrapped) open-spiel env.
import numpy as np

human_player = 1
env = Environment(args.env)
num_episodes = 0

while num_episodes < args.num_episodes_human_play:
    print("You play as {}".format("o" if human_player else "x"))
    time_step = env.reset()
    while not time_step.last():
        player_id = time_step.observations["current_player"]
        if player_id == human_player:
            # Ask the human player for a (legal) action (see sketch below).
            action = ask_user_for_action(time_step)
        else:
            obs = np.array(time_step.observations["info_state"][player_id])
            action = trainer.compute_single_action(obs, policy_id="main")
            # In case the computer chooses an invalid action, pick a
            # random (legal) one instead.
            legal = time_step.observations["legal_actions"][player_id]
            if action not in legal:
                action = np.random.choice(legal)
        time_step = env.step([action])
        print(f"\n{env.get_state}")

    # Print the final board and the game's outcome.
    print(f"\n{env.get_state}")
    print("End of game!")
    if time_step.rewards[human_player] > 0:
        print("You win")
    elif time_step.rewards[human_player] < 0:
        print("You lose")

    num_episodes += 1
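# The `ask_user_for_action` helper called above is defined elsewhere in the
# script. A minimal sketch of what it might look like, given how it is used
# (it must return one of the currently legal action ids): it keeps prompting
# on stdin until the human enters a legal integer action. The prompt text is
# illustrative, not from the original script.
def ask_user_for_action(time_step):
    """Ask the human player for a legal action on the command line."""
    current_player = time_step.observations["current_player"]
    legal_actions = time_step.observations["legal_actions"][current_player]
    action = -1
    while action not in legal_actions:
        try:
            action = int(input(f"Choose an action from {legal_actions}: "))
        except ValueError:
            # Non-integer input: ask again.
            continue
    return action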
import numpy as np

trainer.restore(checkpoint_path)

# Run a manual inference loop for n episodes on the (stateless) env.
env = StatelessCartPole()

for _ in range(10):
    episode_reward = 0.0
    reward = 0.0
    action = 0
    done = False
    obs = env.reset()
    while not done:
        # Compute an action while feeding in a dummy frame stack: the
        # current observation repeated `num_frames` times, plus dummy
        # prev-n-actions (all 0) and prev-n-rewards (all 1.0).
        action, state, logits = trainer.compute_single_action(
            input_dict={
                "obs": obs,
                "prev_n_obs": np.stack([obs for _ in range(num_frames)]),
                "prev_n_actions": np.stack([0 for _ in range(num_frames)]),
                "prev_n_rewards": np.stack(
                    [1.0 for _ in range(num_frames)]),
            },
            full_fetch=True)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
    print(f"Episode reward={episode_reward}")

ray.shutdown()
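# The "prev_n_obs" / "prev_n_actions" / "prev_n_rewards" keys in the input
# dict above are not standard; they only exist because the custom model used
# for training registered them via RLlib's trajectory view API. A rough
# sketch of such a registration, assuming RLlib 1.x's `ViewRequirement`
# class; the model name, layer sizes, and the plain float-cast of past
# actions are illustrative, not the original example's model.
import torch
import torch.nn as nn
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.policy.view_requirement import ViewRequirement


class FrameStackingModel(TorchModelV2, nn.Module):
    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name, num_frames=16):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
                              model_config, name)
        nn.Module.__init__(self)
        self.num_frames = num_frames
        # Ask RLlib's sample collectors to pass the last `num_frames`
        # observations, actions, and rewards into each forward call
        # (shift "-16:-1" means timesteps t-16 ... t-1).
        self.view_requirements["prev_n_obs"] = ViewRequirement(
            data_col="obs", shift="-{}:-1".format(num_frames),
            space=obs_space)
        self.view_requirements["prev_n_actions"] = ViewRequirement(
            data_col="actions", shift="-{}:-1".format(num_frames),
            space=action_space)
        self.view_requirements["prev_n_rewards"] = ViewRequirement(
            data_col="rewards", shift="-{}:-1".format(num_frames))
        # Stacked obs + one scalar action and one scalar reward per frame.
        in_size = num_frames * (obs_space.shape[0] + 2)
        self.logits = nn.Linear(in_size, num_outputs)
        self.value_branch = nn.Linear(in_size, 1)
        self._last_flat = None

    def forward(self, input_dict, state, seq_lens):
        # Flatten the frame stack and the action/reward histories into one
        # feature vector per batch item.
        obs = input_dict["prev_n_obs"].flatten(1)
        actions = input_dict["prev_n_actions"].float().reshape(
            obs.shape[0], -1)
        rewards = input_dict["prev_n_rewards"].reshape(obs.shape[0], -1)
        self._last_flat = torch.cat([obs, actions, rewards], dim=-1)
        return self.logits(self._last_flat), state

    def value_function(self):
        return self.value_branch(self._last_flat).squeeze(-1)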
# Train for n iterations and report results (mean episode rewards).
# Since we have to move at least 19 times in the env to reach the goal and
# each move gives us -0.1 reward (except the last move at the end: +1.0),
# we can expect to reach an optimal episode reward of -0.1*18 + 1.0 = -0.8.
for i in range(5):
    results = trainer.train()
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")

# Perform inference (action computations) based on given env observations.
# Note that we are using a slightly different env here (len 10 instead of
# 20), however, this should still work as the agent has (hopefully) learned
# to "just always walk right!"
env = SimpleCorridor({"corridor_length": 10})
# Get the initial observation (should be [0.0] for the starting position).
obs = env.reset()
done = False
total_reward = 0.0
# Play one episode.
while not done:
    # Compute a single action, given the current observation
    # from the environment.
    action = trainer.compute_single_action(obs)
    # Apply the computed action in the environment.
    obs, reward, done, info = env.step(action)
    # Sum up rewards for reporting purposes.
    total_reward += reward
# Report results.
print(f"Played 1 episode; total-reward={total_reward}")
# __quick_start_end__
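# For reference, a minimal sketch of an environment matching the assumptions
# in the snippet above (start at position 0.0, a single-float observation
# for the current position, -0.1 reward per step, +1.0 for reaching the
# goal). The real `SimpleCorridor` is defined in the surrounding script;
# this version is illustrative only.
import gym
from gym.spaces import Box, Discrete


class SimpleCorridor(gym.Env):
    """Corridor in which the agent must walk right to reach the exit."""

    def __init__(self, config):
        self.end_pos = config["corridor_length"]
        self.cur_pos = 0.0
        self.action_space = Discrete(2)  # 0 = left, 1 = right.
        self.observation_space = Box(0.0, self.end_pos, shape=(1,))

    def reset(self):
        self.cur_pos = 0.0
        return [self.cur_pos]

    def step(self, action):
        # Walk left (unless already at the left wall) or right.
        if action == 0 and self.cur_pos > 0:
            self.cur_pos -= 1
        elif action == 1:
            self.cur_pos += 1
        done = self.cur_pos >= self.end_pos
        # -0.1 per step; +1.0 for the final step into the goal.
        reward = 1.0 if done else -0.1
        return [self.cur_pos], reward, done, {}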