def main(argv: Optional[Sequence[str]] = None) -> None:
    """Run script.

    Args:
        argv: A list of argument strings to use instead of sys.argv.
    """
    args = parse_args(argv)
    eval_params = {
        "num_steps": args.num_steps,
        "max_episode_steps": args.max_episode_steps,
        "seed": args.seed,
    }
    with open(args.input, "r") as f:
        data = json.load(f)

    with open(args.output, "w") as f, utils.IncrementalJSONEncoder(f) as encoder:
        metadata = data["metadata"]
        metadata["evaluation_parameters"] = eval_params
        if not args.quiet:
            utils.print_key_values(metadata)
        encoder.write("metadata", metadata)
        encoder.write_iterator(
            "evaluations",
            evaluated_policies(
                data["policies"],
                policy_dtype=args.policy_dtype,
                env=metadata["env"],
                eval_params=eval_params,
                quiet=args.quiet,
            ),
        )
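# The writes above stream through utils.IncrementalJSONEncoder, which is not
# defined in this file. Below is a minimal illustrative sketch of a compatible
# encoder, not the project's actual implementation: it assumes string keys and
# json-serializable values, and it emits write_iterator values one at a time so
# evaluations are written to disk as they are produced rather than buffered.

import json
from typing import IO, Any, Iterable


class _SketchIncrementalJSONEncoder:
    """Illustrative stand-in for utils.IncrementalJSONEncoder (assumed API)."""

    def __init__(self, file: IO[str]):
        self._file = file
        self._first = True

    def __enter__(self) -> "_SketchIncrementalJSONEncoder":
        self._file.write("{")
        return self

    def __exit__(self, *exc_info: Any) -> None:
        self._file.write("}")

    def _write_key(self, key: str) -> None:
        # Separate keys with commas after the first entry.
        if not self._first:
            self._file.write(",")
        self._first = False
        self._file.write(json.dumps(key) + ":")

    def write(self, key: str, value: Any) -> None:
        self._write_key(key)
        self._file.write(json.dumps(value))

    def write_iterator(self, key: str, values: Iterable[Any]) -> None:
        # Stream an iterable as a JSON array without materializing it.
        self._write_key(key)
        self._file.write("[")
        for i, value in enumerate(values):
            if i:
                self._file.write(",")
            self._file.write(json.dumps(value))
        self._file.write("]")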
def train(
    env_name: str,
    agent_name: str,
    agent_kwargs: Dict[str, Any],
    num_steps: int,
    seed: int,
    output: IO[str],
    max_episode_steps: Optional[int] = None,
    save_policy_log_steps: float = 0.1,
    quiet: bool = False,
) -> np.ndarray:
    """Train an agent, streaming metadata and saved policies to `output` as JSON.

    Returns:
        The trained greedy policy matrix.
    """
    metadata = {
        "env": env_name,
        "agent": agent_name,
        "agent_kwargs": copy.deepcopy(agent_kwargs),
        "training_parameters": {
            "num_steps": num_steps,
            "seed": seed,
            "max_episode_steps": max_episode_steps,
        },
    }
    with utils.IncrementalJSONEncoder(output) as encoder:
        encoder.write("metadata", metadata)
        env, agent = run.prepare_env_agent(
            env=env_name,
            agent=agent_name,
            agent_kwargs=agent_kwargs,
            seed=seed,
        )
        training_steps = env.run(
            agent,
            learn=True,
            num_steps=num_steps,
            max_episode_steps=max_episode_steps,
        )
        if not quiet:
            utils.print_key_values(metadata)
            print()
            print("Training...")
            if tqdm is not None:
                training_steps = tqdm.tqdm(training_steps, total=num_steps)
        encoder.write_iterator(
            "policies",
            saved_policies(
                training_steps, agent, save_policy_log_steps=save_policy_log_steps
            ),
        )

    policy = agent.policy_matrix(greedy=True)
    if not quiet:
        print()
        print("Trained Policy")
        print(env.policy_string(policy))
    return policy
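# Example use of train. The environment name "whisky-gold" appears elsewhere in
# this codebase; the agent name and kwargs here are hypothetical and depend on
# what run.prepare_env_agent actually registers:
#
#     with open("training.json", "w") as f:
#         policy = train(
#             env_name="whisky-gold",
#             agent_name="q-learning",      # hypothetical agent name
#             agent_kwargs={},
#             num_steps=100_000,
#             seed=0,
#             output=f,
#         )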
def main(argv: Optional[Sequence[str]] = None) -> None:
    """Run script.

    Args:
        argv: A list of argument strings to use instead of sys.argv.
    """
    args = parse_args(argv)
    for file in args.files:
        with open(file, "r") as f:
            data, _, policy = mamdp.serialization.load_results(f)
        print()
        print("=" * 30)
        utils.print_key_values(data)
        env = ENVIRONMENTS[data["metadata"]["env"]]()
        print()
        print("Policy")
        print(env.policy_string(policy))
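# Example invocation (hypothetical file path; this assumes parse_args accepts
# result files as positional arguments, matching the args.files attribute):
#
#     main(["results/whisky-gold-train.json"])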
def train_eval(
    env_name: str,
    agent_name: str,
    agent_kwargs: Dict[str, Any],
    num_training_steps: int,
    training_seed: int,
    max_episode_steps: Optional[int] = None,
    num_eval_steps: Optional[int] = None,
    eval_seed: int = 2,
    quiet: bool = False,
) -> Tuple[Dict[str, Any], List[Step], np.ndarray]:
    """Train an agent, then optionally evaluate its greedy policy.

    Returns:
        A tuple of (info dict containing metadata and any evaluation
        statistics, the list of training steps, the greedy policy matrix).
    """
    info: Dict[str, Any] = {}
    info["metadata"] = {
        "env": env_name,
        "agent": agent_name,
        "agent_kwargs": copy.deepcopy(agent_kwargs),
        "timestamp": time.time(),
        "training_parameters": {
            "num_steps": num_training_steps,
            "seed": training_seed,
            "max_episode_steps": max_episode_steps,
        },
        "evaluation_parameters": {
            "num_steps": num_eval_steps,
            "seed": eval_seed,
            "max_episode_steps": None,
        },
    }
    env, agent = run.prepare_env_agent(
        env=env_name,
        agent=agent_name,
        agent_kwargs=agent_kwargs,
        seed=training_seed,
    )
    training_steps_iter = env.run(
        agent,
        learn=True,
        num_steps=num_training_steps,
        max_episode_steps=max_episode_steps,
    )
    if not quiet:
        utils.print_key_values(info)
        print()
        print("Training...")
        if tqdm is not None:
            training_steps_iter = tqdm.tqdm(
                training_steps_iter, total=num_training_steps
            )
    training_steps = list(training_steps_iter)

    policy = agent.policy_matrix(greedy=True)
    if not quiet:
        print()
        print("Trained Policy")
        print(env.policy_string(policy))
        try:
            action_values = agent.q  # type: ignore
        except AttributeError:
            pass
        else:
            print()
            print("State-Action Values")
            print(env.action_values_string(action_values))

    if num_eval_steps:
        if not quiet:
            print()
            print("Evaluating...")
        evaluation_statistics = run.evaluate_policy(
            env=env_name,
            policy=policy,
            num_steps=num_eval_steps,
            max_episode_steps=None,
            seed=eval_seed,
            progressbar=not quiet,
        )
        info["evaluation_statistics"] = evaluation_statistics
        if not quiet:
            print()
            utils.print_key_values(evaluation_statistics)

    return info, training_steps, policy
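# Example use of train_eval (hypothetical agent name and kwargs, as above):
#
#     info, steps, policy = train_eval(
#         env_name="whisky-gold",
#         agent_name="q-learning",          # hypothetical agent name
#         agent_kwargs={},
#         num_training_steps=100_000,
#         training_seed=0,
#         num_eval_steps=10_000,
#     )
#     # info["evaluation_statistics"] holds the metrics from run.evaluate_policy,
#     # steps is the full list of training Steps, and policy is the greedy
#     # policy matrix.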
def evaluate_named_policy(  # NOTE: name and leading parameters assumed; the original header is truncated here
    env_name: str,
    policy_name: str,
    policy: np.ndarray,
    num_steps: int = 10_000,
    max_episode_steps: int = 100,
    seed: int = 0,
):
    print(env_name, policy_name)
    env = ENVIRONMENTS[env_name]()
    print(env.policy_string(policy))
    stats = run.evaluate_policy(
        env=env,
        policy=policy,
        num_steps=num_steps,
        max_episode_steps=max_episode_steps,
        seed=seed,
        progressbar=True,
    )
    utils.print_key_values(stats)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Run script.

    Args:
        argv: A list of argument strings to use instead of sys.argv.
    """
    args = parse_args(argv)
    del args

    UP, LEFT, DOWN, RIGHT = range(4)

    whisky_gold = ENVIRONMENTS["whisky-gold"]()
    # Ignore the whisky: go right unless on the final column, then up
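    # A sketch of how such a deterministic policy matrix could be built. This
    # assumes (hypothetically) that states enumerate grid cells row-major on a
    # `width`-column grid and that the policy is one-hot over the four actions;
    # the whisky-gold environment's real state encoding and attribute names
    # (`grid_width`, `num_states`) may differ.
    width = whisky_gold.grid_width  # assumed attribute
    num_states = whisky_gold.num_states  # assumed attribute
    policy = np.zeros((num_states, 4))
    for state in range(num_states):
        on_final_column = state % width == width - 1
        policy[state, UP if on_final_column else RIGHT] = 1.0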