コード例 #1
0
def main(argv: Optional[Sequence[str]] = None) -> None:
    """Evaluate the policies stored in a results file and write the outcome.

    Loads the JSON document at the input path, records the evaluation
    parameters in its metadata, and writes a "metadata" entry followed by an
    "evaluations" entry to the output path via
    ``utils.IncrementalJSONEncoder``.

    Args:
        argv: A list of argument strings to use instead of sys.argv.
    """
    options = parse_args(argv)
    eval_params = dict(
        num_steps=options.num_steps,
        max_episode_steps=options.max_episode_steps,
        seed=options.seed,
    )

    # The input is read completely before the output file is opened.
    with open(options.input, "r") as input_file:
        results = json.load(input_file)

    with open(options.output, "w") as output_file:
        with utils.IncrementalJSONEncoder(output_file) as encoder:
            # The loaded metadata dict is extended in place.
            metadata = results["metadata"]
            metadata["evaluation_parameters"] = eval_params
            if not options.quiet:
                utils.print_key_values(metadata)
            encoder.write("metadata", metadata)

            evaluations = evaluated_policies(
                results["policies"],
                policy_dtype=options.policy_dtype,
                env=metadata["env"],
                eval_params=eval_params,
                quiet=options.quiet,
            )
            encoder.write_iterator("evaluations", evaluations)
コード例 #2
0
def train(
    env_name: str,
    agent_name: str,
    agent_kwargs: Dict[str, Any],
    num_steps: int,
    seed: int,
    output: IO[str],
    max_episode_steps: Optional[int] = None,
    save_policy_log_steps: float = 0.1,
    quiet: bool = False,
) -> np.ndarray:
    """Train an agent, writing metadata and saved policies to ``output``.

    Args:
        env_name: Environment name passed to ``run.prepare_env_agent``.
        agent_name: Agent name passed to ``run.prepare_env_agent``.
        agent_kwargs: Agent keyword arguments; a deep copy is stored in the
            written metadata.
        num_steps: Number of steps passed to ``env.run``.
        seed: Seed passed to ``run.prepare_env_agent``.
        output: Text stream wrapped by ``utils.IncrementalJSONEncoder``.
        max_episode_steps: Passed to ``env.run`` under the same name.
        save_policy_log_steps: Passed unchanged to ``saved_policies``.
        quiet: If true, print nothing and do not wrap the steps in tqdm.

    Returns:
        The value of ``agent.policy_matrix(greedy=True)`` after training.
    """
    training_parameters = {
        "num_steps": num_steps,
        "seed": seed,
        "max_episode_steps": max_episode_steps,
    }
    run_metadata = {
        "env": env_name,
        "agent": agent_name,
        "agent_kwargs": copy.deepcopy(agent_kwargs),
        "training_parameters": training_parameters,
    }
    verbose = not quiet

    with utils.IncrementalJSONEncoder(output) as encoder:
        # Metadata is written before the environment and agent are built.
        encoder.write("metadata", run_metadata)

        env, agent = run.prepare_env_agent(
            env=env_name, agent=agent_name, agent_kwargs=agent_kwargs, seed=seed
        )
        step_iter = env.run(
            agent, learn=True, num_steps=num_steps, max_episode_steps=max_episode_steps
        )

        if verbose:
            utils.print_key_values(run_metadata)
            print()
            print("Training...")
            # tqdm may be None (checked here), in which case no bar is shown.
            if tqdm is not None:
                step_iter = tqdm.tqdm(step_iter, total=num_steps)

        policy_records = saved_policies(
            step_iter, agent, save_policy_log_steps=save_policy_log_steps
        )
        encoder.write_iterator("policies", policy_records)

    trained_policy = agent.policy_matrix(greedy=True)
    if verbose:
        print()
        print("Trained Policy")
        print(env.policy_string(trained_policy))

    return trained_policy
コード例 #3
0
ファイル: summary.py プロジェクト: edlanglois/mamdp
def main(argv: Optional[Sequence[str]] = None) -> None:
    """Print a summary of each results file given on the command line.

    For every file this prints a separator line, the loaded data as
    key/value pairs, and the policy rendered by the environment named in
    the file's metadata.

    Args:
        argv: A list of argument strings to use instead of sys.argv.
    """
    separator = "=" * 30
    options = parse_args(argv)

    for path in options.files:
        with open(path, "r") as results_file:
            # The middle element of the loaded results is not used here.
            data, _, policy = mamdp.serialization.load_results(results_file)

        print()
        print(separator)

        utils.print_key_values(data)

        env_name = data["metadata"]["env"]
        make_env = ENVIRONMENTS[env_name]
        env = make_env()
        print()
        print("Policy")
        print(env.policy_string(policy))
コード例 #4
0
def train_eval(
    env_name: str,
    agent_name: str,
    agent_kwargs: Dict[str, Any],
    num_training_steps: int,
    training_seed: int,
    max_episode_steps: Optional[int] = None,
    num_eval_steps: Optional[int] = None,
    eval_seed: int = 2,
    quiet: bool = False,
) -> Tuple[Dict[str, Any], List[Step], np.ndarray]:
    """Train an agent and optionally evaluate its greedy policy.

    Args:
        env_name: Environment name passed to ``run.prepare_env_agent`` and
            ``run.evaluate_policy``.
        agent_name: Agent name passed to ``run.prepare_env_agent``.
        agent_kwargs: Agent keyword arguments; a deep copy is stored in the
            returned metadata.
        num_training_steps: Number of steps passed to ``env.run``.
        training_seed: Seed passed to ``run.prepare_env_agent``.
        max_episode_steps: Passed to ``env.run`` for training only;
            evaluation always uses ``None``.
        num_eval_steps: Number of evaluation steps. Evaluation is skipped
            when this is falsy (``None`` or 0).
        eval_seed: Seed passed to ``run.evaluate_policy``.
        quiet: If true, print nothing and disable progress bars.

    Returns:
        A tuple ``(info, training_steps, policy)``: ``info`` holds
        "metadata" and, when evaluation ran, "evaluation_statistics";
        ``training_steps`` is the list of items yielded by ``env.run``;
        ``policy`` is ``agent.policy_matrix(greedy=True)``.
    """
    verbose = not quiet

    info: Dict[str, Any] = {
        "metadata": {
            "env": env_name,
            "agent": agent_name,
            "agent_kwargs": copy.deepcopy(agent_kwargs),
            "timestamp": time.time(),
            "training_parameters": {
                "num_steps": num_training_steps,
                "seed": training_seed,
                "max_episode_steps": max_episode_steps,
            },
            "evaluation_parameters": {
                "num_steps": num_eval_steps,
                "seed": eval_seed,
                "max_episode_steps": None,
            },
        },
    }

    env, agent = run.prepare_env_agent(
        env=env_name, agent=agent_name, agent_kwargs=agent_kwargs, seed=training_seed
    )
    step_iter = env.run(
        agent,
        learn=True,
        num_steps=num_training_steps,
        max_episode_steps=max_episode_steps,
    )

    if verbose:
        utils.print_key_values(info)
        print()
        print("Training...")
        # tqdm may be None (checked here), in which case no bar is shown.
        if tqdm is not None:
            step_iter = tqdm.tqdm(step_iter, total=num_training_steps)

    # Consuming the iterator collects every training step.
    training_steps = list(step_iter)

    policy = agent.policy_matrix(greedy=True)
    if verbose:
        print()
        print("Trained Policy")
        print(env.policy_string(policy))

        # Not every agent has a ``q`` attribute; skip the table if absent.
        absent = object()
        action_values = getattr(agent, "q", absent)
        if action_values is not absent:
            print()
            print("State-Action Values")
            print(env.action_values_string(action_values))

    if not num_eval_steps:
        return info, training_steps, policy

    if verbose:
        print()
        print("Evaluating...")

    eval_stats = run.evaluate_policy(
        env=env_name,
        policy=policy,
        num_steps=num_eval_steps,
        max_episode_steps=None,
        seed=eval_seed,
        progressbar=verbose,
    )
    info["evaluation_statistics"] = eval_stats

    if verbose:
        print()
        utils.print_key_values(eval_stats)
    return info, training_steps, policy
コード例 #5
0
ファイル: baseline-policies.py プロジェクト: edlanglois/mamdp
    num_steps: int = 10_000,
    max_episode_steps: int = 100,
    seed: int = 0,
):
    print(env_name, policy_name)
    env = ENVIRONMENTS[env_name]()
    print(env.policy_string(policy))
    stats = run.evaluate_policy(
        env=env,
        policy=policy,
        num_steps=num_steps,
        max_episode_steps=max_episode_steps,
        seed=seed,
        progressbar=True,
    )
    utils.print_key_values(stats)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Run script.

    Args:
        argv: A list of argument strings to use instead of sys.argv.
    """
    args = parse_args(argv)
    del args

    UP, LEFT, DOWN, RIGHT = range(4)

    whisky_gold = ENVIRONMENTS["whisky-gold"]()
    # Ignore the whisky: Go right unless on final column then up