Example #1

        # (Snippet resumes inside a trainer's config dict; `policies`,
        # `trainer`, `trainer_eval`, `checkpoint_path` and the loop
        # limits are assumed to be defined earlier in the source file.
        # The two keys below normally live under the "multiagent"
        # section of an RLlib config, reconstructed here.)
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: "ppo_policy",
        },
        # "num_gpus": 0,
        # "num_gpus_per_worker": 0,
        "callbacks": PlayerScoreCallbacks
    })

if restore_checkpoint:
    trainer.restore(checkpoint_path)

start = time.time()

try:
    for i in range(num_iter):
        res = trainer.train()
        print("Iteration {}. policy result: {}".format(i, res))
        # Periodically sync the learned PPO weights into the separate
        # evaluation trainer and run one evaluation iteration on it.
        if i % eval_every == 0:
            trainer_eval.set_weights(trainer.get_weights(["ppo_policy"]))
            res = trainer_eval.train()
        if i % checkpoint_every == 0:
            trainer.save()
except (Exception, KeyboardInterrupt):
    # Save a last checkpoint if training crashes or is interrupted,
    # then re-raise instead of silently swallowing the error.
    trainer.save()
    raise

stop = time.time()
train_duration = time.strftime('%H:%M:%S', time.gmtime(stop - start))
print(
    'Training finished ({}), check the results in ~/ray_results/<dir>/'.format(
        train_duration))
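
The config above plugs in a PlayerScoreCallbacks class that the snippet never defines. Below is a minimal sketch of what such a class could look like, assuming the DefaultCallbacks API from ray.rllib.agents.callbacks (Ray 1.x) and a hypothetical "score" key in the environment's episode info:

from ray.rllib.agents.callbacks import DefaultCallbacks


class PlayerScoreCallbacks(DefaultCallbacks):
    """Sketch: report each finished episode's player score as a
    custom metric so it shows up in the trainer's result dicts."""

    def on_episode_end(self, *, worker, base_env, policies, episode,
                       **kwargs):
        # "score" is a hypothetical info key; the real environment
        # would have to put it into the step info dict.
        info = episode.last_info_for()
        if info and "score" in info:
            episode.custom_metrics["player_score"] = info["score"]

Metrics recorded this way are aggregated into the custom_metrics section of each trainer.train() result.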
Example #2

    # Expected printout while training (this snippet resumes inside a
    # truncated comment; X and Y are the two policies' mean episode
    # rewards, which should both climb toward the stop reward):
    #     dqn_policy: X
    #     ppo_policy: Y
    for i in range(args.stop_iters):
        print("== Iteration", i, "==")

        # improve the DQN policy
        print("-- DQN --")
        result_dqn = dqn_trainer.train()
        print(pretty_print(result_dqn))

        # improve the PPO policy
        print("-- PPO --")
        result_ppo = ppo_trainer.train()
        print(pretty_print(result_ppo))

        # Test passed gracefully.
        if args.as_test and \
                result_dqn["episode_reward_mean"] > args.stop_reward and \
                result_ppo["episode_reward_mean"] > args.stop_reward:
            print("test passed (both agents above requested reward)")
            quit(0)

        # Swap weights to synchronize: each trainer also holds a frozen
        # copy of the other's policy, so refresh those copies with the
        # latest learned weights.
        dqn_trainer.set_weights(ppo_trainer.get_weights(["ppo_policy"]))
        ppo_trainer.set_weights(dqn_trainer.get_weights(["dqn_policy"]))

    # Desired reward not reached.
    if args.as_test:
        raise ValueError("Desired reward ({}) not reached!".format(
            args.stop_reward))
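
The loop above presupposes two trainers built on one shared multi-agent environment with the policy IDs dqn_policy and ppo_policy, where each trainer optimizes only its own policy. A minimal sketch of that setup, assuming the old ray.rllib.agents Trainer API and RLlib's bundled MultiAgentCartPole example env (env choice, spaces, and agent count are assumptions):

import gym
import ray
from ray.rllib.agents.dqn import DQNTrainer
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
from ray.tune.registry import register_env

ray.init()
register_env("multi_agent_cartpole",
             lambda _: MultiAgentCartPole({"num_agents": 4}))

# Reuse the single-agent CartPole spaces for every policy.
single_env = gym.make("CartPole-v0")
obs_space = single_env.observation_space
act_space = single_env.action_space

# Both trainers know both policies (None = use the trainer's default
# policy class), but each one trains only its own.
policies = {
    "dqn_policy": (None, obs_space, act_space, {}),
    "ppo_policy": (None, obs_space, act_space, {}),
}

def policy_mapping_fn(agent_id):
    # Even-numbered agents act via DQN, odd-numbered ones via PPO.
    return "dqn_policy" if agent_id % 2 == 0 else "ppo_policy"

dqn_trainer = DQNTrainer(
    env="multi_agent_cartpole",
    config={
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": ["dqn_policy"],
        },
    })
ppo_trainer = PPOTrainer(
    env="multi_agent_cartpole",
    config={
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": ["ppo_policy"],
        },
    })

Restricting policies_to_train is what makes the weight swap necessary: each trainer's copy of the other policy would otherwise go stale.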
Example #3

    # (Snippet resumes mid-call: the lines below are the tail of what
    # is presumably a pre-training results = tune.run(...) call; the
    # call head and the "PPO" runnable name are assumptions.)
    results = tune.run(
        "PPO",
        config=config,
        stop={"training_iteration": args.pre_training_iters},
        verbose=1,
        checkpoint_freq=1,
        checkpoint_at_end=True,
    )
    print("Pre-training done.")

    best_checkpoint = results.get_best_checkpoint(
        results.trials[0], mode="max")
    print(f".. best checkpoint was: {best_checkpoint}")

    # Create a new dummy Trainer to "fix" our checkpoint.
    new_trainer = PPOTrainer(config=config)
    # Get untrained weights for all policies.
    untrained_weights = new_trainer.get_weights()
    # Restore all policies from checkpoint.
    new_trainer.restore(best_checkpoint)
    # Set back all weights (except for 1st agent) to original
    # untrained weights.
    new_trainer.set_weights({
        pid: w for pid, w in untrained_weights.items()
        if pid != "policy_0"
    })
    # Create the checkpoint from which tune can pick up the
    # experiment.
    new_checkpoint = new_trainer.save()
    new_trainer.stop()
    print(".. checkpoint to restore from (all policies reset, "
          f"except policy_0): {new_checkpoint}")

    print("Starting new tune.run")