Code Example #1
File: evaluator.py  Project: floringogianu/theodon
def init_evaluator(opt, eval_queue, confirm_queue):
    """ Here we initialize the evaluator, creating objects and shit.
    """

    log = init_eval_logger(opt.out_dir)

    env = get_wrapped_atari(
        opt.game, mode="testing", seed=opt.seed, no_gym=opt.no_gym
    )

    eval_estimator = get_estimator(
        "atari",
        hist_len=opt.hist_len,
        action_no=env.action_space.n,
        hidden_sz=opt.hidden_sz,
        shared_bias=opt.shared_bias,
    )
    eval_estimator.cuda()

    epsilon = get_epsilon(name="constant", start=opt.eval_epsilon)
    policy_evaluation = EpsilonGreedyPolicy(
        eval_estimator, env.action_space, epsilon
    )

    opt.log = log
    opt.env = env
    opt.policy_evaluation = policy_evaluation
    opt.eval_queue = eval_queue
    opt.confirm_queue = confirm_queue

    evaluate(opt)
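
init_evaluator above (and init_player in the next example) is written as a
process target that communicates over queues. Below is a minimal launch
sketch assuming torch.multiprocessing with the "spawn" start method, which
CUDA requires; the launch_evaluator helper and the queue roles described in
the comments are illustrative assumptions, not part of the project.

import torch.multiprocessing as mp

def launch_evaluator(opt):
    """ Hypothetical launcher: run init_evaluator in its own process. """
    ctx = mp.get_context("spawn")      # CUDA tensors require the spawn method
    eval_queue = ctx.Queue()           # e.g. the trainer pushes checkpoints here
    confirm_queue = ctx.Queue()        # e.g. the evaluator reports results back
    proc = ctx.Process(
        target=init_evaluator, args=(opt, eval_queue, confirm_queue)
    )
    proc.start()
    return proc, eval_queue, confirm_queue
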
Code Example #2
File: player.py  Project: floringogianu/theodon
def init_player(opt, experience_queue, sync_queue):
    """ Function to serve as target for the player process.
    """

    log = Logger(label="label", path=opt.out_dir)
    log.add_group(
        tag="playing",
        metrics=(
            log.SumMetric("ep_cnt"),
            log.AvgMetric("rw_per_ep", emph=True),
            log.AvgMetric("rw_per_step"),
            log.MaxMetric("max_q"),
            log.FPSMetric("playing_fps"),
            log.MaxMetric("ram"),
            log.MaxMetric("gpu"),
        ),
        console_options=("white", "on_green", ["bold"]),
    )
    env = get_wrapped_atari(
        opt.game,
        mode="training",
        seed=opt.seed,
        no_gym=opt.no_gym,
        device=torch.device("cuda"),
    )

    epsilon = get_epsilon(
        steps=opt.epsilon_steps,
        end=opt.epsilon_end,
        warmup_steps=opt.learn_start,
    )
    policy_evaluation = EpsilonGreedyPolicy(opt.estimator, env.action_space.n,
                                            epsilon)

    opt.log = log
    opt.env = env
    opt.policy_evaluation = policy_evaluation
    opt.experience_queue = experience_queue
    opt.sync_queue = sync_queue

    play(opt)
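
The epsilon object handed to EpsilonGreedyPolicy comes from get_epsilon,
configured here as a linear schedule with a warm-up period. The project's
implementation is not shown in the listing; below is a rough sketch of what
such a schedule could look like, written as a generator (the generator form,
the function name and the defaults are illustrative assumptions).

def linear_epsilon(start=1.0, end=0.1, steps=1_000_000, warmup_steps=0):
    """ Illustrative linear epsilon schedule with warm-up (not the project's). """
    step = 0
    while True:
        if step < warmup_steps:
            yield start                              # fully exploratory warm-up
        else:
            t = min(1.0, (step - warmup_steps) / steps)
            yield start + t * (end - start)          # interpolate start -> end
        step += 1

A schedule like this would be consumed one value per environment step, e.g.
epsilon = next(schedule) before every action selection.
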
Code Example #3
    def test(opt, crt_step, estimator, action_space, eval_env, log):
        """ Here we do the training.

            DeepMind uses a constant epsilon schedule with a very small value
            instead  of a completely Deterministic Policy.
        """

        epsilon = get_epsilon(name="constant", start=opt.eval_epsilon)
        estimator.to("cuda")
        policy_evaluation = EpsilonGreedyPolicy(estimator, action_space,
                                                epsilon)

        if eval_env is None:
            eval_env = get_wrapped_atari(opt.game,
                                         mode="testing",
                                         seed=opt.seed,
                                         no_gym=opt.no_gym)

        mean_ep_rw, mean_ep_crw = evaluate_once(crt_step, policy_evaluation,
                                                eval_env, opt.eval_steps, log)

        return mean_ep_rw, mean_ep_crw
Code Example #4
File: play.py  Project: floringogianu/theodon
def get_stuff(opt, model):
    # wrap the gym env
    env = get_wrapped_atari(
        opt.game,
        mode="testing",
        seed=42,
        no_gym=opt.no_gym,
        device=opt.mem_device,
    )
    action_no = env.action_space.n
    estimator = get_estimator(
        "atari",
        hist_len=4,
        action_no=action_no,
        hidden_sz=512,
        shared_bias=opt.shared_bias,
    )
    estimator = estimator.cuda()
    estimator.load_state_dict(model["model"])

    epsilon = get_epsilon(name="constant", start=opt.eval_epsilon)
    policy_evaluation = EpsilonGreedyPolicy(estimator, action_no, epsilon)

    return env, policy_evaluation
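
get_stuff expects a checkpoint already loaded into memory; judging by
estimator.load_state_dict(model["model"]) above, checkpoints are dicts with a
"model" entry. A hedged driver sketch, with the checkpoint path and the
rollout that would follow left as assumptions:

import torch

checkpoint = torch.load("./results/checkpoint.pth", map_location="cpu")  # hypothetical path
env, policy_evaluation = get_stuff(opt, checkpoint)
# from here play.py would step `env` with actions drawn from
# `policy_evaluation` inside an episode loop.
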
Code Example #5
def main(args):
    """ Here we initialize stuff.
    """
    args.seed = random.randint(0, 10000) if args.seed == 42 else args.seed
    print(f"torch manual seed={args.seed}.")
    torch.manual_seed(args.seed)

    # wrap the gym env
    env = get_wrapped_atari(
        args.game,
        mode="training",
        hist_len=4,
        seed=args.seed,
        no_gym=args.no_gym,
    )
    print(env)
    print("ActionSpace: ", env.action_space)

    # construct an estimator to be used with the policy
    action_no = env.action_space.n
    estimator = get_estimator("atari",
                              hist_len=4,
                              action_no=action_no,
                              hidden_sz=256)
    estimator = estimator.cuda()

    # construct an epsilon greedy policy
    # also: epsilon = {'name':'linear', 'start':1, 'end':0.1, 'steps':1000}
    epsilon = get_epsilon(steps=args.epsilon_steps)
    policy_evaluation = EpsilonGreedyPolicy(estimator, action_no, epsilon)

    # construct a policy improvement type
    # optimizer = get_optimizer('Adam', estimator, lr=0.0001, eps=0.0003)
    optimizer = optim.Adam(estimator.parameters(),
                           lr=args.lr,
                           eps=args.adam_eps)
    policy_improvement = DQNPolicyImprovement(estimator,
                                              optimizer,
                                              gamma=0.99,
                                              is_double=args.double_dqn)

    # we also need an experience replay
    if args.prioritized:
        experience_replay = PER(
            args.mem_size,
            batch_size=32,
            alpha=0.6,
            optim_steps=((args.step_no - args.learn_start) / args.update_freq),
        )
        priority_update_cb = partial(priority_update, experience_replay)
    else:
        experience_replay = ER(args.mem_size, batch_size=32)
        # experience_replay = ER(100000, batch_size=32, hist_len=4)  # flat

    # construct a tester
    tester = None

    # construct a logger
    if not args.label:
        sampling = "prioritized" if args.prioritized else "uniform"
        label = f"{datetime.now():%Y%b%d-%H%M%S}_{args.game}_{sampling}"
    else:
        label = args.label

    log = Logger(label=label, path=f"./results/{label}")
    train_log = log.add_group(
        tag="training",
        metrics=(
            log.SumMetric("ep_cnt", resetable=False),
            log.AvgMetric("rw_per_ep", emph=True),
            log.AvgMetric("rw_per_step"),
            log.MaxMetric("max_q"),
            log.FPSMetric("training_fps"),
            log.FPSMetric("sampling_fps"),
        ),
        console_options=("white", "on_blue", ["bold"]),
    )
    log.log_info(train_log, "date: %s." % time.strftime("%d/%m/%Y | %H:%M:%S"))
    log.log_info(train_log, "pytorch v%s." % torch.__version__)

    # Add the created objects in the args namespace
    args.env = env
    args.policy_evaluation = policy_evaluation
    args.policy_improvement = policy_improvement
    args.experience_replay = experience_replay
    args.tester = tester
    args.log = log
    if args.prioritized:
        args.priority_update = priority_update_cb

    # print the args
    print_namespace(args)

    # start the training
    train(args)
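
When prioritized replay is enabled, partial(priority_update, experience_replay)
pre-binds the buffer so the training loop can feed fresh TD-errors back without
holding a reference to the PER itself. The real priority_update signature is
not shown in the listing; a hedged sketch of the pattern, with the buffer API
assumed for illustration:

from functools import partial

def priority_update_sketch(per_buffer, idxs, td_errors):
    """ Illustrative callback: push new priorities into an assumed PER API. """
    per_buffer.update(idxs, td_errors.abs().detach().cpu().numpy())

priority_cb = partial(priority_update_sketch, experience_replay)
# inside the learning step the loop only needs to call:
# priority_cb(batch_idxs, td_errors)
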
Code Example #6
    def run(opt):
        """ Here we initialize stuff.
        """
        opt.seed = random.randint(0, 10000) if not opt.seed else opt.seed
        print(f"torch manual seed={opt.seed}.")
        torch.manual_seed(opt.seed)

        # wrap the gym env
        env = get_wrapped_atari(
            opt.game,
            mode="training",
            seed=opt.seed,
            no_gym=opt.no_gym,
            device=opt.mem_device,
        )

        if opt.async_eval:
            eval_env = None
        else:
            eval_env = get_wrapped_atari(opt.game,
                                         mode="testing",
                                         seed=opt.seed,
                                         no_gym=opt.no_gym)

        # construct an estimator to be used with the policy
        action_no = env.action_space.n
        estimator = get_estimator(
            "atari",
            hist_len=4,
            action_no=action_no,
            hidden_sz=512,
            shared_bias=opt.shared_bias,
        )
        estimator = estimator.cuda()

        # construct an epsilon greedy policy
        # also: epsilon = {'name':'linear', 'start':1, 'end':0.1, 'steps':1000}
        epsilon = get_epsilon(
            steps=opt.epsilon_steps,
            end=opt.epsilon_end,
            warmup_steps=opt.learn_start,
        )
        policy_evaluation = EpsilonGreedyPolicy(estimator, action_no, epsilon)

        # construct a policy improvement type
        optimizer = optim.RMSprop(
            estimator.parameters(),
            lr=opt.lr,
            momentum=opt.rmsprop_momentum,
            alpha=0.95,
            eps=opt.rmsprop_eps,
            centered=True,
        )
        policy_improvement = DQNPolicyImprovement(estimator,
                                                  optimizer,
                                                  gamma=0.99,
                                                  is_double=opt.double)

        # we also need an experience replay

        experience_replay = create_memory(opt)

        log = init_eval_logger(opt.out_dir)
        train_log = log.add_group(
            tag="training",
            metrics=(
                log.SumMetric("ep_cnt"),
                log.AvgMetric("rw_per_ep", emph=True),
                log.AvgMetric("rw_per_step"),
                log.MaxMetric("max_q"),
                log.FPSMetric("training_fps"),
                log.FPSMetric("sampling_fps"),
                log.MaxMetric("ram"),
                log.MaxMetric("gpu"),
            ),
            console_options=("white", "on_blue", ["bold"]),
        )

        log.log_info(train_log,
                     "date: %s." % time.strftime("%d/%m/%Y | %H:%M:%S"))

        log.log_info(train_log, "pytorch v%s." % torch.__version__)

        # Add the created objects in the opt namespace
        opt.env = env
        opt.eval_env = eval_env
        opt.policy_evaluation = policy_evaluation
        opt.policy_improvement = policy_improvement
        opt.experience_replay = experience_replay
        opt.log = log

        # print the opt
        print("Starting experiment using the following settings:")
        print(liftoff.config.config_to_string(opt))
        print(estimator)

        opt.eval_opt = Namespace(
            eval_steps=opt.eval_steps,
            eval_epsilon=opt.eval_epsilon,
            game=opt.game,
            seed=opt.seed,
            no_gym=opt.no_gym,
        )

        opt.evals = []

        # start the training
        train(opt)
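
opt.eval_opt packs exactly the settings that the test() helper from Code
Example #3 reads, which suggests it is forwarded to the evaluator during
training. A hedged sketch of a synchronous evaluation hook inside train();
opt.eval_freq, the crt_step argument and the action-space argument (passed
here as opt.env.action_space.n) are assumptions:

def maybe_evaluate(opt, estimator, crt_step):
    """ Illustrative hook: evaluate in-process when async evaluation is off. """
    if opt.async_eval or crt_step % opt.eval_freq != 0:
        return
    mean_ep_rw, mean_ep_crw = test(
        opt.eval_opt, crt_step, estimator,
        opt.env.action_space.n, opt.eval_env, opt.log,
    )
    opt.evals.append((crt_step, mean_ep_rw, mean_ep_crw))
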
Code Example #7
File: dqn.py  Project: bogdanbranescu/wintermute
def main(seed=42, label="results", training_steps=10000000, lr=0.0001):
    """ Here we initialize stuff.
    """
    print(f'torch manual seed={seed}.')
    torch.manual_seed(seed)

    # wrap the gym env
    env = get_wrapped_atari('PongNoFrameskip-v4', mode='training', hist_len=4)
    print(env)

    # construct an estimator to be used with the policy
    action_no = env.action_space.n
    estimator = get_estimator('atari', hist_len=4, action_no=action_no,
                              hidden_sz=512)
    estimator = estimator.cuda()

    # construct an epsilon greedy policy
    # also: epsilon = {'name': 'linear', 'start': 1, 'end': 0.1, 'steps_no': 1000}
    epsilon = get_epsilon(name='linear', start=1, end=0.01, steps_no=30000)
    policy_evaluation = EpsilonGreedyPolicy(estimator, epsilon)

    # construct a policy improvement type
    # optimizer = get_optimizer('Adam', estimator, lr=0.0001, eps=0.0003)
    optimizer = optim.Adam(estimator.parameters(), lr=lr)
    policy_improvement = DQNPolicyImprovement(estimator, optimizer, gamma=0.99)

    # we also need an experience replay
    experience_replay = ER(100000, batch_size=32, hist_len=4, cuda=True)

    # construct a tester
    tester = None

    # construct a logger
    log = Logger(label=label, path=f'./{label}')
    train_log = log.add_group(
        tag="training",
        metrics=(log.SumMetric("ep_cnt", resetable=False),
                 log.AvgMetric("rw_per_ep", emph=True),
                 log.AvgMetric("rw_per_step"),
                 log.MaxMetric("max_q"),
                 log.FPSMetric("training_fps"),
                 log.FPSMetric("sampling_fps")),
        console_options=("white", "on_blue", ["bold"])
    )
    log.log_info(train_log, "date: %s." % time.strftime("%d/%m/%Y | %H:%M:%S"))
    log.log_info(train_log, "pytorch v%s." % torch.__version__)


    # construct a structure for easily accessing objects and settings
    args = SimpleNamespace(env=env,
                           policy_evaluation=policy_evaluation,
                           policy_improvement=policy_improvement,
                           experience_replay=experience_replay,
                           tester=tester,
                           log=log,
                           training_steps=training_steps,
                           start_learning_after=10000,
                           update_freq=1)
    for k, v in args.__dict__.items():
        if k != "env":
            k = clr(k, attrs=['bold'])
            print(f'{k}: {v}')

    # start the training
    train(args)
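
Every entry point in this listing ends by handing the assembled namespace to
train(), which is not part of the excerpts. A heavily hedged sketch of the
loop such a namespace typically drives; the policy, replay and improvement
call conventions below are assumptions, not the wintermute API.

def train_sketch(args):
    """ Illustrative DQN loop over the objects assembled above (assumed APIs). """
    state, step = args.env.reset(), 0
    while step < args.training_steps:
        action = args.policy_evaluation(state)        # assumed: returns an action
        next_state, reward, done, _ = args.env.step(action)
        args.experience_replay.push(state, action, reward, next_state, done)

        if step >= args.start_learning_after and step % args.update_freq == 0:
            batch = args.experience_replay.sample()   # assumed ER interface
            args.policy_improvement(batch)            # one DQN update on the batch

        state = args.env.reset() if done else next_state
        step += 1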