Example #1
# Imports assumed by this excerpt: the stdlib/third-party ones are evident from
# usage, while common and dqn_model are project-local modules whose exact
# import path may differ.
import random
import argparse

import gym
import ptan
import torch
import torch.optim as optim
from ignite.engine import Engine

import common
import dqn_model


def main():
    NAME = "01_baseline"

    random.seed(common.SEED)
    torch.manual_seed(common.SEED)
    params = common.HYPERPARAMS["pong"]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda",
                        default=True,
                        action="store_true",
                        help="Enable cuda")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = gym.make(params.env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    env.seed(common.SEED)

    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env,
                                                           agent,
                                                           gamma=params.gamma)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine_, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(batch,
                                      net,
                                      tgt_net.target_model,
                                      gamma=params.gamma,
                                      device=device)
        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine_.state.iteration)
        # periodically copy the online network's weights into the target network
        if engine_.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine, params, exp_source, NAME)
    engine.run(
        common.batch_generator(buffer, params.replay_initial,
                               params.batch_size))
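The example ends by running the Ignite Engine over common.batch_generator. A
minimal sketch of what that generator is assumed to do, inferred only from how
it is called here (the project's real helper may differ):

def batch_generator(buffer: ptan.experience.ExperienceReplayBuffer,
                    initial: int, batch_size: int):
    # fill the replay buffer with `initial` transitions before training starts
    buffer.populate(initial)
    while True:
        # add one fresh transition per training iteration, then yield a batch
        buffer.populate(1)
        yield buffer.sample(batch_size)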
Example #2
# This excerpt is the environment-playing process of an asynchronous training
# setup; BATCH_MUL and EpisodeEnded are module-level names defined elsewhere in
# the same file, and gym/ptan/torch/common are imported as in Example #1.
def play_func(params, net, cuda, exp_queue):
    env = gym.make(params.env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    env.seed(common.SEED)
    device = torch.device("cuda" if cuda else "cpu")

    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params.gamma)

    for frame_idx, exp in enumerate(exp_source):
        # the epsilon schedule is driven by the frame index, rescaled by BATCH_MUL
        # to match the rate at which the training process consumes samples
        epsilon_tracker.frame(frame_idx / BATCH_MUL)
        exp_queue.put(exp)
        # at episode end, also push a summary record for the training process
        for reward, steps in exp_source.pop_rewards_steps():
            exp_queue.put(EpisodeEnded(reward, steps, selector.epsilon))
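On the other end of exp_queue sits the training process. A hedged sketch of one
possible consumer loop; the deque-based buffer, the function name and the
assumption that EpisodeEnded exposes reward/steps/epsilon fields are all
illustrative, not the original project's code:

import queue
from collections import deque

def drain_queue(exp_queue, buffer: deque, max_items: int = 64):
    # Move up to max_items entries from the queue into a local replay buffer;
    # EpisodeEnded records are reported instead of stored.
    last_epsilon = None
    for _ in range(max_items):
        try:
            item = exp_queue.get(timeout=0.1)
        except queue.Empty:
            break
        if isinstance(item, EpisodeEnded):
            print(f"episode done: reward={item.reward:.1f}, "
                  f"steps={item.steps}, eps={item.epsilon:.2f}")
            last_epsilon = item.epsilon
        else:
            buffer.append(item)
    return last_epsilon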
Example #3
# Excerpt note: this snippet starts mid-function; the call that creates `env`
# is truncated above, and args, params, counts_hash and N_STEPS are defined
# earlier in the same file.
            reward_scale=params.counts_reward_scale,
            hash_function=counts_hash)
    env.seed(common.SEED)
    if args.params.startswith("egreedy") or args.params == "counts":
        net = dqn_extra.MountainCarBaseDQN(env.observation_space.shape[0],
                                           env.action_space.n)
    elif args.params == "noisynet":
        net = dqn_extra.MountainCarNoisyNetDQN(env.observation_space.shape[0],
                                               env.action_space.n)
    tgt_net = ptan.agent.TargetNet(net)
    print(net)

    if args.params.startswith("egreedy"):
        selector = ptan.actions.EpsilonGreedyActionSelector(
            epsilon=params.epsilon_start)
        epsilon_tracker = common.EpsilonTracker(selector, params)
        training_enabled = not params.eps_decay_trigger
        epsilon_tracker_frame = 0
    else:
        selector = ptan.actions.ArgmaxActionSelector()
        training_enabled = True

    agent = ptan.agent.DQNAgent(net,
                                selector,
                                preprocessor=ptan.agent.float32_preprocessor)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env,
                                                           agent,
                                                           gamma=params.gamma,
                                                           steps_count=N_STEPS)
    buffer = ptan.experience.ExperienceReplayBuffer(
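The e-greedy branch here, like Examples #1 and #4, drives exploration through
common.EpsilonTracker. A minimal sketch of what that class is assumed to do;
only epsilon_start appears above, so the epsilon_final and epsilon_frames
fields are assumptions about the hyperparameter set:

class EpsilonTracker:
    def __init__(self, selector: ptan.actions.EpsilonGreedyActionSelector, params):
        self.selector = selector
        self.params = params

    def frame(self, frame_idx: float):
        # linearly anneal epsilon from epsilon_start towards epsilon_final
        # over roughly epsilon_frames steps
        eps = self.params.epsilon_start - frame_idx / self.params.epsilon_frames
        self.selector.epsilon = max(self.params.epsilon_final, eps)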
Example #4
# Imports as in Example #1, plus numpy as np; calc_loss_double_dqn is defined
# alongside this function in the same file (a hedged sketch follows the example).
def main():
    NAME = "03_double"
    STATES_TO_EVALUATE = 1000
    EVAL_EVERY_FRAME = 100

    random.seed(common.SEED)
    torch.manual_seed(common.SEED)
    params = common.HYPERPARAMS["pong"]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda",
                        default=False,
                        action="store_true",
                        help="Enable cuda")
    parser.add_argument("--double",
                        default=False,
                        action="store_true",
                        help="Enable double dqn")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = gym.make(params.env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    env.seed(common.SEED)

    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env,
                                                           agent,
                                                           gamma=params.gamma)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine_, batch):
        optimizer.zero_grad()
        loss_v = calc_loss_double_dqn(batch,
                                      net,
                                      tgt_net.target_model,
                                      gamma=params.gamma,
                                      device=device,
                                      double=args.double)
        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine_.state.iteration)
        if engine_.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        # every EVAL_EVERY_FRAME iterations, track the network's value estimates
        # on a fixed set of states sampled once from the replay buffer
        if engine_.state.iteration % EVAL_EVERY_FRAME == 0:
            eval_states = getattr(engine_.state, "eval_states", None)
            if eval_states is None:
                eval_states = buffer.sample(STATES_TO_EVALUATE)
                eval_states = [
                    np.array(transition.state, copy=False)
                    for transition in eval_states
                ]
                eval_states = np.array(eval_states, copy=False)
                engine_.state.eval_states = eval_states
            engine_.state.metrics["values"] = common.calc_values_of_states(
                eval_states, net, device)
        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine,
                        params,
                        exp_source,
                        f"{NAME}={args.double}",
                        extra_metrics=("values", ))
    engine.run(
        common.batch_generator(buffer, params.replay_initial,
                               params.batch_size))
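calc_loss_double_dqn lives next to main() in the original file. Below is a
hedged sketch consistent with how it is called above: with double=True the next
action is chosen by the online net and evaluated by the target net (the Double
DQN correction), otherwise the plain max over the target net is used. The batch
unpacking assumes ptan's ExperienceFirstLast transitions
(state, action, reward, last_state):

import numpy as np
import torch
import torch.nn as nn

def calc_loss_double_dqn(batch, net, tgt_net, gamma, device="cpu", double=True):
    states, actions, rewards, dones, last_states = [], [], [], [], []
    for exp in batch:
        states.append(np.array(exp.state, copy=False))
        actions.append(exp.action)
        rewards.append(exp.reward)
        dones.append(exp.last_state is None)
        # for terminal transitions reuse the current state; its value is masked out below
        last_states.append(np.array(exp.state if exp.last_state is None
                                    else exp.last_state, copy=False))

    states_v = torch.as_tensor(np.array(states, copy=False)).to(device)
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards, dtype=torch.float32).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)
    last_states_v = torch.as_tensor(np.array(last_states, copy=False)).to(device)

    q_v = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        if double:
            # Double DQN: argmax action from the online net, value from the target net
            next_acts = net(last_states_v).max(1)[1]
            next_q = tgt_net(last_states_v).gather(
                1, next_acts.unsqueeze(-1)).squeeze(-1)
        else:
            # plain DQN: max over the target net's Q-values
            next_q = tgt_net(last_states_v).max(1)[0]
        next_q[done_mask] = 0.0
        target_q = rewards_v + gamma * next_q
    return nn.MSELoss()(q_v, target_q)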
Example #5
    # Excerpt note: this snippet starts mid-function; m_env, a_handle, b_handle,
    # a_env, reset_env, device, MAX_EPISODE and PARAMS are created earlier in
    # the file, and data, model and common are project-local modules.
    b_env = data.MAgentEnv(m_env,
                           b_handle,
                           reset_env_func=reset_env,
                           is_slave=False,
                           steps_limit=MAX_EPISODE)

    obs = data.MAgentEnv.handle_obs_space(m_env, a_handle)

    net = model.DQNModel(obs.spaces[0].shape, obs.spaces[1].shape,
                         m_env.get_action_space(a_handle)[0]).to(device)
    tgt_net = ptan.agent.TargetNet(net)
    print(net)

    action_selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=PARAMS.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(action_selector, PARAMS)
    preproc = model.MAgentPreprocessor(device)

    agent = ptan.agent.DQNAgent(net,
                                action_selector,
                                device,
                                preprocessor=preproc)
    a_exp_source = ptan.experience.ExperienceSourceFirstLast(a_env,
                                                             agent,
                                                             PARAMS.gamma,
                                                             vectorized=True)
    b_exp_source = ptan.experience.ExperienceSourceFirstLast(b_env,
                                                             agent,
                                                             PARAMS.gamma,
                                                             vectorized=True)
    # experience_source=None: the buffer is filled manually (see the sketch below)
    buffer = ptan.experience.ExperienceReplayBuffer(None, PARAMS.replay_size)
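Because the buffer is created without an experience source, nothing fills it
automatically. A hedged sketch of one way to interleave transitions from both
groups' sources; relying on ptan's internal ExperienceReplayBuffer._add and
pulling one transition per source per step are assumptions, not the original
training loop:

def populate_from_both(buffer, a_exp_source, b_exp_source, steps: int):
    a_it, b_it = iter(a_exp_source), iter(b_exp_source)
    for _ in range(steps):
        buffer._add(next(a_it))   # one transition from the "a" group
        buffer._add(next(b_it))   # one transition from the "b" group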