Example #1
def main():
    NAME = "01_baseline"

    random.seed(common.SEED)
    torch.manual_seed(common.SEED)
    params = common.HYPERPARAMS["pong"]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda",
                        default=False,
                        action="store_true",
                        help="Enable cuda")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = gym.make(params.env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    env.seed(common.SEED)

    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env,
                                                           agent,
                                                           gamma=params.gamma)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

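    # One training step per batch: compute the DQN loss against the target
    # network, anneal epsilon, and sync the target network every
    # target_net_sync iterations.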
    def process_batch(engine_, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(batch,
                                      net,
                                      tgt_net.target_model,
                                      gamma=params.gamma,
                                      device=device)
        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine_.state.iteration)
        if engine_.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine, params, exp_source, NAME)
    engine.run(
        common.batch_generator(buffer, params.replay_initial,
                               params.batch_size))
Example #2
            tgt_net.sync()

        if args.params.startswith("egreedy"):
            epsilon_tracker.frame(engine.state.iteration -
                                  epsilon_tracker_frame)
            res["epsilon"] = selector.epsilon
        # reset noise every training step; this is fine in an off-policy method
        if args.params == "noisynet":
            net.sample_noise()
        return res

    engine = Engine(process_batch)
    common.setup_ignite(
        engine,
        params,
        exp_source,
        args.name,
        extra_metrics=("test_reward", "avg_test_reward", "test_steps"),
    )

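    # Epsilon decay is deferred: the handler below enables it once an episode
    # finishes with a reward above -200, recording the iteration at which the
    # decay schedule should start.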
    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def check_reward_trigger(trainer: Engine):
        global training_enabled, epsilon_tracker_frame
        if training_enabled:
            return
        # check trigger condition to enable epsilon decay
        if trainer.state.episode_reward > -200:
            training_enabled = True
            epsilon_tracker_frame = trainer.state.iteration
            print("Epsilon decay triggered!")
Example #3
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params.gamma)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(
            batch, net, tgt_net.target_model, gamma=params.gamma, device=device
        )
        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine.state.iteration)
        if engine.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
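        # Every EVAL_EVERY_FRAME iterations, evaluate the network on a fixed
        # set of states sampled once from the replay buffer and cached on the
        # engine state.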
        if engine.state.iteration % EVAL_EVERY_FRAME == 0:
            eval_states = getattr(engine.state, "eval_states", None)
            if eval_states is None:
                eval_states = buffer.sample(STATES_TO_EVALUATE)
                eval_states = [np.array(transition.state, copy=False) for transition in eval_states]
                eval_states = np.array(eval_states, copy=False)
                engine.state.eval_states = eval_states
            evaluate_states(eval_states, net, device, engine)
        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine, params, exp_source, NAME, extra_metrics=("adv", "val"))
    engine.run(common.batch_generator(buffer, params.replay_initial, params.batch_size))
Example #4
        if getattr(engine.state, "eval_states", None) is None:
            eval_states = buffer.sample(STATES_TO_EVALUATE)
            eval_states = [
                np.array(transition.state, copy=False)
                for transition in eval_states
            ]
            engine.state.eval_states = np.array(eval_states, copy=False)

        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    tb = common.setup_ignite(engine,
                             exp_source,
                             f"simple-{args.run}",
                             extra_metrics=("values_mean", ))

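    # Every 1000 iterations: sync the target network and log the mean value of
    # the cached evaluation states, tracking the best mean seen so far.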
    @engine.on(ptan.ignite.PeriodEvents.ITERS_1000_COMPLETED)
    def sync_eval(engine: Engine):
        tgt_net.sync()

        mean_val = common.calc_values_of_states(engine.state.eval_states,
                                                net,
                                                device=device)
        engine.state.metrics["values_mean"] = mean_val
        if getattr(engine.state, "best_mean_val", None) is None:
            engine.state.best_mean_val = mean_val
        if engine.state.best_mean_val < mean_val:
            print(
                "%d: Best mean value updated %.3f -> %.3f" %
Example #5
def main():
    NAME = "03_double"
    STATES_TO_EVALUATE = 1000
    EVAL_EVERY_FRAME = 100

    random.seed(common.SEED)
    torch.manual_seed(common.SEED)
    params = common.HYPERPARAMS["pong"]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda",
                        default=False,
                        action="store_true",
                        help="Enable cuda")
    parser.add_argument("--double",
                        default=False,
                        action="store_true",
                        help="Enable double dqn")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = gym.make(params.env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    env.seed(common.SEED)

    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env,
                                                           agent,
                                                           gamma=params.gamma)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

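    # calc_loss_double_dqn switches between the vanilla and double DQN targets
    # depending on the --double flag.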
    def process_batch(engine_, batch):
        optimizer.zero_grad()
        loss_v = calc_loss_double_dqn(batch,
                                      net,
                                      tgt_net.target_model,
                                      gamma=params.gamma,
                                      device=device,
                                      double=args.double)
        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine_.state.iteration)
        if engine_.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        if engine_.state.iteration % EVAL_EVERY_FRAME == 0:
            eval_states = getattr(engine_.state, "eval_states", None)
            if eval_states is None:
                eval_states = buffer.sample(STATES_TO_EVALUATE)
                eval_states = [
                    np.array(transition.state, copy=False)
                    for transition in eval_states
                ]
                eval_states = np.array(eval_states, copy=False)
                engine_.state.eval_states = eval_states
            engine_.state.metrics["values"] = common.calc_values_of_states(
                eval_states, net, device)
        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine,
                        params,
                        exp_source,
                        f"{NAME}={args.double}",
                        extra_metrics=("values", ))
    engine.run(
        common.batch_generator(buffer, params.replay_initial,
                               params.batch_size))
Example #6
                                     preproc,
                                     gamma=PARAMS.gamma,
                                     device=device)
        loss_v.backward()
        optimizer.step()
        if engine.state.iteration % PARAMS.target_net_sync == 0:
            tgt_net.sync()

        epsilon_tracker.frame(engine.state.iteration)
        return {"epsilon": action_selector.epsilon, "loss": loss_v.item()}

    engine = Engine(process_batch)
    common.setup_ignite(
        engine,
        PARAMS,
        b_exp_source,
        args.name,
        extra_metrics=("test_reward_a", "test_steps_a", "test_reward_b",
                       "test_steps_b"),
    )
    best_test_reward = None

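    # Every 1000 iterations, switch the network to inference mode and record
    # reward/steps metrics for both test scenarios.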
    @engine.on(ptan_ignite.PeriodEvents.ITERS_1000_COMPLETED)
    def test_network(engine):
        net.train(False)
        a_reward, a_steps, b_reward, b_steps = test_model(net, device, config)
        net.train(True)
        engine.state.metrics["test_reward_a"] = a_reward
        engine.state.metrics["test_steps_a"] = a_steps
        engine.state.metrics["test_reward_b"] = b_reward
        engine.state.metrics["test_steps_b"] = b_steps
        print(
Example #7
                                     gamma=PARAMS.gamma,
                                     device=device)
        loss_v.backward()
        optimizer.step()
        if epsilon_tracker is not None:
            epsilon_tracker.frame(engine.state.iteration)
            res["epsilon"] = action_selector.epsilon
        if engine.state.iteration % PARAMS.target_net_sync == 0:
            tgt_net.sync()
        res["loss"] = loss_v.item()
        return res

    engine = Engine(process_batch)
    common.setup_ignite(engine,
                        PARAMS,
                        exp_source,
                        args.name,
                        extra_metrics=("test_reward", "test_steps"))
    best_test_reward = None

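    # Every 10000 iterations, play test episodes in inference mode and track
    # the best test reward.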
    @engine.on(ptan_ignite.PeriodEvents.ITERS_10000_COMPLETED)
    def test_network(engine):
        net.train(False)
        reward, steps = test_model(net, device, config)
        net.train(True)
        engine.state.metrics["test_reward"] = reward
        engine.state.metrics["test_steps"] = steps
        print("Test done: got %.3f reward after %.2f steps" % (reward, steps))

        global best_test_reward
        if best_test_reward is None:
Example #8
        eps = 1 - engine.state.iteration / params.epsilon_steps
        agent.epsilon = max(params.epsilon_final, eps)
        if engine.state.iteration % params.sync_nets == 0:
            tgt_net.sync()
            tgt_prep.sync()
        return {
            "loss": loss_t.item(),
            "epsilon": agent.epsilon,
        }

    engine = Engine(process_batch)
    run_name = f"basic-{args.params}_{args.run}"
    save_path = pathlib.Path("saves") / run_name
    save_path.mkdir(parents=True, exist_ok=True)

    common.setup_ignite(engine, exp_source, run_name, extra_metrics=("val_reward", "val_steps"))

    @engine.on(ptan.ignite.PeriodEvents.ITERS_100_COMPLETED)
    def validate(engine):
        reward = 0.0
        steps = 0

        obs = val_env.reset()

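        # Greedy rollout: encode the observation and the admissible commands,
        # then follow the highest-Q command until the episode ends.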
        while True:
            obs_t = prep.encode_sequences([obs["obs"]]).to(device)
            cmd_t = prep.encode_commands(obs["admissible_commands"]).to(device)
            q_vals = net.q_values(obs_t, cmd_t)
            act = np.argmax(q_vals)

            obs, r, is_done, _ = val_env.step(act)
Example #9
    buffer = dqn_extra.PrioReplayBuffer(exp_source, params.replay_size,
                                        PRIO_REPLAY_ALPHA)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

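    # Prioritized replay: each batch carries sample indices and importance
    # weights; updated priorities are written back after the gradient step.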
    def process_batch(engine, batch_data):
        batch, batch_indices, batch_weights = batch_data
        optimizer.zero_grad()
        loss_v, sample_prios = calc_loss_prio(
            batch,
            batch_weights,
            net,
            tgt_net.target_model,
            gamma=params.gamma**N_STEPS,
            device=device,
        )
        loss_v.backward()
        optimizer.step()
        buffer.update_priorities(batch_indices, sample_prios)
        if engine.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        return {
            "loss": loss_v.item(),
            "beta": buffer.update_beta(engine.state.iteration),
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine, params, exp_source, NAME)
    engine.run(
        common.batch_generator(buffer, params.replay_initial,
                               params.batch_size))
Example #10
    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)

    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params.gamma, steps_count=args.n
    )
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

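    # n-step DQN: the experience source unrolls args.n steps per transition,
    # so the Bellman target is discounted with gamma ** args.n.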
    def process_batch(engine_, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(
            batch, net, tgt_net.target_model, gamma=params.gamma ** args.n, device=device
        )
        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine_.state.iteration)
        if engine_.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine, params, exp_source, f"{NAME}={args.n}")
    engine.run(common.batch_generator(buffer, params.replay_initial, params.batch_size))
Example #11
        exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(batch,
                                      net,
                                      tgt_net.target_model,
                                      gamma=params.gamma,
                                      device=device)
        loss_v.backward()
        optimizer.step()
        if engine.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
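        # Periodically record the signal-to-noise ratio of each noisy layer as
        # engine metrics (snr_1, snr_2, ...).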
        if engine.state.iteration % NOISY_SNR_EVERY_ITERS == 0:
            for layer_idx, sigma_l2 in enumerate(net.noisy_layers_sigma_snr()):
                engine.state.metrics[f"snr_{layer_idx+1}"] = sigma_l2
        return {
            "loss": loss_v.item(),
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine,
                        params,
                        exp_source,
                        NAME,
                        extra_metrics=("snr_1", "snr_2"))
    engine.run(
        common.batch_generator(buffer, params.replay_initial,
                               params.batch_size))
Example #12
            loss += loss_v.item()
            if engine.state.iteration % PARAMS.target_net_sync == 0:
                tgt_net.sync()

        epsilon_tracker.frame(engine.state.iteration)
        res["epsilon"] = action_selector.epsilon
        res["loss"] = loss
        return res

    engine = Engine(process_batches)
    common.setup_ignite(
        engine,
        PARAMS,
        tiger_exp_source,
        args.name,
        extra_metrics=(
            "test_reward_deer",
            "test_steps_deer",
            "test_reward_tiger",
            "test_steps_tiger",
        ),
    )
    best_test_reward_deer = None
    best_test_reward_tiger = None

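    # Every 10000 iterations, evaluate both networks (deer and tiger) in
    # inference mode.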
    @engine.on(ptan_ignite.PeriodEvents.ITERS_10000_COMPLETED)
    def test_network(engine):
        net_deer.train(False)
        net_tiger.train(False)
        deer_reward, deer_steps, tiger_reward, tiger_steps = test_model(
            net_deer, net_tiger, device, config
        )