Example #1
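# Fragment of a DQN training loop (ptan + Ignite) for a text-based environment:
# the target network and preprocessor are re-synced every `params.sync_nets`
# iterations, and a greedy validation rollout runs every 100 training iterations.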
        agent.epsilon = max(params.epsilon_final, eps)
        if engine.state.iteration % params.sync_nets == 0:
            tgt_net.sync()
            tgt_prep.sync()
        return {
            "loss": loss_t.item(),
            "epsilon": agent.epsilon,
        }

    engine = Engine(process_batch)
    run_name = f"basic-{args.params}_{args.run}"
    save_path = pathlib.Path("saves") / run_name
    save_path.mkdir(parents=True, exist_ok=True)

    common.setup_ignite(engine,
                        exp_source,
                        run_name,
                        extra_metrics=('val_reward', 'val_steps'))

    @engine.on(ptan.ignite.PeriodEvents.ITERS_100_COMPLETED)
    def validate(engine):
        reward = 0.0
        steps = 0

        obs = val_env.reset()

        while True:
            obs_t = prep.encode_sequences([obs['obs']]).to(device)
            cmd_t = prep.encode_commands(obs['admissible_commands']).to(device)
            q_vals = net.q_values(obs_t, cmd_t)
            act = np.argmax(q_vals)
Example #2
        optimizer.step()
        epsilon_tracker.frame(engine.state.iteration)
        if engine.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        if engine.state.iteration % EVAL_EVERY_FRAME == 0:
            eval_states = getattr(engine.state, "eval_states", None)
            if eval_states is None:
                eval_states = buffer.sample(STATES_TO_EVALUATE)
                eval_states = [
                    np.array(transition.state, copy=False)
                    for transition in eval_states
                ]
                eval_states = np.array(eval_states, copy=False)
                engine.state.eval_states = eval_states
            engine.state.metrics["values"] = \
                common.calc_values_of_states(eval_states, net, device)
        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine,
                        params,
                        exp_source,
                        f"{NAME}={args.double}",
                        extra_metrics=('values', ))
    engine.run(
        common.batch_generator(buffer, params.replay_initial,
                               params.batch_size))
Example #3
    exp_source = ptan.experience.ExperienceSourceFirstLast(env,
                                                           agent=agent,
                                                           gamma=params.gamma)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params.replay_size)

    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(batch,
                                      net,
                                      tgt_net,
                                      gamma=params.gamma,
                                      device=device)
        loss_v.backward()
        optimizer.step()

        epsilon_tracker.frame(engine.state.iteration)

        if engine.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()

        return {"loss": loss_v.item(), "epsilon": selector.epsilon}

    engine = Engine(process_batch)
    common.setup_ignite(engine, params, exp_source, NAME)
    engine.run(
        common.batch_generator(buffer, params.replay_initial,
                               params.batch_size))
Example #4
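# Fragment of a DQN training loop ("conv1d" run): a fixed set of evaluation
# states is sampled from the replay buffer once and cached on engine.state;
# every 1000 iterations the target net is synced and the mean Q-value of those
# states is reported as the "values_mean" metric.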
            eval_states = buffer.sample(STATES_TO_EVALUATE)
            eval_states = [
                np.array(transition.state, copy=False)
                for transition in eval_states
            ]

            engine.state.eval_states = np.array(eval_states, copy=False)

        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    tb = common.setup_ignite(engine,
                             exp_source,
                             f"conv1d-{args.run}",
                             extra_metrics=("values_mean", ))

    @engine.on(ptan.ignite.PeriodEvents.ITERS_1000_COMPLETED)
    def sync_eval(engine: Engine):
        tgt_net.sync()

        mean_val = common.calc_values_of_states(engine.state.eval_states,
                                                net,
                                                device=device)

        engine.state.metrics["values_mean"] = mean_val

        if getattr(engine.state, "best_mean_val", None) is None:
            engine.state.best_mean_val = mean_val
        if engine.state.best_mean_val < mean_val:
Example #5
    net = dqn_extra.NoisyDQN(env.observation_space.shape,
                             env.action_space.n).to(device)

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.ArgmaxActionSelector()
    agent = ptan.agent.DQNAgent(net, selector, device=device)

    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params.gamma)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                      gamma=params.gamma, device=device)
        loss_v.backward()
        optimizer.step()
        if engine.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        if engine.state.iteration % NOISY_SNR_EVERY_ITERS == 0:
            for layer_idx, sigma_l2 in enumerate(net.noisy_layers_sigma_snr()):
                engine.state.metrics[f'snr_{layer_idx+1}'] = sigma_l2
        return {
            "loss": loss_v.item(),
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine, params, exp_source, NAME,
                        extra_metrics=('snr_1', 'snr_2'))
    engine.run(
        common.batch_generator(buffer, params.replay_initial,
                               params.batch_size))
            "loss": loss_v.item(),
            "epsilon": 0.0,
        }
        if engine.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()

        if args.params.startswith("egreedy"):
            epsilon_tracker.frame(engine.state.iteration - epsilon_tracker_frame)
            res['epsilon'] = selector.epsilon
        # re-sample the noise every training step; this is fine in an off-policy method
        if args.params == 'noisynet':
            net.sample_noise()
        return res

    engine = Engine(process_batch)
    common.setup_ignite(engine, params, exp_source, args.name, extra_metrics=(
        'test_reward', 'avg_test_reward', 'test_steps'))

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def check_reward_trigger(trainer: Engine):
        global training_enabled, epsilon_tracker_frame
        if training_enabled:
            return
        # check trigger condition to enable epsilon decay
        if trainer.state.episode_reward > -200:
            training_enabled = True
            epsilon_tracker_frame = trainer.state.iteration
            print("Epsilon decay triggered!")

    @engine.on(ptan_ignite.PeriodEvents.ITERS_1000_COMPLETED)
    def test_network(engine):
        net.train(False)
Example #7
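# Fragment of an n-step DQN setup: the experience source unrolls args.n steps
# per transition and the standard DQN loss drives the updates, with a periodic
# target-network sync.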
                                device=device)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env,
                                                           agent,
                                                           gamma=params.gamma,
                                                           steps_count=args.n)
    buffer = ptan.experience.ExperienceReplayBuffer(
        experience_source=exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(batch,
                                      net,
                                      tgt_net,
                                      gamma=params.gamma,
                                      device=device)
        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine.state.iteration)
        if engine.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()

        return {"loss": loss_v.item(), "epsilon": selector.epsilon}

    engine = Engine(process_batch)
    common.setup_ignite(engine, params, exp_source, f"{NAME}={args.n}")
    engine.run(
        common.batch_generator(buffer, params.replay_initial,
                               params.batch_size))
Example #8
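# Fragment of a training step that optimizes several named networks inside one
# batch-processing function and records a per-network "<name>_loss" metric; a
# periodic test handler is left commented out below.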
                                         gamma=PARAMS.gamma,
                                         device=device)
            loss_v.backward()
            opt.step()
            res[name + "_loss"] = loss_v.item()
            if engine.state.iteration % PARAMS.target_net_sync == 0:
                tgt_net.sync()

        epsilon_tracker.frame(engine.state.iteration)
        res['epsilon'] = action_selector.epsilon
        return res

    engine = Engine(process_batches)
    common.setup_ignite(engine,
                        PARAMS,
                        exp_source,
                        args.name,
                        extra_metrics=('test_reward', 'test_steps'))
    best_test_reward = None

    # @engine.on(ptan_ignite.PeriodEvents.ITERS_10000_COMPLETED)
    # def test_network(engine):
    #     net.train(False)
    #     reward, steps = test_model(net, device, config)
    #     net.train(True)
    #     engine.state.metrics['test_reward'] = reward
    #     engine.state.metrics['test_steps'] = steps
    #     print("Test done: got %.3f reward after %.2f steps" % (
    #         reward, steps
    #     ))
    #
Example #9
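# Fragment of a DQN training step (likely a dueling variant, given the "adv"
# and "val" metrics): held-out states are sampled once, cached on engine.state,
# and periodically re-evaluated.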
        loss_v.backward()
        optimizer.step()

        epsilon_tracker.frame(engine.state.iteration)

        if engine.state.iteration % EVAL_EVERY_FRAME == 0:
            eval_states = getattr(engine.state, "eval_states", None)

            if eval_states is None:
                eval_states = buffer.sample(STATES_TO_EVALUATE)
                eval_states = [
                    np.array(transition.state, copy=False)
                    for transition in eval_states
                ]
                eval_states = np.array(eval_states, copy=False)
                engine.state.eval_states = eval_states

            evaluate_states(eval_states, net, device, engine)

        return {"loss": loss_v.item(), "epsilon": selector.epsilon}

    engine = Engine(process_batch)
    common.setup_ignite(engine,
                        params,
                        exp_source,
                        NAME,
                        extra_metrics=("adv", "val"))
    engine.run(
        common.batch_generator(buffer, params.replay_initial,
                               params.batch_size))
Example #10
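# Fragment of a DQN training step very similar to the previous example:
# periodic target sync, cached evaluation states, and "adv"/"val" extra
# metrics.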
        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine.state.iteration)
        if engine.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        if engine.state.iteration % EVAL_EVERY_FRAME == 0:
            eval_states = getattr(engine.state, "eval_states", None)
            if eval_states is None:
                eval_states = buffer.sample(STATES_TO_EVALUATE)
                eval_states = [
                    np.array(transition.state, copy=False)
                    for transition in eval_states
                ]
                eval_states = np.array(eval_states, copy=False)
                engine.state.eval_states = eval_states
            evaluate_states(eval_states, net, device, engine)
        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine,
                        params,
                        exp_source,
                        NAME,
                        extra_metrics=('adv', 'val'))
    engine.run(
        common.batch_generator(buffer, params.replay_initial,
                               params.batch_size))
Example #11
    def process_batch(engine, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(batch,
                                      net,
                                      tgt_net,
                                      gamma=params.gamma,
                                      device=device)
        loss_v.backward()
        optimizer.step()

        if engine.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()

        if engine.state.iteration % NOISY_SNR_EVERY_ITERS == 0:
            for layer_idx, sigma_l2 in enumerate(net.noisy_layers_sigma_snr()):
                engine.state.metrics[f"snr_{layer_idx + 1}"] = sigma_l2

        return {"loss": loss_v.item()}

    engine = Engine(process_batch)
    common.setup_ignite(engine,
                        params,
                        exp_source,
                        NAME,
                        extra_metrics=("snr_1", "snr_2"))

    engine.run(
        common.batch_generator(buffer, params.replay_initial,
                               params.batch_size))
Example #12
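# Fragment of a DQN run ("simple-{args.run}"): evaluation states are cached
# once; every 1000 iterations the target net is synced and the mean value of
# those states is compared against the best value seen so far.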
        if getattr(engine.state, "eval_states", None) is None:
            eval_states = buffer.sample(STATES_TO_EVALUATE)
            eval_states = [
                np.array(transition.state, copy=False)
                for transition in eval_states
            ]
            engine.state.eval_states = np.array(eval_states, copy=False)

        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    tb = common.setup_ignite(engine,
                             exp_source,
                             f"simple-{args.run}",
                             extra_metrics=('values_mean', ))

    @engine.on(ptan.ignite.PeriodEvents.ITERS_1000_COMPLETED)
    def sync_eval(engine: Engine):
        tgt_net.sync()

        mean_val = common.calc_values_of_states(engine.state.eval_states,
                                                net,
                                                device=device)
        engine.state.metrics["values_mean"] = mean_val
        if getattr(engine.state, "best_mean_val", None) is None:
            engine.state.best_mean_val = mean_val
        if engine.state.best_mean_val < mean_val:
            print(
                "%d: Best mean value updated %.3f -> %.3f" %
Example #13
            opt.step()
            res[name + "_loss"] = loss_v.item()
            loss += loss_v.item()
            if engine.state.iteration % PARAMS.target_net_sync == 0:
                tgt_net.sync()

        epsilon_tracker.frame(engine.state.iteration)
        res['epsilon'] = action_selector.epsilon
        res['loss'] = loss
        return res

    engine = Engine(process_batches)
    common.setup_ignite(engine,
                        PARAMS,
                        tiger_exp_source,
                        args.name,
                        extra_metrics=('test_reward_deer', 'test_steps_deer',
                                       'test_reward_tiger',
                                       'test_steps_tiger'))
    best_test_reward_deer = None
    best_test_reward_tiger = None

    @engine.on(ptan_ignite.PeriodEvents.ITERS_10000_COMPLETED)
    def test_network(engine):
        net_deer.train(False)
        net_tiger.train(False)
        deer_reward, deer_steps, tiger_reward, tiger_steps = test_model(
            net_deer, net_tiger, device, config)
        net_deer.train(True)
        net_tiger.train(True)
        engine.state.metrics['test_reward_deer'] = deer_reward
Example #14
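# Fragment of a DQN setup where experience is produced by a separate play
# process: batches come from a BatchGenerator fed through exp_queue, an
# end-of-episode handler tracks the 17.0 average-reward bound, and the play
# process is killed and joined after engine.run() returns.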
    fps_handler = ptan_ignite.EpisodeFPSHandler()
    batch_generator = BatchGenerator(buffer, exp_queue, fps_handler,
                                     params.replay_initial, params.batch_size)

    def process_batch(engine, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(batch,
                                      net,
                                      tgt_net.target_model,
                                      gamma=params.gamma,
                                      device=device)
        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine.state.iteration)
        if engine.state.iteration % params.target_net_sync == 0:
            print('syncing...')
            tgt_net.sync()

        return {'loss': loss_v.item(), "epsilon": batch_generator.epsilon}

    engine = Engine(process_batch)
    ptan_ignite.EndOfEpisodeHandler(batch_generator,
                                    bound_avg_reward=17.0).attach(engine)
    fps_handler.attach(engine, manual_step=True)
    common.setup_ignite(engine, params, exp_source, params.run_name)

    #engine.run(common.batch_generator(buffer, params.replay_initial, params.batch_size, 1))
    engine.run(batch_generator)
    play_proc.kill()
    play_proc.join()
Example #15
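# Fragment of an n-step DQN setup: the experience source unrolls args.n steps
# and the loss uses gamma ** args.n as the effective discount for the
# truncated n-step return.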
                                                           gamma=params.gamma,
                                                           steps_count=args.n)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(batch,
                                      net,
                                      tgt_net.target_model,
                                      gamma=params.gamma**args.n,
                                      device=device)
        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine.state.iteration)
        if engine.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    #common.setup_ignite(engine, params, exp_source, f"{NAME}={args.n}")
    common.setup_ignite(engine, params, exp_source,
                        str(NAME) + "-" + str(args.n))
    engine.run(
        common.batch_generator(buffer, params.replay_initial,
                               params.batch_size))