def main(): NAME = "01_baseline" random.seed(common.SEED) torch.manual_seed(common.SEED) params = common.HYPERPARAMS["pong"] parser = argparse.ArgumentParser() parser.add_argument("--cuda", default=True, action="store_true", help="Enable cuda") args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu") env = gym.make(params.env_name) env = ptan.common.wrappers.wrap_dqn(env) env.seed(common.SEED) net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) tgt_net = ptan.agent.TargetNet(net) selector = ptan.actions.EpsilonGreedyActionSelector( epsilon=params.epsilon_start) epsilon_tracker = common.EpsilonTracker(selector, params) agent = ptan.agent.DQNAgent(net, selector, device=device) exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params.gamma) buffer = ptan.experience.ExperienceReplayBuffer( exp_source, buffer_size=params.replay_size) optimizer = optim.Adam(net.parameters(), lr=params.learning_rate) def process_batch(engine_, batch): optimizer.zero_grad() loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model, gamma=params.gamma, device=device) loss_v.backward() optimizer.step() epsilon_tracker.frame(engine_.state.iteration) if engine_.state.iteration % params.target_net_sync == 0: tgt_net.sync() return { "loss": loss_v.item(), "epsilon": selector.epsilon, } engine = Engine(process_batch) common.setup_ignite(engine, params, exp_source, NAME) engine.run( common.batch_generator(buffer, params.replay_initial, params.batch_size))
            tgt_net.sync()
        if args.params.startswith("egreedy"):
            epsilon_tracker.frame(engine.state.iteration - epsilon_tracker_frame)
            res["epsilon"] = selector.epsilon
        # reset noise every training step; this is fine in an off-policy method
        if args.params == "noisynet":
            net.sample_noise()
        return res

    engine = Engine(process_batch)
    common.setup_ignite(
        engine, params, exp_source, args.name,
        extra_metrics=("test_reward", "avg_test_reward", "test_steps"),
    )

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def check_reward_trigger(trainer: Engine):
        global training_enabled, epsilon_tracker_frame
        if training_enabled:
            return
        # check the trigger condition that enables epsilon decay
        if trainer.state.episode_reward > -200:
            training_enabled = True
            epsilon_tracker_frame = trainer.state.iteration
            print("Epsilon decay triggered!")
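The offset passed to epsilon_tracker.frame() makes the decay start counting from the frame at which the reward trigger fired. common.EpsilonTracker itself is not shown here; a minimal sketch of such a linear-decay tracker, assuming the hyperparameter object carries epsilon_start, epsilon_final and epsilon_frames fields, could be:

import ptan


class EpsilonTracker:
    """Linearly anneal the selector's epsilon over params.epsilon_frames frames."""
    def __init__(self, selector: ptan.actions.EpsilonGreedyActionSelector, params):
        self.selector = selector
        self.params = params
        self.frame(0)

    def frame(self, frame_idx: int):
        # clip the linear schedule at the final epsilon value
        eps = self.params.epsilon_start - frame_idx / self.params.epsilon_frames
        self.selector.epsilon = max(self.params.epsilon_final, eps)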
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params.gamma)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(
            batch, net, tgt_net.target_model,
            gamma=params.gamma, device=device)
        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine.state.iteration)
        if engine.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        if engine.state.iteration % EVAL_EVERY_FRAME == 0:
            eval_states = getattr(engine.state, "eval_states", None)
            if eval_states is None:
                eval_states = buffer.sample(STATES_TO_EVALUATE)
                eval_states = [np.array(transition.state, copy=False)
                               for transition in eval_states]
                eval_states = np.array(eval_states, copy=False)
                engine.state.eval_states = eval_states
            evaluate_states(eval_states, net, device, engine)
        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine, params, exp_source, NAME,
                        extra_metrics=("adv", "val"))
    engine.run(common.batch_generator(buffer, params.replay_initial,
                                      params.batch_size))
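evaluate_states() fills the adv and val metrics that setup_ignite is told about. It is not part of this listing; a sketch of the idea, assuming the dueling network exposes its advantage and value heads through a hypothetical adv_val() method, might be:

import numpy as np
import torch


@torch.no_grad()
def evaluate_states(states: np.ndarray, net, device, engine):
    states_v = torch.as_tensor(states).to(device)
    # adv_val() is a hypothetical accessor returning the two dueling heads
    adv_v, val_v = net.adv_val(states_v)
    engine.state.metrics["adv"] = adv_v.mean().item()
    engine.state.metrics["val"] = val_v.mean().item()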
if getattr(engine.state, "eval_states", None) is None: eval_states = buffer.sample(STATES_TO_EVALUATE) eval_states = [ np.array(transition.state, copy=False) for transition in eval_states ] engine.state.eval_states = np.array(eval_states, copy=False) return { "loss": loss_v.item(), "epsilon": selector.epsilon, } engine = Engine(process_batch) tb = common.setup_ignite(engine, exp_source, f"simple-{args.run}", extra_metrics=("values_mean", )) @engine.on(ptan.ignite.PeriodEvents.ITERS_1000_COMPLETED) def sync_eval(engine: Engine): tgt_net.sync() mean_val = common.calc_values_of_states(engine.state.eval_states, net, device=device) engine.state.metrics["values_mean"] = mean_val if getattr(engine.state, "best_mean_val", None) is None: engine.state.best_mean_val = mean_val if engine.state.best_mean_val < mean_val: print( "%d: Best mean value updated %.3f -> %.3f" %
def main(): NAME = "03_double" STATES_TO_EVALUATE = 1000 EVAL_EVERY_FRAME = 100 random.seed(common.SEED) torch.manual_seed(common.SEED) params = common.HYPERPARAMS["pong"] parser = argparse.ArgumentParser() parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") parser.add_argument("--double", default=False, action="store_true", help="Enable double dqn") args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu") env = gym.make(params.env_name) env = ptan.common.wrappers.wrap_dqn(env) env.seed(common.SEED) net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) tgt_net = ptan.agent.TargetNet(net) selector = ptan.actions.EpsilonGreedyActionSelector( epsilon=params.epsilon_start) epsilon_tracker = common.EpsilonTracker(selector, params) agent = ptan.agent.DQNAgent(net, selector, device=device) exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params.gamma) buffer = ptan.experience.ExperienceReplayBuffer( exp_source, buffer_size=params.replay_size) optimizer = optim.Adam(net.parameters(), lr=params.learning_rate) def process_batch(engine_, batch): optimizer.zero_grad() loss_v = calc_loss_double_dqn(batch, net, tgt_net.target_model, gamma=params.gamma, device=device, double=args.double) loss_v.backward() optimizer.step() epsilon_tracker.frame(engine_.state.iteration) if engine_.state.iteration % params.target_net_sync == 0: tgt_net.sync() if engine_.state.iteration % EVAL_EVERY_FRAME == 0: eval_states = getattr(engine_.state, "eval_states", None) if eval_states is None: eval_states = buffer.sample(STATES_TO_EVALUATE) eval_states = [ np.array(transition.state, copy=False) for transition in eval_states ] eval_states = np.array(eval_states, copy=False) engine_.state.eval_states = eval_states engine_.state.metrics["values"] = common.calc_values_of_states( eval_states, net, device) return { "loss": loss_v.item(), "epsilon": selector.epsilon, } engine = Engine(process_batch) common.setup_ignite(engine, params, exp_source, f"{NAME}={args.double}", extra_metrics=("values", )) engine.run( common.batch_generator(buffer, params.replay_initial, params.batch_size))
        preproc, gamma=PARAMS.gamma, device=device)
    loss_v.backward()
    optimizer.step()
    if engine.state.iteration % PARAMS.target_net_sync == 0:
        tgt_net.sync()
    epsilon_tracker.frame(engine.state.iteration)
    return {"epsilon": action_selector.epsilon, "loss": loss_v.item()}

engine = Engine(process_batch)
common.setup_ignite(
    engine, PARAMS, b_exp_source, args.name,
    extra_metrics=("test_reward_a", "test_steps_a",
                   "test_reward_b", "test_steps_b"),
)
best_test_reward = None

@engine.on(ptan_ignite.PeriodEvents.ITERS_1000_COMPLETED)
def test_network(engine):
    net.train(False)
    a_reward, a_steps, b_reward, b_steps = test_model(net, device, config)
    net.train(True)
    engine.state.metrics["test_reward_a"] = a_reward
    engine.state.metrics["test_steps_a"] = a_steps
    engine.state.metrics["test_reward_b"] = b_reward
    engine.state.metrics["test_steps_b"] = b_steps
    print(
        gamma=PARAMS.gamma, device=device)
    loss_v.backward()
    optimizer.step()
    if epsilon_tracker is not None:
        epsilon_tracker.frame(engine.state.iteration)
        res["epsilon"] = action_selector.epsilon
    if engine.state.iteration % PARAMS.target_net_sync == 0:
        tgt_net.sync()
    res["loss"] = loss_v.item()
    return res

engine = Engine(process_batch)
common.setup_ignite(engine, PARAMS, exp_source, args.name,
                    extra_metrics=("test_reward", "test_steps"))
best_test_reward = None

@engine.on(ptan_ignite.PeriodEvents.ITERS_10000_COMPLETED)
def test_network(engine):
    net.train(False)
    reward, steps = test_model(net, device, config)
    net.train(True)
    engine.state.metrics["test_reward"] = reward
    engine.state.metrics["test_steps"] = steps
    print("Test done: got %.3f reward after %.2f steps" % (reward, steps))

    global best_test_reward
    if best_test_reward is None:
    eps = 1 - engine.state.iteration / params.epsilon_steps
    agent.epsilon = max(params.epsilon_final, eps)
    if engine.state.iteration % params.sync_nets == 0:
        tgt_net.sync()
        tgt_prep.sync()
    return {
        "loss": loss_t.item(),
        "epsilon": agent.epsilon,
    }

engine = Engine(process_batch)
run_name = f"basic-{args.params}_{args.run}"
save_path = pathlib.Path("saves") / run_name
save_path.mkdir(parents=True, exist_ok=True)

common.setup_ignite(engine, exp_source, run_name,
                    extra_metrics=("val_reward", "val_steps"))

@engine.on(ptan.ignite.PeriodEvents.ITERS_100_COMPLETED)
def validate(engine):
    reward = 0.0
    steps = 0
    obs = val_env.reset()

    while True:
        obs_t = prep.encode_sequences([obs["obs"]]).to(device)
        cmd_t = prep.encode_commands(obs["admissible_commands"]).to(device)
        q_vals = net.q_values(obs_t, cmd_t)
        act = np.argmax(q_vals)

        obs, r, is_done, _ = val_env.step(act)
    buffer = dqn_extra.PrioReplayBuffer(
        exp_source, params.replay_size, PRIO_REPLAY_ALPHA)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine, batch_data):
        batch, batch_indices, batch_weights = batch_data
        optimizer.zero_grad()
        loss_v, sample_prios = calc_loss_prio(
            batch, batch_weights, net, tgt_net.target_model,
            gamma=params.gamma**N_STEPS, device=device)
        loss_v.backward()
        optimizer.step()
        buffer.update_priorities(batch_indices, sample_prios)
        if engine.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        return {
            "loss": loss_v.item(),
            "beta": buffer.update_beta(engine.state.iteration),
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine, params, exp_source, NAME)
    engine.run(common.batch_generator(buffer, params.replay_initial,
                                      params.batch_size))
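calc_loss_prio has to return, besides the scalar loss, the per-sample errors that become the new priorities passed to buffer.update_priorities(). A sketch, assuming the same batch layout as in the earlier loss sketches and the (batch, indices, weights) triple produced by the prioritized buffer:

import numpy as np
import torch
import torch.nn as nn


def calc_loss_prio(batch, batch_weights, net, tgt_net, gamma: float,
                   device: str = "cpu"):
    states = np.stack([np.asarray(e.state) for e in batch])
    actions = np.array([e.action for e in batch], dtype=np.int64)
    rewards = np.array([e.reward for e in batch], dtype=np.float32)
    dones = np.array([e.last_state is None for e in batch])
    last_states = np.stack([np.asarray(e.state if e.last_state is None
                                       else e.last_state) for e in batch])

    states_v = torch.as_tensor(states).to(device)
    actions_v = torch.as_tensor(actions).to(device)
    rewards_v = torch.as_tensor(rewards).to(device)
    done_mask = torch.as_tensor(dones).to(device)
    last_states_v = torch.as_tensor(last_states).to(device)
    weights_v = torch.as_tensor(
        np.asarray(batch_weights, dtype=np.float32)).to(device)

    q_vals = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        next_q = tgt_net(last_states_v).max(1)[0]
        next_q[done_mask] = 0.0
        target_q = rewards_v + gamma * next_q

    # per-sample squared error scaled by the importance-sampling weights
    losses_v = weights_v * (q_vals - target_q) ** 2
    # new priorities: per-sample loss plus a small constant so none become zero
    return losses_v.mean(), (losses_v + 1e-5).detach().cpu().numpy()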
    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)

    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params.gamma, steps_count=args.n)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine_, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(
            batch, net, tgt_net.target_model,
            gamma=params.gamma ** args.n, device=device)
        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine_.state.iteration)
        if engine_.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine, params, exp_source, f"{NAME}={args.n}")
    engine.run(common.batch_generator(buffer, params.replay_initial,
                                      params.batch_size))
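All of these training loops feed the Ignite Engine from common.batch_generator, which couples buffer population to optimization steps: one fresh environment transition per iteration after an initial warm-up. A minimal sketch, assuming ptan's replay-buffer populate()/sample() interface:

def batch_generator(buffer, initial: int, batch_size: int):
    # warm up the replay buffer before the first optimization step
    buffer.populate(initial)
    while True:
        # one new transition from the experience source per training iteration
        buffer.populate(1)
        yield buffer.sample(batch_size)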
        exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(
            batch, net, tgt_net.target_model,
            gamma=params.gamma, device=device)
        loss_v.backward()
        optimizer.step()
        if engine.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        if engine.state.iteration % NOISY_SNR_EVERY_ITERS == 0:
            for layer_idx, sigma_l2 in enumerate(net.noisy_layers_sigma_snr()):
                engine.state.metrics[f"snr_{layer_idx+1}"] = sigma_l2
        return {
            "loss": loss_v.item(),
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine, params, exp_source, NAME,
                        extra_metrics=("snr_1", "snr_2"))
    engine.run(common.batch_generator(buffer, params.replay_initial,
                                      params.batch_size))
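noisy_layers_sigma_snr() is a method of the DQN model rather than a ptan call. A sketch of what such a method could compute, assuming the model keeps references to its NoisyLinear layers in self.noisy_layers and each layer stores the learnable noise scale in a sigma_weight parameter:

    def noisy_layers_sigma_snr(self):
        # per-layer signal-to-noise ratio: RMS of the mean weights divided by
        # RMS of the learned noise scale; a low value means noise still dominates
        return [
            ((layer.weight ** 2).mean().sqrt() /
             (layer.sigma_weight ** 2).mean().sqrt()).item()
            for layer in self.noisy_layers
        ]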
        loss += loss_v.item()
    if engine.state.iteration % PARAMS.target_net_sync == 0:
        tgt_net.sync()
    epsilon_tracker.frame(engine.state.iteration)
    res["epsilon"] = action_selector.epsilon
    res["loss"] = loss
    return res

engine = Engine(process_batches)
common.setup_ignite(
    engine, PARAMS, tiger_exp_source, args.name,
    extra_metrics=(
        "test_reward_deer", "test_steps_deer",
        "test_reward_tiger", "test_steps_tiger",
    ),
)
best_test_reward_deer = None
best_test_reward_tiger = None

@engine.on(ptan_ignite.PeriodEvents.ITERS_10000_COMPLETED)
def test_network(engine):
    net_deer.train(False)
    net_tiger.train(False)
    deer_reward, deer_steps, tiger_reward, tiger_steps = test_model(
        net_deer, net_tiger, device, config)