def main():
    NAME = "01_baseline"

    random.seed(common.SEED)
    torch.manual_seed(common.SEED)
    params = common.HYPERPARAMS["pong"]
    parser = argparse.ArgumentParser()
    # store_true with default=True would make the flag a no-op,
    # so CUDA is off unless --cuda is passed explicitly.
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Enable cuda")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = gym.make(params.env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    env.seed(common.SEED)

    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)

    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params.gamma)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine_, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(
            batch, net, tgt_net.target_model,
            gamma=params.gamma, device=device)
        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine_.state.iteration)
        # Periodically copy the online weights into the target network.
        if engine_.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine, params, exp_source, NAME)
    engine.run(common.batch_generator(
        buffer, params.replay_initial, params.batch_size))
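The engine is driven by common.batch_generator, which is not shown in this listing. A minimal sketch of such a generator, assuming ptan's ExperienceReplayBuffer.populate/sample API: it fills the buffer with the initial number of transitions, then alternates one environment step with one sampled batch per training iteration.

import ptan

def batch_generator(buffer: ptan.experience.ExperienceReplayBuffer,
                    initial: int, batch_size: int):
    # Fill the replay buffer before any training happens.
    buffer.populate(initial)
    while True:
        # One environment step per training iteration keeps
        # the data/update ratio fixed.
        buffer.populate(1)
        yield buffer.sample(batch_size)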
        reward = 0.0
        steps = 0
        while True:
            acts, _ = agent([obs])
            obs, r, is_done, _ = test_env.step(acts[0])
            reward += r
            steps += 1
            if is_done:
                break
        # Exponential moving average of the test reward, used as
        # the stopping criterion below.
        test_reward_avg = getattr(engine.state, "test_reward_avg", None)
        if test_reward_avg is None:
            test_reward_avg = reward
        else:
            test_reward_avg = test_reward_avg * 0.95 + 0.05 * reward
        engine.state.test_reward_avg = test_reward_avg
        print("Test done: got %.3f reward after %d steps, avg reward %.3f" % (
            reward, steps, test_reward_avg))
        engine.state.metrics["test_reward"] = reward
        engine.state.metrics["avg_test_reward"] = test_reward_avg
        engine.state.metrics["test_steps"] = steps

        if test_reward_avg > params.stop_test_reward:
            print("Reward boundary has been crossed, "
                  "stopping training. Congrats!")
            engine.should_terminate = True
        net.train(True)

    engine.run(common.batch_generator(
        buffer, params.replay_initial, params.batch_size))
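This listing is the tail of a periodic test callback whose beginning falls outside the excerpt. A hypothetical reconstruction of the missing head, where the handler name, the test_env instance, and the 10k-iteration trigger are assumptions inferred from the code above and from the similar handler later in this section:

@engine.on(ptan.ignite.PeriodEvents.ITERS_10000_COMPLETED)
def test_network(engine):
    net.train(False)        # evaluation mode for the test episode
    obs = test_env.reset()
    # ...the loop from the listing above follows here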
    def validate(engine: Engine):
        res = validation.validation_run(env_tst, net, device=device)
        print("%d: tst: %s" % (engine.state.iteration, res))
        for key, val in res.items():
            engine.state.metrics[key + "_tst"] = val
        res = validation.validation_run(env_val, net, device=device)
        print("%d: val: %s" % (engine.state.iteration, res))
        for key, val in res.items():
            engine.state.metrics[key + "_val"] = val
        val_reward = res["episode_reward"]
        if getattr(engine.state, "best_val_reward", None) is None:
            engine.state.best_val_reward = val_reward
        if engine.state.best_val_reward < val_reward:
            print("Best validation reward updated: %.3f -> %.3f, model saved" % (
                engine.state.best_val_reward, val_reward))
            engine.state.best_val_reward = val_reward
            path = saves_path / ("val_reward-%.3f.data" % val_reward)
            torch.save(net.state_dict(), path)

    event = ptan.ignite.PeriodEvents.ITERS_10000_COMPLETED
    tst_metrics = [m + "_tst" for m in validation.METRICS]
    tst_handler = tb_logger.OutputHandler(
        tag="test", metric_names=tst_metrics)
    tb.attach(engine, log_handler=tst_handler, event_name=event)
    val_metrics = [m + "_val" for m in validation.METRICS]
    val_handler = tb_logger.OutputHandler(
        tag="validation", metric_names=val_metrics)
    tb.attach(engine, log_handler=val_handler, event_name=event)

    engine.run(common.batch_generator(buffer, REPLAY_INITIAL, BATCH_SIZE))
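validation.validation_run is defined elsewhere; a simplified sketch of what such a function might do, where the episode count, the small epsilon, and the metric set are assumptions: it plays several near-greedy episodes and returns the mean of each tracked metric as a dict, matching how the handler above consumes the result.

import numpy as np
import torch

METRICS = ("episode_reward", "episode_steps")   # assumed metric set

def validation_run(env, net, episodes=100, device="cpu", epsilon=0.02):
    stats = {m: [] for m in METRICS}
    for _ in range(episodes):
        obs = env.reset()
        total_reward = 0.0
        total_steps = 0
        while True:
            obs_v = torch.tensor(np.array([obs])).to(device)
            # Mostly greedy; a small epsilon keeps a deterministic
            # policy from getting stuck in a loop.
            if np.random.random() < epsilon:
                action = env.action_space.sample()
            else:
                action = net(obs_v).max(dim=1)[1].item()
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            total_steps += 1
            if done:
                break
        stats["episode_reward"].append(total_reward)
        stats["episode_steps"].append(total_steps)
    return {k: np.mean(v) for k, v in stats.items()}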
def main():
    NAME = "03_double"
    STATES_TO_EVALUATE = 1000
    EVAL_EVERY_FRAME = 100

    random.seed(common.SEED)
    torch.manual_seed(common.SEED)
    params = common.HYPERPARAMS["pong"]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Enable cuda")
    parser.add_argument("--double", default=False, action="store_true",
                        help="Enable double dqn")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = gym.make(params.env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    env.seed(common.SEED)

    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)

    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params.gamma)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine_, batch):
        optimizer.zero_grad()
        loss_v = calc_loss_double_dqn(
            batch, net, tgt_net.target_model,
            gamma=params.gamma, device=device, double=args.double)
        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine_.state.iteration)
        if engine_.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        if engine_.state.iteration % EVAL_EVERY_FRAME == 0:
            # Track the mean Q-value on a fixed set of states, sampled
            # from the replay buffer on the first evaluation.
            eval_states = getattr(engine_.state, "eval_states", None)
            if eval_states is None:
                eval_states = buffer.sample(STATES_TO_EVALUATE)
                eval_states = [
                    np.array(transition.state, copy=False)
                    for transition in eval_states
                ]
                eval_states = np.array(eval_states, copy=False)
                engine_.state.eval_states = eval_states
            engine_.state.metrics["values"] = \
                common.calc_values_of_states(eval_states, net, device)
        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine, params, exp_source,
                        f"{NAME}={args.double}",
                        extra_metrics=("values",))
    engine.run(common.batch_generator(
        buffer, params.replay_initial, params.batch_size))
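calc_loss_double_dqn is the key piece of this variant. A sketch consistent with the call above, relying on a common.unpack_batch helper (assumed here) that converts FirstLast transitions into numpy arrays: with double=True, the online network selects the next action and the target network evaluates it, which reduces the Q-value overestimation of vanilla DQN.

import torch
import torch.nn as nn

def calc_loss_double_dqn(batch, net, tgt_net, gamma,
                         device="cpu", double=True):
    states, actions, rewards, dones, next_states = \
        common.unpack_batch(batch)
    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.BoolTensor(dones).to(device)

    # Q(s, a) for the actions the agent actually took.
    actions_v = actions_v.unsqueeze(-1)
    state_action_vals = net(states_v).gather(1, actions_v).squeeze(-1)

    with torch.no_grad():
        next_states_v = torch.tensor(next_states).to(device)
        if double:
            # Double DQN: the online net picks the next action...
            next_state_acts = net(next_states_v).max(1)[1]
            next_state_acts = next_state_acts.unsqueeze(-1)
            # ...and the target net evaluates it.
            next_state_vals = tgt_net(next_states_v).gather(
                1, next_state_acts).squeeze(-1)
        else:
            # Vanilla DQN: the target net both picks and evaluates.
            next_state_vals = tgt_net(next_states_v).max(1)[0]
        # Terminal transitions have no bootstrapped value.
        next_state_vals[done_mask] = 0.0
        exp_sa_vals = next_state_vals * gamma + rewards_v

    return nn.MSELoss()(state_action_vals, exp_sa_vals)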
engine = Engine(process_batch)
common.setup_ignite(engine, PARAMS, exp_source, args.name,
                    extra_metrics=("test_reward", "test_steps"))
best_test_reward = None

@engine.on(ptan_ignite.PeriodEvents.ITERS_10000_COMPLETED)
def test_network(engine):
    net.train(False)
    reward, steps = test_model(net, device, config)
    net.train(True)
    engine.state.metrics["test_reward"] = reward
    engine.state.metrics["test_steps"] = steps
    print("Test done: got %.3f reward after %.2f steps" % (
        reward, steps))

    global best_test_reward
    if best_test_reward is None:
        best_test_reward = reward
    elif best_test_reward < reward:
        print("Best test reward updated %.3f -> %.3f, model saved" % (
            best_test_reward, reward))
        best_test_reward = reward
        torch.save(net.state_dict(), os.path.join(
            saves_path, "best_%.3f.dat" % reward))

engine.run(common.batch_generator(
    buffer, PARAMS.replay_initial, PARAMS.batch_size))
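test_model is not shown in the listing. A plausible sketch, where the episode count, the greedy policy, and the make_env factory are assumptions: it plays a handful of argmax-action episodes and returns the mean reward and mean step count, which also explains why steps are printed with %.2f above.

import numpy as np
import torch

@torch.no_grad()
def test_model(net, device, config, episodes=5):
    env = make_env(config)   # hypothetical env factory built from config
    rewards = []
    steps = []
    for _ in range(episodes):
        obs = env.reset()
        episode_reward = 0.0
        episode_steps = 0
        while True:
            obs_v = torch.tensor(np.array([obs])).to(device)
            act = net(obs_v).max(dim=1)[1].item()
            obs, reward, done, _ = env.step(act)
            episode_reward += reward
            episode_steps += 1
            if done:
                break
        rewards.append(episode_reward)
        steps.append(episode_steps)
    return np.mean(rewards), np.mean(steps)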
            obs, r, is_done, _ = val_env.step(act)
            steps += 1
            reward += r
            if is_done:
                break
        engine.state.metrics["val_reward"] = reward
        engine.state.metrics["val_steps"] = steps
        print("Validation got %.3f reward in %d steps" % (reward, steps))
        best_val_reward = getattr(engine.state, "best_val_reward", None)
        if best_val_reward is None:
            engine.state.best_val_reward = reward
        elif best_val_reward < reward:
            print("Best validation reward updated: %s -> %s" % (
                best_val_reward, reward))
            save_prep_name = save_path / ("best_val_%.3f_p.dat" % reward)
            save_net_name = save_path / ("best_val_%.3f_n.dat" % reward)
            torch.save(prep.state_dict(), save_prep_name)
            torch.save(net.state_dict(), save_net_name)
            engine.state.best_val_reward = reward

    @engine.on(ptan.ignite.EpisodeEvents.BEST_REWARD_REACHED)
    def best_reward_updated(trainer: Engine):
        reward = trainer.state.metrics["avg_reward"]
        if reward > 0:
            save_prep_name = save_path / ("best_train_%.3f_p.dat" % reward)
            save_net_name = save_path / ("best_train_%.3f_n.dat" % reward)
            torch.save(prep.state_dict(), save_prep_name)
            torch.save(net.state_dict(), save_net_name)
            print("%d: best avg training reward: %.3f, saved" % (
                trainer.state.iteration, reward))

    engine.run(common.batch_generator(
        buffer, params.replay_initial, BATCH_SIZE))
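For EpisodeEvents.BEST_REWARD_REACHED to fire at all, an EndOfEpisodeHandler has to be attached to the engine; common.setup_ignite presumably does this, along the lines of the sketch below (the default smoothing factor is left to ptan's).

import ptan.ignite as ptan_ignite

# Watches the experience source for episode ends, maintains a smoothed
# episode reward, and emits EpisodeEvents (EPISODE_COMPLETED,
# BEST_REWARD_REACHED, ...) on the engine.
handler = ptan_ignite.EndOfEpisodeHandler(exp_source)
handler.attach(engine)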