    agent.epsilon = max(params.epsilon_final, eps)
    if engine.state.iteration % params.sync_nets == 0:
        tgt_net.sync()
        tgt_prep.sync()
    return {
        "loss": loss_t.item(),
        "epsilon": agent.epsilon,
    }

engine = Engine(process_batch)
run_name = f"basic-{args.params}_{args.run}"
save_path = pathlib.Path("saves") / run_name
save_path.mkdir(parents=True, exist_ok=True)
common.setup_ignite(engine, exp_source, run_name,
                    extra_metrics=('val_reward', 'val_steps'))

@engine.on(ptan.ignite.PeriodEvents.ITERS_100_COMPLETED)
def validate(engine):
    reward = 0.0
    steps = 0
    obs = val_env.reset()
    while True:
        obs_t = prep.encode_sequences([obs['obs']]).to(device)
        cmd_t = prep.encode_commands(
            obs['admissible_commands']).to(device)
        q_vals = net.q_values(obs_t, cmd_t)
        act = np.argmax(q_vals)
    optimizer.step()
    epsilon_tracker.frame(engine.state.iteration)
    if engine.state.iteration % params.target_net_sync == 0:
        tgt_net.sync()
    if engine.state.iteration % EVAL_EVERY_FRAME == 0:
        eval_states = getattr(engine.state, "eval_states", None)
        if eval_states is None:
            eval_states = buffer.sample(STATES_TO_EVALUATE)
            eval_states = [
                np.array(transition.state, copy=False)
                for transition in eval_states
            ]
            eval_states = np.array(eval_states, copy=False)
            engine.state.eval_states = eval_states
        engine.state.metrics["values"] = \
            common.calc_values_of_states(eval_states, net, device)
    return {
        "loss": loss_v.item(),
        "epsilon": selector.epsilon,
    }

engine = Engine(process_batch)
common.setup_ignite(engine, params, exp_source,
                    f"{NAME}={args.double}", extra_metrics=('values',))
engine.run(
    common.batch_generator(buffer, params.replay_initial,
                           params.batch_size))
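# For reference, a minimal sketch of what common.calc_values_of_states
# (used above to fill the "values" metric) is assumed to do: run the
# held-out states through the network and average the best action values.
# The split into sub-batches is an assumption to keep GPU memory bounded.
import numpy as np
import torch

@torch.no_grad()
def calc_values_of_states(states, net, device="cpu"):
    mean_vals = []
    for batch in np.array_split(states, 64):
        states_v = torch.tensor(batch).to(device)
        action_values_v = net(states_v)            # Q(s, a) for every action
        best_action_values_v = action_values_v.max(1)[0]
        mean_vals.append(best_action_values_v.mean().item())
    return float(np.mean(mean_vals))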
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env, agent=agent, gamma=params.gamma)
buffer = ptan.experience.ExperienceReplayBuffer(
    exp_source, buffer_size=params.replay_size)
optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

def process_batch(engine, batch):
    optimizer.zero_grad()
    loss_v = common.calc_loss_dqn(batch, net, tgt_net,
                                  gamma=params.gamma, device=device)
    loss_v.backward()
    optimizer.step()
    epsilon_tracker.frame(engine.state.iteration)
    if engine.state.iteration % params.target_net_sync == 0:
        tgt_net.sync()
    return {"loss": loss_v.item(), "epsilon": selector.epsilon}

engine = Engine(process_batch)
common.setup_ignite(engine, params, exp_source, NAME)
engine.run(
    common.batch_generator(buffer, params.replay_initial,
                           params.batch_size))
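# A minimal sketch of the common.batch_generator that drives engine.run
# above, assuming the ptan ExperienceReplayBuffer API (populate/sample):
# fill the buffer to the warm-up size first, then interleave one
# environment step with every sampled training batch.
def batch_generator(buffer, initial, batch_size):
    buffer.populate(initial)          # warm-up: collect initial transitions
    while True:
        buffer.populate(1)            # one new transition per training step
        yield buffer.sample(batch_size)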
        eval_states = buffer.sample(STATES_TO_EVALUATE)
        eval_states = [
            np.array(transition.state, copy=False)
            for transition in eval_states
        ]
        engine.state.eval_states = np.array(eval_states, copy=False)
    return {
        "loss": loss_v.item(),
        "epsilon": selector.epsilon,
    }

engine = Engine(process_batch)
tb = common.setup_ignite(engine, exp_source, f"conv1d-{args.run}",
                         extra_metrics=("values_mean",))

@engine.on(ptan.ignite.PeriodEvents.ITERS_1000_COMPLETED)
def sync_eval(engine: Engine):
    tgt_net.sync()
    mean_val = common.calc_values_of_states(
        engine.state.eval_states, net, device=device)
    engine.state.metrics["values_mean"] = mean_val
    if getattr(engine.state, "best_mean_val", None) is None:
        engine.state.best_mean_val = mean_val
    if engine.state.best_mean_val < mean_val:
net = dqn_extra.NoisyDQN(env.observation_space.shape,
                         env.action_space.n).to(device)
tgt_net = ptan.agent.TargetNet(net)
selector = ptan.actions.ArgmaxActionSelector()
agent = ptan.agent.DQNAgent(net, selector, device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env, agent, gamma=params.gamma)
buffer = ptan.experience.ExperienceReplayBuffer(
    exp_source, buffer_size=params.replay_size)
optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

def process_batch(engine, batch):
    optimizer.zero_grad()
    loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                  gamma=params.gamma, device=device)
    loss_v.backward()
    optimizer.step()
    if engine.state.iteration % params.target_net_sync == 0:
        tgt_net.sync()
    if engine.state.iteration % NOISY_SNR_EVERY_ITERS == 0:
        for layer_idx, sigma_l2 in enumerate(net.noisy_layers_sigma_snr()):
            engine.state.metrics[f'snr_{layer_idx+1}'] = sigma_l2
    return {
        "loss": loss_v.item(),
    }

engine = Engine(process_batch)
common.setup_ignite(engine, params, exp_source, NAME,
                    extra_metrics=('snr_1', 'snr_2'))
engine.run(common.batch_generator(buffer, params.replay_initial,
                                  params.batch_size))
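# The SNR metrics logged above come from dqn_extra.NoisyDQN. A plausible
# sketch of noisy_layers_sigma_snr, assuming the network keeps its
# NoisyLinear layers in self.noisy_layers and each layer has weight and
# sigma_weight parameters: the ratio of weight RMS to noise-sigma RMS,
# so a falling value means the layer relies more on noise (exploration).
def noisy_layers_sigma_snr(self):
    return [
        ((layer.weight ** 2).mean().sqrt() /
         (layer.sigma_weight ** 2).mean().sqrt()).item()
        for layer in self.noisy_layers
    ]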
"loss": loss_v.item(), "epsilon": 0.0, } if engine.state.iteration % params.target_net_sync == 0: tgt_net.sync() if args.params.startswith("egreedy"): epsilon_tracker.frame(engine.state.iteration - epsilon_tracker_frame) res['epsilon'] = selector.epsilon # reset noise every training step, this is fine in off-policy method if args.params == 'noisynet': net.sample_noise() return res engine = Engine(process_batch) common.setup_ignite(engine, params, exp_source, args.name, extra_metrics=( 'test_reward', 'avg_test_reward', 'test_steps')) @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED) def check_reward_trigger(trainer: Engine): global training_enabled, epsilon_tracker_frame if training_enabled: return # check trigger condition to enable epsilon decay if trainer.state.episode_reward > -200: training_enabled = True epsilon_tracker_frame = trainer.state.iteration print("Epsilon decay triggered!") @engine.on(ptan_ignite.PeriodEvents.ITERS_1000_COMPLETED) def test_network(engine): net.train(False)
    device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env, agent, gamma=params.gamma, steps_count=args.n)
buffer = ptan.experience.ExperienceReplayBuffer(
    experience_source=exp_source, buffer_size=params.replay_size)
optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

def process_batch(engine, batch):
    optimizer.zero_grad()
    # transitions already span n steps, so bootstrap with gamma**n
    loss_v = common.calc_loss_dqn(batch, net, tgt_net,
                                  gamma=params.gamma**args.n,
                                  device=device)
    loss_v.backward()
    optimizer.step()
    epsilon_tracker.frame(engine.state.iteration)
    if engine.state.iteration % params.target_net_sync == 0:
        tgt_net.sync()
    return {"loss": loss_v.item(), "epsilon": selector.epsilon}

engine = Engine(process_batch)
common.setup_ignite(engine, params, exp_source, f"{NAME}={args.n}")
engine.run(
    common.batch_generator(buffer, params.replay_initial,
                           params.batch_size))
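# Why gamma**args.n in the loss: ExperienceSourceFirstLast with
# steps_count=n folds the n intermediate rewards into one transition, so
# the bootstrap term needs gamma^n. A small illustration (the function
# name is hypothetical, not part of the library):
def n_step_target(rewards, gamma, q_next_max):
    # rewards: the n intermediate rewards r_0 .. r_{n-1}
    res = 0.0
    for r in reversed(rewards):
        res = r + gamma * res         # accumulates sum of gamma^i * r_i
    return res + gamma ** len(rewards) * q_next_max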
            gamma=PARAMS.gamma, device=device)
        loss_v.backward()
        opt.step()
        res[name + "_loss"] = loss_v.item()
    if engine.state.iteration % PARAMS.target_net_sync == 0:
        tgt_net.sync()
    epsilon_tracker.frame(engine.state.iteration)
    res['epsilon'] = action_selector.epsilon
    return res

engine = Engine(process_batches)
common.setup_ignite(engine, PARAMS, exp_source, args.name,
                    extra_metrics=('test_reward', 'test_steps'))
best_test_reward = None

# @engine.on(ptan_ignite.PeriodEvents.ITERS_10000_COMPLETED)
# def test_network(engine):
#     net.train(False)
#     reward, steps = test_model(net, device, config)
#     net.train(True)
#     engine.state.metrics['test_reward'] = reward
#     engine.state.metrics['test_steps'] = steps
#     print("Test done: got %.3f reward after %.2f steps" % (
#         reward, steps
#     ))
#
    loss_v.backward()
    optimizer.step()
    epsilon_tracker.frame(engine.state.iteration)
    if engine.state.iteration % EVAL_EVERY_FRAME == 0:
        eval_states = getattr(engine.state, "eval_states", None)
        if eval_states is None:
            eval_states = buffer.sample(STATES_TO_EVALUATE)
            eval_states = [
                np.array(transition.state, copy=False)
                for transition in eval_states
            ]
            eval_states = np.array(eval_states, copy=False)
            # cache under "eval_states" so the getattr above finds it
            engine.state.eval_states = eval_states
        evaluate_states(eval_states, net, device, engine)
    return {"loss": loss_v.item(), "epsilon": selector.epsilon}

engine = Engine(process_batch)
common.setup_ignite(engine, params, exp_source, NAME,
                    extra_metrics=("adv", "val"))
engine.run(
    common.batch_generator(buffer, params.replay_initial,
                           params.batch_size))
    loss_v.backward()
    optimizer.step()
    epsilon_tracker.frame(engine.state.iteration)
    if engine.state.iteration % params.target_net_sync == 0:
        tgt_net.sync()
    if engine.state.iteration % EVAL_EVERY_FRAME == 0:
        eval_states = getattr(engine.state, "eval_states", None)
        if eval_states is None:
            eval_states = buffer.sample(STATES_TO_EVALUATE)
            eval_states = [
                np.array(transition.state, copy=False)
                for transition in eval_states
            ]
            eval_states = np.array(eval_states, copy=False)
            engine.state.eval_states = eval_states
        evaluate_states(eval_states, net, device, engine)
    return {
        "loss": loss_v.item(),
        "epsilon": selector.epsilon,
    }

engine = Engine(process_batch)
common.setup_ignite(engine, params, exp_source, NAME,
                    extra_metrics=('adv', 'val'))
engine.run(
    common.batch_generator(buffer, params.replay_initial,
                           params.batch_size))
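# A hedged sketch of evaluate_states, which fills the "adv" and "val"
# metrics declared above; it assumes the dueling network exposes an
# adv_val(states) helper returning the two streams separately:
import torch

@torch.no_grad()
def evaluate_states(states, net, device, engine):
    s_v = torch.tensor(states).to(device)
    adv, val = net.adv_val(s_v)
    engine.state.metrics['adv'] = adv.mean().item()
    engine.state.metrics['val'] = val.mean().item()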
def process_batch(engine, batch):
    optimizer.zero_grad()
    loss_v = common.calc_loss_dqn(batch, net, tgt_net,
                                  gamma=params.gamma, device=device)
    loss_v.backward()
    optimizer.step()
    if engine.state.iteration % params.target_net_sync == 0:
        tgt_net.sync()
    if engine.state.iteration % NOISY_SNR_EVERY_ITERS == 0:
        for layer_idx, sigma_l2 in enumerate(net.noisy_layers_sigma_snr()):
            engine.state.metrics[f"snr_{layer_idx + 1}"] = sigma_l2
    return {"loss": loss_v.item()}

engine = Engine(process_batch)
common.setup_ignite(engine, params, exp_source, NAME,
                    extra_metrics=("snr_1", "snr_2"))
engine.run(
    common.batch_generator(buffer, params.replay_initial,
                           params.batch_size))
    if getattr(engine.state, "eval_states", None) is None:
        eval_states = buffer.sample(STATES_TO_EVALUATE)
        eval_states = [
            np.array(transition.state, copy=False)
            for transition in eval_states
        ]
        engine.state.eval_states = np.array(eval_states, copy=False)
    return {
        "loss": loss_v.item(),
        "epsilon": selector.epsilon,
    }

engine = Engine(process_batch)
tb = common.setup_ignite(engine, exp_source, f"simple-{args.run}",
                         extra_metrics=('values_mean',))

@engine.on(ptan.ignite.PeriodEvents.ITERS_1000_COMPLETED)
def sync_eval(engine: Engine):
    tgt_net.sync()
    mean_val = common.calc_values_of_states(
        engine.state.eval_states, net, device=device)
    engine.state.metrics["values_mean"] = mean_val
    if getattr(engine.state, "best_mean_val", None) is None:
        engine.state.best_mean_val = mean_val
    if engine.state.best_mean_val < mean_val:
        print("%d: Best mean value updated %.3f -> %.3f" %
        opt.step()
        res[name + "_loss"] = loss_v.item()
        loss += loss_v.item()
    if engine.state.iteration % PARAMS.target_net_sync == 0:
        tgt_net.sync()
    epsilon_tracker.frame(engine.state.iteration)
    res['epsilon'] = action_selector.epsilon
    res['loss'] = loss
    return res

engine = Engine(process_batches)
common.setup_ignite(engine, PARAMS, tiger_exp_source, args.name,
                    extra_metrics=('test_reward_deer', 'test_steps_deer',
                                   'test_reward_tiger', 'test_steps_tiger'))
best_test_reward_deer = None
best_test_reward_tiger = None

@engine.on(ptan_ignite.PeriodEvents.ITERS_10000_COMPLETED)
def test_network(engine):
    net_deer.train(False)
    net_tiger.train(False)
    deer_reward, deer_steps, tiger_reward, tiger_steps = test_model(
        net_deer, net_tiger, device, config)
    net_deer.train(True)
    net_tiger.train(True)
    engine.state.metrics['test_reward_deer'] = deer_reward
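# tgt_net.sync() throughout these listings refreshes the frozen target
# copy; in ptan, TargetNet.sync is essentially a state_dict copy from the
# online model into the target model:
def sync(self):
    self.target_model.load_state_dict(self.model.state_dict())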
fps_handler = ptan_ignite.EpisodeFPSHandler()
batch_generator = BatchGenerator(buffer, exp_queue, fps_handler,
                                 params.replay_initial, params.batch_size)

def process_batch(engine, batch):
    optimizer.zero_grad()
    loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                  gamma=params.gamma, device=device)
    loss_v.backward()
    optimizer.step()
    epsilon_tracker.frame(engine.state.iteration)
    if engine.state.iteration % params.target_net_sync == 0:
        print('syncing...')
        tgt_net.sync()
    return {'loss': loss_v.item(), "epsilon": batch_generator.epsilon}

engine = Engine(process_batch)
ptan_ignite.EndOfEpisodeHandler(batch_generator,
                                bound_avg_reward=17.0).attach(engine)
fps_handler.attach(engine, manual_step=True)
common.setup_ignite(engine, params, exp_source, params.run_name)
engine.run(batch_generator)
play_proc.kill()
play_proc.join()
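# BatchGenerator above replaces common.batch_generator for the parallel
# setup: transitions arrive from the play process through exp_queue
# instead of being generated in-line. A simplified sketch (attribute
# names beyond the constructor arguments, and the epsilon/episode
# bookkeeping, are omitted assumptions):
class BatchGenerator:
    def __init__(self, buffer, exp_queue, fps_handler,
                 initial, batch_size):
        self.buffer = buffer
        self.exp_queue = exp_queue
        self.fps_handler = fps_handler
        self.initial = initial
        self.batch_size = batch_size

    def __iter__(self):
        while True:
            # drain everything the play process has produced so far
            while not self.exp_queue.empty():
                self.buffer._add(self.exp_queue.get())
            if len(self.buffer) < self.initial:
                continue              # still warming up the buffer
            yield self.buffer.sample(self.batch_size)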
    gamma=params.gamma, steps_count=args.n)
buffer = ptan.experience.ExperienceReplayBuffer(
    exp_source, buffer_size=params.replay_size)
optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

def process_batch(engine, batch):
    optimizer.zero_grad()
    loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                  gamma=params.gamma**args.n,
                                  device=device)
    loss_v.backward()
    optimizer.step()
    epsilon_tracker.frame(engine.state.iteration)
    if engine.state.iteration % params.target_net_sync == 0:
        tgt_net.sync()
    return {
        "loss": loss_v.item(),
        "epsilon": selector.epsilon,
    }

engine = Engine(process_batch)
common.setup_ignite(engine, params, exp_source, f"{NAME}-{args.n}")
engine.run(
    common.batch_generator(buffer, params.replay_initial,
                           params.batch_size))