def process_batch(engine_for_batch, batch):
    optimizer.zero_grad()
    loss_v = utils.calc_loss_dqn(
        batch, net, target_net.target_model,
        gamma=params.gamma, device=device)
    loss_v.backward()
    optimizer.step()
    epsilon_tracker.frame(engine_for_batch.state.iteration)
    if engine_for_batch.state.iteration % params.target_net_sync == 0:
        target_net.sync()
    if engine_for_batch.state.iteration % EVAL_EVERY_FRAME == 0:
        # Lazily sample and cache a fixed set of states used to track
        # the mean Q-value during training.
        eval_states = getattr(engine_for_batch.state, "eval_states", None)
        if eval_states is None:
            eval_states = buffer.sample(STATES_TO_EVALUATE)
            eval_states = [
                np.array(transition.state, copy=False)
                for transition in eval_states
            ]
            eval_states = np.array(eval_states, copy=False)
            engine_for_batch.state.eval_states = eval_states
        evaluate_states(eval_states, net, device, engine_for_batch)
    return {
        "loss": loss_v.item(),
        "epsilon": selector.epsilon,
    }
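# The call above relies on an evaluate_states() helper. Below is a minimal
# sketch of what it could look like: the function name and arguments match the
# excerpt, but the metric key "values_mean" and the assumption that the network
# accepts a float state tensor directly are mine, not the original code.
import numpy as np
import torch


@torch.no_grad()
def evaluate_states(states, net, device, engine):
    # Report the mean best-action Q-value over a fixed set of held-out states;
    # a rising value is a rough indicator of training progress.
    states_v = torch.as_tensor(np.asarray(states)).float().to(device)
    best_q = net(states_v).max(dim=1)[0]
    engine.state.metrics["values_mean"] = best_q.mean().item()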
def process_batch(engine_for_batch, batch):
    optimizer.zero_grad()
    loss_v = utils.calc_loss_dqn(
        batch, net, target_net.target_model,
        gamma=params.gamma, device=device)
    loss_v.backward()
    optimizer.step()
    epsilon_tracker.frame(engine_for_batch.state.iteration)
    if engine_for_batch.state.iteration % params.target_net_sync == 0:
        target_net.sync()
    return {
        "loss": loss_v.item(),
        "epsilon": selector.epsilon,
    }
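# How process_batch() is typically driven: a sketch assuming an infinite batch
# generator over the ptan replay buffer and a PyTorch Ignite Engine.
# batch_generator itself is an assumed helper (not part of ptan or Ignite),
# and the init_replay/batch_size parameter names follow the rest of the excerpt.
from ignite.engine import Engine


def batch_generator(buffer, initial, batch_size):
    # Pre-fill the replay buffer, then keep adding one transition per step and
    # yield training batches forever; the Engine calls process_batch once per
    # yielded batch.
    buffer.populate(initial)
    while True:
        buffer.populate(1)
        yield buffer.sample(batch_size)


engine = Engine(process_batch)
engine.run(batch_generator(buffer, params.init_replay, params.batch_size))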
def process_batch(engine_for_batch, batch):
    optimizer.zero_grad()
    loss_v = utils.calc_loss_dqn(
        batch, net, target_net.target_model,
        gamma=params.gamma, device=device)
    loss_v.backward()
    optimizer.step()
    if engine_for_batch.state.iteration % params.target_net_sync == 0:
        target_net.sync()
    if engine_for_batch.state.iteration % NOISY_SNR_EVERY_ITERS == 0:
        for layer_idx, sigma_l2 in enumerate(net.noisy_layers_sigma_snr()):
            engine_for_batch.state.metrics[
                f'snr_{layer_idx + 1}'] = sigma_l2
    return {
        "loss": loss_v.item(),
    }
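# net.noisy_layers_sigma_snr() above is assumed to return one signal-to-noise
# ratio per noisy layer. A sketch of that computation as a free function: the
# excerpt calls it as a method on the network, and the noisy_layers container
# and the sigma_weight attribute name are assumptions about the NoisyLinear
# implementation, not a documented API.
def noisy_layers_sigma_snr(noisy_layers):
    # RMS of the learned weights divided by RMS of the learned noise scales;
    # a growing ratio means the injected noise matters less as training goes on.
    return [
        ((layer.weight ** 2).mean().sqrt() /
         (layer.sigma_weight ** 2).mean().sqrt()).item()
        for layer in noisy_layers
    ]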
with ptan.common.utils.RewardTracker(writer) as tracker:
    while True:
        frame += 1
        eps_tracker.frame(frame)
        buffer.populate(1)
        reward = exp_src.pop_total_rewards()
        if reward:
            episode += 1
            mean = tracker.reward(
                reward[-1], frame, epsilon=selector.epsilon)
            if mean_monitor(mean):
                break
        if len(buffer) < params.init_replay:
            continue
        optimizer.zero_grad()
        batch = buffer.sample(params.batch_size)
        loss_v = utils.calc_loss_dqn(
            batch, net, tgt_net, params.gamma ** params.steps, device)
        loss_v.backward()
        optimizer.step()
        if mean and selector.epsilon <= params.eps_final:
            # Adjust the learning rate only after exploration has annealed.
            lr_scheduler.step(mean_monitor.best_reward)
            writer.add_scalar(
                'LearningRate',
                scalar_value=lr_scheduler._last_lr[0],  # _last_lr is a list
                global_step=frame)
        del batch, loss_v
        if frame % 1000 == 0:
            tgt_net.sync()
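# mean_monitor above is assumed to be a small callable that remembers the best
# smoothed reward (best_reward, which the ReduceLROnPlateau scheduler is
# stepped with) and signals when the game is solved. A sketch under that
# assumption; the class name and threshold handling are mine:
class MeanRewardMonitor:
    def __init__(self, solve_reward):
        self.solve_reward = solve_reward
        self.best_reward = None

    def __call__(self, mean_reward):
        # Returns True once the smoothed reward reaches the solve threshold.
        if mean_reward is None:
            return False
        if self.best_reward is None or mean_reward > self.best_reward:
            self.best_reward = mean_reward
        return mean_reward >= self.solve_reward


# Hypothetical usage: mean_monitor = MeanRewardMonitor(params.stop_reward)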