Example #1

def process_batch(engine_for_batch, batch):
    optimizer.zero_grad()
    loss_v = utils.calc_loss_dqn(batch, net, target_net.target_model,
                                 gamma=params.gamma, device=device)
    loss_v.backward()
    optimizer.step()
    epsilon_tracker.frame(engine_for_batch.state.iteration)

    # Periodically copy the online network weights into the target network.
    if engine_for_batch.state.iteration % params.target_net_sync == 0:
        target_net.sync()

    # Every EVAL_EVERY_FRAME iterations, evaluate the network on a fixed
    # set of states sampled once from the replay buffer and cached on the
    # engine state.
    if engine_for_batch.state.iteration % EVAL_EVERY_FRAME == 0:
        eval_states = getattr(engine_for_batch.state, "eval_states", None)

        if eval_states is None:
            eval_states = buffer.sample(STATES_TO_EVALUATE)
            eval_states = [np.array(transition.state, copy=False)
                           for transition in eval_states]
            eval_states = np.array(eval_states, copy=False)
            engine_for_batch.state.eval_states = eval_states

        evaluate_states(eval_states, net, device, engine_for_batch)

    return {
        "loss": loss_v.item(),
        "epsilon": selector.epsilon,
    }
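For context, a process function like this is meant to be driven by an ignite Engine over an endless stream of replay-buffer batches. Below is a minimal sketch of that wiring; the batch_generator helper and the params.replay_initial / params.batch_size names are assumptions for illustration, not part of the listing above.

from ignite.engine import Engine

def batch_generator(buffer, initial, batch_size):
    # Pre-fill the replay buffer, then yield one training batch per
    # environment step, forever; the Engine calls process_batch on each.
    buffer.populate(initial)
    while True:
        buffer.populate(1)
        yield buffer.sample(batch_size)

engine = Engine(process_batch)
engine.run(batch_generator(buffer, params.replay_initial, params.batch_size))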
Example #2

def process_batch(engine_for_batch, batch):
    optimizer.zero_grad()
    loss_v = utils.calc_loss_dqn(batch, net, target_net.target_model,
                                 gamma=params.gamma, device=device)
    loss_v.backward()
    optimizer.step()
    epsilon_tracker.frame(engine_for_batch.state.iteration)

    # Periodically copy the online network weights into the target network.
    if engine_for_batch.state.iteration % params.target_net_sync == 0:
        target_net.sync()

    return {
        "loss": loss_v.item(),
        "epsilon": selector.epsilon,
    }
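Both examples above call epsilon_tracker.frame(...) to anneal exploration as training progresses. A sketch of such a tracker, assuming a linear decay controlled by eps_start, eps_final, and eps_frames hyperparameters (these names are assumptions, not confirmed by the listing):

class EpsilonTracker:
    def __init__(self, selector, params):
        self.selector = selector
        self.params = params
        self.frame(0)

    def frame(self, frame_idx):
        # Linearly decay epsilon from eps_start down to eps_final
        # over the first eps_frames iterations.
        eps = self.params.eps_start - frame_idx / self.params.eps_frames
        self.selector.epsilon = max(self.params.eps_final, eps)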
Example #3

def process_batch(engine_for_batch, batch):
    optimizer.zero_grad()
    loss_v = utils.calc_loss_dqn(batch, net, target_net.target_model,
                                 gamma=params.gamma, device=device)
    loss_v.backward()
    optimizer.step()

    if engine_for_batch.state.iteration % params.target_net_sync == 0:
        target_net.sync()

    # Periodically record the signal-to-noise ratio of every noisy layer
    # as an engine metric.
    if engine_for_batch.state.iteration % NOISY_SNR_EVERY_ITERS == 0:
        for layer_idx, sigma_l2 in enumerate(net.noisy_layers_sigma_snr()):
            engine_for_batch.state.metrics[f'snr_{layer_idx + 1}'] = sigma_l2

    return {
        "loss": loss_v.item(),
    }
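The SNR metric above comes from net.noisy_layers_sigma_snr(), which is assumed to report, per NoisyLinear layer, the ratio between the RMS magnitude of the learned weights and the RMS magnitude of the learned noise scale sigma_weight. A sketch of such a method under that assumption (the self.noisy_layers attribute is likewise assumed):

def noisy_layers_sigma_snr(self):
    # Signal-to-noise ratio per noisy layer: RMS(weight) / RMS(sigma_weight).
    # As sigma shrinks relative to the weights during training, SNR grows,
    # indicating the layer relies less on injected noise.
    return [
        ((layer.weight ** 2).mean().sqrt() /
         (layer.sigma_weight ** 2).mean().sqrt()).item()
        for layer in self.noisy_layers
    ]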
Example #4
# Frame and episode counters; assumed to be initialized before the
# tracker in the original source, which this fragment omits.
frame = 0
episode = 0

with ptan.common.utils.RewardTracker(writer) as tracker:
    while True:
        frame += 1
        eps_tracker.frame(frame)
        buffer.populate(1)
        reward = exp_src.pop_total_rewards()
        if reward:
            episode += 1
            mean = tracker.reward(reward[-1], frame,
                                  epsilon=selector.epsilon)
            if mean_monitor(mean):
                break

        # Wait until the replay buffer holds enough transitions.
        if len(buffer) < params.init_replay:
            continue

        optimizer.zero_grad()
        batch = buffer.sample(params.batch_size)
        # gamma**steps discounts across the n-step transitions in the buffer.
        loss_v = utils.calc_loss_dqn(
            batch, net, tgt_net, params.gamma ** params.steps, device)
        loss_v.backward()
        optimizer.step()

        # Once epsilon has fully annealed, let the LR scheduler react to
        # the best mean reward seen so far. _last_lr is a list (one entry
        # per parameter group), so index it before logging.
        if mean and selector.epsilon <= params.eps_final:
            lr_scheduler.step(mean_monitor.best_reward)
            writer.add_scalar('LearningRate',
                              scalar_value=lr_scheduler._last_lr[0],
                              global_step=frame)
        del batch, loss_v

        # Hard-sync the target network every 1000 frames.
        if frame % 1000 == 0:
            tgt_net.sync()
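The lr_scheduler.step(best_reward) call above matches the torch.optim.lr_scheduler.ReduceLROnPlateau API driven by a reward metric. A sketch of a compatible setup and a minimal mean_monitor callable; the MeanMonitor class, the factor/patience values, and params.stop_reward are illustrative assumptions, not the original definitions.

from torch.optim.lr_scheduler import ReduceLROnPlateau

lr_scheduler = ReduceLROnPlateau(optimizer, mode='max',
                                 factor=0.5, patience=10_000)

class MeanMonitor:
    # Tracks the best mean reward seen so far and signals when training
    # should stop (hypothetical stand-in for the original mean_monitor).
    def __init__(self, stop_reward):
        self.stop_reward = stop_reward
        self.best_reward = float('-inf')

    def __call__(self, mean):
        if mean is not None:
            self.best_reward = max(self.best_reward, mean)
        return mean is not None and mean >= self.stop_reward

mean_monitor = MeanMonitor(params.stop_reward)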