Example #1
                info = ppo.train_ac(s_tensor,
                                    a_tensor,
                                    adv_tensor,
                                    r_tensor,
                                    v_tensor,
                                    is_clip_v=args.is_clip_v)

                if args.debug:
                    logger.store(aloss=info["aloss"])
                    logger.store(vloss=info["vloss"])
                    logger.store(entropy=info["entropy"])
                    logger.store(kl=info["kl"])

        # optionally step the learning-rate schedule once per iteration
        if args.anneal_lr:
            ppo.lr_scheduler()
        if args.debug:
            writer.add_scalar("aloss",
                              logger.get_stats("aloss")[0],
                              global_step=iter)
            writer.add_scalar("vloss",
                              logger.get_stats("vloss")[0],
                              global_step=iter)
            writer.add_scalar("entropy",
                              logger.get_stats("entropy")[0],
                              global_step=iter)
            writer.add_scalar("kl",
                              logger.get_stats("kl")[0],
                              global_step=iter)

        logger.log_tabular('Epoch', iter)
        logger.log_tabular("reward", total_reward / epi)
        if args.debug:
            logger.log_tabular("aloss", with_min_and_max=True)
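This and the following examples rely on a store()/get_stats()/log_tabular() logger whose definition is not part of the listing. As a rough, hypothetical stand-in (the MiniLogger name and its internals are assumptions, not this project's code), the indexing the snippets depend on (get_stats(key)[0] as the mean, and a (mean, std, min, max) tuple when with_min_and_max=True) could be reproduced like this:

from collections import defaultdict

import numpy as np


class MiniLogger:
    """Hypothetical stand-in for the logger used in the snippets."""

    def __init__(self):
        self.buffer = defaultdict(list)

    def store(self, **kwargs):
        # accumulate scalar values under each key, one call per minibatch
        for key, value in kwargs.items():
            self.buffer[key].append(float(value))

    def get_stats(self, key, with_min_and_max=False):
        vals = np.asarray(self.buffer[key], dtype=np.float64)
        if with_min_and_max:
            # (mean, std, min, max): index 3 is the max, as used for the target_kl check
            return vals.mean(), vals.std(), vals.min(), vals.max()
        return vals.mean(), vals.std()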
Example #2
                v_tensor = torch.tensor(v, dtype=torch.float32, device=device)

                info = ppo.train_ac(s_tensor,
                                    a_tensor,
                                    adv_tensor,
                                    r_tensor,
                                    v_tensor,
                                    is_clip_v=args.is_clip_v)

                if args.debug:
                    logger.store(aloss=info["aloss"])
                    logger.store(vloss=info["vloss"])
                    logger.store(entropy=info["entropy"])
                    logger.store(kl=info["kl"])

            # early stopping: abandon the remaining update epochs for this
            # iteration once the mean KL divergence exceeds args.target_kl
            if logger.get_stats("kl")[0] > args.target_kl:
                print("stop at:", str(i))
                break

        if args.anneal_lr:
            ppo.lr_scheduler()

        # writer.add_scalar("test_reward", logger.get_stats("test_reward")[0], global_step=iter)
        writer.add_scalar("reward",
                          logger.get_stats("reward")[0],
                          global_step=iter)
        writer.add_histogram("action",
                             np.array(replay.action),
                             global_step=iter)
        if args.debug:
            writer.add_scalar("aloss",
                              logger.get_stats("aloss")[0],
                              global_step=iter)
Example #3
            for (s, a, r, adv, v) in replay.get_batch(batch=args.batch):
                s_tensor = torch.tensor(s, dtype=torch.float32, device=device)
                a_tensor = torch.tensor(a, dtype=torch.float32, device=device)
                adv_tensor = torch.tensor(adv, dtype=torch.float32, device=device)
                r_tensor = torch.tensor(r, dtype=torch.float32, device=device)
                v_tensor = torch.tensor(v, dtype=torch.float32, device=device)

                info = ppo.train_ac(s_tensor, a_tensor, adv_tensor, r_tensor, v_tensor, is_clip_v=args.is_clip_v)

                if args.debug:
                    logger.store(aloss=info["aloss"])
                    logger.store(vloss=info["vloss"])
                    logger.store(entropy=info["entropy"])
                    logger.store(kl=info["kl"])

            # early stopping once the KL statistic exceeds args.target_kl
            # (index 3 of the with_min_and_max tuple, i.e. the max under a
            # (mean, std, min, max) convention)
            if logger.get_stats("kl", with_min_and_max=True)[3] > args.target_kl:
                print("stop at:", str(i))
                break

        if args.anneal_lr:
            ppo.lr_scheduler()

        ppo.eval()
        test_a = []
        test_a_std = []
        for i in range(args.test_epoch):
            test_obs = test_env.reset()
            test_obs = state_norm(test_obs, update=False)
            test_rew = 0

            while True:
Example #4
                v_tensor = torch.tensor(v, dtype=torch.float32, device=device)

                info = ppo.train_ac(s_tensor,
                                    a_tensor,
                                    adv_tensor,
                                    r_tensor,
                                    v_tensor,
                                    is_clip_v=args.is_clip_v)

                if args.debug:
                    logger.store(aloss=info["aloss"])
                    logger.store(vloss=info["vloss"])
                    logger.store(entropy=info["entropy"])
                    logger.store(kl=info["kl"])

            if logger.get_stats("kl",
                                with_min_and_max=True)[3] > args.target_kl:
                print("stop at:", str(i))
                break

        if args.anneal_lr:
            ppo.lr_scheduler()

        ppo.eval()
        for i in range(args.test_epoch):
            test_obs = test_env.reset()
            test_obs = state_norm(test_obs, update=False)
            test_rew = 0

            while True:
                a_tensor, var = ppo.actor(test_obs)
                a_tensor = torch.squeeze(a_tensor, dim=0)
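The evaluation rollout in Example #4 is cut off by the snippet window. Purely as a sketch of how such a loop commonly continues (the evaluate_episode name, the argument names, and the Gym-style (obs, reward, done, info) step API are assumptions, not code from this project):

import torch


def evaluate_episode(policy, env, normalize):
    # roll out one deterministic episode with the policy mean action,
    # normalizing observations without updating the running statistics
    obs = normalize(env.reset(), update=False)
    episode_reward = 0.0
    while True:
        with torch.no_grad():
            mean_action, _ = policy(torch.as_tensor(obs, dtype=torch.float32))
        action = torch.squeeze(mean_action, dim=0).cpu().numpy()
        obs, reward, done, _ = env.step(action)
        obs = normalize(obs, update=False)
        episode_reward += reward
        if done:
            return episode_reward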
Example #5
            # add L2 penalty on latent representation
            # see https://arxiv.org/pdf/1903.12436.pdf
            latent_loss = (0.5 * h.pow(2).sum(1)).mean()
            loss = rec_loss + args.decoder_latent_lambda * latent_loss

            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            loss.backward()

            encoder_optimizer.step()
            decoder_optimizer.step()

            # store a plain float so the logger does not keep the autograd graph alive
            logger.store(loss=loss.item())

        writer.add_scalar("loss",
                          logger.get_stats("loss")[0],
                          global_step=iter)
        logger.log_tabular('Epoch', iter)
        logger.log_tabular("loss", with_min_and_max=True)
        logger.dump_tabular()

        # periodically checkpoint the encoder/decoder weights
        if iter % args.log_every == 0:
            state = {
                "encoder": encoder.state_dict(),
                "decoder": decoder.state_dict(),
            }

            torch.save(
                state,
                os.path.join(logger.output_dir, "checkpoints",
                             str(iter) + '.pth'))
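Example #5 implements the L2 penalty on the latent representation from the paper cited in the snippet (https://arxiv.org/pdf/1903.12436.pdf): the objective is the reconstruction term plus args.decoder_latent_lambda times 0.5 * ||h||^2 averaged over the batch. A minimal sketch of that objective in isolation (the rae_loss name and the mean-squared-error reconstruction term are assumptions; rec_loss is not defined inside the visible snippet):

import torch.nn.functional as F


def rae_loss(encoder, decoder, obs, latent_lambda):
    # encode, reconstruct, and combine the reconstruction error with an
    # L2 penalty on the latent code h (same form as in Example #5)
    h = encoder(obs)
    recon = decoder(h)
    rec_loss = F.mse_loss(recon, obs)               # assumed reconstruction term
    latent_loss = (0.5 * h.pow(2).sum(1)).mean()    # 0.5 * ||h||_2^2, batch mean
    return rec_loss + latent_lambda * latent_loss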