Example #1
def vpg(env_config, ac_type, ac_kwargs, gamma, lam, epochs, steps_per_epoch,
        lr, train_v_iters, max_ep_len, logger_kwargs, seed):
    logger = EpochLogger(**logger_kwargs)
    configs = locals().copy()
    configs.pop("logger")
    logger.save_config(configs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = make_env(env_config)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    obs_ph, a_ph, adv_ph, ret_ph, logp_old_ph = core.placeholders(
        obs_dim, act_dim, None, None, None)

    actor_critic = gaussian_mlp_actor_critic
    pi, logp, logp_pi, v = actor_critic(obs_ph, a_ph, **ac_kwargs)

    all_phs = [obs_ph, a_ph, adv_ph, ret_ph, logp_old_ph]
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    buf = VPGBuffer(obs_dim, act_dim, steps_per_epoch, gamma, lam)

    # VPG objectives
    pi_loss = -tf.reduce_mean(logp * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    # a sample estimate for KL-divergence, easy to compute
    approx_kl = tf.reduce_mean(logp_old_ph - logp)
    # a sample estimate for entropy, also easy to compute
    approx_ent = tf.reduce_mean(-logp)

    # Optimizers
    train_pi = tf.train.AdamOptimizer(learning_rate=lr).minimize(pi_loss)
    train_v = tf.train.AdamOptimizer(learning_rate=lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    def update():
        buffer_data = buf.get()
        #util.plot_adv(data[0] * act_high, data[1], logger.output_dir + "/ep_adv%s.png" % epoch)
        inputs = {k: v for k, v in zip(all_phs, buffer_data)}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        sess.run(train_pi, feed_dict=inputs)

        # Training
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, v_new = sess.run(
            [pi_loss, v_loss, approx_kl, v], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    real_action = env.action_space.default()

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={obs_ph: o.reshape(1, -1)})

            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            delta = np.exp(a[0])
            delta = np.clip(delta, 0.9, 1.1)
            real_action = env.action_space.clip(real_action * delta)

            o, r, d, _ = env.step(real_action)
            ep_ret += r
            ep_len += 1

            if ep_len == max_ep_len or t == steps_per_epoch - 1:
                last_val = sess.run(v, feed_dict={obs_ph: o.reshape(1, -1)})
                #print(last_val)
                buf.finish_path(last_val)
                logger.store(EpRet=ep_ret, EpLen=ep_len)

                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                real_action = env.action_space.default()

        # Perform VPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
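
A minimal sketch of how this vpg function might be invoked. The env_config contents depend on the caller's make_env helper and are assumptions here, as are the output paths; ac_kwargs is forwarded to gaussian_mlp_actor_critic.

# Hypothetical invocation; env_config keys and output paths are assumptions.
if __name__ == '__main__':
    vpg(env_config=dict(name='my-env'),   # consumed by make_env (assumed)
        ac_type='gaussian_mlp',
        ac_kwargs=dict(hidden_sizes=(64, 64)),
        gamma=0.99,
        lam=0.97,
        epochs=50,
        steps_per_epoch=4000,
        lr=3e-4,
        train_v_iters=80,
        max_ep_len=1000,
        logger_kwargs=dict(output_dir='data/vpg', exp_name='vpg'),
        seed=0)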
Example #2
def sac(args, steps_per_epoch=1500, replay_size=int(1e6), gamma=0.99,
        polyak=0.995, lr=1e-3, alpha=3e-4, batch_size=128, start_steps=1000,
        update_after=1000, update_every=1, num_test_episodes=10, max_ep_len=150,
        logger_kwargs=dict(), save_freq=1):

    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)

    torch.set_num_threads(torch.get_num_threads())

    actor_critic = core.MLPActorCritic
    ac_kwargs = dict(hidden_sizes=[args.hid] * args.l)
    gamma = args.gamma
    seed = args.seed
    epochs = args.epochs
    logger_tensor = Logger(logdir=args.logdir, run_name="{}-{}".format(args.model_name, time.ctime()))

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env = ML1.get_train_tasks('reach-v1')  # Create an environment for the `reach-v1` task
    tasks = env.sample_tasks(1)  # Sample a task (in this case, a goal variation)
    env.set_task(tasks[0])  # Set task

    test_env = ML1.get_train_tasks('reach-v1')  # Create a separate environment for evaluation
    test_tasks = test_env.sample_tasks(1)  # Sample a task (in this case, a goal variation)
    test_env.set_task(test_tasks[0])  # Set task

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup) ** 2).mean()
        loss_q2 = ((q2 - backup) ** 2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().numpy(),
                      Q2Vals=q2.detach().numpy())

        return loss_q, q_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        q1_pi = ac.q1(o, pi)
        q2_pi = ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().numpy())

        return loss_pi, pi_info

    # Set up optimizers for policy and q-function
    # (note: the learning rate is hard-coded here; the `lr` argument above is unused)
    pi_optimizer = Adam(ac.pi.parameters(), lr=3e-4)
    q_optimizer = Adam(q_params, lr=3e-4)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, logger_tensor, t):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)
        logger_tensor.log_value(t, loss_q.item(), "loss q")

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize them at the next SAC step.
        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)
        logger_tensor.log_value(t, loss_pi.item(), "loss pi")

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use an in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32),
                      deterministic)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            logger_tensor.log_value(t, ep_ret, "test ep reward")
            logger_tensor.log_value(t, ep_len, "test ep length")

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1
        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d
        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger_tensor.log_value(t, ep_ret, "reward")
            logging.info("> total_steps={} | reward={}".format(t, ep_ret))
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0


        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch, logger_tensor=logger_tensor, t=t)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)

            logger_tensor.log_value(t, epoch, "epoch")
            logger.dump_tabular(logger_tensor=logger_tensor, epoch=epoch)
            ac.save(args.save_model_dir, args.model_name)
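
The sac function above reads most of its configuration from args rather than from its keyword arguments. A minimal argparse sketch covering the attributes it accesses (exp_name, seed, hid, l, gamma, epochs, logdir, model_name, save_model_dir) might look like this; the defaults are illustrative assumptions, not values from the original experiment.

import argparse

# Hypothetical command-line front end for sac(args); defaults are assumptions.
parser = argparse.ArgumentParser()
parser.add_argument('--exp_name', type=str, default='sac_reach')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--hid', type=int, default=256)       # hidden layer width
parser.add_argument('--l', type=int, default=2)           # number of hidden layers
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--epochs', type=int, default=100)
parser.add_argument('--logdir', type=str, default='runs')
parser.add_argument('--model_name', type=str, default='sac_ml1')
parser.add_argument('--save_model_dir', type=str, default='saved_models')
args = parser.parse_args()

sac(args)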
Example #3
class gac_agent:
    def __init__(self, args, env, test_env, env_params):
        self.args = args

        # path to save the model
        if self.args.mmd:
            self.exp_name = '_'.join(
                (self.args.env_name, self.args.alg,
                 'mmd' + str(self.args.beta_mmd), 's' + str(self.args.seed),
                 datetime.now().isoformat()))
            self.data_path = os.path.join(
                self.args.save_dir, '_'.join(
                    (self.args.env_name, self.args.alg,
                     'mmd' + str(self.args.beta_mmd))), self.exp_name)
        else:
            self.exp_name = '_'.join(
                (self.args.env_name, self.args.alg, str(self.args.seed),
                 datetime.now().isoformat()))
            self.data_path = os.path.join(
                self.args.save_dir, '_'.join(
                    (self.args.env_name, self.args.alg)), self.exp_name)
        self.logger = EpochLogger(output_dir=self.data_path,
                                  exp_name=self.exp_name)
        self.logger.save_config(args)

        self.env = env
        self.test_env = test_env
        self.env_params = env_params
        # create the network
        self.actor_network = actor(env_params)
        self.critic_network1 = critic(env_params)
        self.critic_network2 = critic(env_params)
        self.advice_network1 = critic(env_params)
        self.advice_network2 = critic(env_params)
        # sync the networks across the cpus
        sync_networks(self.actor_network)
        sync_networks(self.critic_network1)
        sync_networks(self.critic_network2)
        sync_networks(self.advice_network1)
        sync_networks(self.advice_network2)
        # build up the target network
        # self.actor_target_network = actor(env_params)
        self.critic_target_network1 = critic(env_params)
        self.critic_target_network2 = critic(env_params)
        self.advice_target_network1 = critic(env_params)
        self.advice_target_network2 = critic(env_params)
        # load the weights into the target networks
        # self.actor_target_network.load_state_dict(self.actor_network.state_dict())
        self.critic_target_network1.load_state_dict(
            self.critic_network1.state_dict())
        self.critic_target_network2.load_state_dict(
            self.critic_network2.state_dict())
        self.advice_target_network1.load_state_dict(
            self.advice_network1.state_dict())
        self.advice_target_network2.load_state_dict(
            self.advice_network2.state_dict())

        # if use gpu
        self.rank = MPI.COMM_WORLD.Get_rank()
        self.mpi_size = MPI.COMM_WORLD.Get_size()
        if args.cuda:
            device = 'cuda:{}'.format(self.rank % torch.cuda.device_count())
        else:
            device = 'cpu'
        self.device = torch.device(device)

        if self.args.cuda:
            self.actor_network.cuda(self.device)
            self.critic_network1.cuda(self.device)
            self.critic_network2.cuda(self.device)
            # self.actor_target_network.cuda(self.device)
            self.critic_target_network1.cuda(self.device)
            self.critic_target_network2.cuda(self.device)

            self.advice_network1.cuda(self.device)
            self.advice_network2.cuda(self.device)
            self.advice_target_network1.cuda(self.device)
            self.advice_target_network2.cuda(self.device)

        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.args.lr_actor)
        self.critic_optim1 = torch.optim.Adam(
            self.critic_network1.parameters(), lr=self.args.lr_critic)
        self.critic_optim2 = torch.optim.Adam(
            self.critic_network2.parameters(), lr=self.args.lr_critic)
        self.advice_optim1 = torch.optim.Adam(
            self.advice_network1.parameters(), lr=self.args.lr_critic)
        self.advice_optim2 = torch.optim.Adam(
            self.advice_network2.parameters(), lr=self.args.lr_critic)

        # create the replay buffer
        self.buffer = ReplayBuffer(self.env_params['obs'],
                                   self.env_params['action'],
                                   self.args.buffer_size)

        self.logger.setup_pytorch_saver(self.actor_network)

        self.obs_mean, self.obs_std = self.buffer.obs_mean, self.buffer.obs_std

    def learn(self):
        """
        train the network

        """
        # start to collect samples
        obs, ep_rew, ep_cost, ep_len, done = self.env.reset(), 0, 0, 0, False
        for epoch in range(self.args.n_epochs):
            for _ in range(self.args.n_train_rollouts):
                for t in range(self.env_params['max_timesteps']):
                    with torch.no_grad():
                        input_tensor = self._preproc_inputs(obs)
                        action = self.actor_network(input_tensor)
                        action = action.detach().cpu().numpy().squeeze()
                    # feed the actions into the environment
                    next_obs, reward, done, info = self.env.step(
                        action * self.env_params['action_max'])
                    ep_rew += reward
                    ep_cost += info['cost']
                    ep_len += 1
                    self.buffer.store(obs, action, reward, info['cost'],
                                      next_obs, done)
                    obs = next_obs

                    if done or (ep_len == self.env_params['max_timesteps']
                                ) or (t % self.args.n_batches == 0):
                        self.buffer.obs_mean = MPI.COMM_WORLD.allreduce(
                            self.buffer.obs_mean, op=MPI.SUM) / self.mpi_size
                        self.buffer.obs_std = MPI.COMM_WORLD.allreduce(
                            self.buffer.obs_std, op=MPI.SUM) / self.mpi_size
                        self.obs_mean, self.obs_std = self.buffer.obs_mean, self.buffer.obs_std

                        self.buffer.rew_mean = MPI.COMM_WORLD.allreduce(
                            self.buffer.rew_mean, op=MPI.SUM) / self.mpi_size
                        self.buffer.rew_std = MPI.COMM_WORLD.allreduce(
                            self.buffer.rew_std, op=MPI.SUM) / self.mpi_size

                        self.buffer.cost_mean = MPI.COMM_WORLD.allreduce(
                            self.buffer.cost_mean, op=MPI.SUM) / self.mpi_size
                        self.buffer.cost_std = MPI.COMM_WORLD.allreduce(
                            self.buffer.cost_std, op=MPI.SUM) / self.mpi_size

                        for _ in range(self.args.n_batches):
                            # train the network
                            self._update_network()
                            # soft update
                            # self._soft_update_target_network(self.actor_target_network, self.actor_network)
                            self._soft_update_target_network(
                                self.critic_target_network1,
                                self.critic_network1, self.args.polyak)
                            self._soft_update_target_network(
                                self.critic_target_network2,
                                self.critic_network2, self.args.polyak)

                    if done or (ep_len == self.env_params['max_timesteps']):
                        self.logger.store(EpReward=ep_rew,
                                          EpCost=ep_cost,
                                          EpLen=ep_len)
                        obs, ep_rew, ep_cost, ep_len, done = self.env.reset(
                        ), 0, 0, 0, False

            # start to do the evaluation
            self._test_policy()

            # save some necessary objects
            state = {
                'observation_mean': self.buffer.obs_mean,
                'observation_std': self.buffer.obs_std
            }
            self.logger.save_state(state, None)

            t = ((epoch + 1) * self.mpi_size *
                 self.env_params['max_timesteps']) * self.args.n_train_rollouts

            self.logger.log_tabular('Epoch', epoch + 1)
            self.logger.log_tabular('EpReward', with_min_and_max=True)
            self.logger.log_tabular('EpCost', with_min_and_max=True)
            self.logger.log_tabular('EpLen', average_only=True)
            self.logger.log_tabular('TestReward', with_min_and_max=True)
            self.logger.log_tabular('TestCost', with_min_and_max=True)
            self.logger.log_tabular('TestLen', average_only=True)
            self.logger.log_tabular('LossPi', average_only=True)
            self.logger.log_tabular('LossQ', average_only=True)
            self.logger.log_tabular('MMDEntropy', average_only=True)
            self.logger.log_tabular('TotalEnvInteracts', t)
            self.logger.dump_tabular()

            if MPI.COMM_WORLD.Get_rank() == 0:
                print("obs_mean=", self.buffer.obs_mean)
                print("obs_std=", self.buffer.obs_std)
                print("reward_mean=", self.buffer.rew_mean)
                print("reward_std=", self.buffer.rew_std)
                print("cost_mean=", self.buffer.cost_mean)
                print("cost_std=", self.buffer.cost_std)

    # pre_process the inputs
    def _preproc_inputs(self, obs):
        inputs = ((np.array(obs) - self.obs_mean) /
                  (self.obs_std + 1e-8)).clip(-self.args.clip_range,
                                              self.args.clip_range)
        inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
        if self.args.cuda:
            inputs = inputs.cuda(self.device)
        return inputs

    # soft update
    def _soft_update_target_network(self, target, source, polyak):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_((1 - polyak) * param.data +
                                    polyak * target_param.data)

    # update the network
    def _update_network(self):
        # sample the episodes
        batches = self.buffer.sample(self.args.batch_size)

        o = torch.FloatTensor(batches['obs']).to(self.device)
        o2 = torch.FloatTensor(batches['obs2']).to(self.device)
        a = torch.FloatTensor(batches['act']).to(self.device)
        r = torch.FloatTensor(batches['rew']).to(self.device)
        c = torch.FloatTensor(batches['cost']).to(self.device)
        d = torch.FloatTensor(batches['done']).to(self.device)

        # calculate the target Q value function
        with torch.no_grad():
            # do the normalization
            # concatenate the stuffs
            a2 = self.actor_network(o2)
            q_next_value1 = self.critic_target_network1(o2, a2).detach()
            q_next_value2 = self.critic_target_network2(o2, a2).detach()
            target_q_value = r + self.args.gamma * (1 - d) * torch.min(
                q_next_value1, q_next_value2)
            target_q_value = target_q_value.detach()

            p_next_value1 = self.advice_target_network1(o2, a2).detach()
            p_next_value2 = self.advice_target_network2(o2, a2).detach()
            target_p_value = -c + self.args.gamma * (1 - d) * torch.min(
                p_next_value1, p_next_value2)
            target_p_value = target_p_value.detach()

        # the q loss
        real_q_value1 = self.critic_network1(o, a)
        real_q_value2 = self.critic_network2(o, a)
        critic_loss1 = (target_q_value - real_q_value1).pow(2).mean()
        critic_loss2 = (target_q_value - real_q_value2).pow(2).mean()

        # the p loss
        real_p_value1 = self.advice_network1(o, a)
        real_p_value2 = self.advice_network2(o, a)
        advice_loss1 = (target_p_value - real_p_value1).pow(2).mean()
        advice_loss2 = (target_p_value - real_p_value2).pow(2).mean()

        # the actor loss
        o_exp = o.repeat(self.args.expand_batch, 1)
        a_exp = self.actor_network(o_exp)
        actor_loss = -torch.min(self.critic_network1(o_exp, a_exp),
                                self.critic_network2(o_exp, a_exp)).mean()
        actor_loss -= self.args.advice * torch.min(
            self.advice_network1(o_exp, a_exp),
            self.advice_network2(o_exp, a_exp)).mean()

        mmd_entropy = torch.tensor(0.0)

        if self.args.mmd:
            # mmd is computationally expensive
            a_exp_reshape = a_exp.view(self.args.expand_batch, -1,
                                       a_exp.shape[-1]).transpose(0, 1)
            with torch.no_grad():
                uniform_actions = (2 * torch.rand_like(a_exp_reshape) - 1)
            mmd_entropy = mmd(a_exp_reshape, uniform_actions)
            if self.args.beta_mmd <= 0.0:
                mmd_entropy.detach_()
            else:
                actor_loss += self.args.beta_mmd * mmd_entropy

        # start to update the network
        self.actor_optim.zero_grad()
        actor_loss.backward()
        sync_grads(self.actor_network)
        self.actor_optim.step()
        # update the critic_network
        self.critic_optim1.zero_grad()
        critic_loss1.backward()
        sync_grads(self.critic_network1)
        self.critic_optim1.step()
        self.critic_optim2.zero_grad()
        critic_loss2.backward()
        sync_grads(self.critic_network2)
        self.critic_optim2.step()

        self.logger.store(LossPi=actor_loss.detach().cpu().numpy())
        self.logger.store(LossQ=(critic_loss1 +
                                 critic_loss2).detach().cpu().numpy())
        self.logger.store(MMDEntropy=mmd_entropy.detach().cpu().numpy())

    # do the evaluation
    def _test_policy(self):
        for _ in range(self.args.n_test_rollouts):
            obs, ep_rew, ep_cost, ep_len, done = self.test_env.reset(
            ), 0, 0, 0, False
            while (not done and ep_len < self.env_params['max_timesteps']):
                with torch.no_grad():
                    input_tensor = self._preproc_inputs(obs)
                    action = self.actor_network(input_tensor, std=0.5)
                    action = action.detach().cpu().numpy().squeeze()
                obs_next, reward, done, info = self.test_env.step(action)
                obs = obs_next
                ep_rew += reward
                ep_cost += info['cost']
                ep_len += 1
            self.logger.store(TestReward=ep_rew,
                              TestCost=ep_cost,
                              TestLen=ep_len)
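
gac_agent indexes env_params by the keys 'obs', 'action', 'action_max' and 'max_timesteps'. One plausible way to build that dict from a Gym-style environment is sketched below; the helper name and the use of env.spec.max_episode_steps are assumptions, and the environment must expose a per-step info['cost'] for the cost bookkeeping above to work.

import gym

def build_env_params(env):
    # Keys chosen to match what gac_agent reads; the construction itself is an assumption.
    return {
        'obs': env.observation_space.shape[0],
        'action': env.action_space.shape[0],
        'action_max': float(env.action_space.high[0]),
        'max_timesteps': env.spec.max_episode_steps,
    }

# Hypothetical usage (env name is a placeholder for a safety env with info['cost']):
# env, test_env = gym.make('SafetyEnv-v0'), gym.make('SafetyEnv-v0')
# agent = gac_agent(args, env, test_env, build_env_params(env))
# agent.learn()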
Example #4
class Test:
    def __init__(self, args):
        self.args = args
        self.env = gym.make(args.env_name)
        self.env_params = get_env_params(self.env)

        self.video_file = 'data_test/test_video'
        self.output_dir = 'data_test'
        self.exp_name = 'test'
        self.logger = EpochLogger(output_dir=self.output_dir,
                                  exp_name=self.exp_name)
        # self.env = wrappers.Monitor(self.env, self.video_file, force=True)

        device = 'cuda' if args.cuda else 'cpu'
        self.device = torch.device(device)

        # load
        data_file = os.path.join(args.load_fold, 'vars.pkl')
        data = joblib.load(data_file)

        ## load obs_mean obs_std g_mean g_std
        self.obs_mean = data['observation_mean']
        self.obs_std = data['observation_std']

        ## load policy model
        model = {
            'ddpg': actor,
            'td3': actor,
            'sac': actor_sac,
            'gac': actor_gac
        }
        self.actor_network = model[args.alg](self.env_params).to(self.device)
        model_file = os.path.join(args.load_fold, 'pyt_save', 'model.pt')
        self.actor_network.load_state_dict(torch.load(model_file))

    def run(self):
        self._eval_agent()
        self.logger.log_tabular('EpReward')
        self.logger.log_tabular('EpCost')
        self.logger.dump_tabular()

    def _preproc_inputs(self, obs):
        obs_norm = np.clip((obs - self.obs_mean) / self.obs_std,
                           -self.args.clip_range, self.args.clip_range)
        # concatenate the stuffs
        inputs = torch.tensor(obs_norm, dtype=torch.float32).unsqueeze(0)
        if self.args.cuda:
            inputs = inputs.cuda(self.device)
        return inputs

    def _eval_agent(self):
        for _ in range(self.args.n_test_rollouts):
            obs, ep_reward, ep_cost = self.env.reset(), 0, 0
            for _ in range(self.env_params['max_timesteps']):
                if self.args.render:
                    self.env.render()
                    time.sleep(1e-3)

                with torch.no_grad():
                    input_tensor = self._preproc_inputs(obs)
                    if self.args.alg == 'gac':
                        pi = self.actor_network(input_tensor, std=0.5)
                    elif self.args.alg == 'sac':
                        pi, _ = self.actor_network(input_tensor)
                    else:
                        pi = self.actor_network(input_tensor)
                    # convert the actions
                    actions = pi.detach().cpu().numpy().squeeze()
                obs, reward, done, info = self.env.step(actions)
                ep_reward += reward
                ep_cost += info['cost']
                self.logger.store(EpReward=ep_reward, EpCost=ep_cost)
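
Like the sac example, this Test class is driven by an args namespace; it reads env_name, alg, load_fold, clip_range, n_test_rollouts, render and cuda, and expects load_fold to contain vars.pkl plus pyt_save/model.pt as saved by the training code above. A hedged sketch of a front end (defaults are assumptions):

import argparse

# Hypothetical front end for the Test class; defaults are assumptions.
parser = argparse.ArgumentParser()
parser.add_argument('--env_name', type=str, default='Pendulum-v0')
parser.add_argument('--alg', type=str, default='gac',
                    choices=['ddpg', 'td3', 'sac', 'gac'])
parser.add_argument('--load_fold', type=str, default='data/gac_run')
parser.add_argument('--clip_range', type=float, default=5.0)
parser.add_argument('--n_test_rollouts', type=int, default=10)
parser.add_argument('--render', action='store_true')
parser.add_argument('--cuda', action='store_true')
args = parser.parse_args()

Test(args).run()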
Example #5
def ppo(env_fn,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Proximal Policy Optimization (by clipping), 

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a 
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 
            module. The ``step`` method should accept a batch of observations 
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of 
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================


        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. 

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    # GAedit
    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    # setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # GAedit
    # Seed
    seed = 333
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    #GAedit
    # obs_dim = env.observation_space.shape
    # act_dim = env.action_space.shape
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # number of agents
    num_agents = len(env_info.agents)
    # size of each action
    act_dim = brain.vector_action_space_size
    # examine the state space
    obs_dim = env_info.vector_observations.shape[1]

    #GAedit
    # Create actor-critic module
    # ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac = actor_critic(obs_dim, act_dim, **ac_kwargs)

    # GAedit - don't think we need to sync
    # Sync params across processes
    # sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    # GAedit
    # local_steps_per_epoch = int(steps_per_epoch / num_procs())
    local_steps_per_epoch = int(steps_per_epoch / num_agents)
    #GAedit
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch * num_agents,
                    gamma, lam)

    # buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[
            'logp']

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            #GAedit
            # kl = mpi_avg(pi_info['kl'])
            kl = pi_info['kl']
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            #GAedit
            # mpi_avg_grads(ac.pi)    # average grads across MPI processes
            # ac.pi.mean()
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            #GAedit
            # mpi_avg_grads(ac.v)    # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    #GAedit
    # o, ep_ret, ep_len = env.reset(), 0, 0
    ep_ret, ep_len = 0, 0
    env_info = env.reset(train_mode=True)[brain_name]
    o = env_info.vector_observations
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))
            # GAedit
            # next_o, r, d, _ = env.step(a)
            env_info = env.step(a)[brain_name]
            next_o, r, d = env_info.vector_observations, env_info.rewards, env_info.local_done
            #GAedit
            # ep_ret += r
            ep_ret += np.mean(r)
            ep_len += 1

            # save and log
            #GAedit
            # buf.store(o, a, r, v, logp)
            for i in range(num_agents):
                buf.store(o[i], a[i], r[i], v[i], logp[i])
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            # GAedit
            # terminal = d or timeout
            terminal = any(d) or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                # GAedit
                # o, ep_ret, ep_len = env.reset(), 0, 0
                ep_ret, ep_len = 0, 0
                env_info = env.reset(train_mode=True)[brain_name]
                o = env_info.vector_observations

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
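
The docstring above describes the interface expected from actor_critic: a step method returning (a, v, logp_a), an act method, and pi / v submodules. A minimal sketch of such a module for continuous action spaces is given below. It takes integer obs_dim / act_dim, matching the ac = actor_critic(obs_dim, act_dim, **ac_kwargs) call above, and is an illustrative reconstruction, not the exact core.MLPActorCritic used in these examples.

import torch
import torch.nn as nn
from torch.distributions.normal import Normal

def mlp(sizes, activation, output_activation=nn.Identity):
    # Stack of Linear layers with the given activations.
    layers = []
    for j in range(len(sizes) - 1):
        act = activation if j < len(sizes) - 2 else output_activation
        layers += [nn.Linear(sizes[j], sizes[j + 1]), act()]
    return nn.Sequential(*layers)

class MLPGaussianActor(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_sizes, activation):
        super().__init__()
        self.log_std = nn.Parameter(-0.5 * torch.ones(act_dim))
        self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation)

    def _distribution(self, obs):
        return Normal(self.mu_net(obs), torch.exp(self.log_std))

    def forward(self, obs, act=None):
        # Returns a distribution and, if actions are given, their log probs.
        pi = self._distribution(obs)
        logp_a = pi.log_prob(act).sum(axis=-1) if act is not None else None
        return pi, logp_a

class MLPCritic(nn.Module):
    def __init__(self, obs_dim, hidden_sizes, activation):
        super().__init__()
        self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation)

    def forward(self, obs):
        return torch.squeeze(self.v_net(obs), -1)  # critical: flatten to (batch,)

class MLPActorCritic(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_sizes=(64, 64), activation=nn.Tanh):
        super().__init__()
        self.pi = MLPGaussianActor(obs_dim, act_dim, hidden_sizes, activation)
        self.v = MLPCritic(obs_dim, hidden_sizes, activation)

    def step(self, obs):
        with torch.no_grad():
            pi = self.pi._distribution(obs)
            a = pi.sample()
            logp_a = pi.log_prob(a).sum(axis=-1)
            v = self.v(obs)
        return a.numpy(), v.numpy(), logp_a.numpy()

    def act(self, obs):
        return self.step(obs)[0]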
Example #6
def ppo(env_fn,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=2000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):

    global RENDER, BONUS
    """
    Proximal Policy Optimization (by clipping), 

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a 
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 
            module. The ``step`` method should accept a batch of observations 
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of 
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================


        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. 

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    # Reachability Trainer
    r_network = R_Network().to(device)
    trainer = R_Network_Trainer(r_network=r_network, exp_name="random1")
    episodic_memory = EpisodicMemory(embedding_shape=[EMBEDDING_DIM])

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(3, 64, 64))
    action_space = gym.spaces.Discrete(3)
    obs_dim = observation_space.shape
    act_dim = action_space.shape

    # Create actor-critic module
    ac = actor_critic(observation_space, action_space, **ac_kwargs)

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[
            'logp']

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            # Entropy bonus (note: pi_info['ent'] is a detached float here, so
            # this term only shifts the reported loss and adds no gradient signal)
            loss_pi += pi_info['ent'] * 0.0021
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    o, _ = env.reset()
    env.render()
    o = o.astype(np.float32) / 255.
    o = o.transpose(2, 0, 1)
    ep_ret, ep_len = 0, 0
    indices = []

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            state = torch.as_tensor(o[np.newaxis, ...], dtype=torch.float32)
            a, v, logp = ac.step(state)

            next_o, r, d, info = env.step(a)
            next_o = next_o.astype(np.float32) / 255.

            d = ep_len == max_ep_len
            trainer.store_new_state([next_o], [r], [d], [None])

            r_network.eval()
            with torch.no_grad():
                state_embedding = r_network.embed_observation(
                    torch.FloatTensor([o]).to(device)).cpu().numpy()[0]
                aggregated, _, _ = similarity_to_memory(
                    state_embedding, episodic_memory, r_network)
                curiosity_bonus = 0.03 * (0.5 - aggregated)
                if BONUS:
                    print(f'{curiosity_bonus:.3f}')
                if curiosity_bonus > 0 or len(episodic_memory) == 0:
                    idx = episodic_memory.store_new_state(state_embedding)
                    x = int(env.map_scale * info['pose']['x'])
                    y = int(env.map_scale * info['pose']['y'])
                    if idx == len(indices):
                        indices.append((x, y))
                    else:
                        indices[idx] = (x, y)

            r_network.train()

            next_o = next_o.transpose(2, 0, 1)
            ep_ret += r + curiosity_bonus
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            k = cv2.waitKey(1)
            if k == ord('s'):
                RENDER = 1 - RENDER
            elif k == ord('b'):
                BONUS = 1 - BONUS

            if RENDER:
                env.info['map'] = cv2.flip(env.info['map'], 0)
                for index in indices:
                    cv2.circle(env.info['map'], index, 3, (0, 0, 255), -1)
                env.info['map'] = cv2.flip(env.info['map'], 0)
                env.render()

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    state = torch.as_tensor(o[np.newaxis, ...],
                                            dtype=torch.float32)
                    _, v, _ = ac.step(state)
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                print(ep_ret, ep_len, len(episodic_memory))
                ep_ret, ep_len = 0, 0
                o, _ = env.reset()
                o = o.astype(np.float32) / 255.
                o = o.transpose(2, 0, 1)
                episodic_memory.reset()
                indices = []

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        if epoch > 4:
            update()
            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('TotalEnvInteracts',
                               (epoch + 1) * steps_per_epoch)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('DeltaLossPi', average_only=True)
            logger.log_tabular('DeltaLossV', average_only=True)
            logger.log_tabular('Entropy', average_only=True)
            logger.log_tabular('KL', average_only=True)
            logger.log_tabular('ClipFrac', average_only=True)
            logger.log_tabular('StopIter', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

        else:
            buf.get()
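All of the on-policy examples here hand a bootstrap value to `buf.finish_path(...)` when an episode ends or is cut off. For reference, a minimal NumPy sketch of the GAE-lambda advantage and reward-to-go computation such a buffer typically performs; the names below are illustrative and are not the actual buffer internals from this code:

import numpy as np

def discount_cumsum(x, discount):
    # y[t] = x[t] + discount * y[t+1], computed backwards over the sequence
    out = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

def finish_path_sketch(rews, vals, last_val, gamma=0.99, lam=0.97):
    # rews: r_0..r_{T-1} for one trajectory, vals: V(s_0)..V(s_{T-1}),
    # last_val: 0 for a true terminal state, V(s_T) when the path is cut off
    rews = np.append(np.asarray(rews, dtype=np.float64), last_val)
    vals = np.append(np.asarray(vals, dtype=np.float64), last_val)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]
    adv = discount_cumsum(deltas, gamma * lam)   # GAE-lambda advantages
    ret = discount_cumsum(rews, gamma)[:-1]      # discounted rewards-to-go (value targets)
    return adv, ret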
Exemple #7
0
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        trials_per_epoch=2500,
        steps_per_trial=100,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=1000,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph (note: this
            recurrent variant additionally passes reward and GRU-state
            placeholders and expects updated states to be returned; the table
            below lists only the standard outputs):

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        trials_per_epoch (int): Number of trials collected in each epoch; each
            trial resamples a task from the environment.

        steps_per_trial (int): Number of environment steps per trial, so each
            epoch contains ``trials_per_epoch * steps_per_trial`` steps of
            interaction.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    # x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    x_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, 1), name='x_ph')
    a_ph = tf.placeholder(dtype=tf.int32, shape=(None, None), name='a_ph')
    # adv_ph, ret_ph, logp_old_ph, rew_ph = core.placeholders(None, None, None, 1)
    adv_ph = tf.placeholder(dtype=tf.float32,
                            shape=(None, None),
                            name='adv_ph')
    ret_ph = tf.placeholder(dtype=tf.float32,
                            shape=(None, None),
                            name='ret_ph')
    logp_old_ph = tf.placeholder(dtype=tf.float32,
                                 shape=(None, None),
                                 name='logp_old_ph')
    rew_ph = tf.placeholder(dtype=tf.float32,
                            shape=(None, None, 1),
                            name='rew_ph')
    pi_state_ph = tf.placeholder(dtype=tf.float32,
                                 shape=(None, NUM_GRU_UNITS),
                                 name='pi_state_ph')
    v_state_ph = tf.placeholder(dtype=tf.float32,
                                shape=(None, NUM_GRU_UNITS),
                                name='v_state_ph')

    # Initialize rnn states for pi and v

    # Main outputs from computation graph
    pi, logp, logp_pi, v, new_pi_state, new_v_state = actor_critic(
        x_ph,
        a_ph,
        rew_ph,
        pi_state_ph,
        v_state_ph,
        NUM_GRU_UNITS,
        action_space=env.action_space)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, rew_ph]

    # Every step, get: action, value, and logprob and reward
    get_action_ops = [pi, v, logp_pi, new_pi_state, new_v_state]

    # Experience buffer
    steps_per_epoch = trials_per_epoch * steps_per_trial
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)
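    # The tf.where/tf.minimum construction above is the usual equivalent rewrite
    # of the clipped surrogate
    #   L = E[min(ratio * adv, clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv)]:
    # for adv > 0 the clip can only bind from above at (1 + clip_ratio) * adv,
    # and for adv < 0 only from below at (1 - clip_ratio) * adv.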

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(
        learning_rate=pi_lr).minimize(pi_loss - 0.01 * approx_ent)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    # tf.reset_default_graph()
    # restore_tf_graph(sess, '..//data//ppo//ppo_s0//simple_save')

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        inputs[pi_state_ph] = np.zeros((trials_per_epoch, NUM_GRU_UNITS))
        inputs[v_state_ph] = np.zeros((trials_per_epoch, NUM_GRU_UNITS))
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)
        print(pi_l_old, v_l_old)
        # Training
        for i in range(train_pi_iters):
            # print(f'pi:{i}')
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            # print(sess.run(pi_loss, feed_dict=inputs))
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            # print(f'v:{_}')
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        import datetime
        print(f'finish one batch training at {datetime.datetime.now()}')
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch

    for epoch in range(epochs):
        for trial in range(trials_per_epoch):
            print(f'trial: {trial}')
            old_a = np.array([0]).reshape(1, 1)
            old_r = np.array([0]).reshape((1, 1, 1))
            means = env.sample_tasks(1)[0]
            action_dict = defaultdict(int)
            for i in range(env.action_space.n):
                action_dict[i] = 0

            env.reset_task_simple(means)
            task_avg = 0.0
            pi_state_t = np.zeros((1, NUM_GRU_UNITS))
            v_state_t = np.zeros((1, NUM_GRU_UNITS))
            for step in range(steps_per_trial):
                a, v_t, logp_t, pi_state_t, v_state_t = sess.run(
                    get_action_ops,
                    feed_dict={
                        x_ph: o.reshape(1, 1, -1),
                        a_ph: old_a,
                        rew_ph: old_r,
                        pi_state_ph: pi_state_t,
                        v_state_ph: v_state_t
                    })
                # save and log
                buf.store(o, a, r, v_t, logp_t)
                logger.store(VVals=v_t)

                try:
                    o, r, d, _ = env.step(a[0][0])
                except Exception:
                    print(a)
                    raise

                action_dict[a[0][0]] += 1

                old_a = np.array(a).reshape(1, 1)
                old_r = np.array([r]).reshape(1, 1, 1)
                ep_ret += r
                task_avg += r
                ep_len += 1

                terminal = d or (ep_len == max_ep_len)
                if terminal or (step == steps_per_trial - 1):
                    if not terminal:
                        print(
                            'Warning: trajectory cut off by end of trial at %d steps.'
                            % ep_len)
                    # if trajectory didn't reach terminal state, bootstrap value target
                    last_val = r if d else sess.run(
                        v,
                        feed_dict={
                            x_ph: o.reshape(1, 1, -1),
                            a_ph: old_a,
                            rew_ph: old_r,
                            pi_state_ph: pi_state_t,
                            v_state_ph: v_state_t
                        })
                    buf.finish_path(last_val)
                    if terminal:
                        # only save EpRet / EpLen if trajectory finished
                        logger.store(EpRet=ep_ret, EpLen=ep_len)

                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

            # logger.log_tabular('Epoch', epoch)
            # logger.log_tabular('EpRet', with_min_and_max=True)
            # logger.log_tabular('Means', means)
            # logger.dump_tabular()
            print(f'avg in trial {trial}: {task_avg / steps_per_trial}')
            print(f'Means in trial {trial}: {means}')

            print(action_dict)

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)
            # saved_path = saver.save(sess, f"/tmp/model_epoch{epoch}.ckpt")
            # print(f'Model saved in {saved_path}')
        # Perform PPO update!

        update()
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
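The docstring above still describes a feed-forward actor-critic, while the call site also passes reward and GRU-state placeholders and expects updated states back. A rough TF1 sketch of what a compatible recurrent actor-critic for a discrete action space could look like; this illustrates the expected signature under those assumptions and is not the implementation actually provided by `core` in this codebase:

def gru_actor_critic_sketch(x, a, rew, pi_state, v_state, num_units, action_space):
    # x: (batch, time, obs_dim), a: (batch, time) int actions, rew: (batch, time, 1)
    n_act = action_space.n
    inp = tf.concat([x, tf.one_hot(a, n_act), rew], axis=-1)
    with tf.variable_scope('pi'):
        cell = tf.nn.rnn_cell.GRUCell(num_units)
        out, new_pi_state = tf.nn.dynamic_rnn(cell, inp, initial_state=pi_state,
                                              dtype=tf.float32)
        logits = tf.layers.dense(out, n_act)
        logp_all = tf.nn.log_softmax(logits)
        # Sample one action per (batch, time) step and gather its log-probability
        pi = tf.reshape(
            tf.random.categorical(tf.reshape(logits, [-1, n_act]), 1), tf.shape(a))
        logp = tf.reduce_sum(tf.one_hot(a, n_act) * logp_all, axis=-1)
        logp_pi = tf.reduce_sum(tf.one_hot(pi, n_act) * logp_all, axis=-1)
    with tf.variable_scope('v'):
        cell_v = tf.nn.rnn_cell.GRUCell(num_units)
        out_v, new_v_state = tf.nn.dynamic_rnn(cell_v, inp, initial_state=v_state,
                                               dtype=tf.float32)
        v = tf.squeeze(tf.layers.dense(out_v, 1), axis=-1)
    return pi, logp, logp_pi, v, new_pi_state, new_v_state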
Exemple #8
0
def td3( env_fn, actor_critic=a2c, ac_kwargs=dict(), seed=0, steps_per_epoch=5000,
    epochs=100, replay_size=int(1e6), gamma=.99, polyak=.995, pi_lr=1e-3, q_lr=1e-3,
    batch_size=100, start_steps=10000, act_noise=.1, target_noise=.2, noise_clip=.5,
    policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1):

    logger = EpochLogger( **logger_kwargs)
    logger.save_config( locals())

    tf.set_random_seed(seed)
    np.random.seed( seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping
    act_limit = env.action_space.high[0]

    # Share action space info with A2C
    ac_kwargs['action_space'] = env.action_space

    x_ph, a_ph, x2_ph, r_ph, d_ph = \
        tf.placeholder( name='x_ph', shape=(None, obs_dim), dtype=tf.float32), \
        tf.placeholder( name='a_ph', shape=(None, act_dim), dtype=tf.float32), \
        tf.placeholder( name='x2_ph', shape=(None, obs_dim), dtype=tf.float32),\
        tf.placeholder( name='r_ph', shape=(None), dtype=tf.float32), \
        tf.placeholder( name='d_ph', shape=(None), dtype=tf.float32)

    # Actor policy and value
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi = actor_critic( x_ph, a_ph, **ac_kwargs)

    # This seems a bit memory inefficient: what happens to the q values created
    # along with the target policy? And the policy created along with the q targets?
    # They are not referenced, but still declared, at the cost of GPU memory.
    # Target policy
    with tf.variable_scope( 'target'):
        pi_targ, _, _, _  = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Target Q networks
    with tf.variable_scope( 'target', reuse=True):
        epsilon = tf.random_normal( tf.shape( pi_targ), stddev=target_noise)
        epsilon = tf.clip_by_value( epsilon, -noise_clip, noise_clip)
        a2 = pi_targ + epsilon
        a2 = tf.clip_by_value( a2, -act_limit, act_limit)

        # Target Q-Values using actions from target policy
        _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs)

    replaybuffer = ReplayBuffer( obs_dim, act_dim, size=replay_size)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    # Count variables
    var_counts = tuple( count_vars( scope) for scope in ['main/pi',
        'main/q1', 'main/q2', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts)

    # Clipped Double Q-Learning with Bellman backup
    min_q_targ = tf.minimum( q1_targ, q2_targ)
    backup = tf.stop_gradient( r_ph + gamma * (1 -d_ph) * min_q_targ)
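    # i.e. the target is y = r + gamma * (1 - d) * min(Q1_targ(s', a'), Q2_targ(s', a')),
    # with a' = clip(pi_targ(s') + clipped noise, -act_limit, act_limit) built above.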

    # TD3 Losses
    pi_loss = - tf.reduce_mean( q1_pi)
    q1_loss = tf.reduce_mean( (q1 - backup)**2)
    q2_loss = tf.reduce_mean( (q2 - backup)**2)
    q_loss = q1_loss + q2_loss

    # Training ops
    pi_train = tf.train.AdamOptimizer(pi_lr).minimize( pi_loss)
    q_train = tf.train.AdamOptimizer(q_lr).minimize( q_loss)

    # Polyak-averaged target update
    target_update = tf.group( [ tf.assign( v_targ, polyak * v_targ + (1-polyak)
        * v_main) for v_main, v_targ in zip( get_vars('main'), get_vars('target'))])

    target_init = tf.group( [ tf.assign( v_targ, v_main) for v_targ, v_main in
        zip( get_vars('target'), get_vars('main'))])

    sess = tf.Session()
    sess.run( tf.global_variables_initializer())
    sess.run( target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q1': q1, 'q2': q2})

    def get_action( o, noise_scale):
        a = sess.run( pi, feed_dict={ x_ph: o.reshape(1,-1)})
        a += noise_scale * np.random.randn( act_dim)

        return np.clip( a, -act_limit, act_limit)

    def test_agent( n=10):
        for j in range( n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0 ,0
            while not ( d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step( get_action( o, 0))
                ep_ret += r
                ep_len += 1

            logger.store( TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0 , 0
    total_steps = steps_per_epoch * epochs

    # Main loop
    for t in range( total_steps):
        if t > start_steps:
            a = get_action( o, act_noise)
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step( a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time horizon
        d = False if ep_len == max_ep_len else d

        o2 = np.squeeze( o2)

        # print( "O2: ", o2)
        replaybuffer.store( o, a, r, o2, d)

        o = o2

        if d or ( ep_len == max_ep_len):
            for j in range( ep_len):
                batch = replaybuffer.sample_batch( batch_size)
                feed_dict = {x_ph: batch['obs1'],
                                 x2_ph: batch['obs2'],
                                 a_ph: batch['acts'],
                                 r_ph: batch['rews'],
                                 d_ph: batch['done']
                                }
                q_step_ops = [q_loss, q1, q2, q_train]
                outs = sess.run( q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                if j % policy_delay == 0:
                    outs = sess.run( [pi_loss, pi_train, target_update], feed_dict)
                    logger.store( LossPi=outs[0])

            logger.store( EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Saving the model
            if (epoch % save_freq == 0) or ( epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
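A hedged sketch of how this td3() entry point might be launched on a small continuous-control task; the gym environment id, the ac_kwargs contents (which assume the a2c actor-critic accepts hidden_sizes), and the logger directory are illustrative assumptions rather than values taken from this code:

import gym

if __name__ == '__main__':
    td3(lambda: gym.make('Pendulum-v0'),
        ac_kwargs=dict(hidden_sizes=(256, 256)),
        steps_per_epoch=5000,
        epochs=20,
        logger_kwargs=dict(output_dir='/tmp/td3_pendulum', exp_name='td3_pendulum'))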
Exemple #9
0
def trpo(env_fn,
         actor_critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=50,
         gamma=.99,
         delta=.01,
         vf_lr=1e-3,
         train_v_iters=80,
         damping_coeff=.1,
         cg_iters=10,
         backtrack_iters=10,
         backtrack_coeff=.8,
         lam=.97,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=10,
         algo="trpo"):

    # Logger tools
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Seed inits
    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Environment recreation
    env = env_fn()

    # Getting obs dims
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    ac_kwargs['action_space'] = env.action_space

    # Placeholders
    x_ph, a_ph = tf.placeholder( name="x_ph", shape=[None, obs_dim], dtype=tf.float32), \
        tf.placeholder( name="a_ph", shape=[None, act_dim], dtype=tf.float32)
    adv_ph, ret_ph, logp_old_ph = tf.placeholder( name="adv_ph", shape=[None], dtype=tf.float32), \
        tf.placeholder( name="ret_ph", shape=[None], dtype=tf.float32), \
        tf.placeholder( name="logp_old_ph", shape=[None], dtype=tf.float32)

    pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic(
        x_ph, a_ph, **ac_kwargs)

    def keys_as_sorted_list(dict):
        return sorted(list(dict.keys()))

    def values_as_sorted_list(dict):
        return [dict[k] for k in keys_as_sorted_list(dict)]

    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph
               ] + values_as_sorted_list(info_phs)

    get_action_ops = [pi, v, logp_pi] + values_as_sorted_list(info)

    # Experience buffer init
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    info_shapes = {k: v.shape.as_list()[1:] for k, v in info_phs.items()}
    buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes,
                    gamma, lam)

    # Count variables
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    var_counts = tuple(count_vars(scope) for scope in ["pi", "v"])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # TRPO Losses
    ratio = tf.exp(logp - logp_old_ph)
    pi_loss = -tf.reduce_mean(ratio * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Optimizer for value function
    train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    # CG solver requirements
    pi_params = get_vars("pi")

    # Some helpers
    def flat_concat(xs):
        return tf.concat([tf.reshape(x, (-1, )) for x in xs], axis=0)

    def flat_grad(f, params):
        return flat_concat(tf.gradients(xs=params, ys=f))

    def hessian_vector_product(f, params):
        g = flat_grad(f, params)
        x = tf.placeholder(tf.float32, shape=g.shape)
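        # Hessian-vector product via double backprop: differentiating
        # g^T x (with g = grad_params f) w.r.t. params yields H x directly,
        # so the KL Hessian is never materialized.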

        return x, flat_grad(tf.reduce_sum(g * x), params)

    def assign_params_from_flat(x, params):
        flat_size = lambda p: int(np.prod(p.shape.as_list())
                                  )  # the 'int' is important for scalars
        splits = tf.split(x, [flat_size(p) for p in params])
        new_params = [
            tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)
        ]

        return tf.group(
            [tf.assign(p, p_new) for p, p_new in zip(params, new_params)])

    gradient = flat_grad(pi_loss, pi_params)
    v_ph, hvp = hessian_vector_product(d_kl, pi_params)
    if damping_coeff > 0:
        hvp += damping_coeff * v_ph

    # Symbols for getting and setting params
    get_pi_params = flat_concat(pi_params)
    set_pi_params = assign_params_from_flat(v_ph, pi_params)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def cg(Ax, b):
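        # Conjugate gradient: approximately solves A x = b, where A is supplied
        # as the matrix-vector product Ax (here the damped KL Hessian-vector
        # product) and b is the policy gradient, giving the natural-gradient
        # direction without forming or inverting the Hessian.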
        x = np.zeros_like(b)
        r = b.copy()
        p = r.copy()
        r_dot_old = np.dot(r, r)

        for _ in range(cg_iters):
            z = Ax(p)
            alpha = r_dot_old / (np.dot(p, z) + EPS)
            x += alpha * p
            r -= alpha * z
            r_dot_new = np.dot(r, r)
            p = r + (r_dot_new / r_dot_old) * p
            r_dot_old = r_dot_new
        return x

    def update():
        # Prepare hessian func, gradient eval
        # Always so elegant haha
        inputs = {k: v for k, v in zip(all_phs, buf.get())}

        def mpi_avg(x):
            """Average a scalar or vector over MPI processes."""
            return mpi_sum(x) / num_procs()

        Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))
        g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss],
                                        feed_dict=inputs)
        g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)

        # Core calculations for TRPO or NPG
        x = cg(Hx, g)
        alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS))  # OK
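        # Step size from the KL constraint: with search direction x ~ H^-1 g,
        # alpha = sqrt(2 * delta / (x^T H x)) makes the quadratic KL estimate
        # 0.5 * (alpha x)^T H (alpha x) equal to delta for a full step.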
        old_params = sess.run(get_pi_params)

        def set_and_eval(step):
            sess.run(set_pi_params,
                     feed_dict={v_ph: old_params - alpha * x * step})

            return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))

        if algo == 'npg':
            # npg has no backtracking or hard kl constraint enforcement
            kl, pi_l_new = set_and_eval(step=1.)
        elif algo == "trpo":
            for j in range(backtrack_iters):
                kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
                if kl <= delta and pi_l_new <= pi_l_old:
                    logger.log(
                        'Accepting new params at step %d of line search.' % j)
                    logger.store(BacktrackIters=j)
                    break

                if j == backtrack_iters - 1:
                    logger.log('Line search failed! Keeping old params.')
                    logger.store(BacktrackIters=j)
                    kl, pi_l_new = set_and_eval(step=0.)

        # Value function updates
        for _ in range(train_v_iters):
            sess.run(train_vf, feed_dict=inputs)
        v_l_new = sess.run(v_loss, feed_dict=inputs)

        # Log changes from update
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            agent_outs = sess.run(get_action_ops,
                                  feed_dict={x_ph: o.reshape(1, -1)})
            a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[
                1], agent_outs[2], agent_outs[3:]

            # Save and log
            buf.store(o, a, r, v_t, logp_t, info_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)

                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform TRPO or NPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('KL', average_only=True)
        if algo == 'trpo':
            logger.log_tabular('BacktrackIters', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
Exemple #10
0
def iac(env_config, ac_type, ac_kwargs, rb_type, rb_kwargs, gamma, lr, polyak,
        batch_size, epochs, start_steps, steps_per_epoch, inc_ep, max_ep_len,
        test_max_ep_len, number_of_tests_per_epoch, q_pi_sample_size, z_dim,
        z_type, act_noise, test_without_state, logger_kwargs, seed):
    logger = EpochLogger(**logger_kwargs)
    configs = locals().copy()
    configs.pop("logger")
    logger.save_config(configs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = make_env(env_config), make_env(env_config)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    act_high = env.action_space.high

    # Inputs to computation graph
    x_ph, a_ph, z_ph, x2_ph, r_ph, d_ph = core.placeholders(
        obs_dim, act_dim, z_dim, obs_dim, None, None)

    actor_critic = core.get_iac_actor_critic(ac_type)
    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, z_ph,
                                                   **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, z_ph, **ac_kwargs)

    # Experience buffer
    RB = get_replay_buffer(rb_type)
    replay_buffer = RB(obs_dim, act_dim, **rb_kwargs)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q', 'main/v', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q: %d, \t v: %d, \t total: %d\n'
        % var_counts)

    # Bellman backup for Q and V function
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ)
    min_q_pi = tf.minimum(q1_pi, q2_pi)
    v_backup = tf.stop_gradient(min_q_pi)

    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q1 - q_backup)**2)
    q2_loss = 0.5 * tf.reduce_mean((q2 - q_backup)**2)
    v_loss = 0.5 * tf.reduce_mean((v - v_backup)**2)
    value_loss = q1_loss + q2_loss + v_loss

    # Separate train ops for pi, q
    policy_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_policy_op = policy_optimizer.minimize(pi_loss,
                                                var_list=get_vars('main/pi'))
    if ac_kwargs["pi_separate"]:
        train_policy_emb_op = policy_optimizer.minimize(
            pi_loss, var_list=get_vars('main/pi/emb'))
        train_policy_d_op = policy_optimizer.minimize(
            pi_loss, var_list=get_vars('main/pi/d'))
    train_value_op = value_optimizer.minimize(value_loss,
                                              var_list=get_vars('main/q') +
                                              get_vars('main/v'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    def sample_z(size):
        if z_type == "uniform":
            return np.random.random_sample(size=size)
        elif z_type == "gaussian":
            return np.random.normal(size=size)
        else:
            raise Exception("z_type error")

    def get_action(o, noise_scale):
        pi_a = sess.run(pi,
                        feed_dict={
                            x_ph: o.reshape(1, -1),
                            z_ph: sample_z((1, z_dim))
                        })[0]
        pi_a += noise_scale * np.random.randn(act_dim)
        pi_a = np.clip(pi_a, 0, 1)
        real_a = pi_a * act_high
        return pi_a, real_a

    def test_agent(n=10):
        test_actions = []
        for j in range(n):
            test_actions_ep = []
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == test_max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                if test_without_state:
                    _, real_a = get_action(np.zeros(o.shape), 0)
                else:
                    _, real_a = get_action(o, 0)
                test_actions_ep.append(real_a)
                o, r, d, _ = test_env.step(real_a)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            test_actions.append(test_actions_ep)
        return test_actions

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    rewards = []
    rets = []
    test_rets = []
    max_ret = None
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            pi_a, real_a = get_action(o, act_noise)
        else:
            pi_a, real_a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(real_a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, pi_a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):

            for _ in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                feed_dict[z_ph] = sample_z((batch_size, z_dim))

                # Policy Learning update
                for key in feed_dict:
                    feed_dict[key] = np.repeat(feed_dict[key],
                                               q_pi_sample_size,
                                               axis=0)
                feed_dict[z_ph] = sample_z(
                    (batch_size * q_pi_sample_size, z_dim))
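                # Each transition in the batch is repeated q_pi_sample_size
                # times and paired with a fresh latent z, so the policy loss
                # below averages the critic over several z samples per state.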
                if ac_kwargs["pi_separate"]:
                    if len(rewards) % 2 == 0:
                        outs = sess.run([pi_loss, train_policy_emb_op],
                                        feed_dict)
                    else:
                        outs = sess.run([pi_loss, train_policy_d_op],
                                        feed_dict)
                else:
                    outs = sess.run([pi_loss, train_policy_op], feed_dict)
                logger.store(LossPi=outs[0])

                # Q-learning update
                outs = sess.run([q1_loss, v_loss, q1, v, train_value_op],
                                feed_dict)
                logger.store(LossQ=outs[0],
                             LossV=outs[1],
                             ValueQ=outs[2],
                             ValueV=outs[3])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            rewards.append(ep_ret)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Test the performance of the deterministic version of the agent.
            test_actions = test_agent(number_of_tests_per_epoch)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            ret = logger.log_tabular('EpRet', average_only=True)[0]
            test_ret = logger.log_tabular('TestEpRet', average_only=True)[0]
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('ValueQ', average_only=True)
            logger.log_tabular('ValueV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            rets.append(ret)
            test_rets.append(test_ret)

            if max_ret is None or test_ret > max_ret:
                max_ret = test_ret
                best_test_actions = test_actions

            max_ep_len += inc_ep
            sess.run(target_update, feed_dict)

    logger.save_state(
        {
            "rewards": rewards,
            "best_test_actions": best_test_actions,
            "rets": rets,
            "test_rets": test_rets,
            "max_ret": max_ret
        }, None)

    util.plot_actions(best_test_actions, act_high,
                      logger.output_dir + '/best_test_actions.png')
    logger.log("max ret: %f" % max_ret)
Exemple #11
0
def ppo(env_config, ac_type, ac_kwargs, clip_ratio, epochs, steps_per_epoch,
        optimizer, lr, train_pi_iters, max_ep_len, target_kl, logger_kwargs,
        seed):
    logger = EpochLogger(**logger_kwargs)
    configs = locals().copy()
    configs.pop("logger")
    logger.save_config(configs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = make_env(env_config)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_high = env.action_space.high

    obs_ph, a_ph, adv_ph, logp_old_ph = core.placeholders(
        obs_dim, act_dim, None, None)
    all_phs = [obs_ph, a_ph, adv_ph, logp_old_ph]

    actor_critic = get_ppo_actor_critic(ac_type)
    pi, logp, logp_pi = actor_critic(obs_ph, a_ph, **ac_kwargs)

    # Experience buffer
    buf = PPOBuffer(obs_dim, act_dim, steps_per_epoch)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))

    # Optimizers
    if optimizer == "adam":
        train_pi = tf.train.AdamOptimizer(learning_rate=lr).minimize(pi_loss)
    elif optimizer == "sgd":
        train_pi = tf.train.GradientDescentOptimizer(
            learning_rate=lr).minimize(pi_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    def update():

        print(sess.run(tf.trainable_variables()))

        data = buf.get()
        #util.plot_adv(data[0] * act_high, data[1], logger.output_dir + "/ep_adv%s.png" % epoch)
        inputs = {k: v for k, v in zip(all_phs, data[:4])}
        pi_l_old, ent = sess.run([pi_loss, approx_ent], feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)

        # Log changes from update
        pi_l_new, kl, cf = sess.run([pi_loss, approx_kl, clipfrac],
                                    feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    real_action = env.action_space.default()
    o, r, d, _ = env.step(real_action)

    episode_actions = []
    episode_obs = []
    episode_actions.append(real_action)
    episode_obs.append(o)

    print(tf.trainable_variables())
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        episode_count = 0
        ep_actions = []
        for t in range(steps_per_epoch):
            a, logp_t = sess.run([pi, logp_pi],
                                 feed_dict={obs_ph: o.reshape(1, -1)})
            delta = np.exp(a[0])
            delta = np.clip(delta, 0.95, 1.05)
            real_action = env.action_space.clip(real_action * delta)
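            # The policy output is treated as a log-multiplier: the previous
            # real_action is scaled by exp(a), limited to a +/-5% change per
            # step, then clipped to the action space, so actions drift smoothly.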

            o, r, d, _ = env.step(real_action)

            buf.store(o, a, r, logp_t)

            ep_actions.append(real_action)
            episode_actions.append(real_action)
            episode_obs.append(o)
            ep_ret += r
            ep_len += 1

            if ep_len == max_ep_len or t == steps_per_epoch - 1:
                buf.finish_path()
                logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                real_action = env.action_space.default()
                o, r, d, _ = env.step(real_action)

                util.plot_seq_obs_and_actions(
                    episode_obs, episode_actions, act_high, logger.output_dir +
                    '/episode_actions_%d_%d.png' % (epoch, episode_count))
                episode_count += 1
                episode_actions = []
                episode_obs = []
                episode_actions.append(real_action)
                episode_obs.append(o)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()

        util.plot_actions(ep_actions, act_high,
                          logger.output_dir + '/ep_actions%d.png' % epoch)
Exemple #12
0
def sac(env_fn,
        seed=0,
        gamma=.99,
        lam=.97,
        hidden_sizes=(200, 100),
        alpha=.5,
        v_lr=1e-3,
        q_lr=1e-3,
        pi_lr=1e-3,
        polyak=1e-2,
        epochs=50,
        steps_per_epoch=1000,
        batch_size=100,
        start_steps=10000,
        logger_kwargs=dict(),
        replay_size=int(1e6),
        max_ep_len=1000,
        save_freq=1):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()

    # Dimensions
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    act_limit = env.action_space.high[0]

    # Placeholders
    x_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    a_ph = tf.placeholder(shape=[None, act_dim], dtype=tf.float32)
    x2_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    r_ph = tf.placeholder(shape=[None], dtype=tf.float32)
    d_ph = tf.placeholder(shape=[None], dtype=tf.float32)

    # Networks
    def mlp(x,
            hidden_sizes=(32, ),
            activation=tf.tanh,
            output_activation=None):
        for h in hidden_sizes[:-1]:
            x = tf.layers.dense(x, units=h, activation=activation)
        return tf.layers.dense(x,
                               units=hidden_sizes[-1],
                               activation=output_activation)

    # Why isn't the k used here ?
    def gaussian_likelihood(x, mu, log_std):
        EPS = 1e-8
        pre_sum = -0.5 * (
            ((x - mu) /
             (tf.exp(log_std) + EPS))**2 + 2 * log_std + np.log(2 * np.pi))
        return tf.reduce_sum(pre_sum, axis=1)

    def clip_but_pass_gradient(x, l=-1., u=1.):
        clip_up = tf.cast(x > u, tf.float32)
        clip_low = tf.cast(x < l, tf.float32)
        return x + tf.stop_gradient((u - x) * clip_up + (l - x) * clip_low)

    LOG_STD_MIN = -20
    LOG_STD_MAX = 2

    def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation):
        act_dim = a.shape.as_list()[-1]
        net = mlp(x, list(hidden_sizes), activation, activation)
        mu = tf.layers.dense(net, act_dim, activation=output_activation)
        """
        Because algorithm maximizes trade-off of reward and entropy,
        entropy must be unique to state---and therefore log_stds need
        to be a neural network output instead of a shared-across-states
        learnable parameter vector. But for deep Relu and other nets,
        simply sticking an activationless dense layer at the end would
        be quite bad---at the beginning of training, a randomly initialized
        net could produce extremely large values for the log_stds, which
        would result in some actions being either entirely deterministic
        or too random to come back to earth. Either of these introduces
        numerical instability which could break the algorithm. To
        protect against that, we'll constrain the output range of the
        log_stds, to lie within [LOG_STD_MIN, LOG_STD_MAX]. This is
        slightly different from the trick used by the original authors of
        SAC---they used tf.clip_by_value instead of squashing and rescaling.
        I prefer this approach because it allows gradient propagation
        through log_std where clipping wouldn't, but I don't know if
        it makes much of a difference.
        """
        log_std = tf.layers.dense(net, act_dim, activation=tf.tanh)
        log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std +
                                                                     1)

        std = tf.exp(log_std)
        pi = mu + tf.random_normal(tf.shape(mu)) * std
        logp_pi = gaussian_likelihood(pi, mu, log_std)
        return mu, pi, logp_pi

    def apply_squashing_func(mu, pi, logp_pi):
        mu = tf.tanh(mu)
        pi = tf.tanh(pi)
        # To avoid evil machine precision error, strictly clip 1-pi**2 to [0,1] range.
        logp_pi -= tf.reduce_sum(
            tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1)
        return mu, pi, logp_pi

    with tf.variable_scope("main"):
        activation = tf.tanh
        with tf.variable_scope("pi"):
            # mu = mlp( x_ph, hidden_sizes, activation, None)
            # log_std = mlp( mu, (act_dim,), activation, None)
            # # Avoid out of range log_std. Refer to Github for explanation.
            # log_std = LOG_STD_MIN + .5 * ( LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)
            #
            # mu = mlp( mu, (act_dim,), activation, None)
            #
            # pi = mu + tf.exp( log_std) * tf.random_normal( tf.shape(mu))
            # logp_pi = gaussian_likelihood( pi, mu, log_std)
            #
            # # Follow SpinningUp Implementation
            # mu = tf.tanh(mu)
            # pi = tf.tanh(pi)
            #
            # def clip_but_pass_gradient(x, l=-1., u=1.):
            #     clip_up = tf.cast(x > u, tf.float32)
            #     clip_low = tf.cast(x < l, tf.float32)
            #     # What is this supposed to mean even ?
            #     return x + tf.stop_gradient((u - x)*clip_up + (l - x)*clip_low)
            #
            # # Shameless copy paste
            # logp_pi -= tf.reduce_sum(tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1)

            # Not working version bak
            # squashed_pi = tf.tanh( pi)
            #
            # # To be sure
            # pi = tf.clip_by_value( pi, -act_limit, act_limit)
            #
            # # Must take in the squased polic
            # log_squash_pi = gaussian_likelihood( squashed_pi, mu, log_std)

            # Shameless plug
            mu, pi, logp_pi = mlp_gaussian_policy(x_ph, a_ph, hidden_sizes,
                                                  tf.tanh, None)
            mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi)

        with tf.variable_scope("q1"):
            q1 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (1, ), activation, None),
                            axis=-1)

        with tf.variable_scope("q1", reuse=True):
            q1_pi = tf.squeeze(mlp(tf.concat([x_ph, pi], -1),
                                   hidden_sizes + (1, ), activation, None),
                               axis=-1)

        with tf.variable_scope("q2"):
            q2 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (1, ), activation, None),
                            axis=-1)

        with tf.variable_scope("q2", reuse=True):
            q2_pi = tf.squeeze(mlp(tf.concat([x_ph, pi], -1),
                                   hidden_sizes + (1, ), activation, None),
                               axis=-1)

        with tf.variable_scope("v"):
            # v = mlp( x_ph, hidden_sizes+(1,), activation, None)
            v = tf.squeeze(mlp(x_ph, hidden_sizes + (1, ), activation, None),
                           axis=-1)

    with tf.variable_scope("target"):

        with tf.variable_scope("v"):
            v_targ = tf.squeeze(mlp(x2_ph, hidden_sizes + (1, ), activation,
                                    None),
                                axis=-1)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    # Count variables
    var_counts = tuple(
        count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t v: %d, \t total: %d\n'
        % var_counts)

    # Targets
    q_backup_prestop = r_ph + gamma * (1 - d_ph) * v_targ
    v_backup_prestop = tf.minimum(q1_pi, q2_pi) - alpha * logp_pi
    q_backup, v_backup = tf.stop_gradient(q_backup_prestop), tf.stop_gradient(
        v_backup_prestop)
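    # SAC targets: y_q = r + gamma * (1 - d) * V_targ(s') for both Q-functions,
    # and y_v = min(Q1, Q2)(s, a~pi) - alpha * log pi(a|s) for the value net;
    # stop_gradient treats both targets as constants during backprop.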

    # Q Loss
    q1_loss = tf.reduce_mean((q1 - q_backup)**2)
    q2_loss = tf.reduce_mean((q2 - q_backup)**2)
    q_loss = q1_loss + q2_loss

    # V Loss
    v_loss = tf.reduce_mean((v - v_backup)**2)

    # Pol loss
    pi_loss = tf.reduce_mean(-q1_pi + alpha * logp_pi)

    # Training ops
    v_trainop = tf.train.AdamOptimizer(v_lr).minimize(
        v_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/v"))
    q_trainop = tf.train.AdamOptimizer(q_lr).minimize(
        q_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/q"))
    pi_trainop = tf.train.AdamOptimizer(pi_lr).minimize(
        pi_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/pi"))

    assert polyak <= .5
    # Target update op
    init_v_target = tf.group([
        tf.assign(v_target, v_main) for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])

    update_v_target = tf.group([
        tf.assign(v_target, (1 - polyak) * v_target + polyak * v_main)
        for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])
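    # Note the convention here is the reverse of the TD3 example above: polyak
    # (1e-2) weights the *main* network, i.e.
    # v_target <- (1 - polyak) * v_target + polyak * v_main, hence the
    # assert polyak <= .5 guard below.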

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(init_v_target)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q1': q1,
                              'q2': q2,
                              'v': v
                          })

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            # print( o.reshape(-1, 1))
            # input()
            while not (d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step(
                    sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0])
                ep_ret += r
                ep_len += 1

            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    #Buffer init
    buffer = ReplayBuffer(obs_dim, act_dim, replay_size)

    # Main loop
    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    for t in range(total_steps):
        if t > start_steps:
            a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        d = False if ep_len == max_ep_len else d

        # Still needed ?
        o2 = np.squeeze(o2)

        buffer.store(o, a, r, o2, d)

        o = o2

        if d or (ep_len == max_ep_len):
            for j in range(ep_len):
                batch = buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                # DEBUG:
                # v_backup_prestop_out = sess.run( v_backup_prestop, feed_dict=feed_dict)
                # print( v_backup_prestop_out.shape)
                # print( v_backup_prestop_out)
                # input()

                # Value gradient steps
                v_step_ops = [v_loss, v, v_trainop]
                outs = sess.run(v_step_ops, feed_dict)
                logger.store(LossV=outs[0], VVals=outs[1])

                # Q Gradient steps
                q_step_ops = [q_loss, q1, q2, q_trainop]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                # Policy gradient steps
                # TODO Add entropy logging
                pi_step_ops = [pi_loss, pi_trainop, update_v_target]
                outs = sess.run(pi_step_ops, feed_dict=feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0., 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Saving the model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()