Example 1
def run_experiment(algo,
                   policy,
                   env_fn,
                   args,
                   normalizer=None,
                   log=True,
                   monitor=False,
                   render=False):
    logger = Logger(args, viz=monitor) if log else None

    # HOTFIX for Patrick's desktop (multiprocessing is buggy on it for some reason)

    if render:
        policy.share_memory()

        train_p = mp.Process(target=algo.train,
                             args=(env_fn, policy, args.n_itr, normalizer),
                             kwargs=dict(logger=logger))
        train_p.start()

        # TODO: add normalize as a commandline argument
        renv_fn = partial(env_fn)

        renv = Normalize(Vectorize([renv_fn]))
        render_p = mp.Process(target=renderloop, args=(renv, policy))
        render_p.start()

        train_p.join()
        render_p.join()

    else:
        print("logger: ", logger)
        algo.train(env_fn, policy, args.n_itr, normalizer, logger=logger)
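A minimal driver for run_experiment might look like the sketch below. Only the run_experiment signature and its use of args.n_itr come from the example above; the PPO constructor, the extra Namespace fields, and the policy/env_fn construction (as in Example 3) are assumptions for illustration.

# Hypothetical driver, not part of the code above: PPO's constructor and the
# extra args fields are assumed; policy and env_fn are built as in Example 3.
import argparse

args = argparse.Namespace(n_itr=1000, logdir="/tmp/ppo_logs")  # assumed fields
algo = PPO(args=args)                      # assumed constructor
policy = GaussianMLP(obs_dim, action_dim)  # as in Example 3
run_experiment(algo, policy, env_fn, args, log=False, render=False)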
Example 2
    def _sample(self,
                env_fn,
                policy,
                min_steps,
                max_traj_len,
                deterministic=False):
        """
        Sample at least min_steps number of total timesteps, truncating 
        trajectories only if they exceed max_traj_len number of timesteps
        """
        env = Vectorize([env_fn])

        memory = PPOBuffer(self.gamma, self.lam)

        num_steps = 0
        while num_steps < min_steps:
            state = torch.Tensor(env.reset())

            done = False
            value = 0
            traj_len = 0

            while not done and traj_len < max_traj_len:
                value, action = policy.act(state, deterministic)

                next_state, reward, done, _ = env.step(action.numpy())

                memory.store(state.numpy(), action.numpy(), reward,
                             value.numpy())

                state = torch.Tensor(next_state)

                traj_len += 1
                num_steps += 1

            value, _ = policy.act(state)
            memory.finish_path(last_val=(not done) * value.numpy())

        return memory
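PPOBuffer itself is not shown in this excerpt, so the following is only a sketch of what finish_path(last_val=...) presumably does: compute GAE(lambda) advantages and discounted returns for the trajectory just collected, bootstrapping from last_val when the rollout was truncated rather than terminated.

# Sketch only: assumed behavior of finish_path, not the actual PPOBuffer code.
import numpy as np

def finish_path_sketch(rewards, values, last_val, gamma=0.99, lam=0.95):
    # Append the bootstrap value, then work backwards through the trajectory,
    # accumulating TD residuals into GAE(lambda) advantages.
    values = np.append(np.asarray(values, dtype=np.float64), last_val)
    rewards = np.asarray(rewards, dtype=np.float64)
    deltas = rewards + gamma * values[1:] - values[:-1]
    advantages = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    returns = advantages + values[:-1]
    return advantages, returns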
Example 3
# TODO: add command line arguments for normalization on/off, and for ensemble policy?

if __name__ == "__main__":
    # see: https://github.com/pytorch/pytorch/issues/13757
    torch.set_num_threads(1)

    if args.new:
        env_fn = make_env_fn(state_est=args.state_est)

        # env_fn = make_cassie_env("walking", clock_based=True)
        # env_fn = functools.partial(CassieEnv_speed, "walking", clock_based=True, state_est=False)
        # env_fn = functools.partial(CassieEnv_nodelta, "walking", clock_based=True, state_est=False)
        # env_fn = functools.partial(CassieEnv_speed_dfreq, "walking", clock_based = True, state_est=args.state_est)

        env = Vectorize([env_fn])

        obs_dim = env_fn().observation_space.shape[0]
        action_dim = env_fn().action_space.shape[0]

        policy = GaussianMLP(obs_dim,
                             action_dim,
                             nonlinearity="relu",
                             init_std=np.exp(-2),
                             learn_std=False)

        # policy2 = ActorCriticNet(obs_dim, action_dim, [256, 256])

        # #print(policy,  sum(p.numel() for p in policy.parameters()))
        # #print(policy2,  sum(p.numel() for p in policy2.parameters()))
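make_env_fn is not defined in this excerpt; judging from the commented-out alternatives above, it presumably just binds environment arguments with functools.partial, roughly as below (a guess, not the repository's actual helper).

# Assumed shape of make_env_fn, inferred from the commented-out lines above.
import functools

def make_env_fn(state_est=False):
    return functools.partial(CassieEnv_speed, "walking",
                             clock_based=True, state_est=state_est)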
Example 4
    def train(self, env_fn, policy, n_itr, normalize=None, logger=None):

        if normalize is not None:
            policy.train()
        else:
            policy.train(0)

        env = Vectorize([env_fn])  # this will be useful for parallelism later

        if normalize is not None:
            env = normalize(env)

            mean, std = env.ob_rms.mean, np.sqrt(env.ob_rms.var + 1E-8)
            policy.obs_mean = torch.Tensor(mean)
            policy.obs_std = torch.Tensor(std)
            policy.train(0)

        env = Vectorize([env_fn])

        old_policy = deepcopy(policy)

        optimizer = optim.SGD(policy.parameters(), lr=self.lr)

        start_time = time.time()

        for itr in range(n_itr):
            print("********** Iteration {} ************".format(itr))

            sample_t = time.time()
            if self.n_proc > 1:
                print("sampling in parallel")
                batch = self.sample_parallel(env_fn, policy, self.num_steps, 300)
            else:
                batch = self._sample(env_fn, policy, self.num_steps, 300)  # TODO: fix this

            print("sample time: {:.2f} s".format(time.time() - sample_t))

            observations, actions, returns, values = map(torch.Tensor, batch.get())

            advantages = returns - values
            advantages = (advantages - advantages.mean()) / (advantages.std() + self.eps)

            minibatch_size = self.minibatch_size or advantages.numel()

            print("timesteps in batch: %i" % advantages.numel())

            old_policy.load_state_dict(policy.state_dict())  # WAY faster than deepcopy

            for _ in range(self.epochs):
                losses = []
                sampler = BatchSampler(SubsetRandomSampler(range(advantages.numel())),
                                       minibatch_size,
                                       drop_last=True)

                for indices in sampler:
                    indices = torch.LongTensor(indices)

                    obs_batch = observations[indices]
                    action_batch = actions[indices]

                    return_batch = returns[indices]
                    advantage_batch = advantages[indices]

                    values, pdf = policy.evaluate(obs_batch)

                    # TODO, move this outside loop?
                    with torch.no_grad():
                        _, old_pdf = old_policy.evaluate(obs_batch)
                        old_log_probs = old_pdf.log_prob(action_batch).sum(-1, keepdim=True)

                    log_probs = pdf.log_prob(action_batch).sum(-1, keepdim=True)

                    ratio = (log_probs - old_log_probs).exp()

                    cpi_loss = ratio * advantage_batch
                    clip_loss = ratio.clamp(1.0 - self.clip,
                                            1.0 + self.clip) * advantage_batch
                    actor_loss = -torch.min(cpi_loss, clip_loss).mean()

                    critic_loss = 0.5 * (return_batch - values).pow(2).mean()

                    entropy_penalty = -self.entropy_coeff * pdf.entropy().mean()

                    # TODO: add ability to optimize critic and actor separately, with different learning rates

                    optimizer.zero_grad()
                    (actor_loss + critic_loss + entropy_penalty).backward()
                    optimizer.step()

                    # Do adaptive step size to satisfy KL div threshold
                    with torch.no_grad():
                        _, pdf = policy.evaluate(obs_batch)
                    curr_lr = self.lr
                    while kl_divergence(pdf, old_pdf).mean() > 0.02:
                        curr_lr /= 2
                        self.update_lr(optimizer, curr_lr)
                        policy.load_state_dict(old_policy.state_dict())
                        optimizer.step()
                        with torch.no_grad():
                            _, pdf = policy.evaluate(obs_batch)

                    if curr_lr != self.lr:
                        print("KL div threshold violated, changed step size to", curr_lr)

                    losses.append([
                        actor_loss.item(),
                        pdf.entropy().mean().item(),
                        critic_loss.item(),
                        ratio.mean().item()
                    ])

                # TODO: add verbosity arguments to suppress this
                print(' '.join(["%g" % x for x in np.mean(losses, axis=0)]))

            if logger is not None:
                test = self.sample(env,
                                   policy,
                                   800 // self.n_proc,
                                   400,
                                   deterministic=True)
                _, pdf = policy.evaluate(observations)
                _, old_pdf = old_policy.evaluate(observations)

                entropy = pdf.entropy().mean().item()
                kl = kl_divergence(pdf, old_pdf).mean().item()

                logger.record("Return (test)", np.mean(test.ep_returns))
                logger.record("Return (batch)", np.mean(batch.ep_returns))
                logger.record("Mean Eplen", np.mean(batch.ep_lens))

                logger.record("Mean KL Div", kl)
                logger.record("Mean Entropy", entropy)
                logger.dump()

            # TODO: add option for how often to save model
            # if itr % 10 == 0:
            if logger is not None and np.mean(test.ep_returns) > self.max_return:
                self.max_return = np.mean(test.ep_returns)
                self.save(policy, env)
                self.save_optim(optimizer)

            print("Total time: {:.2f} s".format(time.time() - start_time))
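self.update_lr is called inside the KL backoff loop but is not shown in this excerpt. A conventional implementation, assumed here, simply rewrites each parameter group's learning rate on the optimizer:

    # Assumed helper (not shown above): apply a new learning rate to every
    # parameter group of the optimizer.
    def update_lr(self, optimizer, new_lr):
        for param_group in optimizer.param_groups:
            param_group["lr"] = new_lr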
Example 5
    def train(self,
              env_fn,
              policy, 
              n_itr,
              normalize=None,
              logger=None):

        if normalize is not None:
            policy.train()
        else:
            policy.train(0)

        env = Vectorize([env_fn]) # this will be useful for parallelism later
        
        if normalize is not None:
            env = normalize(env)

            mean, std = env.ob_rms.mean, np.sqrt(env.ob_rms.var + 1E-8)
            policy.obs_mean = torch.Tensor(mean)
            policy.obs_std = torch.Tensor(std)
            policy.train(0)

        old_policy = deepcopy(policy)

        optimizer = optim.Adam(policy.parameters(), lr=self.lr, eps=self.eps)

        start_time = time.time()

        for itr in range(n_itr):
            print("********** Iteration {} ************".format(itr))

            sample_start = time.time()
            batch = self.sample_parallel(env_fn, policy, self.num_steps, self.max_traj_len)

            print("time elapsed: {:.2f} s".format(time.time() - start_time))
            print("sample time elapsed: {:.2f} s".format(time.time() - sample_start))

            observations, actions, returns, values = map(torch.Tensor, batch.get())

            advantages = returns - values
            advantages = (advantages - advantages.mean()) / (advantages.std() + self.eps)

            minibatch_size = self.minibatch_size or advantages.numel()

            print("timesteps in batch: %i" % advantages.numel())

            old_policy.load_state_dict(policy.state_dict())  # WAY faster than deepcopy

            optimizer_start = time.time()

            self.update(policy, old_policy, optimizer, observations, actions, returns, advantages, env_fn) 
           
            print("optimizer time elapsed: {:.2f} s".format(time.time() - optimizer_start))        


            if logger is not None:
                evaluate_start = time.time()
                test = self.sample_parallel(env_fn, policy, 800 // self.n_proc, self.max_traj_len, deterministic=True)
                print("evaluate time elapsed: {:.2f} s".format(time.time() - evaluate_start))

                _, pdf     = policy.evaluate(observations)
                _, old_pdf = old_policy.evaluate(observations)

                entropy = pdf.entropy().mean().item()
                kl = kl_divergence(pdf, old_pdf).mean().item()

                logger.record("Return (test)", np.mean(test.ep_returns))
                logger.record("Return (batch)", np.mean(batch.ep_returns))
                logger.record("Mean Eplen",  np.mean(batch.ep_lens))
        
                logger.record("Mean KL Div", kl)
                logger.record("Mean Entropy", entropy)
                logger.dump()

            # TODO: add option for how often to save model
            if logger is not None and np.mean(test.ep_returns) > self.max_return:
                self.max_return = np.mean(test.ep_returns)
                self.save(policy, env)

            print("Total time: {:.2f} s".format(time.time() - start_time))
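This version delegates the minibatch optimization to self.update(...), which is not shown; Example 4 suggests it applies the same clipped surrogate objective. As a self-contained illustration of that loss on made-up numbers:

# Toy illustration of the PPO clipped surrogate from Example 4; the tensors
# below are arbitrary stand-ins for one minibatch.
import torch

log_probs     = torch.tensor([-1.00, -0.90, -1.20])
old_log_probs = torch.tensor([-1.10, -0.90, -1.00])
advantages    = torch.tensor([ 0.50, -0.20,  1.00])
clip = 0.2

ratio      = (log_probs - old_log_probs).exp()
cpi_loss   = ratio * advantages
clip_loss  = ratio.clamp(1.0 - clip, 1.0 + clip) * advantages
actor_loss = -torch.min(cpi_loss, clip_loss).mean()
print(actor_loss)  # scalar to minimize, as in Example 4's inner loop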