Example 1
def setup_es(seed=0,
             env_id='DirHopper',
             log_path='/tmp/out',
             n_cpu=1,
             **agent_args):
    seed = seed + MPI.COMM_WORLD.Get_rank() * 1000  # offset the base seed per MPI rank
    assert agent_args is not None
    np.random.seed(seed)
    env = env_selector(env_id, seed)
    env.seed(seed)
    es = ES(env, env_id, **agent_args)
    logger.log('Experiment configuration: {}'.format(str(locals())))
    return es
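A minimal sketch of the per-rank seeding idea used in setup_es above, assuming only mpi4py and numpy are available: each MPI worker derives a distinct seed from its rank so that workers draw different random streams.

from mpi4py import MPI
import numpy as np

rank = MPI.COMM_WORLD.Get_rank()
seed = rank * 1000              # same offset scheme as setup_es
np.random.seed(seed)
# Each rank now produces a different sequence, e.g. under `mpirun -n 4 python demo.py`:
print('rank {} first draw: {:.4f}'.format(rank, np.random.rand()))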
Example 2
    def test(self, fix_ppo=None, load_theta_path=None, **_):
        def objective(env, theta, pool_rank):
            agent = self.create_agent(env, pool_rank)
            loss_n_params = len(agent.get_loss().get_params_1d())
            agent.get_loss().set_params_1d(theta[:loss_n_params])
            if self._outer_evolve_policy_init:
                agent.pi.set_params_1d(theta[loss_n_params:])
            # Agent lifetime is inner_opt_freq * inner_max_n_epoch
            out = run_batch_rl(env, agent,
                               inner_opt_freq=self._inner_opt_freq,
                               inner_max_n_epoch=self._inner_max_n_epoch,
                               inner_buffer_size=self._inner_buffer_size,
                               pool_rank=0,
                               ppo_factor=1. if fix_ppo else 0.,
                               render=True, verbose=True)

        if load_theta_path is not None:
            try:
                theta = self.load_theta(load_theta_path)
                while True:
                    objective(self._env, theta, 0)
            except Exception as e:
                print(e)
        logger.log('Test run finished.')
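The objective above slices a single flat theta vector into loss-function parameters and, optionally, policy-initialization parameters. A small illustrative sketch of that split, with hypothetical sizes rather than the repo's actual parameter counts:

import numpy as np

loss_n_params, pi_n_params = 4, 3              # hypothetical dimensions
theta = np.arange(loss_n_params + pi_n_params, dtype=np.float32)

loss_params = theta[:loss_n_params]            # goes to agent.get_loss().set_params_1d
pi_init_params = theta[loss_n_params:]         # goes to agent.pi.set_params_1d when evolving policy init
assert len(loss_params) + len(pi_init_params) == len(theta)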
Example 3
    def train(self, outer_n_epoch, outer_l2, outer_std, outer_learning_rate, outer_n_samples_per_ep,
              n_cpu=None, fix_ppo=None, **_):
        # Requires more than 1 MPI process.
        assert MPI.COMM_WORLD.Get_size() > 1
        assert n_cpu is not None
        if fix_ppo:
            ppo_factor_schedule = PiecewiseSchedule([(0, 1.), (int(outer_n_epoch / 16), 0.5)],
                                                    outside_value=0.5)
        else:
            ppo_factor_schedule = PiecewiseSchedule([(0, 1.), (int(outer_n_epoch / 8), 0.)],
                                                    outside_value=0.)

        outer_lr_scheduler = PiecewiseSchedule([(0, outer_learning_rate),
                                                (int(outer_n_epoch / 2), outer_learning_rate * 0.1)],
                                               outside_value=outer_learning_rate * 0.1)

        def objective(env, theta, pool_rank):
            agent = self.create_agent(env, pool_rank)
            loss_n_params = len(agent.get_loss().get_params_1d())
            agent.get_loss().set_params_1d(theta[:loss_n_params])
            if self._outer_evolve_policy_init:
                agent.pi.set_params_1d(theta[loss_n_params:])
            # Agent lifetime is inner_opt_freq * inner_max_n_epoch
            return run_batch_rl(env, agent,
                                inner_opt_freq=self._inner_opt_freq,
                                inner_buffer_size=self._inner_buffer_size,
                                inner_max_n_epoch=self._inner_max_n_epoch,
                                pool_rank=pool_rank,
                                ppo_factor=ppo_factor_schedule.value(epoch),
                                epoch=None)

        # Initialize theta.
        theta = self.init_theta(self._env)
        num_params = len(theta)
        logger.log('Theta dim: {}'.format(num_params))

        # Set up outer loop parameter update schedule.
        adam = Adam(shape=(num_params,), beta1=0., stepsize=outer_learning_rate, dtype=np.float32)

        # Set up intra-machine parallelization.
        logger.log('Using {} processes per MPI process.'.format(n_cpu))
        from pathos.multiprocessing import ProcessPool
        pool = ProcessPool(nodes=n_cpu)

        begin_time, best_test_return = time.time(), -np.inf
        for epoch in range(outer_n_epoch):

            # Anneal outer learning rate
            adam.stepsize = outer_lr_scheduler.value(epoch)

            noise = np.random.randn(outer_n_samples_per_ep // NUM_EQUAL_NOISE_VECTORS, num_params)
            noise = np.repeat(noise, NUM_EQUAL_NOISE_VECTORS, axis=0)
            theta_noise = theta[np.newaxis, :] + noise * outer_std
            theta_noise = theta_noise.reshape(MPI.COMM_WORLD.Get_size(), -1)

            # Distributes theta_noise vectors to all nodes.
            logger.log('Scattering all perturbed theta vectors and running inner loops ...')

            recvbuf = np.empty(theta_noise.shape[1], dtype='float')
            MPI.COMM_WORLD.Scatter(theta_noise, recvbuf, root=0)
            theta_noise = recvbuf.reshape(-1, num_params)

            # Noise vectors are scattered, run inner loop, parallelized over `pool_size` processes.
            start_time = time.time()
            pool_size = int(outer_n_samples_per_ep / MPI.COMM_WORLD.Get_size())
            results = pool.amap(objective, [self._env] * pool_size, theta_noise, range(pool_size)).get()

            # Extract relevant results
            returns = [utils.ret_to_obj(r['ep_final_rew']) for r in results]
            update_time = [np.mean(r['update_time']) for r in results]
            env_time = [np.mean(r['env_time']) for r in results]
            ep_length = [np.mean(r['ep_length']) for r in results]
            n_ep = [len(r['ep_length']) for r in results]
            mean_ep_kl = [np.mean(r['ep_kl']) for r in results]
            final_rets = [np.mean(r['ep_return'][-3:]) for r in results]

            # We gather the results at node 0
            recvbuf = np.empty([MPI.COMM_WORLD.Get_size(), 7 * pool_size],
                               # 7 = number of scalars in results vector
                               dtype='float') if MPI.COMM_WORLD.Get_rank() == 0 else None
            results_processed_arr = np.asarray(
                [returns, update_time, env_time, ep_length, n_ep, mean_ep_kl, final_rets],
                dtype='float').ravel()
            MPI.COMM_WORLD.Gather(results_processed_arr, recvbuf, root=0)

            # Do outer loop update calculations at node 0
            if MPI.COMM_WORLD.Get_rank() == 0:
                end_time = time.time()
                logger.log(
                    'All inner loops completed, returns gathered ({:.2f} sec).'.format(
                        time.time() - start_time))

                results_processed_arr = recvbuf.reshape(MPI.COMM_WORLD.Get_size(), 7, pool_size)
                results_processed_arr = np.transpose(results_processed_arr, (0, 2, 1)).reshape(-1, 7)
                results_processed = [dict(returns=r[0],
                                          update_time=r[1],
                                          env_time=r[2],
                                          ep_length=r[3],
                                          n_ep=r[4],
                                          mean_ep_kl=r[5],
                                          final_rets=r[6]) for r in results_processed_arr]
                returns = np.asarray([r['returns'] for r in results_processed])

                # ES update
                noise = noise[::NUM_EQUAL_NOISE_VECTORS]
                returns = np.mean(returns.reshape(-1, NUM_EQUAL_NOISE_VECTORS), axis=1)
                theta_grad = relative_ranks(returns).dot(noise) / outer_n_samples_per_ep \
                             - outer_l2 * theta
                theta -= adam.step(theta_grad)

                # Perform `NUM_TEST_SAMPLES` evaluation runs on root 0.
                if epoch % self._outer_plot_freq == 0 or epoch == outer_n_epoch - 1:
                    start_test_time = time.time()
                    logger.log('Performing {} test runs in parallel on node 0 ...'.format(NUM_TEST_SAMPLES))
                    # Evaluation run with current theta
                    test_results = pool.amap(
                        objective,
                        [self._env] * NUM_TEST_SAMPLES,
                        theta[np.newaxis, :] + np.zeros((NUM_TEST_SAMPLES, num_params)),
                        range(NUM_TEST_SAMPLES)
                    ).get()
                    plotting.plot_results(epoch, test_results)
                    test_return = np.mean([utils.ret_to_obj(r['ep_return']) for r in test_results])
                    if test_return > best_test_return:
                        best_test_return = test_return
                        # Save theta as numpy array.
                        self.save_theta(theta)
                    self.save_theta(theta, str(epoch))
                    logger.log('Test runs performed ({:.2f} sec).'.format(time.time() - start_test_time))

                logger.logkv('Epoch', epoch)
                utils.log_misc_stats('Obj', logger, returns)
                logger.logkv('PPOFactor', ppo_factor_schedule.value(epoch))
                logger.logkv('EpochTimeSpent(s)', end_time - start_time)
                logger.logkv('TotalTimeSpent(s)', end_time - begin_time)
                logger.logkv('BestTestObjMean', best_test_return)
                logger.dumpkvs()
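A hedged numpy sketch of the outer-loop estimator used in the ES update above: sample Gaussian perturbations of theta, score them with the inner loop, rank-shape the scores, and combine them with the noise. The relative_ranks below is an assumed stand-in for the repo's helper, mapping returns to centered ranks; only the overall form matches theta_grad above.

import numpy as np

def relative_ranks(x):
    # Centered rank transform in [-0.5, 0.5], a common fitness shaping for ES.
    ranks = np.empty(len(x), dtype=np.float32)
    ranks[np.argsort(x)] = np.arange(len(x), dtype=np.float32)
    return ranks / (len(x) - 1) - 0.5

num_params, n_samples, outer_std, outer_l2 = 10, 8, 0.01, 0.001
theta = np.zeros(num_params, dtype=np.float32)

noise = np.random.randn(n_samples, num_params)
theta_noise = theta[np.newaxis, :] + noise * outer_std   # perturbed candidates
returns = np.random.randn(n_samples)                     # placeholder for inner-loop returns

theta_grad = relative_ranks(returns).dot(noise) / n_samples - outer_l2 * theta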
Example 4
def act_to_env_format(act):
    if np.isnan(act).any() or np.isinf(act).any():
        logger.log("WARNING: nan or inf action {}".format(act))
        return np.zeros_like(act)
    else:
        return act
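As a side note, the two checks in the guard above can also be expressed with a single np.isfinite call; a tiny standalone sketch (using print instead of the repo's logger):

import numpy as np

act = np.array([0.2, np.inf])
if not np.isfinite(act).all():          # covers both nan and inf
    print('WARNING: nan or inf action {}'.format(act))
    act = np.zeros_like(act)
print(act)                              # -> [0. 0.]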
Example 5
    def update(self, obs, acts, rews, dones, ppo_factor, inner_opt_freq):

        epg_rews = rews
        # Want to zero out rewards to the EPG loss function?
        # epg_rews = np.zeros_like(rews)

        # Calculate auxiliary functions.
        lst_bonus = []
        for rew_bonus_eval in self.lst_rew_bonus_eval:
            lst_bonus.append(rew_bonus_eval.predict(obs).T)
        auxs = np.concatenate(lst_bonus, axis=0)

        traj_raw = np.c_[obs, acts, epg_rews, auxs, dones].astype(np.float32)
        # Update here, since we only have access to these raw values at this specific spot.
        self._traj_norm.update(traj_raw)
        traj = self._traj_norm.norm(traj_raw)
        auxs_pad = np.zeros(self._buffer_size - obs.shape[0], dtype=np.float32)
        rew_pad = np.zeros(self._buffer_size - obs.shape[0], dtype=np.float32)
        done_pad = np.zeros(self._buffer_size - obs.shape[0], dtype=np.float32)
        obs_pad = np.zeros((self._buffer_size - obs.shape[0], obs.shape[1]),
                           dtype=np.float32)
        act_pad = np.zeros((self._buffer_size - acts.shape[0], acts.shape[1]),
                           dtype=np.float32)
        pad = np.hstack([
            obs_pad, act_pad, rew_pad[:, None], auxs_pad[:, None],
            done_pad[:, None]
        ])
        traj = np.vstack([pad, traj])
        # Keep the raw (unnormalized) rewards and done flags for the newly added rows;
        # slice from the end so this also works while the buffer is still filling up.
        traj[-len(epg_rews):, obs.shape[1] + acts.shape[1]] = epg_rews
        traj[-len(dones):, -1] = dones

        # Since the buffer length can be larger than the set of new samples, we truncate the
        # trajectories here for PPO.
        dones = dones[-inner_opt_freq:]
        rews = rews[-inner_opt_freq:]
        acts = acts[-inner_opt_freq:]
        obs = obs[-inner_opt_freq:]
        _obs = traj[-inner_opt_freq:, :obs.shape[1]]
        n = len(obs)

        if self._use_ppo:
            old_params_sym = self._pi_f(_obs)
            vp = np.ravel(self._vf_f(_obs).data)
            old_params = [item.data for item in old_params_sym]
            advs = gamma_expand(
                rews + self._ppo_gam *
                (1 - dones) * np.append(vp[1:], vp[-1]) - vp,
                self._ppo_gam * self._ppo_lam * (1 - dones))
            vt = advs + vp
            at = (advs - advs.mean()) / advs.std()

        epg_surr_loss = 0.
        pi_params_before = self._pi_f(_obs)
        for _ in range(self.inner_n_opt_steps):
            for idx in np.array_split(np.random.permutation(n),
                                      n // self.inner_opt_batch_size):
                # Clear gradients
                for v in self.backprop_params:
                    v.cleargrad()

                # Forward pass through loss function.
                # Apply temporal conv to input trajectory
                processed_traj = self._process_trajectory(traj)
                # Compute epg loss value
                epg_surr_loss_sym = self._compute_loss(traj[idx],
                                                       processed_traj[idx])
                epg_surr_loss += epg_surr_loss_sym.data

                # Add bootstrapping signal if needed.
                if self._use_ppo:
                    old_params_idx = [item[idx] for item in old_params]
                    ppo_surr_loss = self._compute_ppo_loss(
                        _obs[idx], acts[idx], at[idx], vt[idx], old_params_idx)
                    total_surr_loss = epg_surr_loss_sym * (
                        1 - ppo_factor) + ppo_surr_loss * ppo_factor
                else:
                    total_surr_loss = epg_surr_loss_sym

                # Backward pass through loss function
                total_surr_loss.backward()
                for v, adam in zip(self.backprop_params, self._lst_adam):
                    if np.isnan(v.grad).any() or np.isinf(v.grad).any():
                        logger.log(
                            "WARNING: gradient update nan on node {}".format(
                                MPI.COMM_WORLD.Get_rank()))
                    else:
                        v.data += adam.step(v.grad)

        pi_params_after = self._pi_f(_obs)

        return epg_surr_loss / (n // self.inner_opt_batch_size) / self.inner_n_opt_steps, \
               np.mean(self.kl(pi_params_before, pi_params_after).data)
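The advantage computation above relies on a gamma_expand helper from the repo. Assuming it performs a reverse discounted accumulation (out[t] = deltas[t] + discounts[t] * out[t+1]), a minimal sketch of that semantics and how it yields the GAE estimate:

import numpy as np

def gamma_expand_sketch(deltas, discounts):
    # Reverse discounted cumulative sum: out[t] = deltas[t] + discounts[t] * out[t+1].
    out = np.zeros_like(deltas)
    acc = 0.0
    for t in reversed(range(len(deltas))):
        acc = deltas[t] + discounts[t] * acc
        out[t] = acc
    return out

# With deltas = rews + gam * (1 - dones) * v_next - vp and
# discounts = gam * lam * (1 - dones), the result matches the call made in update() above.
rews = np.array([1.0, 1.0, 1.0], dtype=np.float32)
dones = np.array([0.0, 0.0, 1.0], dtype=np.float32)
vp = np.array([0.5, 0.5, 0.5], dtype=np.float32)
gam, lam = 0.99, 0.95
deltas = rews + gam * (1 - dones) * np.append(vp[1:], vp[-1]) - vp
advs = gamma_expand_sketch(deltas, gam * lam * (1 - dones))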
Example 6
def main(test):
    d = datetime.datetime.now()
    date = '{}-{}'.format(d.month, d.day)
    time = '{:02d}-{:02d}'.format(d.hour, d.minute)

    # Experiment params
    # -----------------
    env_id = 'DirHopper'
    # Number of noise vector seeds for ES
    outer_n_samples_per_ep = 8
    # Perform policy SGD updates every `inner_opt_freq` steps
    inner_opt_freq = 64
    # Perform `inner_max_n_epoch` total SGD policy updates,
    # so in total `inner_steps` = `inner_opt_freq` * `inner_max_n_epoch`
    inner_max_n_epoch = 128
    # Temporal convolutions slide over buffer of length `inner_buffer_size`
    inner_buffer_size = inner_opt_freq * 8
    # Use PPO bootstrapping?
    ppo = True
    # Evolve policy initialization together with loss function?
    gpi = False
    # Fix PPO alpha (ppo_factor) to 0.5?
    fix_ppo = False
    # Use memory structure?
    mem = False
    # Number of outer loop epochs
    outer_n_epoch = 2000
    # Outer loop theta L2 penalty
    outer_l2 = 0.001
    # Outer loop noise standard deviation
    outer_std = 0.01
    # Outer loop Adam step size
    outer_learning_rate = 1e-2
    # Inner loop batch size per gradient update
    inner_opt_batch_size = 32
    # Number of times to cycle through the sampled dataset in the inner loop
    inner_n_opt_steps = 1
    # Inner loop adam step size
    inner_lr = 1e-3
    # Plotting frequency in number of outer loop epochs
    plot_freq = 50
    # Maximum number of cpus used per MPI process
    max_cpu = 2
    # Local experiment log path
    launcher.LOCAL_LOG_PATH = os.path.expanduser("~/EPG_experiments")
    # Where to load theta from for `--test true` purposes
    theta_load_path = '~/EPG_experiments/<path_to_theta.npy>/theta.npy'
    # -----------------

    exp_tag = '{}-{}-{}{}{}{}'.format(
        outer_n_samples_per_ep,
        inner_opt_freq,
        inner_max_n_epoch,
        '-p' if ppo else '',
        '-i' if gpi else '',
        '-f' if fix_ppo else '',
    ).replace('.', '')
    exp_name = '{}-{}-{}'.format(time, env_id.lower(), exp_tag)
    job_name = 'epg-{}--{}'.format(date, exp_name)

    epg_args = dict(
        env_id=env_id,
        n_cpu=max_cpu,
        log_path=os.path.join(launcher.LOCAL_LOG_PATH, date, exp_name),
        load_theta_path=theta_load_path if test else None,
        plot_freq=plot_freq,
        outer_n_epoch=outer_n_epoch,
        outer_l2=outer_l2,
        outer_std=outer_std,
        outer_learning_rate=outer_learning_rate,
        outer_n_samples_per_ep=outer_n_samples_per_ep,
        inner_opt_freq=inner_opt_freq,
        inner_max_n_epoch=inner_max_n_epoch,
        inner_opt_batch_size=inner_opt_batch_size,
        inner_buffer_size=inner_buffer_size,
        inner_n_opt_steps=inner_n_opt_steps,
        inner_lr=inner_lr,
        mem=mem,
        inner_use_ppo=ppo,
        fix_ppo=fix_ppo,
        gpi=gpi,
    )

    mpi_machines = 1
    mpi_proc_per_machine = int(
        np.ceil(outer_n_samples_per_ep / mpi_machines / float(max_cpu)))
    logger.log(
        'Running experiment {}/{} with {} noise vectors on {} machines with {}'
        ' MPI processes per machine, each using {} pool processes.'.format(
            date, exp_name, outer_n_samples_per_ep, mpi_machines,
            mpi_proc_per_machine, max_cpu))

    # Experiment launcher
    launcher.call(job_name=job_name,
                  fn=test_run if test else run,
                  kwargs=epg_args,
                  log_relpath=os.path.join(date, exp_name),
                  mpi_proc_per_machine=mpi_proc_per_machine,
                  mpi_machines=mpi_machines)
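For the defaults above, the process-count arithmetic works out as follows (a worked check, not additional configuration): 8 noise vectors spread over 1 machine with 2 pool processes per MPI process gives 4 MPI processes, so 4 x 2 = 8 inner loops run in parallel, one per noise vector.

import numpy as np

outer_n_samples_per_ep, mpi_machines, max_cpu = 8, 1, 2
mpi_proc_per_machine = int(np.ceil(outer_n_samples_per_ep / mpi_machines / float(max_cpu)))
assert mpi_proc_per_machine == 4
assert mpi_proc_per_machine * max_cpu * mpi_machines == outer_n_samples_per_ep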
Example 7
def run_batch_rl(env,
                 agent,
                 inner_opt_freq,
                 inner_max_n_epoch,
                 inner_buffer_size,
                 pool_rank,
                 ppo_factor,
                 epoch=None,
                 render=False,
                 verbose=False):
    from collections import deque
    assert isinstance(inner_opt_freq, int)
    assert isinstance(inner_max_n_epoch, int)
    assert isinstance(inner_buffer_size, int)
    lst_ep_rew, lst_loss, lst_ep_steps, lst_kl = [], [], [], []
    buffer = deque(maxlen=inner_buffer_size)
    n_ep, ep_rew, ep_steps = 0, 0., 0
    tot_update_time, start_env_time = 0., time.time()
    # Assumes meta wrapper used.
    if epoch is not None:
        env.meta_reset(epoch)
        env.seed(epoch)
    else:
        env.meta_reset(pool_rank + utils.get_time_seed())
        env.seed(pool_rank + utils.get_time_seed())

    obs = env.reset()
    n_steps = 0
    for itr in range(inner_max_n_epoch):
        ep_obs = []
        for _ in range(inner_opt_freq):
            obs = obs.astype(np.float32)
            act = agent.act(obs)
            obs_prime, rew, done, _ = env.step(agent.act_to_env_format(act))
            ep_obs.append(obs)
            buffer.append((obs, act, rew, done))
            ep_rew += rew
            ep_steps += 1
            n_steps += 1
            if done:
                obs = env.reset()
                lst_ep_rew.append(ep_rew)
                lst_ep_steps.append(ep_steps)
                if verbose and pool_rank == 0:
                    logger.log('Train run (ep {}, return {:.3f})'.format(
                        n_ep, ep_rew))
                ep_steps, ep_rew = 0, 0.
                n_ep += 1
            else:
                obs = obs_prime

        # This is disabled for now. But it's easy to add an exploration bonus as an additional
        # input to the loss function!
        # for rew_bonus_eval in agent.lst_rew_bonus_eval:
        #     rew_bonus_eval.fit_before_process_samples(obs)

        start_update_time = time.time()
        loss_input = [
            np.array([e[i] for e in buffer], dtype=np.float32)
            for i in range(len(buffer[0]))
        ]
        loss_input += [ppo_factor, inner_opt_freq]
        loss, kl = agent.update(*loss_input)
        lst_loss.append(loss)
        lst_kl.append(kl)
        tot_update_time += time.time() - start_update_time

    # Evaluate final policy
    obs, final_rew, ep_counter = env.reset(), [0., 0., 0.], 0
    while ep_counter < 3:
        obs = obs.astype(np.float32)
        act = agent.act(obs)
        obs_prime, rew, done, _ = env.step(agent.act_to_env_format(act))
        final_rew[ep_counter] += rew
        if done:
            obs = env.reset()
            ep_counter += 1
        else:
            obs = obs_prime

    tot_env_time = time.time() - start_env_time - tot_update_time

    if render:
        logger.log('Rendering final policy for 5 episodes ...')
        obs, ep_rew = env.reset(), 0.
        ep_counter = 0
        while ep_counter < 5:
            obs = obs.astype(np.float32)
            act = agent.act(obs)
            obs_prime, rew, done, _ = env.step(agent.act_to_env_format(act))
            env.render()
            ep_rew += rew
            if done:
                logger.log(
                    'Test run with final policy (return {:.3f}).'.format(
                        ep_rew))
                time.sleep(2)
                obs, ep_rew = env.reset(), 0.
                ep_counter += 1
            else:
                obs = obs_prime

    return dict(ep_return=np.asarray(lst_ep_rew),
                ep_final_rew=np.asarray(final_rew),
                ep_loss=lst_loss,
                ep_length=lst_ep_steps,
                ep_kl=np.asarray(lst_kl),
                update_time=tot_update_time,
                env_time=tot_env_time)
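The loss_input construction above turns the replay deque of (obs, act, rew, done) tuples into one float32 array per field. A small self-contained sketch of that transformation with toy shapes:

from collections import deque
import numpy as np

buffer = deque(maxlen=4)
for t in range(3):
    # (obs, act, rew, done) with toy 2-D observations and 1-D actions
    buffer.append((np.ones(2) * t, np.zeros(1), float(t), False))

loss_input = [np.array([e[i] for e in buffer], dtype=np.float32)
              for i in range(len(buffer[0]))]
# loss_input[0].shape == (3, 2)        observations
# loss_input[2]       == [0., 1., 2.]  rewards
# loss_input[3]       == [0., 0., 0.]  done flags as float32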