Example #1
    def fit(self,
            obs,
            act,
            obs_next,
            cp_obs,
            cp_act,
            future_bool,
            epochs=1000,
            compute_normalization=True,
            valid_split_ratio=None,
            rolling_average_persitency=None,
            verbose=False,
            log_tabular=False,
            max_logging=5000):
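        """Fit the dynamics model on flattened multi-step transition data.

        `obs`, `act` and `obs_next` are flattened over `future_length` steps,
        `cp_obs` / `cp_act` hold the context history flattened over
        `history_length` steps, and `future_bool` is a per-step indicator over
        the `future_length` future steps (see the shape assertions below).
        """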

        assert obs.ndim == 2 and obs.shape[
            1] == self.obs_space_dims * self.future_length
        assert obs_next.ndim == 2 and obs_next.shape[
            1] == self.obs_space_dims * self.future_length
        assert act.ndim == 2 and act.shape[
            1] == self.action_space_dims * self.future_length
        assert cp_obs.ndim == 2 and cp_obs.shape[1] == (self.obs_space_dims *
                                                        self.history_length)
        assert cp_act.ndim == 2 and cp_act.shape[1] == (
            self.action_space_dims * self.history_length)
        assert future_bool.ndim == 2 and future_bool.shape[
            1] == self.future_length

        if valid_split_ratio is None:
            valid_split_ratio = self.valid_split_ratio
        if rolling_average_persitency is None:
            rolling_average_persitency = self.rolling_average_persitency

        assert 1 > valid_split_ratio >= 0

        sess = tf.compat.v1.get_default_session()

        obs = obs.reshape(-1, self.obs_space_dims)
        obs_next = obs_next.reshape(-1, self.obs_space_dims)
        delta = self.env.targ_proc(obs, obs_next)
        back_delta = self.env.targ_proc(obs_next, obs)

        obs = obs.reshape(-1, self.future_length * self.obs_space_dims)
        obs_next = obs_next.reshape(-1,
                                    self.future_length * self.obs_space_dims)
        delta = delta.reshape(-1, self.future_length * self.obs_space_dims)
        back_delta = back_delta.reshape(
            -1, self.future_length * self.obs_space_dims)

        single_obs = obs[:, :self.obs_space_dims]
        single_act = act[:, :self.action_space_dims]
        single_delta = delta[:, :self.obs_space_dims]
        single_back_delta = back_delta[:, :self.obs_space_dims]

        if self._dataset is None:
            self._dataset = dict(obs=obs,
                                 act=act,
                                 delta=delta,
                                 cp_obs=cp_obs,
                                 cp_act=cp_act,
                                 future_bool=future_bool,
                                 obs_next=obs_next,
                                 back_delta=back_delta,
                                 single_obs=single_obs,
                                 single_act=single_act,
                                 single_delta=single_delta,
                                 single_back_delta=single_back_delta)
        else:
            self._dataset['obs'] = np.concatenate([self._dataset['obs'], obs])
            self._dataset['act'] = np.concatenate([self._dataset['act'], act])
            self._dataset['delta'] = np.concatenate(
                [self._dataset['delta'], delta])
            self._dataset['cp_obs'] = np.concatenate(
                [self._dataset['cp_obs'], cp_obs])
            self._dataset['cp_act'] = np.concatenate(
                [self._dataset['cp_act'], cp_act])
            self._dataset['future_bool'] = np.concatenate(
                [self._dataset['future_bool'], future_bool])
            self._dataset['obs_next'] = np.concatenate(
                [self._dataset['obs_next'], obs_next])
            self._dataset['back_delta'] = np.concatenate(
                [self._dataset['back_delta'], back_delta])

            self._dataset['single_obs'] = np.concatenate(
                [self._dataset['single_obs'], single_obs])
            self._dataset['single_act'] = np.concatenate(
                [self._dataset['single_act'], single_act])
            self._dataset['single_delta'] = np.concatenate(
                [self._dataset['single_delta'], single_delta])
            self._dataset['single_back_delta'] = np.concatenate(
                [self._dataset['single_back_delta'], single_back_delta])

        self.compute_normalization(self._dataset['single_obs'],
                                   self._dataset['single_act'],
                                   self._dataset['single_delta'],
                                   self._dataset['cp_obs'],
                                   self._dataset['cp_act'],
                                   self._dataset['single_back_delta'])

        dataset_size = self._dataset['obs'].shape[0]
        n_valid_split = min(int(dataset_size * valid_split_ratio), max_logging)
        permutation = np.random.permutation(dataset_size)
        train_obs, valid_obs = self._dataset['obs'][permutation[
            n_valid_split:]], self._dataset['obs'][permutation[:n_valid_split]]
        train_act, valid_act = self._dataset['act'][permutation[
            n_valid_split:]], self._dataset['act'][permutation[:n_valid_split]]
        train_delta, valid_delta = self._dataset['delta'][
            permutation[n_valid_split:]], self._dataset['delta'][
                permutation[:n_valid_split]]
        train_cp_obs, valid_cp_obs = self._dataset['cp_obs'][
            permutation[n_valid_split:]], self._dataset['cp_obs'][
                permutation[:n_valid_split]]
        train_cp_act, valid_cp_act = self._dataset['cp_act'][
            permutation[n_valid_split:]], self._dataset['cp_act'][
                permutation[:n_valid_split]]
        train_obs_next, valid_obs_next = self._dataset['obs_next'][
            permutation[n_valid_split:]], self._dataset['obs_next'][
                permutation[:n_valid_split]]
        train_future_bool, valid_future_bool = self._dataset['future_bool'][
            permutation[n_valid_split:]], self._dataset['future_bool'][
                permutation[:n_valid_split]]
        train_back_delta, valid_back_delta = self._dataset['back_delta'][
            permutation[n_valid_split:]], self._dataset['back_delta'][
                permutation[:n_valid_split]]

        train_obs, train_act, train_delta, train_obs_next, train_back_delta, train_cp_obs, train_cp_act = \
            self._preprocess_inputs(train_obs, train_act, train_delta, train_cp_obs, train_cp_act, train_future_bool, train_obs_next, train_back_delta)
        if n_valid_split > 0:
            valid_obs, valid_act, valid_delta, valid_obs_next, valid_back_delta, valid_cp_obs, valid_cp_act = \
                self._preprocess_inputs(valid_obs, valid_act, valid_delta, valid_cp_obs, valid_cp_act, valid_future_bool, valid_obs_next, valid_back_delta)

        valid_loss_rolling_average = None
        epoch_times = []

        train_dataset_size = train_obs.shape[0]
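        # Each ensemble member trains on its own bootstrap resample of the
        # training set; with a single model, every index is used exactly once.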
        if self.ensemble_size > 1:
            bootstrap_idx = np.random.randint(0,
                                              train_dataset_size,
                                              size=(self.ensemble_size,
                                                    train_dataset_size))
        else:
            bootstrap_idx = np.tile(
                np.arange(train_dataset_size, dtype='int32'),
                (self.ensemble_size, 1))

        valid_dataset_size = valid_obs.shape[0]
        valid_bootstrap_idx = np.tile(
            np.arange(valid_dataset_size, dtype='int32'),
            (self.ensemble_size, 1))

        def shuffle_rows(arr):
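            # Independently permute the columns of each row so that every
            # ensemble member iterates over its indices in a fresh order.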
            idxs = np.argsort(np.random.uniform(size=arr.shape), axis=-1)
            return arr[np.arange(arr.shape[0])[:, None], idxs]

        """ ------- Looping over training epochs ------- """
        for epoch in range(epochs):

            # preparations for recording training stats
            mse_losses, back_mse_losses, recon_losses = [], [], []
            t0 = time.time()

            bootstrap_idx = shuffle_rows(bootstrap_idx)
            """ ------- Looping through the shuffled and batched dataset for one epoch -------"""
            for batch_num in range(
                    int(np.ceil(bootstrap_idx.shape[-1] / self.batch_size))):
                batch_idxs = bootstrap_idx[:, batch_num *
                                           self.batch_size:(batch_num + 1) *
                                           self.batch_size]

                bootstrap_train_obs = train_obs[batch_idxs]
                bootstrap_train_act = train_act[batch_idxs]
                bootstrap_train_delta = train_delta[batch_idxs]
                bootstrap_train_obs_next = train_obs_next[batch_idxs]
                bootstrap_train_back_delta = train_back_delta[batch_idxs]
                bootstrap_train_cp_obs = train_cp_obs[batch_idxs]
                bootstrap_train_cp_act = train_cp_act[batch_idxs]

                feed_dict = self.get_feed_dict(
                    bootstrap_train_obs, bootstrap_train_act,
                    bootstrap_train_delta, bootstrap_train_obs_next,
                    bootstrap_train_back_delta, bootstrap_train_cp_obs,
                    bootstrap_train_cp_act)

                mse_loss, back_mse_loss, recon_loss, _ = sess.run(
                    [
                        self.mse_loss, self.back_mse_loss, self.recon_loss,
                        self.train_op
                    ],
                    feed_dict=feed_dict)

                mse_losses.append(mse_loss)
                back_mse_losses.append(back_mse_loss)
                recon_losses.append(recon_loss)
            """ ------- Validation -------"""
            if n_valid_split > 0:
                bootstrap_valid_obs = valid_obs[valid_bootstrap_idx]
                bootstrap_valid_act = valid_act[valid_bootstrap_idx]
                bootstrap_valid_delta = valid_delta[valid_bootstrap_idx]
                bootstrap_valid_obs_next = valid_obs_next[valid_bootstrap_idx]
                bootstrap_valid_back_delta = valid_back_delta[
                    valid_bootstrap_idx]
                bootstrap_valid_cp_obs = valid_cp_obs[valid_bootstrap_idx]
                bootstrap_valid_cp_act = valid_cp_act[valid_bootstrap_idx]

                feed_dict = self.get_feed_dict(
                    bootstrap_valid_obs, bootstrap_valid_act,
                    bootstrap_valid_delta, bootstrap_valid_obs_next,
                    bootstrap_valid_back_delta, bootstrap_valid_cp_obs,
                    bootstrap_valid_cp_act)

                v_mse_loss, v_back_mse_loss, v_recon_loss = sess.run(
                    [
                        self.mse_loss,
                        self.back_mse_loss,
                        self.recon_loss,
                    ],
                    feed_dict=feed_dict)

                if verbose:
                    logger.log(
                        "Training DynamicsModel - finished epoch %i --"
                        "[Training] mse loss: %.4f  back mse loss: %.4f  recon loss:  %.4f "
                        "[Validation] mse loss: %.4f  back mse loss: %.4f  recon loss:  %.4f  epoch time: %.2f"
                        %
                        (epoch, np.mean(mse_losses), np.mean(back_mse_losses),
                         np.mean(recon_losses), v_mse_loss, v_back_mse_loss,
                         v_recon_loss, time.time() - t0))

                # Early Stopping with Validation Loss
                if valid_loss_rolling_average is None:
                    valid_loss_rolling_average = 1.5 * v_recon_loss  # start the rolling average high to avoid stopping too early
                    valid_loss_rolling_average_prev = 2 * v_recon_loss
                    if v_recon_loss < 0:
                        valid_loss_rolling_average = v_recon_loss / 1.5  # for negative losses, scale towards zero instead
                        valid_loss_rolling_average_prev = v_recon_loss / 2

                valid_loss_rolling_average = rolling_average_persitency*valid_loss_rolling_average \
                                                + (1.0-rolling_average_persitency)*v_recon_loss

                if valid_loss_rolling_average_prev < valid_loss_rolling_average:
                    logger.log(
                        'Stopping model training early since its valid_loss_rolling_average stopped decreasing'
                    )
                    break

            else:
                if verbose:
                    logger.log(
                        "Training DynamicsModel - finished epoch %i --"
                        "[Training] mse loss: %.4f  back mse loss: %.4f  recon loss: %.4f  epoch time: %.2f"
                        %
                        (epoch, np.mean(mse_losses), np.mean(back_mse_losses),
                         np.mean(recon_losses), time.time() - t0))

            valid_loss_rolling_average_prev = valid_loss_rolling_average
            epoch_times.append(time.time() - t0)  # record epoch duration for AvgModelEpochTime
        """ ------- Tabular Logging ------- """
        if log_tabular:
            logger.logkv('AvgModelEpochTime', np.mean(epoch_times))
            logger.logkv('Epochs', epoch)
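
The method above is written against a model instance whose dimensions are not shown here. As a rough, hypothetical usage sketch (the `model` object and the dimension values 11, 3, 10, 5 below are assumptions chosen only to satisfy the shape assertions at the top of `fit`):

import numpy as np

N = 256  # number of multi-step training segments
obs_dim, act_dim, history_len, future_len = 11, 3, 10, 5  # assumed model dims

obs = np.zeros((N, obs_dim * future_len))
act = np.zeros((N, act_dim * future_len))
obs_next = np.zeros((N, obs_dim * future_len))
cp_obs = np.zeros((N, obs_dim * history_len))
cp_act = np.zeros((N, act_dim * history_len))
future_bool = np.ones((N, future_len))

model.fit(obs, act, obs_next, cp_obs, cp_act, future_bool,
          epochs=50, valid_split_ratio=0.1, verbose=True)
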
Example #2
def learn(
    *,
    policy,
    dynamics_model,
    env,
    nsteps,
    total_timesteps,
    ent_coef,
    lr,
    vf_coef=0.5,
    max_grad_norm=0.5,
    gamma=0.99,
    lam=0.95,
    history_length=10,
    state_diff=1,
    load_path='',
    n_layers=2,
    log_interval=10,
    nminibatches=4,
    noptepochs=4,
    cliprange=0.2,
    save_interval=0,
    n_parallel=1,
    num_rollouts=1,
    max_path_length=200,
    seed=0,
    hidden_size=512,
    test_range=[],
    num_test=4,
    total_test=20,
    test_interval=0,
    env_flag='pendulum',
    normalize_flag=0,
    no_test_flag=False,
    only_test_flag=False,
    cp_dim_output=10,
):

    f_test_list = []
    for i in range(0, num_test):
        file_name = '%s/test_c%d.txt' % (logger.get_dir(), i)
        f_test = open(file_name, 'w+')
        f_test_list.append(f_test)

    file_name = '%s/test_tot.txt' % (logger.get_dir())
    f_test_tot = open(file_name, 'w+')

    file_name = '%s/train.txt' % (logger.get_dir())
    f_train = open(file_name, 'w+')

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    if n_parallel > 1:
        vec_env = ParallelEnvExecutor(env, n_parallel, num_rollouts,
                                      max_path_length)
    else:
        vec_env = IterativeEnvExecutor(env, num_rollouts, max_path_length)

    nenvs = vec_env.num_envs
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    obs_dim = env.observation_space.shape[0]
    proc_obs_dim = env.proc_observation_space_dims
    if len(env.action_space.shape) == 0:
        act_dim = env.action_space.n
        discrete = True
    else:
        act_dim = env.action_space.shape[0]
        discrete = False

    make_model = lambda: Model(policy=policy,
                               proc_obs_dim=proc_obs_dim,
                               ac_space=ac_space,
                               nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm,
                               hidden_size=hidden_size,
                               cp_dim_output=cp_dim_output,
                               n_layers=n_layers)

    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    runner = Runner(env=env,
                    dynamics_model=dynamics_model,
                    vec_env=vec_env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    env_flag=env_flag,
                    normalize_flag=normalize_flag,
                    history_length=history_length,
                    state_diff=state_diff)

    if load_path:
        dynamics_model.load(load_path)
        logger.log("Successfully loaded parameters from {}".format(load_path))
    else:
        logger.log("No load_path given, skipping dynamics model loading")

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    nupdates = total_timesteps // nbatch

    test_env_list = []
    if env_flag == 'cartpole':
        env_cls = RandomCartPole_Force_Length
    elif env_flag == 'pendulum':
        env_cls = RandomPendulumAll
    elif env_flag == 'halfcheetah':
        env_cls = HalfCheetahEnv
    elif env_flag == 'cripple_halfcheetah':
        env_cls = CrippleHalfCheetahEnv
    elif env_flag == 'ant':
        env_cls = AntEnv
    elif env_flag == 'slim_humanoid':
        env_cls = SlimHumanoidEnv
    else:
        raise ValueError(env_flag)

    train_env = env_cls()
    train_env.seed(0)
    train_env = normalize(train_env)
    for i in range(0, num_test):
        test_env = env_cls(test_range[i][0], test_range[i][1])
        test_env.seed(0)
        test_env = normalize(test_env)
        vec_test_env = ParallelEnvExecutor(test_env, n_parallel, 10,
                                           max_path_length)
        test_env_list.append(vec_test_env)

    if n_parallel > 1:
        vec_train_env = ParallelEnvExecutor(train_env, n_parallel, 10,
                                            max_path_length)
    else:
        vec_train_env = IterativeEnvExecutor(train_env, 10, max_path_length)

    for update in range(1, nupdates + 1):
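        # Each update collects `nbatch` timesteps with the runner, then runs
        # `noptepochs` passes of minibatch optimization; lr and cliprange are
        # evaluated at the remaining-progress fraction `frac`.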
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        obs, returns, masks, actions, values, neglogpacs, contexts, states, epinfos = runner.run(
        )  #pylint: disable=E0632
        epinfobuf.extend(epinfos)
        mblossvals = []
        if states is None:  # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, contexts, returns, masks,
                                          actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, contexts, returns, masks,
                                          actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('epminrew',
                         safemin([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('epmaxrew',
                         safemax([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir():
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
            runner.save(savepath)

        if not no_test_flag:
            # TEST
            if test_interval and update % test_interval == 0 and logger.get_dir(
            ):
                train_reward = context_pred_rollout_multi(
                    vec_env=vec_train_env,
                    env=train_env,
                    obs_dim=obs_dim,
                    act_dim=act_dim,
                    discrete=discrete,
                    model=model,
                    history_length=history_length,
                    state_diff=state_diff,
                    test_total=total_test,
                    runner=runner)

                print("train reward: " + str(train_reward))
                f_train.write("{}\n".format(train_reward))
                f_train.flush()
                os.fsync(f_train.fileno())

                total_test_reward = 0.0
                for i in range(0, num_test):
                    test_reward = context_pred_rollout_multi(
                        vec_env=test_env_list[i],
                        env=test_env,
                        obs_dim=obs_dim,
                        act_dim=act_dim,
                        discrete=discrete,
                        model=model,
                        history_length=history_length,
                        state_diff=state_diff,
                        test_total=total_test,
                        runner=runner)

                    print("test c" + str(i) + " reward: " + str(test_reward))
                    f_test_list[i].write("{}\n".format(test_reward))
                    f_test_list[i].flush()
                    os.fsync(f_test_list[i].fileno())
                    total_test_reward += test_reward

                f_test_tot.write("{}\n".format(total_test_reward))
                f_test_tot.flush()
                os.fsync(f_test_tot.fileno())
    for i in range(0, num_test):
        f_test_list[i].close()

    f_test_tot.close()
    f_train.close()
    logger.log("Training finished")
Example #3
    def _log_path_stats(self,
                        paths,
                        log=False,
                        log_prefix='',
                        writer=None,
                        itr=None):
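        """Log per-path return statistics; `log` may be 'reward', 'all'/True or
        falsy, and the values are mirrored to the TensorBoard `writer` when one
        is provided."""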
        # compute log stats
        average_discounted_return = np.mean(
            [path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        if log == 'reward':
            logger.logkv(log_prefix + 'AverageReturn',
                         np.mean(undiscounted_returns))
            if writer is not None:
                writer.add_scalar("log/AverageReturn",
                                  np.mean(undiscounted_returns))

        elif log == 'all' or log is True:
            logger.logkv(log_prefix + 'AverageDiscountedReturn',
                         average_discounted_return)
            logger.logkv(log_prefix + 'AverageReturn',
                         np.mean(undiscounted_returns))
            logger.logkv(log_prefix + 'NumTrajs', len(paths))
            logger.logkv(log_prefix + 'StdReturn',
                         np.std(undiscounted_returns))
            logger.logkv(log_prefix + 'MaxReturn',
                         np.max(undiscounted_returns))
            logger.logkv(log_prefix + 'MinReturn',
                         np.min(undiscounted_returns))
            if writer is not None:
                writer.add_scalar("log/AverageDiscountedReturn",
                                  average_discounted_return, itr)
                writer.add_scalar("log/AverageReturn",
                                  np.mean(undiscounted_returns), itr)
                writer.add_scalar("log/NumTrajs", len(paths), itr)
                writer.add_scalar("log/StdReturn",
                                  np.std(undiscounted_returns), itr)
                writer.add_scalar("log/MaxReturn",
                                  np.max(undiscounted_returns), itr)
                writer.add_scalar("log/MinReturn",
                                  np.min(undiscounted_returns), itr)
Example #4
    def obtain_samples(self, log=False, log_prefix='', random=False):
        """
        Collect trajectories from the environment until self.total_samples transitions have been gathered

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random

        Returns:
            (list): A list of path dicts (observations, actions, rewards, dones, env_infos, agent_infos), each at most max_path_length steps long
        """

        # initial setup / preparation
        paths = []

        n_samples = 0
        running_paths = _get_empty_running_paths_dict()

        pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy

        # initial reset of meta_envs
        obs = np.asarray(self.env.reset())

        ts = 0

        while n_samples < self.total_samples:

            # execute policy
            t = time.time()
            if random:
                action = self.env.action_space.sample()
                agent_info = {}
            else:
                action, agent_info = policy.get_action(obs)
                if action.ndim == 2:
                    action = action[0]
            policy_time += time.time() - t

            # step environments
            t = time.time()
            next_obs, reward, done, env_info = self.env.step(action)

            ts += 1
            done = done or ts >= self.max_path_length
            if done:
                next_obs = self.env.reset()
                ts = 0

            env_time += time.time() - t

            new_samples = 0

            # append new samples to running paths
            if isinstance(reward, np.ndarray):
                reward = reward[0]
            running_paths["observations"].append(obs)
            running_paths["actions"].append(action)
            running_paths["rewards"].append(reward)
            running_paths["dones"].append(done)
            running_paths["env_infos"].append(env_info)
            running_paths["agent_infos"].append(agent_info)

            # if running path is done, add it to paths and empty the running path
            if done:
                paths.append(
                    dict(
                        observations=np.asarray(running_paths["observations"]),
                        actions=np.asarray(running_paths["actions"]),
                        rewards=np.asarray(running_paths["rewards"]),
                        dones=np.asarray(running_paths["dones"]),
                        env_infos=utils.stack_tensor_dict_list(
                            running_paths["env_infos"]),
                        agent_infos=utils.stack_tensor_dict_list(
                            running_paths["agent_infos"]),
                    ))
                new_samples += len(running_paths["rewards"])
                running_paths = _get_empty_running_paths_dict()

            pbar.update(new_samples)
            n_samples += new_samples
            obs = next_obs
        pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
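
The _get_empty_running_paths_dict() helper is referenced but not defined in this listing; judging by the keys appended above (plus the extra cp_obs / cp_act keys used by the vectorized sampler in example #6), a defaultdict of lists would be a compatible stand-in. This is an assumption, not the codebase's verbatim helper:

from collections import defaultdict

def _get_empty_running_paths_dict():
    # every key ("observations", "actions", ..., "cp_obs", "cp_act") starts as an empty list
    return defaultdict(list)
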
Example #5
    def train(self):
        """
        Collects data and trains the dynamics model
        """
        f_test_list = []
        for i in range(0, self.num_test):
            file_name = '%s/test_c%d.txt' % (logger.get_dir(), i)
            f_test = open(file_name, 'w+')
            f_test_list.append(f_test)

        file_name = '%s/test_tot.txt' % (logger.get_dir())
        f_test_tot = open(file_name, 'w+')

        file_name = '%s/train.txt' % (logger.get_dir())
        f_train = open(file_name, 'w+')

        itr_times = []
        t0 = time.time()

        test_env_list = []

        if self.env_flag == 'cartpole':
            env_cls = RandomCartPole_Force_Length
        elif self.env_flag == 'pendulum':
            env_cls = RandomPendulumAll
        elif self.env_flag == 'halfcheetah':
            env_cls = HalfCheetahEnv
        elif self.env_flag == 'cripple_halfcheetah':
            env_cls = CrippleHalfCheetahEnv
        elif self.env_flag == 'ant':
            env_cls = AntEnv
        elif self.env_flag == 'slim_humanoid':
            env_cls = SlimHumanoidEnv
        else:
            raise ValueError(self.env_flag)

        train_env = env_cls()
        train_env.seed(0)
        train_env = normalize(train_env)
        for i in range(0, self.num_test):
            test_env = env_cls(self.test_range[i][0], self.test_range[i][1])
            test_env.seed(0)
            test_env = normalize(test_env)
            vec_test_env = ParallelEnvExecutor(test_env, self.test_n_parallel,
                                               self.test_num_rollouts,
                                               self.test_max_epochs)
            test_env_list.append(vec_test_env)

        if len(train_env.action_space.shape) == 0:
            act_dim = train_env.action_space.n
            discrete = True
        else:
            act_dim = train_env.action_space.shape[0]
            discrete = False

        with self.sess.as_default() as sess:

            sess.run(tf.compat.v1.initializers.global_variables())

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
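                # Each iteration either collects samples and fits the dynamics
                # model (training mode) or, in only_test mode, reloads the
                # checkpoint saved for this iteration; evaluation then runs
                # below unless no_test is set.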
                if not self.only_test:
                    itr_start_time = time.time()
                    logger.log(
                        "\n ---------------- Iteration %d ----------------" %
                        itr)

                    time_env_sampling_start = time.time()

                    if self.initial_random_samples and itr == 0:
                        logger.log(
                            "Obtaining random samples from the environment...")
                        env_paths = self.sampler.obtain_samples(log=True,
                                                                random=True,
                                                                log_prefix='')
                    else:
                        logger.log(
                            "Obtaining samples from the environment using the policy..."
                        )
                        env_paths = self.sampler.obtain_samples(log=True,
                                                                log_prefix='')

                    logger.record_tabular(
                        'Time-EnvSampling',
                        time.time() - time_env_sampling_start)
                    ''' -------------- Process the samples ----------------'''
                    logger.log("Processing environment samples...")

                    time_env_samp_proc = time.time()
                    samples_data = self.sample_processor.process_samples(
                        env_paths, log=True, itr=itr)
                    logger.record_tabular('Time-EnvSampleProc',
                                          time.time() - time_env_samp_proc)
                    ''' --------------- Fit the dynamics model --------------- '''

                    time_fit_start = time.time()

                    logger.log("Training dynamics model for %i epochs ..." %
                               (self.dynamics_model_max_epochs))
                    if self.context:
                        self.dynamics_model.fit(
                            samples_data['concat_obs'],
                            samples_data['concat_act'],
                            samples_data['concat_next_obs'],
                            samples_data['cp_observations'],
                            samples_data['cp_actions'],
                            samples_data['concat_bool'],
                            epochs=self.dynamics_model_max_epochs,
                            verbose=True,
                            log_tabular=True)
                    else:
                        self.dynamics_model.fit(
                            samples_data['observations'],
                            samples_data['actions'],
                            samples_data['next_observations'],
                            epochs=self.dynamics_model_max_epochs,
                            verbose=True,
                            log_tabular=True)

                    logger.record_tabular('Time-ModelFit',
                                          time.time() - time_fit_start)
                    """ ------------------- Logging --------------------------"""
                    logger.logkv('Itr', itr)
                    logger.logkv('n_timesteps',
                                 self.sampler.total_timesteps_sampled)

                    logger.logkv('Time', time.time() - start_time)
                    logger.logkv('ItrTime', time.time() - itr_start_time)

                    logger.log("Saving snapshot...")
                    params = self.get_itr_snapshot(itr)
                    self.log_diagnostics(env_paths, '')
                    logger.save_itr_params(itr, params)
                    print(logger.get_dir())
                    checkdir = osp.join(logger.get_dir(), 'checkpoints')
                    os.makedirs(checkdir, exist_ok=True)
                    savepath = osp.join(checkdir,
                                        'params_epoch_{}'.format(itr))
                    self.dynamics_model.save(savepath)
                    logger.log("Saved")

                    logger.dumpkvs()
                else:
                    logger.log("Test - {}/{} iterations".format(
                        itr + 1, self.n_itr))
                    checkdir = osp.join(logger.get_dir(), 'checkpoints')
                    loadpath = osp.join(checkdir,
                                        'params_epoch_{}'.format(itr))
                    self.dynamics_model.load(loadpath)
                    logger.log("Succesfully loaded parameters from {}".format(
                        loadpath))
                    if itr != 0:
                        itr_times.append(time.time() - t0)
                        avg_itr_time = np.mean(itr_times)
                        eta = avg_itr_time * (self.n_itr - itr) / 60.
                        logger.log(
                            "Test - {}/{} iterations | ETA: {:.2f} mins".
                            format(itr + 1, self.n_itr, eta))
                        t0 = time.time()

                if self.no_test:
                    print('no test')
                else:
                    if itr % 1 == 0 or itr == self.n_itr - 1:
                        if self.context:
                            rollout = context_rollout_multi
                        else:
                            rollout = rollout_multi

                        total_test_reward = 0.0
                        for i in range(0, self.num_test):
                            test_reward = rollout(
                                vec_env=test_env_list[i],
                                policy=self.policy,
                                discrete=discrete,
                                num_rollouts=self.test_num_rollouts,
                                test_total=self.total_test,
                                act_dim=act_dim,
                                use_cem=self.use_cem,
                                horizon=self.horizon,
                                context=self.context,
                                history_length=self.history_length,
                                state_diff=self.state_diff)

                            print("test c" + str(i) + " reward: " +
                                  str(test_reward))
                            f_test_list[i].write("{}\n".format(test_reward))
                            f_test_list[i].flush()
                            os.fsync(f_test_list[i].fileno())
                            self.writer.add_scalar("test/c{}".format(i),
                                                   test_reward, itr)
                            total_test_reward += test_reward / self.num_test

                        f_test_tot.write("{}\n".format(total_test_reward))
                        f_test_tot.flush()
                        os.fsync(f_test_tot.fileno())
                        self.writer.add_scalar("test/total_test",
                                               total_test_reward, itr)

                if itr == 1:
                    sess.graph.finalize()

        for i in range(0, self.num_test):
            f_test_list[i].close()

        f_test_tot.close()
        f_train.close()
        logger.log("Training finished")
        self.sess.close()
Example #6
    def obtain_samples(self, log=False, log_prefix='', random=False):
        """
        Collect trajectories by stepping the vectorized environments until self.total_samples transitions have been gathered

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random

        Returns:
            (list): A list of dicts with the samples
        """

        # initial setup / preparation
        paths = []

        n_samples = 0
        num_envs = self.vec_env.num_envs
        running_paths = [
            _get_empty_running_paths_dict() for _ in range(num_envs)
        ]

        pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy
        if self.use_cem:
            for i in range(num_envs):
                self.reset_cem(i)

        # initial reset of meta_envs
        obses = np.asarray(self.vec_env.reset())
        state_counts = [0] * self.vec_env.num_envs

        # history
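        # One flat history buffer per environment, holding the last
        # `history_length` observations (or observation differences) and actions.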
        self.obs_dim = obses.shape[1]
        history_state = np.zeros(
            (obses.shape[0], self.obs_dim * self.history_length))
        history_act = np.zeros(
            (obses.shape[0], self.act_dim * self.history_length))

        while n_samples < self.total_samples:

            # execute policy
            t = time.time()
            if random:
                actions = np.stack(
                    [self.env.action_space.sample() for _ in range(num_envs)],
                    axis=0)
                agent_infos = {}
            else:
                if self.use_cem:
                    if self.context:
                        cem_solutions, agent_infos = policy.get_actions(
                            obses,
                            init_mean=self.prev_sol,
                            init_var=self.init_var,
                            cp_obs=history_state,
                            cp_act=history_act)
                    else:
                        cem_solutions, agent_infos = policy.get_actions(
                            obses,
                            init_mean=self.prev_sol,
                            init_var=self.init_var)
                    self.prev_sol[:, :-1] = cem_solutions[:, 1:].copy()
                    self.prev_sol[:, -1:] = 0.
                    actions = cem_solutions[:, 0].copy()
                else:
                    if self.context:
                        actions, agent_infos = policy.get_actions(
                            obses, cp_obs=history_state, cp_act=history_act)
                    else:
                        actions, agent_infos = policy.get_actions(obses)
                if len(self.env.action_space.shape) == 0:
                    actions = actions.reshape(-1)

            policy_time += time.time() - t

            # step environments
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            #  stack agent_infos and if no infos were provided (--> None) create empty dicts
            agent_infos, env_infos = self._handle_info_dicts(
                agent_infos, env_infos)

            new_samples = 0
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if len(self.env.action_space.shape) == 0:
                    action = np.eye(self.act_dim)[action]
                else:
                    if action.ndim == 0:
                        action = np.expand_dims(action, 0)
                assert action.ndim == 1, (action, action.shape)

                # append new samples to running paths
                if isinstance(reward, np.ndarray):
                    reward = reward[0]
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["dones"].append(done)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                running_paths[idx]["cp_obs"].append(history_state[idx].copy())
                running_paths[idx]["cp_act"].append(history_act[idx].copy())

                # making a history buffer
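                # Until the buffer is full, write into slot `state_counts[idx]`;
                # afterwards shift everything one slot left and append the newest
                # observation (or observation difference) and action at the end.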
                if state_counts[idx] < self.history_length:
                    if self.state_diff:
                        history_state[idx][state_counts[idx] * self.obs_dim:(
                            state_counts[idx] +
                            1) * self.obs_dim] = next_obses[idx] - observation
                    else:
                        history_state[idx][state_counts[idx] *
                                           self.obs_dim:(state_counts[idx] +
                                                         1) *
                                           self.obs_dim] = observation
                    history_act[idx][state_counts[idx] *
                                     self.act_dim:(state_counts[idx] + 1) *
                                     self.act_dim] = action
                else:
                    history_state[idx][:-self.obs_dim] = history_state[idx][
                        self.obs_dim:]
                    if self.state_diff:
                        history_state[idx][
                            -self.obs_dim:] = next_obses[idx] - observation
                    else:
                        history_state[idx][-self.obs_dim:] = observation
                    history_act[idx][:-self.
                                     act_dim] = history_act[idx][self.act_dim:]
                    history_act[idx][-self.act_dim:] = action

                # if running path is done, add it to paths and empty the running path
                if done:
                    paths.append(
                        dict(
                            observations=np.asarray(
                                running_paths[idx]["observations"]),
                            actions=np.asarray(running_paths[idx]["actions"]),
                            rewards=np.asarray(running_paths[idx]["rewards"]),
                            dones=np.asarray(running_paths[idx]["dones"]),
                            env_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                            cp_obs=np.asarray(running_paths[idx]["cp_obs"]),
                            cp_act=np.asarray(running_paths[idx]["cp_act"]),
                        ))
                    new_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = _get_empty_running_paths_dict()
                    if not random and self.use_cem:
                        self.reset_cem(idx)

                    state_counts[idx] = 0
                    history_state[idx] = np.zeros(
                        (self.obs_dim * self.history_length))
                    history_act[idx] = np.zeros(
                        (self.act_dim * self.history_length))
                else:
                    state_counts[idx] += 1
            pbar.update(self.vec_env.num_envs)
            n_samples += new_samples
            obses = next_obses
        pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths