    def log_diagnostics(self, paths, prefix):
        # forward progress: difference in the observation component at index -3
        # between the last and the first timestep of each path
        progs = [
            path["observations"][-1][-3] - path["observations"][0][-3]
            for path in paths
        ]
        logger.logkv(prefix + 'AverageForwardProgress', np.mean(progs))
        logger.logkv(prefix + 'MaxForwardProgress', np.max(progs))
        logger.logkv(prefix + 'MinForwardProgress', np.min(progs))
        logger.logkv(prefix + 'StdForwardProgress', np.std(progs))
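A purely illustrative sketch of the path structure this method expects; the toy arrays and the 4-dimensional observations below are assumptions, not part of the original code:

import numpy as np

# two toy paths with 4-dimensional observations; index -3 is the quantity
# whose displacement is reported as "forward progress"
toy_paths = [
    {"observations": np.array([[0., 1.0, 0., 0.], [0., 3.5, 0., 0.]])},
    {"observations": np.array([[0., 2.0, 0., 0.], [0., 2.5, 0., 0.]])},
]
# progs would be [2.5, 0.5], so AverageForwardProgress = 1.5,
# MaxForwardProgress = 2.5, MinForwardProgress = 0.5, StdForwardProgress = 1.0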
Example #2
    def obtain_samples(self, log=False, log_prefix='', random=False):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random

        Returns:
            (list): A list of dicts with the samples
        """

        # initial setup / preparation
        paths = []

        n_samples = 0
        num_envs = self.vec_env.num_envs
        running_paths = [
            _get_empty_running_paths_dict() for _ in range(num_envs)
        ]

        pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy
        policy.reset(dones=[True] * self.vec_env.num_envs)

        # initial reset of meta_envs
        obses = np.asarray(self.vec_env.reset())

        while n_samples < self.total_samples:

            # execute policy
            t = time.time()
            if random:
                actions = np.stack(
                    [self.env.action_space.sample() for _ in range(num_envs)],
                    axis=0)
                agent_infos = {}
            else:
                # adapt the dynamics model on the most recent transitions of each env
                a_bs = self.adapt_batch_size
                if a_bs is not None and len(running_paths[0]['observations']) > a_bs + 1:
                    adapt_obs = [
                        np.stack(running_paths[idx]['observations'][-a_bs - 1:-1])
                        for idx in range(num_envs)
                    ]
                    adapt_act = [
                        np.stack(running_paths[idx]['actions'][-a_bs - 1:-1])
                        for idx in range(num_envs)
                    ]
                    adapt_next_obs = [
                        np.stack(running_paths[idx]['observations'][-a_bs:])
                        for idx in range(num_envs)
                    ]
                    policy.dynamics_model.switch_to_pre_adapt()
                    policy.dynamics_model.adapt(adapt_obs, adapt_act, adapt_next_obs)
                actions, agent_infos = policy.get_actions(obses)
            policy_time += time.time() - t

            # step environments
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            # stack agent_infos / env_infos; if no infos were provided (None), substitute empty dicts
            agent_infos, env_infos = self._handle_info_dicts(
                agent_infos, env_infos)

            new_samples = 0
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                # append new samples to running paths
                if isinstance(reward, np.ndarray):
                    reward = reward[0]
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["dones"].append(done)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)

                # if running path is done, add it to paths and empty the running path
                if done:
                    paths.append(
                        dict(
                            observations=np.asarray(
                                running_paths[idx]["observations"]),
                            actions=np.asarray(running_paths[idx]["actions"]),
                            rewards=np.asarray(running_paths[idx]["rewards"]),
                            dones=np.asarray(running_paths[idx]["dones"]),
                            env_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    new_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = _get_empty_running_paths_dict()

            pbar.update(self.vec_env.num_envs)
            n_samples += new_samples
            obses = next_obses
        pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
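The samplers in this section rely on a module-level helper _get_empty_running_paths_dict() that is not shown in these excerpts. A plausible sketch, inferred from the keys the samplers append to:

def _get_empty_running_paths_dict():
    # one growing list per field that the sampler appends to at every step
    return dict(observations=[], actions=[], rewards=[], dones=[],
                env_infos=[], agent_infos=[])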
    def train(self):
        """
        Collects data and trains the dynamics model
        """
        with self.sess.as_default() as sess:

            # Initialize uninitialized vars  (only initialize vars that were not loaded)
            # uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))]
            sess.run(tf.initializers.global_variables())

            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                logger.log("\n ---------------- Iteration %d ----------------" % itr)

                time_env_sampling_start = time.time()

                if self.initial_random_samples and itr == 0:
                    logger.log("Obtaining random samples from the environment...")
                    env_paths = self.sampler.obtain_samples(log=True, random=True, log_prefix='')

                else:
                    logger.log("Obtaining samples from the environment using the policy...")
                    env_paths = self.sampler.obtain_samples(log=True, log_prefix='')

                logger.record_tabular('Time-EnvSampling', time.time() - time_env_sampling_start)

                ''' -------------- Process the samples ----------------'''
                logger.log("Processing environment samples...")

                time_env_samp_proc = time.time()
                samples_data = self.sample_processor.process_samples(env_paths, log=True)
                logger.record_tabular('Time-EnvSampleProc', time.time() - time_env_samp_proc)

                ''' --------------- Fit the dynamics model --------------- '''

                time_fit_start = time.time()

                logger.log("Training dynamics model for %i epochs ..." % (self.dynamics_model_max_epochs))
                self.dynamics_model.fit(samples_data['observations'],
                                        samples_data['actions'],
                                        samples_data['next_observations'],
                                        epochs=self.dynamics_model_max_epochs,
                                        verbose=True,
                                        log_tabular=True)

                logger.record_tabular('Time-ModelFit', time.time() - time_fit_start)

                """ ------------------- Logging --------------------------"""
                logger.logkv('Itr', itr)
                logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                self.log_diagnostics(env_paths, '')
                logger.save_itr_params(itr, params)
                logger.log("Saved")

                logger.dumpkvs()
                if itr == 1:
                    sess.graph.finalize()

        logger.log("Training finished")
        self.sess.close()
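train() calls self.get_itr_snapshot(itr) before saving, but that method is not part of this excerpt. A minimal sketch under the assumption that the snapshot simply bundles the trainer's main components (the exact fields are hypothetical):

    def get_itr_snapshot(self, itr):
        # hypothetical: package everything needed to restore or evaluate this iteration
        return dict(itr=itr,
                    policy=self.policy,
                    env=self.env,
                    dynamics_model=self.dynamics_model)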
    def fit(self, obs, act, obs_next, epochs=1000, compute_normalization=True,
            valid_split_ratio=None, rolling_average_persitency=None, verbose=False, log_tabular=False):

        assert obs.ndim == 3 and obs.shape[2] == self.obs_space_dims
        assert obs_next.ndim == 3 and obs_next.shape[2] == self.obs_space_dims
        assert act.ndim == 3 and act.shape[2] == self.action_space_dims

        if valid_split_ratio is None: valid_split_ratio = self.valid_split_ratio
        if rolling_average_persitency is None: rolling_average_persitency = self.rolling_average_persitency

        assert 1 > valid_split_ratio >= 0

        sess = tf.get_default_session()

        if (self.normalization is None or compute_normalization) and self.normalize_input:
            self.compute_normalization(obs, act, obs_next)

        if self.normalize_input:
            # Normalize data
            obs, act, delta = self._normalize_data(obs, act, obs_next)
            assert obs.ndim == act.ndim == obs_next.ndim == 3
        else:
            delta = obs_next - obs

        # Split into valid and test set
        obs_train, act_train, delta_train, obs_test, act_test, delta_test = train_test_split(obs, act, delta,
                                                                                             test_split_ratio=valid_split_ratio)
        if self._dataset_test is None:
            self._dataset_test = dict(obs=obs_test, act=act_test, delta=delta_test)
            self._dataset_train = dict(obs=obs_train, act=act_train, delta=delta_train)
        else:
            self._dataset_test['obs'] = np.concatenate([self._dataset_test['obs'], obs_test])
            self._dataset_test['act'] = np.concatenate([self._dataset_test['act'], act_test])
            self._dataset_test['delta'] = np.concatenate([self._dataset_test['delta'], delta_test])

            self._dataset_train['obs'] = np.concatenate([self._dataset_train['obs'], obs_train])
            self._dataset_train['act'] = np.concatenate([self._dataset_train['act'], act_train])
            self._dataset_train['delta'] = np.concatenate([self._dataset_train['delta'], delta_train])

        valid_loss_rolling_average = None
        epoch_times = []

        """ ------- Looping over training epochs ------- """
        num_steps_per_epoch = max(int(np.prod(self._dataset_train['obs'].shape[:2])
                                  / (self.meta_batch_size * self.batch_size * 2)), 1)
        num_steps_test = max(int(np.prod(self._dataset_test['obs'].shape[:2])
                                 / (self.meta_batch_size * self.batch_size * 2)), 1)

        for epoch in range(epochs):

            # preparations for recording training stats
            pre_batch_losses = []
            post_batch_losses = []
            t0 = time.time()

            """ ------- Looping through the shuffled and batched dataset for one epoch -------"""
            for _ in range(num_steps_per_epoch):
                obs_batch, act_batch, delta_batch = self._get_batch(train=True)

                pre_batch_loss, post_batch_loss, _ = sess.run(
                    [self.pre_loss, self.post_loss, self.train_op],
                    feed_dict={self.obs_ph: obs_batch,
                               self.act_ph: act_batch,
                               self.delta_ph: delta_batch})

                pre_batch_losses.append(pre_batch_loss)
                post_batch_losses.append(post_batch_loss)

            valid_losses = []
            for _ in range(num_steps_test):
                obs_test, act_test, delta_test = self._get_batch(train=False)

                # compute validation loss
                feed_dict = {self.obs_ph: obs_test,
                             self.act_ph: act_test,
                             self.delta_ph: delta_test}
                valid_loss = sess.run(self.loss, feed_dict=feed_dict)
                valid_losses.append(valid_loss)

            valid_loss = np.mean(valid_losses)
            if valid_loss_rolling_average is None:
                # initialize the rolling average above the current loss to avoid stopping too early
                valid_loss_rolling_average = 1.5 * valid_loss
                valid_loss_rolling_average_prev = 2 * valid_loss
                if valid_loss < 0:
                    # for negative losses, "higher" means closer to zero
                    valid_loss_rolling_average = valid_loss / 1.5
                    valid_loss_rolling_average_prev = valid_loss / 2

            valid_loss_rolling_average = rolling_average_persitency*valid_loss_rolling_average \
                                         + (1.0-rolling_average_persitency)*valid_loss

            epoch_times.append(time.time() - t0)

            if verbose:
                logger.log("Training DynamicsModel - finished epoch %i - "
                           "train loss: %.4f   valid loss: %.4f   valid_loss_mov_avg: %.4f   epoch time: %.2f"
                           % (epoch, np.mean(post_batch_losses), valid_loss, valid_loss_rolling_average,
                              time.time() - t0))

            # early stopping: halt once the rolling validation loss stops improving (or at the last epoch)
            if valid_loss_rolling_average_prev < valid_loss_rolling_average or epoch == epochs - 1:
                logger.log('Stopping training of the model since its valid_loss_rolling_average stopped decreasing')
                break
            valid_loss_rolling_average_prev = valid_loss_rolling_average

        """ ------- Tabular Logging ------- """
        if log_tabular:
            logger.logkv('AvgModelEpochTime', np.mean(epoch_times))
            logger.logkv('Post-Loss', np.mean(post_batch_losses))
            logger.logkv('Pre-Loss', np.mean(pre_batch_losses))
            logger.logkv('Epochs', epoch)
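Both fit() variants call a project-level train_test_split(obs, act, delta, test_split_ratio=...) helper that is not shown here. A sketch of the behavior its usage implies (a random split along the first, trajectory, axis); the implementation details are assumptions:

import numpy as np

def train_test_split(obs, act, delta, test_split_ratio=0.2):
    # split trajectories (first axis) into a training set and a validation set
    assert obs.shape[0] == act.shape[0] == delta.shape[0]
    n = obs.shape[0]
    idx = np.random.permutation(n)
    n_test = int(n * test_split_ratio)
    test_idx, train_idx = idx[:n_test], idx[n_test:]
    return (obs[train_idx], act[train_idx], delta[train_idx],
            obs[test_idx], act[test_idx], delta[test_idx])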
Example #5
    def _log_path_stats(self, paths, log=False, log_prefix=''):
        # compute log stats
        average_discounted_return = np.mean(
            [path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        if log == 'reward':
            logger.logkv(log_prefix + 'AverageReturn',
                         np.mean(undiscounted_returns))

        elif log == 'all' or log is True:
            logger.logkv(log_prefix + 'AverageDiscountedReturn',
                         average_discounted_return)
            logger.logkv(log_prefix + 'AverageReturn',
                         np.mean(undiscounted_returns))
            logger.logkv(log_prefix + 'NumTrajs', len(paths))
            logger.logkv(log_prefix + 'StdReturn',
                         np.std(undiscounted_returns))
            logger.logkv(log_prefix + 'MaxReturn',
                         np.max(undiscounted_returns))
            logger.logkv(log_prefix + 'MinReturn',
                         np.min(undiscounted_returns))
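_log_path_stats assumes a sample processor has already attached a "returns" array to each path. As a reminder of what that field typically holds, a standard discounted-return computation (the discount value here is illustrative):

import numpy as np

def compute_discounted_returns(rewards, discount=0.99):
    # returns[t] = rewards[t] + discount * returns[t + 1]
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        returns[t] = running
    return returns

# path["returns"][0] is then the discounted return of the whole trajectory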
Example #6
    def obtain_samples(self, log=False, log_prefix='', random=False):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random

        Returns:
            (list): A list of path dicts, each holding the observations, actions, rewards, dones, env_infos and agent_infos of one trajectory
        """

        # initial setup / preparation
        paths = []

        n_samples = 0
        running_paths = _get_empty_running_paths_dict()

        pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy
        policy.reset(dones=[True])

        # initial reset of the environment
        obs = np.asarray(self.env.reset())

        ts = 0

        while n_samples < self.total_samples:

            # execute policy
            t = time.time()
            if random:
                action = self.env.action_space.sample()
                agent_info = {}
            else:
                action, agent_info = policy.get_action(obs)
                if action.ndim == 2:
                    action = action[0]
            policy_time += time.time() - t

            # step environments
            t = time.time()
            next_obs, reward, done, env_info = self.env.step(action)

            ts += 1
            done = done or ts >= self.max_path_length
            if done:
                next_obs = self.env.reset()
                ts = 0

            env_time += time.time() - t

            new_samples = 0

            # append new samples to running paths
            if isinstance(reward, np.ndarray):
                reward = reward[0]
            running_paths["observations"].append(obs)
            running_paths["actions"].append(action)
            running_paths["rewards"].append(reward)
            running_paths["dones"].append(done)
            running_paths["env_infos"].append(env_info)
            running_paths["agent_infos"].append(agent_info)

            # if running path is done, add it to paths and empty the running path
            if done:
                paths.append(
                    dict(
                        observations=np.asarray(running_paths["observations"]),
                        actions=np.asarray(running_paths["actions"]),
                        rewards=np.asarray(running_paths["rewards"]),
                        dones=np.asarray(running_paths["dones"]),
                        env_infos=utils.stack_tensor_dict_list(
                            running_paths["env_infos"]),
                        agent_infos=utils.stack_tensor_dict_list(
                            running_paths["agent_infos"]),
                    ))
                new_samples += len(running_paths["rewards"])
                running_paths = _get_empty_running_paths_dict()

            pbar.update(new_samples)
            n_samples += new_samples
            obs = next_obs
        pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
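Both samplers convert their per-step info dicts with utils.stack_tensor_dict_list. The project's version also handles nested dicts; a simplified sketch for flat dicts only (an assumption, not the library implementation):

import numpy as np

def stack_tensor_dict_list(dict_list):
    # turn a list of per-step dicts into a single dict of stacked arrays
    if not dict_list:
        return {}
    return {k: np.asarray([d[k] for d in dict_list]) for k in dict_list[0]}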
    def fit(self,
            obs,
            act,
            obs_next,
            epochs=1000,
            compute_normalization=True,
            valid_split_ratio=None,
            rolling_average_persitency=None,
            verbose=False,
            log_tabular=False):
        assert obs.ndim == 3 and obs.shape[2] == self.obs_space_dims
        assert obs_next.ndim == 3 and obs_next.shape[2] == self.obs_space_dims
        assert act.ndim == 3 and act.shape[2] == self.action_space_dims

        if valid_split_ratio is None:
            valid_split_ratio = self.valid_split_ratio
        if rolling_average_persitency is None:
            rolling_average_persitency = self.rolling_average_persitency

        assert 1 > valid_split_ratio >= 0

        sess = tf.get_default_session()

        if (self.normalization is None
                or compute_normalization) and self.normalize_input:
            self.compute_normalization(obs, act, obs_next)

        if self.normalize_input:
            # normalize data
            obs, act, delta = self._normalize_data(obs, act, obs_next)
            assert obs.ndim == act.ndim == obs_next.ndim == 3
        else:
            delta = obs_next - obs

        obs_train, act_train, delta_train, obs_test, act_test, delta_test = train_test_split(
            obs, act, delta, test_split_ratio=valid_split_ratio)

        if self._dataset_test is None:
            self._dataset_test = dict(obs=obs_test,
                                      act=act_test,
                                      delta=delta_test)
            self._dataset_train = dict(obs=obs_train,
                                       act=act_train,
                                       delta=delta_train)
        else:
            self._dataset_test['obs'] = np.concatenate(
                [self._dataset_test['obs'], obs_test])
            self._dataset_test['act'] = np.concatenate(
                [self._dataset_test['act'], act_test])
            self._dataset_test['delta'] = np.concatenate(
                [self._dataset_test['delta'], delta_test])

            self._dataset_train['obs'] = np.concatenate(
                [self._dataset_train['obs'], obs_train])
            self._dataset_train['act'] = np.concatenate(
                [self._dataset_train['act'], act_train])
            self._dataset_train['delta'] = np.concatenate(
                [self._dataset_train['delta'], delta_train])

        # create data queue
        if self.next_batch is None:
            self.next_batch, self.iterator = self._data_input_fn(
                self._dataset_train['obs'],
                self._dataset_train['act'],
                self._dataset_train['delta'],
                batch_size=self.batch_size)

        valid_loss_rolling_average = None
        epoch_times = []
        """ ------- Looping over training epochs ------- """
        for epoch in range(epochs):

            # initialize data queue
            feed_dict = {
                self.obs_dataset_ph: self._dataset_train['obs'],
                self.act_dataset_ph: self._dataset_train['act'],
                self.delta_dataset_ph: self._dataset_train['delta']
            }

            sess.run(self.iterator.initializer, feed_dict=feed_dict)

            # preparations for recording training stats
            batch_losses = []
            """ ------- Looping through the shuffled and batched dataset for one epoch -------"""
            t0 = time.time()
            while True:
                try:
                    obs_batch, act_batch, delta_batch = sess.run(
                        self.next_batch)
                    hidden_batch = self.get_initial_hidden(obs_batch.shape[0])
                    seq_len = obs_batch.shape[1]

                    # run train op
                    all_grads = []
                    for i in range(0, seq_len, self.backprop_steps):
                        end_i = i + self.backprop_steps
                        feed_dict = {
                            self.obs_ph: obs_batch[:, i:end_i, :],
                            self.act_ph: act_batch[:, i:end_i, :],
                            self.delta_ph: delta_batch[:, i:end_i, :]
                        }
                        hidden_feed_dict = dict(
                            zip(self.hidden_state_ph, hidden_batch))
                        feed_dict.update(hidden_feed_dict)

                        batch_loss, grads, hidden_batch = sess.run(
                            [
                                self.loss, self._gradients_vars,
                                self.next_hidden_state_var
                            ],
                            feed_dict=feed_dict)

                        all_grads.append(grads)
                        batch_losses.append(batch_loss)

                    grads = [np.mean(grad, axis=0) for grad in zip(*all_grads)]
                    feed_dict = dict(zip(self._gradients_ph, grads))
                    _ = sess.run(self.train_op, feed_dict=feed_dict)

                except tf.errors.OutOfRangeError:
                    obs_test = self._dataset_test['obs']
                    act_test = self._dataset_test['act']
                    delta_test = self._dataset_test['delta']
                    hidden_batch = self.get_initial_hidden(obs_test.shape[0])

                    # compute validation loss
                    feed_dict = {
                        self.obs_ph: obs_test,
                        self.act_ph: act_test,
                        self.delta_ph: delta_test,
                        self.hidden_state_ph: hidden_batch
                    }
                    valid_loss = sess.run(self.loss, feed_dict=feed_dict)

                    if valid_loss_rolling_average is None:
                        # initialize the rolling average above the current loss to avoid stopping too early
                        valid_loss_rolling_average = 1.5 * valid_loss
                        valid_loss_rolling_average_prev = 2 * valid_loss
                        if valid_loss < 0:
                            # for negative losses, "higher" means closer to zero
                            valid_loss_rolling_average = valid_loss / 1.5
                            valid_loss_rolling_average_prev = valid_loss / 2

                    valid_loss_rolling_average = rolling_average_persitency*valid_loss_rolling_average \
                                                 + (1.0-rolling_average_persitency)*valid_loss

                    epoch_times.append(time.time() - t0)
                    if verbose:
                        logger.log(
                            "Training RNNDynamicsModel - finished epoch %i --"
                            "train loss: %.4f  valid loss: %.4f  valid_loss_mov_avg: %.4f  epoch time: %.2f"
                            % (epoch, np.mean(batch_losses), valid_loss,
                               valid_loss_rolling_average, time.time() - t0))
                    break

            # early stopping: halt once the rolling validation loss stops improving (or at the last epoch)
            if valid_loss_rolling_average_prev < valid_loss_rolling_average or epoch == epochs - 1:
                logger.log(
                    'Stopping training of the model since its valid_loss_rolling_average stopped decreasing'
                )
                break

            valid_loss_rolling_average_prev = valid_loss_rolling_average
        """ ------- Tabular Logging ------- """
        if log_tabular:
            logger.logkv('AvgModelEpochTime', np.mean(epoch_times))
            logger.logkv('Epochs', epoch)
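The RNN fit() above accumulates per-chunk gradients (self._gradients_vars), averages them in Python, and feeds the averages back through self._gradients_ph into self.train_op. One way such a setup is commonly wired in TF1-style code; the names mirror the excerpt, while the optimizer choice and the presence of a precomputed loss tensor are assumptions:

import tensorflow as tf

# hypothetical graph-construction sketch for truncated-BPTT gradient averaging
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
params = tf.trainable_variables()

# symbolic per-chunk gradients, fetched and averaged in Python during fit()
gradients_vars = tf.gradients(loss, params)  # assumes a scalar loss tensor exists

# placeholders for the averaged gradients, fed back in a second sess.run call
gradients_ph = [tf.placeholder(dtype=tf.float32, shape=p.shape) for p in params]
train_op = optimizer.apply_gradients(list(zip(gradients_ph, params)))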