Example #1
    def train_once(self, itr, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            numpy.float64: Average return.

        """
        paths = self.process_samples(itr, paths)
        epoch = itr / self.steps_per_epoch

        self.episode_rewards.extend(paths['undiscounted_returns'])
        last_average_return = np.mean(self.episode_rewards)
        for _ in range(self.n_train_steps):
            if (self.replay_buffer.n_transitions_stored >=
                    self.min_buffer_size):
                self.evaluate = True
                qf_loss = self.optimize_policy(epoch, None)
                self.episode_qf_losses.append(qf_loss)

        if self.evaluate:
            if itr % self.target_network_update_freq == 0:
                self._qf_update_ops()

        if itr % self.steps_per_epoch == 0:
            if self.evaluate:
                mean100ep_rewards = round(np.mean(self.episode_rewards[-100:]),
                                          1)
                mean100ep_qf_loss = np.mean(self.episode_qf_losses[-100:])
                tabular.record('Epoch', epoch)
                tabular.record('AverageReturn', np.mean(self.episode_rewards))
                tabular.record('StdReturn', np.std(self.episode_rewards))
                tabular.record('Episode100RewardMean', mean100ep_rewards)
                tabular.record('{}/Episode100LossMean'.format(self.qf.name),
                               mean100ep_qf_loss)
        return last_average_return
Example #2
    def train(self, trainer):
        """Obtain samplers and start actual training for each epoch.

        Args:
            trainer (Trainer): Experiment trainer, which provides services
                such as snapshotting and sampler control.

        Returns:
            float: The average return in last epoch cycle.

        """
        if not self._eval_env:
            self._eval_env = trainer.get_env_copy()
        last_returns = [float('nan')]
        trainer.enable_logging = False

        qf_losses = []
        for _ in trainer.step_epochs():
            for cycle in range(self._steps_per_epoch):
                trainer.step_episode = trainer.obtain_episodes(
                    trainer.step_itr)
                if hasattr(self.exploration_policy, 'update'):
                    self.exploration_policy.update(trainer.step_episode)
                qf_losses.extend(
                    self._train_once(trainer.step_itr, trainer.step_episode))
                if (cycle == 0 and self._replay_buffer.n_transitions_stored >=
                        self._min_buffer_size):
                    trainer.enable_logging = True
                    eval_episodes = obtain_evaluation_episodes(
                        self.policy, self._eval_env)
                    last_returns = log_performance(trainer.step_itr,
                                                   eval_episodes,
                                                   discount=self._discount)
                trainer.step_itr += 1
            tabular.record('DQN/QFLossMean', np.mean(qf_losses))
            tabular.record('DQN/QFLossStd', np.std(qf_losses))

        return np.mean(last_returns)
Example #3
    def _fit_baseline(self, samples_data):
        """Update baselines from samples."""
        policy_opt_input_values = self._policy_opt_input_values(samples_data)

        # Augment reward from baselines
        rewards_tensor = self.f_rewards(*policy_opt_input_values)
        returns_tensor = self.f_returns(*policy_opt_input_values)
        returns_tensor = np.squeeze(returns_tensor, -1)

        paths = samples_data['paths']
        valids = samples_data['valids']
        baselines = [path['baselines'] for path in paths]

        # Recompute parts of samples_data
        aug_rewards = []
        aug_returns = []
        for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                       paths):
            path['rewards'] = rew[val.astype(bool)]
            path['returns'] = ret[val.astype(bool)]
            aug_rewards.append(path['rewards'])
            aug_returns.append(path['returns'])
        aug_rewards = tensor_utils.concat_tensor_list(aug_rewards)
        aug_returns = tensor_utils.concat_tensor_list(aug_returns)
        samples_data['rewards'] = aug_rewards
        samples_data['returns'] = aug_returns

        # Calculate explained variance
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           aug_returns)
        tabular.record('{}/ExplainedVariance'.format(self.baseline.name), ev)

        # Fit baseline
        logger.log('Fitting baseline...')
        if hasattr(self.baseline, 'fit_with_samples'):
            self.baseline.fit_with_samples(paths, samples_data)
        else:
            self.baseline.fit(paths)
Example #4
    def train(self, runner):
        """Obtain samplers and start actual training for each epoch.

        Args:
            runner (LocalRunner): LocalRunner is passed to give algorithm
                the access to runner.step_epochs(), which provides services
                such as snapshotting and sampler control.

        Returns:
            float: The average return in last epoch cycle.

        """

        for _ in runner.step_epochs():
            if self.replay_buffer.n_transitions_stored < self.min_buffer_size:
                batch_size = self.min_buffer_size
            else:
                batch_size = None
            runner.step_path = runner.obtain_samples(runner.step_itr,
                                                     batch_size)
            for sample in runner.step_path:
                self.replay_buffer.store(obs=sample.observation,
                                         act=sample.action,
                                         rew=sample.reward,
                                         next_obs=sample.next_observation,
                                         done=sample.terminal)
            self.episode_rewards.append(
                sum(sample.reward for sample in runner.step_path))
            for _ in range(self.gradient_steps):
                last_return, policy_loss, qf1_loss, qf2_loss = self.train_once(
                    runner.step_itr, runner.step_path)
            log_performance(
                runner.step_itr,
                self._obtain_evaluation_samples(runner.get_env_copy(),
                                                num_trajs=10),
                discount=self.discount)
            self.log_statistics(policy_loss, qf1_loss, qf2_loss)
            tabular.record('TotalEnvSteps', runner.total_env_steps)
            runner.step_itr += 1

        return last_return
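The loop above assumes a replay buffer that exposes `store()` with these keyword arguments and an `n_transitions_stored` counter. Below is a minimal, hypothetical FIFO buffer with that surface, written only to make the example self-contained; `SimpleReplayBuffer` is an illustrative name, not the buffer class garage ships.

import random
from collections import deque

import numpy as np


class SimpleReplayBuffer:
    """Minimal FIFO replay buffer sketch matching the calls used above."""

    def __init__(self, capacity):
        self._storage = deque(maxlen=capacity)

    @property
    def n_transitions_stored(self):
        return len(self._storage)

    def store(self, obs, act, rew, next_obs, done):
        # Append one transition; the deque drops the oldest when full.
        self._storage.append((obs, act, rew, next_obs, done))

    def sample(self, batch_size):
        # Uniformly sample a batch of stored transitions.
        batch = random.sample(self._storage, batch_size)
        obs, act, rew, next_obs, done = map(np.asarray, zip(*batch))
        return dict(observation=obs, action=act, reward=rew,
                    next_observation=next_obs, terminal=done)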
Example #5
    def train(self, runner):
        """Obtain samplers and start actual training for each epoch.

        Args:
            runner (LocalRunner): LocalRunner is passed to give algorithm
                the access to runner.step_epochs(), which provides services
                such as snapshotting and sampler control.

        Returns:
            float: The average return in last epoch cycle.

        """
        last_return = None

        for _ in runner.step_epochs():
            for _ in range(self.n_samples):
                runner.step_path = runner.obtain_samples(runner.step_itr)
                tabular.record('TotalEnvSteps', runner.total_env_steps)
                last_return = self.train_once(runner.step_itr,
                                              runner.step_path)
                runner.step_itr += 1

        return last_return
Example #6
    def _train_once(self):
        """Perform one iteration of training."""
        policy_loss_list = []
        qf_loss_list = []
        alpha_loss_list = []
        alpha_list = []
        for _ in range(self._num_steps_per_epoch):
            indices = np.random.choice(range(self._num_train_tasks),
                                       self._meta_batch_size)
            policy_loss, qf_loss, alpha_loss, alpha = self._optimize_policy(
                indices)
            policy_loss_list.append(policy_loss)
            qf_loss_list.append(qf_loss)
            alpha_loss_list.append(alpha_loss)
            alpha_list.append(alpha)

        with tabular.prefix('MetaTrain/Average/'):
            tabular.record('PolicyLoss',
                           np.average(np.array(policy_loss_list)))
            tabular.record('QfLoss', np.average(np.array(qf_loss_list)))
            tabular.record('AlphaLoss', np.average(np.array(alpha_loss_list)))
            tabular.record('Alpha', np.average(np.array(alpha_list)))
Example #7
    def train_once(self, itr, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            float: The average return in last epoch cycle.

        """
        paths = self.process_samples(itr, paths)

        epoch = itr // self.n_samples
        i_sample = itr - epoch * self.n_samples

        tabular.record('Epoch', epoch)
        tabular.record('# Sample', i_sample)

        rtn = paths['average_return']
        self._all_returns.append(paths['average_return'])

        if (itr + 1) % self.n_samples == 0:
            avg_rtns = np.array(self._all_returns)
            self._es.tell(self._all_params, -avg_rtns)
            self.policy.set_param_values(self._es.best.get()[0])

            # Clear for next epoch
            rtn = max(self._all_returns)
            self._all_returns.clear()
            self._all_params = self._sample_params()

        self._cur_params = self._all_params[(i_sample + 1) % self.n_samples]
        self.policy.set_param_values(self._cur_params)

        logger.log(tabular)
        return rtn
Example #8
    def step(self, action):
        """Run one timestep of the environment's dynamics.

        When the end of the episode is reached, reset() should be called to
        reset the environment's internal state.

        Args:
            action: Action provided by the policy, here a combination of the
                force f and the bar lengths l1, l2, ...

        Returns:
            tuple: (observation, reward, done, info), where observation is the
                agent's observation of the current environment, reward (float)
                is the amount of reward due to the previous action, done
                (bool) indicates whether the episode has ended, and info
                (dict) contains other diagnostic information from the previous
                action.

        """
        self.step_cnt += 1
        action = np.clip(action.copy(), self.action_space.low,
                         self.action_space.high)
        f = action[0]  # input force
        l = np.sum(action[1:])  # bar length
        f_total = f + (self.m1 + self.m2) * self.g - self.k * self.y1
        a = f_total / (self.m1 + self.m2)
        self.y1, self.v1 = self.simulate_w_mid_point_euler(self.y1, self.v1, a)
        y2 = self.y1 + l
        obs = np.array([self.y1, self.v1])
        reward = self.calc_reward(y2, f, self.v1)
        done = False
        info = {}
        if self.step_cnt == self.n_steps_per_episode:
            print()
            print('y2: ', y2)
            print('v2: ', self.v1)
            print('l: ', l)
            tabular.record('Env/FinalL', l)

        return obs, reward, done, info
Example #9
    def fit(self, paths):
        """Fit regressor based on paths.

        Args:
            paths (dict[numpy.ndarray]): Sample paths.

        """
        xs = np.concatenate([p['observations'] for p in paths])
        if isinstance(self._env_spec.observation_space, akro.Image) and \
                len(xs[0].shape) < \
                len(self._env_spec.observation_space.shape):
            xs = self._env_spec.observation_space.unflatten_n(xs)
        ys = np.concatenate([p['returns'] for p in paths])
        ys = ys.reshape((-1, 1))

        if self._subsample_factor < 1:
            num_samples_tot = xs.shape[0]
            idx = np.random.randint(
                0, num_samples_tot,
                int(num_samples_tot * self._subsample_factor))
            xs, ys = xs[idx], ys[idx]

        if self._normalize_inputs:
            # recompute normalizing constants for inputs
            self._x_mean.load(np.mean(xs, axis=0, keepdims=True))
            self._x_std.load(np.std(xs, axis=0, keepdims=True) + 1e-8)
            self._old_network.x_mean.load(np.mean(xs, axis=0, keepdims=True))
            self._old_network.x_std.load(
                np.std(xs, axis=0, keepdims=True) + 1e-8)
        if self._normalize_outputs:
            # recompute normalizing constants for outputs
            self._y_mean.load(np.mean(ys, axis=0, keepdims=True))
            self._y_std.load(np.std(ys, axis=0, keepdims=True) + 1e-8)
            self._old_network.y_mean.load(np.mean(ys, axis=0, keepdims=True))
            self._old_network.y_std.load(
                np.std(ys, axis=0, keepdims=True) + 1e-8)
        inputs = [xs, ys]
        loss_before = self._optimizer.loss(inputs)
        tabular.record('{}/LossBefore'.format(self._name), loss_before)
        self._optimizer.optimize(inputs)
        loss_after = self._optimizer.loss(inputs)
        tabular.record('{}/LossAfter'.format(self._name), loss_after)
        if self._use_trust_region:
            tabular.record('{}/MeanKL'.format(self._name),
                           self._optimizer.constraint_val(inputs))
        tabular.record('{}/dLoss'.format(self._name), loss_before - loss_after)
        self._old_model.parameters = self.parameters
Example #10
    def train_once(self, itr, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            float: The average return in last epoch cycle.

        """
        # -- Stage: Calculate baseline
        if hasattr(self._baseline, 'predict_n'):
            baseline_predictions = self._baseline.predict_n(paths)
        else:
            baseline_predictions = [
                self._baseline.predict(path) for path in paths
            ]

        # -- Stage: Pre-process samples based on collected paths
        samples_data = paths_to_tensors(paths, self.max_path_length,
                                        baseline_predictions, self._discount)

        # -- Stage: Run and calculate performance of the algorithm
        undiscounted_returns = log_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
            discount=self._discount)
        self._episode_reward_mean.extend(undiscounted_returns)
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self._episode_reward_mean))
        samples_data['average_return'] = np.mean(undiscounted_returns)

        epoch = itr // self._n_samples
        i_sample = itr - epoch * self._n_samples

        tabular.record('Epoch', epoch)
        tabular.record('# Sample', i_sample)

        rtn = samples_data['average_return']
        self._all_returns.append(samples_data['average_return'])

        if (itr + 1) % self._n_samples == 0:
            avg_rtns = np.array(self._all_returns)
            self._es.tell(self._all_params, -avg_rtns)
            self.policy.set_param_values(self._es.best.get()[0])

            # Clear for next epoch
            rtn = max(self._all_returns)
            self._all_returns.clear()
            self._all_params = self._sample_params()

        self._cur_params = self._all_params[(i_sample + 1) % self._n_samples]
        self.policy.set_param_values(self._cur_params)

        logger.log(tabular)
        return rtn
Example #11
    def fit(self, xs, ys):
        """Fit with input data xs and label ys.

        Args:
            xs (numpy.ndarray): Input data.
            ys (numpy.ndarray): Label of input data.

        """
        if self._subsample_factor < 1:
            num_samples_tot = xs.shape[0]
            idx = np.random.randint(
                0, num_samples_tot,
                int(num_samples_tot * self._subsample_factor))
            xs, ys = xs[idx], ys[idx]

        if self._normalize_inputs:
            # recompute normalizing constants for inputs
            self.model.networks['default'].x_mean.load(
                np.mean(xs, axis=0, keepdims=True))
            self.model.networks['default'].x_std.load(
                np.std(xs, axis=0, keepdims=True) + 1e-8)
            self._old_model.networks['default'].x_mean.load(
                np.mean(xs, axis=0, keepdims=True))
            self._old_model.networks['default'].x_std.load(
                np.std(xs, axis=0, keepdims=True) + 1e-8)
        if self._normalize_outputs:
            # recompute normalizing constants for outputs
            self.model.networks['default'].y_mean.load(
                np.mean(ys, axis=0, keepdims=True))
            self.model.networks['default'].y_std.load(
                np.std(ys, axis=0, keepdims=True) + 1e-8)
            self._old_model.networks['default'].y_mean.load(
                np.mean(ys, axis=0, keepdims=True))
            self._old_model.networks['default'].y_std.load(
                np.std(ys, axis=0, keepdims=True) + 1e-8)
        inputs = [xs, ys]
        loss_before = self._optimizer.loss(inputs)
        tabular.record('{}/LossBefore'.format(self._name), loss_before)
        self._optimizer.optimize(inputs)
        loss_after = self._optimizer.loss(inputs)
        tabular.record('{}/LossAfter'.format(self._name), loss_after)
        if self._use_trust_region:
            tabular.record('{}/MeanKL'.format(self._name),
                           self._optimizer.constraint_val(inputs))
        tabular.record('{}/dLoss'.format(self._name), loss_before - loss_after)
        self._old_model.parameters = self.model.parameters
Example #12
    def train_once(self, itr, paths, obs_upper, obs_lower, action_upper,
                   action_lower, evaluate_paths):
        """Train the algorithm once."""
        paths = self.process_samples(itr, paths)
        evaluate_paths = self.process_samples(itr, evaluate_paths)
        epoch = itr / self.n_epoch_cycles

        self.episode_rewards.extend(evaluate_paths['undiscounted_returns'])
        last_average_return = np.mean(self.episode_rewards)
        for train_itr in range(self.n_train_steps):
            if (self.replay_buffer.n_transitions_stored >=
                    self.min_buffer_size):
                self.evaluate = True
                qf_loss = self.optimize_policy(epoch, None)
                self.episode_qf_losses.append(qf_loss)

        if self.evaluate:
            if itr % self.target_network_update_freq == 0:
                self._qf_update_ops()

        if itr % self.n_epoch_cycles == 0:
            if self.evaluate:
                mean100ep_rewards = round(np.mean(self.episode_rewards[-100:]),
                                          1)
                mean100ep_qf_loss = np.mean(self.episode_qf_losses[-100:])
                tabular.record('Epoch', epoch)
                tabular.record('AverageReturn', np.mean(self.episode_rewards))
                tabular.record('StdReturn', np.std(self.episode_rewards))
                tabular.record('Episode100RewardMean', mean100ep_rewards)
                tabular.record('{}/Episode100LossMean'.format(self.qf.name),
                               mean100ep_qf_loss)

            if not self.smooth_return:
                self.episode_rewards = []

        return last_average_return
Example #13
    def _train_irl(self, paths, itr=0):
        if self.no_reward:
            total_rew = 0.
            for path in paths:
                total_rew += np.sum(path['rewards'])
                path['rewards'] *= 0
            tabular.record('OriginalTaskAverageReturn',
                           total_rew / float(len(paths)))

        if self.irl_model_wt <= 0:
            return paths

        max_iters = self.discrim_train_itrs
        mean_loss = self._irl.train(paths)

        tabular.record('IRLLoss', mean_loss)
        self.irl_params = self._irl.get_params()

        estimated_rewards = self._irl.eval(paths, gamma=self.discount, itr=itr)

        tabular.record('IRLRewardMean',
                       np.mean(np.concatenate(estimated_rewards)))
        tabular.record('IRLRewardMax',
                       np.max(np.concatenate(estimated_rewards)))
        tabular.record('IRLRewardMin',
                       np.min(np.concatenate(estimated_rewards)))

        # Replace the original reward signal with learned reward signal
        # This will be used by agents to learn policy
        if self._irl.score_trajectories:
            for i, path in enumerate(paths):
                path['rewards'][-1] += self.irl_model_wt * estimated_rewards[i]
        else:
            for i, path in enumerate(paths):
                path['rewards'] += self.irl_model_wt * estimated_rewards[i]
        return paths
Example #14
    def train(self, runner):
        """Obtain samplers and start actual training for each epoch.

        Args:
            runner (LocalRunner): LocalRunner is passed to give algorithm
                the access to runner.step_epochs(), which provides services
                such as snapshotting and sampler control.

        Returns:
            float: The average return in last epoch cycle.

        """
        if not self._eval_env:
            self._eval_env = runner.get_env_copy()
        last_returns = [float('nan')]
        runner.enable_logging = False

        qf_losses = []
        for _ in runner.step_epochs():
            for cycle in range(self._steps_per_epoch):
                runner.step_path = runner.obtain_trajectories(runner.step_itr)
                qf_losses.extend(
                    self.train_once(runner.step_itr, runner.step_path))
                if (cycle == 0 and self.replay_buffer.n_transitions_stored >=
                        self._min_buffer_size):
                    runner.enable_logging = True
                    eval_samples = obtain_evaluation_samples(
                        self.policy, self._eval_env)
                    last_returns = log_performance(runner.step_itr,
                                                   eval_samples,
                                                   discount=self._discount)
                runner.step_itr += 1
            tabular.record('DQN/QFLossMean', np.mean(qf_losses))
            tabular.record('DQN/QFLossStd', np.std(qf_losses))

        return np.mean(last_returns)
Example #15
 def fit(self, xs, ys):
     if self._subsample_factor < 1:
         num_samples_tot = xs.shape[0]
         idx = np.random.randint(
             0, num_samples_tot,
             int(num_samples_tot * self._subsample_factor))
         xs, ys = xs[idx], ys[idx]
     sess = tf.compat.v1.get_default_session()
     if self._normalize_inputs:
         # recompute normalizing constants for inputs
         feed_dict = {
             self._x_mean_var_ph: np.mean(xs, axis=0, keepdims=True),
             self._x_std_var_ph: np.std(xs, axis=0, keepdims=True) + 1e-8,
         }
         sess.run([
             self._assign_x_mean,
             self._assign_x_std,
         ], feed_dict=feed_dict)  # yapf: disable
     if self._normalize_outputs:
         # recompute normalizing constants for outputs
         feed_dict = {
             self._y_mean_var_ph: np.mean(ys, axis=0, keepdims=True),
             self._y_std_var_ph: np.std(ys, axis=0, keepdims=True) + 1e-8,
         }
         sess.run([self._assign_y_mean, self._assign_y_std],
                  feed_dict=feed_dict)
     if self._use_trust_region:
         old_means, old_log_stds = self._f_pdists(xs)
         inputs = [xs, ys, old_means, old_log_stds]
     else:
         inputs = [xs, ys]
     loss_before = self._optimizer.loss(inputs)
     if self._name:
         prefix = self._name + '/'
     else:
         prefix = ''
     tabular.record(prefix + 'LossBefore', loss_before)
     self._optimizer.optimize(inputs)
     loss_after = self._optimizer.loss(inputs)
     tabular.record(prefix + 'LossAfter', loss_after)
     if self._use_trust_region:
         tabular.record(prefix + 'MeanKL',
                        self._optimizer.constraint_val(inputs))
     tabular.record(prefix + 'dLoss', loss_before - loss_after)
Example #16
    def fit(self, xs, ys):
        """Optimize the regressor based on the inputs."""
        if self._subsample_factor < 1:
            num_samples_tot = xs.shape[0]
            idx = np.random.randint(
                0, num_samples_tot,
                int(num_samples_tot * self._subsample_factor))
            xs, ys = xs[idx], ys[idx]

        sess = tf.get_default_session()
        if self._normalize_inputs:
            # recompute normalizing constants for inputs
            sess.run([
                tf.assign(self._x_mean_var, np.mean(xs, axis=0,
                                                    keepdims=True)),
                tf.assign(self._x_std_var,
                          np.std(xs, axis=0, keepdims=True) + 1e-8),
            ])
        if self._normalize_outputs:
            # recompute normalizing constants for outputs
            sess.run([
                tf.assign(self._y_mean_var, np.mean(ys, axis=0,
                                                    keepdims=True)),
                tf.assign(self._y_std_var,
                          np.std(ys, axis=0, keepdims=True) + 1e-8),
            ])
        if self._use_trust_region:
            old_means, old_log_stds = self._f_pdists(xs)
            inputs = [xs, ys, old_means, old_log_stds]
        else:
            inputs = [xs, ys]
        loss_before = self._optimizer.loss(inputs)
        if self._name:
            prefix = self._name + '/'
        else:
            prefix = ''
        tabular.record(prefix + 'LossBefore', loss_before)
        self._optimizer.optimize(inputs)
        loss_after = self._optimizer.loss(inputs)
        tabular.record(prefix + 'LossAfter', loss_after)
        if self._use_trust_region:
            tabular.record(prefix + 'MeanKL',
                           self._optimizer.constraint_val(inputs))
        tabular.record(prefix + 'dLoss', loss_before - loss_after)
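The subsample-then-normalize preamble recurs in every regressor `fit` above. Here is a self-contained NumPy sketch of just that preamble; the helper name `subsample_and_normalize` is made up for illustration and is not part of any of the libraries shown.

import numpy as np


def subsample_and_normalize(xs, ys, subsample_factor=1.0):
    """Sketch of the shared preamble in the fit() examples above."""
    if subsample_factor < 1:
        # Subsample with replacement, as np.random.randint does above.
        n = xs.shape[0]
        idx = np.random.randint(0, n, int(n * subsample_factor))
        xs, ys = xs[idx], ys[idx]
    # Per-dimension normalizing constants; 1e-8 guards against zero std.
    x_mean = np.mean(xs, axis=0, keepdims=True)
    x_std = np.std(xs, axis=0, keepdims=True) + 1e-8
    return (xs - x_mean) / x_std, ys, x_mean, x_std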
Example #17
    def extra_recording(self, itr):
        """Record extra training statistics per-iteration.

        Parameters
        ----------
        itr : int
            The iteration number.
        """
        tabular.record('Max Divergence', np.max(self.divergences))
        tabular.record('Min Divergence', np.min(self.divergences))
        tabular.record('Mean Divergence', np.mean(self.divergences))
        return None
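`tabular` and `logger` throughout these snippets come from the dowel logging package used by garage. A minimal usage sketch follows; the `StdOutput` sink and the `dump_all` flush call reflect my understanding of dowel's usual setup rather than anything shown in the examples, so treat them as assumptions.

from dowel import StdOutput, logger, tabular

logger.add_output(StdOutput())      # send logged tables to stdout

with tabular.prefix('Eval/'):
    tabular.record('AverageReturn', 12.3)
    tabular.record('StdReturn', 1.7)

logger.log(tabular)                 # queue the current table row
logger.dump_all()                   # flush to all registered outputs (assumed dowel call)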
Example #18
    def _train_once(self, itr, episodes):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            episodes (garage.EpisodeBatch): Episodes collected using the
                current policy.

        Returns:
            float: The average return of epoch cycle.

        """
        # -- Stage: Run and calculate performance of the algorithm
        undiscounted_returns = log_performance(itr,
                                               episodes,
                                               discount=self._discount)
        self._episode_reward_mean.extend(undiscounted_returns)
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self._episode_reward_mean))
        average_return = np.mean(undiscounted_returns)

        epoch = itr // self._n_samples
        i_sample = itr - epoch * self._n_samples
        tabular.record('Epoch', epoch)
        tabular.record('# Sample', i_sample)
        rtn = average_return
        self._all_returns.append(average_return)

        # -- Stage: Update policy distribution.
        if (itr + 1) % self._n_samples == 0:
            avg_rtns = np.array(self._all_returns)
            best_inds = np.argsort(-avg_rtns)[:self._n_best]
            best_params = np.array(self._all_params)[best_inds]

            # MLE of normal distribution
            self._cur_mean = best_params.mean(axis=0)
            self._cur_std = best_params.std(axis=0)
            self.policy.set_param_values(self._cur_mean)

            # Clear for next epoch
            rtn = max(self._all_returns)
            self._all_returns.clear()
            self._all_params.clear()

        # -- Stage: Generate a new policy for next path sampling
        self._cur_params = self._sample_params(itr)
        self._all_params.append(self._cur_params.copy())
        self.policy.set_param_values(self._cur_params)

        logger.log(tabular)
        return rtn
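The elite-selection update above (keep the `n_best` parameter vectors, refit a diagonal normal by MLE) is the core of the cross-entropy method. A toy, self-contained sketch of the same update on a quadratic stand-in objective; everything here is illustrative and none of it is garage API.

import numpy as np

target = np.array([1.0, -2.0, 0.5])


def average_return(params):
    # Stand-in for a policy rollout: higher is better, maximized at `target`.
    return -np.sum((params - target) ** 2)


cur_mean, cur_std = np.zeros(3), np.ones(3)
n_samples, n_best = 20, 5
for epoch in range(100):
    all_params = cur_mean + cur_std * np.random.randn(n_samples, 3)
    all_returns = np.array([average_return(p) for p in all_params])
    best_inds = np.argsort(-all_returns)[:n_best]
    best_params = all_params[best_inds]
    # MLE of a normal distribution over the elites, as in _train_once above.
    cur_mean, cur_std = best_params.mean(axis=0), best_params.std(axis=0)
print(cur_mean)  # converges toward `target`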
Example #19
    def fit(self, xs, ys):
        """Fit with input data xs and label ys."""
        if self._normalize_inputs:
            # recompute normalizing constants for inputs
            self.model.networks['default'].x_mean.load(
                np.mean(xs, axis=0, keepdims=True))
            self.model.networks['default'].x_std.load(
                np.std(xs, axis=0, keepdims=True) + 1e-8)

        inputs = [xs, ys]
        loss_before = self._optimizer.loss(inputs)
        tabular.record('{}/LossBefore'.format(self._name), loss_before)
        self._optimizer.optimize(inputs)
        loss_after = self._optimizer.loss(inputs)
        tabular.record('{}/LossAfter'.format(self._name), loss_after)
        tabular.record('{}/dLoss'.format(self._name), loss_before - loss_after)
Example #20
    def _train_once(self, itr, episodes):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            episodes (garage.EpisodeBatch): Episodes collected using the
                current policy.

        Returns:
            float: The average return of epoch cycle.

        """
        # -- Stage: Run and calculate performance of the algorithm
        undiscounted_returns = log_performance(itr,
                                               episodes,
                                               discount=self._discount)
        self._episode_reward_mean.extend(undiscounted_returns)
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self._episode_reward_mean))
        average_return = np.mean(undiscounted_returns)

        epoch = itr // self._n_samples
        i_sample = itr - epoch * self._n_samples

        tabular.record('Epoch', epoch)
        tabular.record('# Sample', i_sample)
        rtn = average_return
        self._all_returns.append(average_return)

        if (itr + 1) % self._n_samples == 0:
            avg_rtns = np.array(self._all_returns)
            self._es.tell(self._all_params, -avg_rtns)
            self.policy.set_param_values(self._es.best.get()[0])

            # Clear for next epoch
            rtn = max(self._all_returns)
            self._all_returns.clear()
            self._all_params = self._sample_params()

        self._cur_params = self._all_params[(i_sample + 1) % self._n_samples]
        self.policy.set_param_values(self._cur_params)

        logger.log(tabular)
        return rtn
Example #21
 def fit(self, xs, ys):
     if self.normalize_inputs:
         # recompute normalizing constants for inputs
         new_mean = np.mean(xs, axis=0, keepdims=True)
         new_std = np.std(xs, axis=0, keepdims=True) + 1e-8
         tf.compat.v1.get_default_session().run(
             tf.group(
                 tf.compat.v1.assign(self.x_mean_var, new_mean),
                 tf.compat.v1.assign(self.x_std_var, new_std),
             ))
     inputs = [xs, ys]
     loss_before = self.optimizer.loss(inputs)
     if self.name:
         prefix = self.name + '/'
     else:
         prefix = ''
     tabular.record(prefix + 'LossBefore', loss_before)
     self.optimizer.optimize(inputs)
     loss_after = self.optimizer.loss(inputs)
     tabular.record(prefix + 'LossAfter', loss_after)
     tabular.record(prefix + 'dLoss', loss_before - loss_after)
Example #22
    def fit(self, paths):
        """Fit regressor based on paths.

        Args:
            paths (dict[numpy.ndarray]): Sample paths.

        """
        xs = np.concatenate([p['observations'] for p in paths])
        ys = np.concatenate([p['returns'] for p in paths])
        ys = ys.reshape((-1, 1))
        if self._normalize_inputs:
            # recompute normalizing constants for inputs
            self._x_mean.load(np.mean(xs, axis=0, keepdims=True))
            self._x_std.load(np.std(xs, axis=0, keepdims=True) + 1e-8)

        inputs = [xs, ys]
        loss_before = self._optimizer.loss(inputs)
        tabular.record('{}/LossBefore'.format(self._name), loss_before)
        self._optimizer.optimize(inputs)
        loss_after = self._optimizer.loss(inputs)
        tabular.record('{}/LossAfter'.format(self._name), loss_after)
        tabular.record('{}/dLoss'.format(self._name), loss_before - loss_after)
Example #23
    def train_once(self, itr, trajectories):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            trajectories (TrajectoryBatch): Batch of trajectories.

        Returns:
            numpy.float64: Average return.

        """
        epoch = itr / self._steps_per_epoch

        self.episode_rewards.extend(
            [traj.rewards.sum() for traj in trajectories.split()])
        last_average_return = np.mean(self.episode_rewards)
        for _ in range(self._n_train_steps):
            if (self.replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                qf_loss = self.optimize_policy(None)
                self.episode_qf_losses.append(qf_loss)

        if self.replay_buffer.n_transitions_stored >= self._min_buffer_size:
            if itr % self._target_network_update_freq == 0:
                self._qf_update_ops()

        if itr % self._steps_per_epoch == 0:
            if (self.replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                mean100ep_rewards = round(np.mean(self.episode_rewards[-100:]),
                                          1)
                mean100ep_qf_loss = np.mean(self.episode_qf_losses[-100:])
                tabular.record('Epoch', epoch)
                tabular.record('Episode100RewardMean', mean100ep_rewards)
                tabular.record('{}/Episode100LossMean'.format(self._qf.name),
                               mean100ep_qf_loss)
        return last_average_return
Example #24
    def optimize_policy(self, samples_data):
        """Optimize the policy using the samples.

        Args:
            samples_data (dict): Processed sample data.
                See metarl.tf.paths_to_tensors() for details.

        """
        # Initial BFGS parameter values.
        x0 = np.hstack([self._param_eta, self._param_v])
        # Set parameter boundaries: \eta>=1e-12, v unrestricted.
        bounds = [(-np.inf, np.inf) for _ in x0]
        bounds[0] = (1e-12, np.inf)

        # Optimize dual
        eta_before = self._param_eta
        logger.log('Computing dual before')
        self._feat_diff = self._features(samples_data)
        dual_opt_input_values = self._dual_opt_input_values(samples_data)
        dual_before = self._f_dual(*dual_opt_input_values)
        logger.log('Optimizing dual')

        def eval_dual(x):
            """Evaluate dual function loss.

            Args:
                x (numpy.ndarray): Input to dual function.

            Returns:
                numpy.float64: Dual function loss.

            """
            self._param_eta = x[0]
            self._param_v = x[1:]
            dual_opt_input_values = self._dual_opt_input_values(samples_data)
            return self._f_dual(*dual_opt_input_values)

        def eval_dual_grad(x):
            """Evaluate gradient of dual function loss.

            Args:
                x (numpy.ndarray): Input to dual function.

            Returns:
                numpy.ndarray: Gradient of dual function loss.

            """
            self._param_eta = x[0]
            self._param_v = x[1:]
            dual_opt_input_values = self._dual_opt_input_values(samples_data)
            grad = self._f_dual_grad(*dual_opt_input_values)
            eta_grad = float(grad[0])
            v_grad = grad[1]
            return np.hstack([eta_grad, v_grad])

        params_ast, _, _ = self._dual_optimizer(func=eval_dual,
                                                x0=x0,
                                                fprime=eval_dual_grad,
                                                bounds=bounds,
                                                **self._dual_optimizer_args)

        logger.log('Computing dual after')
        self._param_eta, self._param_v = params_ast[0], params_ast[1:]
        dual_opt_input_values = self._dual_opt_input_values(samples_data)
        dual_after = self._f_dual(*dual_opt_input_values)

        # Optimize policy
        policy_opt_input_values = self._policy_opt_input_values(samples_data)
        logger.log('Computing policy loss before')
        loss_before = self._optimizer.loss(policy_opt_input_values)
        logger.log('Computing policy KL before')
        policy_kl_before = self._f_policy_kl(*policy_opt_input_values)
        logger.log('Optimizing policy')
        self._optimizer.optimize(policy_opt_input_values)
        logger.log('Computing policy KL')
        policy_kl = self._f_policy_kl(*policy_opt_input_values)
        logger.log('Computing policy loss after')
        loss_after = self._optimizer.loss(policy_opt_input_values)
        tabular.record('EtaBefore', eta_before)
        tabular.record('EtaAfter', self._param_eta)
        tabular.record('DualBefore', dual_before)
        tabular.record('DualAfter', dual_after)
        tabular.record('{}/LossBefore'.format(self.policy.name), loss_before)
        tabular.record('{}/LossAfter'.format(self.policy.name), loss_after)
        tabular.record('{}/dLoss'.format(self.policy.name),
                       loss_before - loss_after)
        tabular.record('{}/KLBefore'.format(self.policy.name),
                       policy_kl_before)
        tabular.record('{}/KL'.format(self.policy.name), policy_kl)
        self._old_policy.model.parameters = self.policy.model.parameters
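`self._dual_optimizer` above is called with `func`, `x0`, `fprime`, and `bounds` and unpacked into three return values, which matches SciPy's `fmin_l_bfgs_b`; identifying it as the dual optimizer is an assumption on my part, not something the snippet states. A minimal sketch against a stand-in dual:

import numpy as np
from scipy.optimize import fmin_l_bfgs_b


def eval_dual(x):
    # Stand-in dual function: minimized at x = [1, 1, 1].
    return float(np.sum((x - 1.0) ** 2))


def eval_dual_grad(x):
    return 2.0 * (x - 1.0)


x0 = np.array([0.5, 0.0, 0.0])
# First coordinate (eta) bounded below by 1e-12, as in optimize_policy above.
bounds = [(1e-12, np.inf)] + [(-np.inf, np.inf)] * (len(x0) - 1)
params_ast, dual_value, info = fmin_l_bfgs_b(func=eval_dual,
                                             x0=x0,
                                             fprime=eval_dual_grad,
                                             bounds=bounds)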
Example #25
    def train_once(self, itr, paths):
        """Train the algorithm once.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            numpy.float64: Calculated mean value of undiscounted returns.

        """
        obs, actions, rewards, returns, valids, baselines = \
            self.process_samples(paths)

        if self._maximum_entropy:
            policy_entropies = self._compute_policy_entropy(obs)
            rewards += self._policy_ent_coeff * policy_entropies

        obs_flat = torch.cat(filter_valids(obs, valids))
        actions_flat = torch.cat(filter_valids(actions, valids))
        rewards_flat = torch.cat(filter_valids(rewards, valids))
        returns_flat = torch.cat(filter_valids(returns, valids))
        advs_flat = self._compute_advantage(rewards, valids, baselines)

        with torch.no_grad():
            policy_loss_before = self._compute_loss_with_adv(
                obs_flat, actions_flat, rewards_flat, advs_flat)
            vf_loss_before = self._value_function.compute_loss(
                obs_flat, returns_flat)
            kl_before = self._compute_kl_constraint(obs)

        self._train(obs_flat, actions_flat, rewards_flat, returns_flat,
                    advs_flat)

        with torch.no_grad():
            policy_loss_after = self._compute_loss_with_adv(
                obs_flat, actions_flat, rewards_flat, advs_flat)
            vf_loss_after = self._value_function.compute_loss(
                obs_flat, returns_flat)
            kl_after = self._compute_kl_constraint(obs)
            policy_entropy = self._compute_policy_entropy(obs)

        with tabular.prefix(self.policy.name):
            tabular.record('/LossBefore', policy_loss_before.item())
            tabular.record('/LossAfter', policy_loss_after.item())
            tabular.record('/dLoss',
                           (policy_loss_before - policy_loss_after).item())
            tabular.record('/KLBefore', kl_before.item())
            tabular.record('/KL', kl_after.item())
            tabular.record('/Entropy', policy_entropy.mean().item())

        with tabular.prefix(self._value_function.name):
            tabular.record('/LossBefore', vf_loss_before.item())
            tabular.record('/LossAfter', vf_loss_after.item())
            tabular.record('/dLoss',
                           vf_loss_before.item() - vf_loss_after.item())

        self._old_policy.load_state_dict(self.policy.state_dict())

        undiscounted_returns = log_performance(itr,
                                               EpisodeBatch.from_list(
                                                   self._env_spec, paths),
                                               discount=self.discount)
        return np.mean(undiscounted_returns)
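`filter_valids` above strips the padding from each episode before the per-timestep tensors are concatenated. A hypothetical stand-in with the behavior the call sites imply (the real garage helper may differ in details):

import torch


def filter_valids(tensor, valids):
    # tensor: (n_episodes, max_len, ...); valids[i]: true length of episode i.
    # Returns a list of unpadded per-episode tensors, ready for torch.cat.
    return [tensor[i][:int(v)] for i, v in enumerate(valids)]

With this, `torch.cat(filter_valids(obs, valids))` flattens the batch into one (total_valid_steps, ...) tensor, which is the shape the loss computations above consume.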
Example #26
    def _train_once(self, itr, episodes):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            episodes (EpisodeBatch): Batch of episodes.

        """
        self._replay_buffer.add_episode_batch(episodes)

        epoch = itr / self._steps_per_epoch

        for _ in range(self._n_train_steps):
            if (self._replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                qf_loss, y_s, qval, policy_loss = self._optimize_policy(itr)

                self._episode_policy_losses.append(policy_loss)
                self._episode_qf_losses.append(qf_loss)
                self._epoch_ys.append(y_s)
                self._epoch_qs.append(qval)

        if itr % self._steps_per_epoch == 0:
            logger.log('Training finished')

            if (self._replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                tabular.record('Epoch', epoch)
                tabular.record('Policy/AveragePolicyLoss',
                               np.mean(self._episode_policy_losses))
                tabular.record('QFunction/AverageQFunctionLoss',
                               np.mean(self._episode_qf_losses))
                tabular.record('QFunction/AverageQ', np.mean(self._epoch_qs))
                tabular.record('QFunction/MaxQ', np.max(self._epoch_qs))
                tabular.record('QFunction/AverageAbsQ',
                               np.mean(np.abs(self._epoch_qs)))
                tabular.record('QFunction/AverageY', np.mean(self._epoch_ys))
                tabular.record('QFunction/MaxY', np.max(self._epoch_ys))
                tabular.record('QFunction/AverageAbsY',
                               np.mean(np.abs(self._epoch_ys)))
Example #27
    def train_once(self, itr, paths):
        """Perform one iteration of training.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            float: Average return.

        """
        paths = self.process_samples(itr, paths)

        epoch = itr / self.steps_per_epoch

        self._episode_rewards.extend([
            path for path, complete in zip(paths['undiscounted_returns'],
                                           paths['complete']) if complete
        ])
        self._success_history.extend([
            path for path, complete in zip(paths['success_history'],
                                           paths['complete']) if complete
        ])

        last_average_return = np.mean(self._episode_rewards)
        for _ in range(self.n_train_steps):
            if (self.replay_buffer.n_transitions_stored >=
                    self.min_buffer_size):
                self._evaluate = True
                samples = self.replay_buffer.sample(self.buffer_batch_size)
                qf_loss, y, q, policy_loss = tu.torch_to_np(
                    self.optimize_policy(itr, samples))

                self._episode_policy_losses.append(policy_loss)
                self._episode_qf_losses.append(qf_loss)
                self._epoch_ys.append(y)
                self._epoch_qs.append(q)

        if itr % self.steps_per_epoch == 0:
            logger.log('Training finished')

            if self._evaluate:
                tabular.record('Epoch', epoch)
                tabular.record('AverageReturn', np.mean(self._episode_rewards))
                tabular.record('StdReturn', np.std(self._episode_rewards))
                tabular.record('Policy/AveragePolicyLoss',
                               np.mean(self._episode_policy_losses))
                tabular.record('QFunction/AverageQFunctionLoss',
                               np.mean(self._episode_qf_losses))
                tabular.record('QFunction/AverageQ', np.mean(self._epoch_qs))
                tabular.record('QFunction/MaxQ', np.max(self._epoch_qs))
                tabular.record('QFunction/AverageAbsQ',
                               np.mean(np.abs(self._epoch_qs)))
                tabular.record('QFunction/AverageY', np.mean(self._epoch_ys))
                tabular.record('QFunction/MaxY', np.max(self._epoch_ys))
                tabular.record('QFunction/AverageAbsY',
                               np.mean(np.abs(self._epoch_ys)))
                tabular.record('AverageSuccessRate',
                               np.mean(self._success_history))

            if not self.smooth_return:
                self._episode_rewards = []
                self._episode_policy_losses = []
                self._episode_qf_losses = []
                self._epoch_ys = []
                self._epoch_qs = []

            self._success_history.clear()

        return last_average_return
Example #28
def log_multitask_performance(itr, batch, discount, name_map=None):
    r"""Log performance of trajectories from multiple tasks.

    Args:
        itr (int): Iteration number to be logged.
        batch (garage.TrajectoryBatch): Batch of trajectories. The trajectories
            should have either the "task_name" or "task_id" `env_infos`. If the
            "task_name" is not present, then `name_map` is required, and should
            map from task id's to task names.
        discount (float): Discount used in computing returns.
        name_map (dict[int, str] or None): Mapping from task id's to task
            names. Optional if the "task_name" environment info is present.
            Note that if provided, all tasks listed in this map will be logged,
            even if there are no trajectories present for them.

    Returns:
        numpy.ndarray: Undiscounted returns averaged across all tasks. Has
            shape :math:`(N \bullet [T])`.

    """
    traj_by_name = defaultdict(list)
    for trajectory in batch.split():
        task_name = '__unnamed_task__'
        if 'task_name' in trajectory.env_infos:
            task_name = trajectory.env_infos['task_name'][0]
        elif 'task_id' in trajectory.env_infos:
            name_map = {} if name_map is None else name_map
            task_id = trajectory.env_infos['task_id'][0]
            task_name = name_map.get(task_id, 'Task #{}'.format(task_id))
        traj_by_name[task_name].append(trajectory)
    if name_map is None:
        task_names = traj_by_name.keys()
    else:
        task_names = name_map.values()
    for task_name in task_names:
        if task_name in traj_by_name:
            trajectories = traj_by_name[task_name]
            log_performance(itr,
                            garage.TrajectoryBatch.concatenate(*trajectories),
                            discount,
                            prefix=task_name)
        else:
            with tabular.prefix(task_name + '/'):
                tabular.record('Iteration', itr)
                tabular.record('NumTrajs', 0)
                tabular.record('AverageDiscountedReturn', np.nan)
                tabular.record('AverageReturn', np.nan)
                tabular.record('StdReturn', np.nan)
                tabular.record('MaxReturn', np.nan)
                tabular.record('MinReturn', np.nan)
                tabular.record('TerminationRate', np.nan)
                tabular.record('SuccessRate', np.nan)

    return log_performance(itr, batch, discount=discount, prefix='Average')
Example #29
def log_performance(itr, batch, discount, prefix='Evaluation'):
    """Evaluate the performance of an algorithm on a batch of trajectories.

    Args:
        itr (int): Iteration number.
        batch (TrajectoryBatch): The trajectories to evaluate with.
        discount (float): Discount value, from algorithm's property.
        prefix (str): Prefix to add to all logged keys.

    Returns:
        numpy.ndarray: Undiscounted returns.

    """
    returns = []
    undiscounted_returns = []
    termination = []
    success = []
    for trajectory in batch.split():
        returns.append(discount_cumsum(trajectory.rewards, discount))
        undiscounted_returns.append(sum(trajectory.rewards))
        termination.append(
            float(
                any(step_type == StepType.TERMINAL
                    for step_type in trajectory.step_types)))
        if 'success' in trajectory.env_infos:
            success.append(float(trajectory.env_infos['success'].any()))

    average_discounted_return = np.mean([rtn[0] for rtn in returns])

    with tabular.prefix(prefix + '/'):
        tabular.record('Iteration', itr)
        tabular.record('NumTrajs', len(returns))

        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('TerminationRate', np.mean(termination))
        if success:
            tabular.record('SuccessRate', np.mean(success))

    return undiscounted_returns
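`discount_cumsum` above turns a reward sequence into per-timestep discounted returns, so `returns[0]` is the discounted return of the whole trajectory. A minimal reference sketch of that computation; the library version is typically vectorized (e.g. via scipy.signal.lfilter), which this loop does not attempt.

import numpy as np


def discount_cumsum(rewards, discount):
    # out[t] = rewards[t] + discount * rewards[t+1] + discount**2 * rewards[t+2] + ...
    out = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        out[t] = running
    return out


# e.g. discount_cumsum([1.0, 1.0, 1.0], 0.9) -> [2.71, 1.9, 1.0]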
Example #30
    def process_samples(self, itr, paths):
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            dict: Processed sample data, with key
                * average_return: (float)

        """
        baselines = []
        returns = []

        max_path_length = self.max_path_length

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = special.discount_cumsum(path['rewards'],
                                                      self.discount)
            returns.append(path['returns'])

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = (np.mean(
            [path['returns'][0] for path in paths]))

        undiscounted_returns = [sum(path['rewards']) for path in paths]
        self.episode_reward_mean.extend(undiscounted_returns)

        ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)

        samples_data = dict(average_return=np.mean(undiscounted_returns))

        tabular.record('Iteration', itr)
        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self.episode_reward_mean))
        tabular.record('NumTrajs', len(paths))
        tabular.record('Entropy', ent)
        tabular.record('Perplexity', np.exp(ent))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))

        return samples_data