Code Example #1
File: maze_env.py Project: reslthrowaway/garage
 def log_diagnostics(self, paths, *args, **kwargs):
     # Log anything maze-related here, then strip the maze observations and
     # call log_diagnostics with the stripped paths; we only want to log the
     # pure gather reward.
     with logger.tabular_prefix('Maze_'):
         gather_undiscounted_returns = [
             sum(path['env_infos']['outer_rew']) for path in paths
         ]
         logger.record_tabular_misc_stat('Return',
                                         gather_undiscounted_returns,
                                         placement='front')
     stripped_paths = []
     for path in paths:
         stripped_path = {}
         for k, v in path.items():
             stripped_path[k] = v
         stripped_path['observations'] = stripped_path[
             'observations'][:, :flat_dim(self.env.observation_space)]
         #  This breaks if the robot observations are more than 1-dimensional
         #  (i.e. not a flat vector).
         stripped_paths.append(stripped_path)
     with logger.tabular_prefix('wrapped_'):
         wrapped_undiscounted_return = np.mean(
             [np.sum(path['env_infos']['inner_rew']) for path in paths])
         logger.record_tabular('AverageReturn', wrapped_undiscounted_return)
         self.env.log_diagnostics(stripped_paths, *args, **kwargs)
Code Example #2
    def train(self, sess=None):
        created_session = sess is None
        if sess is None:
            sess = tf.Session()
            sess.__enter__()

        sess.run(tf.global_variables_initializer())

        self.start_worker(sess)
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                params = self.optimize_policy(itr)
                if self.plot:
                    self.plotter.update_plot(self.policy, self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
                logger.log("Saving snapshot...")
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('IterTime', time.time() - itr_start_time)
                logger.record_tabular('Time', time.time() - start_time)
                logger.dump_tabular()
        self.shutdown_worker()
        if created_session:
            sess.close()
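
Note: the training loops in these examples all follow the same logging pattern: wrap each iteration in `logger.prefix(...)`, record scalars with `logger.record_tabular(...)`, and flush them once per iteration with `logger.dump_tabular(...)`. The sketch below distills that pattern only; it assumes the same garage/rllab-style `logger` module used above (the import path is an assumption), and `algo.train_once` is a hypothetical hook standing in for the sampling and optimization work.

import time

from garage.misc import logger  # assumed import path for the logger used above

def run(algo, n_itr):
    """Minimal sketch of the per-iteration logging pattern shared by the examples."""
    start_time = time.time()
    for itr in range(n_itr):
        itr_start_time = time.time()
        with logger.prefix('itr #%d | ' % itr):
            algo.train_once(itr)  # hypothetical hook: sample + optimize one iteration
            logger.record_tabular('Time', time.time() - start_time)
            logger.record_tabular('ItrTime', time.time() - itr_start_time)
            logger.dump_tabular(with_prefix=False)  # flush this iteration's row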
Code Example #3
 def log_env_info(self, env_infos, prefix=""):
     # Logging rewards
     rew_dic = env_infos["rewards"]
     for key in rew_dic.keys():
         rew_sums = np.sum(rew_dic[key], axis=1)
         logger.record_tabular("rewards/" + key + "_avg", np.mean(rew_sums))
         logger.record_tabular("rewards/" + key + "_std", np.std(rew_sums))
Code Example #4
    def log_diagnostics(self, paths):
        self.policy.log_diagnostics(paths)
        self.baseline.log_diagnostics(paths)

        path_lengths = [path["returns"].size for path in paths]
        logger.record_tabular('ep_len_avg', np.mean(path_lengths))
        logger.record_tabular('ep_len_std', np.std(path_lengths))
Code Example #5
File: gather_env.py Project: gntoni/garage
 def log_diagnostics(self, paths, log_prefix='Gather', *args, **kwargs):
     # Log anything gather-related here, then strip the maze observations and
     # call log_diagnostics with the stripped paths; we only want to log the
     # pure gather reward.
     with logger.tabular_prefix(log_prefix + '_'):
         gather_undiscounted_returns = [
             sum(path['env_infos']['outer_rew']) for path in paths
         ]
         logger.record_tabular_misc_stat('Return',
                                         gather_undiscounted_returns,
                                         placement='front')
     stripped_paths = []
     for path in paths:
         stripped_path = {}
         for k, v in path.items():
             stripped_path[k] = v
         stripped_path['observations'] = \
             stripped_path['observations'][
                 :, :flat_dim(self.wrapped_env.observation_space)]
         #  This breaks if the robot observations are more than 1-dimensional
         #  (i.e. not a flat vector).
         stripped_paths.append(stripped_path)
     with logger.tabular_prefix('wrapped_'):
         if 'env_infos' in paths[0] and 'inner_rew' in paths[0]['env_infos']:
             wrapped_undiscounted_return = np.mean(
                 [np.sum(path['env_infos']['inner_rew']) for path in paths])
             logger.record_tabular('AverageReturn',
                                   wrapped_undiscounted_return)
         self.wrapped_env.log_diagnostics(
             stripped_paths
     )  # see swimmer_env.py for a sketch of the maze plotting!
Code Example #6
File: npo.py Project: gntoni/garage
 def optimize_policy(self, itr, samples_data):
     all_input_values = tuple(
         ext.extract(samples_data, "observations", "actions", "advantages"))
     agent_infos = samples_data["agent_infos"]
     state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
     dist_info_list = [
         agent_infos[k] for k in self.policy.distribution.dist_info_keys
     ]
     all_input_values += tuple(state_info_list) + tuple(dist_info_list)
     if self.policy.recurrent:
         all_input_values += (samples_data["valids"], )
     logger.log("Computing loss before")
     loss_before = self.optimizer.loss(all_input_values)
     logger.log("Computing KL before")
     mean_kl_before = self.optimizer.constraint_val(all_input_values)
     logger.log("Optimizing")
     self.optimizer.optimize(all_input_values)
     logger.log("Computing KL after")
     mean_kl = self.optimizer.constraint_val(all_input_values)
     logger.log("Computing loss after")
     loss_after = self.optimizer.loss(all_input_values)
     logger.record_tabular('LossBefore', loss_before)
     logger.record_tabular('LossAfter', loss_after)
     logger.record_tabular('MeanKLBefore', mean_kl_before)
     logger.record_tabular('MeanKL', mean_kl)
     logger.record_tabular('dLoss', loss_before - loss_after)
     return dict()
Code Example #7
 def train_once(self, itr, paths):
     itr_start_time = time.time()
     with logger.prefix('itr #%d | ' % itr):
         self.log_diagnostics(paths)
         logger.log("Optimizing policy...")
         self.optimize_policy(itr, paths)
         logger.record_tabular('IterTime', time.time() - itr_start_time)
         logger.dump_tabular()
Code Example #8
File: rl_algorithm.py Project: sra4077/softqlearning
    def _train(self, env, policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            pool (`PoolBase`): Sample pool to add samples to
        """
        self._init_training()
        self.sampler.initialize(env, policy, pool)

        evaluation_env = deep_clone(env) if self._eval_n_episodes else None

        with tf_utils.get_default_session().as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(
                    range(self._n_epochs + 1), save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    self.sampler.sample()
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(
                            iteration=t + epoch * self._epoch_length,
                            batch=self.sampler.random_batch())
                    gt.stamp('train')

                self._evaluate(policy, evaluation_env)
                gt.stamp('eval')

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)

                time_itrs = gt.get_times().stamps.itrs
                time_eval = time_itrs['eval'][-1]
                time_total = gt.get_times().total
                time_train = time_itrs.get('train', [0])[-1]
                time_sample = time_itrs.get('sample', [0])[-1]

                logger.record_tabular('time-train', time_train)
                logger.record_tabular('time-eval', time_eval)
                logger.record_tabular('time-sample', time_sample)
                logger.record_tabular('time-total', time_total)
                logger.record_tabular('epoch', epoch)

                self.sampler.log_diagnostics()

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()
Code Example #9
    def _train(self, env, policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            pool (`PoolBase`): Sample pool to add samples to
        """
        self._init_training()
        self.sampler.initialize(env, policy, pool)

        evaluation_env = deep_clone(env) if self._eval_n_episodes else None

        with tf_utils.get_default_session().as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    self.sampler.sample()
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(
                            iteration=t + epoch * self._epoch_length,
                            batch=self.sampler.random_batch())
                    gt.stamp('train')

                self._evaluate(policy, evaluation_env)
                gt.stamp('eval')

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)

                time_itrs = gt.get_times().stamps.itrs
                time_eval = time_itrs['eval'][-1]
                time_total = gt.get_times().total
                time_train = time_itrs.get('train', [0])[-1]
                time_sample = time_itrs.get('sample', [0])[-1]

                logger.record_tabular('time-train', time_train)
                logger.record_tabular('time-eval', time_eval)
                logger.record_tabular('time-sample', time_sample)
                logger.record_tabular('time-total', time_total)
                logger.record_tabular('epoch', epoch)

                self.sampler.log_diagnostics()

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()
Code Example #10
    def log_diagnostics(self, paths):
        n_goal = len(self.goal_positions)
        goal_reached = [False] * n_goal

        for path in paths:
            last_obs = path["observations"][-1]
            for i, goal in enumerate(self.goal_positions):
                if np.linalg.norm(last_obs - goal) < self.goal_threshold:
                    goal_reached[i] = True

        logger.record_tabular('env:goals_reached', goal_reached.count(True))
Code Example #11
File: multigoal.py Project: sra4077/softqlearning
    def log_diagnostics(self, paths):
        n_goal = len(self.goal_positions)
        goal_reached = [False] * n_goal

        for path in paths:
            last_obs = path["observations"][-1]
            for i, goal in enumerate(self.goal_positions):
                if np.linalg.norm(last_obs - goal) < self.goal_threshold:
                    goal_reached[i] = True

        logger.record_tabular('env:goals_reached', goal_reached.count(True))
Code Example #12
    def train(self, sess=None):
        address = ("localhost", 6000)
        conn = Client(address)
        last_average_return = None
        try:
            created_session = sess is None
            if sess is None:
                sess = tf.Session()
                sess.__enter__()

            sess.run(tf.global_variables_initializer())
            conn.send(ExpLifecycle.START)
            self.start_worker(sess)
            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                with logger.prefix('itr #%d | ' % itr):
                    logger.log("Obtaining samples...")
                    conn.send(ExpLifecycle.OBTAIN_SAMPLES)
                    paths = self.obtain_samples(itr)
                    logger.log("Processing samples...")
                    conn.send(ExpLifecycle.PROCESS_SAMPLES)
                    samples_data = self.process_samples(itr, paths)
                    last_average_return = samples_data["average_return"]
                    logger.log("Logging diagnostics...")
                    self.log_diagnostics(paths)
                    logger.log("Optimizing policy...")
                    conn.send(ExpLifecycle.OPTIMIZE_POLICY)
                    self.optimize_policy(itr, samples_data)
                    logger.log("Saving snapshot...")
                    params = self.get_itr_snapshot(itr, samples_data)
                    if self.store_paths:
                        params["paths"] = samples_data["paths"]
                    logger.save_itr_params(itr, params)
                    logger.log("Saved")
                    logger.record_tabular('Time', time.time() - start_time)
                    logger.record_tabular('ItrTime',
                                          time.time() - itr_start_time)
                    logger.dump_tabular(with_prefix=False)
                    if self.plot:
                        conn.send(ExpLifecycle.UPDATE_PLOT)
                        self.plotter.update_plot(self.policy,
                                                 self.max_path_length)
                        if self.pause_for_plot:
                            input("Plotting evaluation run: Press Enter to "
                                  "continue...")

            conn.send(ExpLifecycle.SHUTDOWN)
            self.shutdown_worker()
            if created_session:
                sess.close()
        finally:
            conn.close()
        return last_average_return
Code Example #13
    def train_inference_network(self, inference_opt_input_values):
        """ Optimize inference network """

        logger.log("Optimizing inference network...")
        infer_loss_before = self.inference_optimizer.loss(
            inference_opt_input_values)
        logger.record_tabular('Inference/Loss', infer_loss_before)
        self.inference_optimizer.optimize(inference_opt_input_values)
        infer_loss_after = self.inference_optimizer.loss(
            inference_opt_input_values)
        logger.record_tabular('Inference/dLoss',
                              infer_loss_before - infer_loss_after)

        return infer_loss_after
Code Example #14
File: catrpo.py Project: Mee321/HAPG_exp
    def outer_optimize(self, samples_data):
        logger.log("optimizing policy")
        observations = ext.extract(samples_data, "observations")
        actions = ext.extract(samples_data, "actions")
        advantages = ext.extract(samples_data, "advantages")

        num_traj = len(samples_data["paths"])

        observations = observations[0].reshape(
            -1, self.env.spec.observation_space.shape[0])
        actions = actions[0].reshape(-1, self.env.spec.action_space.shape[0])
        advantages = advantages[0].reshape(-1)
        inputs = tuple([observations, actions, advantages])

        s_g = self._opt_fun["f_train"](*(list(inputs)))
        #s_g = [x / num_traj for x in s_g]
        self.gradient_backup = copy.deepcopy(s_g)
        g_flat = self.flatten_parameters(s_g)

        loss_before = self._opt_fun["f_loss"](*(list(inputs)))
        self.backup_policy.set_param_values(
            self.policy.get_param_values(trainable=True), trainable=True)
        self.optimizer.optimize(inputs, g_flat)
        loss_after = self._opt_fun["f_loss"](*(list(inputs)))
        logger.record_tabular("LossBefore", loss_before)
        logger.record_tabular("LossAfter", loss_after)

        mean_kl, max_kl = self._opt_fun['f_kl'](*(list(inputs)))
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('MaxKL', max_kl)
Code Example #15
File: svrg_optimizer.py Project: Mee321/HAPG_exp
    def optimize(self, inputs, extra_inputs=None):

        if not inputs:
            # Assumes that we should always sample mini-batches
            raise NotImplementedError

        f_loss = self._opt_fun["f_loss"]
        f_grad = self._opt_fun["f_grad"]
        f_grad_tilde = self._opt_fun["f_grad_tilde"]

        inputs = tuple(inputs)
        if extra_inputs is None:
            extra_inputs = tuple()
        else:
            extra_inputs = tuple(extra_inputs)

        param = np.copy(self._target.get_param_values(trainable=True))
        logger.log("Start SVRPG optimization: #parameters: %d, #inputs %d" %
                   (len(param), len(inputs[0])))
        dataset = BatchDataset(inputs,
                               self._batch_size,
                               extra_inputs=extra_inputs)
        start_time = time.time()

        for epoch in range(self._max_epochs):
            if self._verbose:
                logger.log("Epoch %d" % (epoch))
                progbar = pyprind.ProgBar(len(inputs[0]))
            grad_sum = np.zeros_like(param)
            g_mean_tilde = f_grad_tilde(inputs, extra_inputs)
            logger.record_tabular('g_mean_tilde', LA.norm(g_mean_tilde))
            print("-------------mini-batch-------------------")
            num_batch = 0
            while num_batch < self._max_batch:
                batch = dataset.random_batch()
                g = f_grad(*(batch)) - f_grad_tilde(*(batch)) + g_mean_tilde
                grad_sum += g
                prev_w = np.copy(self._target.get_param_values(trainable=True))
                step = self._alpha * g
                cur_w = prev_w + step
                self._target.set_param_values(cur_w, trainable=True)
                num_batch += 1
            print("max batch achieved {:}".format(num_batch))
            grad_sum /= 1.0 * num_batch
            logger.record_tabular('gdist', LA.norm(grad_sum - g_mean_tilde))
            cur_w = np.copy(self._target.get_param_values(trainable=True))
            w_tilde = self._target_tilde.get_param_values(trainable=True)
            self._target_tilde.set_param_values(cur_w, trainable=True)
            logger.record_tabular('wnorm', LA.norm(cur_w))
            logger.record_tabular('w_dist',
                                  LA.norm(cur_w - w_tilde) / LA.norm(cur_w))

            if self._verbose:
                if progbar.active:
                    progbar.stop()
            if abs(LA.norm(cur_w - w_tilde) /
                   LA.norm(cur_w)) < self._tolerance:
                break
Code Example #16
 def log_diagnostics(self, paths):
     progs = [
         path["observations"][-1][-3] - path["observations"][0][-3]
         for path in paths
     ]
     logger.record_tabular('AverageForwardProgress', np.mean(progs))
     logger.record_tabular('MaxForwardProgress', np.max(progs))
     logger.record_tabular('MinForwardProgress', np.min(progs))
     logger.record_tabular('StdForwardProgress', np.std(progs))
Code Example #17
    def fit(self, xs, ys):

        if self._subsample_factor < 1:
            num_samples_tot = xs.shape[0]
            idx = np.random.randint(
                0, num_samples_tot,
                int(num_samples_tot * self._subsample_factor))
            xs, ys = xs[idx], ys[idx]

        if self._normalize_inputs:
            # recompute normalizing constants for inputs
            self._x_mean_var.set_value(
                np.mean(xs, axis=0,
                        keepdims=True).astype(theano.config.floatX))
            self._x_std_var.set_value(
                (np.std(xs, axis=0, keepdims=True) + 1e-8).astype(
                    theano.config.floatX))
        if self._normalize_outputs:
            # recompute normalizing constants for outputs
            self._y_mean_var.set_value(
                np.mean(ys, axis=0,
                        keepdims=True).astype(theano.config.floatX))
            self._y_std_var.set_value(
                (np.std(ys, axis=0, keepdims=True) + 1e-8).astype(
                    theano.config.floatX))
        if self._name:
            prefix = self._name + "_"
        else:
            prefix = ""
        # FIXME: needs batch computation to avoid OOM.
        loss_before, loss_after, mean_kl, batch_count = 0., 0., 0., 0
        for batch in iterate_minibatches_generic(
                input_lst=[xs, ys], batchsize=self._batchsize, shuffle=True):
            batch_count += 1
            xs, ys = batch
            if self._use_trust_region:
                old_means, old_log_stds = self._f_pdists(xs)
                inputs = [xs, ys, old_means, old_log_stds]
            else:
                inputs = [xs, ys]
            loss_before += self._optimizer.loss(inputs)

            self._optimizer.optimize(inputs)
            loss_after += self._optimizer.loss(inputs)
            if self._use_trust_region:
                mean_kl += self._optimizer.constraint_val(inputs)

        logger.record_tabular(prefix + 'LossBefore', loss_before / batch_count)
        logger.record_tabular(prefix + 'LossAfter', loss_after / batch_count)
        logger.record_tabular(prefix + 'dLoss',
                              (loss_before - loss_after) / batch_count)
        if self._use_trust_region:
            logger.record_tabular(prefix + 'MeanKL', mean_kl / batch_count)
Code Example #18
File: ddopg.py Project: stjordanis/DD_OPG
    def _training_step(self, itr):
        itr_start_time = time.time()

        with logger.prefix('itr #%d | ' % itr):
            self._sampling()

            self._bookkeeping()

            self._memory_selection(itr)
            self._policy_optimization(itr)

            if itr % self.evaluation_interval == 0:
                self._policy_evaluation()

            self._log_diagnostics(itr)

            logger.record_tabular('Time', time.time() - self.start_time)
            logger.record_tabular('ItrTime', time.time() - itr_start_time)
            logger.dump_tabular(with_prefix=False)
Code Example #19
File: batch_polopt.py Project: Mee321/HAPG_exp
    def train(self, sess=None):
        created_session = sess is None
        if sess is None:
            sess = tf.Session()
            sess.__enter__()

        sess.run(tf.global_variables_initializer())
        self.start_worker(sess)
        start_time = time.time()
        last_average_return = None
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                paths = self.obtain_samples(itr)
                logger.log("Processing samples...")
                samples_data = self.process_samples(itr, paths)
                last_average_return = samples_data["average_return"]
                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                logger.log("Optimizing policy...")
                self.optimize_policy(itr, samples_data)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr, samples_data)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    self.plotter.update_plot(self.policy, self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")

        self.shutdown_worker()
        if created_session:
            sess.close()
        return last_average_return
Code Example #20
    def _fit_baseline(self, samples_data):
        """ Update baselines from samples. """

        policy_opt_input_values = self._policy_opt_input_values(samples_data)

        # Augment reward from baselines
        rewards_tensor = self.f_rewards(*policy_opt_input_values)
        returns_tensor = self.f_returns(*policy_opt_input_values)
        returns_tensor = np.squeeze(returns_tensor)

        paths = samples_data["paths"]
        valids = samples_data["valids"]
        baselines = [path["baselines"] for path in paths]

        # Recompute parts of samples_data
        aug_rewards = []
        aug_returns = []
        for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                       paths):
            path["rewards"] = rew[val.astype(np.bool)]
            path["returns"] = ret[val.astype(np.bool)]
            aug_rewards.append(path["rewards"])
            aug_returns.append(path["returns"])
        aug_rewards = tensor_utils.concat_tensor_list(aug_rewards)
        aug_returns = tensor_utils.concat_tensor_list(aug_returns)
        samples_data["rewards"] = aug_rewards
        samples_data["returns"] = aug_returns

        # Calculate explained variance
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           aug_returns)
        logger.record_tabular(
            "{}/ExplainedVariance".format(self.baseline.name), ev)

        # Fit baseline
        logger.log("Fitting baseline...")
        if hasattr(self.baseline, "fit_with_samples"):
            self.baseline.fit_with_samples(paths, samples_data)
        else:
            self.baseline.fit(paths)
Code Example #21
File: catrpo.py Project: Mee321/HAPG_exp
 def train(self):
     with tf.Session() as sess:
         sess.run(tf.global_variables_initializer())
         self.start_worker(sess)
         start_time = time.time()
         self.num_samples = 0
         for itr in range(self.start_itr, self.n_itr):
             itr_start_time = time.time()
             with logger.prefix('itr #%d | ' % itr):
                 logger.log("Obtaining new samples...")
                 paths = self.obtain_samples(itr)
                 for path in paths:
                     self.num_samples += len(path["rewards"])
                 logger.log("total num samples..." + str(self.num_samples))
                 logger.log("Processing samples...")
                 samples_data = self.process_samples(itr, paths)
                 logger.log("Logging diagnostics...")
                 self.log_diagnostics(paths)
                 logger.log("Optimizing policy...")
                 self.outer_optimize(samples_data)
                 for sub_itr in range(self.n_sub_itr):
                     logger.log("Minibatch Optimizing...")
                     self.inner_optimize(samples_data)
                 logger.log("Saving snapshot...")
                 params = self.get_itr_snapshot(itr,
                                                samples_data)  # , **kwargs)
                 if self.store_paths:
                     params["paths"] = samples_data["paths"]
                 logger.save_itr_params(itr, params)
                 logger.log("Saved")
                 logger.record_tabular('Time', time.time() - start_time)
                 logger.record_tabular('ItrTime',
                                       time.time() - itr_start_time)
                 logger.dump_tabular(with_prefix=False)
                 #if self.plot:
                 #   self.update_plot()
                 #  if self.pause_for_plot:
                 #     input("Plotting evaluation run: Press Enter to "
                 #          "continue...")
     self.shutdown_worker()
Code Example #22
 def fit(self, xs, ys):
     if self.normalize_inputs:
         # recompute normalizing constants for inputs
         new_mean = np.mean(xs, axis=0, keepdims=True)
         new_std = np.std(xs, axis=0, keepdims=True) + 1e-8
         tf.get_default_session().run(
             tf.group(
                 tf.assign(self.x_mean_var, new_mean),
                 tf.assign(self.x_std_var, new_std),
             ))
         # self._x_mean_var.set_value(np.mean(xs, axis=0, keepdims=True))
         # self._x_std_var.set_value(
         #     np.std(xs, axis=0, keepdims=True) + 1e-8)
     if self.use_trust_region and self.first_optimized:
         old_p = self.f_p(xs)
         inputs = [xs, ys, old_p]
         optimizer = self.tr_optimizer
     else:
         inputs = [xs, ys]
         optimizer = self.optimizer
     loss_before = optimizer.loss(inputs)
     if self.name:
         prefix = self.name + "_"
     else:
         prefix = ""
     logger.record_tabular(prefix + 'LossBefore', loss_before)
     optimizer.optimize(inputs)
     loss_after = optimizer.loss(inputs)
     logger.record_tabular(prefix + 'LossAfter', loss_after)
     logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
     self.first_optimized = True
Code Example #23
    def fit(self, xs, ys):
        """Optimize the regressor based on the inputs."""
        if self._subsample_factor < 1:
            num_samples_tot = xs.shape[0]
            idx = np.random.randint(
                0, num_samples_tot,
                int(num_samples_tot * self._subsample_factor))
            xs, ys = xs[idx], ys[idx]

        sess = tf.get_default_session()
        if self._normalize_inputs:
            # recompute normalizing constants for inputs
            sess.run([
                tf.assign(self._x_mean_var, np.mean(xs, axis=0,
                                                    keepdims=True)),
                tf.assign(self._x_std_var,
                          np.std(xs, axis=0, keepdims=True) + 1e-8),
            ])
        if self._normalize_outputs:
            # recompute normalizing constants for outputs
            sess.run([
                tf.assign(self._y_mean_var, np.mean(ys, axis=0,
                                                    keepdims=True)),
                tf.assign(self._y_std_var,
                          np.std(ys, axis=0, keepdims=True) + 1e-8),
            ])
        if self._use_trust_region:
            old_means, old_log_stds = self._f_pdists(xs)
            inputs = [xs, ys, old_means, old_log_stds]
        else:
            inputs = [xs, ys]
        loss_before = self._optimizer.loss(inputs)
        if self._name:
            prefix = self._name + "/"
        else:
            prefix = ""
        logger.record_tabular(prefix + 'LossBefore', loss_before)
        self._optimizer.optimize(inputs)
        loss_after = self._optimizer.loss(inputs)
        logger.record_tabular(prefix + 'LossAfter', loss_after)
        if self._use_trust_region:
            logger.record_tabular(prefix + 'MeanKL',
                                  self._optimizer.constraint_val(inputs))
        logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
Code Example #24
File: vpg.py Project: gntoni/garage
    def optimize_policy(self, itr, samples_data):
        logger.log("optimizing policy")
        inputs = ext.extract(samples_data, "observations", "actions",
                             "advantages")
        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        inputs += tuple(state_info_list)
        if self.policy.recurrent:
            inputs += (samples_data["valids"], )
        dist_info_list = [
            agent_infos[k] for k in self.policy.distribution.dist_info_keys
        ]
        loss_before = self.optimizer.loss(inputs)
        self.optimizer.optimize(inputs)
        loss_after = self.optimizer.loss(inputs)
        logger.record_tabular("LossBefore", loss_before)
        logger.record_tabular("LossAfter", loss_after)

        mean_kl, max_kl = self.opt_info['f_kl'](
            *(list(inputs) + dist_info_list))
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('MaxKL', max_kl)
Code Example #25
 def fit(self, xs, ys):
     if self._normalize_inputs:
         # recompute normalizing constants for inputs
         self._x_mean_var.set_value(np.mean(xs, axis=0, keepdims=True))
         self._x_std_var.set_value(np.std(xs, axis=0, keepdims=True) + 1e-8)
     if self._use_trust_region:
         old_prob = self._f_prob(xs)
         inputs = [xs, ys, old_prob]
     else:
         inputs = [xs, ys]
     loss_before = self._optimizer.loss(inputs)
     if self._name:
         prefix = self._name + "_"
     else:
         prefix = ""
     logger.record_tabular(prefix + 'LossBefore', loss_before)
     self._optimizer.optimize(inputs)
     loss_after = self._optimizer.loss(inputs)
     logger.record_tabular(prefix + 'LossAfter', loss_after)
     logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
Code Example #26
    def log_diagnostics(self, batch):
        """Record diagnostic information.

        Records the mean and standard deviation of the Q-function and the
        squared Bellman residual (mean squared Bellman error) for a sample
        batch.

        Also calls the `draw` method of the plotter, if a plotter is defined.
        """

        feeds = self._get_feed_dict(batch)
        qf, bellman_residual = self._sess.run(
            [self._q_values, self._bellman_residual], feeds)

        logger.record_tabular('qf-avg', np.mean(qf))
        logger.record_tabular('qf-std', np.std(qf))
        logger.record_tabular('mean-sq-bellman-error', bellman_residual)

        self.policy.log_diagnostics(batch)
        if self.plotter:
            self.plotter.draw()
Code Example #27
 def fit(self, xs, ys):
     if self.normalize_inputs:
         # recompute normalizing constants for inputs
         new_mean = np.mean(xs, axis=0, keepdims=True)
         new_std = np.std(xs, axis=0, keepdims=True) + 1e-8
         tf.get_default_session().run(
             tf.group(
                 tf.assign(self.x_mean_var, new_mean),
                 tf.assign(self.x_std_var, new_std),
             ))
     inputs = [xs, ys]
     loss_before = self.optimizer.loss(inputs)
     if self.name:
         prefix = self.name + "/"
     else:
         prefix = ""
     logger.record_tabular(prefix + 'LossBefore', loss_before)
     self.optimizer.optimize(inputs)
     loss_after = self.optimizer.loss(inputs)
     logger.record_tabular(prefix + 'LossAfter', loss_after)
     logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
Code Example #28
 def log_diagnostics(self, paths):
     log_stds = np.vstack(
         [path["agent_infos"]["log_std"] for path in paths])
     logger.record_tabular('AveragePolicyStd', np.mean(np.exp(log_stds)))
Code Example #29
File: ddpg.py Project: ScapeQin/garage
    def evaluate(self, epoch, pool):
        logger.log("Collecting samples for evaluation")
        paths = parallel_sampler.sample_paths(
            policy_params=self.policy.get_param_values(),
            max_samples=self.eval_samples,
            max_path_length=self.max_path_length,
        )

        average_discounted_return = np.mean([
            special.discount_return(path["rewards"], self.discount)
            for path in paths
        ])

        returns = [sum(path["rewards"]) for path in paths]

        all_qs = np.concatenate(self.q_averages)
        all_ys = np.concatenate(self.y_averages)

        average_q_loss = np.mean(self.qf_loss_averages)
        average_policy_surr = np.mean(self.policy_surr_averages)
        average_action = np.mean(
            np.square(np.concatenate([path["actions"] for path in paths])))

        policy_reg_param_norm = np.linalg.norm(
            self.policy.get_param_values(regularizable=True))
        qfun_reg_param_norm = np.linalg.norm(
            self.qf.get_param_values(regularizable=True))

        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('AverageReturn', np.mean(returns))
        logger.record_tabular('StdReturn', np.std(returns))
        logger.record_tabular('MaxReturn', np.max(returns))
        logger.record_tabular('MinReturn', np.min(returns))
        if self.es_path_returns:
            logger.record_tabular('AverageEsReturn',
                                  np.mean(self.es_path_returns))
            logger.record_tabular('StdEsReturn', np.std(self.es_path_returns))
            logger.record_tabular('MaxEsReturn', np.max(self.es_path_returns))
            logger.record_tabular('MinEsReturn', np.min(self.es_path_returns))
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageQLoss', average_q_loss)
        logger.record_tabular('AveragePolicySurr', average_policy_surr)
        logger.record_tabular('AverageQ', np.mean(all_qs))
        logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
        logger.record_tabular('AverageY', np.mean(all_ys))
        logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))
        logger.record_tabular('AverageAbsQYDiff',
                              np.mean(np.abs(all_qs - all_ys)))
        logger.record_tabular('AverageAction', average_action)

        logger.record_tabular('PolicyRegParamNorm', policy_reg_param_norm)
        logger.record_tabular('QFunRegParamNorm', qfun_reg_param_norm)

        self.policy.log_diagnostics(paths)

        self.qf_loss_averages = []
        self.policy_surr_averages = []

        self.q_averages = []
        self.y_averages = []
        self.es_path_returns = []
Code Example #30
File: npo_v1.py Project: maliesa96/fyra
    def optimize_policy(self, itr, samples_data):
        policy_opt_input_values = self._policy_opt_input_values(samples_data)

        # Train policy network
        logger.log("Computing loss before")
        loss_before = self.optimizer.loss(policy_opt_input_values)
        logger.log("Computing KL before")
        policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
        logger.log("Optimizing")
        self.optimizer.optimize(policy_opt_input_values)
        logger.log("Computing KL after")
        policy_kl = self.f_policy_kl(*policy_opt_input_values)
        logger.log("Computing loss after")
        loss_after = self.optimizer.loss(policy_opt_input_values)
        logger.record_tabular("{}/LossBefore".format(self.policy.name),
                              loss_before)
        logger.record_tabular("{}/LossAfter".format(self.policy.name),
                              loss_after)
        logger.record_tabular("{}/dLoss".format(self.policy.name),
                              loss_before - loss_after)
        logger.record_tabular("{}/KLBefore".format(self.policy.name),
                              policy_kl_before)
        logger.record_tabular("{}/KL".format(self.policy.name), policy_kl)

        pol_ent = self.f_policy_entropy(*policy_opt_input_values)
        logger.record_tabular("{}/Entropy".format(self.policy.name),
                              np.mean(pol_ent))

        num_traj = self.batch_size // self.max_path_length
        actions = samples_data["actions"][:num_traj, ...]
        logger.record_histogram("{}/Actions".format(self.policy.name), actions)

        self._fit_baseline(samples_data)
Code Example #31
File: pusher.py Project: sra4077/softqlearning
    def log_diagnostics(self, paths):
        arm_dists = [p['env_infos'][-1]['arm_distance'] for p in paths]
        goal_dists = [p['env_infos'][-1]['goal_distance'] for p in paths]

        logger.record_tabular('FinalArmDistanceAvg', np.mean(arm_dists))
        logger.record_tabular('FinalArmDistanceMax', np.max(arm_dists))
        logger.record_tabular('FinalArmDistanceMin', np.min(arm_dists))
        logger.record_tabular('FinalArmDistanceStd', np.std(arm_dists))

        logger.record_tabular('FinalGoalDistanceAvg', np.mean(goal_dists))
        logger.record_tabular('FinalGoalDistanceMax', np.max(goal_dists))
        logger.record_tabular('FinalGoalDistanceMin', np.min(goal_dists))
        logger.record_tabular('FinalGoalDistanceStd', np.std(goal_dists))
Code Example #32
File: sampler.py Project: sra4077/softqlearning
 def log_diagnostics(self):
     logger.record_tabular('pool-size', self.pool.size)
Code Example #33
    def evaluate(self, policy_opt_input_values, samples_data):
        # Everything else
        rewards_tensor = self.f_rewards(*policy_opt_input_values)
        returns_tensor = self.f_returns(*policy_opt_input_values)
        returns_tensor = np.squeeze(returns_tensor)  # TODO
        # TODO: check the squeeze/dimension handling for both convolutions

        paths = samples_data['paths']
        valids = samples_data['valids']
        baselines = [path['baselines'] for path in paths]
        env_rewards = [path['rewards'] for path in paths]
        env_rewards = tensor_utils.concat_tensor_list(env_rewards.copy())
        env_returns = [path['returns'] for path in paths]
        env_returns = tensor_utils.concat_tensor_list(env_returns.copy())
        env_average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        # Recompute parts of samples_data
        aug_rewards = []
        aug_returns = []
        for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                       paths):
            path['rewards'] = rew[val.astype(np.bool)]
            path['returns'] = ret[val.astype(np.bool)]
            aug_rewards.append(path['rewards'])
            aug_returns.append(path['returns'])
        aug_rewards = tensor_utils.concat_tensor_list(aug_rewards)
        aug_returns = tensor_utils.concat_tensor_list(aug_returns)
        samples_data['rewards'] = aug_rewards
        samples_data['returns'] = aug_returns

        # Calculate effect of the entropy terms
        d_rewards = np.mean(aug_rewards - env_rewards)
        logger.record_tabular('Policy/EntRewards', d_rewards)

        aug_average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        d_returns = np.mean(aug_average_discounted_return -
                            env_average_discounted_return)
        logger.record_tabular('Policy/EntReturns', d_returns)

        # Calculate explained variance
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           aug_returns)
        logger.record_tabular('Baseline/ExplainedVariance', ev)

        inference_rmse = (samples_data['trajectory_infos']['mean'] -
                          samples_data['latents'])**2.
        inference_rmse = np.sqrt(inference_rmse.mean())
        logger.record_tabular('Inference/RMSE', inference_rmse)

        inference_rrse = rrse(samples_data['latents'],
                              samples_data['trajectory_infos']['mean'])
        logger.record_tabular('Inference/RRSE', inference_rrse)

        embed_ent = self.f_embedding_entropy(*policy_opt_input_values)
        logger.record_tabular('Embedding/Entropy', embed_ent)

        infer_ce = self.f_inference_ce(*policy_opt_input_values)
        logger.record_tabular('Inference/CrossEntropy', infer_ce)

        pol_ent = self.f_policy_entropy(*policy_opt_input_values)
        logger.record_tabular('Policy/Entropy', pol_ent)

        #task_ents = self.f_task_entropies(*policy_opt_input_values)
        #tasks = samples_data["tasks"][:, 0, :]
        #_, task_indices = np.nonzero(tasks)
        #path_lengths = np.sum(samples_data["valids"], axis=1)
        #for t in range(self.policy.n_tasks):
        #lengths = path_lengths[task_indices == t]
        #completed = lengths < self.max_path_length
        #pct_completed = np.mean(completed)
        #num_samples = np.sum(lengths)
        #num_trajs = lengths.shape[0]
        #logger.record_tabular('Tasks/EpisodeLength/t={}'.format(t),
        #                     np.mean(lengths))
        #    logger.record_tabular('Tasks/CompletionRate/t={}'.format(t),
        #                       pct_completed)
        # logger.record_tabular('Tasks/NumSamples/t={}'.format(t),
        #                       num_samples)
        # logger.record_tabular('Tasks/NumTrajs/t={}'.format(t), num_trajs)
        # logger.record_tabular('Tasks/Entropy/t={}'.format(t), task_ents[t])

        return samples_data
Code Example #34
    def train_policy_and_embedding_networks(self, policy_opt_input_values):
        """ Joint optimization of policy and embedding networks """

        logger.log("Computing loss before")
        loss_before = self.optimizer.loss(policy_opt_input_values)

        logger.log("Computing KL before")
        policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
        embed_kl_before = self.f_embedding_kl(*policy_opt_input_values)

        logger.log("Optimizing")
        self.optimizer.optimize(policy_opt_input_values)

        logger.log("Computing KL after")
        policy_kl = self.f_policy_kl(*policy_opt_input_values)
        embed_kl = self.f_embedding_kl(*policy_opt_input_values)

        logger.log("Computing loss after")
        loss_after = self.optimizer.loss(policy_opt_input_values)

        logger.record_tabular('Policy/LossBefore', loss_before)
        logger.record_tabular('Policy/LossAfter', loss_after)
        logger.record_tabular('Policy/KLBefore', policy_kl_before)
        logger.record_tabular('Policy/KL', policy_kl)
        logger.record_tabular('Policy/dLoss', loss_before - loss_after)
        logger.record_tabular('Embedding/KLBefore', embed_kl_before)
        logger.record_tabular('Embedding/KL', embed_kl)

        return loss_after
Code Example #35
File: sampler.py Project: sra4077/softqlearning
 def log_diagnostics(self):
     super(SimpleSampler, self).log_diagnostics()
     logger.record_tabular('max-path-return', self._max_path_return)
     logger.record_tabular('last-path-return', self._last_path_return)
     logger.record_tabular('episodes', self._n_episodes)
     logger.record_tabular('total-samples', self._total_samples)
Code Example #36
File: rl_algorithm.py Project: sra4077/softqlearning
    def _evaluate(self, policy, evaluation_env):
        """Perform evaluation for the current policy."""

        if self._eval_n_episodes < 1:
            return

        # TODO: max_path_length should be a property of environment.
        paths = rollouts(evaluation_env, policy, self.sampler._max_path_length,
                         self._eval_n_episodes)

        total_returns = [path['rewards'].sum() for path in paths]
        episode_lengths = [len(p['rewards']) for p in paths]

        logger.record_tabular('return-average', np.mean(total_returns))
        logger.record_tabular('return-min', np.min(total_returns))
        logger.record_tabular('return-max', np.max(total_returns))
        logger.record_tabular('return-std', np.std(total_returns))
        logger.record_tabular('episode-length-avg', np.mean(episode_lengths))
        logger.record_tabular('episode-length-min', np.min(episode_lengths))
        logger.record_tabular('episode-length-max', np.max(episode_lengths))
        logger.record_tabular('episode-length-std', np.std(episode_lengths))

        evaluation_env.log_diagnostics(paths)
        if self._eval_render:
            evaluation_env.render(paths)

        if self.sampler.batch_ready():
            batch = self.sampler.random_batch()
            self.log_diagnostics(batch)
Code Example #37
 def log_diagnostics(self):
     logger.record_tabular('max-path-return', self._max_path_return)
     logger.record_tabular('last-path-return', self._last_path_return)
     logger.record_tabular('pool-size', self.pool.size)
     logger.record_tabular('episodes', self._n_episodes)
     logger.record_tabular('total-samples', self._total_samples)
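
Note: several examples namespace their metrics, either by concatenating a string prefix (e.g. "rewards/" + key in Code Example #3) or with the `logger.tabular_prefix` context manager (Code Examples #1 and #5). Below is a short sketch of the context-manager variant under the same assumptions as the earlier sketch (garage/rllab-style `logger`, assumed import path) and with the `env_infos["rewards"]` layout of Code Example #3, i.e. one array of per-step rewards per reward component; the key names are illustrative.

import numpy as np

from garage.misc import logger  # assumed import path, as above

def log_reward_components(env_infos):
    """Record mean/std of each reward component under a common tabular prefix."""
    with logger.tabular_prefix('Rewards/'):
        for key, rews in env_infos['rewards'].items():
            per_path_sums = np.sum(rews, axis=1)  # sum each path's rewards over time
            logger.record_tabular(key + '_avg', np.mean(per_path_sums))
            logger.record_tabular(key + '_std', np.std(per_path_sums))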