Ejemplo n.º 1
0
        def runner(x):
            """Score every candidate parameter vector contained in ``x``.

            For each row ``x[k]`` the policy parameters are overwritten via
            ``self.policy.set_params`` and the policy is rolled out (in
            non-eval mode). The ``'costs'`` entry of each episode is summed
            along axis 0 and the per-candidate mean / standard deviation across
            episodes are collected.

            Parameters
            ----------
            x:
                Indexable collection exposing ``shape[0]``; each ``x[k]`` is
                handed to ``self.policy.set_params``.

            Returns
            -------
            tuple of np.ndarray
                ``(means, stds)`` with one entry per candidate.
            """
            mean_rows = []
            std_rows = []
            n_candidates = x.shape[0]
            for idx in range(n_candidates):
                self.policy.set_params(x[idx])

                # Several rollouts are only requested when ``self.stochastic``
                # is truthy; otherwise a single one is run.
                if self.stochastic:
                    n_rollouts = self.n_evals_if_stochastic
                else:
                    n_rollouts = 1

                rollouts = run_rollout(
                    policy=self,
                    env=self.env,
                    n=n_rollouts,
                    eval=False,
                    additional_keys=['costs', 'n_icu'],
                )

                # Only the first ``n_rollouts`` episodes are read.
                episode_totals = np.array([
                    np.sum(rollouts[k]['costs'], axis=0)
                    for k in range(n_rollouts)
                ])
                mean_rows.append(episode_totals.mean(axis=0))
                std_rows.append(episode_totals.std(axis=0))

            return np.array(mean_rows), np.array(std_rows)
Ejemplo n.º 2
0
    def evaluate(self, n=None, goal=None, best=None, reset_same_model=False):
        """
        Roll out the policy in evaluation mode and score the episodes.

        Parameters
        ----------
        n: int or None
            Number of evaluation episodes. When None it defaults to
            ``self.n_evals_if_stochastic`` if ``self.env.unwrapped.stochastic``
            is truthy and to 1 otherwise. For goal-conditioned agents it is
            replaced by the number of evaluation goals.
        goal:
            Optional goal; when given (and the agent is goal-conditioned) it
            is repeated ``n`` times, otherwise the goals come from
            ``self.cost_function.get_eval_goals(n)``.
        best:
            Accepted but not used by this method.
        reset_same_model: bool
            Forwarded unchanged to ``run_rollout``.

        Returns
        -------
        tuple
            The pair produced by ``self.compute_eval_score``.
        """
        if n is None:
            env_is_stochastic = self.env.unwrapped.stochastic
            n = self.n_evals_if_stochastic if env_is_stochastic else 1

        # Goals stay None unless the agent is goal-conditioned.
        eval_goals = None
        if self.goal_conditioned:
            if goal is None:
                eval_goals = self.cost_function.get_eval_goals(n)
            else:
                eval_goals = np.array([goal] * n)
            # One rollout per evaluation goal.
            n = eval_goals.shape[0]

        rollout_kwargs = dict(
            policy=self,
            env=self.env,
            n=n,
            goal=eval_goals,
            eval=True,
            reset_same_model=reset_same_model,
            additional_keys=('costs', 'constraints'),
        )
        rollouts = run_rollout(**rollout_kwargs)

        new_logs, costs = self.compute_eval_score(rollouts, eval_goals)
        return new_logs, costs
Ejemplo n.º 3
0
    def evaluate_pareto(self, load_model=True):
        """
        Evaluate the policy, collect cost statistics and pickle them.

        Goal-conditioned agents are rolled out once per sampled goal and the
        goals selected by ``compute_pareto_front`` are kept as the front.
        Other agents are rolled out once without a goal, which yields a single
        point. The resulting dictionary (keys ``F``, ``F_std``, ``F_all``,
        ``F_std_all``, ``G_all``) is pickled to ``self.logdir + 'res_eval.pk'``.

        Parameters
        ----------
        load_model: bool
            When truthy, ``self.load_model`` is first called on
            ``self.logdir + '/models/best_model.cp'``.
        """

        def rollout_count():
            # Several rollouts only when the unwrapped env is stochastic.
            if self.env.unwrapped.stochastic:
                return self.n_evals_if_stochastic
            return 1

        def episode_totals(rollouts):
            # One row per episode: its 'costs' entry summed along axis 0.
            return np.array(
                [np.array(ep['costs']).sum(axis=0) for ep in rollouts])

        if load_model:
            self.load_model(self.logdir + '/models/best_model.cp')

        conditioned = self.goal_conditioned
        print('----------------\nForming pareto front')
        results = {}
        mean_rows = []
        std_rows = []

        if conditioned:
            goals = sample_goals(self.pareto_size, self.cost_function.goal_dim)
            n_rollouts = rollout_count()

            for position, goal in enumerate(goals, start=1):
                # Progress report every 20 goals.
                if position % 20 == 0:
                    print('\t{:.2f} %'.format(position / goals.shape[0] * 100))

                # One copy of the goal per rollout, stacked along axis 0; the
                # transpose handles the case where atleast_2d put the copies
                # on axis 1 instead.
                stacked = np.atleast_2d(np.array([goal] * n_rollouts))
                if stacked.shape[0] != n_rollouts:
                    stacked = stacked.transpose()

                totals = episode_totals(
                    run_rollout(
                        policy=self,
                        env=self.env,
                        n=n_rollouts,
                        goal=stacked,
                        eval=True,
                        additional_keys=['costs'],
                    ))
                mean_rows.append(totals.mean(axis=0))
                std_rows.append(totals.std(axis=0))

            # Statistics for every sampled goal.
            results['F_all'] = np.array(mean_rows)
            results['F_std_all'] = np.array(std_rows)
            results['G_all'] = goals

            # Restrict to the entries selected as the front.
            front = compute_pareto_front(mean_rows)
            front_means = np.array(mean_rows)[front]
            front_stds = np.array(std_rows)[front]
            results['F'] = front_means
            results['F_std'] = front_stds
        else:
            n_rollouts = rollout_count()
            totals = episode_totals(
                run_rollout(
                    policy=self,
                    env=self.env,
                    n=n_rollouts,
                    eval=True,
                    additional_keys=['costs'],
                ))
            mean_rows.append(totals.mean(axis=0))
            std_rows.append(totals.std(axis=0))

            results['F'] = np.array(mean_rows)
            results['F_std'] = np.array(std_rows)
            # With a single point the "all" entries alias the front entries.
            results['F_all'] = results['F']
            results['F_std_all'] = results['F_std']
            results['G_all'] = np.array([[
                self.cost_function.beta_default for _ in mean_rows
            ]])

        # NOTE(review): unlike the model path above, no '/' separates logdir
        # from the file name here -- presumably logdir ends with a separator;
        # confirm before changing.
        with open(self.logdir + 'res_eval.pk', 'wb') as f:
            pickle.dump(results, f)
Ejemplo n.º 4
0
    def learn(self, num_train_steps):
        """
        Run the training loop until the environment-step budget is spent.

        Every iteration collects one training episode, stores it, and, once
        the replay buffer holds more than ``self.batch_size`` entries,
        performs half as many updates as environment steps were collected.
        Evaluation/logging and checkpointing happen every
        ``self.eval_and_log_every`` and ``self.save_policy_every`` episodes
        respectively. ``self.evaluate_pareto()`` is called after the loop.

        Parameters
        ----------
        num_train_steps: int
            Number of training steps (environment steps)
        """

        while self.env_step_counter < num_train_steps:
            # A cost-function parameterisation is sampled only for
            # goal-conditioned agents.
            goal = (self.env.unwrapped.sample_cost_function_params()
                    if self.goal_conditioned else None)

            rollouts = run_rollout(
                policy=self,
                env=self.env,
                n=1,
                goal=goal,
                eval=False,
                additional_keys=('costs', 'constraints'),
            )
            episode_lengths = self.store_episodes(rollouts)
            n_new_steps = np.sum(episode_lengths)
            self.env_step_counter += n_new_steps
            self.episode += 1

            # Track training costs of the (single) collected episode.
            collected = rollouts[0]
            self.aggregated_costs.append(np.sum(collected['aggregated_costs']))
            self.costs.append(np.sum(collected['costs'], axis=0))

            # Update
            if len(self.replay_buffer) <= self.batch_size:
                # Not enough data yet: placeholder losses.
                losses = [np.nan] * 2
            else:
                n_updates = int(n_new_steps * 0.5)
                per_update = np.array(
                    [self.update() for _ in range(n_updates)])
                losses = per_update.mean(axis=0)

            if self.episode % self.eval_and_log_every == 0:
                # Run evaluations
                n_eval = self.n_evals_if_stochastic if self.stochastic else 1
                new_logs, _ = self.evaluate(n=n_eval)
                # Compute train scores
                mean_agg_cost = np.mean(self.aggregated_costs)
                mean_costs = np.array(self.costs).mean(axis=0)
                self.log(self.episode, new_logs, losses, mean_agg_cost,
                         mean_costs)
                # Reset training score tracking
                self.aggregated_costs = []
                self.costs = []

            if self.episode % self.save_policy_every == 0:
                checkpoint = '/models/policy_{}.cp'.format(self.episode)
                self.save_model(self.logdir + checkpoint)

        self.evaluate_pareto()
        print('Run has terminated successfully')
Ejemplo n.º 5
0
    def evaluate(self,
                 n=None,
                 all=False,
                 best=False,
                 goal=None,
                 reset_same_model=False):
        """
        Evaluate solutions with ``n`` rollouts, in one of four modes.

        The modes are tested in this order and the first match is used:

        * ``all``: every parameter vector in ``self.res.X`` is rolled out and
          only the entries selected by ``compute_pareto_front`` are kept.
        * ``best``: the entry of ``self.res_eval`` with the smallest sum of
          scaled costs is loaded into the policy and rolled out.
        * ``goal`` is not None: the entry of ``self.res_eval`` whose scaled
          costs are the nearest neighbour of the scaled goal is rolled out.
        * otherwise: the policy is rolled out with its current parameters.

        Parameters
        ----------
        n:
            Forwarded to ``run_rollout`` as the number of rollouts.
        all: bool
            Select the first mode above.
        best: bool
            Select the second mode above.
        goal:
            Target costs, one per entry of ``self.cost_function.costs``.
        reset_same_model: bool
            Forwarded to ``run_rollout`` in the ``all`` mode only.

        Returns
        -------
        res: dict
            ``F``/``F_std``/``X`` in the ``all`` and ``goal`` modes,
            ``'C<i> mean'``/``'C<i> std'`` entries otherwise.
        costs: np.ndarray
            Mean costs of the kept entries in the ``all`` mode, per-episode
            summed costs otherwise.
        """

        def rollout_costs(**extra):
            # Per-episode totals of the 'costs' entry (summed along axis 0).
            rollouts = run_rollout(
                policy=self,
                env=self.env,
                n=n,
                eval=True,
                additional_keys=['costs'],
                **extra,
            )
            return np.array(
                [np.array(ep['costs']).sum(axis=0) for ep in rollouts])

        def scale_columns(cost_table):
            # Apply each cost function's ``scale`` to its own column.
            scaled_columns = [
                cost_fn.scale(column) for cost_fn, column in zip(
                    self.cost_function.costs, cost_table.transpose())
            ]
            return np.array(scaled_columns).transpose()

        def per_cost_stats(episode_costs):
            # Mean / std across episodes, keyed by cost index.
            n_costs = episode_costs.shape[1]
            means = episode_costs.mean(axis=0)
            stds = episode_costs.std(axis=0)
            stats = {}
            for i in range(n_costs):
                stats['C{} mean'.format(i)] = means[i]
                stats['C{} std'.format(i)] = stds[i]
            return stats

        if all:
            mean_rows = []
            std_rows = []
            for params in self.res.X:
                self.policy.set_params(params)
                episode_costs = rollout_costs(
                    reset_same_model=reset_same_model)
                mean_rows.append(episode_costs.mean(axis=0))
                std_rows.append(episode_costs.std(axis=0))

            # Keep only the selected entries and their parameters.
            front = compute_pareto_front(mean_rows)
            front_means = np.array(mean_rows)[front]
            front_stds = np.array(std_rows)[front]
            res = {}
            res['F'] = front_means
            res['F_std'] = front_stds
            res['X'] = self.res.X[front]
            return res, front_means

        if best:
            candidates = self.res_eval['X']
            scaled = scale_columns(self.res_eval['F'])
            # Smallest sum of scaled costs wins.
            winner = np.argmin(scaled.sum(axis=1))
            self.policy.set_params(candidates[winner])
            costs = rollout_costs()
            return per_cost_stats(costs), costs

        if goal is not None:
            nn_model = NearestNeighbors(n_neighbors=1)
            candidates = self.res_eval['X']
            nn_model.fit(scale_columns(self.res_eval['F']))

            # The goal is scaled like the costs so distances are comparable.
            scaled_goal = np.atleast_2d(
                np.array([
                    cost_fn.scale(target) for cost_fn, target in zip(
                        self.cost_function.costs, goal)
                ]))
            nearest = nn_model.kneighbors(scaled_goal,
                                          return_distance=False).flatten()
            self.policy.set_params(candidates[nearest].flatten())
            costs = rollout_costs()
            res = {}
            res['X'] = candidates[nearest]
            res['F'] = costs.mean(axis=0)
            res['F_std'] = costs.std(axis=0)
            return res, costs

        # Default: evaluate the policy as it currently is.
        costs = rollout_costs()
        return per_cost_stats(costs), costs