Example #1
    def get_action(self, step_number, curr_state_K, actions_taken_so_far,
                   starting_fullenvstate, evaluating,
                   take_exploratory_actions):
        """Select optimal action

        Args:
            curr_state_K:
                current "state" as known by the dynamics model
                actually a concatenation of (1) current obs, and (K-1) past obs
            step_number:
                which step number the rollout is currently on (used to calculate costs)
            actions_taken_so_far:
                used to restore state of the env to correct place,
                when using ground-truth dynamics
            starting_fullenvstate:
                full state of the env before this rollout, used for env resets (when using ground-truth dynamics)
            evaluating:
                if True, do not add noise to the executed action
            take_exploratory_actions:
                if True: select actions based on disagreement of the ensemble
                if False (default): select actions based on predicted costs

        Returns:
            best_action: optimal action to perform, according to this controller
            resulting_states_list: predicted results of executing the candidate action sequences
        """

        ############################
        ### sample N random candidate action sequences, each of length horizon
        ############################

        np.random.seed()  # to get different action samples for each rollout

        all_samples = []
        junk = 1  # dummy observation; assumed to be ignored by the random policy's get_action
        for i in range(self.N):
            sample = []
            for num in range(self.horizon):
                sample.append(
                    self.rand_policy.get_action(
                        junk,
                        prev_action=None,
                        random_sampling_params=self.random_sampling_params,
                        hold_action_overrideToOne=True)[0])
            all_samples.append(np.array(sample))
        all_samples = np.array(all_samples)

        ########################################################################
        ### make each action element be (past K actions) instead of just (curr action)
        ########################################################################

        #all_samples : [N, horizon, ac_dim]
        all_acs = turn_acs_into_acsK(actions_taken_so_far, all_samples, self.K,
                                     self.N, self.horizon)
        #all_acs : [N, horizon, K, ac_dim]

        ############################
        ### have model predict the result of executing those candidate action sequences
        ############################

        if self.use_ground_truth_dynamics:

            paths = trajectory_sampler.sample_paths_parallel(
                self.N,
                all_samples,
                actions_taken_so_far,
                starting_fullenvstate,
                self.env,
                suppress_print=True,
            )  #list of dicts, each w observations/actions/etc.

            #the actual number of returned paths is num_cpu*floor(self.N/num_cpu)
            #rather than self.N, so update self.N accordingly
            self.N = len(paths)
            all_samples = all_samples[:self.N]

            resulting_states = [entry['observations'] for entry in paths]
            resulting_states = np.swapaxes(resulting_states, 0, 1)
            resulting_states_list = [resulting_states]
        else:

            resulting_states_list = self.dyn_models.do_forward_sim(
                [curr_state_K, 0], np.copy(all_acs))
            resulting_states_list = np.swapaxes(
                resulting_states_list, 0,
                1)  #[ensSize, horizon+1, N, statesize]

        ############################
        ### evaluate the predicted trajectories
        ############################

        #calculate costs
        costs, mean_costs, std_costs = calculate_costs(
            resulting_states_list, all_samples, self.reward_func, evaluating,
            take_exploratory_actions)

        #pick best action sequence
        best_score = np.min(costs)
        best_sim_number = np.argmin(costs)
        best_sequence = all_samples[best_sim_number]
        best_action = np.copy(best_sequence[0])

        #########################################
        ### execute the candidate action sequences on the real dynamics
        ### instead of just on the model
        ### useful for debugging/analysis...
        #########################################

        if self.execute_sideRollouts:
            if (step_number % self.horizon) == 0:
                cmap = plt.get_cmap('jet_r')
                num_colors = 10  ##5
                indices_to_vis = [0, 1, 2]
                curr_plot = 1
                num_plots = len(indices_to_vis)
                for index_state_to_vis in indices_to_vis:
                    plt.subplot(num_plots, 1, curr_plot)
                    for sim_num in range(num_colors):
                        true_states = do_groundtruth_rollout(
                            all_samples[sim_num], self.env,
                            starting_fullenvstate, actions_taken_so_far)
                        color = cmap(float(sim_num) / num_colors)
                        if (self.iter_num == 0 and self.plot_sideRollouts):
                            if (step_number % 10 == 0):
                                plt.plot(resulting_states_list[-1]
                                         [:, sim_num, index_state_to_vis],
                                         '--',
                                         c=color,
                                         label=sim_num)
                                plt.plot(
                                    np.array(true_states)[:,
                                                          index_state_to_vis],
                                    '-',
                                    c=color)
                    curr_plot += 1

                plt.legend()
                plt.show()

        return best_action, resulting_states_list
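
Note: the helper turn_acs_into_acsK used above is not part of this listing. As a rough, hypothetical sketch (not the actual implementation), it can be thought of as stacking, for every step of every candidate sequence, the K actions ending at that step, filling the earliest slots from actions_taken_so_far when the sequence has just started:

import numpy as np

def turn_acs_into_acsK_sketch(actions_taken_so_far, all_samples, K, N, horizon):
    """Hypothetical re-implementation for illustration only.

    all_samples: [N, horizon, ac_dim] candidate action sequences
    returns:     [N, horizon, K, ac_dim], where entry [n, t, k] is the action at
                 time t - (K-1) + k, drawn from the candidate sequence when that
                 index is >= 0 and from already-executed actions otherwise.
    """
    ac_dim = all_samples.shape[-1]
    if actions_taken_so_far is not None and len(actions_taken_so_far) > 0:
        past = np.array(actions_taken_so_far).reshape(-1, ac_dim)
    else:
        past = np.zeros((0, ac_dim))
    all_acs = np.zeros((N, horizon, K, ac_dim))
    for n in range(N):
        for t in range(horizon):
            for k in range(K):
                idx = t - (K - 1) + k  # time index of the k-th stacked action
                if idx >= 0:
                    all_acs[n, t, k] = all_samples[n, idx]
                elif len(past) >= -idx:
                    all_acs[n, t, k] = past[idx]  # negative index into executed actions
                # else: leave as zeros (nothing available to fill with)
    return all_acs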
Example #2
    def get_action(self, step_number, curr_state_K, actions_taken_so_far,
                   starting_fullenvstate, evaluating, take_exploratory_actions):

        # init vars
        curr_state_K = np.array(curr_state_K)  #[K, sa_dim]

        ## Debugging
        self.observation_history.append(curr_state_K[0])
        if len(self.observation_history) > self.horizon + 1:
            self.observation_history.pop(0)

        # remove the 1st entry of mean (mean from past timestep, that was just executed)
        # and copy last entry (starting point, for the next timestep)
        past_action = self.mppi_mean[0].copy()
        self.mppi_mean[:-1] = self.mppi_mean[1:]

        ##############################################
        ## sample candidate action sequences
        ## by creating smooth filtered trajecs (noised around a mean)
        ##############################################

        np.random.seed()  # to get different action samples for each rollout

        #sample noise from normal dist, scaled by sigma
        if self.sample_velocity:
            eps_higherRange = np.random.normal(
                loc=0, scale=1.0, size=(self.N, self.horizon,
                                        self.ac_dim)) * self.sigma
            lowerRange = 0.3 * self.sigma
            num_lowerRange = int(0.1 * self.N)
            eps_lowerRange = np.random.normal(
                loc=0, scale=1.0, size=(num_lowerRange, self.horizon,
                                        self.ac_dim)) * lowerRange
            eps_higherRange[-num_lowerRange:] = eps_lowerRange
            eps = eps_higherRange.copy()
        else:
            eps = np.random.normal(
                loc=0, scale=1.0, size=(self.N, self.horizon,
                                        self.ac_dim)) * self.sigma

        # actions = mean + noise... then smooth the actions temporally
        all_samples = eps.copy()
        for i in range(self.horizon):
            if i == 0:
                all_samples[:, i, :] = (
                    self.beta * (self.mppi_mean[i, :] + eps[:, i, :])
                    + (1 - self.beta) * past_action)
            else:
                all_samples[:, i, :] = (
                    self.beta * (self.mppi_mean[i, :] + eps[:, i, :])
                    + (1 - self.beta) * all_samples[:, i - 1, :])

        # The resulting candidate action sequences:
        # all_samples : [N, horizon, ac_dim]
        all_samples = np.clip(all_samples, -1, 1)

        ########################################################################
        ### make each action element be (past K actions) instead of just (curr action)
        ########################################################################

        #all_samples : [N, horizon, ac_dim]
        #all_acs : [N, horizon, K, ac_dim]
        all_acs = turn_acs_into_acsK(actions_taken_so_far, all_samples, self.K,
                                     self.N, self.horizon)

        #################################################
        ### Get result of executing those candidate action sequences
        #################################################

        if self.use_ground_truth_dynamics:
            paths = trajectory_sampler.sample_paths_parallel(
                self.N,
                all_samples,
                actions_taken_so_far,
                starting_fullenvstate,
                self.env,
                suppress_print=True,
            )  #list of dicts, each w observations/actions/etc.

            #the actual number of returned paths is num_cpu*floor(self.N/num_cpu)
            #rather than self.N, so update self.N accordingly
            self.N = len(paths)
            all_samples = all_samples[:self.N]

            resulting_states = [entry['observations'] for entry in paths]
            resulting_states = np.swapaxes(resulting_states, 0, 1)
            resulting_states_list = [resulting_states]
        else:
            ##################################################
            ### Trajectory sampling edit, every self.traj_sampling_ratio is a new set of actions
            ##################################################
            all_acs = np.repeat(all_acs, self.traj_sampling_ratio, axis=0)

            resulting_states_list = self.dyn_models.do_forward_sim(
                [curr_state_K, 0], np.copy(all_acs))
            resulting_states_list = np.swapaxes(
                resulting_states_list, 0, 1)  #[ensSize, horizon+1, N, statesize]

        ############################
        ### evaluate the predicted trajectories
        ############################

        # calculate costs [N,]
        use_catastrophe_in_prediction = self.catastrophe_pred and self.finetuning
        costs, mean_costs, std_costs = calculate_costs(
            resulting_states_list, all_samples, self.reward_func, evaluating,
            take_exploratory_actions, self.traj_sampling_ratio,
            use_catastrophe_in_prediction, self.risk_aversion_type,
            self.caution_beta)

        # uses all paths to update action mean (for horizon steps)
        # Note: mppi_update needs rewards, so pass in -costs
        selected_action = self.mppi_update(-costs, -mean_costs, std_costs, all_samples)

        ## Debugging
        prediction_horizon = 7
        #self.prediction_history.append(np.array(resulting_states_list)[:, -(8 - prediction_horizon), :, -1])
        #if len(self.prediction_history) > self.horizon + 1:
        #    self.prediction_history.pop(0)
        self.action_history.append(np.copy(self.mppi_mean))
        if len(self.action_history) > self.horizon + 1:
            self.action_history.pop(0)

        # debugging catastrophe prediction
        if len(self.observation_history) == self.horizon + 1:
            horizon_steps_ago_state = np.expand_dims(np.array(self.observation_history[-(prediction_horizon + 1)]), 0)
            selected_acs = np.expand_dims(np.expand_dims(self.action_history[-(prediction_horizon + 1)], 0), 2)
            resulting_final_actions_states_list = self.dyn_models.do_forward_sim(
                [horizon_steps_ago_state, 0], np.copy(selected_acs))
            resulting_final_actions_states_list = np.array(resulting_final_actions_states_list)[prediction_horizon, :, :, -1]
            cat_pred = expit(resulting_final_actions_states_list)  # expit: logistic sigmoid (scipy.special)
            #cat_pred = np.where(cat_pred > 0.5, np.ones(cat_pred.shape), np.zeros(cat_pred.shape))
            self.catastrophe_labels.append(self.observation_history[-1][-1])
            #self.catastrophe_scores.append(self.prediction_history[self.horizon - prediction_horizon])
            self.catastrophe_scores.append(cat_pred)
        #for i in range(len(self.observation_history)):
        #    correct = self.observation_history[i][-1] == cat_pred
        #    self.total_prediction_correct[i].append(correct.mean())
        #    if self.observation_history[i][-1] == 1:
        #        positive_correct = np.mean(cat_pred[i] == 1)
        #        self.positive_prediction_correct[0].append(positive_correct)
        #    else:
        #        negative_correct = np.mean(cat_pred[i] == 0)
        #        self.negative_prediction_correct[i].append(negative_correct)


        #########################################
        ### execute the candidate action sequences on the real dynamics
        ### instead of just on the model
        ### useful for debugging/analysis...
        #########################################
        if self.execute_sideRollouts:
            if (step_number % self.horizon) == 0:
                cmap = plt.get_cmap('jet_r')
                num_sims = 5
                indices_to_vis = [0, 1, 2, 3, 4, 6, -3, -2]
                curr_plot = 1
                num_plots = len(indices_to_vis)
                for index_state_to_vis in indices_to_vis:
                    plt.subplot(num_plots, 1, curr_plot)
                    plt.ylabel(index_state_to_vis)
                    for sim_num in range(num_sims):
                        true_states = do_groundtruth_rollout(
                            all_samples[sim_num], self.env,
                            starting_fullenvstate, actions_taken_so_far)
                        color = cmap(float(sim_num) / num_sims)

                        ###if(step_number%10==0):
                        plt.plot(
                            resulting_states_list[-1]
                            [:, sim_num, index_state_to_vis],
                            '--',
                            c=color,
                            label=sim_num)
                        plt.plot(
                            np.array(true_states)[:, index_state_to_vis],
                            '-',
                            c=color)
                    curr_plot += 1

                if self.plot_sideRollouts:
                    plt.legend()
                    plt.show()

        return selected_action, resulting_states_list
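
Note: mppi_update is not shown in this listing, and its real signature differs (it receives -costs, -mean_costs, std_costs and all_samples). As a hedged sketch of the standard MPPI-style reweighting it presumably performs, with a softmax temperature kappa as an assumed hyperparameter:

import numpy as np

def mppi_update_sketch(scores, all_samples, mppi_mean, kappa):
    """Illustrative MPPI mean update (not the actual method).

    scores:      [N]                  rewards (i.e. -costs) per candidate sequence
    all_samples: [N, horizon, ac_dim] candidate action sequences
    mppi_mean:   [horizon, ac_dim]    running action mean, updated in place
    kappa:       softmax temperature (assumed hyperparameter)
    """
    weights = np.exp(kappa * (scores - np.max(scores)))  # subtract max for numerical stability
    weights = weights / (np.sum(weights) + 1e-10)
    mppi_mean[:] = np.sum(weights[:, None, None] * all_samples, axis=0)
    return mppi_mean[0].copy()  # action to execute now = first entry of the new mean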
Example #3
    def get_action(self, step_number, curr_state_K, goal, actions_taken_so_far,
                   starting_fullenvstate, evaluating, take_exploratory_actions):

        # init vars
        curr_state_K = np.array(curr_state_K)  #[K, sa_dim]

        # remove the 1st entry of mean (mean from past timestep, that was just executed)
        # and copy last entry (starting point, for the next timestep)
        past_action = self.mppi_mean[0].copy()
        self.mppi_mean[:-1] = self.mppi_mean[1:]

        ##############################################
        ## sample candidate action sequences
        ## by creating smooth filtered trajecs (noised around a mean)
        ##############################################

        np.random.seed()  # to get different action samples for each rollout

        #sample noise from normal dist, scaled by sigma
        if self.sample_velocity:
            eps_higherRange = np.random.normal(
                loc=0, scale=1.0, size=(self.N, self.horizon,
                                        self.ac_dim)) * self.sigma
            lowerRange = 0.3 * self.sigma
            num_lowerRange = int(0.1 * self.N)
            eps_lowerRange = np.random.normal(
                loc=0, scale=1.0, size=(num_lowerRange, self.horizon,
                                        self.ac_dim)) * lowerRange
            eps_higherRange[-num_lowerRange:] = eps_lowerRange
            eps = eps_higherRange.copy()
        else:
            eps = np.random.normal(
                loc=0, scale=1.0, size=(self.N, self.horizon,
                                        self.ac_dim)) * self.sigma

        # actions = mean + noise... then smooth the actions temporally
        all_samples = eps.copy()
        for i in range(self.horizon):
            if i == 0:
                all_samples[:, i, :] = (
                    self.beta * (self.mppi_mean[i, :] + eps[:, i, :])
                    + (1 - self.beta) * past_action)
            else:
                all_samples[:, i, :] = (
                    self.beta * (self.mppi_mean[i, :] + eps[:, i, :])
                    + (1 - self.beta) * all_samples[:, i - 1, :])

        # The resulting candidate action sequences:
        # all_samples : [N, horizon, ac_dim]
        all_samples = np.clip(all_samples, -1, 1)

        ########################################################################
        ### make each action element be (past K actions) instead of just (curr action)
        ########################################################################

        #all_samples : [N, horizon, ac_dim]
        #all_acs : [N, horizon, K, ac_dim]
        all_acs = turn_acs_into_acsK(actions_taken_so_far, all_samples, self.K,
                                     self.N, self.horizon)

        #################################################
        ### Get result of executing those candidate action sequences
        #################################################

        if self.use_ground_truth_dynamics:
            paths = trajectory_sampler.sample_paths_parallel(
                self.N,
                all_samples,
                actions_taken_so_far,
                starting_fullenvstate,
                self.env,
                suppress_print=True,
            )  #list of dicts, each w observations/actions/etc.

            #the actual number of returned paths is num_cpu*floor(self.N/num_cpu)
            #rather than self.N, so update self.N accordingly
            self.N = len(paths)
            all_samples = all_samples[:self.N]

            resulting_states = [entry['observations'] for entry in paths]
            resulting_states = np.swapaxes(resulting_states, 0, 1)
            resulting_states_list = [resulting_states]
        else:
            resulting_states_list = self.dyn_models.do_forward_sim(
                [curr_state_K, 0], np.copy(all_acs))
            resulting_states_list = np.swapaxes(
                resulting_states_list, 0, 1)  #[ensSize, horizon+1, N, statesize]

        ############################
        ### evaluate the predicted trajectories
        ############################

        # calculate costs [N,]
        costs, mean_costs, std_costs = calculate_costs(
            resulting_states_list, goal, all_samples, self.reward_func,
            evaluating, take_exploratory_actions)

        # print('mean_costs',mean_costs)
        # print('std_costs',std_costs)

        # uses all paths to update action mean (for horizon steps)
        # Note: mppi_update needs rewards, so pass in -costs
        selected_action = self.mppi_update(-costs, -mean_costs, std_costs, all_samples)

        #########################################
        ### execute the candidate action sequences on the real dynamics
        ### instead of just on the model
        ### useful for debugging/analysis...
        #########################################
        # if self.execute_sideRollouts:
        #     if (step_number % self.horizon)==0:
        #         cmap = plt.get_cmap('jet_r')
        #         num_sims = 5
        #         indices_to_vis = [0, 1, 2, 3, 4, 6, -3, -2]
        #         curr_plot = 1
        #         num_plots = len(indices_to_vis)
        #         for index_state_to_vis in indices_to_vis:
        #             plt.subplot(num_plots, 1, curr_plot)
        #             plt.ylabel(index_state_to_vis)
        #             for sim_num in range(num_sims):
        #                 true_states = do_groundtruth_rollout(
        #                     all_samples[sim_num], self.env,
        #                     starting_fullenvstate, actions_taken_so_far)
        #                 color = cmap(float(sim_num) / num_sims)
        #
        #                 ###if(step_number%10==0):
        #                 plt.plot(
        #                     resulting_states_list[-1]
        #                     [:, sim_num, index_state_to_vis],
        #                     '--',
        #                     c=color,
        #                     label=sim_num)
        #                 plt.plot(
        #                     np.array(true_states)[:, index_state_to_vis],
        #                     '-',
        #                     c=color)
        #             curr_plot += 1
        #
        #         if self.plot_sideRollouts:
        #             plt.legend()
        #             plt.show()

        return selected_action, resulting_states_list
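
Note: calculate_costs is also external to this listing, and its signature varies across the examples (this variant threads a goal through; Example #2 additionally passes a trajectory-sampling ratio and catastrophe-related flags). A minimal sketch of the aggregation idea, under the assumption that the reward function scores a whole batch of predicted trajectories, might look like:

import numpy as np

def calculate_costs_sketch(resulting_states_list, all_samples, reward_func,
                           evaluating, take_exploratory_actions):
    """Illustrative cost aggregation over an ensemble of dynamics models.

    resulting_states_list: [ensSize, horizon+1, N, state_dim] predicted rollouts
    reward_func: assumed to map (predicted states, candidate actions) -> rewards [N]
    evaluating:  accepted for signature parity; unused in this sketch
    Returns per-candidate costs [N], plus their ensemble mean and std.
    """
    ens_costs = []
    for states in resulting_states_list:         # one predicted rollout per ensemble member
        rewards = reward_func(states, all_samples)
        ens_costs.append(-np.asarray(rewards))   # costs are negative rewards
    ens_costs = np.array(ens_costs)              # [ensSize, N]
    mean_costs = np.mean(ens_costs, axis=0)
    std_costs = np.std(ens_costs, axis=0)
    if take_exploratory_actions:
        # reward ensemble disagreement: high-std candidates look cheaper
        costs = mean_costs - std_costs
    else:
        costs = mean_costs
    return costs, mean_costs, std_costs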
Example #4
    def get_action(self, step_number, curr_state_K, actions_taken_so_far,
                   starting_fullenvstate, evaluating,
                   take_exploratory_actions):
        """Select optimal action

        Args:
            curr_state_K:
                current "state" as known by the dynamics model
                actually a concatenation of (1) current obs, and (K-1) past obs
            step_number:
                which step number the rollout is currently on (used to calculate costs)
            actions_taken_so_far:
                used to restore state of the env to correct place,
                when using ground-truth dynamics
            starting_fullenvstate:
                full state of the env before this rollout, used for env resets (when using ground-truth dynamics)
            evaluating:
                if True, do not add noise to the executed action
            take_exploratory_actions:
                if True: select actions based on disagreement of the ensemble
                if False (default): select actions based on predicted costs

        Returns:
            best_action: optimal action to perform, according to this controller
            resulting_states_list: predicted results of executing the candidate action sequences
        """

        #initial mean and var of the sampling normal dist
        mean = np.zeros((self.sol_dim, ))
        var = 5 * np.ones((self.sol_dim, ))
        X = stats.truncnorm(self.lb,
                            self.ub,
                            loc=np.zeros_like(mean),
                            scale=np.ones_like(mean))

        #stop if variance is very low, or if enough iters
        t = 0
        while ((t < self.max_iters) and (np.max(var) > self.epsilon)):

            #variables
            lb_dist = mean - self.lb
            ub_dist = self.ub - mean
            constrained_var = np.minimum(
                np.minimum(np.square(lb_dist / 2), np.square(ub_dist / 2)),
                var)

            #get samples
            all_samples_orig = X.rvs(size=[self.N, self.sol_dim]) * np.sqrt(
                constrained_var) + mean  # [N, ac*h]
            all_samples = all_samples_orig.reshape(
                self.N, self.horizon,
                -1)  #interpret each row as a sequence of actions
            all_samples = np.clip(all_samples, -1, 1)

            ########################################################################
            ### make each action element be (past K actions) instead of just (curr action)
            ########################################################################

            #all_samples is [N, horizon, ac_dim]
            all_acs = turn_acs_into_acsK(actions_taken_so_far, all_samples,
                                         self.K, self.N, self.horizon)
            #all_acs should now be [N, horizon, K, ac_dim]

            ############################
            ### have model predict the result of executing those candidate action sequences
            ############################

            if self.use_ground_truth_dynamics:
                paths = trajectory_sampler.sample_paths_parallel(
                    self.N,
                    all_samples,
                    actions_taken_so_far,
                    starting_fullenvstate,
                    self.env,
                    suppress_print=True,
                )  #list of dicts, each w observations/actions/etc.

                #the actual number of returned paths is num_cpu*floor(self.N/num_cpu)
                #rather than self.N, so update self.N accordingly
                self.N = len(paths)
                all_samples = all_samples[:self.N]

                resulting_states = [entry['observations'] for entry in paths]
                resulting_states = np.swapaxes(resulting_states, 0, 1)

                resulting_states_list = [resulting_states]
            else:
                resulting_states_list = self.dyn_models.do_forward_sim(
                    [curr_state_K, 0], np.copy(all_acs))
                resulting_states_list = np.swapaxes(
                    resulting_states_list, 0,
                    1)  #this is now [ensSize, horizon+1, N, statesize]

            ############################
            ### evaluate the predicted trajectories
            ############################

            #calculate costs : [N,]
            costs, mean_costs, std_costs = calculate_costs(
                resulting_states_list, all_samples, self.reward_func,
                evaluating, take_exploratory_actions)

            #pick elites, and refit mean/var
            #Note: these are costs, so pick the lowest few to be elites
            indices = np.argsort(costs)
            elites = all_samples_orig[indices][:self.num_elites]
            new_mean = np.mean(elites, axis=0)
            new_var = np.var(elites, axis=0)

            #interpolate between old mean and new one
            mean = self.alpha * mean + (1 - self.alpha) * new_mean
            var = self.alpha * var + (1 - self.alpha) * new_var

            #next iteration
            t += 1

        #return the best action
        best_score = np.min(costs)
        best_sequence = mean.reshape(
            self.horizon, -1)  #interpret the 'row' as a sequence of actions
        best_action = np.copy(best_sequence[0])  #(acDim,)

        #########################################
        ### execute the candidate action sequences on the real dynamics
        ### instead of just on the model
        ### useful for debugging/analysis...
        #########################################

        if self.execute_sideRollouts:
            if ((step_number % self.horizon) == 0):
                cmap = plt.get_cmap('jet_r')
                num_sims = 10  ##5
                indices_to_vis = [0, 1, 2]
                curr_plot = 1
                num_plots = len(indices_to_vis)
                for index_state_to_vis in indices_to_vis:
                    plt.subplot(num_plots, 1, curr_plot)
                    for sim_num in range(num_sims):
                        true_states = do_groundtruth_rollout(
                            all_samples[sim_num], self.env,
                            starting_fullenvstate, actions_taken_so_far)
                        color = cmap(float(sim_num) / num_sims)

                        plt.plot(resulting_states_list[-1][:, sim_num,
                                                           index_state_to_vis],
                                 '--',
                                 c=color,
                                 label=sim_num)
                        plt.plot(np.array(true_states)[:, index_state_to_vis],
                                 '-',
                                 c=color)
                    curr_plot += 1

                if self.plot_sideRollouts:
                    plt.legend()
                    plt.show()

        return best_action, resulting_states_list
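
For reference, a small self-contained illustration of the truncated-normal sampling pattern used in this CEM-style example: scipy's truncnorm draws unit-normal samples truncated to [lb, ub], which are then rescaled by sqrt(var) and shifted by the mean, with the variance clamped so that roughly two standard deviations fit inside the distance from the mean to the bounds. All values below are placeholders, not the example's actual settings:

import numpy as np
from scipy import stats

# Placeholder dimensions/bounds for illustration only.
sol_dim, N = 6, 500          # sol_dim = horizon * ac_dim, N = number of candidates
lb, ub = -1.0, 1.0           # truncation bounds (placeholder values)

mean = np.zeros(sol_dim)
var = 5 * np.ones(sol_dim)

X = stats.truncnorm(lb, ub, loc=np.zeros_like(mean), scale=np.ones_like(mean))

# keep the effective std small enough that mean +/- 2*std stays near the bounds
lb_dist, ub_dist = mean - lb, ub - mean
constrained_var = np.minimum(
    np.minimum(np.square(lb_dist / 2), np.square(ub_dist / 2)), var)

samples = X.rvs(size=[N, sol_dim]) * np.sqrt(constrained_var) + mean  # [N, sol_dim]
samples = np.clip(samples, -1, 1)
print(samples.shape)  # (500, 6)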