Python CleanupEnv.render Exemples, social_dilemmas.envs.cleanup.CleanupEnv.render Python Exemples

Exemple #1

0

Afficher le fichier

class Controller(object):
    """Drive a Harvest or Cleanup environment with one DQN policy per agent.

    The controller builds the environment, creates one ``ConvFC``-backed
    ``DQNAgent`` per agent, and offers helpers to roll out episodes
    (optionally training the policies) and to render a rollout to video.
    """

    def __init__(self, env_name='harvest', num_agents=1):
        """Create the environment and one policy per agent.

        Args:
            env_name: Either 'harvest' or 'cleanup'.
            num_agents: Number of agents (and policies) to create.

        Raises:
            ValueError: If ``env_name`` is not a supported environment. The
                previous behavior (print and return) left an instance
                without ``env`` that failed later with AttributeError.
        """
        self.env_name = env_name
        if env_name == 'harvest':
            print('Initializing Harvest environment')
            self.env = HarvestEnv(ascii_map=HARVEST_MAP_CPR,
                                  num_agents=num_agents,
                                  render=True)
        elif env_name == 'cleanup':
            print('Initializing Cleanup environment')
            self.env = CleanupEnv(num_agents=num_agents, render=True)
        else:
            raise ValueError(
                'Error! Not a valid environment type: {!r}'.format(env_name))

        self.num_agents = num_agents

        self.agent_policies = []
        self.agents = list(self.env.agents.values())
        self.action_dim = self.agents[0].action_space.n
        for _ in range(num_agents):
            # TODO right now only using 1 frame, update later to look back x
            # (e.g. 4) frames. Later RNN/LSTM
            neural_net = ConvFC(
                # harvest specific input is 15x15x3 (HARVEST_VIEW_SIZE = 7)
                conv_in_channels=3,
                conv_out_channels=3,
                input_size=15,
                hidden_size=64,
                output_size=self.action_dim)
            self.agent_policies.append(
                DQNAgent(0, self.action_dim - 1, neural_net))

        self.env.reset()

    def process_experiences(self,
                            id,
                            i,
                            obs,
                            action_dict,
                            rew,
                            next_obs,
                            dones,
                            train_agents=False):
        """Store one transition for agent ``i`` and optionally train it.

        Args:
            id: Unused; kept for call compatibility.
            i: Index of the agent; its key in the dicts is ``"agent-<i>"``.
            obs: Dict of per-agent observations before the step. Element
                ``[0]`` of each entry is what gets fed to the network.
            action_dict: Dict of per-agent actions taken.
            rew: Dict of per-agent rewards.
            next_obs: Dict of per-agent observations after the step.
            dones: Dict of per-agent done flags.
            train_agents: If True, run a Q-learning update after storing.
        """
        agent_i = "agent-{}".format(i)
        policy = self.agent_policies[i]
        # Only element [0] of each observation is stored; the original author
        # notes that the reward info is deliberately left out for now.
        policy.push_experience(
            reshape_obs_for_convfc(obs[agent_i][0]),
            action_dict[agent_i],
            rew[agent_i],
            reshape_obs_for_convfc(next_obs[agent_i][0]),
            dones[agent_i])

        if train_agents:
            policy.q_learn_update()

    def _hard_coded_action(self, time_step, obs):
        """Return the scripted single-agent action for ``time_step``.

        The script looks only at agent 0's observation. It spends the first
        steps orienting itself, then repeats a fixed 40-step movement cycle.
        The direction labels on the actions are the original author's
        (who also noted that left/right may be swapped in the environment).
        """
        action_cycle = 40
        prep_time = 4 + 2
        single_obs = obs["agent-{}".format(0)][0]

        if time_step < prep_time - 2:
            # Two specific neighbouring cells must both sum to 540 before
            # acting with 4; otherwise keep turning (action 6).
            # NOTE(review): 540 presumably identifies a wall colour - confirm.
            if (single_obs[6][7].sum() == 540
                    and single_obs[7][8].sum() == 540):
                return 4
            return 6
        if time_step == prep_time - 2:
            return 1  # first left movement, start the cycle
        if time_step == prep_time - 1:
            return 2  # up again

        # Assumes up orientation
        phase = (time_step - prep_time) % action_cycle
        if phase < 16:
            return 1  # left
        if phase < 20:
            return 2
        if phase < 36:
            return 0  # right
        return 3  # down

    def rollout(self,
                horizon,
                train_every=100,
                save_path=None,
                train_agents=True,
                print_act=False,
                hard_coded=False):
        """ Rollout several timesteps of an episode of the environment.

        Args:
            horizon: The number of timesteps to roll out.
            train_every: When training, a Q-learning update is run on every
                ``train_every``-th step; experiences are stored every step.
            save_path: If provided, will save each frame to disk at this
                location.
            train_agents: If True, store experiences and train the policies.
            print_act: Forwarded to each policy's ``act`` call.
            hard_coded: If True, ignore the learned policies and drive a
                single agent with the scripted policy; nothing is trained.

        Returns:
            Tuple ``(rewards, observations, full_obs)``: the per-agent summed
            rewards (numpy array), the list of observation dicts seen after
            each step, and the list of uint8 RGB frames of the whole map.
        """
        rewards = np.zeros(self.num_agents)
        observations = []
        shape = self.env.world_map.shape
        full_obs = [
            np.zeros((shape[0], shape[1], 3), dtype=np.uint8)
            for _ in range(horizon)
        ]
        agent_ids = ["agent-{}".format(i) for i in range(self.num_agents)]

        obs = self.env.reset()

        for time_step in range(horizon):
            if hard_coded:
                # Single agent only: one scripted action.
                actions = [self._hard_coded_action(time_step, obs)]
            else:
                # Acting is the same whether or not we are training; the
                # exploration setting is left to the policy itself.
                actions = [
                    self.agent_policies[i].act(
                        reshape_obs_for_convfc(obs[agent_id][0]),
                        print_act=print_act)
                    for i, agent_id in enumerate(agent_ids)
                ]

            action_dict = {
                agent_id: actions[i]
                for i, agent_id in enumerate(agent_ids)
            }

            next_obs, rew, dones, info = self.env.step(action_dict)

            if train_agents and not hard_coded:
                # Store every transition, but only update the networks on
                # every train_every-th step.
                update_now = (time_step + 1) % train_every == 0
                for i in range(self.num_agents):
                    self.process_experiences(0,
                                             i,
                                             obs,
                                             action_dict,
                                             rew,
                                             next_obs,
                                             dones,
                                             train_agents=update_now)

            obs = next_obs

            sys.stdout.flush()

            if save_path is not None:
                self.env.render(filename=save_path + 'frame' +
                                str(time_step).zfill(6) + '.png')

            rgb_arr = self.env.map_to_colors()
            full_obs[time_step] = rgb_arr.astype(np.uint8)

            observations.append(obs)
            for i, agent_id in enumerate(agent_ids):
                rewards[i] += rew[agent_id]

        return rewards, observations, full_obs

    def render_rollout(self, horizon=50, path=None, fps=8):
        """ Render a rollout into a video.

        The rollout is run without training and the full-map RGB frames are
        turned into a video named ``<env_name>_trajectory``.

        Args:
            horizon: The number of timesteps to roll out.
            path: Directory where the video will be saved. Defaults to a
                ``videos`` directory next to this file, created if missing.
            fps: Integer frames per second.

        Returns:
            The per-agent summed rewards of the rollout.
        """
        if path is None:
            path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                'videos')
            print(path)
            os.makedirs(path, exist_ok=True)
        video_name = self.env_name + '_trajectory'

        rewards, observations, full_obs = self.rollout(horizon=horizon,
                                                       train_agents=False,
                                                       print_act=False)
        utility_funcs.make_video_from_rgb_imgs(full_obs,
                                               path,
                                               fps=fps,
                                               video_name=video_name)
        return rewards

Exemple #2

0

Afficher le fichier

Fichier : rollout.py Projet : Fabien-Couthouis/XAI-in-RL

class Controller(object):
    """Run random-action rollouts of a Harvest or Cleanup environment.

    Builds the requested environment and offers helpers to step it with
    uniformly random actions and to render the result to a video.
    """

    def __init__(self, env_name='cleanup', num_agents=5):
        """Create and reset the environment.

        Args:
            env_name: Either 'harvest' or 'cleanup'.
            num_agents: Number of agents to create (previously fixed at 5).

        Raises:
            ValueError: If ``env_name`` is not a supported environment. The
                previous behavior (print and return) left an instance
                without ``env`` that failed later with AttributeError.
        """
        self.env_name = env_name
        if env_name == 'harvest':
            print('Initializing Harvest environment')
            self.env = HarvestEnv(num_agents=num_agents, render=True)
        elif env_name == 'cleanup':
            print('Initializing Cleanup environment')
            self.env = CleanupEnv(num_agents=num_agents, render=True)
        else:
            raise ValueError(
                'Error! Not a valid environment type: {!r}'.format(env_name))

        self.env.reset()

        # TODO: initialize agents here

    def rollout(self, horizon=50, save_path=None):
        """ Rollout several timesteps of an episode of the environment.

        Every agent takes a uniformly random action at each step. The
        environment is not reset here; the rollout continues from its
        current state.

        Args:
            horizon: The number of timesteps to roll out.
            save_path: If provided, will save each frame to disk at this
                location.

        Returns:
            Tuple ``(rewards, observations, full_obs)``: agent-0's reward
            and observation at every step, plus the list of uint8 RGB
            frames of the whole map.
        """
        rewards = []
        observations = []
        shape = self.env.world_map.shape
        full_obs = [
            np.zeros((shape[0], shape[1], 3), dtype=np.uint8)
            for _ in range(horizon)
        ]

        # One action per agent actually present in the environment rather
        # than a hard-coded five. Assumes the agent set stays fixed during
        # the rollout and that ids follow the 'agent-<index>' scheme.
        agents = list(self.env.agents.values())
        agent_ids = ['agent-%d' % idx for idx in range(len(agents))]
        action_dim = agents[0].action_space.n

        for step_idx in range(horizon):
            rand_action = np.random.randint(action_dim, size=len(agent_ids))
            obs, rew, dones, info = self.env.step(
                dict(zip(agent_ids, rand_action)))

            sys.stdout.flush()

            if save_path is not None:
                self.env.render(filename=save_path + 'frame' +
                                str(step_idx).zfill(6) + '.png')

            rgb_arr = self.env.map_to_colors()
            full_obs[step_idx] = rgb_arr.astype(np.uint8)
            observations.append(obs['agent-0'])
            rewards.append(rew['agent-0'])

        return rewards, observations, full_obs

    def render_rollout(self,
                       horizon=50,
                       path=None,
                       render_type='pretty',
                       fps=8):
        """ Render a rollout into a video.

        Args:
            horizon: The number of timesteps to roll out.
            path: Directory where the video will be saved. Defaults to a
                ``videos`` directory next to this file, created if missing.
            render_type: 'pretty' renders each frame through the
                environment's ``render`` into a temporary ``frames``
                directory and builds the video from those images; any other
                value (e.g. 'fast') builds the video directly from the raw
                RGB arrays.
            fps: Integer frames per second.
        """
        if path is None:
            path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                'videos')
            print(path)
            os.makedirs(path, exist_ok=True)
        video_name = self.env_name + '_trajectory'

        if render_type == 'pretty':
            image_path = os.path.join(path, 'frames/')
            os.makedirs(image_path, exist_ok=True)

            rewards, observations, full_obs = self.rollout(
                horizon=horizon, save_path=image_path)
            utility_funcs.make_video_from_image_dir(path,
                                                    image_path,
                                                    fps=fps,
                                                    video_name=video_name)

            # Clean up images
            shutil.rmtree(image_path)
        else:
            rewards, observations, full_obs = self.rollout(horizon=horizon)
            utility_funcs.make_video_from_rgb_imgs(full_obs,
                                                   path,
                                                   fps=fps,
                                                   video_name=video_name)

Exemple #3

0

Afficher le fichier

Fichier : ssd.py Projet : 011235813/lio

class Env(object):
    """List-based wrapper around the SSD ``CleanupEnv``.

    The wrapped environment talks in dicts keyed by ``'agent-<idx>'``; this
    class exposes observations, actions and rewards as plain lists ordered
    by agent index, restricts the action space according to the config, and
    enforces a maximum episode length.
    """

    # Index of the cleaning action in the wrapped environment's action
    # space (see the ``map_to_orig`` tables below).
    _ORIG_CLEAN_ACTION = 8

    def __init__(self, config_env):
        """Build the wrapped environment from ``config_env``.

        Args:
            config_env: Config object read by attribute (observation size,
                ``max_steps``, ``cleaning_penalty``, the ``disable_*``
                action flags, ``n_agents``, ``map_name`` and the
                ``CleanupEnv`` construction options).

        Raises:
            ValueError: If ``config_env.map_name`` is not a known map.
        """
        self.name = 'ssd'
        self.config = config_env
        self.dim_obs = [self.config.obs_height, self.config.obs_width, 3]
        self.max_steps = self.config.max_steps

        self.cleaning_penalty = self.config.cleaning_penalty
        # Original space (not necessarily in this order, see
        # the original ssd files):
        # no-op, up, down, left, right, turn-ccw, turn-cw, penalty, clean
        #
        # map_to_orig translates this wrapper's reduced action indices to
        # the indices of the wrapped environment.
        if (self.config.disable_left_right_action
                and self.config.disable_rotation_action):
            self.l_action = 4
            self.cleaning_action_idx = 3
            # up, down, no-op, clean
            self.map_to_orig = {0: 2, 1: 3, 2: 4, 3: 8}
        elif self.config.disable_left_right_action:
            self.l_action = 6
            self.cleaning_action_idx = 5
            # up, down, no-op, rotate cw, rotate ccw, clean
            self.map_to_orig = {0: 2, 1: 3, 2: 4, 3: 5, 4: 6, 5: 8}
        elif self.config.disable_rotation_action:
            self.l_action = 6
            self.cleaning_action_idx = 5
            # left, right, up, down, no-op, clean
            self.map_to_orig = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 8}
        else:  # full action space except penalty beam
            self.l_action = 8
            self.cleaning_action_idx = 7
            # Don't allow penalty beam
            self.map_to_orig = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 8}

        self.obs_cleaned_1hot = self.config.obs_cleaned_1hot

        self.n_agents = self.config.n_agents

        if self.config.map_name == 'cleanup_small_sym':
            ascii_map = maps.CLEANUP_SMALL_SYM
        elif self.config.map_name == 'cleanup_10x10_sym':
            ascii_map = maps.CLEANUP_10x10_SYM
        else:
            # Without this branch an unknown name surfaced as an
            # UnboundLocalError on ``ascii_map`` a few lines below.
            raise ValueError(
                "Unknown map_name {!r}; expected 'cleanup_small_sym' or "
                "'cleanup_10x10_sym'".format(self.config.map_name))

        self.env = CleanupEnv(
            ascii_map=ascii_map,
            num_agents=self.n_agents,
            render=False,
            shuffle_spawn=self.config.shuffle_spawn,
            global_ref_point=self.config.global_ref_point,
            view_size=self.config.view_size,
            random_orientation=self.config.random_orientation,
            cleanup_params=self.config.cleanup_params,
            beam_width=self.config.beam_width)

        # length of action input to learned reward function
        if self.config.obs_cleaned_1hot:
            self.l_action_for_r = 2
        else:
            self.l_action_for_r = self.l_action

        self.steps = 0

    def process_obs(self, obs_dict):
        """Return the dict's observations as a list, each divided by 256.0.

        The list follows the dict's iteration order.
        """
        return [obs / 256.0 for obs in obs_dict.values()]

    def reset(self):
        """Resets the environment.

        Returns:
            List of agent observations
        """
        obs = self.env.reset()
        self.steps = 0

        return self.process_obs(obs)

    def step(self, actions):
        """Takes a step in env.

        Args:
            actions: list of integers, one per agent, in this wrapper's
                reduced action space

        Returns:
            List of observations, list of rewards, done, info
        """
        actions = [self.map_to_orig[a] for a in actions]
        actions_dict = {
            'agent-%d' % idx: actions[idx]
            for idx in range(self.n_agents)
        }

        # all objects returned by env.step are dicts
        obs_next, rewards, dones, info = self.env.step(actions_dict)
        self.steps += 1

        obs_next = self.process_obs(obs_next)
        rewards = list(rewards.values())
        if self.cleaning_penalty > 0:
            # Charge every agent that chose the cleaning action this step.
            for idx in range(self.n_agents):
                if actions[idx] == self._ORIG_CLEAN_ACTION:
                    rewards[idx] -= self.cleaning_penalty

        # The wrapped env apparently hardcodes done to False, so the episode
        # length is enforced here. ``>=`` keeps reporting done if the caller
        # steps past the horizon instead of silently flipping back to False.
        done = dones['__all__'] or self.steps >= self.max_steps

        return obs_next, rewards, done, info

    def render(self):
        """Render the wrapped environment."""
        self.env.render()