Example #1
class BatchSampler(object):
    def __init__(self, env_name, batch_size, num_workers=mp.cpu_count() - 1):
        self.env_name = env_name
        self.batch_size = batch_size
        self.num_workers = num_workers

        self.queue = mp.Queue()
        self.envs = SubprocVecEnv(
            [make_env(env_name) for _ in range(num_workers)], queue=self.queue)
        self._env = gym.make(env_name)

    def sample(self, policy, params=None, gamma=0.95, device='cpu'):
        episodes = BatchEpisodes(batch_size=self.batch_size,
                                 gamma=gamma,
                                 device=device)
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)
        observations, batch_ids = self.envs.reset()
        dones = [False]

        # count = -1
        while (not all(dones)) or (not self.queue.empty()):
            # count = count + 1
            with torch.no_grad():
                observations_tensor = torch.from_numpy(observations).to(
                    device=device)
                actions_tensor = policy(observations_tensor,
                                        params=params).sample()
                actions = actions_tensor.cpu().numpy()
            new_observations, rewards, dones, new_batch_ids, _ = self.envs.step(
                actions)
            # if count <2:
            # print("\ndones: ", dones)
            # print("info: ", new_batch_ids)
            # # print(new_observations.shape)
            # print("robot position: ", new_observations[:,:2])
            # print("goal: ", new_observations[:, 4:6])

            new_hid_observations = self.envs.get_peds()
            # new_hid_observations = np.array([[-1,-1], [1,-1], [1,1], [-1,1]])

            episodes.append(observations, new_hid_observations, actions,
                            rewards, batch_ids)
            observations, batch_ids = new_observations, new_batch_ids
        return episodes

    def reset_task(self, task):
        tasks = [task for _ in range(self.num_workers)]
        reset = self.envs.reset_task(tasks)
        return all(reset)

    def sample_tasks(self, num_tasks):
        tasks = self._env.unwrapped.sample_tasks(num_tasks)
        return tasks
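
The sample() loop above relies on a simple dispatch protocol: the main process enqueues one index per episode to collect (batch_size entries) followed by one None sentinel per worker, then keeps stepping until every worker reports done and the queue is drained. The sketch below illustrates that producer/consumer pattern in isolation; the worker function is a hypothetical stand-in, since the real consumers are the SubprocVecEnv worker processes, which additionally step their environments.

# Sketch of the queue/sentinel dispatch used by sample(): the main process
# enqueues `batch_size` episode indices plus one None sentinel per worker;
# each worker pulls indices until it sees None. The worker below is a
# simplified stand-in, not the actual SubprocVecEnv worker.
import multiprocessing as mp


def worker(queue, results):
    while True:
        episode_id = queue.get()
        if episode_id is None:              # sentinel: no more episodes to roll out
            break
        results.put((episode_id, "rolled out"))


if __name__ == '__main__':
    batch_size, num_workers = 8, 3
    queue, results = mp.Queue(), mp.Queue()
    for i in range(batch_size):             # one entry per episode to collect
        queue.put(i)
    for _ in range(num_workers):            # one sentinel per worker
        queue.put(None)
    procs = [mp.Process(target=worker, args=(queue, results)) for _ in range(num_workers)]
    for p in procs:
        p.start()
    collected = [results.get() for _ in range(batch_size)]   # drain results before joining
    for p in procs:
        p.join()
    print(sorted(collected))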
Example #2
class BatchSampler(object):
    def __init__(self, env_name, batch_size, num_workers=mp.cpu_count() - 2):
        self.env_name = env_name
        self.batch_size = batch_size
        self.num_workers = num_workers

        self.queue = mp.Queue()
        self.envs = SubprocVecEnv(
            [make_env(env_name) for _ in range(num_workers)], queue=self.queue)
        self._env = gym.make(env_name)

    def sample(self, policy, params=None, gamma=0.95, device='cpu'):
        episodes = BatchEpisodes(batch_size=self.batch_size,
                                 gamma=gamma,
                                 device=device)
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)
        observations, batch_ids = self.envs.reset()
        dones = [False]
        while (not all(dones)) or (not self.queue.empty()):
            with torch.no_grad():
                observations_tensor = torch.from_numpy(observations).to(
                    device=device)
                actions_tensor = policy(observations_tensor,
                                        params=params).sample()
                # actions_tensor = policy(observations_tensor, params=params)
                actions = actions_tensor.cpu().numpy()
            new_observations, rewards, dones, new_batch_ids, _ = self.envs.step(
                actions)
            episodes.append(observations, actions, rewards, batch_ids)
            observations, batch_ids = new_observations, new_batch_ids
        return episodes

    def reset_task(self, task):
        tasks = [task for _ in range(self.num_workers)]
        reset = self.envs.reset_task(tasks)
        return all(reset)

    def sample_tasks(self, num_tasks):
        tasks = self._env.unwrapped.sample_tasks(num_tasks)
        return tasks

    def sample_target_task(self, N):
        tasks = self._env.unwrapped.sample_target_task(N)
        return tasks
Example #3
class BatchSampler(object):
    def __init__(self, env_name, batch_size, num_workers=None, test_env=False):
        self.env_name = env_name
        self.batch_size = batch_size
        self.num_workers = num_workers or mp.cpu_count() - 1
        self.test_env = test_env

        self.queue = mp.Queue()
        self.envs = SubprocVecEnv([make_env(env_name, test_env=test_env) for _ in range(self.num_workers)],
                                  queue=self.queue)
        self._env = make_env(env_name, test_env=test_env)()

    def sample(self, policy, params=None, gamma=0.95, device='cpu'):
        episodes = BatchEpisodes(batch_size=self.batch_size, gamma=gamma, device=device)
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)
        observations, batch_ids = self.envs.reset()
        dones = [False]
        while (not all(dones)) or (not self.queue.empty()):
            with torch.no_grad():
                observations_tensor = torch.from_numpy(observations).to(device=device, dtype=torch.float32)
                actions_tensor = policy(observations_tensor, params=params).sample()
                actions = actions_tensor.cpu().numpy()
            new_observations, rewards, dones, new_batch_ids, infos = self.envs.step(actions)
            # info keys: reachDist, pickRew, epRew, goalDist, success, goal, task_name

            # NOTE: last infos will be absent if batch_size % num_workers != 0

            episodes.append(observations, actions, rewards, batch_ids, infos)
            observations, batch_ids = new_observations, new_batch_ids
        return episodes

    def reset_task(self, task):
        tasks = [task for _ in range(self.num_workers)]
        reset = self.envs.reset_task(tasks)
        return all(reset)

    def sample_tasks(self, num_tasks, task2prob=None):
        tasks = self._env.unwrapped.sample_tasks(num_tasks, task2prob)
        return tasks
Example #4
class BatchSampler(object):
    def __init__(self, env_name, batch_size, num_workers=mp.cpu_count() - 1):
        self.env_name = env_name
        self.batch_size = batch_size
        self.num_workers = num_workers
        
        self.queue = mp.Queue()
        self.envs = SubprocVecEnv([make_env(env_name) for _ in range(num_workers)],
                                  queue=self.queue)
        self._env = gym.make(env_name)

    def sample(self, policy, params=None, gamma=0.9):
        episodes = BatchEpisodes(batch_size=self.batch_size, gamma=gamma)
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)
        observations, batch_ids = self.envs.reset()
        dones = [False]
        while (not all(dones)) or (not self.queue.empty()):
            observations_tensor = observations
            # Note: observations and actions must have the same dimensionality
            # observations_tensor = observations.reshape(observations.shape[0], -1)
            actions_tensor = policy(observations_tensor, params=params).sample()
            # pull the sampled actions back to the CPU before stepping the envs
            with tf.device('/CPU:0'):
                actions = actions_tensor.numpy()
            new_observations, rewards, dones, new_batch_ids, _ = self.envs.step(actions)
            episodes.append(observations, actions, rewards, batch_ids)
            observations, batch_ids = new_observations, new_batch_ids

        return episodes

    def reset_task(self, task):
        tasks = [task for _ in range(self.num_workers)]
        reset = self.envs.reset_task(tasks)
        return all(reset)

    def sample_tasks(self, num_tasks):
        tasks = self._env.unwrapped.sample_tasks(num_tasks)
        return tasks
Example #5
class BatchSampler(object):
    def __init__(self, env_name, batch_size, num_workers=mp.cpu_count() - 1):
        self.env_name = env_name
        self.batch_size = batch_size
        self.num_workers = num_workers

        self.queue = mp.Queue()
        self.envs = SubprocVecEnv(
            [make_env(env_name) for _ in range(num_workers)], queue=self.queue)
        self._env = gym.make(env_name)

    def sample(self,
               policy,
               task,
               tree=None,
               params=None,
               gamma=0.95,
               device='cpu'):
        episodes = BatchEpisodes(batch_size=self.batch_size,
                                 gamma=gamma,
                                 device=device)
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)
        observations, batch_ids = self.envs.reset()
        dones = [False]
        while (not all(dones)) or (not self.queue.empty()):
            with torch.no_grad():
                input = torch.from_numpy(observations).float().to(
                    device=device)

                if self.env_name == 'AntPos-v0':
                    _, embedding = tree.forward(
                        torch.from_numpy(
                            task["position"]).float().to(device=device))
                if self.env_name == 'AntVel-v1':
                    _, embedding = tree.forward(
                        torch.from_numpy(np.array(
                            [task["velocity"]])).float().to(device=device))

                # print(input.shape)
                # print(embedding.shape)
                observations_tensor = torch.t(
                    torch.stack([
                        torch.cat([
                            torch.from_numpy(np.array(teo)).to(device=device),
                            embedding[0]
                        ], 0) for teo in input
                    ], 1))

                actions_tensor = policy(observations_tensor,
                                        task=task,
                                        params=params,
                                        enhanced=False).sample()
                actions = actions_tensor.cpu().numpy()
            new_observations, rewards, dones, new_batch_ids, _ = self.envs.step(
                actions)
            episodes.append(observations_tensor.cpu().numpy(), actions,
                            rewards, batch_ids)
            observations, batch_ids = new_observations, new_batch_ids
        return episodes

    def reset_task(self, task):
        tasks = [task for _ in range(self.num_workers)]
        reset = self.envs.reset_task(tasks)
        return all(reset)

    def sample_tasks(self, num_tasks):
        tasks = self._env.unwrapped.sample_tasks(num_tasks)
        return tasks
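
Example #5 builds observations_tensor by concatenating the task embedding returned by tree.forward onto every observation row through a stack/cat/transpose construction. Assuming input has shape (batch, obs_dim) and embedding has shape (1, embed_dim), which the indexing embedding[0] suggests but the source does not state, the same tensor can be formed more directly by broadcasting:

# Hedged equivalent of the stack/cat/transpose construction above, assuming
# `obs` is a (batch, obs_dim) tensor and `emb` is a (1, embed_dim) tensor.
import torch

def concat_task_embedding(obs, emb):
    # repeat the single embedding row once per observation and append it column-wise
    expanded = emb[0].unsqueeze(0).expand(obs.size(0), -1)
    return torch.cat([obs, expanded], dim=1)      # shape: (batch, obs_dim + embed_dim)

obs = torch.randn(4, 6)                           # e.g. 4 observations of dimension 6
emb = torch.randn(1, 3)                           # a single task embedding of dimension 3
print(concat_task_embedding(obs, emb).shape)      # torch.Size([4, 9])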
Example #6
class BatchSampler:
    def __init__(self, env_name, batch_size, num_workers=mp.cpu_count()):
        """

		:param env_name:
		:param batch_size: fast batch size
		:param num_workers:
		"""
        self.env_name = env_name
        self.batch_size = batch_size
        self.num_workers = num_workers

        self.queue = mp.Queue()
        # list of environment factory callables (one per worker)
        env_factories = [make_env(env_name) for _ in range(num_workers)]
        # SubprocVecEnv runs in the main process and manages num_workers
        # sub-processes that interact with the environment.
        self.envs = SubprocVecEnv(env_factories, queue_=self.queue)
        self._env = gym.make(env_name)

    def sample(self, policy, params=None, gamma=0.95, device='cpu'):
        """

		:param policy:
		:param params:
		:param gamma:
		:param device:
		:return:
		"""
        episodes = BatchEpisodes(batch_size=self.batch_size,
                                 gamma=gamma,
                                 device=device)
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)

        observations, batch_ids = self.envs.reset()
        dones = [False]
        # loop until every worker is done and the task queue has been drained
        while (not all(dones)) or (not self.queue.empty()):
            # no gradients are needed while collecting rollouts
            with torch.no_grad():
                # move observations to the target device, evaluate the policy there,
                # and bring the sampled actions back to the CPU for the envs
                observations_tensor = torch.from_numpy(observations).to(
                    device=device)
                # forward via policy network
                # policy network will return Categorical(logits=logits)
                actions_tensor = policy(observations_tensor,
                                        params=params).sample()
                actions = actions_tensor.cpu().numpy()

            new_observations, rewards, dones, new_batch_ids, _ = self.envs.step(
                actions)
            # append the pre-step observations and batch_ids (not new_observations/new_batch_ids)
            episodes.append(observations, actions, rewards, batch_ids)
            observations, batch_ids = new_observations, new_batch_ids

        return episodes

    def reset_task(self, task):
        tasks = [task for _ in range(self.num_workers)]
        reset = self.envs.reset_task(tasks)
        return all(reset)

    def sample_tasks(self, num_tasks):
        tasks = self._env.unwrapped.sample_tasks(num_tasks)
        return tasks
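
The comments in Example #6 describe the intended workflow most explicitly, so a usage sketch fits here. The environment id, policy object, and adaptation step below are illustrative placeholders, not taken from the source; they assume a maml_rl-style setup in which sample_tasks, reset_task and sample are called in that order for every meta-batch.

# Hedged usage sketch; `env_name` and `policy` are placeholders for a registered
# environment id and a maml_rl-style policy instance built elsewhere.
sampler = BatchSampler(env_name, batch_size=20, num_workers=4)
tasks = sampler.sample_tasks(num_tasks=10)        # draw a meta-batch of tasks
for task in tasks:
    sampler.reset_task(task)                      # point every worker env at this task
    train_episodes = sampler.sample(policy, gamma=0.95, device='cpu')
    # adapt the policy on train_episodes, then collect validation episodes with the
    # adapted parameters: sampler.sample(policy, params=adapted_params)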
Example #7
class BatchSampler(object):
    def __init__(self,
                 env_name,
                 batch_size,
                 num_workers=mp.cpu_count() - 1,
                 args=None):
        self.env_name = env_name
        self.batch_size = batch_size  # NOTE # of trajectories in each env
        self.num_workers = num_workers
        self.args = args

        self.queue = mp.Queue()
        self.envs = SubprocVecEnv(
            [make_env(args, i_worker) for i_worker in range(num_workers)],
            queue=self.queue)
        self._env = make_env(args, i_worker=99)()

    def sample(self, policy, params=None, prey=None, gamma=0.95, device='cpu'):
        """Sample # of trajectories defined by "self.batch_size". The size of each
        trajectory is defined by the Gym env registration defined at:
        ./maml_rl/envs/__init__.py
        """
        assert prey is not None

        episodes = BatchEpisodes(batch_size=self.batch_size,
                                 gamma=gamma,
                                 device=device)
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)

        observations, worker_ids = self.envs.reset()  # TODO reset needs to be fixed
        dones = [False]
        while (not all(dones)) or (not self.queue.empty()):
            with torch.no_grad():
                # Get observations
                predator_observations, prey_observations = self.split_observations(
                    observations)
                predator_observations_torch = torch.from_numpy(
                    predator_observations).to(device=device)
                prey_observations_torch = torch.from_numpy(
                    prey_observations).to(device=device)

                # Get actions
                predator_actions = policy(predator_observations_torch,
                                          params=params).sample()
                predator_actions = predator_actions.cpu().numpy()

                prey_actions = prey.select_deterministic_action(
                    prey_observations_torch)
                prey_actions = prey_actions.cpu().numpy()
            actions = np.concatenate([predator_actions, prey_actions], axis=1)
            new_observations, rewards, dones, new_worker_ids, _ = self.envs.step(
                copy.deepcopy(actions))
            assert np.sum(dones[:, 0]) == np.sum(dones[:, 1])
            dones = dones[:, 0]

            # Get new observations
            new_predator_observations, _ = self.split_observations(
                new_observations)

            # Get rewards
            predator_rewards = rewards[:, 0]
            episodes.append(predator_observations, predator_actions,
                            predator_rewards, worker_ids)
            observations, worker_ids = new_observations, new_worker_ids

        return episodes

    def reset_task(self, task):
        tasks = [task for _ in range(self.num_workers)]
        reset = self.envs.reset_task(tasks)
        return all(reset)

    def sample_tasks(self, num_tasks, test=False):
        if test is False:
            i_agents = np.random.randint(low=0, high=16, size=(num_tasks, ))
        else:
            i_agents = np.random.randint(low=16, high=21, size=(num_tasks, ))

        tasks = [{"i_agent": i_agent} for i_agent in i_agents]
        return tasks

    def split_observations(self, observations):
        predator_observations = []
        prey_observations = []
        for obs in observations:
            assert len(obs) == 2
            predator_observations.append(obs[0])
            prey_observations.append(obs[1])

        return \
            np.asarray(predator_observations, dtype=np.float32), \
            np.asarray(prey_observations, dtype=np.float32)
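
split_observations assumes each element of observations is a length-2 sequence holding the predator and the prey observation for one environment, and returns one float32 array per role. A minimal illustration with made-up observation sizes (the method ignores self, so it can be called unbound here):

# Minimal illustration of split_observations with made-up observation sizes.
import numpy as np

observations = [
    (np.zeros(4), np.ones(3)),            # env 0: (predator obs, prey obs)
    (np.zeros(4) + 2, np.ones(3) + 2),    # env 1
]
predator_obs, prey_obs = BatchSampler.split_observations(None, observations)
print(predator_obs.shape, prey_obs.shape)  # (2, 4) (2, 3)
print(predator_obs.dtype, prey_obs.dtype)  # float32 float32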