class VectorizedSampler(BaseSampler):
    def __init__(self, algo, n_envs=None):
        super(VectorizedSampler, self).__init__(algo)
        self.n_envs = n_envs

    def start_worker(self):
        n_envs = self.n_envs
        if n_envs is None:
            n_envs = int(self.algo.batch_size / self.algo.max_path_length)
            n_envs = max(1, min(n_envs, 100))

        if getattr(self.algo.env, 'vectorized', False):
            self.vec_env = self.algo.env.vec_env_executor(
                n_envs=n_envs, max_path_length=self.algo.max_path_length)
        else:
            envs = [
                pickle.loads(pickle.dumps(self.algo.env))
                for _ in range(n_envs)
            ]
            self.vec_env = VecEnvExecutor(
                envs=envs,
                max_path_length=self.algo.max_path_length)
        self.env_spec = self.algo.env.spec

    def shutdown_worker(self):
        self.vec_env.terminate()

    def obtain_samples(self,
                       itr,
                       reset_args=None,
                       return_dict=False,
                       log_prefix=''):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths

        logger.log("Obtaining samples for iteration %d..." % itr)

        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list
                                       and type(reset_args) != np.ndarray):
            reset_args = [reset_args] * self.vec_env.num_envs

        n_samples = 0
        obses = self.vec_env.reset(reset_args)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time

        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(
                actions, reset_args)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        #logger.record_tabular(log_prefix+"PolicyExecTime", policy_time)
        #logger.record_tabular(log_prefix+"EnvExecTime", env_time)
        #logger.record_tabular(log_prefix+"ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [
                item for sublist in l for item in sublist
            ]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths
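None of the snippets on this page include their module-level imports. Under an rllab-style layout they would look roughly like the sketch below; the exact module paths vary between rllab forks, so treat this as an assumption rather than the verbatim header of any one of these files.

# Assumed imports for the VectorizedSampler snippets (rllab-style layout; paths are illustrative).
import itertools
import pickle
import time

import numpy as np

import rllab.misc.logger as logger
from rllab.misc import tensor_utils
from rllab.sampler.base import BaseSampler
from rllab.sampler.stateful_pool import ProgBarCounter
from rllab.sampler.vec_env_executor import VecEnvExecutor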
Example 2
class VectorizedSampler(BaseSampler):
    def __init__(self, algo, n_envs=None):
        super(VectorizedSampler, self).__init__(algo)
        self.n_envs = n_envs

    def start_worker(self):
        n_envs = self.n_envs
        if n_envs is None:
            n_envs = int(self.algo.batch_size / self.algo.max_path_length)
            n_envs = max(1, min(n_envs, 100))

        if getattr(self.algo.env, 'vectorized', False):
            self.vec_env = self.algo.env.vec_env_executor(
                n_envs=n_envs, max_path_length=self.algo.max_path_length)
        else:
            envs = [
                pickle.loads(pickle.dumps(self.algo.env))
                for _ in range(n_envs)
            ]
            self.vec_env = VecEnvExecutor(
                envs=envs, max_path_length=self.algo.max_path_length)
        self.env_spec = self.algo.env.spec

    def shutdown_worker(self):
        self.vec_env.terminate()

    def obtain_samples(self, itr):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time
        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        terminals=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["terminals"].append(done)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            terminals=tensor_utils.stack_tensor_list(
                                running_paths[idx]["terminals"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths
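For orientation, this is roughly how a sampler like the ones above is driven; the sketch below is hypothetical, and the `algo` object with its env, policy, batch_size and max_path_length fields is an assumption inferred from how the snippets use `self.algo`, not a confirmed API.

def run_sampling(algo, n_envs=8, n_itr=10):
    # Hypothetical driver loop; `algo` is assumed to expose env, policy,
    # batch_size and max_path_length as the snippets above require.
    sampler = VectorizedSampler(algo, n_envs=n_envs)
    sampler.start_worker()
    try:
        for itr in range(n_itr):
            paths = sampler.obtain_samples(itr)
            # each path is a dict of stacked arrays: observations, actions,
            # rewards, env_infos and agent_infos
            total_reward = sum(path["rewards"].sum() for path in paths)
            logger.log("itr %d: %d paths, total reward %.2f"
                       % (itr, len(paths), total_reward))
    finally:
        sampler.shutdown_worker()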
Example 3
class RNNCriticSampler(object):
    def __init__(self, policy, env, n_envs, replay_pool_size, max_path_length, sampling_method,
                 save_rollouts=False, save_rollouts_observations=True, save_env_infos=False, env_str=None, replay_pool_params={}):
        self._policy = policy
        self._n_envs = n_envs

        assert(self._n_envs == 1) # b/c policy reset

        self._replay_pools = [RNNCriticReplayPool(env.spec,
                                                  env.horizon,
                                                  policy.N,
                                                  policy.gamma,
                                                  replay_pool_size // n_envs,
                                                  obs_history_len=policy.obs_history_len,
                                                  sampling_method=sampling_method,
                                                  save_rollouts=save_rollouts,
                                                  save_rollouts_observations=save_rollouts_observations,
                                                  save_env_infos=save_env_infos,
                                                  replay_pool_params=replay_pool_params)
                              for _ in range(n_envs)]

        try:
            envs = [pickle.loads(pickle.dumps(env)) for _ in range(self._n_envs)] if self._n_envs > 1 else [env]
        except Exception:
            envs = [create_env(env_str) for _ in range(self._n_envs)] if self._n_envs > 1 else [env]
        ### need to seed each environment if it is GymEnv
        seed = get_seed()
        if seed is not None and isinstance(utils.inner_env(env), GymEnv):
            for i, env in enumerate(envs):
                utils.inner_env(env).env.seed(seed + i)
        self._vec_env = VecEnvExecutor(
            envs=envs,
            max_path_length=max_path_length
        )
        self._curr_observations = self._vec_env.reset()

        # Initialize the interface between ROS and GCG
        self._interface_gcg = InterfaceGCG()

        self._nb_target_lost = 0

    @property
    def n_envs(self):
        return self._n_envs

    ##################
    ### Statistics ###
    ##################

    @property
    def statistics(self):
        return RNNCriticReplayPool.statistics_pools(self._replay_pools)

    def __len__(self):
        return sum([len(rp) for rp in self._replay_pools])

    ####################
    ### Add to pools ###
    ####################

    def step(self, step, take_random_actions=False, explore=True):
        """ Takes one step in each simulator and adds to respective replay pools """
        ### store last observations and get encoded
        encoded_observations = []
        for i, (replay_pool, observation) in enumerate(zip(self._replay_pools, self._curr_observations)):
            replay_pool.store_observation(step + i, observation)
            encoded_observations.append(replay_pool.encode_recent_observation())

        ### get actions
        if take_random_actions:
            actions = [self._vec_env.action_space.sample() for _ in range(self._n_envs)]
            est_values = [np.nan] * self._n_envs
            if isinstance(self._vec_env.action_space, Discrete):
                logprobs = [-np.log(self._vec_env.action_space.flat_dim)] * self._n_envs
            elif isinstance(self._vec_env.action_space, Box):
                low = self._vec_env.action_space.low
                high = self._vec_env.action_space.high
                logprobs = [-np.sum(np.log(high - low))] * self._n_envs
            else:
                raise NotImplementedError
        else:
            actions, est_values, logprobs, _ = self._policy.get_actions(
                steps=list(range(step, step + self._n_envs)),
                current_episode_steps=self._vec_env.current_episode_steps,
                observations=encoded_observations,
                explore=explore)

        ### take step
        # next_observations, rewards, dones, env_infos = self._vec_env.step(actions)
        while not self._interface_gcg.get_ready_for_action():
            self._interface_gcg.pub_gcg_ready()
        next_observations, rewards, dones, env_infos = self._interface_gcg.take_step(actions)

        if rewards[0] == -1:
            self._nb_target_lost += 1
        
        if np.any(dones):
            self._policy.reset_get_action()

        ### add to replay pool
        for replay_pool, action, reward, done, env_info, est_value, logprob in \
                zip(self._replay_pools, actions, rewards, dones, env_infos, est_values, logprobs):
            replay_pool.store_effect(action, reward, done, env_info, est_value, logprob)

        self._curr_observations = next_observations

    #####################
    ### Add offpolicy ###
    #####################

    def _rollouts_file(self, folder, itr):
        return os.path.join(folder, 'itr_{0:d}_rollouts.pkl'.format(itr))

    def add_offpolicy(self, offpolicy_folder, num_offpolicy):
        step = 0
        itr = 0
        replay_pools = itertools.cycle(self._replay_pools)
        done_adding = False

        while os.path.exists(self._rollouts_file(offpolicy_folder, itr)):
            rollouts = joblib.load(self._rollouts_file(offpolicy_folder, itr))['rollouts']
            itr += 1

            for rollout, replay_pool in zip(rollouts, replay_pools):
                r_len = len(rollout['dones'])
                if step + r_len >= num_offpolicy:
                    diff = num_offpolicy - step
                    for k in ('observations', 'actions', 'rewards', 'dones', 'logprobs'):
                        rollout[k] = rollout[k][:diff]
                    done_adding = True
                    r_len = len(rollout['dones'])

                replay_pool.store_rollout(step, rollout)
                step += r_len

                if done_adding:
                    break

            if done_adding:
                break

    #########################
    ### Sample from pools ###
    #########################

    def can_sample(self):
        return np.any([replay_pool.can_sample() for replay_pool in self._replay_pools])

    def sample(self, batch_size):
        return RNNCriticReplayPool.sample_pools(self._replay_pools, batch_size,
                                                only_completed_episodes=self._policy.only_completed_episodes)

    ###############
    ### Logging ###
    ###############

    def log(self, prefix=''):
        RNNCriticReplayPool.log_pools(self._replay_pools, self._nb_target_lost, prefix=prefix)

    def get_recent_paths(self):
        return RNNCriticReplayPool.get_recent_paths_pools(self._replay_pools)

    def reset_nb_target_lost(self):
        self._nb_target_lost = 0
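Unlike the batch-based samplers, the RNNCriticSampler above is stepped one transition at a time and feeds replay pools. A hypothetical driver is sketched below; the constructor values shown (pool size, sampling_method string, warmup schedule) and the function signature are placeholders, not values taken from the original GCG codebase.

def collect_experience(policy, env, total_steps, warmup_steps=1000):
    # Hypothetical driver for RNNCriticSampler; names and values here are
    # illustrative assumptions, not part of the original code.
    sampler = RNNCriticSampler(policy, env,
                               n_envs=1,  # required by the assert in __init__
                               replay_pool_size=int(1e6),
                               max_path_length=env.horizon,
                               sampling_method='uniform')
    for step in range(total_steps):
        # pure exploration for the first warmup_steps, then the policy acts
        sampler.step(step, take_random_actions=(step < warmup_steps))
        if sampler.can_sample():
            batch = sampler.sample(batch_size=32)  # feed to the critic update
    return sampler.get_recent_paths()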
Example 4
class VectorizedSampler(BaseSampler):
    def start_worker(self):
        estimated_envs = int(self.algo.batch_size / self.algo.max_path_length)
        estimated_envs = max(1, min(estimated_envs, 100))
        self.vec_env = VecEnvExecutor(
            self.algo.env,
            n=estimated_envs,
            max_path_length=self.algo.max_path_length
        )
        self.env_spec = self.algo.env.spec

    def shutdown_worker(self):
        self.vec_env.terminate()

    def obtain_samples(self, itr):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0
        import time
        while n_samples < self.algo.batch_size:
            t = time.time()
            self.algo.policy.reset(dones)
            actions, agent_infos = self.algo.policy.get_actions(obses)
            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths
Example 5
class VectorizedSampler(BaseSampler):

    def __init__(self, algo, n_envs=None, batch_size=None):
        super(VectorizedSampler, self).__init__(algo)
        self.n_envs = n_envs
        # NOTE: the batch_size argument is currently ignored; the algorithm's
        # batch_size is always used.
        self.batch_size = self.algo.batch_size
        print("vectorized sampler initiated")

    def start_worker(self):
        n_envs = self.n_envs
        if n_envs is None:
            n_envs = int(self.algo.batch_size / self.algo.max_path_length)
            n_envs = max(1, min(n_envs, 100))

        if getattr(self.algo.env, 'vectorized', False):
            self.vec_env = self.algo.env.vec_env_executor(n_envs=n_envs, max_path_length=self.algo.max_path_length)
        else:
            envs = [pickle.loads(pickle.dumps(self.algo.env)) for _ in range(n_envs)]
            self.vec_env = VecEnvExecutor(
                envs=envs,
                max_path_length=self.algo.max_path_length
            )
        self.env_spec = self.algo.env.spec

    def shutdown_worker(self):
        self.vec_env.terminate()


    def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix='',
                       preupdate=False, save_img_obs=False, contexts=None):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths

        logger.log("Obtaining samples for iteration %d..." % itr)

        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list and type(reset_args) != np.ndarray):
            print("WARNING, will vectorize reset_args")
            reset_args = [reset_args] * self.vec_env.num_envs


        n_samples = 0
        path_nums = [0] * self.vec_env.num_envs  # tracks which rollout each environment instance is on
        obses = self.vec_env.reset(reset_args)
        if contexts is not None:
            obses = np.concatenate([obses, contexts], axis=1)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        if contexts is not None:
            policy = self.algo.post_policy
        else:
            policy = self.algo.policy

        while n_samples < self.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)
            # print("debug, agent_infos", agent_infos)
            policy_time += time.time() - t
            t = time.time()
            # TODO: instead of receiving obs from the env, receive it from the policy as a feed_dict
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)
            if contexts is not None:
                next_obses = np.concatenate([next_obses, contexts], axis=1)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])  # TODO: let's also add the incomplete running_paths to paths
                    running_paths[idx] = None
                    path_nums[idx] += 1
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        # adding the incomplete paths
        # for idx in range(self.vec_env.num_envs):
        #     if running_paths[idx] is not None:
        #         paths[idx].append(dict(
        #             observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
        #             actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
        #             rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
        #             env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
        #             agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
        #         ))


        pbar.stop()

        # logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
        # logger.record_tabular(log_prefix + "EnvExecTime", env_time)
        # logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths
class VectorizedSampler(BaseSampler):

    def __init__(self, algo, n_envs=None):
        super(VectorizedSampler, self).__init__(algo)
        self.n_envs = n_envs

    def start_worker(self):
        n_envs = self.n_envs
        if n_envs is None:
            n_envs = int(self.algo.batch_size / self.algo.max_path_length)
            n_envs = max(1, min(n_envs, 100))

        if getattr(self.algo.env, 'vectorized', False):
            self.vec_env = self.algo.env.vec_env_executor(n_envs=n_envs, max_path_length=self.algo.max_path_length)
        else:
            envs = [pickle.loads(pickle.dumps(self.algo.env)) for _ in range(n_envs)]
            self.vec_env = VecEnvExecutor(
                envs=envs,
                max_path_length=self.algo.max_path_length
            )
        self.env_spec = self.algo.env.spec

    def shutdown_worker(self):
        self.vec_env.terminate()


    def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths

        logger.log("Obtaining samples for iteration %d..." % itr)

        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list and type(reset_args) != np.ndarray):
            reset_args = [reset_args] * self.vec_env.num_envs

        n_samples = 0
        obses = self.vec_env.reset(reset_args)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time


        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular(log_prefix+"PolicyExecTime", policy_time)
        logger.record_tabular(log_prefix+"EnvExecTime", env_time)
        logger.record_tabular(log_prefix+"ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths
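The variants above that accept reset_args and return_dict are intended for multi-task sampling, where each vectorized environment can be reset to a different task and paths are returned grouped per environment. A hypothetical helper illustrating that call pattern follows; `tasks` is an assumed list holding one reset argument per vectorized env.

def sample_per_task(sampler, itr, tasks):
    # Hypothetical helper showing the reset_args/return_dict call pattern used
    # by the variants above; `tasks` is assumed to hold one reset argument per
    # vectorized environment.
    paths_per_env = sampler.obtain_samples(itr, reset_args=tasks,
                                           return_dict=True,
                                           log_prefix='task-')
    # keys are env indices, values are lists of completed paths for that env
    for idx, env_paths in paths_per_env.items():
        logger.log("env %d collected %d paths" % (idx, len(env_paths)))
    return paths_per_env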
Example 7
class VectorizedSampler(BaseSampler):
    def __init__(self, algo, n_envs=None):
        super(VectorizedSampler, self).__init__(algo)
        self.n_envs = n_envs

    def start_worker(self, include_joint_coords=False):
        n_envs = self.n_envs
        if n_envs is None:
            n_envs = int(self.algo.batch_size / self.algo.max_path_length)
            n_envs = max(1, min(n_envs, 100))

        if getattr(self.algo.env, 'vectorized', False):
            self.vec_env = self.algo.env.vec_env_executor(
                n_envs=n_envs,
                max_path_length=self.algo.max_path_length,
                include_joint_coords=include_joint_coords)
        else:
            envs = [
                pickle.loads(pickle.dumps(self.algo.env))
                for _ in range(n_envs)
            ]
            self.vec_env = VecEnvExecutor(
                envs=envs,
                max_path_length=self.algo.max_path_length,
                include_joint_coords=include_joint_coords)
        self.env_spec = self.algo.env.spec

    def shutdown_worker(self):
        self.vec_env.terminate()

    def obtain_samples_for_visualization(self, include_joint_coords=False):
        tf_env = self.algo.env
        if hasattr(tf_env.wrapped_env, "stats_recorder"):
            setattr(tf_env.wrapped_env.stats_recorder, "done", None)

        import builtins
        builtins.visualize = True

        print("\nAbout to start video...")
        obs_dim = self.env_spec.observation_space.shape[0]
        obs = tf_env.reset()
        obs = self._add_joint_coords_to_obs(obs, include_joint_coords)
        horizon = 1000
        for horizon_num in range(1, horizon + 1):
            # action, _ = self.algo.policy.get_action(obs[:obs_dim])
            action, _ = self.algo.policy.get_action(obs)
            next_obs, reward, done, _info = tf_env.step(action, use_states=obs)
            obs = self._add_joint_coords_to_obs(next_obs, include_joint_coords)
            if done or horizon_num == horizon:
                break
        builtins.visualize = False

    def obtain_samples(self, itr, include_joint_coords=False):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0

        obses = self.vec_env.reset()
        obses = self._add_joint_coords_to_obses(obses, include_joint_coords)
        obs_dim = self.env_spec.observation_space.shape[0]

        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time
        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)
            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(
                actions, use_states=obses)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = self._add_joint_coords_to_obses(next_obses,
                                                    include_joint_coords)

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths, n_samples

    def _add_joint_coords_to_obses(self, obses, include_joint_coords):
        if include_joint_coords:
            try:
                inner_env = self._get_inner_env()
                extended_obses = []
                for obs in obses:
                    extended_obses.append(
                        self._add_joint_coords_to_obs(obs,
                                                      include_joint_coords,
                                                      inner_env))
                return np.array(extended_obses)
            except AttributeError:
                inner_envs = self._get_inner_envs()
                extended_obses = []
                for obs_i in range(len(obses)):
                    extended_obses.append(
                        self._add_joint_coords_to_obs(obses[obs_i],
                                                      include_joint_coords,
                                                      inner_envs[obs_i]))
                return np.array(extended_obses)

        return obses

    def _add_joint_coords_to_obs(self,
                                 obs,
                                 include_joint_coords,
                                 inner_env=None):
        if include_joint_coords:
            if not inner_env:
                inner_env = self._get_inner_env()
            if hasattr(inner_env, "get_geom_xpos"):
                return np.append(obs, inner_env.get_geom_xpos().flatten())
            else:
                return np.append(obs, inner_env.env.get_geom_xpos().flatten())
        return obs

    def _get_inner_env(self):
        env = self.vec_env.vec_env
        while hasattr(env, "env"):
            env = env.env
        if hasattr(env.wrapped_env, '_wrapped_env'):
            return env.wrapped_env._wrapped_env
        else:
            return env.wrapped_env.env.unwrapped

    def _get_inner_envs(self):
        inner_envs = []
        for env in self.vec_env.envs:
            while hasattr(env, "env"):
                env = env.env
            if hasattr(env.wrapped_env, '_wrapped_env'):
                inner_envs.append(env.wrapped_env._wrapped_env)
            else:
                inner_envs.append(env.wrapped_env.env.unwrapped)
        return inner_envs
class VectorizedSampler(BaseSampler):
    def __init__(self, algo, n_envs=None):
        super(VectorizedSampler, self).__init__(algo)
        self.n_envs = n_envs

    def start_worker(self):
        n_envs = self.n_envs
        if n_envs is None:
            n_envs = int(self.algo.batch_size / self.algo.max_path_length)
            n_envs = max(1, min(n_envs, 100))

        if getattr(self.algo.env, 'vectorized', False):
            self.vec_env = self.algo.env.vec_env_executor(
                n_envs=n_envs, max_path_length=self.algo.max_path_length)
        else:
            envs = [
                pickle.loads(pickle.dumps(self.algo.env))
                for _ in range(n_envs)
            ]
            self.vec_env = VecEnvExecutor(
                envs=envs,
                max_path_length=self.algo.max_path_length)
        self.env_spec = self.algo.env.spec

    def shutdown_worker(self):
        self.vec_env.terminate()

    def get_random_action(self, observation, idx=None):
        # returns uniformly sampled random actions; observation and idx are
        # accepted for interface compatibility but are not used here.
        mean = np.random.uniform(low=-1.0,
                                 high=1.0,
                                 size=[self.n_envs, self.algo.a_size])
        action = mean
        return action, dict(mean=mean)

    def get_MPC_action(self, obs):
        action_list = []
        for i in range(self.n_envs):
            action, _ = self.algo.policy.get_action(obs[i], i)
            action_list.append(action)
        return np.asarray(action_list).reshape(
            [-1, self.algo.env.action_space.shape[0]]), dict(means=action_list)

    def obtain_samples(self,
                       itr,
                       init_state=None,
                       reset_args=None,
                       return_dict=False,
                       log_prefix=''):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths

        logger.log("Obtaining samples for iteration %d..." % itr)

        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []
        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list
                                       and type(reset_args) != np.ndarray):
            reset_args = [reset_args] * self.vec_env.num_envs
        n_samples = 0
        if init_state is not None:
            init_state = [init_state] * self.vec_env.num_envs
            obses = self.vec_env.reset(init_state, reset_args)
        else:
            obses = self.vec_env.reset(reset_args)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        #policy = self.algo.policy
        import time

        while n_samples < self.algo.max_path_length:
            t = time.time()
            #self.env_spec.reset(reset_args = reset_args)
            #policy.reset(dones)
            actions, agent_infos = self.get_MPC_action(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(
                actions, reset_args)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(
                        dict(
                            observations=running_paths[idx]["observations"],
                            actions=running_paths[idx]["actions"],
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
        logger.record_tabular(log_prefix + "EnvExecTime", env_time)
        logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [
                item for sublist in l for item in sublist
            ]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths
Example 9
class VectorizedSampler(Sampler):
    def __init__(self, env, policy, n_envs, vec_env=None):
        self.env = env
        self.policy = policy
        self.n_envs = n_envs
        self.vec_env = vec_env
        self.env_spec = env.spec

    def start_worker(self):
        if self.vec_env is None:
            n_envs = self.n_envs
            if getattr(self.env, 'vectorized', False):
                self.vec_env = self.env.vec_env_executor(n_envs=n_envs)
            else:
                envs = [
                    pickle.loads(pickle.dumps(self.env)) for _ in range(n_envs)
                ]
                self.vec_env = VecEnvExecutor(envs=envs)

    def shutdown_worker(self):
        self.vec_env.terminate()

    def obtain_samples(self,
                       itr,
                       max_path_length,
                       batch_size,
                       max_n_trajs=None):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        dones = np.asarray([True] * self.vec_env.n_envs)
        obses = self.vec_env.reset(dones)
        running_paths = [None] * self.vec_env.n_envs

        pbar = ProgBarCounter(batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.policy
        import time
        while n_samples < batch_size:
            t = time.time()
            if hasattr(self.vec_env, "handle_policy_reset"):
                self.vec_env.handle_policy_reset(policy, dones)
            else:
                policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)
            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(
                actions, max_path_length=max_path_length)

            if np.any(dones):
                new_obses = self.vec_env.reset(dones)
                reset_idx = 0
                for idx, done in enumerate(dones):
                    if done:
                        next_obses[idx] = new_obses[reset_idx]
                        reset_idx += 1

            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.n_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.n_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))

                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None

            if max_n_trajs is not None and len(paths) >= max_n_trajs:
                break

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths
class RandomVectorizedSampler(RandomBaseSampler, VectorizedSampler):
    def __init__(self, algo, n_envs=None):
        self.algo = algo
        VectorizedSampler.__init__(self, algo, n_envs=n_envs)

    def start_worker(self):
        n_envs = self.n_envs
        if n_envs is None:
            n_envs = int(self.algo.batch_size / self.algo.max_path_length)
            n_envs = max(1, min(n_envs, 100))

        if getattr(self.algo.env, 'vectorized', False):
            self.vec_env = self.algo.env.vec_env_executor(
                n_envs=n_envs, max_path_length=self.algo.max_path_length)
        else:
            envs = [
                pickle.loads(pickle.dumps(self.algo.env))
                for _ in range(n_envs)
            ]
            self.vec_env = VecEnvExecutor(
                envs=envs, max_path_length=self.algo.max_path_length)
        self.env_spec = self.algo.env.spec

    def shutdown_worker(self):
        self.vec_env.terminate()

    @overrides
    def obtain_samples(self,
                       itr,
                       num_samples=None,
                       log=True,
                       log_prefix='RandomSampler-'):
        if num_samples is None:
            num_samples = self.algo.batch_size

        paths = []
        n_samples_collected = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(num_samples)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time
        while n_samples_collected < num_samples:
            # random actions
            t = time.time()
            actions = np.stack(
                [self.vec_env.action_space.sample() for _ in range(len(obses))],
                axis=0)
            policy_time += time.time() - t
            agent_infos = {}

            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples_collected += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        if log:
            logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
            logger.record_tabular(log_prefix + "EnvExecTime", env_time)
            logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        return paths