Example #1
    def train(self):
        self.start_worker()
        self.init_opt()

        manager = multiprocessing.Manager()
        lock = manager.RLock()
        g_counter = manager.Value('counter', 0)
        g_opt_info = manager.Value('opt_info', self.opt_info)

        for epoch in range(self.n_epochs):
            logger.push_prefix('epoch %d | ' % epoch)
            logger.log('Training started')

            results = singleton_pool.run_each(
                train_worker,
                [(g_counter, g_opt_info, self.t_max, self.discount, lock,
                  self.scope)] * singleton_pool.n_parallel)

            threshold = self.epoch_length
            pbar = ProgBarCounter(threshold)
            last_value = 0
            while True:
                time.sleep(0.1)
                with lock:
                    if g_counter.value >= threshold:
                        logger.log('Training finished')
                        pbar.stop()
                        g_counter.value = 0
                        logger.log('Evaluating ...')
                        self.evaluate(epoch, g_opt_info.value)
                        logger.dump_tabular(with_prefix=False)
                        logger.pop_prefix()
                        break
                    pbar.inc(g_counter.value - last_value)
                    last_value = g_counter.value

        self.terminate_task()
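
The `train` loop above coordinates its workers entirely through a `multiprocessing.Manager` proxy: workers increment `g_counter` under the shared lock while the main process polls it until `epoch_length` samples have been counted. A minimal standalone sketch of that polling pattern, with a hypothetical `worker` function standing in for `train_worker`:

import multiprocessing
import time

def worker(counter, lock, n_steps):
    # Hypothetical stand-in for train_worker: do some work and bump the shared counter.
    for _ in range(n_steps):
        time.sleep(0.01)
        with lock:
            counter.value += 1

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    lock = manager.RLock()
    counter = manager.Value('i', 0)
    threshold = 40

    procs = [multiprocessing.Process(target=worker, args=(counter, lock, 20))
             for _ in range(2)]
    for p in procs:
        p.start()

    # Poll the shared counter under the lock, as the epoch loop above does.
    while True:
        time.sleep(0.1)
        with lock:
            if counter.value >= threshold:
                break

    for p in procs:
        p.join()
    print('collected %d steps' % counter.value)
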
    def obtain_samples(self,
                       itr,
                       reset_args=None,
                       return_dict=False,
                       log_prefix=''):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths

        logger.log("Obtaining samples for iteration %d..." % itr)

        #paths = []
        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list
                                       and type(reset_args) != np.ndarray):
            reset_args = [reset_args] * self.vec_env.num_envs

        n_samples = 0
        obses = self.vec_env.reset(reset_args)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time

        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(
                actions, reset_args)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        #logger.record_tabular(log_prefix+"PolicyExecTime", policy_time)
        #logger.record_tabular(log_prefix+"EnvExecTime", env_time)
        #logger.record_tabular(log_prefix+"ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [
                item for sublist in l for item in sublist
            ]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths
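
Every `obtain_samples` variant on this page follows the same skeleton: step a vectorized environment, append each env's transition to its slot in `running_paths`, and close out a path (stacking its lists into arrays) whenever that env reports `done`, until `batch_size` samples are collected. A stripped-down, runnable sketch of that loop, using a toy `ToyVecEnv` and a random-action callable in place of the rllab objects:

import numpy as np

class ToyVecEnv:
    # Hypothetical stand-in for the vectorized env: fixed-length episodes of random observations.
    def __init__(self, num_envs=4, horizon=5, obs_dim=3):
        self.num_envs, self.horizon, self.obs_dim = num_envs, horizon, obs_dim
        self._t = np.zeros(num_envs, dtype=int)

    def reset(self):
        self._t[:] = 0
        return np.random.randn(self.num_envs, self.obs_dim)

    def step(self, actions):
        self._t += 1
        dones = self._t >= self.horizon
        self._t[dones] = 0
        next_obses = np.random.randn(self.num_envs, self.obs_dim)
        rewards = np.random.randn(self.num_envs)
        return next_obses, rewards, dones, [{} for _ in range(self.num_envs)]

def collect(vec_env, get_actions, batch_size):
    # Same bookkeeping as the obtain_samples variants above, minus logging and infos.
    paths, n_samples = [], 0
    running = [None] * vec_env.num_envs
    obses = vec_env.reset()
    while n_samples < batch_size:
        actions = get_actions(obses)
        next_obses, rewards, dones, _ = vec_env.step(actions)
        for idx in range(vec_env.num_envs):
            if running[idx] is None:
                running[idx] = dict(observations=[], actions=[], rewards=[])
            running[idx]['observations'].append(obses[idx])
            running[idx]['actions'].append(actions[idx])
            running[idx]['rewards'].append(rewards[idx])
            if dones[idx]:
                # Close out this env's path: stack the per-step lists into arrays.
                paths.append({k: np.stack(v) for k, v in running[idx].items()})
                n_samples += len(running[idx]['rewards'])
                running[idx] = None
        obses = next_obses
    return paths

paths = collect(ToyVecEnv(), lambda obs: np.random.randn(len(obs), 2), batch_size=40)
print(len(paths), 'paths,', sum(len(p['rewards']) for p in paths), 'samples')
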
Example #3
    def obtain_samples(self,
                       itr,
                       return_dict=False,
                       log=True,
                       log_prefix='',
                       traj_starting_obs=None,
                       traj_starting_ts=None):
        """

        :param itr: current iteration (int) for logging purposes
        :param return_dict: (boolean) weather to return a dict or a list
        :param log: (boolean) indicates whether to log
        :param log_prefix: (str) prefix to prepend to the log keys
        :param traj_starting_obs: (optional) starting observations to randomly choose from for rolling out trajectories [numpy array of shape (n_observations, ndim_obs),
                                    if env.reset() is called to get a initial observations
        :return:
        """
        # return_dict: whether or not to return a dictionary or list form of paths
        assert traj_starting_obs is None or traj_starting_obs.ndim == 2

        paths = {}
        for i in range(self.meta_batch_size):
            paths[i] = []

        n_samples = 0
        n_parallel_per_task = self.vec_env.num_envs // self.meta_batch_size

        obses = self.vec_env.reset(traj_starting_obs=traj_starting_obs)
        dones = np.asarray([True] * self.n_parallel)
        running_paths = [None] * self.n_parallel

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time

        while n_samples < self.algo.batch_size_dynamics_samples:
            t = time.time()
            policy.reset(dones)

            # get actions from MAML policy
            obs_per_task = np.split(np.asarray(obses), self.meta_batch_size)
            actions, agent_infos = policy.get_actions_batch(obs_per_task)

            assert actions.shape[0] == self.n_parallel

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(
                actions,
                traj_starting_obs=traj_starting_obs,
                traj_starting_ts=traj_starting_ts)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx // n_parallel_per_task].append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        if log:
            logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
            logger.record_tabular(log_prefix + "EnvExecTime", env_time)
            logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [
                item for sublist in l for item in sublist
            ]
            paths = flatten_list(paths.values())
            # path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])
        else:
            assert len(paths) == self.meta_batch_size
        return paths
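
Here the vectorized environment runs `n_parallel_per_task` copies of each of `meta_batch_size` tasks, so a finished path is filed under its task by integer division of the env index (`idx // n_parallel_per_task`). A small sketch of that grouping with made-up sizes:

# Hypothetical sizes: 3 tasks, 2 env copies per task -> 6 parallel envs.
meta_batch_size = 3
n_parallel = 6
n_parallel_per_task = n_parallel // meta_batch_size

paths = {i: [] for i in range(meta_batch_size)}
for env_idx in range(n_parallel):
    task_idx = env_idx // n_parallel_per_task  # envs 0-1 -> task 0, 2-3 -> task 1, 4-5 -> task 2
    paths[task_idx].append({'env_idx': env_idx})

print({task: [p['env_idx'] for p in ps] for task, ps in paths.items()})
# {0: [0, 1], 1: [2, 3], 2: [4, 5]}
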
Example #4
    def obtain_samples(self, itr):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time
        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        terminals=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["terminals"].append(done)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            terminals=tensor_utils.stack_tensor_list(
                                running_paths[idx]["terminals"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths
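
The samplers above rely on `tensor_utils.split_tensor_dict_list` to turn the batched `agent_infos`/`env_infos` dicts (one row per env) into per-env dicts, and on `tensor_utils.stack_tensor_dict_list` to stack a finished path's list of per-step dicts back into arrays. A plain-numpy sketch of that round trip for flat (non-nested) dicts, not the rllab implementation itself:

import numpy as np

def split_dict(batched):
    # dict of arrays with a leading batch dimension -> list of per-item dicts
    n = len(next(iter(batched.values())))
    return [{k: v[i] for k, v in batched.items()} for i in range(n)]

def stack_dicts(dicts):
    # list of per-step dicts -> dict of arrays stacked along a new leading axis
    return {k: np.stack([d[k] for d in dicts]) for k in dicts[0]}

agent_infos = {'mean': np.zeros((4, 2)), 'log_std': np.ones((4, 2))}  # batched over 4 envs
per_env = split_dict(agent_infos)
print(per_env[0]['mean'].shape)               # (2,): one env's info
print(stack_dicts(per_env)['log_std'].shape)  # (4, 2): stacked back into a batch
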
    def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix='',
                       preupdate=False, save_img_obs=False, contexts=None):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths

        logger.log("Obtaining samples for iteration %d..." % itr)

        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list and type(reset_args) != np.ndarray):
            assert False, "debug, should we be using this?"
            print("WARNING, will vectorize reset_args")
            reset_args = [reset_args] * self.vec_env.num_envs


        n_samples = 0
        path_nums = [0] * self.vec_env.num_envs # keeps track on which rollout we are for each environment instance
        obses = self.vec_env.reset(reset_args)
        if contexts:
            obses = np.concatenate([obses, contexts], axis=1)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        if contexts:
            policy = self.algo.post_policy
        else:
            policy = self.algo.policy

        while n_samples < self.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)
            # print("debug, agent_infos", agent_infos)
            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)   # TODO: instead of receive obs from env, we'll receive it from the policy as a feed_dict
            if contexts:
                next_obses = np.concatenate([next_obses, contexts], axis=1)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])  # TODO: let's also add the incomplete running_paths to paths
                    running_paths[idx] = None
                    path_nums[idx] += 1
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        # adding the incomplete paths
        # for idx in range(self.vec_env.num_envs):
        #     if running_paths[idx] is not None:
        #         paths[idx].append(dict(
        #             observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
        #             actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
        #             rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
        #             env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
        #             agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
        #         ))


        pbar.stop()

        # logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
        # logger.record_tabular(log_prefix + "EnvExecTime", env_time)
        # logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths
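
When `contexts` is given, this variant conditions the post-update policy by appending a per-env context vector to every observation before querying the policy. A tiny illustration of that concatenation, assuming observations and contexts are 2-D arrays with one row per env:

import numpy as np

obses = np.random.randn(4, 3)     # 4 envs, 3-dim observations
contexts = np.random.randn(4, 2)  # 4 envs, 2-dim task contexts (made-up shapes)

# Same operation as np.concatenate([obses, contexts], axis=1) in the sampler above:
augmented = np.concatenate([obses, contexts], axis=1)
print(augmented.shape)  # (4, 5): the policy sees observation and context side by side
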
Example #6
    def obtain_samples(self, itr):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0
        import time
        while n_samples < self.algo.batch_size:
            t = time.time()
            self.algo.policy.reset(dones)
            actions, agent_infos = self.algo.policy.get_actions(obses)
            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths
Example #7
    def train(self):
        with tf.Session() as sess:
            sess.run(tf.initialize_all_variables())
            self.start_worker()

            logger.log("Populating replay memory with random experience...")
            self.sampler.obtain_random_samples(self.pre_trained_size)

            start_time = time.time()
            total_time_step = 0
            for itr in range(self.start_itr, self.n_itr + 1):
                itr_start_time = time.time()
                self.total_episodic_rewards = [
                    [] for _ in range(len(self.env.agents))
                ]
                with logger.prefix('itr #%d | ' % itr):
                    p_bar = ProgBarCounter(self.max_path_length)
                    logger.log(
                        "Running trajectories, obtaining samples and optimizing Q network..."
                    )
                    for time_step in range(self.max_path_length):
                        total_time_step += 1
                        paths = self.obtain_samples(itr)
                        samples_data = self.process_samples(itr, paths)
                        self.optimize_policy(itr, samples_data)

                        if total_time_step % self.target_network_update == 0:
                            logger.log(
                                "Copying weights to target Q network...")
                            self.target_policy.set_param_values(
                                self.policy.get_param_values())

                        p_bar.inc(time_step + 1)
                        if self.sampler.done:
                            break

                    p_bar.stop()
                    self.sampler.done = True
                    self.log_statistics(itr, time_step + 1)
                    logger.log("Logging statistics...")
                    logger.log("Logging diagnostics...")
                    self.log_diagnostics(paths)

                    if itr % self.save_param_update == 0:
                        logger.log("Saving snapshot...")
                        params = self.get_itr_snapshot(
                            itr, samples_data)  # , **kwargs)
                        if self.store_paths:
                            if isinstance(samples_data, list):
                                params["paths"] = [
                                    sd["paths"] for sd in samples_data
                                ]
                            else:
                                params["paths"] = samples_data["paths"]
                        logger.save_itr_params(itr, params)
                        logger.log("Saved")

                    logger.record_tabular('Time', time.time() - start_time)
                    logger.record_tabular('ItrTime',
                                          time.time() - itr_start_time)
                    logger.dump_tabular(with_prefix=False)
                    if self.plot:
                        self.update_plot()
                        if self.pause_for_plot:
                            input("Plotting evaluation run: Press Enter to "
                                  "continue...")

        self.shutdown_worker()
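
The DQN-style `train` loop above copies the online Q network's parameters into the target network every `target_network_update` time steps. A minimal sketch of that schedule, using a hypothetical `TinyQNet` with flat parameter getters/setters in place of the rllab policy API:

import numpy as np

class TinyQNet:
    # Hypothetical network exposing flat parameter get/set, mirroring get/set_param_values.
    def __init__(self, n_params=8):
        self.params = np.random.randn(n_params)

    def get_param_values(self):
        return self.params.copy()

    def set_param_values(self, values):
        self.params = values.copy()

policy, target_policy = TinyQNet(), TinyQNet()
target_network_update = 100

for total_time_step in range(1, 551):
    # Stand-in for one optimization step on the online network.
    policy.params += 0.01 * np.random.randn(*policy.params.shape)
    if total_time_step % target_network_update == 0:
        # Same sync as in the training loop above.
        target_policy.set_param_values(policy.get_param_values())

# The target lags the online network by the updates made since the last sync.
print(np.linalg.norm(policy.params - target_policy.params))
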
    def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths

        logger.log("Obtaining samples for iteration %d..." % itr)

        #paths = []
        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list and type(reset_args) != np.ndarray):
            reset_args = [reset_args] * self.vec_env.num_envs

        n_samples = 0
        obses = self.vec_env.reset(reset_args)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time


        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
        logger.record_tabular(log_prefix + "EnvExecTime", env_time)
        logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths
Example #9
    def obtain_samples(self,
                       itr,
                       max_path_length,
                       batch_size,
                       max_n_trajs=None):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        dones = np.asarray([True] * self.vec_env.n_envs)
        obses = self.vec_env.reset(dones)
        running_paths = [None] * self.vec_env.n_envs

        pbar = ProgBarCounter(batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.policy
        import time
        while n_samples < batch_size:
            t = time.time()
            if hasattr(self.vec_env, "handle_policy_reset"):
                self.vec_env.handle_policy_reset(policy, dones)
            else:
                policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)
            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(
                actions, max_path_length=max_path_length)

            if np.any(dones):
                new_obses = self.vec_env.reset(dones)
                reset_idx = 0
                for idx, done in enumerate(dones):
                    if done:
                        next_obses[idx] = new_obses[reset_idx]
                        reset_idx += 1

            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.n_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.n_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))

                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None

            if max_n_trajs is not None and len(paths) >= max_n_trajs:
                break

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths
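
Unlike the other samplers, this one resets only the sub-environments that just finished: `vec_env.reset(dones)` returns one fresh observation per done env, in order, and `reset_idx` walks through them while overwriting the corresponding rows of `next_obses`. A standalone illustration of that bookkeeping with plain arrays:

import numpy as np

next_obses = np.arange(12.0).reshape(4, 3)   # observations returned by vec_env.step for 4 envs
dones = np.array([False, True, False, True])
new_obses = np.array([[100., 100., 100.],    # what a reset(dones) call would return:
                      [200., 200., 200.]])   # one fresh observation per done env, in order

reset_idx = 0
for idx, done in enumerate(dones):
    if done:
        next_obses[idx] = new_obses[reset_idx]  # overwrite the terminal obs with the reset obs
        reset_idx += 1

print(next_obses)  # rows 1 and 3 now hold the fresh observations
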
    def obtain_samples(self,
                       itr,
                       num_samples=None,
                       log=True,
                       log_prefix='RandomSampler-'):
        if num_samples is None:
            num_samples = self.algo.batch_size

        paths = []
        n_samples_collected = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(num_samples)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time
        while n_samples_collected < num_samples:
            # random actions
            t = time.time()
            actions = np.stack([
                self.vec_env.action_space.sample() for _ in range(len(obses))
            ],
                               axis=0)
            policy_time += time.time() - t
            agent_infos = {}

            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples_collected += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        if log:
            logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
            logger.record_tabular(log_prefix + "EnvExecTime", env_time)
            logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        return paths
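
This random sampler bypasses the policy and draws one action per env directly from the action space. A quick sketch of building that batch, assuming a simple Box-like space with the `sample()` method the code relies on:

import numpy as np

class Box:
    # Hypothetical continuous action space exposing the sample() method used above.
    def __init__(self, low, high, shape):
        self.low, self.high, self.shape = low, high, shape

    def sample(self):
        return np.random.uniform(self.low, self.high, size=self.shape)

action_space = Box(-1.0, 1.0, (2,))
num_envs = 4

# One random action per env, stacked into a batch, as in the sampler above.
actions = np.stack([action_space.sample() for _ in range(num_envs)], axis=0)
print(actions.shape)  # (4, 2)
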
Example #11
    def obtain_samples(self, itr, oracle_policy):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        agent_only_paths = []
        oracle_only_paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs
        agent_only_running_paths = [None] * self.vec_env.num_envs
        oracle_only_running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time

        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)

            agent_actions, binary_actions, agent_infos = policy.get_actions(
                obses)
            oracle_actions, oracle_agent_infos = oracle_policy.get_actions(
                obses)
            sigma = np.round(binary_actions)

            actions_1 = np.array([
                sigma[0, 0] * agent_actions[0, :] +
                sigma[0, 1] * oracle_actions[0, :]
            ])
            actions_2 = np.array([
                sigma[1, 0] * agent_actions[1, :] +
                sigma[1, 1] * oracle_actions[1, :]
            ])

            actions = np.concatenate((actions_1, actions_2), axis=0)

            policy_time += time.time() - t
            t = time.time()

            next_obses, rewards, dones, env_infos = self.vec_env.step(
                actions, itr)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]

            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)

                if done:
                    paths.append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None

            if sigma[0, 0] == 1 or sigma[1, 0] == 1:

                for idx, observation, action, reward, env_info, agent_info, done in zip(
                        itertools.count(), obses, actions, rewards, env_infos,
                        agent_infos, dones):
                    if agent_only_running_paths[idx] is None:
                        agent_only_running_paths[idx] = dict(
                            observations=[],
                            actions=[],
                            rewards=[],
                            env_infos=[],
                            agent_infos=[],
                        )
                    agent_only_running_paths[idx]["observations"].append(
                        observation)
                    agent_only_running_paths[idx]["actions"].append(action)
                    agent_only_running_paths[idx]["rewards"].append(reward)
                    agent_only_running_paths[idx]["env_infos"].append(env_info)
                    agent_only_running_paths[idx]["agent_infos"].append(
                        agent_info)

                    if done:
                        agent_only_paths.append(
                            dict(
                                observations=self.env_spec.observation_space.
                                flatten_n(agent_only_running_paths[idx]
                                          ["observations"]),
                                actions=self.env_spec.action_space.flatten_n(
                                    agent_only_running_paths[idx]["actions"]),
                                rewards=tensor_utils.stack_tensor_list(
                                    agent_only_running_paths[idx]["rewards"]),
                                env_infos=tensor_utils.stack_tensor_dict_list(
                                    agent_only_running_paths[idx]
                                    ["env_infos"]),
                                agent_infos=tensor_utils.
                                stack_tensor_dict_list(
                                    agent_only_running_paths[idx]
                                    ["agent_infos"]),
                            ))
                        n_samples += len(
                            agent_only_running_paths[idx]["rewards"])
                        agent_only_running_paths[idx] = None
            """
            To get paths taken by the oracle
            """
            # elif sigma[0] == 0. or sigma[1] == 0.:

            #     for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
            #                                                                             rewards, env_infos, agent_infos,
            #                                                                             dones):
            #         if oracle_only_running_paths[idx] is None:
            #             oracle_only_running_paths[idx] = dict(
            #                 observations=[],
            #                 actions=[],
            #                 rewards=[],
            #                 env_infos=[],
            #                 agent_infos=[],
            #             )
            #         oracle_only_running_paths[idx]["observations"].append(observation)
            #         oracle_only_running_paths[idx]["actions"].append(action)
            #         oracle_only_running_paths[idx]["rewards"].append(reward)
            #         oracle_only_running_paths[idx]["env_infos"].append(env_info)
            #         oracle_only_running_paths[idx]["agent_infos"].append(agent_info)

            #         if done:
            #             oracle_only_paths.append(dict(
            #                 observations=self.env_spec.observation_space.flatten_n(oracle_only_running_paths[idx]["observations"]),
            #                 actions=self.env_spec.action_space.flatten_n(oracle_only_running_paths[idx]["actions"]),
            #                 rewards=tensor_utils.stack_tensor_list(oracle_only_running_paths[idx]["rewards"]),
            #                 env_infos=tensor_utils.stack_tensor_dict_list(oracle_only_running_paths[idx]["env_infos"]),
            #                 agent_infos=tensor_utils.stack_tensor_dict_list(oracle_only_running_paths[idx]["agent_infos"]),
            #             ))
            #             n_samples += len(oracle_only_running_paths[idx]["rewards"])
            #             oracle_only_running_paths[idx] = None

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        #return paths, agent_only_paths, oracle_only_paths
        return paths, agent_only_paths
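
In this last example the policy also emits binary gate values; rounding them to `sigma` decides, per agent, whether the learned action or the oracle action is executed. A small numpy sketch of that gating for two agents with made-up action vectors (the code above builds the same per-row mixture and concatenates the results):

import numpy as np

agent_actions = np.array([[0.1, 0.2], [0.3, 0.4]])   # learned actions for 2 agents
oracle_actions = np.array([[1.0, 1.0], [2.0, 2.0]])  # oracle actions for the same agents
binary_actions = np.array([[0.9, 0.1], [0.2, 0.8]])  # gate outputs from the policy

sigma = np.round(binary_actions)  # [[1, 0], [0, 1]]: agent 0 acts itself, agent 1 defers to the oracle

actions = np.stack([
    sigma[i, 0] * agent_actions[i] + sigma[i, 1] * oracle_actions[i]
    for i in range(2)
])
print(actions)  # first row: agent 0's own action; second row: the oracle's action for agent 1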