Example No. 1
    def collect_training_trajectories(
            self,
            itr,
            load_initial_expertdata,
            collect_policy,
            batch_size,
    ):
        """
        :param itr:
        :param load_initial_expertdata:  path to expert data pkl file
        :param collect_policy:  the current policy using which we collect data
        :param batch_size:  the number of transitions we collect
        :return:
            paths: a list trajectories
            envsteps_this_batch: the sum over the numbers of environment steps in paths
            train_video_paths: paths which also contain videos for visualization purposes
        """

        # TODO decide whether to load training data or use the current policy to collect more data √
        # HINT: depending on if it's the first iteration or not, decide whether to either
        # (1) load the data. In this case you can directly return as follows
        # ``` return loaded_paths, 0, None ```

        # (2) collect `self.params['batch_size']` transitions
        if itr == 0:
            with open(load_initial_expertdata, 'rb') as f:
                loaded_paths = pickle.load(f)
            return loaded_paths, 0, None

        # TODO collect `batch_size` samples to be used for training √
        # HINT1: use sample_trajectories from utils
        # HINT2: you want each of these collected rollouts to be of length self.params['ep_len']
        print("\nCollecting data to be used for training...")
        paths = utils.sample_n_trajectories(self.env, collect_policy, batch_size // self.params['ep_len'], self.params['ep_len'])
        envsteps_this_batch = sum(utils.get_pathlength(path) for path in paths)

        # collect more rollouts with the same policy, to be saved as videos in tensorboard
        # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN
        train_video_paths = None
        if self.log_video:
            print('\nCollecting train rollouts to be used for saving videos...')
            ## TODO look in utils and implement sample_n_trajectories √
            train_video_paths = utils.sample_n_trajectories(self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

        return paths, envsteps_this_batch, train_video_paths
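
The TODO above asks to implement `sample_n_trajectories` in utils. A minimal sketch of what it might look like, assuming a single-episode helper `sample_trajectory(env, policy, max_path_length, render)` exists elsewhere in utils (that helper's name and signature are an assumption; a sketch of it is given after Example No. 5):

def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False):
    # Roll out the policy ntraj times, each episode capped at max_path_length steps.
    paths = []
    for _ in range(ntraj):
        # sample_trajectory is assumed to return one rollout as a path dict.
        paths.append(sample_trajectory(env, policy, max_path_length, render))
    return paths

This matches the call sites above: Example No. 1 requests `batch_size // ep_len` episodes of length `ep_len` for training, and `MAX_NVIDEO` rendered episodes of length `MAX_VIDEO_LEN` for video logging.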
Example No. 2
    def sample_recent_data(self, batch_size=1, concat_rew=True):

        if concat_rew:
            return (self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:],
                    self.next_obs[-batch_size:], self.terminals[-batch_size:])
        else:
            num_recent_rollouts_to_return = 0
            num_datapoints_so_far = 0
            index = -1
            while num_datapoints_so_far < batch_size:
                recent_rollout = self.paths[index]
                index -= 1
                num_recent_rollouts_to_return += 1
                num_datapoints_so_far += get_pathlength(recent_rollout)
            rollouts_to_return = self.paths[-num_recent_rollouts_to_return:]
            (observations, actions, next_observations, terminals, concatenated_rews,
             unconcatenated_rews) = convert_listofrollouts(rollouts_to_return)
            return observations, actions, unconcatenated_rews, next_observations, terminals
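
The non-concatenated branch above counts transitions with `get_pathlength`. A minimal sketch, assuming each path is a dict holding one per-step array per field, with the rewards stored under a "reward" key (the key name is an assumption):

def get_pathlength(path):
    # Number of environment steps in one rollout; assumes path["reward"]
    # has one entry per transition.
    return len(path["reward"])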
Example No. 3
    def add_rollouts(self, paths):

        # add new rollouts into our list of rollouts
        self.paths.extend(paths)

        # Drop old rollouts once the buffer exceeds self.max_size observations,
        # so that they can be garbage collected and memory consumption
        # does not grow indefinitely.
        path_lengths = [get_pathlength(path) for path in self.paths]
        total_steps = 0
        first_retained_path_idx = 0
        for i in range(len(path_lengths) - 1, -1, -1):
            total_steps += path_lengths[i]
            if total_steps >= self.max_size:
                first_retained_path_idx = i
                break
        self.paths = self.paths[first_retained_path_idx:]

        # convert new rollouts into their component arrays, and append them onto our arrays
        (observations, actions, next_observations, terminals, concatenated_rews,
         unconcatenated_rews) = convert_listofrollouts(paths)

        if self.obs is None:
            self.obs = observations[-self.max_size:]
            self.acs = actions[-self.max_size:]
            self.next_obs = next_observations[-self.max_size:]
            self.terminals = terminals[-self.max_size:]
            self.concatenated_rews = concatenated_rews[-self.max_size:]
            self.unconcatenated_rews = unconcatenated_rews
        else:
            # Copy the sliced arrays so the views do not keep the full
            # concatenated base arrays alive in memory.
            self.obs = np.concatenate([self.obs, observations])[-self.max_size:].copy()
            self.acs = np.concatenate([self.acs, actions])[-self.max_size:].copy()
            self.next_obs = np.concatenate([self.next_obs, next_observations])[-self.max_size:].copy()
            self.terminals = np.concatenate([self.terminals, terminals])[-self.max_size:].copy()
            self.concatenated_rews = np.concatenate([self.concatenated_rews, concatenated_rews])[-self.max_size:].copy()
            if isinstance(unconcatenated_rews, list):
                self.unconcatenated_rews += unconcatenated_rews
            else:
                self.unconcatenated_rews.append(unconcatenated_rews)

        self.unconcatenated_rews = self.unconcatenated_rews[first_retained_path_idx:]
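
`add_rollouts` and `sample_recent_data` both unpack six arrays from `convert_listofrollouts`. A minimal sketch consistent with that unpacking order, assuming the per-step arrays in each path dict live under the keys used below (the key names are an assumption):

import numpy as np

def convert_listofrollouts(paths):
    # Flatten a list of path dicts into component arrays, returned in the
    # order the callers above unpack them.
    observations = np.concatenate([path["observation"] for path in paths])
    actions = np.concatenate([path["action"] for path in paths])
    next_observations = np.concatenate([path["next_observation"] for path in paths])
    terminals = np.concatenate([path["terminal"] for path in paths])
    concatenated_rews = np.concatenate([path["reward"] for path in paths])
    unconcatenated_rews = [path["reward"] for path in paths]
    return (observations, actions, next_observations, terminals,
            concatenated_rews, unconcatenated_rews)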
Example No. 4
    def collect_training_trajectories(self,
                                      itr,
                                      initial_expertdata,
                                      collect_policy,
                                      num_transitions_to_sample,
                                      save_expert_data_to_disk=False):
        """
        :param itr:
        :param initial_expertdata:  path to expert data pkl file
        :param collect_policy:  the current policy using which we collect data
        :param num_transitions_to_sample:  the number of transitions we collect
        :return:
            paths: a list trajectories
            envsteps_this_batch: the sum over the numbers of environment steps in paths
            train_video_paths: paths which also contain videos for visualization purposes
        """
        # TODO: get this from Piazza

        paths = []
        envsteps_this_batch = 0

        if itr == 0 and initial_expertdata is not None:
            if os.path.exists(initial_expertdata):
                with open(initial_expertdata, 'rb') as f:
                    expert_paths = pickle.load(f)
                paths.extend(expert_paths)
                envsteps_this_batch += sum(
                    utils.get_pathlength(p) for p in expert_paths)
            else:
                warnings.warn(
                    "initial_expertdata file {} not found!".format(
                        initial_expertdata))
            return paths, envsteps_this_batch, None

        if itr == 0 and save_expert_data_to_disk:
            num_transitions_to_sample = self.params['batch_size_initial']

        if envsteps_this_batch < num_transitions_to_sample:
            # HINT1: use sample_trajectories from utils
            # HINT2: you want each of these collected rollouts to be of length self.params['ep_len']
            print("\nCollecting data to be used for training...")
            exp_paths, exp_envsteps = utils.sample_trajectories_mp(
                env=self.env,
                policy=collect_policy,
                min_timesteps_per_batch=num_transitions_to_sample - envsteps_this_batch,
                max_path_length=self.params['ep_len'],
                render=False,
                num_envs_per_core=self.params['num_envs_per_core'],
                num_cores=self.params['num_cores'],
            )
            paths.extend(exp_paths)
            envsteps_this_batch += exp_envsteps

        # collect more rollouts with the same policy, to be saved as videos in tensorboard
        # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN
        train_video_paths = None
        if self.logvideo:
            print(
                '\nCollecting train rollouts to be used for saving videos...')
            train_video_paths = utils.sample_n_trajectories(
                self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

        if save_expert_data_to_disk and itr == 0:
            with open('expert_data_{}.pkl'.format(self.params['env_name']),
                      'wb') as file:
                pickle.dump(paths, file)

        return paths, envsteps_this_batch, train_video_paths
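
Example No. 4 delegates sampling to a multi-process `sample_trajectories_mp`, while Example No. 5 below calls the serial `sample_trajectories` that HINT1 refers to. A minimal sketch of the serial version, reusing the assumed `sample_trajectory` and `get_pathlength` helpers from the earlier sketches; it keeps collecting whole episodes until at least `min_timesteps_per_batch` transitions have been gathered:

def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False):
    # Collect full episodes until the batch holds at least min_timesteps_per_batch steps.
    paths = []
    timesteps_this_batch = 0
    while timesteps_this_batch < min_timesteps_per_batch:
        path = sample_trajectory(env, policy, max_path_length, render)
        paths.append(path)
        timesteps_this_batch += get_pathlength(path)
    return paths, timesteps_this_batch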
Example No. 5
    def collect_training_trajectories(
        self,
        itr,
        load_initial_expertdata,
        collect_policy,
        batch_size,
    ):
        """
        :param itr:
        :param load_initial_expertdata:  path to expert data pkl file
        :param collect_policy:  the current policy using which we collect data
        :param batch_size:  the number of transitions we collect
        :return:
            paths: a list trajectories
            envsteps_this_batch: the sum over the numbers of environment steps in paths
            train_video_paths: paths which also contain videos for visualization purposes
        """

        # TODO decide whether to load training data or use the current policy to collect more data
        # HINT: depending on if it's the first iteration or not, decide whether to either
        # (1) load the data. In this case you can directly return as follows
        # ``` return loaded_paths, 0, None ```

        # (2) collect `self.params['batch_size']` transitions

        paths = []
        envsteps_this_batch = 0

        # TODO: do we want to add weak, policy-collected data (exp_paths) in the 0th iteration?
        #     Perhaps we should feed only expert paths in the first rounds,
        #     even when the total step count falls short of batch_size.
        if itr == 0:
            if os.path.exists(load_initial_expertdata):
                with open(load_initial_expertdata, 'rb') as f:
                    expert_paths = pickle.load(f)
                paths.extend(expert_paths)
                envsteps_this_batch += sum(
                    utils.get_pathlength(p) for p in expert_paths)
            else:
                warnings.warn(
                    "load_initial_expertdata file {} not found!".format(
                        load_initial_expertdata))

        # if envsteps_this_batch < batch_size:
        if (envsteps_this_batch < batch_size and itr > 0):
            # TODO collect `batch_size` samples to be used for training
            # HINT1: use sample_trajectories from utils
            # HINT2: you want each of these collected rollouts to be of length self.params['ep_len']
            print("\nCollecting data to be used for training...")
            exp_paths, exp_envsteps = utils.sample_trajectories(
                env=self.env,
                policy=collect_policy,
                min_timesteps_per_batch=batch_size - envsteps_this_batch,
                max_path_length=self.params['ep_len'],
                render=False)
            paths.extend(exp_paths)
            envsteps_this_batch += exp_envsteps

        # collect more rollouts with the same policy, to be saved as videos in tensorboard
        # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN
        train_video_paths = None
        if self.log_video:
            print(
                '\nCollecting train rollouts to be used for saving videos...')
            ## TODO look in utils and implement sample_n_trajectories
            train_video_paths = utils.sample_n_trajectories(
                self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

        return paths, envsteps_this_batch, train_video_paths
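
All of the sampling helpers sketched above bottom out in a single-episode rollout function. A hedged sketch of that `sample_trajectory` helper, assuming an old-style Gym env (`reset()` returns an observation, `step()` returns four values), a `policy.get_action(ob)` method, and the same path-dict keys assumed in the `convert_listofrollouts` sketch; the `image_obs` field is only filled when rendering, which is how the video rollouts above obtain their frames:

import numpy as np

def sample_trajectory(env, policy, max_path_length, render=False):
    # Roll out one episode of at most max_path_length steps and return it as a path dict.
    ob = env.reset()
    obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
    steps = 0
    while True:
        if render:
            image_obs.append(env.render(mode='rgb_array'))
        obs.append(ob)
        ac = policy.get_action(ob)
        acs.append(ac)
        ob, rew, done, _ = env.step(ac)
        steps += 1
        next_obs.append(ob)
        rewards.append(rew)
        # Stop when the environment terminates or the length cap is hit.
        rollout_done = done or steps >= max_path_length
        terminals.append(rollout_done)
        if rollout_done:
            break
    return {"observation": np.array(obs), "image_obs": np.array(image_obs),
            "action": np.array(acs), "reward": np.array(rewards),
            "next_observation": np.array(next_obs), "terminal": np.array(terminals)}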