Example #1
    def load_paths(self):
        paths = []
        for i in range(len(self.data)):
            p = self.data[i]
            H = len(p["observations"]) - 1

            path_builder = PathBuilder()

            for t in range(H):
                ob = p["observations"][t, :]
                action = p["actions"][t, :]
                reward = p["rewards"][t]
                next_ob = p["observations"][t + 1, :]
                terminal = 0
                agent_info = {}  # todo (need to unwrap each key)
                env_info = {}  # todo (need to unwrap each key)

                path_builder.add_all(
                    observations=ob,
                    actions=action,
                    rewards=reward,
                    next_observations=next_ob,
                    terminals=terminal,
                    agent_infos=agent_info,
                    env_infos=env_info,
                )

            path = path_builder.get_all_stacked()
            paths.append(path)
        return paths
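All of the snippets on this page rely on the same small PathBuilder interface: add_all(**key_to_value) records one transition per call, len(builder) is the number of recorded steps, and get_all_stacked() returns the per-key values stacked into arrays. The sketch below is only an illustration of that assumed interface, written to make the page self-contained; it is not the implementation these examples actually import.

import numpy as np


class PathBuilder(dict):
    """Illustrative stand-in: collects per-step values keyed by name."""

    def __init__(self):
        super().__init__()
        self._num_steps = 0

    def add_all(self, **key_to_value):
        # Append each named value (observations, actions, rewards, ...) to its list.
        for key, value in key_to_value.items():
            self.setdefault(key, []).append(value)
        self._num_steps += 1

    def __len__(self):
        # Number of transitions added so far, as used by the examples on this page.
        return self._num_steps

    def get_all_stacked(self):
        # Stack array-like entries into numpy arrays; keep info dicts as plain lists.
        stacked = {}
        for key, values in self.items():
            if values and isinstance(values[0], dict):
                stacked[key] = values
            else:
                stacked[key] = np.array(values)
        return stacked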
Example #2
def rollout(
    env,
    policy,
    max_path_length,
    no_terminal=False,
    render=False,
    render_kwargs=None,
):
    if render_kwargs is None:
        render_kwargs = {}
    path_builder = PathBuilder()
    observation = env.reset()

    for _ in range(max_path_length):
        action, agent_info = policy.get_action(observation)
        if render: env.render(**render_kwargs)

        next_ob, reward, terminal, env_info = env.step(action)
        if no_terminal: terminal = False

        path_builder.add_all(
            observations=observation,
            actions=action,
            rewards=np.array([reward]),
            next_observations=next_ob,
            terminals=np.array([terminal]),
            absorbing=np.array([0., 0.]),
            agent_info=agent_info,
            env_info=env_info,
        )

        observation = next_ob
        if terminal: break
    return path_builder
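A hypothetical way to consume the builder returned by rollout (env, policy and replay_buffer are assumed to exist in the caller's scope; replay_buffer.add_path mirrors its use in the examples further down):

# Hypothetical caller: collect one trajectory, stack it, and store it.
path_builder = rollout(env, policy, max_path_length=200)
path = path_builder.get_all_stacked()
replay_buffer.add_path(path)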
Example #3
    def _start_new_rollout(self, env_idx=None):
        if env_idx is None:
            self._current_path_builders = [PathBuilder() for _ in range(self._env_num)]
            self._obs = self._env.reset()
        else:
            self._current_path_builders[env_idx] = PathBuilder()
            self._obs[env_idx] = self._env.reset(env_idx)[env_idx]
Example #4
    def train_online(self, start_epoch=0):
        # No need for training mode to be True when generating trajectories;
        # training mode is automatically set to True in _try_to_train and
        # reverted to False before that function exits.
        self.training_mode(False)
        self._current_path_builder = PathBuilder()
        self._n_rollouts_total = 0

        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            print('EPOCH STARTED')
            # print('epoch')
            for _ in range(self.num_rollouts_per_epoch):
                # print('rollout')
                task_params, obs_task_params = self.train_task_params_sampler.sample()
                self.generate_exploration_rollout(
                    task_params=task_params, obs_task_params=obs_task_params)

                # print(self._n_rollouts_total)
                if self._n_rollouts_total % self.num_rollouts_between_updates == 0:
                    gt.stamp('sample')
                    # print('train')
                    if not self.do_not_train: self._try_to_train(epoch)
                    gt.stamp('train')

            if not self.do_not_eval:
                self._try_to_eval(epoch)
                gt.stamp('eval')

            self._end_epoch()
Example #5
    def train_batch(self, start_epoch):
        self._current_path_builder = PathBuilder()

        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):

            self._start_epoch(epoch)
            set_to_train_mode(self.training_env)
            observation = self._start_new_rollout()
            # This implementation is rather naive. If you want to (e.g.)
            # parallelize data collection, this would be the place to do it.
            for i in range(self.num_env_steps_per_epoch):
                observation, terminal = self._take_step_in_env(observation)

                #print(i, terminal)
            assert terminal[0]
            gt.stamp('sample')

            self._try_to_train()
            gt.stamp('train')

            set_to_eval_mode(self.env)
            #print(i, terminal)
            self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch(epoch)
Example #6
def rollout_path(env, task_params, obs_task_params, post_cond_policy):
    cur_eval_path_builder = PathBuilder()
    
    # reset the env using the params
    observation = env.reset(task_params=task_params, obs_task_params=obs_task_params)
    terminal = False
    task_identifier = env.task_identifier

    while (not terminal) and len(cur_eval_path_builder) < MAX_PATH_LENGTH:
        agent_obs = observation['obs']
        action, agent_info = post_cond_policy.get_action(agent_obs)
        
        next_ob, raw_reward, terminal, env_info = (env.step(action))
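        # The done flag returned by env.step is overridden below, so this
        # rollout only stops when MAX_PATH_LENGTH is reached.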
        terminal = False
        
        reward = raw_reward
        terminal = np.array([terminal])
        reward = np.array([reward])
        cur_eval_path_builder.add_all(
            observations=observation,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
            task_identifiers=task_identifier
        )
        observation = next_ob

    return cur_eval_path_builder.get_all_stacked()
Example #7
    def train_online(self, start_epoch=0):
        # No need for training mode to be True when generating trajectories;
        # training mode is automatically set to True in _try_to_train and
        # reverted to False before that function exits.
        self.training_mode(False)
        self._current_path_builder = PathBuilder()

        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            for _ in range(self.num_rollouts_per_epoch):
                task_params, obs_task_params = self.train_task_params_sampler.sample()
                self.generate_exploration_rollout(
                    task_params=task_params, obs_task_params=obs_task_params)

            # essentially in each epoch we gather data then do a certain amount of training
            gt.stamp('sample')
            if not self.do_not_train: self._try_to_train()
            gt.stamp('train')

            if epoch % self.freq_eval == 0:
                # and then we evaluate it
                if not self.do_not_eval: self._try_to_eval(epoch)
                gt.stamp('eval')

            self._end_epoch()
Example #8
    def train_online(self, start_epoch=0):
        if not self.environment_farming:
            observation = self._start_new_rollout()
        self._current_path_builder = PathBuilder()
        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)

            for _ in range(self.num_env_steps_per_epoch):
                if not self.environment_farming:
                    observation = self.play_one_step(observation)
                else:
                    # acquire a remote environment
                    remote_env = self.farmer.force_acq_env()
                    self.play_ignore(remote_env)

                # Training out of threads
                self._try_to_train()
                gt.stamp('train')

            if epoch % 10 == 0:
                self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch()
Example #9
    def test_path_length(self):
        path = PathBuilder()
        for _ in range(10):
            path.add_all(
                action=np.array([1, 2, 3]),
                obs=-np.array([1, 2, 3]),
            )
        self.assertEqual(len(path), 10)
Example #10
    def load_path(self, path, replay_buffer, obs_dict=None):
        rewards = []
        path_builder = PathBuilder()

        print("loading path, length", len(path["observations"]),
              len(path["actions"]))
        H = min(len(path["observations"]), len(path["actions"]))
        print("actions", np.min(path["actions"]), np.max(path["actions"]))

        for i in range(H):
            if obs_dict:
                ob = path["observations"][i][self.obs_key]
                next_ob = path["next_observations"][i][self.obs_key]
            else:
                ob = path["observations"][i]
                next_ob = path["next_observations"][i]

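            # current_obs keeps a sliding window of stack_obs + 1 frames:
            # the first stack_obs rows form the stacked observation (obs1)
            # and the last stack_obs rows form the stacked next observation (obs2).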
            if i == 0:
                current_obs = np.zeros((self.stack_obs + 1, len(ob)))
                current_obs[-2, :] = ob
                current_obs[-1, :] = next_ob
            else:
                current_obs = np.vstack((current_obs[1:, :], next_ob))
                assert (current_obs[-2, :] == ob
                        ).all(), "mismatch between obs and next_obs"
            obs1 = current_obs[:self.stack_obs, :].flatten()
            obs2 = current_obs[1:, :].flatten()

            action = path["actions"][i]
            reward = path["rewards"][i]
            terminal = path["terminals"][i]
            if not self.load_terminals:
                terminal = np.zeros(terminal.shape)
            agent_info = path["agent_infos"][i]
            env_info = path["env_infos"][i]

            if self.recompute_reward:
                reward = self.env.compute_reward(
                    action,
                    next_ob,
                )

            reward = np.array([reward])
            rewards.append(reward)
            terminal = np.array([terminal]).reshape((1, ))
            path_builder.add_all(
                observations=obs1,
                actions=action,
                rewards=reward,
                next_observations=obs2,
                terminals=terminal,
                agent_infos=agent_info,
                env_infos=env_info,
            )
        self.demo_trajectory_rewards.append(rewards)
        path = path_builder.get_all_stacked()
        replay_buffer.add_path(path)
        print("path sum rewards", sum(rewards), len(rewards))
Example #11
    def _handle_rollout_ending(self):
        """
        Implement anything that needs to happen after every rollout.
        """
        self.replay_buffer.terminate_episode()
        self._n_rollouts_total += 1
        if len(self._current_path_builder) > 0:
            self._exploration_paths.append(self._current_path_builder)
            self._current_path_builder = PathBuilder()
Example #12
    def load_path(self, path, replay_buffer, obs_dict=None):
        rewards = []
        path_builder = PathBuilder()
        H = min(len(path["observations"]), len(path["actions"]))

        if obs_dict:
            traj_obs = self.preprocess(path["observations"])
            next_traj_obs = self.preprocess(path["next_observations"])
        else:
            traj_obs = self.env.encode(path["observations"])
            next_traj_obs = self.env.encode(path["next_observations"])

        for i in range(H):
            ob = traj_obs[i]
            next_ob = next_traj_obs[i]
            action = path["actions"][i]

            # #temp fix#
            # ob['state_desired_goal'] = np.zeros_like(ob['state_desired_goal'])
            # ob['latent_desired_goal'] = np.zeros_like(ob['latent_desired_goal'])

            # next_ob['state_desired_goal'] = np.zeros_like(next_ob['state_desired_goal'])
            # next_ob['latent_desired_goal'] = np.zeros_like(next_ob['latent_desired_goal'])

            # action[3] /= 5
            # #temp fix#

            reward = path["rewards"][i]
            terminal = path["terminals"][i]
            if not self.load_terminals:
                terminal = np.zeros(terminal.shape)
            agent_info = path["agent_infos"][i]
            env_info = path["env_infos"][i]
            if self.reward_fn:
                reward = self.reward_fn(ob, action, next_ob, next_ob)

            reward = np.array([reward]).flatten()
            rewards.append(reward)
            terminal = np.array([terminal]).reshape((1, ))
            path_builder.add_all(
                observations=ob,
                actions=action,
                rewards=reward,
                next_observations=next_ob,
                terminals=terminal,
                agent_infos=agent_info,
                env_infos=env_info,
            )
        self.demo_trajectory_rewards.append(rewards)
        path = path_builder.get_all_stacked()
        replay_buffer.add_path(path)
        print("rewards", np.min(rewards), np.max(rewards))
        print("loading path, length", len(path["observations"]),
              len(path["actions"]))
        print("actions", np.min(path["actions"]), np.max(path["actions"]))
        print("path sum rewards", sum(rewards), len(rewards))
Example #13
    def train_online(self, start_epoch=0):
        self._current_path_builder = PathBuilder()
        observation = self._start_new_rollout()
        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            for _ in range(self.num_env_steps_per_epoch):
                action, agent_info = self._get_action_and_info(observation, )
                if self.render:
                    self.training_env.render()
                next_ob, raw_reward, terminal, env_info = (
                    self.training_env.step(action))
                self._n_env_steps_total += 1
                reward = raw_reward * self.reward_scale
                terminal = np.array([terminal])
                reward = np.array([reward])

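                # Update the neural-process posterior with this transition and
                # prepend its latent representation to the next observation.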
                self.posterior_state = self.neural_process.update_posterior_state(
                    self.posterior_state, observation[self.extra_obs_dim:],
                    action, reward, next_ob)
                next_ob = np.concatenate(
                    [self.get_latent_repr(self.posterior_state), next_ob])

                self._handle_step(
                    observation,
                    action,
                    reward,
                    next_ob,
                    terminal,
                    agent_info=agent_info,
                    env_info=env_info,
                )
                if terminal or len(
                        self._current_path_builder) >= self.max_path_length:
                    self._handle_rollout_ending()
                    observation = self._start_new_rollout()
                else:
                    observation = next_ob

                gt.stamp('sample')
                if epoch >= self.epoch_to_start_training:
                    self._try_to_train()
                gt.stamp('train')

            if epoch >= self.epoch_to_start_training:
                self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch()
Example #14
    def load_path(self, path, replay_buffer, obs_dict=None):
        # Filter data #
        if not self.data_filter_fn(path): return

        rewards = []
        path_builder = PathBuilder()

        print("loading path, length", len(path["observations"]),
              len(path["actions"]))
        H = min(len(path["observations"]), len(path["actions"]))
        print("actions", np.min(path["actions"]), np.max(path["actions"]))

        for i in range(H):
            if obs_dict:
                ob = path["observations"][i][self.obs_key]
                next_ob = path["next_observations"][i][self.obs_key]
            else:
                ob = path["observations"][i]
                next_ob = path["next_observations"][i]
            action = path["actions"][i]
            reward = path["rewards"][i]
            terminal = path["terminals"][i]
            if not self.load_terminals:
                terminal = np.zeros(terminal.shape)
            agent_info = path["agent_infos"][i]
            env_info = path["env_infos"][i]

            if self.recompute_reward:
                reward = self.env.compute_reward(
                    action,
                    next_ob,
                )

            reward = np.array([reward]).flatten()
            rewards.append(reward)
            terminal = np.array([terminal]).reshape((1, ))
            path_builder.add_all(
                observations=ob,
                actions=action,
                rewards=reward,
                next_observations=next_ob,
                terminals=terminal,
                agent_infos=agent_info,
                env_infos=env_info,
            )
        self.demo_trajectory_rewards.append(rewards)
        path = path_builder.get_all_stacked()
        replay_buffer.add_path(path)
        print("path sum rewards", sum(rewards), len(rewards))
Example #15
    def _handle_rollout_ending(self, eval_task=False):
        """
        Implement anything that needs to happen after every rollout.
        """
        if eval_task:
            self.eval_enc_replay_buffer.terminate_episode(self.task_idx)
        else:
            self.replay_buffer.terminate_episode(self.task_idx)
            self.enc_replay_buffer.terminate_episode(self.task_idx)

        self._n_rollouts_total += 1
        if len(self._current_path_builder) > 0:
            self._exploration_paths.append(
                self._current_path_builder.get_all_stacked())
        self._current_path_builder = PathBuilder()
Example #16
    def train(self):
        '''
        meta-training loop
        '''
        self.pretrain()
        params = self.get_epoch_snapshot(-1)
        logger.save_itr_params(-1, params)
        gt.reset()
        gt.set_def_unique(False)
        self._current_path_builder = PathBuilder()

        # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
        for it_ in gt.timed_for(
                range(self.num_iterations),
                save_itrs=True,
        ):
            self._start_epoch(it_)
            self.training_mode(True)
            if it_ == 0:
                print('collecting initial pool of data for train and eval')
                # temp for evaluating
                for idx in self.train_tasks:
                    self.task_idx = idx
                    self.env.reset_task(idx)
                    self.collect_data(self.num_initial_steps, 1, np.inf)
            # Sample data from train tasks.
            for i in range(self.num_tasks_sample):
                idx = np.random.randint(len(self.train_tasks))
                self.task_idx = idx
                self.env.reset_task(idx)
                self.enc_replay_buffer.task_buffers[idx].clear()

                # collect some trajectories with z ~ prior
                if self.num_steps_prior > 0:
                    self.collect_data(self.num_steps_prior, 1, np.inf)
                # collect some trajectories with z ~ posterior
                if self.num_steps_posterior > 0:
                    self.collect_data(self.num_steps_posterior, 1,
                                      self.update_post_train)
                # even if encoder is trained only on samples from the prior, the policy needs to learn to handle z ~ posterior
                if self.num_extra_rl_steps_posterior > 0:
                    self.collect_data(self.num_extra_rl_steps_posterior,
                                      1,
                                      self.update_post_train,
                                      add_to_enc_buffer=False)

            # Sample train tasks and compute gradient updates on parameters.
            for train_step in range(self.num_train_steps_per_itr):
                indices = np.random.choice(self.train_tasks, self.meta_batch)
                self._do_training(indices)
                self._n_train_steps_total += 1
            gt.stamp('train')

            self.training_mode(False)

            # eval
            self._try_to_eval(it_)
            gt.stamp('eval')

            self._end_epoch()
Example #17
    def _handle_rollout_ending(self):
        self._n_rollouts_total += 1
        if len(self._current_path_builder) > 0:
            path = self._current_path_builder.get_all_stacked()
            self.replay_buffer.add_path(path)
            self._exploration_paths.append(path)
            self._current_path_builder = PathBuilder()
Example #18
    def train_online(self, start_epoch=0):
        self._current_path_builder = PathBuilder()
        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            set_to_train_mode(self.training_env)
            observation = self._start_new_rollout()
            for _ in range(self.num_env_steps_per_epoch):
                observation = self._take_step_in_env(observation)
                gt.stamp('sample')

                self._try_to_fit(epoch)
                gt.stamp('env_fit')

                self._try_to_train()
                gt.stamp('train')

            self.logger.record_tabular(self.env_loss_key, self.env_loss)

            set_to_eval_mode(self.env)
            self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch(epoch)

            self.logger.dump_tabular(with_prefix=False, with_timestamp=False)
Example #19
    def collect_new_steps(
            self,
            max_path_length,
            num_steps,
            discard_incomplete_paths,
            random=False,
    ):
        steps_collector = PathBuilder()
        for _ in range(num_steps):
            self.collect_one_step(
                max_path_length,
                discard_incomplete_paths,
                steps_collector,
                random,
            )
        return [steps_collector.get_all_stacked()]
Example #20
    def train_batch(self, start_epoch):
        self._current_path_builder = PathBuilder()
        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            set_to_train_mode(self.training_env)
            observation = self._start_new_rollout()
            # This implementation is rather naive. If you want to (e.g.)
            # parallelize data collection, this would be the place to do it.
            for _ in range(self.num_env_steps_per_epoch):
                observation = self._take_step_in_env(observation)
            gt.stamp('sample')

            self._try_to_train()
            gt.stamp('train')

            set_to_eval_mode(self.env)
            self._try_to_eval(epoch)
            gt.stamp('eval')

            self._try_to_fit(epoch)
            gt.stamp('env_fit')
            self.logger.record_tabular(self.env_loss_key, self.env_loss)

            self._end_epoch(epoch)
            self.logger.dump_tabular(with_prefix=False, with_timestamp=False)
Example #21
    def train_online(self, start_epoch=0):
        self._current_path_builder = PathBuilder()

        observation = self._start_new_rollout()
        self.sample_z = self.sample_z_vec()
        observation = np.concatenate([observation, self.sample_z])

        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            set_to_train_mode(self.training_env)
            for t in range(self.num_env_steps_per_epoch):
                #print("step", t, "pool", self.replay_buffer.num_steps_can_sample())

                observation = self._take_step_in_env(observation)
                gt.stamp('sample')

                self._try_to_train()
                gt.stamp('train')

            set_to_eval_mode(self.env)
            self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch(epoch)
Example #22
    def _handle_rollout_ending(self):
        self._n_rollouts_total += 1
        if len(self._current_path_builder) > 0:
            path = self._current_path_builder.get_all_stacked()
            # self.env.update_rewards(path)
            self.replay_buffer.add_path(path)
            self._exploration_paths.append(path)  # unneeded, wastes memory
            self._current_path_builder = PathBuilder()
Example #23
def rollout_path(env, task_params, obs_task_params, post_cond_policy, max_path_length, task_idx):
    cur_eval_path_builder = PathBuilder()
    
    # reset the env using the params
    observation = env.reset(task_params=task_params, obs_task_params=obs_task_params)
    terminal = False
    task_identifier = env.task_identifier

    while (not terminal) and len(cur_eval_path_builder) < max_path_length:
        agent_obs = observation['obs']
        action, agent_info = post_cond_policy.get_action(agent_obs)
        
        next_ob, raw_reward, terminal, env_info = (env.step(action))
        # img = env.render(mode='rgb_array', width=200, height=200)
        if len(cur_eval_path_builder) % 10 == 0:
            # img = env.render(mode='rgb_array')

            env._wrapped_env._get_viewer('rgb_array').render(200, 200, camera_id=0)
            # window size used for old mujoco-py:
            data = env._wrapped_env._get_viewer('rgb_array').read_pixels(200, 200, depth=False)
            # original image is upside-down, so flip it
            img = data[::-1, :, :]
            imsave('plots/walker_irl_frames/walker_task_%02d_step_%03d.png' % (task_idx, len(cur_eval_path_builder)), img)
        terminal = False

        # print(env_info['l2_dist'])
        # print('{}: {}'.format(agent_obs[-3:], env_info['l2_dist']))
        # print(agent_obs)
        # print(env_info['l2_dist'])
        
        reward = raw_reward
        terminal = np.array([terminal])
        reward = np.array([reward])
        cur_eval_path_builder.add_all(
            observations=observation,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
            task_identifiers=task_identifier
        )
        observation = next_ob

    return cur_eval_path_builder.get_all_stacked()
Example #24
    def train_online(self, start_epoch=0):
        self._current_path_builder = PathBuilder()
        observation = self._start_new_rollout()
        #observation = self.concat_state_z(state, self.curr_z)

        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            for _ in range(self.num_env_steps_per_epoch):
                # TODO: append the latent variable here
                action, agent_info = self._get_action_and_info(observation, )
                if self.render:
                    self.training_env.render()

                next_state, raw_reward, terminal, env_info = (
                    self.training_env.step(action))

                # print (terminal)
                next_ob = self.concat_state_z(next_state, self.curr_z)
                self._n_env_steps_total += 1
                reward = raw_reward * self.reward_scale
                terminal = np.array([terminal])
                reward = np.array([reward])
                self._handle_step(
                    observation,
                    action,
                    reward,
                    next_ob,
                    terminal,
                    agent_info=agent_info,
                    env_info=env_info,
                )
                if terminal or len(
                        self._current_path_builder) >= self.max_path_length:
                    self._handle_rollout_ending()
                    observation = self._start_new_rollout()
                    #print ('starting new rollout')
                else:
                    observation = next_ob

                gt.stamp('sample')
                self._try_to_train()
                gt.stamp('train')
Example #25
    def train_online(self, start_epoch=0):
        self._current_path_builder = PathBuilder()
        observation = self._start_new_rollout()
        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            for _ in range(self.num_env_steps_per_epoch):
                observation = self._take_step_in_env(observation)

                gt.stamp('sample')
                self._try_to_train()
                gt.stamp('train')

            self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch(epoch)
Example #26
def rollout_path(env, task_params, obs_task_params, post_cond_policy, max_path_length):
    cur_eval_path_builder = PathBuilder()
    within_correct = False
    within_incorrect = False
    
    # reset the env using the params
    observation = env.reset(task_params=task_params, obs_task_params=obs_task_params)
    terminal = False
    task_identifier = env.task_identifier

    while (not terminal) and len(cur_eval_path_builder) < max_path_length:
        agent_obs = observation['obs']
        action, agent_info = post_cond_policy.get_action(agent_obs)
        
        next_ob, raw_reward, terminal, env_info = (env.step(action))
        terminal = False

        # print(env_info['l2_dist'])
        # print('{}: {}'.format(agent_obs[-3:], env_info['l2_dist']))
        # print(agent_obs)
        # print(env_info['l2_dist'])
        
        reward = raw_reward
        terminal = np.array([terminal])
        reward = np.array([reward])
        cur_eval_path_builder.add_all(
            observations=observation,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
            task_identifiers=task_identifier
        )
        observation = next_ob

        if env_info['within_radius_of_correct']:
            within_correct = True
        if env_info['within_radius_of_incorrect']:
            within_incorrect = True

    return within_correct, within_incorrect
Example #27
    def train_online(self, start_epoch=0):
        self._current_path_builder = PathBuilder()
        observation = self._start_new_rollout()
        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            set_to_train_mode(self.training_env)

            self.training_mode(True)
            processes = []

            gt.stamp('sample')

            if self._can_train():
                ctx = mp.get_context("spawn")

                for net in self.networks:
                    #    net.cuda()
                    net.share_memory()

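                # Spawn one training process per head; the processes update the
                # shared-memory networks while this process keeps stepping the env.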
                for rank in range(0, self.heads):
                    p = ctx.Process(target=self.train_head, args=(rank, ))
                    p.start()
                    processes.append(p)

            for step in range(self.num_env_steps_per_epoch):
                observation = self._take_step_in_env(observation)
                #self._try_to_train()

            for p in processes:
                p.join()
            gt.stamp('train')
            self.training_mode(False)
            self._n_train_steps_total += self.num_env_steps_per_epoch

            self.current_behavior_policy = np.random.randint(self.heads)
            set_to_eval_mode(self.env)
            self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch(epoch)
Example #28
    def _handle_rollout_ending(self, env=None):
        """
        Implement anything that needs to happen after every rollout.
        """
        # WARNING: terminate_episode does nothing, so it is not adapted to farming
        self.replay_buffer.terminate_episode()
        self._n_rollouts_total += 1
        if not self.environment_farming:
            if len(self._current_path_builder) > 0:
                self._exploration_paths.append(
                    self._current_path_builder.get_all_stacked())
                self._current_path_builder = PathBuilder()
        elif env:
            _current_path_builder = env.get_current_path_builder()
            if _current_path_builder is None:
                raise ValueError(
                    '_handle_rollout_ending: env object should have a '
                    'current_path_builder field!')
            self._exploration_paths.append(
                _current_path_builder.get_all_stacked())
            env.newPathBuilder()
        else:
            raise ValueError(
                '_handle_rollout_ending: an env object must be given to this '
                'function in farming mode!')
Example #29
    def train(self):
        '''
        meta-training loop
        '''
        self.pretrain()
        gt.reset()
        gt.set_def_unique(False)
        self._current_path_builder = PathBuilder()

        # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
        for it_ in gt.timed_for(
                range(self.num_iterations),
                save_itrs=True,
        ):
            self._start_epoch(it_)
            self.training_mode(True)

            # Sample train tasks and compute gradient updates on parameters.
            batch_idxes = np.random.randint(0,
                                            len(self.train_goals),
                                            size=self.meta_batch_size)
            train_batch_obj_id = self.replay_buffers.sample_training_data(
                batch_idxes, self.use_same_context)
            for _ in range(self.num_train_steps_per_itr):
                train_raw_batch = ray.get(train_batch_obj_id)
                gt.stamp('sample_training_data', unique=False)

                batch_idxes = np.random.randint(0,
                                                len(self.train_goals),
                                                size=self.meta_batch_size)
                # In this way, we can start the data sampling job for the
                # next training while doing training for the current loop.
                train_batch_obj_id = self.replay_buffers.sample_training_data(
                    batch_idxes, self.use_same_context)
                gt.stamp('set_up_sampling', unique=False)

                train_data = self.construct_training_batch(train_raw_batch)
                gt.stamp('construct_training_batch', unique=False)

                self._do_training(train_data)
                self._n_train_steps_total += 1
            gt.stamp('train')

            self.training_mode(False)

            # eval
            self._try_to_eval(it_)
            gt.stamp('eval')

            self._end_epoch()
            if it_ == self.num_iterations - 1:
                logger.save_itr_params(it_, self.agent.get_snapshot())
Example #30
    def load_path(self, path, replay_buffer):
        rewards = []
        path_builder = PathBuilder()

        print("loading path, length", len(path["observations"]),
              len(path["actions"]))
        H = min(len(path["observations"]), len(path["actions"]))
        print("actions", np.min(path["actions"]), np.max(path["actions"]))

        for i in range(H):
            ob = path["observations"][i]
            action = path["actions"][i]
            reward = path["rewards"][i]
            next_ob = path["next_observations"][i]
            terminal = path["terminals"][i]
            agent_info = path["agent_infos"][i]
            env_info = path["env_infos"][i]

            if self.recompute_reward:
                reward = self.env.compute_reward(
                    action,
                    next_ob,
                )

            reward = np.array([reward])
            rewards.append(reward)
            terminal = np.array([terminal]).reshape((1, ))
            path_builder.add_all(
                observations=ob,
                actions=action,
                rewards=reward,
                next_observations=next_ob,
                terminals=terminal,
                agent_infos=agent_info,
                env_infos=env_info,
            )
        self.demo_trajectory_rewards.append(rewards)
        path = path_builder.get_all_stacked()
        replay_buffer.add_path(path)