def gen_rollout(self, obs_task_params, task_params, z, num_rollouts):
        # set up the post cond policy
        z = z.cpu().data.numpy()
        post_cond_policy = PostCondMLPPolicyWrapper(self.main_policy, z)
        post_cond_policy.policy.eval()

        # generate some rollouts
        successes = []
        for roll_num in range(num_rollouts):
            observation = self.env.reset(task_params=task_params, obs_task_params=obs_task_params)
            terminal = False
            timestep = 0
            cur_success = False

            while (not terminal) and timestep < self.alg.max_path_length:
                agent_obs = observation['obs']
                action, agent_info = post_cond_policy.get_action(agent_obs)
                
                next_ob, raw_reward, terminal, env_info = (self.env.step(action))
                if env_info['is_success']: cur_success = True
                if self.alg.no_terminal: terminal = False
                observation = next_ob
                timestep += 1
            
            successes.append(float(cur_success))
        return successes
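
Most of the examples in this listing condition a base policy on a latent code z through PostCondMLPPolicyWrapper, whose definition is not included here. A minimal sketch of what such a wrapper might look like, assuming z is simply concatenated onto each flat observation and the wrapped policy exposes get_action(obs, deterministic=...) returning (action, agent_info):

import numpy as np

class PostCondMLPPolicyWrapper:
    """Sketch only: condition a base MLP policy on a fixed latent code z by
    concatenating z to every observation before querying the policy."""
    def __init__(self, policy, z, deterministic=False):
        self.policy = policy
        self.z = np.asarray(z).flatten()
        self.deterministic = deterministic

    def get_action(self, obs):
        # append the latent code to the raw observation vector
        obs_z = np.concatenate([np.asarray(obs).flatten(), self.z])
        return self.policy.get_action(obs_z, deterministic=self.deterministic)
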
Example #2
    def get_eval_policy(self, task_identifier, mode='meta_test'):
        if self.wrap_absorbing: raise NotImplementedError('wrap absorbing')
        if mode == 'meta_train':
            rb = self.train_context_expert_replay_buffer
        else:
            rb = self.test_context_expert_replay_buffer
        
        eval_context_size = np.random.randint(self.min_context_size, self.max_context_size+1)
        list_of_trajs = rb.sample_trajs_from_task(
            task_identifier,
            eval_context_size\
                if self.few_shot_version else self.num_context_trajs_for_eval,
        )
        # list_of_trajs = rb.sample_trajs_from_task(
        #     task_identifier,
        #     3 if self.few_shot_version else self.num_context_trajs_for_eval,
        # )
        
        if self.use_target_enc:
            enc_to_use = self.target_enc
        else:
            enc_to_use = self.encoder
        
        # temporarily switch the encoder to eval mode and restore it afterwards
        # (use a separate name so the `mode` argument above is not clobbered)
        enc_mode = enc_to_use.training
        enc_to_use.eval()
        post_dist = enc_to_use([list_of_trajs])
        enc_to_use.train(enc_mode)

        z = post_dist.sample()
        # z = post_dist.mean
        z = z.cpu().data.numpy()[0]
        if self.use_target_policy:
            return PostCondMLPPolicyWrapper(self.target_policy, z, deterministic=self.eval_deterministic)
        else:
            return PostCondMLPPolicyWrapper(self.policy, z, deterministic=self.eval_deterministic)
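
The enc_to_use.training / eval() / train(...) pattern above, repeated in several examples below, temporarily puts the encoder in eval mode and restores its previous state afterwards. A small helper, not part of the original code, that factors this pattern out using only the standard torch.nn.Module flags:

from contextlib import contextmanager

@contextmanager
def eval_mode(module):
    # temporarily switch a torch.nn.Module to eval mode and restore the
    # previous training flag on exit
    was_training = module.training
    module.eval()
    try:
        yield module
    finally:
        module.train(was_training)

# hypothetical usage mirroring the pattern above:
# with eval_mode(enc_to_use):
#     post_dist = enc_to_use([list_of_trajs])
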
Example #3
    def get_exploration_policy(self, task_identifier):
        if self.wrap_absorbing: raise NotImplementedError('wrap absorbing')
        # the two branches only differ in how many context trajectories to draw
        if self.few_shot_version:
            this_context_size = np.random.randint(self.min_context_size,
                                                  self.max_context_size + 1)
        else:
            this_context_size = self.num_context_trajs_for_exploration
        list_of_trajs = self.train_context_expert_replay_buffer.sample_trajs_from_task(
            task_identifier,
            this_context_size,
        )
        mask = None

        if self.use_target_enc:
            enc_to_use = self.target_enc
        else:
            enc_to_use = self.encoder

        mode = enc_to_use.training
        enc_to_use.eval()
        post_dist = enc_to_use([list_of_trajs], mask)
        enc_to_use.train(mode)

        z = post_dist.sample()
        # z = post_dist.mean
        z = z.cpu().data.numpy()[0]
        if self.use_target_policy:
            return PostCondMLPPolicyWrapper(self.target_policy, z)
        else:
            return PostCondMLPPolicyWrapper(self.policy, z)
Example #4
    def get_eval_policy(self, task_identifier, mode='meta_test'):
        if task_identifier not in self.context_buffer.task_replay_buffers:
            # generate some rollouts with prior policy
            eval_context_buffer = MetaEnvReplayBuffer(
                self.context_buffer_size_per_task,
                self.training_env,
                policy_uses_pixels=self.policy_uses_pixels,
            )

            n_steps_total = 0
            steps_needed = self.num_context_trajs_for_exploration * self.max_path_length
            task_params = self.training_env.task_id_to_task_params(
                task_identifier)
            obs_task_params = self.training_env.task_id_to_obs_task_params(
                task_identifier)
            while n_steps_total < steps_needed:
                first_obs = self.training_env.reset(
                    task_params=task_params, obs_task_params=obs_task_params)
                task_id = self.training_env.task_identifier

                z = self.prior_dist.sample()
                z = z.cpu().data.numpy()[0]
                post_cond_policy = PostCondMLPPolicyWrapper(
                    self.main_policy, z)

                new_path = rollout(self.training_env,
                                   post_cond_policy,
                                   max_path_length=min(
                                       self.max_path_length + 1,
                                       steps_needed - n_steps_total + 1),
                                   do_not_reset=True,
                                   first_obs=first_obs)
                n_steps_total += len(new_path['observations'])
                eval_context_buffer.add_path(new_path, task_id)

            list_of_trajs = eval_context_buffer.sample_trajs_from_task(
                task_identifier,
                self.num_context_trajs_for_exploration,
                samples_per_traj=self.samples_per_traj)
            mask = None
        else:
            list_of_trajs = self.context_buffer.sample_trajs_from_task(
                task_identifier,
                self.num_context_trajs_for_exploration,
            )
            mask = None

        enc_to_use = self.encoder
        # avoid reusing the `mode` argument name for the encoder's training flag
        enc_mode = enc_to_use.training
        enc_to_use.eval()
        post_dist = enc_to_use([list_of_trajs], mask)
        enc_to_use.train(enc_mode)

        z = post_dist.sample()
        z = z.cpu().data.numpy()[0]
        return PostCondMLPPolicyWrapper(self.main_policy, z)
Example #5
def gather_eval_data(policy, np_encoder, expert_buffer_for_eval_tasks):
    # return all the metrics we would need for evaluating the models
    # for each trajectory we need to know 1) was it successful 2) was it a good reach
    # policy.cuda()
    # np_encoder.cuda()

    policy.eval()
    np_encoder.eval()

    params_sampler = _BaseParamsSampler(random=52269, num_colors=16)
    env = EvalEnv()

    all_statistics = {}
    task_num = 0

    algorithm_all_percent_good_reach = []
    algorithm_all_percent_solved = []
    for task_params, obs_task_params in params_sampler:
        print('\tEvaluating task %d...' % task_num)
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for context_num in range(4):
            print('\t\tTry with new context number %d...' % context_num)
            # get a context
            list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(
                task_id, 1)
            post_dist = np_encoder([list_of_trajs])
            all_good_reach_for_context = [0 for _ in range(20)]
            all_solved_for_context = [0 for _ in range(20)]
            for post_sample_num in range(4):
                z = post_dist.sample()
                z = z.cpu().data.numpy()[0]
                post_cond_policy = PostCondMLPPolicyWrapper(policy, z)
                # reset the env seed
                env.seed(seed=ENV_EVAL_SEED)
                for t in range(20):
                    stacked_path = rollout_path(env, task_params,
                                                obs_task_params,
                                                post_cond_policy)
                    # print(stacked_path['observations'][0])
                    stats = env.log_statistics([stacked_path])
                    if stats['Percent_Good_Reach'] > 0:
                        all_good_reach_for_context[t] = 1.0
                    if stats['Percent_Solved'] > 0:
                        all_solved_for_context[t] = 1.0
                    # paths_for_context_size.append(stacked_path)
            algorithm_all_percent_good_reach.append(
                np.mean(all_good_reach_for_context))
            algorithm_all_percent_solved.append(
                np.mean(all_solved_for_context))
    return {
        'algorithm_all_percent_good_reach': algorithm_all_percent_good_reach,
        'algorithm_all_percent_solved': algorithm_all_percent_solved
    }
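
The rollout_path helper used here (and in many of the examples below) is not shown, and its signature and return fields vary between callers. A sketch matching the simplest usage, assuming the environment returns dict observations with an 'obs' key and a standard gym-style step tuple:

import numpy as np

def rollout_path(env, task_params, obs_task_params, post_cond_policy,
                 max_path_length=100):
    # Sketch only: roll out one episode and stack the fields the callers
    # above read ('observations', 'rewards', 'env_infos').
    observation = env.reset(task_params=task_params,
                            obs_task_params=obs_task_params)
    observations, rewards, env_infos = [], [], []
    for _ in range(max_path_length):
        action, _ = post_cond_policy.get_action(observation['obs'])
        next_ob, reward, terminal, env_info = env.step(action)
        observations.append(observation)
        rewards.append(reward)
        env_infos.append(env_info)
        observation = next_ob
        if terminal:
            break
    return {
        'observations': observations,
        'rewards': np.array(rewards),
        'env_infos': env_infos,
    }
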
    def get_exploration_policy(self, task_identifier):
        list_of_trajs = self.train_context_expert_replay_buffer.sample_trajs_from_task(
            task_identifier,
            self.num_context_trajs_for_exploration,
        )
        post_dist = self.encoder([list_of_trajs])
        # z = post_dist.sample()
        z = post_dist.mean
        z = z.cpu().data.numpy()[0]
        return PostCondMLPPolicyWrapper(self.main_policy, z)

    def get_eval_policy(self, task_identifier, mode='meta_test'):
        if mode == 'meta_train':
            rb = self.train_context_expert_replay_buffer
        else:
            rb = self.test_context_expert_replay_buffer
        list_of_trajs = rb.sample_trajs_from_task(
            task_identifier,
            self.num_context_trajs_for_eval,
        )
        post_dist = self.encoder([list_of_trajs])
        # z = post_dist.sample()
        z = post_dist.mean
        z = z.cpu().data.numpy()[0]
        return PostCondMLPPolicyWrapper(self.main_policy, z)
def gather_eval_data(policy, np_encoder, expert_buffer_for_eval_tasks):
    # return all the metrics we would need for evaluating the models
    # for each trajectory we need to know 1) was it successful 2) was it a good reach
    # policy.cuda()
    # np_encoder.cuda()

    policy.eval()
    np_encoder.eval()

    params_sampler = _BaseParamsSampler(random=52269, num_colors=16)
    env = EvalEnv()

    all_statistics = {}
    task_num = 0
    for task_params, obs_task_params in params_sampler:
        print('\tEvaluating task %d...' % task_num)
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for context_size in range(1, 7):
            print('\t\tEvaluating context size %d...' % context_size)
            paths_for_context_size = []
            for _ in range(3):
                # get a context
                list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(
                    task_id, context_size)
                post_dist = np_encoder([list_of_trajs])

                for _ in range(3):
                    # sample from the posterior and get the PostCondPolicy
                    z = post_dist.sample()
                    z = z.cpu().data.numpy()[0]
                    post_cond_policy = PostCondMLPPolicyWrapper(policy, z)

                    for _ in range(4):
                        stacked_path = rollout_path(env, task_params,
                                                    obs_task_params,
                                                    post_cond_policy)
                        paths_for_context_size.append(stacked_path)

            stats_for_context_size = env.log_statistics(paths_for_context_size)
            all_statistics[context_size] = {
                'Percent_Good_Reach':
                stats_for_context_size['Percent_Good_Reach'],
                'Percent_Solved': stats_for_context_size['Percent_Solved']
            }
    return all_statistics
Example #9
    def get_exploration_policy(self, task_identifier):
        list_of_trajs = self.context_buffer.sample_trajs_from_task(
            task_identifier,
            self.num_context_trajs_for_exploration,
            samples_per_traj=self.samples_per_traj)
        mask = None

        enc_to_use = self.encoder
        mode = enc_to_use.training
        enc_to_use.eval()
        post_dist = enc_to_use([list_of_trajs], mask)
        enc_to_use.train(mode)

        z = post_dist.sample()
        z = z.cpu().data.numpy()[0]
        return PostCondMLPPolicyWrapper(self.main_policy, z)
Example #10
    def pretrain(self):
        print('Generating initial contexts')

        # fill the contexts
        for task_params, obs_task_params in self.train_task_params_sampler:
            print('task')
            n_steps_total = 0
            # print(n_steps_total)
            while n_steps_total < self.context_buffer_size_per_task:
                # print('------')
                # print(n_steps_total)
                # print(self.context_buffer_size_per_task)
                # print(self.max_path_length)

                first_obs = self.training_env.reset(
                    task_params=task_params, obs_task_params=obs_task_params)
                task_id = self.training_env.task_identifier

                z = self.prior_dist.sample()
                z = z.cpu().data.numpy()[0]
                post_cond_policy = PostCondMLPPolicyWrapper(
                    self.main_policy, z)

                new_path = rollout(
                    self.training_env,
                    post_cond_policy,
                    max_path_length=min(
                        self.max_path_length + 1,
                        self.context_buffer_size_per_task - n_steps_total + 1),
                    do_not_reset=True,
                    first_obs=first_obs)
                # print(len(new_path['observations']))
                n_steps_total += len(new_path['observations'])

                if self.add_context_rollouts_to_replay_buffer:
                    self.replay_buffer.add_path(new_path, task_id)
                self.context_buffer.add_path(new_path, task_id)

        print('Generating initial replay buffer rollouts')
        super().pretrain()
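
self.prior_dist is not defined in this listing; the sample-then-index pattern (sample(), then .cpu().data.numpy()[0]) is consistent with a batched Gaussian prior over the latent code, for example (purely an assumption about the real object):

import torch

latent_dim = 8  # hypothetical latent dimensionality
prior_dist = torch.distributions.Normal(
    torch.zeros(1, latent_dim), torch.ones(1, latent_dim))
z = prior_dist.sample()        # tensor of shape (1, latent_dim)
z = z.cpu().data.numpy()[0]    # flat numpy vector, as used above
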
Example #11
def gather_eval_data(alg,
                     num_rollouts_per_context=8,
                     deterministic=True,
                     num_diff_context=1,
                     eval_params_sampler=None,
                     expert_buffer_for_eval_tasks=None,
                     evaluating_expert=False,
                     eval_deterministic=True,
                     eval_no_task_info=False):
    context_sizes = [1]
    if not evaluating_expert:
        alg.encoder.eval()

    all_statistics = {}
    task_num = 0

    # env = alg.env
    env = Walker2DRandomDynamicsEnv()

    _means = []
    _stds = []

    for task_params, obs_task_params in eval_params_sampler:
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_rets = []
        print('\tEvaluating task {}...'.format(obs_task_params))
        print(task_params)
        task_num += 1
        task_id = env.task_identifier

        for context_size in context_sizes:
            _cont_size_dict = {}

            for c_idx in range(num_diff_context):
                if not evaluating_expert:
                    if eval_no_task_info:
                        print('Evaluating with no task information!')
                        new_task_params = {}
                        for k in task_params:
                            new_task_params[k] = np.ones(task_params[k].shape)
                        raise NotImplementedError()
                    else:
                        list_of_trajs = alg.expert_buffer_for_eval_tasks.sample_trajs_from_task(
                            task_id, context_size)
                    alg.encoder.eval()
                    post_dist = alg.encoder([list_of_trajs])
                    z = post_dist.sample()
                    z = z.cpu().data.numpy()[0]
                    # post_cond_policy = PostCondMLPPolicyWrapper(alg.main_policy, z)
                    post_cond_policy = PostCondMLPPolicyWrapper(
                        alg.main_policy, z)
                    post_cond_policy.policy.eval()
                else:
                    # if eval_no_task_info:
                    #     print('Evaluting with no task information!')
                    #     post_cond_policy = alg.get_eval_policy(0.0*np.ones(obs_task_params.shape))
                    # else:
                    #     post_cond_policy = alg.get_eval_policy(np.ones(obs_task_params))

                    # For evaluating a standard walker expert
                    # post_cond_policy = alg.policy
                    # post_cond_policy = alg.eval_policy
                    post_cond_policy = MakeDeterministic(alg.policy)

                post_cond_policy.deterministic = eval_deterministic
                context_returns = []
                for _ in range(num_rollouts_per_context):
                    stacked_path = rollout_path(env, task_params,
                                                obs_task_params,
                                                post_cond_policy,
                                                alg.max_path_length)
                    context_returns.append(np.sum(stacked_path['rewards']))
                task_rets.extend(context_returns)

        all_statistics[task_id] = task_rets
        print('\nReturns: %.4f +/- %.4f' %
              (np.mean(task_rets), np.std(task_rets)))
        _means.append(np.mean(task_rets))
        _stds.append(np.std(task_rets))
    for i in range(len(_means)):
        print('%.4f +/- %.4f' % (_means[i], _stds[i]))
    return all_statistics
def gather_eval_data(
        policy,
        encoder,
        env,
        expert_buffer_for_eval_tasks=None,
        num_diff_context_per_task=8,
        context_size_min=1,
        context_size_max=12,
        num_rollouts_per_context=20,
        deterministic=True,
        params_sampler=None,
    ):
    policy.eval()
    encoder.eval()

    all_success_transitions = []
    all_no_op_transitions = []

    task_num = 0
    for task_params, obs_task_params in params_sampler:
        print('\n\tEvaluating task {}...'.format(task_num))
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for _ in range(num_diff_context_per_task):
            print('new context transition')

            transition_success_rate = []
            transition_no_op_rate = []
            list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(
                task_id,
                context_size_max
            )
            for i in range(context_size_min, context_size_max+1):
                print('next size')
                correct = []
                incorrect = []
                no_op = []

                new_list_of_trajs = list_of_trajs[:i]
                print(len(new_list_of_trajs))
                post_dist = encoder([new_list_of_trajs])
                z = post_dist.mean
                z = z.cpu().data.numpy()[0]

                post_cond_policy = PostCondMLPPolicyWrapper(policy, z)
                post_cond_policy.policy.eval()
                post_cond_policy.deterministic = deterministic
            
                for _ in range(num_rollouts_per_context):
                    max_path_length = 50
                    within_correct, within_incorrect = rollout_path(
                        env,
                        task_params,
                        obs_task_params,
                        post_cond_policy,
                        max_path_length
                    )
                    correct.append(within_correct)
                    incorrect.append(within_incorrect)
                    no_op.append(not (within_correct or within_incorrect))
                
                transition_success_rate.append(np.mean(correct))
                transition_no_op_rate.append(np.mean(no_op))
                # task_rets.append(np.sum(stacked_path['rewards']))
            all_success_transitions.append(transition_success_rate)
            all_no_op_transitions.append(transition_no_op_rate)

            print(transition_success_rate)
            print(transition_no_op_rate)
        
        if task_num == 32: break


        # print('Returns: %.1f +/- %.1f' % (np.mean(task_rets), np.std(task_rets)))
        # all_statistics[task_id] = task_rets
    
    return {
        'all_success_transitions': all_success_transitions,
        'all_no_op_transitions': all_no_op_transitions,
    }
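
A small, hypothetical post-processing step that averages the per-context success and no-op curves returned above across all sampled contexts and tasks:

import numpy as np

def mean_transition_curves(results):
    # results is the dict returned above; each entry is a list of equal-length
    # per-context-size curves
    success = np.mean(np.asarray(results['all_success_transitions']), axis=0)
    no_op = np.mean(np.asarray(results['all_no_op_transitions']), axis=0)
    return success, no_op
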
Example #13
def gather_eval_data(policy, np_encoder, expert_buffer_for_eval_tasks, max_k=8, sample_from_prior=False):
    # return all the metrics we would need for evaluating the models
    # for each trajectory we need to know 1) was it successful 2) was it a good reach
    # policy.cuda()
    # np_encoder.cuda()

    policy.eval()
    np_encoder.eval()

    params_sampler = _BaseParamsSampler(random=52269, num_colors=NUM_EVAL_TASKS)
    env = EvalEnv()

    all_statistics = {}
    task_num = 0

    '''
    algorithm = [tasks]
    task = [contexts]
    context = [post_samples]
    # post samples run on the same set of trajs
    post_samples = [trajs]
    trajs \in {0,1}
    '''
    algorithm_good_reach = []
    algorithm_solved = []
    for task_params, obs_task_params in params_sampler:
        print('\tEvaluating task %d...' % task_num)
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        task_good_reach = []
        task_solved = []

        for context_num in range(4):
            print('\t\tTry with new context, number %d...' % context_num)
            # get a single trajectory context
            list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(
                task_id,
                1
            )
            post_dist = np_encoder([list_of_trajs])
            context_good_reach = []
            context_solved = []

            # evaluate all posterior sample trajs with same initial state
            env_seed = np.random.randint(0, high=10000)

            for post_sample_num in range(1,max_k+1):
                z = post_dist.sample()
                z = z.cpu().data.numpy()[0]
                if sample_from_prior:
                    z = np.random.normal(size=z.shape)
                post_cond_policy = PostCondMLPPolicyWrapper(policy, z)
                # reset the env seed
                env.seed(seed=env_seed)

                post_good_reach = []
                post_solved = []
                for t in range(20):
                    stacked_path = rollout_path(
                        env,
                        task_params,
                        obs_task_params,
                        post_cond_policy
                    )
                    # print(stacked_path['observations'][0])
                    stats = env.log_statistics([stacked_path])
                    if stats['Percent_Good_Reach'] > 0:
                        post_good_reach.append(1.0)
                    else:
                        post_good_reach.append(0.0)
                    
                    if stats['Percent_Solved'] > 0:
                        post_solved.append(1.0)
                    else:
                        post_solved.append(0.0)
                    # paths_for_context_size.append(stacked_path)
                
                context_good_reach.append(post_good_reach)
                context_solved.append(post_solved)
            
            task_good_reach.append(context_good_reach)
            task_solved.append(context_solved)
        
        algorithm_good_reach.append(task_good_reach)
        algorithm_solved.append(task_solved)
    return {
        'algorithm_good_reach': algorithm_good_reach,
        'algorithm_solved': algorithm_solved
    }
def gather_eval_data(policy,
                     np_encoder,
                     expert_buffer_for_eval_tasks,
                     max_context_size=6,
                     sample_from_prior=False):
    # return all the metrics we would need for evaluating the models
    # for each trajectory we need to know 1) was it successful 2) was it a good reach
    # policy.cuda()
    # np_encoder.cuda()

    policy.eval()
    np_encoder.eval()

    params_sampler = _BaseParamsSampler(random=52269,
                                        num_colors=NUM_EVAL_TASKS)
    env = EvalEnv()

    all_statistics = {}
    task_num = 0

    if sample_from_prior: max_context_size = 1

    all_good_reach = defaultdict(list)
    all_solved = defaultdict(list)
    all_no_op_fail = defaultdict(list)
    for task_params, obs_task_params in params_sampler:
        print('\tEvaluating task %d...' % task_num)
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for context_size in range(1, max_context_size + 1):
            print('\t\tEvaluating context size %d...' % context_size)
            paths_for_context_size = []
            for _ in range(NUM_CONTEXT_SAMPLES):
                # get a context
                list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(
                    task_id, context_size)
                post_dist = np_encoder([list_of_trajs])

                for _ in range(NUM_POST_SAMPLES):
                    # sample from the posterior and get the PostCondPolicy
                    # z = post_dist.sample()
                    z = post_dist.mean
                    z = z.cpu().data.numpy()[0]
                    if sample_from_prior:
                        z = np.random.normal(size=z.shape)
                    post_cond_policy = PostCondMLPPolicyWrapper(policy, z)

                    for _ in range(NUM_ROLLOUTS_PER_POST_SAMPLE):
                        stacked_path = rollout_path(env, task_params,
                                                    obs_task_params,
                                                    post_cond_policy)
                        paths_for_context_size.append(stacked_path)

            stats_for_context_size = env.log_statistics(paths_for_context_size)
            all_good_reach[context_size].append(
                stats_for_context_size['Percent_Good_Reach'])
            all_solved[context_size].append(
                stats_for_context_size['Percent_Solved'])
            all_no_op_fail[context_size].append(
                stats_for_context_size['Percent_NoOp_Fail'])
    return {
        'algorithm_good_reach': all_good_reach,
        'algorithm_solved': all_solved,
        'algorithm_no_op_fail': all_no_op_fail
    }
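
Since the function above returns defaultdicts keyed by context size, a small helper (not part of the original code) can reduce them to mean curves over tasks:

import numpy as np

def mean_by_context_size(stats_dict):
    # collapse {context_size: [per-task percentages]} into {context_size: mean}
    return {k: float(np.mean(v)) for k, v in sorted(stats_dict.items())}

# hypothetical usage on the return value above:
# results = gather_eval_data(policy, np_encoder, expert_buffer_for_eval_tasks)
# solved_curve = mean_by_context_size(results['algorithm_solved'])
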
def gather_eval_data(alg,
                     sample_from_prior=False,
                     num_rollouts_per_task=8,
                     context_sizes=[4],
                     deterministic=True,
                     eval_expert=False,
                     just_loading_policy=False,
                     render=False):
    if not eval_expert: alg.encoder.eval()

    all_statistics = {}
    task_num = 0

    params_sampler = EvalParamsSampler()
    if not just_loading_policy:
        env = alg.env
    else:
        env = AntRandDirec2DEnv()

    for task_params, obs_task_params in params_sampler:
        _task_dict = {}
        # print('\tEvaluating task %.4f...' % obs_task_params)
        print('\n\tEvaluating task {}'.format(obs_task_params))
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for context_size in context_sizes:
            _cont_size_dict = {}
            print('\t\tTry with context size: %d...' % context_size)

            # evaluate all posterior sample trajs with same initial state
            env_seed = np.random.randint(0, high=10000)

            if sample_from_prior: raise NotImplementedError
            # z = post_dist.sample()
            # z = z.cpu().data.numpy()[0]
            # if sample_from_prior:
            #     z = np.random.normal(size=z.shape)
            if eval_expert:
                if just_loading_policy:
                    post_cond_policy = PostCondMLPPolicyWrapper(
                        alg, obs_task_params)
                else:
                    post_cond_policy = alg.get_eval_policy(obs_task_params)
            else:
                post_cond_policy = alg.get_eval_policy(task_id,
                                                       mode='meta_test')
            post_cond_policy.policy.eval()
            post_cond_policy.deterministic = deterministic

            # reset the env seed
            env.seed(seed=env_seed)
            _rets = []
            _min_dists = []
            _last_100 = []
            for _ in range(num_rollouts_per_task):
                if just_loading_policy:
                    # max_path_length = 200
                    # max_path_length = 300
                    max_path_length = 100
                else:
                    max_path_length = alg.max_path_length
                stacked_path = rollout_path(env, task_params, obs_task_params,
                                            post_cond_policy, max_path_length,
                                            eval_expert, render)
                obs = np.array(
                    [d['obs'] for d in stacked_path['observations']])

        all_statistics[task_id] = _task_dict
    return all_statistics
def gather_eval_data(
    policy,
    encoder,
    env,
    num_diff_context=4,
    num_rollouts_per_context=4,
    deterministic=True,
    expert_buffer_for_eval_tasks=None,
    params_sampler=None,
    eval_non_meta_policy=False
    ):
    policy.eval()
    if not eval_non_meta_policy:
        encoder.eval()

    all_statistics = {}
    task_num = 0

    for task_params, obs_task_params in params_sampler:
        task_rets = []
        # print('\tEvaluating task %.4f...' % obs_task_params)
        # print('\n\tEvaluating task {}...'.format(obs_task_params))
        print('\n\tEvaluating task {}...'.format(task_num))
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for _ in range(num_diff_context):
            if not eval_non_meta_policy:
                list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(
                    task_id,
                    1
                )
                post_dist = encoder([list_of_trajs])
                z = post_dist.mean
                z = z.cpu().data.numpy()[0]

                post_cond_policy = PostCondMLPPolicyWrapper(policy, z)
                post_cond_policy.policy.eval()
                post_cond_policy.deterministic = deterministic
            else:
                if deterministic:
                    print('DETERMINISTIC')
                    post_cond_policy = MakeDeterministic(policy)
                else:
                    post_cond_policy = policy
            
            for _ in range(num_rollouts_per_context):
                max_path_length = 1000

                stacked_path = rollout_path(
                    env,
                    task_params,
                    obs_task_params,
                    post_cond_policy,
                    max_path_length,
                    task_num
                )
                task_rets.append(np.sum(stacked_path['rewards']))

        print('Returns: %.1f +/- %.1f' % (np.mean(task_rets), np.std(task_rets)))
        all_statistics[task_id] = task_rets
    return all_statistics
def gather_eval_data(alg,
                     sample_from_prior=False,
                     num_rollouts_per_task=8,
                     context_sizes=[4],
                     deterministic=True,
                     num_diff_context=1):
    alg.encoder.eval()

    all_statistics = {}
    task_num = 0

    params_sampler = alg.test_task_params_sampler
    expert_buffer_for_eval_tasks = alg.test_context_expert_replay_buffer
    env = alg.env

    _all_rets = []

    for task_params, obs_task_params in params_sampler:
        _task_dict = {}
        print('\tEvaluating task %.4f...' % obs_task_params)
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for context_size in context_sizes:
            _cont_size_dict = {}
            print('\t\tTry with context size: %d...' % context_size)
            # list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(
            #     task_id,
            #     context_size
            # )

            # # evaluate all posterior sample trajs with same initial state
            # env_seed = np.random.randint(0, high=10000)

            if sample_from_prior: raise NotImplementedError
            # z = post_dist.sample()
            # z = z.cpu().data.numpy()[0]
            # if sample_from_prior:
            #     z = np.random.normal(size=z.shape)

            #
            # post_cond_policy = alg.get_eval_policy(task_id, mode='meta_test')
            # post_cond_policy.policy.eval()
            # post_cond_policy.deterministic = deterministic
            #

            # reset the env seed
            _vels = []
            # _std_vels = []
            _run_costs = []
            _rets = []
            # env.seed(seed=env_seed)

            for c_idx in range(num_diff_context):
                list_of_trajs = alg.test_context_expert_replay_buffer.sample_trajs_from_task(
                    task_id, context_size)
                alg.encoder.eval()
                post_dist = alg.encoder([list_of_trajs])
                z = post_dist.sample()
                z = z.cpu().data.numpy()[0]
                # post_cond_policy = PostCondMLPPolicyWrapper(alg.main_policy, z)
                post_cond_policy = PostCondMLPPolicyWrapper(alg.policy, z)
                post_cond_policy.policy.eval()
                post_cond_policy.deterministic = deterministic
                for _ in range(num_rollouts_per_task):
                    stacked_path = rollout_path(env, task_params,
                                                obs_task_params,
                                                post_cond_policy,
                                                alg.max_path_length)

                    # compute mean vel, return, run cost per traj
                    _vels.extend([d['vel'] for d in stacked_path['env_infos']])
                    # _std_vels.append(np.std([d['vel'] for d in stacked_path['env_infos']]))
                    _run_costs.append(
                        np.sum([
                            d['run_cost'] for d in stacked_path['env_infos']
                        ]))
                    _rets.append(np.sum(stacked_path['rewards']))

            _cont_size_dict['_vels'] = _vels
            # _cont_size_dict['std_vels'] = _std_vels
            _cont_size_dict['run_costs'] = _run_costs
            _cont_size_dict['rets'] = _rets
            _task_dict[context_size] = _cont_size_dict

            print('\t\tVel: %.4f +/- %.4f' % (np.mean(_vels), np.std(_vels)))
            _all_rets.extend(_rets)

        all_statistics[task_id] = _task_dict
    print('\nReturns: %.4f +/- %.4f' % (np.mean(_all_rets), np.std(_all_rets)))
    return all_statistics
Example #18
    def _do_training(self, epoch):
        # sample a mini-batch of tasks
        task_batch = self.train_task_params_sampler.sample_unique(
            self.num_tasks_used_per_update)

        # reset the context buffer for these tasks
        for task_params, obs_task_params in task_batch:
            self.training_env.reset(task_params=task_params,
                                    obs_task_params=obs_task_params)
            task_id = self.training_env.task_identifier
            self.context_buffer.task_replay_buffers[task_id]._size = 0
            self.context_buffer.task_replay_buffers[task_id]._top = 0

        # generate contexts for each task in the minibatch
        for task_params, obs_task_params in task_batch:
            n_steps_total = 0
            while n_steps_total < self.context_buffer_size_per_task:
                first_obs = self.training_env.reset(
                    task_params=task_params, obs_task_params=obs_task_params)
                task_id = self.training_env.task_identifier

                z = self.prior_dist.sample()
                z = z.cpu().data.numpy()[0]
                post_cond_policy = PostCondMLPPolicyWrapper(
                    self.main_policy, z)

                new_path = rollout(
                    self.training_env,
                    post_cond_policy,
                    max_path_length=min(
                        self.max_path_length + 1,
                        self.context_buffer_size_per_task - n_steps_total + 1),
                    do_not_reset=True,
                    first_obs=first_obs)
                n_steps_total += len(new_path['observations'])

                if self.add_context_rollouts_to_replay_buffer:
                    self.replay_buffer.add_path(new_path, task_id)
                self.context_buffer.add_path(new_path, task_id)

        # # generate rollouts using the posteriors
        # for task_params, obs_task_params in task_batch:
        #     n_steps_total = 0
        #     while n_steps_total < self.num_posterior_steps_per_task:
        #         first_obs = self.training_env.reset(task_params=task_params, obs_task_params=obs_task_params)
        #         task_id = self.training_env.task_identifier

        #         post_cond_policy = self.get_posterior_policy(task_id)
        #         new_path = rollout(
        #             self.training_env,
        #             post_cond_policy,
        #             max_path_length=min(self.max_path_length, self.num_context_steps - self.max_path_length),
        #             do_not_reset=True,
        #             first_obs=first_obs
        #         )
        #         n_steps_total += len(new_path['observations'])

        #         self.replay_buffer.add_path(new_path, task_id)

        # now do some training
        for t in range(self.num_update_loops_per_train_call):
            self._do_update(epoch)
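
Resetting the context buffers above by zeroing _size and _top assumes a fixed-size ring-buffer layout underneath. A minimal sketch of such a buffer, an assumption about the real task_replay_buffers entries rather than their actual implementation:

import numpy as np

class SimpleRingBuffer:
    # Sketch: _top is the write index, _size the number of valid entries;
    # zeroing both logically empties the buffer, as done in _do_training above.
    def __init__(self, max_size, obs_dim):
        self._observations = np.zeros((max_size, obs_dim))
        self._max_size = max_size
        self._top = 0
        self._size = 0

    def add(self, obs):
        self._observations[self._top] = obs
        self._top = (self._top + 1) % self._max_size
        self._size = min(self._size + 1, self._max_size)

    def clear(self):
        self._top = 0
        self._size = 0
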
def gather_eval_data(
    alg,
    sample_from_prior=False,
    num_rollouts_per_task=8,
    context_sizes=[4],
    num_diff_context=1,
    deterministic=True,
    eval_expert=False,
    just_loading_policy=False,
    render=False,
    use_separate_expert_buffer=False,
    expert_buffer_for_eval_tasks=None,
):
    if not eval_expert: alg.encoder.eval()

    all_statistics = {}
    task_num = 0

    # params_sampler = alg.test_task_params_sampler
    # params_sampler = alg.train_task_params_sampler
    params_sampler = AntRandGoalExpertTestSampler()
    if not just_loading_policy:
        env = alg.env
    else:
        env = AntRandGoalEnv()

    for task_params, obs_task_params in params_sampler:
        _task_dict = {}
        # print('\tEvaluating task %.4f...' % obs_task_params)
        print('\n\tEvaluating task {}'.format(obs_task_params))
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for context_size in context_sizes:
            _cont_size_dict = {}
            print('\t\tTry with context size: %d...' % context_size)

            # evaluate all posterior sample trajs with same initial state
            env_seed = np.random.randint(0, high=10000)
            # reset the env seed
            env.seed(seed=env_seed)
            _rets = []
            _min_dists = []
            _last_100 = []

            for _ in range(num_diff_context):
                if sample_from_prior: raise NotImplementedError
                # z = post_dist.sample()
                # z = z.cpu().data.numpy()[0]
                # if sample_from_prior:
                #     z = np.random.normal(size=z.shape)
                if eval_expert:
                    if just_loading_policy:
                        post_cond_policy = PostCondMLPPolicyWrapper(
                            alg, obs_task_params)
                    else:
                        post_cond_policy = alg.get_eval_policy(obs_task_params)
                else:
                    if use_separate_expert_buffer:
                        list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(
                            task_id, context_size)
                        post_dist = alg.encoder([list_of_trajs])
                        z = post_dist.mean
                        z = z.cpu().data.numpy()[0]
                        # post_cond_policy = PostCondMLPPolicyWrapper(alg.main_policy, z)
                        post_cond_policy = PostCondMLPPolicyWrapper(
                            alg.policy, z)
                    else:
                        post_cond_policy = alg.get_eval_policy(
                            task_id, mode='meta_test')
                    # post_cond_policy = alg.get_eval_policy(task_id, mode='meta_train')
                post_cond_policy.policy.eval()
                post_cond_policy.deterministic = deterministic

                for _ in range(num_rollouts_per_task):
                    if just_loading_policy:
                        max_path_length = 100
                    else:
                        max_path_length = alg.max_path_length
                    stacked_path = rollout_path(env, task_params,
                                                obs_task_params,
                                                post_cond_policy,
                                                max_path_length, eval_expert,
                                                render)
                    obs = np.array(
                        [d['obs'] for d in stacked_path['observations']])
                    # print(np.max(obs, axis=0))
                    # print(np.min(obs, axis=0))
                    # print(np.mean(obs, axis=0))
                    # print(np.std(obs, axis=0))
                    # print(obs.shape)
                    # print(np.max(obs))
                    # print(np.min(obs))

                    _rets.append(np.sum(stacked_path['rewards']))
                    rew_frw = [
                        d['reward_forward'] for d in stacked_path['env_infos']
                    ]
                    _min_dists.append(-np.max(rew_frw))
                    _last_100.append(np.mean(rew_frw[-100:]))

            _cont_size_dict['rets'] = _rets
            _cont_size_dict['min_dists'] = _min_dists
            _cont_size_dict['last_100'] = _last_100
            _task_dict[context_size] = _cont_size_dict

            print('\t\t\tMin Dist: %.4f +/- %.4f' %
                  (np.mean(_min_dists), np.std(_min_dists)))
            print(_min_dists)

        all_statistics[task_id] = _task_dict
    return all_statistics
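
A hypothetical summary helper for the nested {task_id: {context_size: {'rets': [...]}}} structure returned above:

import numpy as np

def returns_by_context_size(all_statistics):
    # flatten the per-task, per-context-size return lists and report mean/std
    by_size = {}
    for task_stats in all_statistics.values():
        for context_size, d in task_stats.items():
            by_size.setdefault(context_size, []).extend(d['rets'])
    return {k: (float(np.mean(v)), float(np.std(v)))
            for k, v in sorted(by_size.items())}
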