Example 1
    def run(self, env, device):
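        # Rollout loop: each iteration samples a batch of tasks, collects episodes
        # with the current policy, logs the average return, and periodically
        # checkpoints the policy and the value baseline.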

        baseline = ch.models.robotics.LinearValue(env.state_size,
                                                  env.action_size)
        policy = DiagNormalPolicy(env.state_size, env.action_size)

        self.log_model(policy, device, input_shape=(1, env.state_size))

        t = trange(self.params['num_iterations'], desc='Iteration', position=0)
        try:
            for iteration in t:

                iter_reward = 0.0

                task_list = env.sample_tasks(self.params['batch_size'])
                for task_i in trange(len(task_list),
                                     leave=False,
                                     desc='Task',
                                     position=0):
                    task = task_list[task_i]
                    env.set_task(task)
                    env.reset()
                    task = Runner(env)

                    episodes = task.run(policy, episodes=params['n_episodes'])
                    task_reward = (episodes.reward().sum().item()
                                   / params['n_episodes'])

                    iter_reward += task_reward

                # Log
                average_return = iter_reward / self.params['batch_size']
                metrics = {'average_return': average_return}

                t.set_postfix(metrics)
                self.log_metrics(metrics)

                if iteration % self.params['save_every'] == 0:
                    self.save_model_checkpoint(policy, str(iteration + 1))
                    self.save_model_checkpoint(
                        baseline, 'baseline_' + str(iteration + 1))

        # Allow training to be safely interrupted manually
        except KeyboardInterrupt:
            print(
                '\nManually stopped training! Starting evaluation & saving...\n')
            self.logger['manually_stopped'] = True
            self.params['num_iterations'] = iteration

        self.save_model(policy)
        self.save_model(baseline, name='baseline')

        self.logger['elapsed_time'] = str(round(t.format_dict['elapsed'],
                                                2)) + ' sec'
        # Evaluate on new test tasks
        policy = MAML(policy, lr=self.params['inner_lr'])
        self.logger['test_reward'] = evaluate_ppo(env_name, policy, baseline,
                                                  params)
        self.log_metrics({'test_reward': self.logger['test_reward']})
        self.save_logs_to_file()
Example 2
def measure_change_through_time(path, env_name, policy, rep_params):
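    # Track representation drift over training: load the saved model checkpoints
    # in order and, on a fixed sanity episode, compare each checkpoint against the
    # initial policy and against the previous checkpoint via CCA similarity.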
    env = make_env(env_name, 1, rep_params['seed'], max_path_length=rep_params['max_path_length'])
    global metrics
    metrics = ['CCA']

    sanity_task = env.sample_tasks(1)

    with torch.no_grad():
        env.set_task(sanity_task[0])
        env.seed(rep_params['seed'])
        env.reset()
        env_task = Runner(env)
        sanity_ep = env_task.run(policy, episodes=1)

    init_change_m = defaultdict(list)
    init_change_v = defaultdict(list)
    adapt_change_m = defaultdict(list)
    adapt_change_v = defaultdict(list)
    checkpoints = path + '/model_checkpoints/'
    i = 0

    file_list = os.listdir(checkpoints)
    file_list = [file for file in file_list if 'baseline' not in file]
    models_list = {}
    for file in file_list:
        n_file = file.split('_')[-1]
        n_file = n_file.split('.')[0]
        n_file = int(n_file)
        models_list[n_file] = f'model_{n_file}.pt'

    prev_policy = policy
    for key in sorted(models_list.keys()):
        model_chckpnt = models_list[key]
        if i > 40:
            break
        i += 1

        print(f'Loading {model_chckpnt} ...')
        chckpnt_policy = DiagNormalPolicy(9, 4)
        chckpnt_policy.load_state_dict(torch.load(os.path.join(checkpoints, model_chckpnt)))
        chckpnt_policy = MAML(chckpnt_policy, lr=rep_params['inner_lr'])

        mean, variance = episode_mean_var(sanity_ep, policy, chckpnt_policy, layer=6)
        a_mean, a_variance = episode_mean_var(sanity_ep, prev_policy, chckpnt_policy, layer=6)
        init_change_m['CCA'] += [mean['CCA']]
        init_change_v['CCA'] += [variance['CCA']]
        adapt_change_m['CCA'] += [a_mean['CCA']]
        adapt_change_v['CCA'] += [a_variance['CCA']]

        prev_policy = chckpnt_policy

    for metric in metrics:
        plot_sim_across_steps(init_change_m[metric], init_change_v[metric], metric=metric,
                              title='Similarity between init and adapted (in %)')

    for metric in metrics:
        difference = [1 - x for x in adapt_change_m[metric]]
        plot_sim_across_steps(difference, adapt_change_v[metric], metric=metric,
                              title='Representation difference after each step (in %)')
Example 3
def sanity_check(env_name, model_1, model_2, rep_params):
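    # Sanity check: roll out two copies of the policy on the same seeded task and
    # assert that they produce identical representations for the same state.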
    # Sample a sanity batch
    env = make_env(env_name, 1, rep_params['seed'], max_path_length=rep_params['max_path_length'])

    env.active_env.random_init = False

    sanity_task = env.sample_tasks(1)

    with torch.no_grad():
        env.set_task(sanity_task[0])
        env.seed(rep_params['seed'])
        env.reset()
        env_task = Runner(env)
        init_sanity_ep = env_task.run(model_1, episodes=1)

        env.set_task(sanity_task[0])
        env.seed(rep_params['seed'])
        env.reset()
        env_task = Runner(env)
        adapt_sanity_ep = env_task.run(model_2, episodes=1)
        env_task.reset()
        adapt_2_sanity_ep = env_task.run(model_2, episodes=1)

        init_san_rew = init_sanity_ep.reward().sum().item()
        adapt_san_rew = adapt_sanity_ep.reward().sum().item()
        adapt_2_san_rew = adapt_2_sanity_ep.reward().sum().item()

        # print(f'Why are these not equal? They should be equal: {init_san_rew}={adapt_san_rew}={adapt_2_san_rew}')
        # assert (init_san_rew == adapt_san_rew), "Environment initial states are random"
        init_sanity_state = init_sanity_ep[0].state

        init_rep_sanity = model_1.get_representation(init_sanity_state)
        init_rep_sanity_2 = model_1.get_representation(init_sanity_state, layer=3)

        adapt_rep_sanity = model_2.get_representation(init_sanity_state)
        adapt_rep_sanity_2 = model_2.get_representation(init_sanity_state, layer=3)

        init_rep_array = init_rep_sanity.detach().numpy()
        init_rep_2_array = init_rep_sanity_2.detach().numpy()
        adapt_rep_array = adapt_rep_sanity.detach().numpy()
        adapt_rep_2_array = adapt_rep_sanity_2.detach().numpy()

        print(f'Are the representations of the two models for the same state identical? '
              f'{np.array_equal(init_rep_array, adapt_rep_array)}')

        assert np.array_equal(init_rep_array, adapt_rep_array), "Representations not identical"
        assert np.array_equal(init_rep_2_array, adapt_rep_2_array), "Representations not identical"
Example 4
    def run(self, env, device):
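        # ANIL-PPO meta-training: per task, clone the policy, fast-adapt it with
        # fast_adapt_ppo, accumulate the returned loss, then take a single Adam
        # step on body + head parameters per iteration.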

        set_device(device)
        baseline = ch.models.robotics.LinearValue(env.state_size,
                                                  env.action_size)

        policy = DiagNormalPolicyANIL(env.state_size, env.action_size,
                                      params['fc_neurons'])
        policy = MAML(policy, lr=self.params['inner_lr'])
        body = policy.body
        head = policy.head

        all_parameters = list(body.parameters()) + list(head.parameters())
        meta_optimizer = torch.optim.Adam(all_parameters,
                                          lr=self.params['outer_lr'])

        self.log_model(policy.body,
                       device,
                       input_shape=(1, env.state_size),
                       name='body')
        self.log_model(policy.head,
                       device,
                       input_shape=(env.action_size, params['fc_neurons']),
                       name='head')

        t = trange(self.params['num_iterations'])
        try:
            for iteration in t:
                meta_optimizer.zero_grad()

                iter_reward = 0.0
                iter_loss = 0.0

                task_list = env.sample_tasks(self.params['meta_batch_size'])

                for task_i in trange(len(task_list),
                                     leave=False,
                                     desc='Task',
                                     position=0):
                    task = task_list[task_i]

                    learner = policy.clone()
                    env.set_task(task)
                    env.reset()
                    task = Runner(env, extra_info=extra_info)

                    # Fast adapt
                    loss, task_rew, task_suc = fast_adapt_ppo(task,
                                                              learner,
                                                              baseline,
                                                              self.params,
                                                              anil=True)

                    # print(f'Task {task_i}: Loss: {loss.item()} | Rew: {task_rew}')
                    iter_reward += task_rew
                    iter_loss += loss

                # Log
                average_return = iter_reward / self.params['meta_batch_size']
                av_loss = iter_loss / self.params['meta_batch_size']
                metrics = {
                    'average_return': average_return,
                    'loss': av_loss.item()
                }

                t.set_postfix(metrics)
                self.log_metrics(metrics)

                # Meta-optimize: back-propagate the accumulated loss and take an optimizer step
                av_loss.backward()
                meta_optimizer.step()

                if iteration % self.params['save_every'] == 0:
                    self.save_model_checkpoint(policy.body,
                                               'body_' + str(iteration + 1))
                    self.save_model_checkpoint(policy.head,
                                               'head_' + str(iteration + 1))
                    self.save_model_checkpoint(
                        baseline, 'baseline_' + str(iteration + 1))

        # Allow training to be safely interrupted manually
        except KeyboardInterrupt:
            print(
                '\nManually stopped training! Starting evaluation & saving...\n')
            self.logger['manually_stopped'] = True
            self.params['num_iterations'] = iteration

        self.save_model(policy.body, name='body')
        self.save_model(policy.head, name='head')
        self.save_model(baseline, name='baseline')

        self.logger['elapsed_time'] = str(round(t.format_dict['elapsed'],
                                                2)) + ' sec'
        # Evaluate on new test tasks
        self.logger['test_reward'] = evaluate_ppo(env_name, policy, baseline,
                                                  eval_params)
        self.log_metrics({'test_reward': self.logger['test_reward']})
        self.save_logs_to_file()
Example 5
    def run(self, env, device):
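        # MAML-TRPO meta-training: per task, adapt a deep copy of the policy with
        # fast_adapt_trpo, collect the replays and adapted learners, then run the
        # TRPO meta-optimization step over the whole batch.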

        set_device(device)
        baseline = ch.models.robotics.LinearValue(env.state_size,
                                                  env.action_size)
        policy = DiagNormalPolicy(env.state_size, env.action_size)

        self.log_model(policy, device, input_shape=(1, env.state_size))

        t = trange(self.params['num_iterations'], desc='Iteration', position=0)
        try:
            for iteration in t:

                iter_loss = 0.0
                iter_reward = 0.0
                # iter_success_per_task = {}
                iter_replays = []
                iter_policies = []

                task_list = env.sample_tasks(self.params['meta_batch_size'])

                for task_i in trange(len(task_list),
                                     leave=False,
                                     desc='Task',
                                     position=0):
                    task = task_list[task_i]
                    # task_id = f'task_{task["task"]}'

                    learner = deepcopy(policy)
                    env.set_task(task)
                    env.reset()
                    task = Runner(env, extra_info=extra_info)

                    # Adapt
                    learner, eval_loss, task_replay, task_rew, task_suc = fast_adapt_trpo(
                        task, learner, baseline, self.params, first_order=True)

                    # Calculate average success rate of support episodes
                    # task_adapt_suc = get_ep_successes(task_replay[0]) / self.params['adapt_batch_size']
                    # iter_success_per_task[task_id + '_adapt'] = task_adapt_suc
                    # iter_success_per_task[task_id] = task_suc
                    iter_reward += task_rew
                    iter_loss += eval_loss.item()
                    iter_replays.append(task_replay)
                    iter_policies.append(learner)

                # Log
                average_return = iter_reward / self.params['meta_batch_size']
                average_loss = iter_loss / self.params['meta_batch_size']
                metrics = {
                    'average_return': average_return,
                    'loss': average_loss
                }
                t.set_postfix(metrics)
                # metrics.update(iter_success_per_task)
                self.log_metrics(metrics)

                # Meta-optimize
                meta_optimize_trpo(self.params, policy, baseline, iter_replays,
                                   iter_policies)

                if iteration % self.params['save_every'] == 0:
                    self.save_model_checkpoint(policy, str(iteration + 1))
                    self.save_model_checkpoint(
                        baseline, 'baseline_' + str(iteration + 1))

        # Allow training to be safely interrupted manually
        except KeyboardInterrupt:
            print(
                '\nManually stopped training! Starting evaluation & saving...\n')
            self.logger['manually_stopped'] = True
            self.params['num_iterations'] = iteration

        self.save_model(policy)
        self.save_model(baseline, name='baseline')

        self.logger['elapsed_time'] = str(round(t.format_dict['elapsed'],
                                                2)) + ' sec'
        # Evaluate on new test tasks
        self.logger['test_reward'] = evaluate_trpo(env_name, policy, baseline,
                                                   eval_params)
        self.log_metrics({'test_reward': self.logger['test_reward']})
        self.save_logs_to_file()
Example 6
def run_rep_rl_exp(path, env_name, policy, baseline, rep_params):
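    # Representation experiment: adapt copies of the policy on sampled tasks and
    # measure how the representation changes across adaptation steps and across
    # layers (CCA-based), then plot the averaged results and save the parameters.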
    global metrics
    metrics = rep_params['metrics']

    rep_path = path + '/rep_exp'
    if not os.path.isdir(rep_path):
        os.mkdir(rep_path)

    # An instance of the model before adaptation
    init_model = deepcopy(policy)
    adapt_model = deepcopy(policy)

    sanity_check(env_name, init_model, adapt_model, rep_params)
    del adapt_model

    # column 0: adaptation results, column 1: init results
    # acc_results = np.zeros((rep_params['n_tasks'], 2))
    # Create a dictionary of layer : results for each metric (e.g. cca_results["0"] = [0.3, 0.2, 0.1])
    cca_results = {str(layer): [] for layer in rep_params['layers']}
    cka_l_results = {str(layer): [] for layer in rep_params['layers']}
    cka_k_results = {str(layer): [] for layer in rep_params['layers']}

    env = make_env(env_name, 1, rep_params['seed'], test=True, max_path_length=rep_params['max_path_length'])
    if rep_params['eval_each_task']:
        tasks = sample_from_each_task(env)
    else:
        tasks = env.sample_tasks(rep_params['n_tasks'])

    # Measure changes (mean and variance) of a specific layer across steps between the initial model and the ith model
    init_mean = defaultdict(list)
    init_var = defaultdict(list)
    # Measure changes (mean and variance) of a specific layer across steps between the (i-1)th model and the ith model
    adapt_mean = defaultdict(list)
    adapt_var = defaultdict(list)

    # Average layer changes across all tasks
    av_layer_changes_mean = defaultdict(list)
    av_layer_changes_std = defaultdict(list)

    for task in tasks:
        print(f'Adapting on Task: {ML10_eval_task_names[task["task"]]}')
        # Sample task
        env.set_task(task)
        env.reset()
        task_i = Runner(env)

        before_adapt_model = deepcopy(policy)  # for step 0: before adapt == init model
        after_adapt_model = deepcopy(policy)

        for step in range(rep_params['adapt_steps']):
            # Adapt the model to support episodes
            adapt_ep = task_i.run(before_adapt_model, episodes=rep_params['adapt_batch_size'])

            if step == 0:
                performance_before = (get_ep_successes(adapt_ep, rep_params['max_path_length'])
                                      / rep_params['adapt_batch_size'])

            if rep_params['algo'] == 'vpg':
                # Calculate loss & fit the value function
                inner_loss = vpg_a2c_loss(adapt_ep, after_adapt_model, baseline, rep_params['gamma'], rep_params['tau'])
                # Adapt model based on the loss
                after_adapt_model.adapt(inner_loss, allow_unused=rep_params['anil'])
            elif rep_params['algo'] == 'ppo':
                # Calculate loss & fit the value function & update the policy
                single_ppo_update(adapt_ep, after_adapt_model, baseline, rep_params, anil=rep_params['anil'])
            else:
                after_adapt_model = trpo_update(adapt_ep, after_adapt_model, baseline,
                                                rep_params['inner_lr'], rep_params['gamma'], rep_params['tau'],
                                                anil=rep_params['anil'])

            performance_after = (get_ep_successes(adapt_ep, rep_params['max_path_length'])
                                 / rep_params['adapt_batch_size'])

            """ ACROSS STEPS """
            i_m_change, i_v_change, a_m_change, a_v_change = change_across_steps(adapt_ep, init_model,
                                                                                 before_adapt_model, after_adapt_model,
                                                                                 step)
            for metric in metrics:
                init_mean[metric] += [i_m_change[metric]]
                init_var[metric] += [i_v_change[metric]]
                adapt_mean[metric] += [a_m_change[metric]]
                adapt_var[metric] += [a_v_change[metric]]

            before_adapt_model = after_adapt_model.clone()

        """ ACROSS LAYERS """
        layer_changes = change_across_layers(rep_params['layers'], adapt_ep, before_adapt_model, after_adapt_model)
        for layer, changes in layer_changes.items():
            av_layer_changes_mean[layer] += [changes[0]['CCA']]
            av_layer_changes_std[layer] += [changes[1]['CCA']]
        print(f'Performance before: {performance_before}\nPerformance after: {performance_after}')

        """ ACROSS LAYERS PER TASK """
        # for metric in metrics:
        #     plot_sim_across_layers(layer_changes, metric)

    """ ACROSS LAYERS AVERAGE """
    for layer, changes in av_layer_changes_mean.items():
        av_layer_changes_mean[layer] = statistics.mean(changes)
        av_layer_changes_std[layer] = statistics.stdev(changes)

    print(av_layer_changes_mean)
    print(av_layer_changes_std)

    plot_sim_across_layers_average(av_layer_changes_mean, av_layer_changes_std,
                                   title='Before / After adaptation on the ML10 test tasks')
    """ ACROSS STEPS """
    # for metric in metrics:
    #     plot_sim_across_steps(init_mean[metric], init_var[metric], metric=metric,
    #                           title='Similarity between init and adapted (in %)')
    #     difference = [1 - x for x in adapt_mean[metric]]
    #     plot_sim_across_steps(difference, adapt_var[metric], metric=metric,
    #                           title='Representation difference after each step (in %)')

    """
    cca_plot = dict(title="CCA Evolution",
                    x_legend="Inner loop steps",
                    y_legend="CCA similarity",
                    y_axis=cca_results,
                    path=path + "/inner_CCA_evolution.png")
    cka_l_plot = dict(title="Linear CKA Evolution",
                      x_legend="Inner loop steps",
                      y_legend="CKA similarity",
                      y_axis=cka_l_results,
                      path=path + "/inner_Linear_CKA_evolution.png")
    cka_k_plot = dict(title="Kernel CKA Evolution",
                      x_legend="Inner loop steps",
                      y_legend="CKA similarity",
                      y_axis=cka_k_results,
                      path=path + "/inner_Kernel_CKA_evolution.png")
    plot_dict(cca_plot, save=True)
    # plot_dict(cka_l_plot, save=True)
    # plot_dict(cka_k_plot, save=True)
    """

    with open(rep_path + '/rep_params.json', 'w') as fp:
        json.dump(rep_params, fp, sort_keys=True, indent=4)

    return 0
Example 7
def run_cl_rl_exp(path,
                  env_name,
                  policy,
                  baseline,
                  cl_params,
                  workers,
                  plots=False,
                  test_on_train=False):
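    # Continual-learning experiment: adapt the policy on each ML10 task in turn,
    # evaluate it on every task after each adaptation, and fill N x N reward and
    # success-rate matrices used to compute the continual-learning metrics.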
    cl_path = path + '/cl_exp'
    if not os.path.isdir(cl_path):
        os.mkdir(cl_path)

    if test_on_train:
        ML10_task_names = ML10_train_task_names
        test = False
    else:
        ML10_task_names = ML10_eval_task_names
        test = True

    n_tasks = len(ML10_task_names)

    env = make_env(env_name,
                   workers,
                   cl_params['seed'],
                   test=test,
                   max_path_length=cl_params['max_path_length'])

    # Matrix R (N x N) of rewards / success rates on task j after training on task i
    # (x_axis = test tasks, y_axis = train tasks)
    rew_matrix = np.zeros((n_tasks, n_tasks))
    suc_matrix = np.zeros((n_tasks, n_tasks))

    # Sample one task from each ML10 task type
    tasks = sample_from_each_task(env)

    rew_adapt_progress = {}
    suc_adapt_progress = {}

    for i, train_task in enumerate(tasks):
        print(
            f'Adapting on Task {i}: {ML10_task_names[train_task["task"]]} '
            f'and goal {train_task["goal"]}',
            end='...')
        learner = deepcopy(policy)
        env.set_task(train_task)
        env.reset()
        task_i = Runner(env, extra_info=cl_params['extra_info'])

        rew_adapt_progress[f'task_{i + 1}'] = {}
        suc_adapt_progress[f'task_{i + 1}'] = {}

        if cl_params['anil']:
            learner.module.turn_off_body_grads()

        for step in range(cl_params['adapt_steps']):
            # Collect adaptation / support episodes
            adapt_ep = task_i.run(learner,
                                  episodes=cl_params['adapt_batch_size'])

            if cl_params['algo'] == 'vpg':
                # Calculate loss & fit the value function
                inner_loss = vpg_a2c_loss(adapt_ep, learner, baseline,
                                          cl_params['gamma'], cl_params['tau'])
                # Adapt model based on the loss
                learner.adapt(inner_loss, allow_unused=cl_params['anil'])

            elif cl_params['algo'] == 'ppo':
                # Calculate loss & fit the value function & update the policy
                single_ppo_update(adapt_ep,
                                  learner,
                                  baseline,
                                  cl_params,
                                  anil=cl_params['anil'])
            else:

                learner = trpo_update(adapt_ep,
                                      learner,
                                      baseline,
                                      cl_params['inner_lr'],
                                      cl_params['gamma'],
                                      cl_params['tau'],
                                      anil=cl_params['anil'],
                                      first_order=True)

            adapt_rew = (adapt_ep.reward().sum().item()
                         / cl_params['adapt_batch_size'])
            adapt_suc_per_ep, _ = get_success_per_ep(
                adapt_ep, cl_params['max_path_length'])
            adapt_suc = (sum(adapt_suc_per_ep.values())
                         / cl_params['adapt_batch_size'])

            rew_adapt_progress[f'task_{i + 1}'][f'step_{step}'] = adapt_rew
            suc_adapt_progress[f'task_{i + 1}'][f'step_{step}'] = adapt_suc

        print('Done!')

        # Evaluate on all tasks
        for j, valid_task in enumerate(tasks):
            print(
                f'\tEvaluating on Task {j}: {ML10_task_names[valid_task["task"]]} '
                f'and goal {valid_task["goal"]}...',
                end='\t')
            evaluator = learner.clone()
            env.set_task(valid_task)
            env.reset()
            task_j = Runner(env, extra_info=cl_params['extra_info'])

            with torch.no_grad():
                eval_ep = task_j.run(evaluator,
                                     episodes=cl_params['eval_batch_size'])
            task_j_reward = (eval_ep.reward().sum().item()
                             / cl_params['eval_batch_size'])
            task_j_success = (get_ep_successes(eval_ep, cl_params['max_path_length'])
                              / cl_params['eval_batch_size'])

            _, success_step = get_success_per_ep(eval_ep,
                                                 cl_params['max_path_length'])

            rew_matrix[i, j] = task_j_reward
            suc_matrix[i, j] = task_j_success
            print(f'Success: {task_j_success * 100}%')

    # Plot matrix results
    if plots:
        plot_task_res(rew_matrix, y_title='Reward')
        plot_task_res(suc_matrix, y_title='Success Rate')

        # Plot adaptation progress
        plot_progress(rew_adapt_progress, y_title='Reward')
        plot_progress(suc_adapt_progress, y_title='Success Rate')

    print(f'Rewards Matrix:\n{rew_matrix}\n')
    print(f'Success rates Matrix:\n{suc_matrix}\n')

    if cl_params['normalize_rewards']:
        norm_rew = preprocessing.normalize(rew_matrix)
        scaler = preprocessing.StandardScaler()
        stand_rew = scaler.fit_transform(rew_matrix)
        print(stand_rew)
        print(norm_rew)
        rew_matrix = norm_rew

    cl_res_rew = calc_cl_metrics(rew_matrix)
    cl_res_suc = calc_cl_metrics(suc_matrix)

    print(f'Metrics based on rewards: {cl_res_rew}')
    print(f'Metrics based on success rates: {cl_res_suc}')

    save_acc_matrix(cl_path, rew_matrix, name='cl_rew_matrix')
    save_acc_matrix(cl_path, suc_matrix, name='cl_suc_matrix')

    with open(cl_path + '/cl_params.json', 'w') as fp:
        json.dump(cl_params, fp, sort_keys=True, indent=4)

    with open(cl_path + '/cl_res_rew.json', 'w') as fp:
        json.dump(cl_res_rew, fp, sort_keys=True, indent=4)

    with open(cl_path + '/cl_res_suc.json', 'w') as fp:
        json.dump(cl_res_suc, fp, sort_keys=True, indent=4)

    return rew_matrix, cl_res_rew, cl_res_suc
Example 8
def evaluate(algo,
             env_name,
             policy,
             baseline,
             params,
             anil,
             render=False,
             test_on_train=False,
             each3=False):
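    # Evaluation: for each sampled task, adapt a copy of the policy with the
    # chosen algorithm (VPG / PPO / TRPO), then measure reward and success rate
    # on freshly collected query episodes.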
    rewards_per_task = defaultdict(list)
    tasks_rewards = []
    tasks_success_rate = []

    if test_on_train:
        ml_task_names = ML10_train_task_names  # Meta-train tasks
    else:
        ml_task_names = ML10_eval_task_names  # Meta-testing tasks

    extra_info = 'ML' in env_name  # if env is metaworld, log success metric
    env = make_env(env_name,
                   1,
                   params['seed'],
                   test=(not test_on_train),
                   max_path_length=params['max_path_length'])

    if each3:
        # Overwrite number of tasks and just sample 3 trials from each task
        eval_task_list = sample_3_from_each_task(env)
    elif isinstance(params['n_tasks'], str):
        eval_task_list = [sample_explicit_task(env, params['n_tasks'])]
    else:
        eval_task_list = env.sample_tasks(params['n_tasks'])

    for i, task in enumerate(eval_task_list):
        learner = deepcopy(policy)
        env.set_task(task)
        env.reset()
        env_task = Runner(env, extra_info=extra_info)

        # Adapt
        if algo == 'vpg':
            _, task_reward, task_suc = fast_adapt_vpg(env_task,
                                                      learner,
                                                      baseline,
                                                      params,
                                                      anil=anil,
                                                      render=render)
        elif algo == 'ppo':
            _, task_reward, task_suc = fast_adapt_ppo(env_task,
                                                      learner,
                                                      baseline,
                                                      params,
                                                      render=render)
        else:
            learner, _, _, task_reward, task_suc = fast_adapt_trpo(
                env_task, learner, baseline, params, anil=anil, render=render)

        # Evaluate
        n_query_episodes = params['adapt_batch_size']
        query_episodes = env_task.run(learner,
                                      episodes=n_query_episodes,
                                      render=render)
        query_rew = query_episodes.reward().sum().item() / n_query_episodes
        query_success_rate = get_ep_successes(
            query_episodes, params['max_path_length']) / n_query_episodes

        tasks_rewards.append(query_rew)
        tasks_success_rate.append(query_success_rate)
        if extra_info:
            print(
                f'Task {i + 1} / {len(eval_task_list)}: {ml_task_names[task["task"]]} task'
                f'\t {query_rew:.1f} rew | {query_success_rate * 100}% success rate'
            )
            rewards_per_task[ml_task_names[task["task"]]] += [
                query_rew, query_success_rate
            ]

    # Average over the tasks actually evaluated (may differ from params['n_tasks'] when each3 is set)
    final_eval_reward = sum(tasks_rewards) / len(eval_task_list)
    final_eval_suc = sum(tasks_success_rate) / len(eval_task_list)

    if 'ML' in env_name:
        return tasks_rewards, final_eval_reward, final_eval_suc, rewards_per_task
    return tasks_rewards, final_eval_reward, final_eval_suc
Example 9
    def run(self, env, device):
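        # ANIL-TRPO meta-training: per task, adapt a deep copy of the policy with
        # fast_adapt_trpo (anil=True), collect replays and learners, then run the
        # TRPO meta-optimization step; body, head and baseline are checkpointed.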

        set_device(device)
        baseline = ch.models.robotics.LinearValue(env.state_size,
                                                  env.action_size)

        policy = DiagNormalPolicyANIL(env.state_size, env.action_size,
                                      params['fc_neurons'])
        policy = MAML(policy, lr=self.params['inner_lr'])

        self.log_model(policy.body,
                       device,
                       input_shape=(1, env.state_size),
                       name='body')
        self.log_model(policy.head,
                       device,
                       input_shape=(env.action_size, params['fc_neurons']),
                       name='head')

        t = trange(self.params['num_iterations'])
        try:
            for iteration in t:

                iter_loss = 0.0
                iter_reward = 0.0
                iter_replays = []
                iter_policies = []

                task_list = env.sample_tasks(self.params['meta_batch_size'])

                for task_i in trange(len(task_list),
                                     leave=False,
                                     desc='Task',
                                     position=0):
                    task = task_list[task_i]

                    learner = deepcopy(policy)
                    env.set_task(task)
                    env.reset()
                    task = Runner(env, extra_info=extra_info)

                    # Fast adapt
                    learner, eval_loss, task_replay, task_rew, task_suc = fast_adapt_trpo(
                        task,
                        learner,
                        baseline,
                        self.params,
                        anil=True,
                        first_order=True)

                    iter_reward += task_rew
                    iter_loss += eval_loss.item()
                    iter_replays.append(task_replay)
                    iter_policies.append(learner)

                # Log
                average_return = iter_reward / self.params['meta_batch_size']
                average_loss = iter_loss / self.params['meta_batch_size']
                metrics = {
                    'average_return': average_return,
                    'loss': average_loss
                }

                t.set_postfix(metrics)
                self.log_metrics(metrics)

                # Meta-optimize
                meta_optimize_trpo(self.params,
                                   policy,
                                   baseline,
                                   iter_replays,
                                   iter_policies,
                                   anil=True)

                if iteration % self.params['save_every'] == 0:
                    self.save_model_checkpoint(policy.body,
                                               'body_' + str(iteration + 1))
                    self.save_model_checkpoint(policy.head,
                                               'head_' + str(iteration + 1))
                    self.save_model_checkpoint(
                        baseline, 'baseline_' + str(iteration + 1))

        # Allow training to be safely interrupted manually
        except KeyboardInterrupt:
            print(
                '\nManually stopped training! Starting evaluation & saving...\n')
            self.logger['manually_stopped'] = True
            self.params['num_iterations'] = iteration

        self.save_model(policy.body, name="body")
        self.save_model(policy.head, name="head")
        self.save_model(baseline, name="baseline")

        self.logger['elapsed_time'] = str(round(t.format_dict['elapsed'],
                                                2)) + ' sec'
        # Evaluate on new test tasks
        self.logger['test_reward'] = evaluate_trpo(env_name, policy, baseline,
                                                   eval_params)
        self.log_metrics({'test_reward': self.logger['test_reward']})
        self.save_logs_to_file()