def run(self, env, device):

    baseline = ch.models.robotics.LinearValue(env.state_size, env.action_size)
    policy = DiagNormalPolicy(env.state_size, env.action_size)

    self.log_model(policy, device, input_shape=(1, env.state_size))

    t = trange(self.params['num_iterations'], desc='Iteration', position=0)
    try:
        for iteration in t:
            iter_reward = 0.0

            task_list = env.sample_tasks(self.params['batch_size'])

            for task_i in trange(len(task_list), leave=False, desc='Task', position=0):
                task = task_list[task_i]

                env.set_task(task)
                env.reset()
                task = Runner(env)

                # params is assumed to be a module-level dict of the training script
                episodes = task.run(policy, episodes=params['n_episodes'])
                task_reward = episodes.reward().sum().item() / params['n_episodes']

                iter_reward += task_reward

            # Log
            average_return = iter_reward / self.params['batch_size']
            metrics = {'average_return': average_return}

            t.set_postfix(metrics)
            self.log_metrics(metrics)

            if iteration % self.params['save_every'] == 0:
                self.save_model_checkpoint(policy, str(iteration + 1))
                self.save_model_checkpoint(baseline, 'baseline_' + str(iteration + 1))

    # Support safely interrupting training manually
    except KeyboardInterrupt:
        print('\nManually stopped training! Start evaluation & saving...\n')
        self.logger['manually_stopped'] = True
        self.params['num_iterations'] = iteration

    self.save_model(policy)
    self.save_model(baseline, name='baseline')

    self.logger['elapsed_time'] = str(round(t.format_dict['elapsed'], 2)) + ' sec'

    # Evaluate on new test tasks (env_name and params are assumed to be module-level globals of the script)
    policy = MAML(policy, lr=self.params['inner_lr'])
    self.logger['test_reward'] = evaluate_ppo(env_name, policy, baseline, params)
    self.log_metrics({'test_reward': self.logger['test_reward']})
    self.save_logs_to_file()
def measure_change_through_time(path, env_name, policy, rep_params):
    env = make_env(env_name, 1, rep_params['seed'], max_path_length=rep_params['max_path_length'])

    global metrics
    metrics = ['CCA']

    sanity_task = env.sample_tasks(1)

    with torch.no_grad():
        env.set_task(sanity_task[0])
        env.seed(rep_params['seed'])
        env.reset()
        env_task = Runner(env)
        sanity_ep = env_task.run(policy, episodes=1)

    init_change_m = defaultdict(list)
    init_change_v = defaultdict(list)
    adapt_change_m = defaultdict(list)
    adapt_change_v = defaultdict(list)

    checkpoints = path + '/model_checkpoints/'
    i = 0
    file_list = os.listdir(checkpoints)
    file_list = [file for file in file_list if 'baseline' not in file]

    models_list = {}
    for file in file_list:
        n_file = file.split('_')[-1]
        n_file = n_file.split('.')[0]
        n_file = int(n_file)
        models_list[n_file] = f'model_{n_file}.pt'

    prev_policy = policy
    for key in sorted(models_list.keys()):
        model_chckpnt = models_list[key]
        if i > 40:
            break
        i += 1

        print(f'Loading {model_chckpnt} ...')
        chckpnt_policy = DiagNormalPolicy(9, 4)
        chckpnt_policy.load_state_dict(torch.load(os.path.join(checkpoints, model_chckpnt)))
        chckpnt_policy = MAML(chckpnt_policy, lr=rep_params['inner_lr'])

        mean, variance = episode_mean_var(sanity_ep, policy, chckpnt_policy, layer=6)
        a_mean, a_variance = episode_mean_var(sanity_ep, prev_policy, chckpnt_policy, layer=6)

        init_change_m['CCA'] += [mean['CCA']]
        init_change_v['CCA'] += [variance['CCA']]
        adapt_change_m['CCA'] += [a_mean['CCA']]
        adapt_change_v['CCA'] += [a_variance['CCA']]

        prev_policy = chckpnt_policy

    for metric in metrics:
        plot_sim_across_steps(init_change_m[metric], init_change_v[metric], metric=metric,
                              title='Similarity between init and adapted (in %)')

    for metric in metrics:
        difference = [1 - x for x in adapt_change_m[metric]]
        plot_sim_across_steps(difference, adapt_change_v[metric], metric=metric,
                              title='Representation difference after each step (in %)')
def sanity_check(env_name, model_1, model_2, rep_params):
    # Sample a sanity batch
    env = make_env(env_name, 1, rep_params['seed'], max_path_length=rep_params['max_path_length'])
    env.active_env.random_init = False

    sanity_task = env.sample_tasks(1)

    with torch.no_grad():
        env.set_task(sanity_task[0])
        env.seed(rep_params['seed'])
        env.reset()
        env_task = Runner(env)
        init_sanity_ep = env_task.run(model_1, episodes=1)

        env.set_task(sanity_task[0])
        env.seed(rep_params['seed'])
        env.reset()
        env_task = Runner(env)
        adapt_sanity_ep = env_task.run(model_2, episodes=1)
        env_task.reset()
        adapt_2_sanity_ep = env_task.run(model_2, episodes=1)

        init_san_rew = init_sanity_ep.reward().sum().item()
        adapt_san_rew = adapt_sanity_ep.reward().sum().item()
        adapt_2_san_rew = adapt_2_sanity_ep.reward().sum().item()

        # print(f'Why are these not equal? They should be equal: {init_san_rew}={adapt_san_rew}={adapt_2_san_rew}')
        # assert (init_san_rew == adapt_san_rew), "Environment initial states are random"

        init_sanity_state = init_sanity_ep[0].state

        init_rep_sanity = model_1.get_representation(init_sanity_state)
        init_rep_sanity_2 = model_1.get_representation(init_sanity_state, layer=3)

        adapt_rep_sanity = model_2.get_representation(init_sanity_state)
        adapt_rep_sanity_2 = model_2.get_representation(init_sanity_state, layer=3)

        init_rep_array = init_rep_sanity.detach().numpy()
        init_rep_2_array = init_rep_sanity_2.detach().numpy()
        adapt_rep_array = adapt_rep_sanity.detach().numpy()
        adapt_rep_2_array = adapt_rep_sanity_2.detach().numpy()

        print(f'Are the representations of the two models for the same state identical? '
              f'{np.array_equal(init_rep_array, adapt_rep_array)}')

        assert np.array_equal(init_rep_array, adapt_rep_array), "Representations not identical"
        assert np.array_equal(init_rep_2_array, adapt_rep_2_array), "Representations not identical"
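
# Hypothetical usage sketch (not part of the original experiment scripts): sanity_check expects two
# copies of the *same* un-adapted policy and verifies that, with fixed seeding, both produce identical
# representations for the same state. The checkpoint path below is made up; the loading pattern mirrors
# measure_change_through_time above.
#
#   loaded_policy = DiagNormalPolicy(9, 4)
#   loaded_policy.load_state_dict(torch.load('model_checkpoints/model_10.pt'))  # hypothetical path
#   loaded_policy = MAML(loaded_policy, lr=rep_params['inner_lr'])
#   sanity_check('ML10', deepcopy(loaded_policy), deepcopy(loaded_policy), rep_params)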
def run(self, env, device):

    set_device(device)
    baseline = ch.models.robotics.LinearValue(env.state_size, env.action_size)
    # params, env_name, eval_params and extra_info are assumed to be module-level globals of the training script
    policy = DiagNormalPolicyANIL(env.state_size, env.action_size, params['fc_neurons'])
    policy = MAML(policy, lr=self.params['inner_lr'])

    body = policy.body
    head = policy.head

    all_parameters = list(body.parameters()) + list(head.parameters())
    meta_optimizer = torch.optim.Adam(all_parameters, lr=self.params['outer_lr'])

    self.log_model(policy.body, device, input_shape=(1, env.state_size), name='body')
    self.log_model(policy.head, device, input_shape=(env.action_size, params['fc_neurons']), name='head')

    t = trange(self.params['num_iterations'])
    try:
        for iteration in t:
            meta_optimizer.zero_grad()

            iter_reward = 0.0
            iter_loss = 0.0

            task_list = env.sample_tasks(self.params['meta_batch_size'])

            for task_i in trange(len(task_list), leave=False, desc='Task', position=0):
                task = task_list[task_i]

                learner = policy.clone()
                env.set_task(task)
                env.reset()
                task = Runner(env, extra_info=extra_info)

                # Fast adapt
                loss, task_rew, task_suc = fast_adapt_ppo(task, learner, baseline, self.params, anil=True)

                # print(f'Task {task_i}: Loss: {loss.item()} | Rew: {task_rew}')
                iter_reward += task_rew
                iter_loss += loss

            # Log
            average_return = iter_reward / self.params['meta_batch_size']
            av_loss = iter_loss / self.params['meta_batch_size']
            metrics = {'average_return': average_return, 'loss': av_loss.item()}

            t.set_postfix(metrics)
            self.log_metrics(metrics)

            # Meta-optimize: back-propagate through the accumulated loss and step the outer optimizer
            av_loss.backward()
            meta_optimizer.step()

            if iteration % self.params['save_every'] == 0:
                self.save_model_checkpoint(policy.body, 'body_' + str(iteration + 1))
                self.save_model_checkpoint(policy.head, 'head_' + str(iteration + 1))
                self.save_model_checkpoint(baseline, 'baseline_' + str(iteration + 1))

    # Support safely interrupting training manually
    except KeyboardInterrupt:
        print('\nManually stopped training! Start evaluation & saving...\n')
        self.logger['manually_stopped'] = True
        self.params['num_iterations'] = iteration

    self.save_model(policy.body, name='body')
    self.save_model(policy.head, name='head')
    self.save_model(baseline, name='baseline')

    self.logger['elapsed_time'] = str(round(t.format_dict['elapsed'], 2)) + ' sec'

    # Evaluate on new test tasks
    self.logger['test_reward'] = evaluate_ppo(env_name, policy, baseline, eval_params)
    self.log_metrics({'test_reward': self.logger['test_reward']})
    self.save_logs_to_file()
def run(self, env, device):

    set_device(device)
    baseline = ch.models.robotics.LinearValue(env.state_size, env.action_size)
    policy = DiagNormalPolicy(env.state_size, env.action_size)

    self.log_model(policy, device, input_shape=(1, env.state_size))

    t = trange(self.params['num_iterations'], desc='Iteration', position=0)
    try:
        for iteration in t:
            iter_loss = 0.0
            iter_reward = 0.0
            # iter_success_per_task = {}
            iter_replays = []
            iter_policies = []

            task_list = env.sample_tasks(self.params['meta_batch_size'])

            for task_i in trange(len(task_list), leave=False, desc='Task', position=0):
                task = task_list[task_i]
                # task_id = f'task_{task["task"]}'

                learner = deepcopy(policy)
                env.set_task(task)
                env.reset()
                # extra_info is assumed to be a module-level global of the training script
                task = Runner(env, extra_info=extra_info)

                # Adapt
                learner, eval_loss, task_replay, task_rew, task_suc = fast_adapt_trpo(
                    task, learner, baseline, self.params, first_order=True)

                # Calculate average success rate of support episodes
                # task_adapt_suc = get_ep_successes(task_replay[0]) / self.params['adapt_batch_size']
                # iter_success_per_task[task_id + '_adapt'] = task_adapt_suc
                # iter_success_per_task[task_id] = task_suc

                iter_reward += task_rew
                iter_loss += eval_loss.item()
                iter_replays.append(task_replay)
                iter_policies.append(learner)

            # Log
            average_return = iter_reward / self.params['meta_batch_size']
            average_loss = iter_loss / self.params['meta_batch_size']
            metrics = {'average_return': average_return, 'loss': average_loss}

            t.set_postfix(metrics)
            # metrics.update(iter_success_per_task)
            self.log_metrics(metrics)

            # Meta-optimize
            meta_optimize_trpo(self.params, policy, baseline, iter_replays, iter_policies)

            if iteration % self.params['save_every'] == 0:
                self.save_model_checkpoint(policy, str(iteration + 1))
                self.save_model_checkpoint(baseline, 'baseline_' + str(iteration + 1))

    # Support safely interrupting training manually
    except KeyboardInterrupt:
        print('\nManually stopped training! Start evaluation & saving...\n')
        self.logger['manually_stopped'] = True
        self.params['num_iterations'] = iteration

    self.save_model(policy)
    self.save_model(baseline, name='baseline')

    self.logger['elapsed_time'] = str(round(t.format_dict['elapsed'], 2)) + ' sec'

    # Evaluate on new test tasks (env_name and eval_params are assumed to be module-level globals)
    self.logger['test_reward'] = evaluate_trpo(env_name, policy, baseline, eval_params)
    self.log_metrics({'test_reward': self.logger['test_reward']})
    self.save_logs_to_file()
def run_rep_rl_exp(path, env_name, policy, baseline, rep_params):
    global metrics
    metrics = rep_params['metrics']

    rep_path = path + '/rep_exp'
    if not os.path.isdir(rep_path):
        os.mkdir(rep_path)

    # An instance of the model before adaptation
    init_model = deepcopy(policy)
    adapt_model = deepcopy(policy)

    sanity_check(env_name, init_model, adapt_model, rep_params)
    del adapt_model

    # column 0: adaptation results, column 1: init results
    # acc_results = np.zeros((rep_params['n_tasks'], 2))
    # Create a dictionary of layer : results for each metric (e.g. cca_results["0"] = [0.3, 0.2, 0.1])
    cca_results = {str(layer): [] for layer in rep_params['layers']}
    cka_l_results = {str(layer): [] for layer in rep_params['layers']}
    cka_k_results = {str(layer): [] for layer in rep_params['layers']}

    env = make_env(env_name, 1, rep_params['seed'], test=True, max_path_length=rep_params['max_path_length'])

    if rep_params['eval_each_task']:
        tasks = sample_from_each_task(env)
    else:
        tasks = env.sample_tasks(rep_params['n_tasks'])

    # Measure changes (mean and variance) of a specific layer across steps between the initial model and the i-th model
    init_mean = defaultdict(list)
    init_var = defaultdict(list)
    # Measure changes (mean and variance) of a specific layer across steps between the (i-1)-th model and the i-th model
    adapt_mean = defaultdict(list)
    adapt_var = defaultdict(list)
    # Average layer changes across all tasks
    av_layer_changes_mean = defaultdict(list)
    av_layer_changes_std = defaultdict(list)

    for task in tasks:
        print(f'Adapting on Task: {ML10_eval_task_names[task["task"]]}')

        # Sample task
        env.set_task(task)
        env.reset()
        task_i = Runner(env)

        before_adapt_model = deepcopy(policy)  # for step 0: before adapt == init model
        after_adapt_model = deepcopy(policy)

        for step in range(rep_params['adapt_steps']):
            # Adapt the model to support episodes
            adapt_ep = task_i.run(before_adapt_model, episodes=rep_params['adapt_batch_size'])

            if step == 0:
                performance_before = get_ep_successes(adapt_ep, rep_params['max_path_length']) / rep_params['adapt_batch_size']

            if rep_params['algo'] == 'vpg':
                # Calculate loss & fit the value function
                inner_loss = vpg_a2c_loss(adapt_ep, after_adapt_model, baseline, rep_params['gamma'], rep_params['tau'])
                # Adapt model based on the loss
                after_adapt_model.adapt(inner_loss, allow_unused=rep_params['anil'])
            elif rep_params['algo'] == 'ppo':
                # Calculate loss & fit the value function & update the policy
                single_ppo_update(adapt_ep, after_adapt_model, baseline, rep_params, anil=rep_params['anil'])
            else:
                after_adapt_model = trpo_update(adapt_ep, after_adapt_model, baseline,
                                                rep_params['inner_lr'], rep_params['gamma'], rep_params['tau'],
                                                anil=rep_params['anil'])

            performance_after = get_ep_successes(adapt_ep, rep_params['max_path_length']) / rep_params['adapt_batch_size']

            """ ACROSS STEPS """
            i_m_change, i_v_change, a_m_change, a_v_change = change_across_steps(adapt_ep, init_model,
                                                                                 before_adapt_model,
                                                                                 after_adapt_model, step)
            for metric in metrics:
                init_mean[metric] += [i_m_change[metric]]
                init_var[metric] += [i_v_change[metric]]
                adapt_mean[metric] += [a_m_change[metric]]
                adapt_var[metric] += [a_v_change[metric]]

            before_adapt_model = after_adapt_model.clone()

        """ ACROSS LAYERS """
        layer_changes = change_across_layers(rep_params['layers'], adapt_ep, before_adapt_model, after_adapt_model)
        for layer, changes in layer_changes.items():
            av_layer_changes_mean[layer] += [changes[0]['CCA']]
            av_layer_changes_std[layer] += [changes[1]['CCA']]

        print(f'Performance before: {performance_before}\nPerformance after: {performance_after}')

        """ ACROSS LAYERS PER TASK """
        # for metric in metrics:
        #     plot_sim_across_layers(layer_changes, metric)

    """ ACROSS LAYERS AVERAGE """
    for layer, changes in av_layer_changes_mean.items():
        av_layer_changes_mean[layer] = statistics.mean(changes)
        av_layer_changes_std[layer] = statistics.stdev(changes)

    print(av_layer_changes_mean)
    print(av_layer_changes_std)

    plot_sim_across_layers_average(av_layer_changes_mean, av_layer_changes_std,
                                   title='Before / After adaptation on the ML10 test tasks')

    """ ACROSS STEPS """
    # for metric in metrics:
    #     plot_sim_across_steps(init_mean[metric], init_var[metric], metric=metric,
    #                           title='Similarity between init and adapted (in %)')
    #     difference = [1 - x for x in adapt_mean[metric]]
    #     plot_sim_across_steps(difference, adapt_var[metric], metric=metric,
    #                           title='Representation difference after each step (in %)')

    """
    cca_plot = dict(title="CCA Evolution", x_legend="Inner loop steps", y_legend="CCA similarity",
                    y_axis=cca_results, path=path + "/inner_CCA_evolution.png")
    cka_l_plot = dict(title="Linear CKA Evolution", x_legend="Inner loop steps", y_legend="CKA similarity",
                      y_axis=cka_l_results, path=path + "/inner_Linear_CKA_evolution.png")
    cka_k_plot = dict(title="Kernel CKA Evolution", x_legend="Inner loop steps", y_legend="CKA similarity",
                      y_axis=cka_k_results, path=path + "/inner_Kernel_CKA_evolution.png")
    plot_dict(cca_plot, save=True)
    # plot_dict(cka_l_plot, save=True)
    # plot_dict(cka_k_plot, save=True)
    """

    with open(rep_path + '/rep_params.json', 'w') as fp:
        json.dump(rep_params, fp, sort_keys=True, indent=4)

    return 0
def run_cl_rl_exp(path, env_name, policy, baseline, cl_params, workers, plots=False, test_on_train=False):
    cl_path = path + '/cl_exp'
    if not os.path.isdir(cl_path):
        os.mkdir(cl_path)

    if test_on_train:
        ML10_task_names = ML10_train_task_names
        test = False
    else:
        ML10_task_names = ML10_eval_task_names
        test = True

    n_tasks = len(ML10_task_names)

    env = make_env(env_name, workers, cl_params['seed'], test=test, max_path_length=cl_params['max_path_length'])

    # Matrix R (N x N) of rewards / success rates on task j after adapting to task i
    # (x-axis = test tasks, y-axis = train tasks)
    rew_matrix = np.zeros((n_tasks, n_tasks))
    suc_matrix = np.zeros((n_tasks, n_tasks))

    # Sample one task configuration from each task
    tasks = sample_from_each_task(env)

    rew_adapt_progress = {}
    suc_adapt_progress = {}

    for i, train_task in enumerate(tasks):
        print(f'Adapting on Task {i}: {ML10_task_names[train_task["task"]]} '
              f'and goal {train_task["goal"]}', end='...')

        learner = deepcopy(policy)
        env.set_task(train_task)
        env.reset()
        task_i = Runner(env, extra_info=cl_params['extra_info'])

        rew_adapt_progress[f'task_{i + 1}'] = {}
        suc_adapt_progress[f'task_{i + 1}'] = {}

        if cl_params['anil']:
            learner.module.turn_off_body_grads()

        for step in range(cl_params['adapt_steps']):
            # Collect adaptation / support episodes
            adapt_ep = task_i.run(learner, episodes=cl_params['adapt_batch_size'])

            if cl_params['algo'] == 'vpg':
                # Calculate loss & fit the value function
                inner_loss = vpg_a2c_loss(adapt_ep, learner, baseline, cl_params['gamma'], cl_params['tau'])
                # Adapt model based on the loss
                learner.adapt(inner_loss, allow_unused=cl_params['anil'])
            elif cl_params['algo'] == 'ppo':
                # Calculate loss & fit the value function & update the policy
                single_ppo_update(adapt_ep, learner, baseline, cl_params, anil=cl_params['anil'])
            else:
                learner = trpo_update(adapt_ep, learner, baseline,
                                      cl_params['inner_lr'], cl_params['gamma'], cl_params['tau'],
                                      anil=cl_params['anil'], first_order=True)

            adapt_rew = adapt_ep.reward().sum().item() / cl_params['adapt_batch_size']
            adapt_suc_per_ep, _ = get_success_per_ep(adapt_ep, cl_params['max_path_length'])
            adapt_suc = sum(adapt_suc_per_ep.values()) / cl_params['adapt_batch_size']

            rew_adapt_progress[f'task_{i + 1}'][f'step_{step}'] = adapt_rew
            suc_adapt_progress[f'task_{i + 1}'][f'step_{step}'] = adapt_suc

        print('Done!')

        # Evaluate on all tasks
        for j, valid_task in enumerate(tasks):
            print(f'\tEvaluating on Task {j}: {ML10_task_names[valid_task["task"]]} '
                  f'and goal {valid_task["goal"]}...', end='\t')

            evaluator = learner.clone()
            env.set_task(valid_task)
            env.reset()
            task_j = Runner(env, extra_info=cl_params['extra_info'])

            with torch.no_grad():
                eval_ep = task_j.run(evaluator, episodes=cl_params['eval_batch_size'])

            task_j_reward = eval_ep.reward().sum().item() / cl_params['eval_batch_size']
            task_j_success = get_ep_successes(eval_ep, cl_params['max_path_length']) / cl_params['eval_batch_size']
            _, success_step = get_success_per_ep(eval_ep, cl_params['max_path_length'])

            rew_matrix[i, j] = task_j_reward
            suc_matrix[i, j] = task_j_success
            print(f'Success: {task_j_success * 100}%')

    # Plot matrix results
    if plots:
        plot_task_res(rew_matrix, y_title='Reward')
        plot_task_res(suc_matrix, y_title='Success Rate')

        # Plot adaptation progress
        plot_progress(rew_adapt_progress, y_title='Reward')
        plot_progress(suc_adapt_progress, y_title='Success Rate')

    print(f'Rewards Matrix:\n{rew_matrix}\n')
    print(f'Success rates Matrix:\n{suc_matrix}\n')

    if cl_params['normalize_rewards']:
        norm_rew = preprocessing.normalize(rew_matrix)
        scaler = preprocessing.StandardScaler()
        stand_rew = scaler.fit_transform(rew_matrix)
        print(stand_rew)
        print(norm_rew)
        rew_matrix = norm_rew

    cl_res_rew = calc_cl_metrics(rew_matrix)
    cl_res_suc = calc_cl_metrics(suc_matrix)

    print(f'Metrics based on rewards: {cl_res_rew}')
    print(f'Metrics based on success rates: {cl_res_suc}')

    save_acc_matrix(cl_path, rew_matrix, name='cl_rew_matrix')
    save_acc_matrix(cl_path, suc_matrix, name='cl_suc_matrix')

    with open(cl_path + '/cl_params.json', 'w') as fp:
        json.dump(cl_params, fp, sort_keys=True, indent=4)

    with open(cl_path + '/cl_res_rew.json', 'w') as fp:
        json.dump(cl_res_rew, fp, sort_keys=True, indent=4)

    with open(cl_path + '/cl_res_suc.json', 'w') as fp:
        json.dump(cl_res_suc, fp, sort_keys=True, indent=4)

    return rew_matrix, cl_res_rew, cl_res_suc
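
# How to read the N x N matrices above (an illustrative sketch, not the repo's calc_cl_metrics):
# row i holds performance on every task j right after adapting to task i, so the diagonal is the
# "freshly adapted" performance and off-diagonal entries show transfer to the other tasks. One
# simple derived quantity is the average gap between each column's diagonal entry and the rest
# of that column. The helper below is hypothetical and only meant to illustrate the layout:
#
#   def avg_drop(matrix):
#       n = matrix.shape[0]
#       gaps = [matrix[j, j] - matrix[i, j] for j in range(n) for i in range(n) if i != j]
#       return sum(gaps) / len(gaps)
#
#   print(avg_drop(suc_matrix))  # average degradation away from the freshly adapted task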
def evaluate(algo, env_name, policy, baseline, params, anil, render=False, test_on_train=False, each3=False):
    rewards_per_task = defaultdict(list)
    tasks_rewards = []
    tasks_success_rate = []

    if test_on_train:
        ml_task_names = ML10_train_task_names  # Meta-train tasks
    else:
        ml_task_names = ML10_eval_task_names  # Meta-test tasks

    extra_info = 'ML' in env_name  # if the env is Meta-World, log the success metric

    env = make_env(env_name, 1, params['seed'], test=(not test_on_train), max_path_length=params['max_path_length'])

    if each3:
        # Ignore the requested number of tasks and just sample 3 trials from each task
        eval_task_list = sample_3_from_each_task(env)
    elif isinstance(params['n_tasks'], str):
        eval_task_list = [sample_explicit_task(env, params['n_tasks'])]
    else:
        eval_task_list = env.sample_tasks(params['n_tasks'])

    for i, task in enumerate(eval_task_list):
        learner = deepcopy(policy)
        env.set_task(task)
        env.reset()
        env_task = Runner(env, extra_info=extra_info)

        # Adapt
        if algo == 'vpg':
            _, task_reward, task_suc = fast_adapt_vpg(env_task, learner, baseline, params, anil=anil, render=render)
        elif algo == 'ppo':
            _, task_reward, task_suc = fast_adapt_ppo(env_task, learner, baseline, params, render=render)
        else:
            learner, _, _, task_reward, task_suc = fast_adapt_trpo(env_task, learner, baseline, params,
                                                                   anil=anil, render=render)

        # Evaluate
        n_query_episodes = params['adapt_batch_size']
        query_episodes = env_task.run(learner, episodes=n_query_episodes, render=render)
        query_rew = query_episodes.reward().sum().item() / n_query_episodes
        query_success_rate = get_ep_successes(query_episodes, params['max_path_length']) / n_query_episodes

        tasks_rewards.append(query_rew)
        tasks_success_rate.append(query_success_rate)

        if extra_info:
            print(f'Task {i + 1} / {len(eval_task_list)}: {ml_task_names[task["task"]]} task'
                  f'\t {query_rew:.1f} rew | {query_success_rate * 100}% success rate')
            rewards_per_task[ml_task_names[task["task"]]] += [query_rew, query_success_rate]

    # Average over the tasks actually evaluated (params['n_tasks'] can be a string or be
    # overridden when each3 is set, so divide by len(eval_task_list) instead)
    final_eval_reward = sum(tasks_rewards) / len(eval_task_list)
    final_eval_suc = sum(tasks_success_rate) / len(eval_task_list)

    if 'ML' in env_name:
        return tasks_rewards, final_eval_reward, final_eval_suc, rewards_per_task

    return tasks_rewards, final_eval_reward, final_eval_suc
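
# Hypothetical usage sketch (the parameter values below are illustrative, not taken from any of the
# repo's configs): evaluate() adapts a copy of the policy on each sampled task and returns per-task
# rewards plus the averaged reward / success rate; the fourth return value is only produced for
# Meta-World ('ML') environments.
#
#   eval_params = {'seed': 42, 'n_tasks': 10, 'adapt_batch_size': 20, 'max_path_length': 150,
#                  'inner_lr': 0.05, 'gamma': 0.99, 'tau': 1.0}  # illustrative values only
#   rewards, avg_rew, avg_suc, per_task = evaluate('trpo', 'ML10', policy, baseline,
#                                                  eval_params, anil=False, each3=True)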
def run(self, env, device):

    set_device(device)
    baseline = ch.models.robotics.LinearValue(env.state_size, env.action_size)
    # params, env_name, eval_params and extra_info are assumed to be module-level globals of the training script
    policy = DiagNormalPolicyANIL(env.state_size, env.action_size, params['fc_neurons'])
    policy = MAML(policy, lr=self.params['inner_lr'])

    self.log_model(policy.body, device, input_shape=(1, env.state_size), name='body')
    self.log_model(policy.head, device, input_shape=(env.action_size, params['fc_neurons']), name='head')

    t = trange(self.params['num_iterations'])
    try:
        for iteration in t:
            iter_loss = 0.0
            iter_reward = 0.0
            iter_replays = []
            iter_policies = []

            task_list = env.sample_tasks(self.params['meta_batch_size'])

            for task_i in trange(len(task_list), leave=False, desc='Task', position=0):
                task = task_list[task_i]

                learner = deepcopy(policy)
                env.set_task(task)
                env.reset()
                task = Runner(env, extra_info=extra_info)

                # Fast adapt
                learner, eval_loss, task_replay, task_rew, task_suc = fast_adapt_trpo(
                    task, learner, baseline, self.params, anil=True, first_order=True)

                iter_reward += task_rew
                iter_loss += eval_loss.item()
                iter_replays.append(task_replay)
                iter_policies.append(learner)

            # Log
            average_return = iter_reward / self.params['meta_batch_size']
            average_loss = iter_loss / self.params['meta_batch_size']
            metrics = {'average_return': average_return, 'loss': average_loss}

            t.set_postfix(metrics)
            self.log_metrics(metrics)

            # Meta-optimize
            meta_optimize_trpo(self.params, policy, baseline, iter_replays, iter_policies, anil=True)

            if iteration % self.params['save_every'] == 0:
                self.save_model_checkpoint(policy.body, 'body_' + str(iteration + 1))
                self.save_model_checkpoint(policy.head, 'head_' + str(iteration + 1))
                self.save_model_checkpoint(baseline, 'baseline_' + str(iteration + 1))

    # Support safely interrupting training manually
    except KeyboardInterrupt:
        print('\nManually stopped training! Start evaluation & saving...\n')
        self.logger['manually_stopped'] = True
        self.params['num_iterations'] = iteration

    self.save_model(policy.body, name='body')
    self.save_model(policy.head, name='head')
    self.save_model(baseline, name='baseline')

    self.logger['elapsed_time'] = str(round(t.format_dict['elapsed'], 2)) + ' sec'

    # Evaluate on new test tasks
    self.logger['test_reward'] = evaluate_trpo(env_name, policy, baseline, eval_params)
    self.log_metrics({'test_reward': self.logger['test_reward']})
    self.save_logs_to_file()