def experiment(variant):
    """Offline PEARL launcher: load pre-collected per-task BCQ replay buffers
    (through ray actors), build the PEARL networks, and train.

    :param variant: experiment configuration dict (domain, seed, exp_mode,
        algo_params, buffer/goal locations, network sizes, util_params, ...)
    """
    domain = variant['domain']
    seed = variant['seed']
    exp_mode = variant['exp_mode']
    max_path_length = variant['algo_params']['max_path_length']
    bcq_interactions = variant['bcq_interactions']
    num_tasks = variant['num_tasks']

    # Goal file holds the buffer index list plus train / within-distribution /
    # out-of-distribution goals. Use a context manager so the file handle is
    # closed (the original `pickle.load(open(...))` leaked it).
    goals_filename = f'./goals/{domain}-{exp_mode}-goals.pkl'
    with open(goals_filename, 'rb') as f:
        idx_list, train_goals, wd_goals, ood_goals = pickle.load(f)
    idx_list = idx_list[:num_tasks]

    sub_buffer_dir = f"buffers/{domain}/{exp_mode}/max_path_length_{max_path_length}/interactions_{bcq_interactions}k/seed_{seed}"
    buffer_dir = os.path.join(variant['data_models_root'], sub_buffer_dir)
    print("Buffer directory: " + buffer_dir)

    # Load buffers: create one ray actor per task, kick off all the async
    # loads first, then block once on the whole batch.
    bcq_buffers = []
    buffer_loader_id_list = []
    for i, idx in enumerate(idx_list):
        # Buffer filenames are zero-padded to two digits (e.g. goal_07.zip_pkl).
        bname = f'goal_0{idx}.zip_pkl' if idx < 10 else f'goal_{idx}.zip_pkl'
        filename = os.path.join(buffer_dir, bname)
        rp_buffer = ReplayBuffer.remote(
            index=i,
            seed=seed,
            num_trans_context=variant['num_trans_context'],
            in_mdp_batch_size=variant['in_mdp_batch_size'],
        )
        buffer_loader_id_list.append(rp_buffer.load_from_gzip.remote(filename))
        bcq_buffers.append(rp_buffer)
    ray.get(buffer_loader_id_list)
    assert len(bcq_buffers) == len(idx_list)

    train_buffer = MultiTaskReplayBuffer(bcq_buffers_list=bcq_buffers)

    set_seed(variant['seed'])

    # Create multi-task environment and read off dimensions.
    env = env_producer(variant['domain'], seed=0)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    reward_dim = 1

    # Instantiate networks.
    latent_dim = variant['latent_size']
    # Context per transition is (s, a, r) or (s, a, r, s').
    context_encoder_input_dim = (
        2 * obs_dim + action_dim + reward_dim
        if variant['algo_params']['use_next_obs_in_context']
        else obs_dim + action_dim + reward_dim)
    # With an information bottleneck the encoder outputs mean and variance.
    context_encoder_output_dim = (
        latent_dim * 2
        if variant['algo_params']['use_information_bottleneck']
        else latent_dim)
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy,
                       **variant['algo_params'])
    algorithm = PEARLSoftActorCritic(env=env,
                                     train_goals=train_goals,
                                     wd_goals=wd_goals,
                                     ood_goals=ood_goals,
                                     replay_buffers=train_buffer,
                                     nets=[agent, qf1, qf2, vf],
                                     latent_dim=latent_dim,
                                     **variant['algo_params'])

    # Optionally load pre-trained weights.
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        context_encoder.load_state_dict(
            torch.load(os.path.join(path, 'context_encoder.pth')))
        qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth')))
        qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth')))
        vf.load_state_dict(torch.load(os.path.join(path, 'vf.pth')))
        # TODO hacky, revisit after model refactor
        algorithm.networks[-2].load_state_dict(
            torch.load(os.path.join(path, 'target_vf.pth')))
        policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth')))

    # Optional GPU mode.
    ptu.set_gpu_mode(variant['util_params']['use_gpu'],
                     variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        algorithm.to()

    # Debugging triggers a lot of printing and logs to a debug directory.
    DEBUG = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(DEBUG))

    # Create logging directory.
    # TODO support Docker
    exp_id = 'debug' if DEBUG else None
    experiment_log_dir = setup_logger(
        variant['domain'],
        variant=variant,
        exp_id=exp_id,
        base_log_dir=variant['util_params']['base_log_dir'])

    # Optionally save eval trajectories as pkl files.
    if variant['algo_params']['dump_eval_paths']:
        pickle_dir = experiment_log_dir + '/eval_trajectories'
        pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True)

    # Run the algorithm.
    algorithm.train()
def experiment(variant):
    """Standard PEARL launcher: build the multi-task environment and networks
    from `variant`, optionally restore saved weights, then train."""
    # Multi-task environment; tasks are referenced by integer index.
    env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params']))
    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    reward_dim = 1

    algo_params = variant['algo_params']
    latent_dim = variant['latent_size']
    net_size = variant['net_size']

    # Context per transition is (s, a, r), optionally extended with s'.
    transition_dim = obs_dim + action_dim + reward_dim
    if algo_params['use_next_obs_in_context']:
        context_encoder_input_dim = transition_dim + obs_dim
    else:
        context_encoder_input_dim = transition_dim
    # Information bottleneck => encoder emits mean and variance per latent.
    if algo_params['use_information_bottleneck']:
        context_encoder_output_dim = latent_dim * 2
    else:
        context_encoder_output_dim = latent_dim

    encoder_cls = RecurrentEncoder if algo_params['recurrent'] else MlpEncoder
    context_encoder = encoder_cls(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )
    # Twin Q networks over (obs, action, task latent).
    qf1, qf2 = (FlattenMlp(hidden_sizes=[net_size, net_size, net_size],
                           input_size=obs_dim + action_dim + latent_dim,
                           output_size=1)
                for _ in range(2))
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy, **algo_params)
    algorithm = PEARLSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:variant['n_train_tasks']]),
        eval_tasks=list(tasks[-variant['n_eval_tasks']:]),
        nets=[agent, qf1, qf2, vf],
        latent_dim=latent_dim,
        **algo_params)

    # Optionally restore pre-trained weights.
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']

        def _restore(net, fname):
            # Load one saved state dict from the weights directory.
            net.load_state_dict(torch.load(os.path.join(path, fname)))

        _restore(context_encoder, 'context_encoder.pth')
        _restore(qf1, 'qf1.pth')
        _restore(qf2, 'qf2.pth')
        _restore(vf, 'vf.pth')
        # TODO hacky, revisit after model refactor
        _restore(algorithm.networks[-2], 'target_vf.pth')
        _restore(policy, 'policy.pth')

    # Optional GPU mode.
    ptu.set_gpu_mode(variant['util_params']['use_gpu'],
                     variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        algorithm.to()

    # Debugging triggers a lot of printing and logs to a debug directory.
    debug = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(debug))

    # Create logging directory.
    # TODO support Docker
    experiment_log_dir = setup_logger(
        variant['env_name'],
        variant=variant,
        exp_id='debug' if debug else None,
        base_log_dir=variant['util_params']['base_log_dir'])

    # Optionally save eval trajectories as pkl files.
    if variant['algo_params']['dump_eval_paths']:
        pathlib.Path(experiment_log_dir + '/eval_trajectories').mkdir(
            parents=True, exist_ok=True)

    # Run the algorithm.
    algorithm.train()
def sim_policy(variant,
               path_to_exp,
               num_trajs=1,
               deterministic=False,
               save_video=False,
               animated=False):
    '''
    simulate a trained policy adapting to a new task
    optionally save videos of the trajectories - requires ffmpeg

    :variant: experiment configuration dict
    :path_to_exp: path to exp folder
    :num_trajs: number of trajectories to simulate per task (default 1)
    :deterministic: if the policy is deterministic (default stochastic)
    :save_video: whether to generate and save a video (default False)
    :animated: whether to render rollouts on screen (default False)
    '''
    # create multi-task environment and sample tasks
    env = CameraWrapper(
        NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params'])),
        variant['util_params']['gpu_id'])
    if animated:
        env.render()
    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    eval_tasks = list(tasks[-variant['n_eval_tasks']:])
    print('testing on {} test tasks, {} trajectories each'.format(
        len(eval_tasks), num_trajs))

    # instantiate networks
    latent_dim = variant['latent_size']
    # Fix: the original bound the name `context_encoder` to this int and then
    # rebound it to the network below — keep a distinct name for the dim.
    context_encoder_output_dim = latent_dim * 2 if variant['algo_params'][
        'use_information_bottleneck'] else latent_dim
    reward_dim = 1
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder
    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=obs_dim + action_dim + reward_dim,
        output_size=context_encoder_output_dim,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy,
                       **variant['algo_params'])
    # deterministic eval
    if deterministic:
        agent = MakeDeterministic(agent)

    # load trained weights (otherwise simulate random policy)
    context_encoder.load_state_dict(
        torch.load(os.path.join(path_to_exp, 'context_encoder.pth'),
                   map_location=torch.device('cpu')))
    policy.load_state_dict(
        torch.load(os.path.join(path_to_exp, 'policy.pth'),
                   map_location=torch.device('cpu')))

    # loop through tasks collecting rollouts
    all_rets = []
    video_frames = []
    for idx in eval_tasks:
        env.reset_task(idx)
        agent.clear_z()
        paths = []
        for n in range(num_trajs):
            path = rollout(
                env,
                agent,
                max_path_length=variant['algo_params']['num_steps_per_eval'],
                accum_context=True,
                animated=animated,
                save_frames=save_video)
            paths.append(path)
            if save_video:
                video_frames += [t['frame'] for t in path['env_infos']]
            # after the exploration trajectories, condition on accumulated context
            if n >= variant['algo_params']['num_exp_traj_eval']:
                agent.infer_posterior(agent.context)
        all_rets.append([sum(p['rewards']) for p in paths])

    if save_video:
        # save frames to file temporarily
        temp_dir = os.path.join(path_to_exp, 'temp')
        os.makedirs(temp_dir, exist_ok=True)
        for i, frm in enumerate(video_frames):
            frm.save(os.path.join(temp_dir, '%06d.jpg' % i))
        # Fix: the original called '.format(idx)' on a placeholder-free string;
        # the filename was always just 'video.mp4'.
        video_filename = os.path.join(path_to_exp, 'video.mp4')
        # run ffmpeg to make the video
        os.system('ffmpeg -i {}/%06d.jpg -vcodec mpeg4 {}'.format(
            temp_dir, video_filename))
        # delete the frames
        shutil.rmtree(temp_dir)

    # compute average returns across tasks
    n = min([len(a) for a in all_rets])
    rets = [a[:n] for a in all_rets]
    rets = np.mean(np.stack(rets), axis=0)
    for i, ret in enumerate(rets):
        print('trajectory {}, avg return: {} \n'.format(i, ret))
def experiment(variant):
    """Launcher for the VRNN / global-latent PEARL variant.

    Builds an optional recurrent (RNN or VRNN) context encoder and an optional
    global MLP context encoder, twin Q networks with twin targets, then trains
    or evaluates. Supports resuming from a checkpoint directory
    (``variant['path_to_weights']``) including replay buffers and step counters.

    NOTE(review): read_dim() appears to unpack a latent spec into
    (continuous dim, #categoricals, categorical dim, #Dirichlets, Dirichlet
    dim) — confirm against read_dim's definition.
    """
    print (variant['env_name'])
    print (variant['env_params'])
    # create multi-task environment and sample tasks
    env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params']))
    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    # Global latent layout and the recurrent (r_*) latent layout.
    cont_latent_dim, num_cat, latent_dim, num_dir, dir_latent_dim = read_dim(variant['global_latent'])
    r_cont_dim, r_n_cat, r_cat_dim, r_n_dir, r_dir_dim = read_dim(variant['vrnn_latent'])
    reward_dim = 1
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    glob = variant['algo_params']['glob']
    rnn = variant['rnn']
    vrnn_latent = variant['vrnn_latent']
    encoder_model = MlpEncoder
    if recurrent:
        # Continuous part always gets 2 params (mean/var); under the
        # 'logitnormal' constraint the Dirichlet part also gets 2 per dim.
        if variant['vrnn_constraint'] == 'logitnormal':
            output_size = r_cont_dim * 2 + r_n_cat * r_cat_dim + r_n_dir * r_dir_dim * 2
        else:
            output_size = r_cont_dim * 2 + r_n_cat * r_cat_dim + r_n_dir * r_dir_dim
        # 'batch_sampling' stacks temp_res transitions into one flat input.
        if variant['rnn_sample'] == 'batch_sampling':
            if variant['algo_params']['use_next_obs']:
                input_size = (2 * obs_dim + action_dim + reward_dim) * variant['temp_res']
            else:
                input_size = (obs_dim + action_dim + reward_dim) * variant['temp_res']
        else:
            if variant['algo_params']['use_next_obs']:
                input_size = (2 * obs_dim + action_dim + reward_dim)
            else:
                input_size = (obs_dim + action_dim + reward_dim)
        if rnn == 'rnn':
            recurrent_model = RecurrentEncoder
            recurrent_context_encoder = recurrent_model(
                hidden_sizes=[net_size, net_size, net_size],
                input_size=input_size,
                output_size=output_size
            )
        elif rnn == 'vrnn':
            recurrent_model = VRNNEncoder
            recurrent_context_encoder = recurrent_model(
                hidden_sizes=[net_size, net_size, net_size],
                input_size=input_size,
                output_size=output_size,
                temperature=variant['temperature'],
                vrnn_latent=variant['vrnn_latent'],
                vrnn_constraint=variant['vrnn_constraint'],
                r_alpha=variant['vrnn_alpha'],
                r_var=variant['vrnn_var'],
            )
    else:
        # NOTE(review): code below dereferences recurrent_context_encoder
        # unconditionally, so it must be defined whenever recurrent is False;
        # rnn is assumed to be 'rnn' or 'vrnn' when recurrent is True.
        recurrent_context_encoder = None
    ptu.set_gpu_mode(variant['util_params']['use_gpu'], variant['util_params']['gpu_id'])
    if glob:
        # Same output-size logic as the recurrent encoder, for the global one.
        if dir_latent_dim > 0 and variant['constraint'] == 'logitnormal':
            output_size = cont_latent_dim * 2 + num_cat * latent_dim + num_dir * dir_latent_dim * 2
        else:
            output_size = cont_latent_dim * 2 + num_cat * latent_dim + num_dir * dir_latent_dim
        if variant['algo_params']['use_next_obs']:
            input_size = 2 * obs_dim + action_dim + reward_dim
        else:
            input_size = obs_dim + action_dim + reward_dim
        global_context_encoder = encoder_model(
            hidden_sizes=[net_size, net_size, net_size],
            input_size=input_size,
            output_size=output_size,
        )
    else:
        global_context_encoder = None
    # Critics/actor take obs (+action) concatenated with both the global and
    # the recurrent task latents.
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim*num_cat + cont_latent_dim + dir_latent_dim*num_dir
            + r_n_cat * r_cat_dim + r_cont_dim + r_n_dir * r_dir_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim*num_cat + cont_latent_dim + dir_latent_dim*num_dir
            + r_n_cat * r_cat_dim + r_cont_dim + r_n_dir * r_dir_dim,
        output_size=1,
    )
    target_qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim*num_cat + cont_latent_dim + dir_latent_dim*num_dir
            + r_n_cat * r_cat_dim + r_cont_dim + r_n_dir * r_dir_dim,
        output_size=1,
    )
    target_qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim*num_cat + cont_latent_dim + dir_latent_dim*num_dir
            + r_n_cat * r_cat_dim + r_cont_dim + r_n_dir * r_dir_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim*num_cat + cont_latent_dim + dir_latent_dim*num_dir
            + r_n_cat * r_cat_dim + r_cont_dim + r_n_dir * r_dir_dim,
        latent_dim=latent_dim*num_cat + cont_latent_dim + dir_latent_dim*num_dir
            + r_n_cat * r_cat_dim + r_cont_dim + r_n_dir * r_dir_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(
        global_context_encoder,
        recurrent_context_encoder,
        variant['global_latent'],
        variant['vrnn_latent'],
        policy,
        variant['temperature'],
        variant['unitkl'],
        variant['alpha'],
        variant['constraint'],
        variant['vrnn_constraint'],
        variant['var'],
        variant['vrnn_alpha'],
        variant['vrnn_var'],
        rnn,
        variant['temp_res'],
        variant['rnn_sample'],
        variant['weighted_sample'],
        **variant['algo_params']
    )
    # Resuming: restore step counters and replay buffers saved alongside the
    # weights so training continues where it left off.
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        with open(os.path.join(path, 'extra_data.pkl'), 'rb') as f:
            extra_data = pickle.load(f)
        variant['algo_params']['start_epoch'] = extra_data['epoch'] + 1
        replay_buffer = extra_data['replay_buffer']
        enc_replay_buffer = extra_data['enc_replay_buffer']
        variant['algo_params']['_n_train_steps_total'] = extra_data['_n_train_steps_total']
        variant['algo_params']['_n_env_steps_total'] = extra_data['_n_env_steps_total']
        variant['algo_params']['_n_rollouts_total'] = extra_data['_n_rollouts_total']
    else:
        replay_buffer=None
        enc_replay_buffer=None
    algorithm = PEARLSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:variant['n_train_tasks']]),
        eval_tasks=list(tasks[-variant['n_eval_tasks']:]),
        nets=[agent, qf1, qf2, target_qf1, target_qf2],
        latent_dim=latent_dim,
        replay_buffer=replay_buffer,
        enc_replay_buffer=enc_replay_buffer,
        temp_res=variant['temp_res'],
        rnn_sample=variant['rnn_sample'],
        **variant['algo_params']
    )
    # Load network weights when resuming (encoders only if they were built).
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        if recurrent_context_encoder != None:
            recurrent_context_encoder.load_state_dict(torch.load(os.path.join(path, 'recurrent_context_encoder.pth')))
        if global_context_encoder != None:
            global_context_encoder.load_state_dict(torch.load(os.path.join(path, 'global_context_encoder.pth')))
        qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth')))
        qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth')))
        target_qf1.load_state_dict(torch.load(os.path.join(path, 'target_qf1.pth')))
        target_qf2.load_state_dict(torch.load(os.path.join(path, 'target_qf2.pth')))
        policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth')))
    if ptu.gpu_enabled():
        algorithm.to()
    # Debugging triggers verbose printing and a debug log directory.
    DEBUG = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(DEBUG))
    exp_id = 'debug' if DEBUG else None
    # Fall back to the environment name when no explicit log name is given.
    if variant.get('log_name', "") == "":
        log_name = variant['env_name']
    else:
        log_name = variant['log_name']
    experiment_log_dir = setup_logger(log_name,
                                      variant=variant,
                                      exp_id=exp_id,
                                      base_log_dir=variant['util_params']['base_log_dir'],
                                      config_log_dir=variant['util_params']['config_log_dir'],
                                      log_dir=variant['util_params']['log_dir'])
    # Optionally save eval trajectories as pkl files.
    if variant['algo_params']['dump_eval_paths']:
        pickle_dir = experiment_log_dir + '/eval_trajectories'
        pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True)
    # Persist the sampled tasks so evaluation runs see the same task set.
    env.save_all_tasks(experiment_log_dir)
    if variant['eval']:
        algorithm._try_to_eval(0, eval_all=True, eval_train_offline=False, animated=True)
    else:
        algorithm.train()
def setup_and_run(variant):
    """Configure GPU, environment, PEARL networks, and logging, then run the
    requested mode (TRAIN / EVAL / loaded-latent evaluation)."""
    util_params = variant['util_params']
    # Spread seeds across the available GPUs.
    ptu.set_gpu_mode(util_params['use_gpu'],
                     variant['seed'] % util_params['num_gpus'])

    # --- environment ---
    env_params = variant['env_params']
    env_params['n_tasks'] = variant['n_train_tasks'] + variant['n_eval_tasks']
    env = NormalizedBoxEnv(ENVS[variant['env_name']](**env_params))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    latent_dim = variant['latent_size']
    reward_dim = 1

    # --- context encoder ---
    algo_params = variant['algo_params']
    # Context per transition is (s, a, r), optionally extended with s'.
    if algo_params['use_next_obs_in_context']:
        context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim
    else:
        context_encoder_input_dim = obs_dim + action_dim + reward_dim
    # Information bottleneck => encoder outputs mean and variance per latent.
    if algo_params['use_information_bottleneck']:
        context_encoder_output_dim = latent_dim * 2
    else:
        context_encoder_output_dim = latent_dim
    net_size = variant['net_size']
    encoder_model = RecurrentEncoder if algo_params['recurrent'] else MlpEncoder
    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )

    # --- actor & critics ---
    def _critic():
        # One Q network over (obs, action, task latent).
        return FlattenMlp(
            hidden_sizes=[net_size, net_size, net_size],
            input_size=obs_dim + action_dim + latent_dim,
            output_size=1,
        )

    qf1, qf2, target_qf1, target_qf2 = (_critic(), _critic(),
                                        _critic(), _critic())
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy, **algo_params)

    # Train tasks come first, eval tasks are the remaining indices.
    n_train = variant['n_train_tasks']
    algorithm = PEARLSoftActorCritic(
        env=env,
        train_tasks=list(np.arange(n_train)),
        eval_tasks=list(np.arange(n_train, n_train + variant['n_eval_tasks'])),
        nets=[agent, qf1, qf2, target_qf1, target_qf2],
        latent_dim=latent_dim,
        **algo_params)

    # optionally load pre-trained weights
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        for net, fname in ((context_encoder, 'context_encoder.pth'),
                           (qf1, 'qf1.pth'),
                           (qf2, 'qf2.pth'),
                           (target_qf1, 'target_qf1.pth'),
                           (target_qf2, 'target_qf2.pth'),
                           (policy, 'policy.pth')):
            net.load_state_dict(torch.load(os.path.join(path, fname)))

    if ptu.gpu_enabled():
        algorithm.to()
    os.environ['DEBUG'] = str(int(util_params['debug']))

    # --- logging ---
    run_mode = variant['run_mode']
    exp_log_name = os.path.join(
        variant['env_name'], run_mode,
        variant['log_annotation'] + variant['variant_name'],
        'seed-' + str(variant['seed']))
    setup_logger(exp_log_name,
                 variant=variant,
                 exp_id=None,
                 base_log_dir=os.environ.get('PEARL_DATA_PATH'),
                 snapshot_mode='gap',
                 snapshot_gap=10)

    # --- run the requested mode ---
    if run_mode == 'TRAIN':
        algorithm.train()
    elif run_mode == 'EVAL':
        assert variant['algo_params']['dump_eval_paths'] == True
        algorithm._try_to_eval()
    else:
        algorithm.eval_with_loaded_latent()
def experiment(variant):
    """Hierarchical (BURN) PEARL launcher: a high-level policy whose actions
    are observation-sized goals drives a low-level policy; each level owns its
    own pair of Q networks and a value network."""
    # create multi-task environment and sample tasks
    env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params']))
    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    reward_dim = 1

    # instantiate networks
    latent_dim = variant['latent_size']
    algo_params = variant['algo_params']
    # Context per transition is (s, a, r), optionally extended with s'.
    if algo_params['use_next_obs_in_context']:
        context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim
    else:
        context_encoder_input_dim = obs_dim + action_dim + reward_dim
    # Information bottleneck => encoder emits mean and variance per latent.
    if algo_params['use_information_bottleneck']:
        context_encoder_output_dim = latent_dim * 2
    else:
        context_encoder_output_dim = latent_dim
    net_size = variant['net_size']
    encoder_model = RecurrentEncoder if algo_params['recurrent'] else MlpEncoder
    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )

    def _value_net(in_size):
        # Scalar-output MLP used for every Q and V network below.
        return FlattenMlp(
            hidden_sizes=[net_size, net_size, net_size],
            input_size=in_size,
            output_size=1,
        )

    # low Qs first and then high Qs; the low level sees a doubled observation
    # (presumably obs + goal-obs — confirm against BURNAgent).
    low_q_input = 2 * obs_dim + action_dim
    high_q_input = obs_dim + action_dim + latent_dim
    q_list = [[_value_net(low_q_input), _value_net(low_q_input)],
              [_value_net(high_q_input), _value_net(high_q_input)]]
    # low vf first and then high vf
    vf_list = [_value_net(2 * obs_dim), _value_net(obs_dim + latent_dim)]

    # NOTE: h_policy uses 2 hidden layers instead of 3 — it carries less of
    # the workload than the full PEARL policy. Its action space is an
    # observation-sized goal for the low level.
    h_policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=obs_dim,
    )
    # NOTE: l_policy keeps a deeper stack since it receives far more data.
    l_policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size, net_size],
        obs_dim=2 * obs_dim,
        latent_dim=0,
        action_dim=action_dim,
    )
    # TODO Implement BernAgent
    agent = BURNAgent(latent_dim, context_encoder, h_policy, l_policy,
                      c=2, **algo_params)
    algorithm = BURNSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:variant['n_train_tasks']]),
        eval_tasks=list(tasks[-variant['n_eval_tasks']:]),
        nets=[agent, q_list, vf_list],
        latent_dim=latent_dim,
        **algo_params)

    # optionally load pre-trained weights
    # TODO Make sure weights are properly saved
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        context_encoder.load_state_dict(
            torch.load(os.path.join(path, 'context_encoder.pth')))
        q_list[0][0].load_state_dict(torch.load(os.path.join(path, 'l_qf1.pth')))
        q_list[0][1].load_state_dict(torch.load(os.path.join(path, 'l_qf2.pth')))
        q_list[1][0].load_state_dict(torch.load(os.path.join(path, 'h_qf1.pth')))
        q_list[1][1].load_state_dict(torch.load(os.path.join(path, 'h_qf2.pth')))
        vf_list[0].load_state_dict(torch.load(os.path.join(path, 'l_vf.pth')))
        vf_list[1].load_state_dict(torch.load(os.path.join(path, 'h_vf.pth')))
        # TODO hacky, revisit after model refactor
        algorithm.networks[-2].load_state_dict(
            torch.load(os.path.join(path, 'target_vf.pth')))
        h_policy.load_state_dict(torch.load(os.path.join(path, 'h_policy.pth')))
        l_policy.load_state_dict(torch.load(os.path.join(path, 'l_policy.pth')))

    # optional GPU mode
    ptu.set_gpu_mode(variant['util_params']['use_gpu'],
                     variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        algorithm.to()

    # debugging triggers a lot of printing and logs to a debug directory
    debug = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(debug))

    # create logging directory
    # TODO support Docker
    experiment_log_dir = setup_logger(
        variant['env_name'],
        variant=variant,
        exp_id='debug' if debug else None,
        base_log_dir=variant['util_params']['base_log_dir'])

    # optionally save eval trajectories as pkl files
    if variant['algo_params']['dump_eval_paths']:
        pathlib.Path(experiment_log_dir + '/eval_trajectories').mkdir(
            parents=True, exist_ok=True)

    # run the algorithm
    algorithm.train()
def main(
        env_name,
        seed,
        deterministic,
        traj_prior,
        start_ft_after,
        ft_steps,
        avoid_freezing_z,
        lr,
        batch_size,
        avoid_loading_critics
):
    """Fine-tune a previously trained PEARL agent.

    Loads the per-environment JSON config over the defaults, rebuilds the
    PEARL networks, restores actor/encoder (and optionally critic) weights
    from the matching experiment directory, and runs PEARLFineTuningHelper.
    """
    # Merge the environment-specific JSON config over the default config.
    config = "configs/{}.json".format(env_name)
    variant = default_config
    if config:
        with open(osp.join(config)) as cfg_file:
            exp_params = json.load(cfg_file)
        variant = deep_update_dict(exp_params, variant)
    exp_name = variant['env_name']
    print("Experiment: {}".format(exp_name))

    env = NormalizedBoxEnv(ENVS[exp_name](**variant['env_params']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print("Observation space:")
    print(env.observation_space)
    print(obs_dim)
    print("Action space:")
    print(env.action_space)
    print(action_dim)
    print("-" * 10)

    # instantiate networks
    latent_dim = variant['latent_size']
    reward_dim = 1
    algo_params = variant['algo_params']
    # Context per transition is (s, a, r), optionally extended with s'.
    if algo_params['use_next_obs_in_context']:
        context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim
    else:
        context_encoder_input_dim = obs_dim + action_dim + reward_dim
    # Information bottleneck => encoder emits mean and variance per latent.
    if algo_params['use_information_bottleneck']:
        context_encoder_output_dim = latent_dim * 2
    else:
        context_encoder_output_dim = latent_dim
    net_size = variant['net_size']
    encoder_model = RecurrentEncoder if algo_params['recurrent'] else MlpEncoder
    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    # Targets start as copies of the online critics.
    target_qf1 = qf1.copy()
    target_qf2 = qf2.copy()
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(
        latent_dim,
        context_encoder,
        policy,
        **algo_params
    )
    # deterministic eval
    if deterministic:
        agent = MakeDeterministic(agent)

    # load trained weights (otherwise simulate random policy)
    path_to_exp = "output/{}/pearl_{}".format(env_name, seed - 1)
    print("Based on experiment: {}".format(path_to_exp))

    def _restore(net, fname):
        # Load one saved state dict from the source experiment directory.
        net.load_state_dict(torch.load(os.path.join(path_to_exp, fname)))

    _restore(context_encoder, 'context_encoder.pth')
    _restore(policy, 'policy.pth')
    if not avoid_loading_critics:
        _restore(qf1, 'qf1.pth')
        _restore(qf2, 'qf2.pth')
        _restore(target_qf1, 'target_qf1.pth')
        _restore(target_qf2, 'target_qf2.pth')

    # optional GPU mode
    ptu.set_gpu_mode(variant['util_params']['use_gpu'],
                     variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        # NOTE(review): `device` is defined outside this function — presumably
        # the module-level torch device; confirm.
        for net in (agent, policy, context_encoder,
                    qf1, qf2, target_qf1, target_qf2):
            net.to(device)

    helper = PEARLFineTuningHelper(
        env=env,
        agent=agent,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        num_exp_traj_eval=traj_prior,
        start_fine_tuning=start_ft_after,
        fine_tuning_steps=ft_steps,
        should_freeze_z=(not avoid_freezing_z),
        replay_buffer_size=int(1e6),
        batch_size=batch_size,
        discount=0.99,
        policy_lr=lr,
        qf_lr=lr,
        temp_lr=lr,
        target_entropy=-action_dim,
    )
    helper.fine_tune(variant=variant, seed=seed)