def experiment(specs):
    with open(path.join(specs['specific_exp_dir'], 'variant.json'), 'r') as f:
        variant = json.load(f)
    variant['algo_params']['do_not_train'] = True
    variant['seed'] = specs['seed']

    policy = joblib.load(path.join(specs['specific_exp_dir'], 'params.pkl'))['exploration_policy']
    assert False, 'Do you really wanna make it deterministic?'
    policy = MakeDeterministic(policy)

    env_specs = variant['env_specs']
    env, _ = get_env(env_specs)
    training_env, _ = get_env(env_specs)

    variant['algo_params']['replay_buffer_size'] = int(
        np.floor(specs['num_episodes'] * variant['algo_params']['max_path_length'] / specs['subsampling']))

    # Hack until I figure out how things are gonna be in general then I'll clean it up
    if 'policy_uses_pixels' not in variant['algo_params']:
        variant['algo_params']['policy_uses_pixels'] = False
    if 'policy_uses_task_params' not in variant['algo_params']:
        variant['algo_params']['policy_uses_task_params'] = False
    if 'concat_task_params_to_policy_obs' not in variant['algo_params']:
        variant['algo_params']['concat_task_params_to_policy_obs'] = False

    replay_buffer = ExpertReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        env,
        subsampling=specs['subsampling'],
        policy_uses_pixels=variant['algo_params']['policy_uses_pixels'],
        policy_uses_task_params=variant['algo_params']['policy_uses_task_params'],
        concat_task_params_to_policy_obs=variant['algo_params']['concat_task_params_to_policy_obs'],
    )
    variant['algo_params']['freq_saving'] = 1

    algorithm = ExpertTrajGeneratorAlgorithm(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        replay_buffer=replay_buffer,
        max_num_episodes=specs['num_episodes'],
        **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    demos_path = listings[variant['expert_name']]['file_paths'][variant['expert_idx']]
    print(demos_path)

    buffer_save_dict = joblib.load(demos_path)
    target_state_buffer = buffer_save_dict['data']
    # target_state_buffer /= variant['rescale']
    state_indices = torch.LongTensor(variant['state_indices'])

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    policy = joblib.load(variant['policy_checkpoint'])['exploration_policy']
    if variant['eval_deterministic']:
        policy = MakeDeterministic(policy)
    policy.to(ptu.device)

    eval_sampler = PathSampler(
        env,
        policy,
        variant['num_eval_steps'],
        variant['max_path_length'],
        no_terminal=variant['no_terminal'],
        render=variant['render'],
        render_kwargs=variant['render_kwargs'])
    test_paths = eval_sampler.obtain_samples()

    obs = []
    for path in test_paths:
        obs += path['observations']
    x = [o[0] for o in obs]
    y = [o[1] for o in obs]

    fig, ax = plt.subplots(figsize=(6, 6))
    plt.scatter(x, y)
    plt.xlim(-1.25, 20)
    plt.ylim(-1.25, 10)
    ax.set_yticks([0, 5, 10])
    ax.set_xticks([0, 5, 10, 15, 20])
    plt.savefig('./figs/' + variant['env_specs']['task_name'] + '.pdf', bbox_inches='tight')

    return 1
def experiment(variant):
    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        with open('expert_demos_listing.yaml', 'r') as f:
            listings = yaml.load(f.read())
        expert_demos_path = listings[variant['expert_name']]['file_paths'][variant['expert_idx']]
        buffer_save_dict = joblib.load(expert_demos_path)
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )

    policy = joblib.load(variant['policy_checkpoint'])['exploration_policy']
    if variant['eval_deterministic']:
        policy = MakeDeterministic(policy)
    policy.to(ptu.device)

    eval_sampler = PathSampler(
        env,
        policy,
        variant['num_eval_steps'],
        variant['max_path_length'],
        no_terminal=variant['no_terminal'],
        render=variant['render'],
        render_kwargs=variant['render_kwargs'])
    test_paths = eval_sampler.obtain_samples()
    average_returns = eval_util.get_average_returns(test_paths)
    print(average_returns)

    return 1
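# The evaluation entry point above expects a `variant` dictionary whose keys it
# reads directly. The sketch below is a minimal, hypothetical example of such a
# dictionary: only the key names come from the function body; every value
# (env name, checkpoint path, step counts) is an illustrative placeholder.
example_eval_variant = {
    'env_specs': {
        'env_name': 'halfcheetah',
        'env_kwargs': {},
        'eval_env_seed': 0,
    },
    'policy_checkpoint': 'path/to/params.pkl',  # pickle must contain an 'exploration_policy' entry
    'eval_deterministic': True,
    'scale_env_with_demo_stats': False,
    'expert_name': 'norm_halfcheetah_32_demos_sub_20',  # only read when scaling with demo stats
    'expert_idx': 0,
    'num_eval_steps': 10000,
    'max_path_length': 1000,
    'no_terminal': False,
    'render': False,
    'render_kwargs': {},
}
# experiment(example_eval_variant)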
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # this script is for the non-meta-learning GAIL
    expert_buffer = extra_data['train']

    # set up the env
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])
    print(env.observation_space)

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        env = ScaledEnv(
            env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )

    # compute obs_dim and action_dim
    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                if variant['algo_params']['concat_task_params_to_policy_obs']:
                    obs_dim += int(np.prod(env.observation_space.spaces['obs_task_params'].shape))
                else:
                    raise NotImplementedError()
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print(obs_dim, action_dim)
    sleep(3)

    # set up the policy models
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # set up the discriminator models
    disc_model = StandardAIRLDisc(
        obs_dim + action_dim,
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])
    print(disc_model)
    print(disc_model.clamp_magnitude)

    # set up the AdvBC algorithm
    algorithm = AdvBC(
        env,
        policy,
        disc_model,
        expert_buffer,
        training_env=training_env,
        wrap_absorbing=variant['wrap_absorbing_state'],
        **variant['algo_params'])
    print(algorithm.use_target_disc)
    print(algorithm.soft_target_disc_tau)
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)
    print(algorithm.disc_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.defaults['lr'])

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    demos_path = listings[variant['expert_name']]['file_paths'][variant['expert_idx']]
    print(demos_path)

    buffer_save_dict = joblib.load(demos_path)
    target_state_buffer = buffer_save_dict['data']
    # target_state_buffer /= variant['rescale']
    state_indices = torch.LongTensor(variant['state_indices'])

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1
    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the energy model
    if variant['ebil_params']['mode'] == 'deen':
        """
        ebm_model = MLPEBM(
            obs_dim + action_dim if not variant['ebil_params']['state_only'] else 2*obs_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
            clamp_magnitude=variant['ebm_clamp_magnitude'],
        )
        """
        ebm_exp_name = 'ebm-deen-' + variant['env_specs']['env_name'] + '-' + str(
            variant['expert_traj_num']) + '-train--sigma-' + str(variant['ebm_sigma'])
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)

        load_ebm_dir = ebm_dir
        load_epoch = variant['ebm_epoch']
        load_name = 'itr_{}.pkl'.format(load_epoch)
        if load_epoch == 'best':
            load_name = 'best.pkl'
        load_ebm_path = os.path.join(load_ebm_dir, load_name)

        load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
        ebm_model = load_ebm_pkl['ebm']
    else:
        raise NotImplementedError

    # Test
    if variant['test']:
        batch_data = target_state_buffer / variant['rescale']
        obs = torch.Tensor(batch_data[:1000]).to(ptu.device)
        print("Not expert data", ebm_model(obs * 200).mean().item())
        print("Expert data", ebm_model(obs).mean().item())
        exit(1)

    # set up the algorithm
    trainer = SoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['sac_params'])
    algorithm = EBIL(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        rew_func=variant['rew_func'],
        cons=variant['cons'],
        rescale=variant['rescale'],
        ebm=ebm_model,
        policy_trainer=trainer,
        target_state_buffer=target_state_buffer,
        state_indices=state_indices,
        **variant['ebil_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']
    if 'minmax_env_with_demo_stats' in variant.keys():
        if variant['minmax_env_with_demo_stats']:
            assert 'norm_train' in buffer_save_dict.keys()
            expert_replay_buffer = buffer_save_dict['norm_train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
    elif variant['minmax_env_with_demo_stats']:
        env = MinmaxEnv(
            env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )
        training_env = MinmaxEnv(
            training_env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1
    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the critic model
    critic_model = MLPDisc(
        variant['policy_net_size'],
        num_layer_blocks=variant['critic_num_blocks'],
        hid_dim=variant['critic_hid_dim'],
        hid_act=variant['critic_hid_act'],
        use_bn=variant['critic_use_bn'])

    algorithm = BC(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        critic=critic_model,
        expert_replay_buffer=expert_replay_buffer,
        **variant['adp_bc_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1
    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = BC(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        expert_replay_buffer=expert_replay_buffer,
        **variant['bc_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']
    if 'minmax_env_with_demo_stats' in variant.keys():
        if variant['minmax_env_with_demo_stats']:
            print('Use minmax envs')
            assert 'norm_train' in buffer_save_dict.keys()
            expert_replay_buffer = buffer_save_dict['norm_train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
    elif variant['minmax_env_with_demo_stats']:
        env = MinmaxEnv(
            env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )
        training_env = MinmaxEnv(
            training_env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1
    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    input_dim = obs_dim + action_dim if not variant['ebm_params']['state_only'] else 2 * obs_dim

    # build the energy model
    if variant['ebm_params']['mode'] == 'deen':
        ebm_model = MLPEBM(
            input_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
            clamp_magnitude=variant['ebm_clamp_magnitude'])
        algorithm = EBMLearn(
            env=env,
            training_env=training_env,
            ebm=ebm_model,
            input_dim=input_dim,
            exploration_policy=policy,
            sigma=variant['sigma'],
            expert_replay_buffer=expert_replay_buffer,
            **variant['ebm_params'])
    # build the energy model
    elif variant['ebm_params']['mode'] == 'ae':
        ebm_model = MLPAE(
            input_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
        )
        algorithm = EBMLearn(
            env=env,
            training_env=training_env,
            ebm=ebm_model,
            input_dim=input_dim,
            exploration_policy=policy,
            sigma=None,
            expert_replay_buffer=expert_replay_buffer,
            **variant['ebm_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
def experiment(variant):
    expert_buffer = joblib.load(variant['xy_data_path'])['xy_data']

    # set up the env
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])
    print(env.observation_space)

    if variant['scale_env_with_given_demo_stats']:
        assert False
        assert not env_specs['normalized']
        # NOTE: `extra_data` is not defined in this script; this branch is
        # unreachable because of the assert above.
        env = ScaledEnv(
            env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )

    # compute obs_dim and action_dim
    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                if variant['algo_params']['concat_task_params_to_policy_obs']:
                    obs_dim += int(np.prod(env.observation_space.spaces['obs_task_params'].shape))
                else:
                    raise NotImplementedError()
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print(obs_dim, action_dim)
    sleep(3)

    # set up the policy models
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    target_qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    target_qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        # policy = ReparamMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
        # std=0.1
    )

    # set up the discriminator models
    disc_model_class = ThreeWayResNetAIRLDisc if variant['threeway'] else ResNetAIRLDisc
    disc_model = disc_model_class(
        2,  # obs is just x-y pos
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])
    print(disc_model)
    print(disc_model.clamp_magnitude)

    # set up the RL algorithm used to train the policy
    policy_optimizer = EntConstSAC(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        action_dim=action_dim,
        **variant['policy_params'])

    # set up the state-marginal-matching algorithm
    alg_class = ThreewayStateMarginalMatchingAlg if variant['threeway'] else StateMarginalMatchingAlg
    algorithm = alg_class(
        env,
        policy,
        disc_model,
        policy_optimizer,
        expert_buffer,
        training_env=training_env,
        **variant['algo_params'])
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)
    print(algorithm.policy_optimizer.policy_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.qf1_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.qf2_optimizer.defaults['lr'])
    print(algorithm.disc_optimizer.defaults['lr'])

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    # NEW WAY OF DOING EXPERT REPLAY BUFFERS USING ExpertReplayBuffer class
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    print(listings.keys())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    expert_replay_buffer = joblib.load(file_to_load)['replay_buffer']

    # this script is for the non-meta-learning GAIL
    expert_replay_buffer.policy_uses_task_params = variant['gail_params']['policy_uses_task_params']
    expert_replay_buffer.concat_task_params_to_policy_obs = variant['gail_params']['concat_task_params_to_policy_obs']

    # Now determine how many trajectories you want to use
    if 'num_expert_trajs' in variant:
        raise NotImplementedError('Not implemented during the transition away from ExpertReplayBuffer')

    # we have to generate the combinations for the env_specs
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])

    # if variant['wrap_absorbing_state']:
    #     assert False, 'Not handling train_test_env'
    #     env = WrappedAbsorbingEnv(env)

    print(env.observation_space)
    if isinstance(env.observation_space, Dict):
        if not variant['gail_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['gail_params']['policy_uses_task_params']:
                if variant['gail_params']['concat_task_params_to_policy_obs']:
                    obs_dim += int(np.prod(env.observation_space.spaces['obs_task_params'].shape))
                else:
                    raise NotImplementedError()
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print(obs_dim, action_dim)
    sleep(3)

    if variant['gail_params']['state_only']:
        print('\n\nUSING STATE ONLY DISC\n\n')
    disc_model = ThirdVersionSingleColorFetchCustomDisc(
        clamp_magnitude=variant['disc_clamp_magnitude'],
        state_only=variant['gail_params']['state_only'],
        wrap_absorbing=variant['gail_params']['wrap_absorbing'])
    if variant['gail_params']['use_target_disc']:
        target_disc = disc_model.copy()
    else:
        target_disc = None
    print(disc_model)
    print(disc_model.clamp_magnitude)

    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
    qf1 = ObsPreprocessedQFunc(
        target_disc.obs_processor if target_disc is not None else disc_model.obs_processor,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + 4 + 1 * variant['gail_params']['wrap_absorbing'],
        output_size=1,
        wrap_absorbing=variant['gail_params']['wrap_absorbing'])
    qf2 = ObsPreprocessedQFunc(
        target_disc.obs_processor if target_disc is not None else disc_model.obs_processor,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + 4 + 1 * variant['gail_params']['wrap_absorbing'],
        output_size=1,
        wrap_absorbing=variant['gail_params']['wrap_absorbing'])
    vf = ObsPreprocessedVFunc(
        target_disc.obs_processor if target_disc is not None else disc_model.obs_processor,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + 1 * variant['gail_params']['wrap_absorbing'],
        output_size=1,
        wrap_absorbing=variant['gail_params']['wrap_absorbing'])
    policy = ObsPreprocessedReparamTanhMultivariateGaussianPolicy(
        target_disc.obs_processor if target_disc is not None else disc_model.obs_processor,
        hidden_sizes=hidden_sizes,
        obs_dim=6 + 4,
        action_dim=4,
    )

    policy_optimizer = NewSoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        wrap_absorbing=variant['gail_params']['wrap_absorbing'],
        **variant['policy_params'])
    algorithm = GAIL(
        env,
        policy,
        disc_model,
        policy_optimizer,
        expert_replay_buffer,
        training_env=training_env,
        target_disc=target_disc,
        **variant['gail_params'])
    print(algorithm.use_target_disc)
    print(algorithm.soft_target_disc_tau)
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)
    print(algorithm.policy_optimizer.policy_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.qf1_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.qf2_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.vf_optimizer.defaults['lr'])
    print(algorithm.disc_optimizer.defaults['lr'])

    if variant['gail_params']['wrap_absorbing']:
        print('\n\nWRAP ABSORBING\n\n')
        # assert False, "Have not added new sac yet!"

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    # get the expert data
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)
    expert_buffer = extra_data['train']

    # set up the env
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        env = ScaledEnv(
            env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )

    # seed the env
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])
    print(env.observation_space)

    # compute obs_dim and action_dim
    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                if variant['algo_params']['concat_task_params_to_policy_obs']:
                    obs_dim += int(np.prod(env.observation_space.spaces['obs_task_params'].shape))
                else:
                    raise NotImplementedError()
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print(obs_dim, action_dim)
    sleep(3)

    # set up the policy models
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
        batch_norm=variant['policy_uses_bn'],
        layer_norm=variant['policy_uses_layer_norm'])
    # policy = MlpPolicy(
    #     hidden_sizes=hidden_sizes,
    #     obs_dim=obs_dim,
    #     action_dim=action_dim,
    #     batch_norm=variant['policy_uses_bn'],
    #     layer_norm=variant['policy_uses_layer_norm']
    # )

    # set up the BC algorithm
    algorithm = BC(
        env,
        policy,
        expert_buffer,
        training_env=training_env,
        wrap_absorbing=variant['wrap_absorbing_state'],
        **variant['algo_params'])
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    env_specs = variant['env_specs']
    if variant['algo_params']['meta']:
        env, training_env = get_meta_env(env_specs)
    else:
        if env_specs['train_test_env']:
            env, training_env = get_env(env_specs)
        else:
            env, _ = get_env(env_specs)
            training_env, _ = get_env(env_specs)

    if variant['algo_params']['meta']:
        train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    print(env.observation_space)
    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                obs_dim += int(np.prod(env.observation_space.spaces['obs_task_params'].shape))
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    hidden_sizes = [net_size] * variant['num_hidden_layers']
    if variant['use_custom_ant_models']:
        assert isinstance(env.observation_space, Dict)
        print('CUSTOM ANT WITH LINEAR EMBEDDING OF THE TARGET POSITION')
        qf1 = AntRandGoalCustomQFunc(
            int(np.prod(env.observation_space.spaces['obs_task_params'].shape)),
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            input_size=int(np.prod(env.observation_space.spaces['obs'].shape)) + action_dim,
            output_size=1,
        )
        qf2 = AntRandGoalCustomQFunc(
            int(np.prod(env.observation_space.spaces['obs_task_params'].shape)),
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            input_size=int(np.prod(env.observation_space.spaces['obs'].shape)) + action_dim,
            output_size=1,
        )
        vf = AntRandGoalCustomVFunc(
            int(np.prod(env.observation_space.spaces['obs_task_params'].shape)),
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            input_size=int(np.prod(env.observation_space.spaces['obs'].shape)),
            output_size=1,
        )
        policy = AntRandGoalCustomReparamTanhMultivariateGaussianPolicy(
            int(np.prod(env.observation_space.spaces['obs_task_params'].shape)),
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            obs_dim=int(np.prod(env.observation_space.spaces['obs'].shape)),
            action_dim=action_dim,
        )
        # CUSTOM ANT WITH GATING ACTIVATIONS OF EACH LAYER
        # qf1 = AntCustomGatingQFuncV1()
        # qf2 = AntCustomGatingQFuncV1()
        # vf = AntCustomGatingVFuncV1()
        # policy = AntCustomGatingV1ReparamTanhMultivariateGaussianPolicy()
    else:
        print('Using simple model')
        qf1 = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim + action_dim,
            output_size=1,
        )
        qf2 = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim + action_dim,
            output_size=1,
        )
        vf = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim,
            output_size=1,
        )
        policy = ReparamTanhMultivariateGaussianPolicy(
            hidden_sizes=hidden_sizes,
            obs_dim=obs_dim,
            action_dim=action_dim,
        )

    if variant['algo_params']['meta']:
        algorithm = MetaNewSoftActorCritic(
            env=env,
            training_env=training_env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            vf=vf,
            train_task_params_sampler=train_task_params_sampler,
            test_task_params_sampler=test_task_params_sampler,
            true_env_obs_dim=int(np.prod(env.observation_space.spaces['obs'].shape)),
            **variant['algo_params'])
    else:
        algorithm = NewSoftActorCritic(
            env=env,
            training_env=training_env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            vf=vf,
            **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    demos_path = listings[variant['expert_name']]['file_paths'][variant['expert_idx']]
    buffer_save_dict = joblib.load(demos_path)
    target_state_buffer = buffer_save_dict['data']
    target_state_buffer /= variant['rescale']
    state_indices = torch.LongTensor(variant['state_indices'])

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1
    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    input_dim = len(state_indices)

    # build the energy model
    if variant['ebm_params']['mode'] == 'deen':
        ebm_model = MLPEBM(
            input_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
            clamp_magnitude=variant['ebm_clamp_magnitude'])
        algorithm = EBMLearn(
            env=env,
            training_env=training_env,
            ebm=ebm_model,
            input_dim=input_dim,
            exploration_policy=policy,
            sigma=variant['sigma'],
            target_state_buffer=target_state_buffer,
            state_indices=state_indices,
            **variant['ebm_params'])
    # build the energy model
    elif variant['ebm_params']['mode'] == 'ae':
        ebm_model = MLPAE(
            input_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
        )
        algorithm = EBMLearn(
            env=env,
            training_env=training_env,
            ebm=ebm_model,
            input_dim=input_dim,
            exploration_policy=policy,
            sigma=None,
            rescale=variant['rescale'],
            target_state_buffer=target_state_buffer,
            state_indices=state_indices,
            **variant['ebm_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = NormalizedBoxEnv(InvertedPendulumEnv())
    # ---------
    # env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    # training_env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    # env = ReacherEnv()
    # training_env = ReacherEnv()
    # env = NormalizedBoxEnv(ReacherEnv())
    # training_env = NormalizedBoxEnv(ReacherEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    # we have to generate the combinations for the env_specs
    env_specs = variant['env_specs']
    env, _ = get_env(env_specs)
    training_env, _ = get_env(env_specs)

    print(env.observation_space)

    obs_space = env.observation_space
    if isinstance(env.observation_space, Dict):
        # possible keys: pixel, obs, obs_task_params
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(obs_space.spaces['obs'].shape))
        else:
            raise NotImplementedError()
        if variant['algo_params']['policy_uses_task_params']:
            if variant['algo_params']['concat_task_params_to_policy_obs']:
                obs_dim += int(np.prod(obs_space.spaces['obs_task_params'].shape))
            else:
                raise NotImplementedError
    else:
        # OpenAI Gym Env or DMCS Env with only one obs
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    # if variant['reload_policy_from'] != '':
    #     params = joblib.load(variant['reload_policy_from'])
    #     qf1, qf2, vf, policy = params['qf1'], params['qf2'], params['vf'], params['policy']
    # else:
    net_size = variant['net_size']
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = NewSoftActorCritic(
        env=env,
        training_env=training_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(specs):
    if not specs['use_scripted_policy']:
        policy_is_scripted = False
        policy = joblib.load(specs['expert_path'])['policy']
    else:
        policy_is_scripted = True
        policy = get_scripted_policy(specs['scripted_policy_name'])

    if specs['use_deterministic_expert']:
        policy = MakeDeterministic(policy)
    if ptu.gpu_enabled():
        policy.to(ptu.device)

    env = get_env(specs['env_specs'])
    env.seed(specs['env_specs']['env_seed'])

    # make the replay buffers
    max_path_length = specs['max_path_length']
    if 'wrap_absorbing' in specs and specs['wrap_absorbing']:
        """
        There was an initial implementation for this in v1.0
        in gen_irl_expert_trajs.py
        """
        raise NotImplementedError()
        _max_buffer_size = (max_path_length + 2) * specs['num_rollouts']
    else:
        _max_buffer_size = max_path_length * specs['num_rollouts']
    _max_buffer_size = int(np.ceil(_max_buffer_size / float(specs['subsample_factor'])))

    buffer_constructor = lambda: EnvReplayBuffer(
        _max_buffer_size,
        env,
    )
    train_buffer = buffer_constructor()
    test_buffer = buffer_constructor()

    render = specs['render']
    render_kwargs = specs['render_kwargs']
    check_for_success = specs['check_for_success']

    print('\n')
    # fill the train buffer
    fill_buffer(
        train_buffer,
        env,
        policy,
        specs['num_rollouts'],
        max_path_length,
        no_terminal=specs['no_terminal'],
        policy_is_scripted=policy_is_scripted,
        render=render,
        render_kwargs=render_kwargs,
        check_for_success=check_for_success,
        wrap_absorbing=False,
        subsample_factor=specs['subsample_factor'])

    # fill the test buffer
    fill_buffer(
        test_buffer,
        env,
        policy,
        specs['num_rollouts'],
        max_path_length,
        no_terminal=specs['no_terminal'],
        policy_is_scripted=policy_is_scripted,
        render=render,
        render_kwargs=render_kwargs,
        check_for_success=check_for_success,
        wrap_absorbing=False,
        subsample_factor=specs['subsample_factor'])

    # save the replay buffers
    logger.save_extra_data(
        {'train': train_buffer, 'test': test_buffer},
        name='expert_demos.pkl')

    return 1
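# The demo-generation entry point above reads its configuration from a `specs`
# dictionary. Below is a minimal, hypothetical example of such a dictionary:
# only the key names mirror what the function reads; every value is a
# placeholder for illustration.
example_specs = {
    'use_scripted_policy': False,
    'expert_path': 'path/to/expert/params.pkl',  # pickle must contain a 'policy' entry
    'scripted_policy_name': '',                  # only read when use_scripted_policy is True
    'use_deterministic_expert': True,
    'env_specs': {'env_name': 'halfcheetah', 'env_kwargs': {}, 'env_seed': 0},
    'max_path_length': 1000,
    'num_rollouts': 32,
    'subsample_factor': 20,
    'no_terminal': False,
    'render': False,
    'render_kwargs': {},
    'check_for_success': False,
}
# experiment(example_specs)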
def experiment(variant):
    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1
    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the energy model
    if variant['ebil_params']['mode'] == 'deen':
        """
        ebm_model = MLPEBM(
            obs_dim + action_dim if not variant['ebil_params']['state_only'] else 2*obs_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
            clamp_magnitude=variant['ebm_clamp_magnitude'],
        )
        """
        ebm_exp_name = 'ebm-deen-smm-implementation-' + variant['env_specs']['task_name']
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)
        ebm_id_dirs = os.listdir(ebm_dir)

        tmp = []
        # NOTE: `ebm_id_dics` is expected to be defined at module level; it is
        # not defined inside this function.
        ebm_id_dic = ebm_id_dics[variant['env_specs']['env_name'] + '_' + variant['env_specs']['task_name']]
        if str(variant['ebm_sigma']) in ebm_id_dic['sigma'].keys():
            ebm_id = ebm_id_dic['sigma'][str(variant['ebm_sigma'])]
            tmp = [_ for _ in ebm_id_dirs if ebm_id in _]
        else:
            raise NotImplementedError
        if len(tmp) > 0:
            ebm_id_dirs = tmp

        ebm_id_dirs = sorted(
            ebm_id_dirs,
            key=lambda x: os.path.getmtime(os.path.join(ebm_dir, x)))
        load_ebm_dir = os.path.join(ebm_dir, ebm_id_dirs[-1])  # Choose the last as the load ebm dir

        load_epoch = variant['ebm_epoch']
        load_name = 'itr_{}.pkl'.format(load_epoch)
        if load_epoch == 'best':
            load_name = 'best.pkl'
        load_ebm_path = os.path.join(load_ebm_dir, load_name)

        load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
        ebm_model = load_ebm_pkl['ebm']
        print("loaded EBM from {}".format(load_ebm_path))

    elif variant['ebil_params']['mode'] == 'ae':
        ebm_exp_name = 'ebm-ae-' + variant['env_specs']['env_name'] + '-' + str(
            variant['expert_traj_num']) + '-train'
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)
        ebm_id_dirs = os.listdir(ebm_dir)
        ebm_id_dirs = sorted(
            ebm_id_dirs,
            key=lambda x: os.path.getmtime(os.path.join(ebm_dir, x)))
        load_ebm_dir = os.path.join(ebm_dir, ebm_id_dirs[-1])  # Choose the last as the load ebm dir

        load_epoch = variant['ebm_epoch']
        load_name = 'itr_{}.pkl'.format(load_epoch)
        if load_epoch == 'best':
            load_name = 'best.pkl'
        load_ebm_path = os.path.join(load_ebm_dir, load_name)

        load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
        ebm_model = load_ebm_pkl['ebm']
        print("loaded EBM from {}".format(load_ebm_path))
    else:
        raise NotImplementedError

    # Test
    if variant['test']:
        # NOTE: `target_state_buffer` and `acts` are not defined in this script;
        # this test branch appears to be stale.
        batch_data = target_state_buffer
        obs = torch.Tensor(batch_data[:100])
        exp_input = torch.cat([obs, acts], dim=1).to(ptu.device)
        print("Not expert data", ebm_model(exp_input * 200).mean().item())
        print("Expert data", ebm_model(exp_input).mean().item())
        exit(1)

    # evaluate the EBM reward on a grid over the x-y plane
    x = np.linspace(-1.25, 20, 1000)
    y = np.linspace(-1.25, 10, 1000)
    rewards = []
    for i in range(1000):
        data = []
        for j in range(1000):
            coords = np.array((x[j], y[i]))
            data.append((x[j], y[i]))
        data = np.array(data) / variant['rescale']
        data = torch.Tensor(data).to(ptu.device)
        reward = ebm_model(data).squeeze().detach().cpu().numpy()
        rewards.append(reward)
    # data = np.array(data)
    # data = torch.Tensor(data).to(ptu.device)
    rewards = np.array(rewards)
    print(rewards.shape)
    # rewards = np.reshape(rewards, (1000, 1000))

    fig, ax = plt.subplots(figsize=(6, 6))
    # im = ax.imshow(rewards, cmap=plt.cm.hot_r)
    # plt.colorbar(im)
    h = plt.contourf(rewards, cmap=plt.cm.hot_r)
    cb = plt.colorbar(h)
    ax.set_xticks([58, 293, 528, 764, 999])
    ax.set_xticklabels(['0', '5', '10', '15', '20'])
    ax.set_yticks([111, 555, 999])
    ax.set_yticklabels(['0', '5', '10'])
    plt.savefig('./figs/' + variant['env_specs']['env_name'] + '_' +
                variant['env_specs']['task_name'] + '_' +
                str(variant['ebm_sigma']) + '_' +
                str(variant['ebm_epoch']) + '.pdf',
                bbox_inches='tight')

    return 1
from os import path as osp

import joblib
import yaml

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import rlkit.torch.pytorch_util as ptu
from rlkit.envs import get_env
from rlkit.core import eval_util
from rlkit.samplers import PathSampler
from rlkit.torch.sac.policies import MakeDeterministic
from rlkit.envs.wrappers import ScaledEnv

env_specs = {'env_name': 'halfcheetah', 'env_kwargs': {}, 'eval_env_seed': 3562}
env = get_env(env_specs)
env.seed(env_specs['eval_env_seed'])

with open('expert_demos_listing.yaml', 'r') as f:
    listings = yaml.load(f.read())
expert_demos_path = listings['norm_halfcheetah_32_demos_sub_20']['file_paths'][0]
buffer_save_dict = joblib.load(expert_demos_path)

env = ScaledEnv(
    env,
    obs_mean=buffer_save_dict['obs_mean'],
    obs_std=buffer_save_dict['obs_std'],
    acts_mean=buffer_save_dict['acts_mean'],
    acts_std=buffer_save_dict['acts_std'],
)

bc_policy = joblib.load(
    '/scratch/hdd001/home/kamyar/output/paper-version-hc-bc/paper_version_hc_bc_2019_05_19_00_32_05_0000--s-0/params.pkl'
)['exploration_policy']
bc_policy = MakeDeterministic(bc_policy)
def experiment(variant):
    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1
    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    net_size = variant['net_size']
    num_hidden = variant['num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    trainer = SoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['sac_params'])
    algorithm = TorchRLAlgorithm(
        trainer=trainer,
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        **variant['rl_alg_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
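# For reference, a hypothetical `variant` for the plain SAC entry point above.
# Only the top-level keys come from the function body; the values are
# placeholders, and the nested hyperparameter dicts are left empty so that the
# SoftActorCritic / TorchRLAlgorithm defaults (whatever they are in this
# codebase) would apply.
example_sac_variant = {
    'env_specs': {
        'env_name': 'halfcheetah',
        'env_kwargs': {},
        'eval_env_seed': 0,
        'training_env_seed': 1,
    },
    'net_size': 256,
    'num_hidden_layers': 2,
    'sac_params': {},     # SoftActorCritic hyperparameters (learning rates, discount, ...)
    'rl_alg_params': {},  # TorchRLAlgorithm hyperparameters (epochs, steps per epoch, ...)
}
# experiment(example_sac_variant)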
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    demos_path = listings[variant['expert_name']]['file_paths'][variant['expert_idx']]
    buffer_save_dict = joblib.load(demos_path)
    target_state_buffer = buffer_save_dict['data']
    state_indices = torch.LongTensor(variant['state_indices'])

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1
    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the discriminator model
    if variant['disc_model_type'] == 'resnet_disc':
        disc_model = ResNetAIRLDisc(
            len(state_indices),
            num_layer_blocks=variant['disc_num_blocks'],
            hid_dim=variant['disc_hid_dim'],
            hid_act=variant['disc_hid_act'],
            use_bn=variant['disc_use_bn'],
            clamp_magnitude=variant['disc_clamp_magnitude'])
    else:
        disc_model = MLPDisc(
            len(state_indices),
            num_layer_blocks=variant['disc_num_blocks'],
            hid_dim=variant['disc_hid_dim'],
            hid_act=variant['disc_hid_act'],
            use_bn=variant['disc_use_bn'],
            clamp_magnitude=variant['disc_clamp_magnitude'])

    # set up the algorithm
    trainer = SoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['sac_params'])
    algorithm = AdvSMM(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        discriminator=disc_model,
        policy_trainer=trainer,
        target_state_buffer=target_state_buffer,
        state_indices=state_indices,
        **variant['adv_smm_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']
    if 'minmax_env_with_demo_stats' in variant.keys():
        if variant['minmax_env_with_demo_stats']:
            print('Use minmax envs')
            assert 'norm_train' in buffer_save_dict.keys()
            expert_replay_buffer = buffer_save_dict['norm_train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
    elif variant['minmax_env_with_demo_stats']:
        env = MinmaxEnv(
            env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )
        training_env = MinmaxEnv(
            training_env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1
    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the energy model
    if variant['ebil_params']['mode'] == 'deen':
        """
        ebm_model = MLPEBM(
            obs_dim + action_dim if not variant['ebil_params']['state_only'] else 2*obs_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
            clamp_magnitude=variant['ebm_clamp_magnitude'],
        )
        """
        ebm_exp_name = 'ebm-deen-' + variant['env_specs']['env_name'] + '-' + str(
            variant['expert_traj_num']) + '-train--sigma-' + str(variant['ebm_sigma'])
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)
        ebm_id_dirs = os.listdir(ebm_dir)
        ebm_id_dirs = sorted(
            ebm_id_dirs,
            key=lambda x: os.path.getmtime(os.path.join(ebm_dir, x)))
        load_ebm_dir = os.path.join(ebm_dir, ebm_id_dirs[-1])  # Choose the last as the load ebm dir

        load_epoch = variant['ebm_epoch']
        load_name = 'itr_{}.pkl'.format(load_epoch)
        if load_epoch == 'best':
            load_name = 'best.pkl'
        load_ebm_path = os.path.join(load_ebm_dir, load_name)

        load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
        ebm_model = load_ebm_pkl['ebm']
    elif variant['ebil_params']['mode'] == 'ae':
        ebm_exp_name = 'ebm-ae-' + variant['env_specs']['env_name'] + '-' + str(
            variant['expert_traj_num']) + '-train--sigma-' + str(variant['ebm_sigma'])
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)
        ebm_id_dirs = os.listdir(ebm_dir)
        ebm_id_dirs = sorted(
            ebm_id_dirs,
            key=lambda x: os.path.getmtime(os.path.join(ebm_dir, x)))
        load_ebm_dir = os.path.join(ebm_dir, ebm_id_dirs[-1])  # Choose the last as the load ebm dir

        load_epoch = variant['ebm_epoch']
        load_name = 'itr_{}.pkl'.format(load_epoch)
        if load_epoch == 'best':
            load_name = 'best.pkl'
        load_ebm_path = os.path.join(load_ebm_dir, load_name)

        load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
        ebm_model = load_ebm_pkl['ebm']
    else:
        raise NotImplementedError
    print("loaded EBM from {}".format(load_ebm_path))

    # Test
    if variant['test']:
        batch_data = expert_replay_buffer.random_batch(100, keys=['observations', 'actions'])
        print('ebm_obs: ', np.mean(batch_data['observations'], axis=0))
        obs = torch.Tensor(batch_data['observations'])
        acts = torch.Tensor(batch_data['actions'])
        exp_input = torch.cat([obs, acts], dim=1).to(ptu.device)
        print("Not expert data", ebm_model(exp_input * 200).mean().item())
        print("Expert data", ebm_model(exp_input).mean().item())
        exit(1)

    # set up the algorithm
    trainer = SoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['sac_params'])
    algorithm_pretrain = BC(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        expert_replay_buffer=expert_replay_buffer,
        **variant['bc_params'])
    algorithm = EBIL(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        rew_func=variant['rew_func'],
        cons=variant['cons'],
        ebm=ebm_model,
        policy_trainer=trainer,
        expert_replay_buffer=expert_replay_buffer,
        **variant['ebil_params'])

    if ptu.gpu_enabled():
        algorithm_pretrain.to(ptu.device)
        algorithm.to(ptu.device)
    else:
        algorithm_pretrain.to('cpu')
        algorithm.to('cpu')

    if variant['pretrain']:
        algorithm_pretrain.train()
    algorithm.train()

    return 1
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1
    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the discriminator model
    disc_model = MLPDisc(
        obs_dim + action_dim if not variant['adv_irl_params']['state_only'] else 2 * obs_dim,
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])

    # set up the algorithm
    trainer = SoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['sac_params'])
    algorithm = AdvIRL(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        discriminator=disc_model,
        policy_trainer=trainer,
        expert_replay_buffer=expert_replay_buffer,
        **variant['adv_irl_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
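# All of the experiment() entry points in this section are launched by passing
# them a `variant`/`specs` dictionary. A hypothetical launcher is sketched
# below: the command-line flag, the yaml layout, and the seeding logic are
# illustrative assumptions, not taken from the code above.
import argparse

import numpy as np
import torch
import yaml

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--experiment', help='path to the experiment spec yaml')
    args = parser.parse_args()

    with open(args.experiment, 'r') as f:
        exp_spec = yaml.load(f.read())  # same loader style as the scripts above
    # some specs nest the variant under a 'variant' key, others are flat
    variant = exp_spec['variant'] if 'variant' in exp_spec else exp_spec

    seed = variant.get('seed', 0)
    np.random.seed(seed)
    torch.manual_seed(seed)

    experiment(variant)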