def __init__(self, *args, observation_key=None, desired_goal_key=None, **kwargs):
    HER.__init__(
        self,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    SoftActorCritic.__init__(self, *args, **kwargs)
    assert isinstance(self.replay_buffer, ObsDictRelabelingBuffer)

def __init__(
        self,
        *args,
        her_kwargs,
        sac_kwargs,
        **kwargs
):
    HER.__init__(self, **her_kwargs)
    SoftActorCritic.__init__(self, *args, **kwargs, **sac_kwargs)
    assert isinstance(
        self.replay_buffer, RelabelingReplayBuffer
    ) or isinstance(
        self.replay_buffer, ObsDictRelabelingBuffer
    )

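# The two constructors above appear to belong to a class that mixes HER into
# SAC via explicit cooperative __init__ calls. A minimal sketch of how such a
# class might be declared and used is below; the class name HerSac, the key
# names, and all keyword values are assumptions for illustration only.
class HerSac(HER, SoftActorCritic):
    def __init__(self, *args, her_kwargs, sac_kwargs, **kwargs):
        HER.__init__(self, **her_kwargs)
        SoftActorCritic.__init__(self, *args, **kwargs, **sac_kwargs)
        assert isinstance(self.replay_buffer, ObsDictRelabelingBuffer)

# Hypothetical usage (assumes env, networks, and an ObsDictRelabelingBuffer
# named replay_buffer already exist):
# algo = HerSac(
#     env=env,
#     policy=policy,
#     qf=qf,
#     vf=vf,
#     replay_buffer=replay_buffer,
#     her_kwargs=dict(observation_key='observation',
#                     desired_goal_key='desired_goal'),
#     sac_kwargs=dict(soft_target_tau=1e-3),
# )
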
def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    env = NormalizedBoxEnv(gym.make('Pointmass-v1'))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    env = SawyerXYZEnv(**variant['env_kwargs'])
    env = MultitaskToFlatEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = ConcatMlp(
        input_size=obs_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = NormalizedBoxEnv(CartpoleSwingupSparseEnv())
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = NormalizedBoxEnv(Continuous_MountainCarEnv())
    # env = DIAYNWrappedEnv(NormalizedBoxEnv(HumanoidEnv()))
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    skill_dim = 0  # 50
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + skill_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + skill_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + skill_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + skill_dim,
        action_dim=action_dim,
        # k=4,
    )
    disc = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=skill_dim if skill_dim > 0 else 1,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        # disc=disc,
        # skill_dim=skill_dim,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = NormalizedBoxEnv(InvertedPendulumEnv())
    # ---------
    # env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    # training_env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    env = ReacherEnv()
    training_env = ReacherEnv()
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    total_meta_variable_dim = 0
    for dims in exp_specs['true_meta_variable_dims']:
        total_meta_variable_dim += sum(dims)

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim + total_meta_variable_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + total_meta_variable_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + total_meta_variable_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=training_env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
    return 1

def experiment(variant, env_name, record_name, record_every_episode):
    # env = CartPoleEnv()
    env = gym.make(env_name)
    # A workaround to give this info later on
    # (Such naughty business...)
    randomize_settings = {
        "turnframes": [10, 10],
        "engagement_distance": [100, 200]
    }
    env.record_name = record_name
    env.record_every_episode = record_every_episode
    env.randomize_settings = randomize_settings
    env = OneHotsToDecimalsAndRecordAndRandomize(env)
    obs_dim = int(np.prod(env.observation_space.shape))
    num_categoricals = len(env.action_space.nvec)
    num_categories = env.action_space.nvec[0]

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        # Action is fed in as a raveled one-hot vector
        input_size=obs_dim + int(np.sum(env.action_space.nvec)),
        output_size=1,
        hidden_activation=F.sigmoid,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
        hidden_activation=F.sigmoid,
    )
    # For multi-discrete
    policy = MultiCategoricalPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        num_categoricals=num_categoricals,
        num_categories=num_categories,
        hidden_activation=F.sigmoid,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = NormalizedBoxEnv(MultiGoalEnv(
        actuation_cost_coeff=10,
        distance_cost_coeff=1,
        goal_reward=10,
    ))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    qf = ConcatMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = ConcatMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[100, 100],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    plotter = QFPolicyPlotter(
        qf=qf,
        policy=policy,
        obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
        default_action=[np.nan, np.nan],
        n_samples=100,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        # plotter=plotter,
        # render_eval_paths=True,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = variant['env_class']()
    env = NormalizedBoxEnv(env)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = ConcatMlp(
        input_size=obs_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

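# A hedged sketch of a variant dict that could drive the generic experiment()
# above. The key names mirror the lookups in the snippet; the gym environment
# id and every concrete value are illustrative assumptions, not defaults taken
# from the repository.
import gym

example_variant = dict(
    # env_class only needs to be a zero-argument callable returning a Box env.
    env_class=lambda: gym.make('Pendulum-v0'),
    qf_kwargs=dict(hidden_sizes=[300, 300]),
    vf_kwargs=dict(hidden_sizes=[300, 300]),
    policy_kwargs=dict(hidden_sizes=[300, 300]),
    algo_kwargs=dict(
        num_epochs=100,
        max_path_length=200,
        batch_size=128,
        discount=0.99,
        reward_scale=1,
    ),
)
# experiment(example_variant)
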
def experiment(variant):
    farmlist_base = [('123.123.123.123', 4)]
    farmer = Farmer(farmlist_base)
    environment = acq_remote_env(farmer)
    env = NormalizedBoxEnv(environment)

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        environment_farming=True,
        farmlist_base=farmlist_base,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    ptu._use_gpu = variant['gpu']
    env = NormalizedBoxEnv(gym.make(variant['env_name']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,
        save_environment=False,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
    return algorithm

def experiment(variant):
    logger.add_text_output('./d_text.txt')
    logger.add_tabular_output('./d_tabular.txt')
    logger.set_snapshot_dir('./snaps')

    farmer = Farmer([('0.0.0.0', 1)])
    remote_env = farmer.force_acq_env()
    remote_env.set_spaces()
    env = NormalizedBoxEnv(remote_env)

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    # env = normalize(GymEnv(
    #     'HalfCheetah-v1',
    #     force_reset=True,
    #     record_video=False,
    #     record_log=False,
    # ))
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = ConcatMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = ConcatMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v2'))
    # env = gym.make('HalfCheetah-v2')
    env = MujocoManipEnv("SawyerBinsCanEnv")  # wrap as a gym env
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
        action_skip=ACTION_SKIP,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    env = NormalizedBoxEnv(MultiGoalEnv(
        actuation_cost_coeff=10,
        distance_cost_coeff=1,
        goal_reward=10,
    ))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    qf = ConcatMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = ConcatMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[100, 100],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    with torch.autograd.profiler.profile() as prof:
        algorithm.train()
    prof.export_chrome_trace("tmp-torch-chrome-trace.prof")

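# Note on the profiling wrapper above: torch.autograd.profiler.profile records
# operator-level timings while algorithm.train() runs, and the exported chrome
# trace file can be opened in chrome://tracing (or Perfetto) to inspect where
# time is spent. A quick text summary is also available from the profiler
# object, e.g.:
#
#     print(prof.key_averages().table(sort_by="cpu_time_total"))
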
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the discriminator model
    disc_model = MLPDisc(
        obs_dim + action_dim
        if not variant['adv_irl_params']['state_only'] else 2 * obs_dim,
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'],
    )

    # set up the algorithm
    trainer = SoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['sac_params']
    )
    algorithm = AdvIRL(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        discriminator=disc_model,
        policy_trainer=trainer,
        expert_replay_buffer=expert_replay_buffer,
        **variant['adv_irl_params']
    )

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1

def experiment(log_dir, variant_overwrite, cpu=False):
    if not cpu:
        ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)

    # Load experiment from file.
    env, _, data, variant = load_experiment(log_dir, variant_overwrite)
    assert all([
        a == b for a, b in zip(env.sampled_goal,
                               variant['env_kwargs']['goal_prior'])
    ])

    # Set log directory.
    exp_id = 'eval/ne{}-mpl{}-{}-rs{}/nhp{}'.format(
        variant['algo_kwargs']['num_episodes'],
        variant['algo_kwargs']['max_path_length'],
        ','.join(variant_overwrite['env_kwargs']['shaped_rewards']),
        variant['algo_kwargs']['reward_scale'],
        variant['historical_policies_kwargs']['num_historical_policies'],
    )
    exp_id = create_exp_name(exp_id)
    out_dir = os.path.join(log_dir, exp_id)
    print('Logging to:', out_dir)
    setup_logger(
        log_dir=out_dir,
        variant=variant,
        snapshot_mode='none',
        snapshot_gap=50,
    )

    # Load trained model from file.
    policy = data['policy']
    vf = data['vf']
    qf = data['qf']
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs'],
    )

    # Overwrite algorithm for p(z) adaptation (if model is SMM).
    if variant['intrinsic_reward'] == 'smm':
        discriminator = data['discriminator']
        density_model = data['density_model']
        SMMHook(
            base_algorithm=algorithm,
            discriminator=discriminator,
            density_model=density_model,
            **variant['smm_kwargs'],
        )

    # Overwrite algorithm for historical averaging.
    if variant['historical_policies_kwargs']['num_historical_policies'] > 0:
        HistoricalPoliciesHook(
            base_algorithm=algorithm,
            log_dir=log_dir,
            **variant['historical_policies_kwargs'],
        )

    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(demos_path)
    target_state_buffer = buffer_save_dict['data']
    state_indices = torch.LongTensor(variant['state_indices'])

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the discriminator model
    if variant['disc_model_type'] == 'resnet_disc':
        disc_model = ResNetAIRLDisc(
            len(state_indices),
            num_layer_blocks=variant['disc_num_blocks'],
            hid_dim=variant['disc_hid_dim'],
            hid_act=variant['disc_hid_act'],
            use_bn=variant['disc_use_bn'],
            clamp_magnitude=variant['disc_clamp_magnitude'],
        )
    else:
        disc_model = MLPDisc(
            len(state_indices),
            num_layer_blocks=variant['disc_num_blocks'],
            hid_dim=variant['disc_hid_dim'],
            hid_act=variant['disc_hid_act'],
            use_bn=variant['disc_use_bn'],
            clamp_magnitude=variant['disc_clamp_magnitude'],
        )

    # set up the algorithm
    trainer = SoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['sac_params']
    )
    algorithm = AdvSMM(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        discriminator=disc_model,
        policy_trainer=trainer,
        target_state_buffer=target_state_buffer,
        state_indices=state_indices,
        **variant['adv_smm_params']
    )

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1

def __init__(self, use_history, SMM_path, num_skills):
    self.use_history = use_history
    log_dir = SMM_path
    self.num_skills = num_skills
    from rlkit.torch.sac.sac import SoftActorCritic
    self.config = dict(
        env_kwargs=dict(
            # Test-time object goal position
            goal_prior=[1.12871704, 0.46767739, 0.42],
            sample_goal=False,
            shaped_rewards=[
                'object_off_table', 'object_goal_indicator',
                'object_gripper_indicator', 'action_penalty'
            ],
            terminate_upon_success=False,
            terminate_upon_failure=False,
        ),
        test_goal=[1.12871704, 0.46767739, 0.42],
        algo_kwargs=dict(
            max_path_length=50,  # Maximum path length in the environment
            num_episodes=100,  # Number of test episodes
            reward_scale=100,  # Weight of the extrinsic reward relative to the SAC reward
            collection_mode='episodic',  # Each epoch is one episode
            num_updates_per_episode=0,  # Evaluate without additional training
        ),
        smm_kwargs=dict(
            # p(z) coeff for SMM posterior adaptation (higher value corresponds
            # to more uniform p(z))
            update_p_z_prior_coeff=1,
            # Turn off SMM reward.
            state_entropy_coeff=0,
            latent_entropy_coeff=0,
            latent_conditional_entropy_coeff=0,
            discriminator_lr=0,
        ),
    )
    with open('/home/zj/Desktop/sample/smm/configs/test_no_ha_point.json') as f:
        exp_params = json.load(f)
    overwrite_dict(self.config, exp_params)

    ptu.set_gpu_mode(True)
    env, _, data, variant = load_experiment(log_dir, self.config)
    variant['historical_policies_kwargs']['num_historical_policies'] = \
        10 if self.use_history else 0

    self.policy = data['policy']
    vf = data['vf']
    qf = data['qf']
    self.algorithm = SoftActorCritic(
        env=env,
        training_env=env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=self.policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs'],
    )
    self.policy.to('cuda')

    if variant['intrinsic_reward'] == 'smm':
        discriminator = data['discriminator']
        density_model = data['density_model']
        SMMHook(
            base_algorithm=self.algorithm,
            discriminator=discriminator,
            density_model=density_model,
            **variant['smm_kwargs'],
        )

    # Overwrite algorithm for historical averaging.
    if variant['historical_policies_kwargs']['num_historical_policies'] > 0:
        HistoricalPoliciesHook(
            base_algorithm=self.algorithm,
            log_dir=log_dir,
            **variant['historical_policies_kwargs'],
        )

def experiment(log_dir, variant_overwrite, cpu=False):
    if not cpu:
        ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)

    # Load experiment from file.
    env, _, data, variant = load_experiment(log_dir, variant_overwrite)
    # assert all([a == b for a, b in zip(env.sampled_goal,
    #                                    variant['env_kwargs']['goal_prior'])])

    # Set log directory.
    exp_id = 'eval/ne{}-mpl{}-{}-rs{}/nhp{}'.format(
        variant['algo_kwargs']['num_episodes'],
        variant['algo_kwargs']['max_path_length'],
        ','.join(variant_overwrite['env_kwargs']['shaped_rewards']),
        variant['algo_kwargs']['reward_scale'],
        variant['historical_policies_kwargs']['num_historical_policies'],
    )
    exp_id = create_exp_name(exp_id)
    out_dir = os.path.join(log_dir, exp_id)
    print('Logging to:', out_dir)
    setup_logger(
        log_dir=out_dir,
        variant=variant,
        snapshot_mode='none',
        snapshot_gap=50,
    )

    # Load trained model from file.
    policy = data['policy']
    vf = data['vf']
    qf = data['qf']
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs'],
    )

    # Overwrite algorithm for p(z) adaptation (if model is SMM).
    if variant['intrinsic_reward'] == 'smm':
        discriminator = data['discriminator']
        density_model = data['density_model']
        SMMHook(
            base_algorithm=algorithm,
            discriminator=discriminator,
            density_model=density_model,
            **variant['smm_kwargs'],
        )

    # Overwrite algorithm for historical averaging.
    if variant['historical_policies_kwargs']['num_historical_policies'] > 0:
        HistoricalPoliciesHook(
            base_algorithm=algorithm,
            log_dir=log_dir,
            **variant['historical_policies_kwargs'],
        )

    algorithm.to(ptu.device)
    # algorithm.train()

    samples = algorithm.get_eval_paths()
    # for path in samples:
    #     print(path['observations'])
    # plt.figure()
    # plt.plot(samples[0]['observations'][:, 0], samples[0]['observations'][:, 1])
    # plt.plot(3, 2)
    # plt.show()
    print(env.reset())
    print(samples[0]['observations'])
    i = 0
    for path in samples:
        np.save('./outtem/out%i.npy' % i, path['observations'])
        i = i + 1
    # print(algorithm.policy.get_action(np.array([0, 0])))

    from rlkit.samplers.util import rollout
    from rlkit.samplers.in_place import InPlacePathSampler
    # path = rollout(env, algorithm.eval_policy, 50)
    eval_sampler = InPlacePathSampler(
        env=env,
        policy=algorithm.eval_policy,
        max_samples=100,
        max_path_length=50,
    )
    path = algorithm.eval_sampler.obtain_samples()
    print(path[0]['observations'])

def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']
    if 'minmax_env_with_demo_stats' in variant.keys():
        if variant['minmax_env_with_demo_stats']:
            print('Use minmax envs')
            assert 'norm_train' in buffer_save_dict.keys()
            expert_replay_buffer = buffer_save_dict['norm_train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
    elif variant['minmax_env_with_demo_stats']:
        env = MinmaxEnv(
            env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )
        training_env = MinmaxEnv(
            training_env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the energy model
    if variant['ebil_params']['mode'] == 'deen':
        """
        ebm_model = MLPEBM(
            obs_dim + action_dim if not variant['ebil_params']['state_only'] else 2*obs_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
            clamp_magnitude=variant['ebm_clamp_magnitude'],
        )
        """
        ebm_exp_name = 'ebm-deen-' + variant['env_specs'][
            'env_name'] + '-' + str(
                variant['expert_traj_num']) + '-train--sigma-' + str(
                    variant['ebm_sigma'])
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)

        ebm_id_dirs = os.listdir(ebm_dir)
        ebm_id_dirs = sorted(
            ebm_id_dirs,
            key=lambda x: os.path.getmtime(os.path.join(ebm_dir, x)))

        load_ebm_dir = os.path.join(
            ebm_dir, ebm_id_dirs[-1])  # Choose the last as the load ebm dir

        load_epoch = variant['ebm_epoch']
        load_name = 'itr_{}.pkl'.format(load_epoch)
        if load_epoch == 'best':
            load_name = 'best.pkl'
        load_ebm_path = os.path.join(load_ebm_dir, load_name)

        load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
        ebm_model = load_ebm_pkl['ebm']

    elif variant['ebil_params']['mode'] == 'ae':
        ebm_exp_name = 'ebm-ae-' + variant['env_specs'][
            'env_name'] + '-' + str(
                variant['expert_traj_num']) + '-train--sigma-' + str(
                    variant['ebm_sigma'])
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)

        ebm_id_dirs = os.listdir(ebm_dir)
        ebm_id_dirs = sorted(
            ebm_id_dirs,
            key=lambda x: os.path.getmtime(os.path.join(ebm_dir, x)))

        load_ebm_dir = os.path.join(
            ebm_dir, ebm_id_dirs[-1])  # Choose the last as the load ebm dir

        load_epoch = variant['ebm_epoch']
        load_name = 'itr_{}.pkl'.format(load_epoch)
        if load_epoch == 'best':
            load_name = 'best.pkl'
        load_ebm_path = os.path.join(load_ebm_dir, load_name)

        load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
        ebm_model = load_ebm_pkl['ebm']
    else:
        raise NotImplementedError

    print("loaded EBM from {}".format(load_ebm_path))

    # Test
    if variant['test']:
        batch_data = expert_replay_buffer.random_batch(
            100, keys=['observations', 'actions'])
        print('ebm_obs: ', np.mean(batch_data['observations'], axis=0))
        obs = torch.Tensor(batch_data['observations'])
        acts = torch.Tensor(batch_data['actions'])
        exp_input = torch.cat([obs, acts], dim=1).to(ptu.device)
        print("Not expert data", ebm_model(exp_input * 200).mean().item())
        print("Expert data", ebm_model(exp_input).mean().item())
        exit(1)

    # set up the algorithm
    trainer = SoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['sac_params']
    )
    algorithm_pretrain = BC(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        expert_replay_buffer=expert_replay_buffer,
        **variant['bc_params']
    )
    algorithm = EBIL(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        rew_func=variant['rew_func'],
        cons=variant['cons'],
        ebm=ebm_model,
        policy_trainer=trainer,
        expert_replay_buffer=expert_replay_buffer,
        **variant['ebil_params']
    )

    if ptu.gpu_enabled():
        algorithm_pretrain.to(ptu.device)
        algorithm.to(ptu.device)
    else:
        algorithm_pretrain.to('cpu')
        algorithm.to('cpu')

    if variant['pretrain']:
        algorithm_pretrain.train()
    algorithm.train()

    return 1

def experiment(variant):
    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    net_size = variant['net_size']
    num_hidden = variant['num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    trainer = SoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['sac_params']
    )
    algorithm = TorchRLAlgorithm(
        trainer=trainer,
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        **variant['rl_alg_params']
    )

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1

def experiment(variant):
    intrinsic_reward = variant['intrinsic_reward']

    # Create environment.
    num_skills = variant['smm_kwargs']['num_skills'] if variant[
        'intrinsic_reward'] == 'smm' else 0
    env, training_env = create_env(variant['env_id'], variant['env_kwargs'],
                                   num_skills)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    # Initialize networks.
    net_size = variant['net_size']
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        hidden_sizes=[net_size, net_size],
        output_size=1,
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        hidden_sizes=[net_size, net_size],
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        hidden_sizes=[net_size, net_size],
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=training_env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs'])

    if intrinsic_reward == 'smm':
        discriminator = FlattenMlp(
            input_size=obs_dim - num_skills,
            hidden_sizes=[net_size, net_size],
            output_size=num_skills,
        )
        density_model = VAEDensity(
            input_size=obs_dim,
            num_skills=num_skills,
            code_dim=128,
            **variant['vae_density_kwargs'],
        )
        # Overwrite appropriate functions of algorithm.
        smm_algorithm_hook = SMMHook(
            base_algorithm=algorithm,
            discriminator=discriminator,
            density_model=density_model,
            **variant['smm_kwargs'],
        )
    elif intrinsic_reward == 'icm':
        embedding_model = FlattenMlp(
            input_size=obs_dim,
            hidden_sizes=[net_size, net_size],
            output_size=net_size,
        )
        forward_model = FlattenMlp(
            input_size=net_size + action_dim,
            hidden_sizes=[net_size, net_size],
            output_size=net_size,
        )
        inverse_model = FlattenMlp(
            input_size=net_size + net_size,
            hidden_sizes=[],
            output_size=action_dim,
        )
        # Overwrite appropriate functions of algorithm.
        ICMHook(
            base_algorithm=algorithm,
            embedding_model=embedding_model,
            forward_model=forward_model,
            inverse_model=inverse_model,
            **variant['icm_kwargs'],
        )
    elif intrinsic_reward == 'count':
        count_algorithm_hook = CountHook(
            base_algorithm=algorithm,
            **variant['count_kwargs'],
        )
    elif intrinsic_reward == 'pseudocount':
        density_model = VAEDensity(
            input_size=obs_dim,
            num_skills=0,
            code_dim=128,
            **variant['vae_density_kwargs'],
        )
        # Overwrite appropriate functions of algorithm.
        PseudocountHook(
            base_algorithm=algorithm,
            density_model=density_model,
            **variant['pseudocount_kwargs'],
        )

    algorithm.to(ptu.device)
    algorithm.train()

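# A hedged example of a variant for the experiment() above, showing how the
# intrinsic_reward switch selects which exploration hook is attached. Key names
# follow the lookups in the snippet; the env id and all concrete values are
# assumptions for illustration, not the original experiment's settings.
example_variant = dict(
    env_id='Manipulation-v0',  # hypothetical environment id
    env_kwargs=dict(),
    net_size=300,
    # One of 'smm', 'icm', 'count', 'pseudocount'; anything else runs plain SAC.
    intrinsic_reward='smm',
    algo_kwargs=dict(reward_scale=100),
    smm_kwargs=dict(num_skills=4),
    vae_density_kwargs=dict(),
    icm_kwargs=dict(),
    count_kwargs=dict(),
    pseudocount_kwargs=dict(),
)
# experiment(example_variant)
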
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    print(demos_path)
    buffer_save_dict = joblib.load(demos_path)
    target_state_buffer = buffer_save_dict['data']
    # target_state_buffer /= variant['rescale']
    state_indices = torch.LongTensor(variant['state_indices'])

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the energy model
    if variant['ebil_params']['mode'] == 'deen':
        """
        ebm_model = MLPEBM(
            obs_dim + action_dim if not variant['ebil_params']['state_only'] else 2*obs_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
            clamp_magnitude=variant['ebm_clamp_magnitude'],
        )
        """
        ebm_exp_name = 'ebm-deen-' + variant['env_specs'][
            'env_name'] + '-' + str(
                variant['expert_traj_num']) + '-train--sigma-' + str(
                    variant['ebm_sigma'])
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)

        load_ebm_dir = ebm_dir
        load_epoch = variant['ebm_epoch']
        load_name = 'itr_{}.pkl'.format(load_epoch)
        if load_epoch == 'best':
            load_name = 'best.pkl'
        load_ebm_path = os.path.join(load_ebm_dir, load_name)

        load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
        ebm_model = load_ebm_pkl['ebm']
    else:
        raise NotImplementedError

    # Test
    if variant['test']:
        batch_data = target_state_buffer / variant['rescale']
        obs = torch.Tensor(batch_data[:1000]).to(ptu.device)
        print("Not expert data", ebm_model(obs * 200).mean().item())
        print("Expert data", ebm_model(obs).mean().item())
        exit(1)

    # set up the algorithm
    trainer = SoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['sac_params']
    )
    algorithm = EBIL(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        rew_func=variant['rew_func'],
        cons=variant['cons'],
        rescale=variant['rescale'],
        ebm=ebm_model,
        policy_trainer=trainer,
        target_state_buffer=target_state_buffer,
        state_indices=state_indices,
        **variant['ebil_params']
    )

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1

def experiment(args):
    if not args.cpu:
        ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)

    variant_overwrite = dict(
        # Evaluate model on num_episodes.
        algo_kwargs=dict(
            reward_scale=args.reward_scale,
            collection_mode='episodic',
            num_episodes=args.num_episodes,
            max_path_length=args.max_path_length,
            render=args.render,
            # Evaluate without additional training
            num_updates_per_episode=0,
            min_num_steps_before_training=(
                args.max_path_length * args.num_episodes + 1),
        ),
        # Environment settings
        env_kwargs=dict(
            sample_goal=False,
            goal_prior=args.test_goal,
            shaped_rewards=[
                'object_off_table', 'object_goal_indicator',
                'object_gripper_indicator', 'action_penalty'
            ],
            terminate_upon_success=False,
            terminate_upon_failure=False,
        ),
        # SMM settings
        smm_kwargs=dict(
            # Posterior adaptation of latent skills p(z)
            update_p_z_prior_coeff=args.update_p_z_prior_coeff,
            # Turn off SMM reward.
            state_entropy_coeff=0,
            latent_entropy_coeff=0,
            latent_conditional_entropy_coeff=0,
            discriminator_lr=0,
        ),
    )

    # Load experiment from file.
    env, _, data, variant = load_experiment(args.logdir, variant_overwrite)
    assert all([a == b for a, b in zip(env.sampled_goal, args.test_goal)])
    variant.update(test_goal=list(env.sampled_goal))
    if args.num_historical_policies > 0:
        variant.update(historical_policies_kwargs=dict(
            log_dir=args.logdir,
            num_historical_policies=args.num_historical_policies,
            sample_strategy=args.sample_strategy,
            on_policy_prob=args.on_policy_prob,
        ))

    # Set log directory.
    exp_id = 'eval/ne{}-mpl{}-{}-rs{}/nhp{}-{}-opp{}'.format(
        args.num_episodes,
        args.max_path_length,
        ','.join(variant_overwrite['env_kwargs']['shaped_rewards']),
        args.reward_scale,
        args.num_historical_policies,
        args.sample_strategy,
        args.on_policy_prob,
    )
    exp_id = create_exp_name(exp_id)
    log_dir = os.path.join(args.logdir, exp_id)
    print('Logging to:', log_dir)
    setup_logger(
        log_dir=log_dir,
        variant=variant,
        snapshot_mode='none',
        snapshot_gap=50,
    )

    # Load trained model from file.
    policy = data['policy']
    vf = data['vf']
    qf = data['qf']
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs'],
    )

    # Overwrite algorithm for p(z) adaptation (if model is SMM).
    if 'smm_kwargs' in variant:
        discriminator = data['discriminator']
        density_model = data['density_model']
        SMMHook(
            base_algorithm=algorithm,
            discriminator=discriminator,
            density_model=density_model,
            **variant['smm_kwargs'],
        )

    # Overwrite algorithm for historical averaging.
    if args.num_historical_policies > 0:
        HistoricalPoliciesHook(
            base_algorithm=algorithm,
            **variant['historical_policies_kwargs'],
        )

    algorithm.to(ptu.device)
    algorithm.train()