def her_sac_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['normalize']:
        # Wrap before anything reads the spaces or stores transitions, so the
        # replay buffer and the algorithm see the same (normalized) env.
        env = NormalizedBoxEnv(env)
    observation_key = variant.get('observation_key', 'observation')
    desired_goal_key = variant.get('desired_goal_key', 'desired_goal')
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['replay_buffer_kwargs']
    )
    obs_dim = env.observation_space.spaces[observation_key].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces[desired_goal_key].low.size
    qf = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = ConcatMlp(
        input_size=obs_dim + goal_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + goal_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    algorithm = HerSac(
        env,
        qf=qf,
        vf=vf,
        policy=policy,
        replay_buffer=replay_buffer,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        qf.to(ptu.device)
        vf.to(ptu.device)
        policy.to(ptu.device)
        algorithm.to(ptu.device)
    algorithm.train()
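# --- Example usage (illustrative sketch, not part of the original launcher) ---
# The variant below shows the keys `her_sac_experiment` consumes. The env class
# and every hyperparameter value are assumptions chosen for illustration; any
# multiworld-style dict-observation goal env should work.
def example_her_sac_variant():
    from multiworld.envs.mujoco.sawyer_xyz.sawyer_reach import (
        SawyerReachXYEnv,  # hypothetical env choice
    )
    return dict(
        env_class=SawyerReachXYEnv,
        env_kwargs=dict(),
        normalize=True,
        observation_key='observation',
        desired_goal_key='desired_goal',
        replay_buffer_kwargs=dict(
            max_size=int(1e6),
            fraction_goals_rollout_goals=0.2,  # HER 'future'-style relabeling
            fraction_goals_env_goals=0.0,
        ),
        qf_kwargs=dict(hidden_sizes=[400, 300]),
        vf_kwargs=dict(hidden_sizes=[400, 300]),
        policy_kwargs=dict(hidden_sizes=[400, 300]),
        algo_kwargs=dict(  # assumed HerSac hyperparameters
            num_epochs=100,
            batch_size=128,
            discount=0.99,
        ),
    )
# her_sac_experiment(example_her_sac_variant())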
def her_td3_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['normalize']:
        # Normalize before building the buffer and exploration strategy so
        # every component sees the same action space.
        env = NormalizedBoxEnv(env)
    observation_key = variant.get('observation_key', 'observation')
    desired_goal_key = variant.get('desired_goal_key', 'desired_goal')
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['replay_buffer_kwargs']
    )
    obs_dim = env.observation_space.spaces[observation_key].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces[desired_goal_key].low.size
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            **variant['es_kwargs']
        )
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # constant sigma
            **variant['es_kwargs']
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
            **variant['es_kwargs']
        )
    else:
        raise ValueError("Invalid exploration type: " + exploration_type)
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        qf1.to(ptu.device)
        qf2.to(ptu.device)
        policy.to(ptu.device)
        algorithm.to(ptu.device)
    algorithm.train()
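# --- Example usage (illustrative sketch) ---
# Keys consumed by `her_td3_experiment`; all values are assumptions for
# illustration. `exploration_type` must be one of 'ou', 'gaussian', 'epsilon'.
def example_her_td3_variant():
    from multiworld.envs.mujoco.sawyer_xyz.sawyer_reach import (
        SawyerReachXYEnv,  # hypothetical env choice
    )
    return dict(
        env_class=SawyerReachXYEnv,
        env_kwargs=dict(),
        normalize=True,
        exploration_type='ou',  # or 'gaussian' / 'epsilon'
        es_kwargs=dict(),
        replay_buffer_kwargs=dict(
            max_size=int(1e6),
            fraction_goals_rollout_goals=0.2,
            fraction_goals_env_goals=0.0,
        ),
        qf_kwargs=dict(hidden_sizes=[400, 300]),
        policy_kwargs=dict(hidden_sizes=[400, 300]),
        algo_kwargs=dict(  # assumed HerTd3 hyperparameters
            num_epochs=100,
            batch_size=128,
            discount=0.99,
        ),
    )
# her_td3_experiment(example_her_td3_variant())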
def _disentangled_her_twin_sac_experiment_v2(
        max_path_length,
        encoder_kwargs,
        disentangled_qf_kwargs,
        qf_kwargs,
        twin_sac_trainer_kwargs,
        replay_buffer_kwargs,
        policy_kwargs,
        evaluation_goal_sampling_mode,
        exploration_goal_sampling_mode,
        algo_kwargs,
        save_video=True,
        env_id=None,
        env_class=None,
        env_kwargs=None,
        observation_key='state_observation',
        desired_goal_key='state_desired_goal',
        achieved_goal_key='state_achieved_goal',
        # Video parameters
        latent_dim=2,
        save_video_kwargs=None,
        **kwargs
):
    from functools import partial

    import rlkit.samplers.rollout_functions as rf
    import rlkit.torch.pytorch_util as ptu
    from rlkit.data_management.obs_dict_replay_buffer import \
        ObsDictRelabelingBuffer
    from rlkit.samplers.data_collector import GoalConditionedPathCollector
    from rlkit.torch.her.her import HERTrainer
    from rlkit.torch.networks import ConcatMlp
    from rlkit.torch.sac.policies import MakeDeterministic, TanhGaussianPolicy
    from rlkit.torch.sac.sac import SACTrainer
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    # Project-local helpers; the exact module paths may differ in your checkout.
    from rlkit.torch.disentanglement.networks import DisentangledMlpQf, Detach
    from rlkit.visualization.video import (
        add_heatmap_imgs_to_o_dict,
        get_extra_imgs,
        get_save_video_function,
    )

    if save_video_kwargs is None:
        save_video_kwargs = {}
    if env_kwargs is None:
        env_kwargs = {}
    assert env_id or env_class

    if env_id:
        import gym
        import multiworld
        multiworld.register_all_envs()
        train_env = gym.make(env_id)
        eval_env = gym.make(env_id)
    else:
        eval_env = env_class(**env_kwargs)
        train_env = env_class(**env_kwargs)

    obs_dim = train_env.observation_space.spaces[observation_key].low.size
    goal_dim = train_env.observation_space.spaces[desired_goal_key].low.size
    action_dim = train_env.action_space.low.size

    encoder = ConcatMlp(
        input_size=goal_dim,
        output_size=latent_dim,
        **encoder_kwargs
    )
    qf1 = DisentangledMlpQf(
        encoder=encoder,
        preprocess_obs_dim=obs_dim,
        action_dim=action_dim,
        qf_kwargs=qf_kwargs,
        **disentangled_qf_kwargs
    )
    qf2 = DisentangledMlpQf(
        encoder=encoder,
        preprocess_obs_dim=obs_dim,
        action_dim=action_dim,
        qf_kwargs=qf_kwargs,
        **disentangled_qf_kwargs
    )
    # The target networks share the encoder but block gradients through it.
    target_qf1 = DisentangledMlpQf(
        encoder=Detach(encoder),
        preprocess_obs_dim=obs_dim,
        action_dim=action_dim,
        qf_kwargs=qf_kwargs,
        **disentangled_qf_kwargs
    )
    target_qf2 = DisentangledMlpQf(
        encoder=Detach(encoder),
        preprocess_obs_dim=obs_dim,
        action_dim=action_dim,
        qf_kwargs=qf_kwargs,
        **disentangled_qf_kwargs
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + goal_dim,
        action_dim=action_dim,
        **policy_kwargs
    )
    replay_buffer = ObsDictRelabelingBuffer(
        env=train_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **replay_buffer_kwargs
    )
    sac_trainer = SACTrainer(
        env=train_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **twin_sac_trainer_kwargs
    )
    trainer = HERTrainer(sac_trainer)
    eval_path_collector = GoalConditionedPathCollector(
        eval_env,
        MakeDeterministic(policy),
        max_path_length,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        goal_sampling_mode=evaluation_goal_sampling_mode,
    )
    expl_path_collector = GoalConditionedPathCollector(
        train_env,
        policy,
        max_path_length,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        goal_sampling_mode=exploration_goal_sampling_mode,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=train_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=max_path_length,
        **algo_kwargs,
    )
    algorithm.to(ptu.device)

    if save_video:
        save_vf_heatmap = save_video_kwargs.get('save_vf_heatmap', True)

        def v_function(obs):
            action = policy.get_actions(obs)
            obs, action = ptu.from_numpy(obs), ptu.from_numpy(action)
            return qf1(obs, action, return_individual_q_vals=True)

        add_heatmap = partial(add_heatmap_imgs_to_o_dict, v_function=v_function)
        rollout_function = rf.create_rollout_function(
            rf.multitask_rollout,
            max_path_length=max_path_length,
            observation_key=observation_key,
            desired_goal_key=desired_goal_key,
            full_o_postprocess_func=add_heatmap if save_vf_heatmap else None,
        )
        img_keys = ['v_vals'] + [
            'v_vals_dim_{}'.format(dim) for dim in range(latent_dim)
        ]
        eval_video_func = get_save_video_function(
            rollout_function,
            eval_env,
            MakeDeterministic(policy),
            tag="eval",
            get_extra_imgs=partial(get_extra_imgs, img_keys=img_keys),
            **save_video_kwargs
        )
        train_video_func = get_save_video_function(
            rollout_function,
            train_env,
            policy,
            tag="train",
            get_extra_imgs=partial(get_extra_imgs, img_keys=img_keys),
            **save_video_kwargs
        )
        decoder = ConcatMlp(
            input_size=obs_dim,
            output_size=obs_dim,
            hidden_sizes=[128, 128],
        )
        decoder.to(ptu.device)

        # algorithm.post_train_funcs.append(train_decoder(variant, encoder, decoder))
        # algorithm.post_train_funcs.append(plot_encoder_function(variant, encoder))
        # algorithm.post_train_funcs.append(plot_buffer_function(
        #     save_video_period, 'state_achieved_goal'))
        # algorithm.post_train_funcs.append(plot_buffer_function(
        #     save_video_period, 'state_desired_goal'))
        algorithm.post_train_funcs.append(eval_video_func)
        algorithm.post_train_funcs.append(train_video_func)

    algorithm.train()
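# --- Example usage (illustrative sketch) ---
# A hypothetical call showing the kwargs `_disentangled_her_twin_sac_experiment_v2`
# expects. The env id, goal-sampling modes, and all hyperparameters are
# assumptions; the modes in particular depend on the env wrapper in use.
def example_disentangled_run():
    _disentangled_her_twin_sac_experiment_v2(
        max_path_length=100,
        env_id='SawyerPushNIPSEasy-v0',  # hypothetical multiworld env id
        encoder_kwargs=dict(hidden_sizes=[64, 64]),
        disentangled_qf_kwargs=dict(),
        qf_kwargs=dict(hidden_sizes=[400, 300]),
        policy_kwargs=dict(hidden_sizes=[400, 300]),
        twin_sac_trainer_kwargs=dict(
            discount=0.99,
            soft_target_tau=1e-3,
            use_automatic_entropy_tuning=True,
        ),
        replay_buffer_kwargs=dict(
            max_size=int(1e6),
            fraction_goals_rollout_goals=0.2,
            fraction_goals_env_goals=0.0,
        ),
        algo_kwargs=dict(
            num_epochs=100,
            batch_size=128,
            num_trains_per_train_loop=1000,
            num_expl_steps_per_train_loop=1000,
            num_eval_steps_per_epoch=1000,
            min_num_steps_before_training=1000,
        ),
        evaluation_goal_sampling_mode='env',   # assumption
        exploration_goal_sampling_mode='env',  # assumption
        latent_dim=2,
        save_video=False,
    )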
def grill_her_sac_experiment(variant):
    env = variant["env_class"](**variant['env_kwargs'])
    render = variant["render"]
    rdim = variant["rdim"]
    vae_path = variant["vae_paths"][str(rdim)]
    reward_params = variant.get("reward_params", dict())
    init_camera = variant.get("init_camera", None)
    if init_camera is None:
        camera_name = "topview"
    else:
        camera_name = None
    env = ImageEnv(
        env,
        84,
        init_camera=init_camera,
        camera_name=camera_name,
        transpose=True,
        normalize=True,
    )
    env = VAEWrappedEnv(
        env,
        vae_path,
        decode_goals=render,
        render_goals=render,
        render_rollouts=render,
        reward_params=reward_params,
        **variant.get('vae_wrapped_env_kwargs', {})
    )
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    observation_key = variant.get('observation_key', 'latent_observation')
    desired_goal_key = variant.get('desired_goal_key', 'latent_desired_goal')
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    obs_dim = (
        env.observation_space.spaces[observation_key].low.size
        + env.observation_space.spaces[desired_goal_key].low.size
    )
    action_dim = env.action_space.low.size
    hidden_sizes = variant.get('hidden_sizes', [400, 300])
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    vf = ConcatMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=hidden_sizes,
    )

    training_mode = variant.get("training_mode", "train")
    testing_mode = variant.get("testing_mode", "test")
    # Deep-copy the wrapped env via a pickle round-trip so each copy can be
    # put into a different mode without interfering with the others.
    testing_env = pickle.loads(pickle.dumps(env))
    testing_env.mode(testing_mode)
    training_env = pickle.loads(pickle.dumps(env))
    training_env.mode(training_mode)
    relabeling_env = pickle.loads(pickle.dumps(env))
    relabeling_env.mode(training_mode)
    relabeling_env.disable_render()
    video_vae_env = pickle.loads(pickle.dumps(env))
    video_vae_env.mode("video_vae")
    video_goal_env = pickle.loads(pickle.dumps(env))
    video_goal_env.mode("video_env")

    replay_buffer = ObsDictRelabelingBuffer(
        env=relabeling_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_kwargs']
    )
    variant["algo_kwargs"]["replay_buffer"] = replay_buffer
    algorithm = HerSac(
        testing_env,
        training_env=training_env,
        qf=qf,
        vf=vf,
        policy=policy,
        render=render,
        render_during_eval=render,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['algo_kwargs']
    )

    if ptu.gpu_enabled():
        print("using GPU")
        qf.to(ptu.device)
        vf.to(ptu.device)
        policy.to(ptu.device)
        algorithm.to(ptu.device)
        for e in [testing_env, training_env, video_vae_env, video_goal_env]:
            e.vae.to(ptu.device)

    algorithm.train()

    if variant.get("save_video", True):
        logdir = logger.get_snapshot_dir()
        policy.train(False)  # switch the policy to evaluation mode
        filename = osp.join(logdir, 'video_final_env.mp4')
        rollout_function = rf.create_rollout_function(
            rf.multitask_rollout,
            max_path_length=algorithm.max_path_length,
            observation_key=algorithm.observation_key,
            desired_goal_key=algorithm.desired_goal_key,
        )
        dump_video(video_goal_env, policy, filename, rollout_function)
        filename = osp.join(logdir, 'video_final_vae.mp4')
        dump_video(video_vae_env, policy, filename, rollout_function)
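# --- Example usage (illustrative sketch) ---
# `grill_her_sac_experiment` additionally needs a pretrained VAE per latent
# dimension, keyed by str(rdim) in `vae_paths`. The env class, VAE path, and
# hyperparameters below are placeholders/assumptions.
def example_grill_her_sac_variant():
    from multiworld.envs.mujoco.sawyer_xyz.sawyer_reach import (
        SawyerReachXYEnv,  # hypothetical env choice
    )
    return dict(
        env_class=SawyerReachXYEnv,
        env_kwargs=dict(),
        render=False,
        normalize=False,
        rdim=4,
        vae_paths={'4': '/path/to/pretrained_vae.pkl'},  # placeholder path
        reward_params=dict(type='latent_distance'),
        init_camera=None,  # fall back to the 'topview' named camera
        replay_kwargs=dict(
            max_size=int(1e6),
            fraction_goals_rollout_goals=0.2,
            fraction_goals_env_goals=0.5,
        ),
        algo_kwargs=dict(  # assumed HerSac hyperparameters
            num_epochs=100,
            batch_size=128,
        ),
        save_video=True,
    )
# grill_her_sac_experiment(example_grill_her_sac_variant())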