def experiment(variant):
    import multiworld
    multiworld.register_all_envs()
    eval_env = gym.make('SawyerPickupEnv-v0')
    expl_env = gym.make('SawyerPickupEnv-v0')
    observation_key = 'state_observation'
    desired_goal_key = 'state_desired_goal'
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    es = GaussianAndEpislonStrategy(
        action_space=expl_env.action_space,
        max_sigma=.2,
        min_sigma=.2,  # constant sigma
        epsilon=.3,
    )
    obs_dim = expl_env.observation_space.spaces['observation'].low.size
    goal_dim = expl_env.observation_space.spaces['desired_goal'].low.size
    action_dim = expl_env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'])
    qf2 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'])
    target_qf1 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'])
    target_qf2 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'])
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs'])
    target_policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs'])
    expl_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = ObsDictRelabelingBuffer(
        env=eval_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs'])
    trainer = TD3Trainer(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **variant['trainer_kwargs'])
    trainer = HERTrainer(trainer)
    eval_path_collector = GoalConditionedPathCollector(
        eval_env,
        policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    expl_path_collector = GoalConditionedPathCollector(
        expl_env,
        expl_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
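
# Example invocation for the TD3+HER pickup launcher above. This is a minimal
# sketch: the hyperparameter values and the exact nesting of the variant dict
# are illustrative assumptions, not settings taken from this code base.
if __name__ == "__main__":
    example_variant = dict(
        qf_kwargs=dict(hidden_sizes=[400, 300]),
        policy_kwargs=dict(hidden_sizes=[400, 300]),
        replay_buffer_kwargs=dict(
            max_size=int(1e6),
            fraction_goals_rollout_goals=0.2,
            fraction_goals_env_goals=0.0,
        ),
        trainer_kwargs=dict(discount=0.99),
        algo_kwargs=dict(
            batch_size=128,
            num_epochs=100,
            num_eval_steps_per_epoch=1000,
            num_expl_steps_per_train_loop=1000,
            num_trains_per_train_loop=1000,
            min_num_steps_before_training=1000,
            max_path_length=50,
        ),
    )
    experiment(example_variant)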
def experiment(variant):
    expl_env = NormalizedBoxEnv(get_env(variant['env'], variant['seed']))
    eval_env = NormalizedBoxEnv(get_env(variant['env'], variant['seed']))
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    num_layer = variant['num_layer']
    network_structure = [M] * num_layer
    NUM_ENSEMBLE = variant['num_ensemble']

    L_qf1, L_qf2, L_target_qf1, L_target_qf2, L_policy, L_eval_policy = \
        [], [], [], [], [], []
    for _ in range(NUM_ENSEMBLE):
        qf1 = FlattenMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            hidden_sizes=network_structure,
        )
        qf2 = FlattenMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            hidden_sizes=network_structure,
        )
        target_qf1 = FlattenMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            hidden_sizes=network_structure,
        )
        target_qf2 = FlattenMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            hidden_sizes=network_structure,
        )
        policy = TanhGaussianPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=network_structure,
        )
        eval_policy = MakeDeterministic(policy)

        L_qf1.append(qf1)
        L_qf2.append(qf2)
        L_target_qf1.append(target_qf1)
        L_target_qf2.append(target_qf2)
        L_policy.append(policy)
        L_eval_policy.append(eval_policy)

    eval_path_collector = EnsembleMdpPathCollector(
        eval_env,
        L_eval_policy,
        NUM_ENSEMBLE,
        eval_flag=True,
    )
    expl_path_collector = EnsembleMdpPathCollector(
        expl_env,
        L_policy,
        NUM_ENSEMBLE,
        ber_mean=variant['ber_mean'],
        eval_flag=False,
        critic1=L_qf1,
        critic2=L_qf2,
        inference_type=variant['inference_type'],
        feedback_type=1,
    )
    replay_buffer = EnsembleEnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        NUM_ENSEMBLE,
        log_dir=variant['log_dir'],
    )
    trainer = NeurIPS20SACEnsembleTrainer(
        env=eval_env,
        policy=L_policy,
        qf1=L_qf1,
        qf2=L_qf2,
        target_qf1=L_target_qf1,
        target_qf2=L_target_qf2,
        num_ensemble=NUM_ENSEMBLE,
        feedback_type=1,
        temperature=variant['temperature'],
        temperature_act=0,
        expl_gamma=0,
        log_dir=variant['log_dir'],
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
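
# Example invocation for the ensemble-SAC launcher above. A minimal sketch:
# the values below (and the assumption that trainer_kwargs mirrors rlkit's
# SACTrainer arguments) are illustrative, not taken from this code base;
# valid 'env' names depend on the get_env helper.
if __name__ == "__main__":
    example_variant = dict(
        env='halfcheetah',
        seed=0,
        layer_size=256,
        num_layer=2,
        num_ensemble=5,
        ber_mean=0.5,
        inference_type=1.0,
        temperature=10.0,
        replay_buffer_size=int(1e6),
        log_dir='./data/ensemble_sac_example',
        trainer_kwargs=dict(
            discount=0.99,
            soft_target_tau=5e-3,
            policy_lr=3e-4,
            qf_lr=3e-4,
        ),
        algorithm_kwargs=dict(
            batch_size=256,
            num_epochs=200,
            num_eval_steps_per_epoch=1000,
            num_expl_steps_per_train_loop=1000,
            num_trains_per_train_loop=1000,
            min_num_steps_before_training=1000,
            max_path_length=1000,
        ),
    )
    experiment(example_variant)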
def her_sac_experiment(
        max_path_length,
        qf_kwargs,
        twin_sac_trainer_kwargs,
        replay_buffer_kwargs,
        policy_kwargs,
        evaluation_goal_sampling_mode,
        exploration_goal_sampling_mode,
        algo_kwargs,
        save_video=True,
        env_id=None,
        env_class=None,
        env_kwargs=None,
        observation_key='state_observation',
        desired_goal_key='state_desired_goal',
        achieved_goal_key='state_achieved_goal',
        # Video parameters
        save_video_kwargs=None,
        exploration_policy_kwargs=None,
        **kwargs
):
    if exploration_policy_kwargs is None:
        exploration_policy_kwargs = {}
    import rlkit.samplers.rollout_functions as rf
    import rlkit.torch.pytorch_util as ptu
    from rlkit.data_management.obs_dict_replay_buffer import \
        ObsDictRelabelingBuffer
    from rlkit.torch.networks import ConcatMlp
    from rlkit.torch.sac.policies import TanhGaussianPolicy
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    if not save_video_kwargs:
        save_video_kwargs = {}
    if env_kwargs is None:
        env_kwargs = {}
    assert env_id or env_class
    if env_id:
        import gym
        import multiworld
        multiworld.register_all_envs()
        train_env = gym.make(env_id)
        eval_env = gym.make(env_id)
    else:
        eval_env = env_class(**env_kwargs)
        train_env = env_class(**env_kwargs)

    obs_dim = (
        train_env.observation_space.spaces[observation_key].low.size
        + train_env.observation_space.spaces[desired_goal_key].low.size
    )
    action_dim = train_env.action_space.low.size
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **qf_kwargs
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **qf_kwargs
    )
    target_qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **qf_kwargs
    )
    target_qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **qf_kwargs
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **policy_kwargs
    )
    replay_buffer = ObsDictRelabelingBuffer(
        env=train_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **replay_buffer_kwargs
    )
    trainer = SACTrainer(
        env=train_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **twin_sac_trainer_kwargs
    )
    trainer = HERTrainer(trainer)
    eval_path_collector = GoalConditionedPathCollector(
        eval_env,
        MakeDeterministic(policy),
        max_path_length,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        goal_sampling_mode=evaluation_goal_sampling_mode,
    )
    exploration_policy = create_exploration_policy(
        train_env, policy, **exploration_policy_kwargs)
    expl_path_collector = GoalConditionedPathCollector(
        train_env,
        exploration_policy,
        max_path_length,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        goal_sampling_mode=exploration_goal_sampling_mode,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=train_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=max_path_length,
        **algo_kwargs
    )
    algorithm.to(ptu.device)
    if save_video:
        rollout_function = rf.create_rollout_function(
            rf.multitask_rollout,
            max_path_length=max_path_length,
            observation_key=observation_key,
            desired_goal_key=desired_goal_key,
            return_dict_obs=True,
        )
        eval_video_func = get_save_video_function(
            rollout_function,
            eval_env,
            MakeDeterministic(policy),
            tag="eval",
            **save_video_kwargs
        )
        train_video_func = get_save_video_function(
            rollout_function,
            train_env,
            exploration_policy,
            tag="expl",
            **save_video_kwargs
        )
        # algorithm.post_train_funcs.append(plot_buffer_function(
        #     save_video_period, 'state_achieved_goal'))
        # algorithm.post_train_funcs.append(plot_buffer_function(
        #     save_video_period, 'state_desired_goal'))
        algorithm.post_train_funcs.append(eval_video_func)
        algorithm.post_train_funcs.append(train_video_func)
    algorithm.train()
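
# Example invocation for her_sac_experiment. A minimal sketch: the env_id,
# goal-sampling modes, and hyperparameter values are illustrative assumptions
# (valid values depend on the installed multiworld/rlkit versions).
if __name__ == "__main__":
    her_sac_experiment(
        max_path_length=50,
        env_id='SawyerPushNIPSEasy-v0',
        qf_kwargs=dict(hidden_sizes=[400, 300]),
        policy_kwargs=dict(hidden_sizes=[400, 300]),
        twin_sac_trainer_kwargs=dict(discount=0.99),
        replay_buffer_kwargs=dict(
            max_size=int(1e6),
            fraction_goals_rollout_goals=0.2,
            fraction_goals_env_goals=0.5,
        ),
        evaluation_goal_sampling_mode='reset_of_env',
        exploration_goal_sampling_mode='reset_of_env',
        algo_kwargs=dict(
            batch_size=128,
            num_epochs=100,
            num_eval_steps_per_epoch=1000,
            num_expl_steps_per_train_loop=1000,
            num_trains_per_train_loop=1000,
            min_num_steps_before_training=1000,
        ),
        save_video=False,
    )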
def experiment(variant): num_agent = variant['num_agent'] from sequential_differential_game import SequentialDifferentialGame expl_env = SequentialDifferentialGame(**variant['env_kwargs']) eval_env = SequentialDifferentialGame(**variant['env_kwargs']) obs_dim = eval_env.observation_space.low.size action_dim = eval_env.action_space.low.size policy_n, eval_policy_n, expl_policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n = \ [], [], [], [], [], [], [] for i in range(num_agent): from rlkit.torch.layers import SplitLayer, ReshapeLayer weight_head = nn.Linear(variant['policy_kwargs']['hidden_dim'], variant['policy_kwargs']['m']) mean_head = nn.Sequential( nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim * variant['policy_kwargs']['m']), ReshapeLayer(shape=[variant['policy_kwargs']['m'], action_dim])) logstd_head = nn.Sequential( nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim * variant['policy_kwargs']['m']), ReshapeLayer(shape=[variant['policy_kwargs']['m'], action_dim])) policy = nn.Sequential( nn.Linear(obs_dim, variant['policy_kwargs']['hidden_dim']), nn.ReLU(), nn.Linear(variant['policy_kwargs']['hidden_dim'], variant['policy_kwargs']['hidden_dim']), nn.ReLU(), SplitLayer(layers=[weight_head, mean_head, logstd_head])) from rlkit.torch.policies.mix_tanh_gaussian_policy import MixTanhGaussianPolicy policy = MixTanhGaussianPolicy(module=policy) from rlkit.torch.policies.make_deterministic import MakeDeterministic eval_policy = MakeDeterministic(policy) from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy if variant['random_exploration']: from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy expl_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0), policy=policy, ) else: expl_policy = policy from rlkit.torch.networks import FlattenMlp qf1 = FlattenMlp( input_size=(obs_dim * num_agent + action_dim * num_agent), output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * 2, ) target_qf1 = copy.deepcopy(qf1) qf2 = FlattenMlp( input_size=(obs_dim * num_agent + action_dim * num_agent), output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * 2, ) target_qf2 = copy.deepcopy(qf2) policy_n.append(policy) eval_policy_n.append(eval_policy) expl_policy_n.append(expl_policy) qf1_n.append(qf1) target_qf1_n.append(target_qf1) qf2_n.append(qf2) target_qf2_n.append(target_qf2) from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n) expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n) from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent) from rlkit.torch.masac.masac import MASACTrainer trainer = MASACTrainer(env=expl_env, qf1_n=qf1_n, target_qf1_n=target_qf1_n, qf2_n=qf2_n, target_qf2_n=target_qf2_n, policy_n=policy_n, **variant['trainer_kwargs']) from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, log_path_function=get_generic_ma_path_information, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): import sys sys.path.append("./multiagent-particle-envs") from make_env import make_env from particle_env_wrapper import ParticleEnv expl_env = ParticleEnv( make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args'])) eval_env = ParticleEnv( make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args'])) num_agent = expl_env.num_agent obs_dim = eval_env.observation_space.low.size action_dim = eval_env.action_space.low.size from rlkit.torch.networks.graph_builders import FullGraphBuilder graph_builder_1 = FullGraphBuilder( input_node_dim=obs_dim + action_dim, num_node=num_agent, batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False) from rlkit.torch.networks.gnn_networks import GNNNet gnn1 = GNNNet( graph_builder_1, hidden_activation='lrelu0.2', output_activation='lrelu0.2', **variant['graph_kwargs'], ) from rlkit.torch.networks.networks import FlattenMlp qf1 = nn.Sequential( gnn1, FlattenMlp( input_size=variant['graph_kwargs']['node_dim'], output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1), hidden_activation=nn.LeakyReLU(negative_slope=0.2), )) target_qf1 = copy.deepcopy(qf1) from rlkit.torch.networks.graph_builders import FullGraphBuilder graph_builder_2 = FullGraphBuilder( input_node_dim=obs_dim + action_dim, num_node=num_agent, batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False) from rlkit.torch.networks.gnn_networks import GNNNet gnn2 = GNNNet( graph_builder_2, hidden_activation='lrelu0.2', output_activation='lrelu0.2', **variant['graph_kwargs'], ) qf2 = nn.Sequential( gnn2, FlattenMlp( input_size=variant['graph_kwargs']['node_dim'], output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1), hidden_activation=nn.LeakyReLU(negative_slope=0.2), )) target_qf2 = copy.deepcopy(qf2) policy_n, eval_policy_n, expl_policy_n = [], [], [] for i in range(num_agent): from rlkit.torch.networks.layers import SplitLayer policy = nn.Sequential( FlattenMlp( input_size=obs_dim, output_size=variant['policy_kwargs']['hidden_dim'], hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1), hidden_activation=nn.LeakyReLU(negative_slope=0.2), output_activation=nn.LeakyReLU(negative_slope=0.2), ), SplitLayer(layers=[ nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim), nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim) ])) from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy policy = TanhGaussianPolicy(module=policy) from rlkit.torch.policies.make_deterministic import MakeDeterministic eval_policy = MakeDeterministic(policy) from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy if variant['random_exploration']: from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy expl_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0), policy=policy, ) else: expl_policy = policy policy_n.append(policy) eval_policy_n.append(eval_policy) expl_policy_n.append(expl_policy) from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n) expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n) from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer replay_buffer = 
MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent) from rlkit.torch.masac.masac_gnn import MASACGNNTrainer trainer = MASACGNNTrainer(env=expl_env, qf1=qf1, target_qf1=target_qf1, qf2=qf2, target_qf2=target_qf2, policy_n=policy_n, **variant['trainer_kwargs']) from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, log_path_function=get_generic_ma_path_information, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): from multi_differential_game import MultiDifferentialGame expl_env = MultiDifferentialGame(**variant['env_kwargs']) eval_env = MultiDifferentialGame(**variant['env_kwargs']) num_agent = expl_env.agent_num obs_dim = eval_env.observation_space.low.size action_dim = eval_env.action_space.low.size from rlkit.torch.networks.graph_builders import FullGraphBuilder graph_builder_1 = FullGraphBuilder( input_node_dim=obs_dim + action_dim, num_node=num_agent, batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False) from rlkit.torch.networks.gnn_networks import GNNNet gnn1 = GNNNet( graph_builder_1, hidden_activation='lrelu0.2', output_activation='lrelu0.2', **variant['graph_kwargs'], ) from rlkit.torch.networks.networks import FlattenMlp qf1 = nn.Sequential( gnn1, FlattenMlp( input_size=variant['graph_kwargs']['node_dim'], output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1), hidden_activation=nn.LeakyReLU(negative_slope=0.2), )) target_qf1 = copy.deepcopy(qf1) from rlkit.torch.networks.graph_builders import FullGraphBuilder graph_builder_2 = FullGraphBuilder( input_node_dim=obs_dim + action_dim, num_node=num_agent, batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False) from rlkit.torch.networks.gnn_networks import GNNNet gnn2 = GNNNet( graph_builder_2, hidden_activation='lrelu0.2', output_activation='lrelu0.2', **variant['graph_kwargs'], ) qf2 = nn.Sequential( gnn2, FlattenMlp( input_size=variant['graph_kwargs']['node_dim'], output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1), hidden_activation=nn.LeakyReLU(negative_slope=0.2), )) target_qf2 = copy.deepcopy(qf2) graph_builder_ca = FullGraphBuilder( input_node_dim=obs_dim + action_dim, num_node=num_agent, batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False) cgca = GNNNet( graph_builder_ca, hidden_activation='lrelu0.2', output_activation='lrelu0.2', **variant['graph_kwargs'], ) from rlkit.torch.networks.layers import SplitLayer from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy cactor = nn.Sequential( cgca, FlattenMlp( input_size=variant['graph_kwargs']['node_dim'], output_size=variant['cactor_kwargs']['hidden_dim'], hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] * (variant['cactor_kwargs']['num_layer'] - 1), hidden_activation=nn.LeakyReLU(negative_slope=0.2), output_activation=nn.LeakyReLU(negative_slope=0.2), ), nn.LeakyReLU(negative_slope=0.2), SplitLayer(layers=[ nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim), nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim) ])) cactor = TanhGaussianPolicy(module=cactor) policy_n, expl_policy_n, eval_policy_n = [], [], [] for i in range(num_agent): policy = nn.Sequential( FlattenMlp( input_size=obs_dim, output_size=variant['policy_kwargs']['hidden_dim'], hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1), hidden_activation=nn.LeakyReLU(negative_slope=0.2), output_activation=nn.LeakyReLU(negative_slope=0.2), ), SplitLayer(layers=[ nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim), nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim) ])) policy = TanhGaussianPolicy(module=policy) from rlkit.torch.policies.make_deterministic import MakeDeterministic eval_policy = MakeDeterministic(policy) if variant['random_exploration']: from rlkit.exploration_strategies.base import 
PolicyWrappedWithExplorationStrategy from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy expl_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0), policy=policy, ) else: expl_policy = policy policy_n.append(policy) expl_policy_n.append(expl_policy) eval_policy_n.append(eval_policy) from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n) expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n) from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent) from rlkit.torch.r2g.r2g_gnn9 import R2GGNNTrainer trainer = R2GGNNTrainer(env=expl_env, qf1=qf1, target_qf1=target_qf1, qf2=qf2, target_qf2=target_qf2, cactor=cactor, policy_n=policy_n, **variant['trainer_kwargs']) from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, log_path_function=get_generic_ma_path_information, **variant['algorithm_kwargs']) algorithm.to(ptu.device) # save init params from rlkit.core import logger snapshot = algorithm._get_snapshot() file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl') torch.save(snapshot, file_name) algorithm.train()
def probabilistic_goal_reaching_experiment( max_path_length, qf_kwargs, policy_kwargs, pgr_trainer_kwargs, replay_buffer_kwargs, algo_kwargs, env_id, discount_factor, reward_type, # Dynamics model dynamics_model_version, dynamics_model_config, dynamics_delta_model_config=None, dynamics_adam_config=None, dynamics_ensemble_kwargs=None, # Discount model learn_discount_model=False, discount_adam_config=None, discount_model_config=None, prior_discount_weight_schedule_kwargs=None, # Environment env_class=None, env_kwargs=None, observation_key='state_observation', desired_goal_key='state_desired_goal', exploration_policy_kwargs=None, action_noise_scale=0., num_presampled_goals=4096, success_threshold=0.05, # Video / visualization parameters save_video=True, save_video_kwargs=None, video_renderer_kwargs=None, plot_renderer_kwargs=None, eval_env_ids=None, # Debugging params visualize_dynamics=False, visualize_discount_model=False, visualize_all_plots=False, plot_discount=False, plot_reward=False, plot_bootstrap_value=False, # env specific-params normalize_distances_for_full_state_ant=False, ): if dynamics_ensemble_kwargs is None: dynamics_ensemble_kwargs = {} if eval_env_ids is None: eval_env_ids = {'eval': env_id} if discount_model_config is None: discount_model_config = {} if dynamics_delta_model_config is None: dynamics_delta_model_config = {} if dynamics_adam_config is None: dynamics_adam_config = {} if discount_adam_config is None: discount_adam_config = {} if exploration_policy_kwargs is None: exploration_policy_kwargs = {} if not save_video_kwargs: save_video_kwargs = {} if not video_renderer_kwargs: video_renderer_kwargs = {} if not plot_renderer_kwargs: plot_renderer_kwargs = video_renderer_kwargs.copy() plot_renderer_kwargs['dpi'] = 48 context_key = desired_goal_key stub_env = get_gym_env( env_id, env_class=env_class, env_kwargs=env_kwargs, unwrap_timed_envs=True, ) is_gym_env = ( isinstance(stub_env, FetchEnv) or isinstance(stub_env, AntXYGoalEnv) or isinstance(stub_env, AntFullPositionGoalEnv) # or isinstance(stub_env, HopperFullPositionGoalEnv) ) is_ant_full_pos = isinstance(stub_env, AntFullPositionGoalEnv) if is_gym_env: achieved_goal_key = desired_goal_key.replace('desired', 'achieved') ob_keys_to_save_in_buffer = [observation_key, achieved_goal_key] elif isinstance(stub_env, SawyerPickAndPlaceEnvYZ): achieved_goal_key = desired_goal_key.replace('desired', 'achieved') ob_keys_to_save_in_buffer = [observation_key, achieved_goal_key] else: achieved_goal_key = observation_key ob_keys_to_save_in_buffer = [observation_key] # TODO move all env-specific code to other file if isinstance(stub_env, SawyerDoorHookEnv): init_camera = sawyer_door_env_camera_v0 elif isinstance(stub_env, SawyerPushAndReachXYEnv): init_camera = sawyer_init_camera_zoomed_in elif isinstance(stub_env, SawyerPickAndPlaceEnvYZ): init_camera = sawyer_pick_and_place_camera else: init_camera = None full_ob_space = stub_env.observation_space action_space = stub_env.action_space state_to_goal = StateToGoalFn(stub_env) dynamics_model = create_goal_dynamics_model( full_ob_space[observation_key], action_space, full_ob_space[achieved_goal_key], dynamics_model_version, state_to_goal, dynamics_model_config, dynamics_delta_model_config, ensemble_model_kwargs=dynamics_ensemble_kwargs, ) sample_context_from_obs_dict_fn = RemapKeyFn( {context_key: achieved_goal_key}) def contextual_env_distrib_reward(_env_id, _env_class=None, _env_kwargs=None): base_env = get_gym_env( _env_id, env_class=env_class, env_kwargs=env_kwargs, 
unwrap_timed_envs=True, ) if init_camera: base_env.initialize_camera(init_camera) if (isinstance(stub_env, AntFullPositionGoalEnv) and normalize_distances_for_full_state_ant): base_env = NormalizeAntFullPositionGoalEnv(base_env) normalize_env = base_env else: normalize_env = None env = NoisyAction(base_env, action_noise_scale) diag_fns = [] if is_gym_env: goal_distribution = GoalDictDistributionFromGymGoalEnv( env, desired_goal_key=desired_goal_key, ) diag_fns.append( GenericGoalConditionedContextualDiagnostics( desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, success_threshold=success_threshold, )) else: goal_distribution = GoalDictDistributionFromMultitaskEnv( env, desired_goal_keys=[desired_goal_key], ) diag_fns.append( GoalConditionedDiagnosticsToContextualDiagnostics( env.goal_conditioned_diagnostics, desired_goal_key=desired_goal_key, observation_key=observation_key, )) if isinstance(stub_env, AntFullPositionGoalEnv): diag_fns.append( AntFullPositionGoalEnvDiagnostics( desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, success_threshold=success_threshold, normalize_env=normalize_env, )) # if isinstance(stub_env, HopperFullPositionGoalEnv): # diag_fns.append( # HopperFullPositionGoalEnvDiagnostics( # desired_goal_key=desired_goal_key, # achieved_goal_key=achieved_goal_key, # success_threshold=success_threshold, # ) # ) achieved_from_ob = IndexIntoAchievedGoal(achieved_goal_key, ) if reward_type == 'sparse': distance_fn = L2Distance( achieved_goal_from_observation=achieved_from_ob, desired_goal_key=desired_goal_key, ) reward_fn = ThresholdDistanceReward(distance_fn, success_threshold) elif reward_type == 'negative_distance': reward_fn = NegativeL2Distance( achieved_goal_from_observation=achieved_from_ob, desired_goal_key=desired_goal_key, ) else: reward_fn = ProbabilisticGoalRewardFn( dynamics_model, state_key=observation_key, context_key=context_key, reward_type=reward_type, discount_factor=discount_factor, ) goal_distribution = PresampledDistribution(goal_distribution, num_presampled_goals) final_env = ContextualEnv( env, context_distribution=goal_distribution, reward_fn=reward_fn, observation_key=observation_key, contextual_diagnostics_fns=diag_fns, update_env_info_fn=delete_info, ) return final_env, goal_distribution, reward_fn expl_env, expl_context_distrib, reward_fn = contextual_env_distrib_reward( env_id, env_class, env_kwargs, ) obs_dim = (expl_env.observation_space.spaces[observation_key].low.size + expl_env.observation_space.spaces[context_key].low.size) action_dim = expl_env.action_space.low.size def create_qf(): return ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **qf_kwargs) qf1 = create_qf() qf2 = create_qf() target_qf1 = create_qf() target_qf2 = create_qf() def create_policy(): obs_processor = MultiHeadedMlp(input_size=obs_dim, output_sizes=[action_dim, action_dim], **policy_kwargs) return PolicyFromDistributionGenerator(TanhGaussian(obs_processor)) policy = create_policy() def concat_context_to_obs(batch, replay_buffer, obs_dict, next_obs_dict, new_contexts): obs = batch['observations'] next_obs = batch['next_observations'] batch['original_observations'] = obs batch['original_next_observations'] = next_obs context = batch[context_key] batch['observations'] = np.concatenate([obs, context], axis=1) batch['next_observations'] = np.concatenate([next_obs, context], axis=1) return batch replay_buffer = ContextualRelabelingReplayBuffer( env=expl_env, context_keys=[context_key], 
observation_keys=ob_keys_to_save_in_buffer, context_distribution=expl_context_distrib, sample_context_from_obs_dict_fn=sample_context_from_obs_dict_fn, reward_fn=reward_fn, post_process_batch_fn=concat_context_to_obs, **replay_buffer_kwargs) def create_trainer(): trainers = OrderedDict() if learn_discount_model: discount_model = create_discount_model( ob_space=stub_env.observation_space[observation_key], goal_space=stub_env.observation_space[context_key], action_space=stub_env.action_space, model_kwargs=discount_model_config) optimizer = optim.Adam(discount_model.parameters(), **discount_adam_config) discount_trainer = DiscountModelTrainer( discount_model, optimizer, observation_key='observations', next_observation_key='original_next_observations', goal_key=context_key, state_to_goal_fn=state_to_goal, ) trainers['discount_trainer'] = discount_trainer else: discount_model = None if prior_discount_weight_schedule_kwargs is not None: schedule = create_schedule(**prior_discount_weight_schedule_kwargs) else: schedule = None pgr_trainer = PGRTrainer(env=expl_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, discount=discount_factor, discount_model=discount_model, prior_discount_weight_schedule=schedule, **pgr_trainer_kwargs) trainers[''] = pgr_trainer optimizers = [ pgr_trainer.qf1_optimizer, pgr_trainer.qf2_optimizer, pgr_trainer.alpha_optimizer, pgr_trainer.policy_optimizer, ] if dynamics_model_version in { 'learned_model', 'learned_model_ensemble', 'learned_model_laplace', 'learned_model_laplace_global_variance', 'learned_model_gaussian_global_variance', }: model_opt = optim.Adam(dynamics_model.parameters(), **dynamics_adam_config) elif dynamics_model_version in { 'fixed_standard_laplace', 'fixed_standard_gaussian', }: model_opt = None else: raise NotImplementedError() model_trainer = GenerativeGoalDynamicsModelTrainer( dynamics_model, model_opt, state_to_goal=state_to_goal, observation_key='original_observations', next_observation_key='original_next_observations', ) trainers['dynamics_trainer'] = model_trainer optimizers.append(model_opt) return JointTrainer(trainers), pgr_trainer trainer, pgr_trainer = create_trainer() eval_policy = MakeDeterministic(policy) def create_eval_path_collector(some_eval_env): return ContextualPathCollector( some_eval_env, eval_policy, observation_key=observation_key, context_keys_for_policy=[context_key], ) path_collectors = dict() eval_env_name_to_env_and_context_distrib = dict() for name, extra_env_id in eval_env_ids.items(): env, context_distrib, _ = contextual_env_distrib_reward(extra_env_id) path_collectors[name] = create_eval_path_collector(env) eval_env_name_to_env_and_context_distrib[name] = (env, context_distrib) eval_path_collector = JointPathCollector(path_collectors) exploration_policy = create_exploration_policy(expl_env, policy, **exploration_policy_kwargs) expl_path_collector = ContextualPathCollector( expl_env, exploration_policy, observation_key=observation_key, context_keys_for_policy=[context_key], ) def get_eval_diagnostics(key_to_paths): stats = OrderedDict() for eval_env_name, paths in key_to_paths.items(): env, _ = eval_env_name_to_env_and_context_distrib[eval_env_name] stats.update( add_prefix( env.get_diagnostics(paths), eval_env_name, divider='/', )) stats.update( add_prefix( eval_util.get_generic_path_information(paths), eval_env_name, divider='/', )) return stats algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=None, 
exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, max_path_length=max_path_length, evaluation_get_diagnostic_functions=[get_eval_diagnostics], **algo_kwargs) algorithm.to(ptu.device) if normalize_distances_for_full_state_ant and is_ant_full_pos: qpos_weights = expl_env.unwrapped.presampled_qpos.std(axis=0) else: qpos_weights = None if save_video: if is_gym_env: video_renderer = GymEnvRenderer(**video_renderer_kwargs) def set_goal_for_visualization(env, policy, o): goal = o[desired_goal_key] if normalize_distances_for_full_state_ant and is_ant_full_pos: unnormalized_goal = goal * qpos_weights env.unwrapped.goal = unnormalized_goal else: env.unwrapped.goal = goal rollout_function = partial( rf.contextual_rollout, max_path_length=max_path_length, observation_key=observation_key, context_keys_for_policy=[context_key], reset_callback=set_goal_for_visualization, ) else: video_renderer = EnvRenderer(**video_renderer_kwargs) rollout_function = partial( rf.contextual_rollout, max_path_length=max_path_length, observation_key=observation_key, context_keys_for_policy=[context_key], reset_callback=None, ) renderers = OrderedDict(image_observation=video_renderer, ) state_env = expl_env.env state_space = state_env.observation_space[observation_key] low = state_space.low.min() high = state_space.high.max() y = np.linspace(low, high, num=video_renderer.image_chw[1]) x = np.linspace(low, high, num=video_renderer.image_chw[2]) all_xy_np = np.transpose([np.tile(x, len(y)), np.repeat(y, len(x))]) all_xy_torch = ptu.from_numpy(all_xy_np) num_states = all_xy_torch.shape[0] if visualize_dynamics: def create_dynamics_visualizer(show_prob, vary_state=False): def get_prob(obs_dict, action): obs = obs_dict['state_observation'] obs_torch = ptu.from_numpy(obs)[None] action_torch = ptu.from_numpy(action)[None] if vary_state: action_repeated = torch.zeros((num_states, 2)) dist = dynamics_model(all_xy_torch, action_repeated) goal = ptu.from_numpy( obs_dict['state_desired_goal'][None]) log_probs = dist.log_prob(goal) else: dist = dynamics_model(obs_torch, action_torch) log_probs = dist.log_prob(all_xy_torch) if show_prob: return log_probs.exp() else: return log_probs return get_prob renderers['log_prob'] = ValueRenderer( create_dynamics_visualizer(False), **video_renderer_kwargs) # renderers['prob'] = ValueRenderer( # create_dynamics_visualizer(True), **video_renderer_kwargs # ) renderers['log_prob_vary_state'] = ValueRenderer( create_dynamics_visualizer(False, vary_state=True), only_get_image_once_per_episode=True, max_out_walls=isinstance(stub_env, PickAndPlaceEnv), **video_renderer_kwargs) # renderers['prob_vary_state'] = ValueRenderer( # create_dynamics_visualizer(True, vary_state=True), # **video_renderer_kwargs) if visualize_discount_model and pgr_trainer.discount_model: def get_discount_values(obs, action): obs = obs['state_observation'] obs_torch = ptu.from_numpy(obs)[None] combined_obs = torch.cat([ obs_torch.repeat(num_states, 1), all_xy_torch, ], dim=1) action_torch = ptu.from_numpy(action)[None] action_repeated = action_torch.repeat(num_states, 1) return pgr_trainer.discount_model(combined_obs, action_repeated) renderers['discount_model'] = ValueRenderer( get_discount_values, states_to_eval=all_xy_torch, **video_renderer_kwargs) if 'log_prob' in renderers and 'discount_model' in renderers: renderers['log_prob_time_discount'] = ProductRenderer( renderers['discount_model'], renderers['log_prob'], **video_renderer_kwargs) def 
get_reward(obs_dict, action, next_obs_dict): o = batchify(obs_dict) a = batchify(action) next_o = batchify(next_obs_dict) reward = reward_fn(o, a, next_o, next_o) return reward[0] def get_bootstrap(obs_dict, action, next_obs_dict, return_float=True): context_pt = ptu.from_numpy(obs_dict[context_key][None]) o_pt = ptu.from_numpy(obs_dict[observation_key][None]) next_o_pt = ptu.from_numpy(next_obs_dict[observation_key][None]) action_torch = ptu.from_numpy(action[None]) bootstrap, *_ = pgr_trainer.get_bootstrap_stats( torch.cat((o_pt, context_pt), dim=1), action_torch, torch.cat((next_o_pt, context_pt), dim=1), ) if return_float: return ptu.get_numpy(bootstrap)[0, 0] else: return bootstrap def get_discount(obs_dict, action, next_obs_dict): bootstrap = get_bootstrap(obs_dict, action, next_obs_dict, return_float=False) reward_np = get_reward(obs_dict, action, next_obs_dict) reward = ptu.from_numpy(reward_np[None, None]) context_pt = ptu.from_numpy(obs_dict[context_key][None]) o_pt = ptu.from_numpy(obs_dict[observation_key][None]) obs = torch.cat((o_pt, context_pt), dim=1) actions = ptu.from_numpy(action[None]) discount = pgr_trainer.get_discount_factor( bootstrap, reward, obs, actions, ) if isinstance(discount, torch.Tensor): discount = ptu.get_numpy(discount)[0, 0] return np.clip(discount, a_min=1e-3, a_max=1) def create_modify_fn( title, set_params=None, scientific=True, ): def modify(ax): ax.set_title(title) if set_params: ax.set(**set_params) if scientific: scaler = ScalarFormatter(useOffset=True) scaler.set_powerlimits((1, 1)) ax.yaxis.set_major_formatter(scaler) ax.ticklabel_format(axis='y', style='sci') return modify def add_left_margin(fig): fig.subplots_adjust(left=0.2) if visualize_all_plots or plot_discount: renderers['discount'] = DynamicNumberEnvRenderer( dynamic_number_fn=get_discount, modify_ax_fn=create_modify_fn( title='discount', set_params=dict( # yscale='log', ylim=[-0.05, 1.1], ), # scientific=False, ), modify_fig_fn=add_left_margin, # autoscale_y=False, **plot_renderer_kwargs) if visualize_all_plots or plot_reward: renderers['reward'] = DynamicNumberEnvRenderer( dynamic_number_fn=get_reward, modify_ax_fn=create_modify_fn(title='reward', # scientific=False, ), modify_fig_fn=add_left_margin, **plot_renderer_kwargs) if visualize_all_plots or plot_bootstrap_value: renderers['bootstrap-value'] = DynamicNumberEnvRenderer( dynamic_number_fn=get_bootstrap, modify_ax_fn=create_modify_fn(title='bootstrap value', # scientific=False, ), modify_fig_fn=add_left_margin, **plot_renderer_kwargs) def add_images(env, state_distribution): state_env = env.env if is_gym_env: goal_distribution = state_distribution else: goal_distribution = AddImageDistribution( env=state_env, base_distribution=state_distribution, image_goal_key='image_desired_goal', renderer=video_renderer, ) context_env = ContextualEnv( state_env, context_distribution=goal_distribution, reward_fn=reward_fn, observation_key=observation_key, update_env_info_fn=delete_info, ) return InsertDebugImagesEnv( context_env, renderers=renderers, ) img_expl_env = add_images(expl_env, expl_context_distrib) if is_gym_env: imgs_to_show = list(renderers.keys()) else: imgs_to_show = ['image_desired_goal'] + list(renderers.keys()) img_formats = [video_renderer.output_image_format] img_formats += [r.output_image_format for r in renderers.values()] expl_video_func = get_save_video_function( rollout_function, img_expl_env, exploration_policy, tag="xplor", imsize=video_renderer.image_chw[1], image_formats=img_formats, keys_to_show=imgs_to_show, 
**save_video_kwargs) algorithm.post_train_funcs.append(expl_video_func) for eval_env_name, (env, context_distrib) in ( eval_env_name_to_env_and_context_distrib.items()): img_eval_env = add_images(env, context_distrib) eval_video_func = get_save_video_function( rollout_function, img_eval_env, eval_policy, tag=eval_env_name, imsize=video_renderer.image_chw[1], image_formats=img_formats, keys_to_show=imgs_to_show, **save_video_kwargs) algorithm.post_train_funcs.append(eval_video_func) algorithm.train()
def experiment(variant):
    from multi_differential_game import MultiDifferentialGame
    expl_env = MultiDifferentialGame(**variant['env_kwargs'])
    eval_env = MultiDifferentialGame(**variant['env_kwargs'])
    num_agent = expl_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n = \
        [], [], [], [], []
    log_alpha_n = None
    qf1_optimizer_n, qf2_optimizer_n, policy_optimizer_n, alpha_optimizer_n = \
        None, None, None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        from rlkit.torch.networks.layers import SplitLayer
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                             (variant['policy_kwargs']['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy)
        qf1 = FlattenMlp(
            input_size=(obs_dim + action_dim),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
                         variant['qf_kwargs']['num_layer'],
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim + action_dim),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
                         variant['qf_kwargs']['num_layer'],
        )
        target_qf2 = copy.deepcopy(qf2)

        policy_n.append(policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(
        variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]

    if variant['random_exploration']:
        from rlkit.exploration_strategies.base import \
            PolicyWrappedWithExplorationStrategy
        from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
        expl_policy_n = [
            PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(
                    expl_env.action_space, prob_random_action=1.0),
                policy=policy,
            ) for i in range(num_agent)
        ]
    else:
        expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.irl.irl_sac import IRLSACTrainer
    trainer = IRLSACTrainer(
        env=expl_env,
        qf1_n=qf1_n,
        target_qf1_n=target_qf1_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        policy_n=policy_n,
        log_alpha_n=log_alpha_n,
        qf1_optimizer_n=qf1_optimizer_n,
        qf2_optimizer_n=qf2_optimizer_n,
        policy_optimizer_n=policy_optimizer_n,
        alpha_optimizer_n=alpha_optimizer_n,
        **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def td3_experiment(variant):
    import rlkit.samplers.rollout_functions as rf
    import rlkit.torch.pytorch_util as ptu
    from rlkit.data_management.obs_dict_replay_buffer import \
        ObsDictRelabelingBuffer
    from rlkit.exploration_strategies.base import (
        PolicyWrappedWithExplorationStrategy)
    from rlkit.torch.td3.td3 import TD3 as TD3Trainer
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    from rlkit.torch.networks import ConcatMlp, TanhMlpPolicy

    # preprocess_rl_variant(variant)
    env = get_envs(variant)
    expl_env = env
    eval_env = env
    es = get_exploration_strategy(variant, env)

    if variant.get("use_masks", False):
        mask_wrapper_kwargs = variant.get("mask_wrapper_kwargs", dict())

        expl_mask_distribution_kwargs = variant["expl_mask_distribution_kwargs"]
        expl_mask_distribution = DiscreteDistribution(
            **expl_mask_distribution_kwargs)
        expl_env = RewardMaskWrapper(env, expl_mask_distribution,
                                     **mask_wrapper_kwargs)

        eval_mask_distribution_kwargs = variant["eval_mask_distribution_kwargs"]
        eval_mask_distribution = DiscreteDistribution(
            **eval_mask_distribution_kwargs)
        eval_env = RewardMaskWrapper(env, eval_mask_distribution,
                                     **mask_wrapper_kwargs)
        env = eval_env

    max_path_length = variant['max_path_length']

    observation_key = variant.get('observation_key', 'latent_observation')
    desired_goal_key = variant.get('desired_goal_key', 'latent_desired_goal')
    achieved_goal_key = variant.get('achieved_goal_key', 'latent_achieved_goal')
    # achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    obs_dim = (env.observation_space.spaces[observation_key].low.size
               + env.observation_space.spaces[desired_goal_key].low.size)
    action_dim = env.action_space.low.size
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'])
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'])
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs'])
    target_qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'])
    target_qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'])
    target_policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs'])

    if variant.get("use_subgoal_policy", False):
        from rlkit.policies.timed_policy import SubgoalPolicyWrapper
        subgoal_policy_kwargs = variant.get('subgoal_policy_kwargs', {})
        policy = SubgoalPolicyWrapper(
            wrapped_policy=policy,
            env=env,
            episode_length=max_path_length,
            **subgoal_policy_kwargs)
        target_policy = SubgoalPolicyWrapper(
            wrapped_policy=target_policy,
            env=env,
            episode_length=max_path_length,
            **subgoal_policy_kwargs)

    expl_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        # use_masks=variant.get("use_masks", False),
        **variant['replay_buffer_kwargs'])

    trainer = TD3Trainer(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **variant['td3_trainer_kwargs'])
    # if variant.get("use_masks", False):
    #     from rlkit.torch.her.her import MaskedHERTrainer
    #     trainer = MaskedHERTrainer(trainer)
    # else:
    trainer = HERTrainer(trainer)

    if variant.get("do_state_exp", False):
        eval_path_collector = GoalConditionedPathCollector(
            eval_env,
            policy,
            observation_key=observation_key,
            desired_goal_key=desired_goal_key,
            # use_masks=variant.get("use_masks", False),
            # full_mask=True,
        )
        expl_path_collector = GoalConditionedPathCollector(
            expl_env,
            expl_policy,
            observation_key=observation_key,
            desired_goal_key=desired_goal_key,
            # use_masks=variant.get("use_masks", False),
        )
    else:
        eval_path_collector = VAEWrappedEnvPathCollector(
            env,
            policy,
            observation_key=observation_key,
            desired_goal_key=desired_goal_key,
            goal_sampling_mode=variant['evaluation_goal_sampling_mode'],
        )
        expl_path_collector = VAEWrappedEnvPathCollector(
            env,
            expl_policy,
            observation_key=observation_key,
            desired_goal_key=desired_goal_key,
            goal_sampling_mode=variant['exploration_goal_sampling_mode'],
        )

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=env,
        evaluation_env=env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=max_path_length,
        **variant['algo_kwargs'])

    vis_variant = variant.get('vis_kwargs', {})
    vis_list = vis_variant.get('vis_list', [])
    if variant.get("save_video", True):
        if variant.get("do_state_exp", False):
            rollout_function = rf.create_rollout_function(
                rf.multitask_rollout,
                max_path_length=max_path_length,
                observation_key=observation_key,
                desired_goal_key=desired_goal_key,
                # use_masks=variant.get("use_masks", False),
                # full_mask=True,
                # vis_list=vis_list,
            )
            video_func = get_video_save_func(
                rollout_function,
                env,
                policy,
                variant,
            )
        else:
            video_func = VideoSaveFunction(
                env,
                variant,
            )
        algorithm.post_train_funcs.append(video_func)

    algorithm.to(ptu.device)
    if not variant.get("do_state_exp", False):
        env.vae.to(ptu.device)

    algorithm.train()
def experiment(variant): # from softlearning.environments.gym import register_image_reach # register_image_reach() # env = gym.envs.make( # 'Pusher2d-ImageReach-v0', # ) from softlearning.environments.gym.mujoco.image_pusher_2d import ( ImageForkReacher2dEnv) env_kwargs = { 'image_shape': (32, 32, 3), 'arm_goal_distance_cost_coeff': 1.0, 'arm_object_distance_cost_coeff': 0.0, } eval_env = ImageForkReacher2dEnv(**env_kwargs) expl_env = ImageForkReacher2dEnv(**env_kwargs) input_width, input_height, input_channels = eval_env.image_shape image_dim = input_width * input_height * input_channels action_dim = int(np.prod(eval_env.action_space.shape)) cnn_params = variant['cnn_params'] cnn_params.update( input_width=input_width, input_height=input_height, input_channels=input_channels, added_fc_input_size=4, output_conv_channels=True, output_size=None, ) non_image_dim = int(np.prod(eval_env.observation_space.shape)) - image_dim if variant['shared_qf_conv']: qf_cnn = CNN(**cnn_params) qf_obs_processor = nn.Sequential( Split(qf_cnn, identity, image_dim), FlattenEach(), ConcatTuple(), ) qf_kwargs = copy.deepcopy(variant['qf_kwargs']) qf_kwargs['obs_processor'] = qf_obs_processor qf_kwargs['output_size'] = 1 qf_kwargs['input_size'] = (action_dim + qf_cnn.conv_output_flat_size + non_image_dim) qf1 = MlpQfWithObsProcessor(**qf_kwargs) qf2 = MlpQfWithObsProcessor(**qf_kwargs) target_qf_cnn = CNN(**cnn_params) target_qf_obs_processor = nn.Sequential( Split(target_qf_cnn, identity, image_dim), FlattenEach(), ConcatTuple(), ) target_qf_kwargs = copy.deepcopy(variant['qf_kwargs']) target_qf_kwargs['obs_processor'] = target_qf_obs_processor target_qf_kwargs['output_size'] = 1 target_qf_kwargs['input_size'] = (action_dim + target_qf_cnn.conv_output_flat_size + non_image_dim) target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs) target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs) else: qf1_cnn = CNN(**cnn_params) cnn_output_dim = qf1_cnn.conv_output_flat_size qf1 = MlpQfWithObsProcessor(obs_processor=qf1_cnn, output_size=1, input_size=action_dim + cnn_output_dim, **variant['qf_kwargs']) qf2 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params), output_size=1, input_size=action_dim + cnn_output_dim, **variant['qf_kwargs']) target_qf1 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params), output_size=1, input_size=action_dim + cnn_output_dim, **variant['qf_kwargs']) target_qf2 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params), output_size=1, input_size=action_dim + cnn_output_dim, **variant['qf_kwargs']) action_dim = int(np.prod(eval_env.action_space.shape)) policy_cnn = CNN(**cnn_params) policy_obs_processor = nn.Sequential( Split(policy_cnn, identity, image_dim), FlattenEach(), ConcatTuple(), ) policy = TanhGaussianPolicyAdapter( policy_obs_processor, policy_cnn.conv_output_flat_size + non_image_dim, action_dim, **variant['policy_kwargs']) eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, **variant['eval_path_collector_kwargs']) replay_buffer = EnvReplayBuffer( variant['replay_buffer_size'], expl_env, ) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs']) if variant['collection_mode'] == 'batch': expl_path_collector = MdpPathCollector( expl_env, policy, **variant['expl_path_collector_kwargs']) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, 
evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs']) elif variant['collection_mode'] == 'online': expl_path_collector = MdpStepCollector( expl_env, policy, **variant['expl_path_collector_kwargs']) algorithm = TorchOnlineRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs']) algorithm.to(ptu.device) algorithm.train()
def offpolicy_main(variant):
    print("offpolicy main")
    if args.algo == 'sac':
        algo = "SAC"
    elif args.algo == 'td3':
        algo = "TD3"
    setup_logger('{0}_{1}'.format(args.env_name, args.save_name), variant=variant)
    ptu.set_gpu_mode(True)  # optionally set the GPU (default=True)

    expl_env, eval_env, env_obj = prepare_env(
        args.env_name, args.visionmodel_path, **env_kwargs)
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    expl_policy, eval_policy, trainer = prepare_trainer(
        algo, expl_env, obs_dim, action_dim, args.pretrained_policy_load, variant)

    if args.env_name.find('doorenv') > -1:
        expl_policy.knob_noisy = eval_policy.knob_noisy = args.knob_noisy
        expl_policy.nn = eval_policy.nn = env_obj.nn
        expl_policy.visionnet_input = eval_policy.visionnet_input = \
            env_obj.visionnet_input
        if args.visionnet_input:
            visionmodel = load_visionmodel(
                expl_env._wrapped_env.xml_path,
                args.visionmodel_path,
                VisionModelXYZ())
            visionmodel.to(ptu.device)
            expl_policy.visionmodel = visionmodel.eval()
        else:
            expl_policy.visionmodel = None

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        doorenv=args.env_name.find('doorenv') > -1,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
        doorenv=args.env_name.find('doorenv') > -1,
    )

    if not args.replaybuffer_load:
        replay_buffer = EnvReplayBuffer(
            variant['replay_buffer_size'],
            expl_env,
        )
    else:
        replay_buffer = pickle.load(open(args.replaybuffer_load, "rb"))
        replay_buffer._env_info_keys = replay_buffer.env_info_sizes.keys()
        print("Loaded the replay buffer that has length of {}".format(
            replay_buffer.get_diagnostics()))

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])

    algorithm.save_interval = args.save_interval
    algorithm.save_dir = args.save_dir
    algorithm.algo = args.algo
    algorithm.env_name = args.env_name
    algorithm.save_name = args.save_name
    algorithm.env_kwargs = env_kwargs

    summary_name = args.log_dir + '{0}_{1}'
    writer = SummaryWriter(summary_name.format(args.env_name, args.save_name))
    algorithm.writer = writer

    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant): num_agent = variant['num_agent'] from differential_game import DifferentialGame expl_env = DifferentialGame(game_name=args.exp_name) eval_env = DifferentialGame(game_name=args.exp_name) obs_dim = eval_env.observation_space.low.size action_dim = eval_env.action_space.low.size qf_n, policy_n, target_qf_n, target_policy_n, eval_policy_n, expl_policy_n = \ [], [], [], [], [], [] qf2_n, target_qf2_n = [], [] for i in range(num_agent): from rlkit.torch.networks import FlattenMlp qf = FlattenMlp( input_size=(obs_dim * num_agent + action_dim * num_agent), output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * 2, ) target_qf = copy.deepcopy(qf) from rlkit.torch.policies.deterministic_policies import TanhMlpPolicy policy = TanhMlpPolicy( input_size=obs_dim, output_size=action_dim, hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * 2, ) target_policy = copy.deepcopy(policy) eval_policy = policy from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy if variant['random_exploration']: from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy expl_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0), policy=policy, ) else: from rlkit.exploration_strategies.ou_strategy import OUStrategy expl_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=OUStrategy( action_space=expl_env.action_space), policy=policy, ) qf_n.append(qf) policy_n.append(policy) target_qf_n.append(target_qf) target_policy_n.append(target_policy) eval_policy_n.append(eval_policy) expl_policy_n.append(expl_policy) if variant['trainer_kwargs']['double_q']: qf2 = FlattenMlp( input_size=(obs_dim * num_agent + action_dim * num_agent), output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * 2, ) target_qf2 = copy.deepcopy(qf2) qf2_n.append(qf2) target_qf2_n.append(target_qf2) from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n) expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n) from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent) from rlkit.torch.maddpg.maddpg import MADDPGTrainer trainer = MADDPGTrainer(qf_n=qf_n, target_qf_n=target_qf_n, policy_n=policy_n, target_policy_n=target_policy_n, qf2_n=qf2_n, target_qf2_n=target_qf2_n, **variant['trainer_kwargs']) from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, log_path_function=get_generic_ma_path_information, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): eval_env = roboverse.make(variant['env'], transpose_image=True) expl_env = eval_env action_dim = eval_env.action_space.low.size cnn_params = variant['cnn_params'] cnn_params.update( input_width=48, input_height=48, input_channels=3, output_size=1, added_fc_input_size=action_dim, ) cnn_params.update( output_size=256, added_fc_input_size=0, hidden_sizes=[1024, 512], ) policy_obs_processor = CNN(**cnn_params) policy = TanhGaussianPolicy( obs_dim=cnn_params['output_size'], action_dim=action_dim, hidden_sizes=[256, 256, 256], obs_processor=policy_obs_processor, ) if variant['stoch_eval_policy']: eval_policy = policy else: eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, ) expl_path_collector = CustomMDPPathCollector( eval_env, ) observation_key = 'image' replay_buffer = load_data_from_npy_chaining( variant, expl_env, observation_key) trainer = BCTrainer( env=eval_env, policy=policy, **variant['trainer_kwargs'] ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, eval_both=False, batch_rl=True, **variant['algorithm_kwargs'] ) video_func = VideoSaveFunction(variant) algorithm.post_epoch_funcs.append(video_func) algorithm.to(ptu.device) algorithm.train()
def experiment(variant):
    # unwrap the TimeLimit wrapper since we manually terminate after 50 steps
    # eval_env = gym.make('FetchPickAndPlace-v1').env
    # expl_env = gym.make('FetchPickAndPlace-v1').env
    eval_env = make_env()
    expl_env = make_env()
    print(eval_env.observation_space)

    observation_key = 'observation'
    desired_goal_key = 'desired_goal'
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    replay_buffer = ObsDictRelabelingBuffer(
        env=eval_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs']
    )

    # the goal-conditioned env exposes a Dict observation space
    obs_dim = eval_env.observation_space.spaces['observation'].low.size
    action_dim = eval_env.action_space.low.size
    goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size
    print(obs_dim)
    print(action_dim)
    print(goal_dim)

    qf1 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1,
                    **variant['qf_kwargs'])
    qf2 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1,
                    **variant['qf_kwargs'])
    target_qf1 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1,
                           **variant['qf_kwargs'])
    target_qf2 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1,
                           **variant['qf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim, action_dim=action_dim,
                                **variant['policy_kwargs'])
    eval_policy = MakeDeterministic(policy)

    trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2,
                         target_qf1=target_qf1, target_qf2=target_qf2,
                         **variant['sac_trainer_kwargs'])
    trainer = HERTrainer(trainer, use_per=False)

    eval_path_collector = GoalConditionedPathCollector(
        eval_env,
        eval_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    expl_path_collector = GoalConditionedPathCollector(
        expl_env,
        policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def encoder_wrapped_td3bc_experiment(variant): representation_size = 128 output_classes = 20 model_class = variant.get('model_class', TimestepPredictionModel) model = model_class( representation_size, # decoder_output_activation=decoder_activation, output_classes=output_classes, **variant['model_kwargs'], ) # model = torch.nn.DataParallel(model) model_path = variant.get("model_path") # model = load_local_or_remote_file(model_path) state_dict = torch.load(model_path) model.load_state_dict(state_dict) model.to(ptu.device) model.eval() traj = np.load(variant.get("desired_trajectory"), allow_pickle=True)[0] goal_image = traj["observations"][-1]["image_observation"] goal_image = goal_image.reshape(1, 3, 500, 300).transpose([0, 1, 3, 2]) / 255.0 # goal_image = goal_image.reshape(1, 300, 500, 3).transpose([0, 3, 1, 2]) / 255.0 # BECAUSE RLBENCH DEMOS ARENT IMAGE_ENV WRAPPED # goal_image = goal_image[:, :, :240, 60:500] goal_image = goal_image[:, :, 60:, 60:500] goal_image_pt = ptu.from_numpy(goal_image) save_image(goal_image_pt.data.cpu(), 'demos/goal.png', nrow=1) goal_latent = model.encode(goal_image_pt).detach().cpu().numpy().flatten() initial_image = traj["observations"][0]["image_observation"] initial_image = initial_image.reshape(1, 3, 500, 300).transpose([0, 1, 3, 2]) / 255.0 # initial_image = initial_image.reshape(1, 300, 500, 3).transpose([0, 3, 1, 2]) / 255.0 # initial_image = initial_image[:, :, :240, 60:500] initial_image = initial_image[:, :, 60:, 60:500] initial_image_pt = ptu.from_numpy(initial_image) save_image(initial_image_pt.data.cpu(), 'demos/initial.png', nrow=1) initial_latent = model.encode(initial_image_pt).detach().cpu().numpy().flatten() # Move these to td3_bc and bc_v3 (or at least type for reward_params) reward_params = dict( goal_latent=goal_latent, initial_latent=initial_latent, type=variant["reward_params_type"], ) config_params = variant.get("config_params") env = variant['env_class'](**variant['env_kwargs']) env = ImageEnv(env, recompute_reward=False, transpose=True, image_length=450000, reward_type="image_distance", # init_camera=sawyer_pusher_camera_upright_v2, ) env = EncoderWrappedEnv( env, model, reward_params, config_params, **variant.get("encoder_wrapped_env_kwargs", dict()) ) expl_env = env # variant['env_class'](**variant['env_kwargs']) eval_env = env # variant['env_class'](**variant['env_kwargs']) observation_key = variant.get("observation_key", 'state_observation') # one of 'state_observation', 'latent_observation', 'concat_observation' desired_goal_key = 'latent_desired_goal' achieved_goal_key = desired_goal_key.replace("desired", "achieved") es = GaussianAndEpislonStrategy( action_space=expl_env.action_space, **variant["exploration_kwargs"], ) obs_dim = expl_env.observation_space.spaces[observation_key].low.size goal_dim = expl_env.observation_space.spaces[desired_goal_key].low.size action_dim = expl_env.action_space.low.size qf1 = ConcatMlp( input_size=obs_dim + goal_dim + action_dim, output_size=1, # output_activation=TorchMaxClamp(0.0), **variant['qf_kwargs'] ) qf2 = ConcatMlp( input_size=obs_dim + goal_dim + action_dim, output_size=1, # output_activation=TorchMaxClamp(0.0), **variant['qf_kwargs'] ) target_qf1 = ConcatMlp( input_size=obs_dim + goal_dim + action_dim, output_size=1, # output_activation=TorchMaxClamp(0.0), **variant['qf_kwargs'] ) target_qf2 = ConcatMlp( input_size=obs_dim + goal_dim + action_dim, output_size=1, # output_activation=TorchMaxClamp(0.0), **variant['qf_kwargs'] ) # Support for CNNPolicy based policy/target policy # 
Defaults to TanhMlpPolicy unless cnn_params is supplied in variant if 'cnn_params' in variant.keys(): imsize = 48 policy = CNNPolicy(input_width=imsize, input_height=imsize, output_size=action_dim, input_channels=3, **variant['cnn_params'], output_activation=torch.tanh, ) target_policy = CNNPolicy(input_width=imsize, input_height=imsize, output_size=action_dim, input_channels=3, **variant['cnn_params'], output_activation=torch.tanh, ) else: policy = TanhMlpPolicy( input_size=obs_dim + goal_dim, output_size=action_dim, **variant['policy_kwargs'] ) target_policy = TanhMlpPolicy( input_size=obs_dim + goal_dim, output_size=action_dim, **variant['policy_kwargs'] ) expl_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) replay_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs'] ) demo_train_buffer = ObsDictRelabelingBuffer( env=env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs'] ) demo_test_buffer = ObsDictRelabelingBuffer( env=env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs'] ) td3bc_trainer = TD3BCTrainer( env=env, policy=policy, qf1=qf1, qf2=qf2, replay_buffer=replay_buffer, demo_train_buffer=demo_train_buffer, demo_test_buffer=demo_test_buffer, target_qf1=target_qf1, target_qf2=target_qf2, target_policy=target_policy, **variant['trainer_kwargs'] ) trainer = HERTrainer(td3bc_trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( expl_env, expl_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs'] ) if variant.get("save_video", True): video_func = VideoSaveFunction( env, variant, ) algorithm.post_train_funcs.append(video_func) algorithm.to(ptu.device) td3bc_trainer.load_demos() td3bc_trainer.pretrain_policy_with_bc() td3bc_trainer.pretrain_q_with_bc_data() algorithm.train()
def twin_sac_experiment(variant): import rlkit.torch.pytorch_util as ptu from rlkit.data_management.obs_dict_replay_buffer import \ ObsDictRelabelingBuffer from rlkit.torch.networks import ConcatMlp from rlkit.torch.sac.policies import TanhGaussianPolicy from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm from rlkit.torch.sac.policies import MakeDeterministic from rlkit.torch.sac.sac import SACTrainer preprocess_rl_variant(variant) env = get_envs(variant) max_path_length = variant['max_path_length'] observation_key = variant.get('observation_key', 'latent_observation') desired_goal_key = variant.get('desired_goal_key', 'latent_desired_goal') achieved_goal_key = desired_goal_key.replace("desired", "achieved") obs_dim = (env.observation_space.spaces[observation_key].low.size + env.observation_space.spaces[desired_goal_key].low.size) action_dim = env.action_space.low.size qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) target_qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) target_qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim, **variant['policy_kwargs']) replay_buffer = ObsDictRelabelingBuffer( env=env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs']) trainer = SACTrainer(env=env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['twin_sac_trainer_kwargs']) trainer = HERTrainer(trainer) if variant.get("do_state_exp", False): eval_path_collector = GoalConditionedPathCollector( env, MakeDeterministic(policy), observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) else: eval_path_collector = VAEWrappedEnvPathCollector( variant['evaluation_goal_sampling_mode'], env, MakeDeterministic(policy), observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = VAEWrappedEnvPathCollector( variant['exploration_goal_sampling_mode'], env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=env, evaluation_env=env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, max_path_length=max_path_length, **variant['algo_kwargs']) if variant.get("save_video", True): video_func = VideoSaveFunction( env, variant, ) algorithm.post_train_funcs.append(video_func) algorithm.to(ptu.device) if not variant.get("do_state_exp", False): env.vae.to(ptu.device) algorithm.train()
def experiment(variant): img_size = 64 train_top10 = VisualRandomizationConfig( image_directory='./experiment_textures/train/top10', whitelist=[ 'Floor', 'Roof', 'Wall1', 'Wall2', 'Wall3', 'Wall4', 'diningTable_visible' ], apply_arm=False, apply_gripper=False, apply_floor=True) expl_env = gym.make('reach_target_easy-vision-v0', sparse=False, img_size=img_size, force_randomly_place=True, force_change_position=False, blank=True) expl_env = wrappers.FlattenDictWrapper(expl_env, dict_keys=['observation']) t_fn = variant["t_fn"] expl_env = TransformObservationWrapper(expl_env, t_fn) obs_dim = expl_env.observation_space.low.size action_dim = expl_env.action_space.low.size conv_args = { "input_width": 64, "input_height": 64, "input_channels": 3, "kernel_sizes": [4, 4, 3], "n_channels": [32, 64, 64], "strides": [2, 1, 1], "paddings": [0, 0, 0], "hidden_sizes": [1024, 512], "batch_norm_conv": False, "batch_norm_fc": False, 'init_w': 1e-4, "hidden_init": nn.init.orthogonal_, "hidden_activation": nn.ReLU(), } qf1 = FlattenCNN(output_size=1, added_fc_input_size=action_dim, **variant['qf_kwargs'], **conv_args) qf2 = FlattenCNN(output_size=1, added_fc_input_size=action_dim, **variant['qf_kwargs'], **conv_args) target_qf1 = FlattenCNN(output_size=1, added_fc_input_size=action_dim, **variant['qf_kwargs'], **conv_args) target_qf2 = FlattenCNN(output_size=1, added_fc_input_size=action_dim, **variant['qf_kwargs'], **conv_args) policy = TanhCNNPolicy(output_size=action_dim, **variant['policy_kwargs'], **conv_args) target_policy = TanhCNNPolicy(output_size=action_dim, **variant['policy_kwargs'], **conv_args) # es = GaussianStrategy( # action_space=expl_env.action_space, # max_sigma=0.3, # min_sigma=0.1, # Constant sigma # ) es = GaussianAndEpislonStrategy( action_space=expl_env.action_space, epsilon=0.3, max_sigma=0.0, min_sigma=0.0, #constant sigma 0 decay_period=1000000) exploration_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) expl_path_collector = MdpPathCollector( expl_env, exploration_policy, ) replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env) trainer = TD3Trainer(policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, target_policy=target_policy, **variant['trainer_kwargs']) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=None, exploration_data_collector=expl_path_collector, evaluation_data_collector=None, replay_buffer=replay_buffer, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): import sys sys.path.append("./multiagent-particle-envs") from make_env import make_env from particle_env_wrapper import ParticleEnv expl_env = ParticleEnv( make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args'])) eval_env = ParticleEnv( make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args'])) num_agent = expl_env.num_agent obs_dim = eval_env.observation_space.low.size action_dim = eval_env.action_space.low.size qf1_n, qf2_n, cactor_n, policy_n = [], [], [], [] target_qf1_n, target_qf2_n = [], [] log_alpha_n, log_calpha_n = None, None qf1_optimizer_n, qf2_optimizer_n, policy_optimizer_n, cactor_optimizer_n, alpha_optimizer_n, calpha_optimizer_n = \ None, None, None, None, None, None for i in range(num_agent): from rlkit.torch.networks.networks import FlattenMlp qf1 = FlattenMlp( input_size=(obs_dim * num_agent + action_dim * num_agent), output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * variant['qf_kwargs']['num_layer'], ) target_qf1 = copy.deepcopy(qf1) qf2 = FlattenMlp( input_size=(obs_dim * num_agent + action_dim * num_agent), output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * variant['qf_kwargs']['num_layer'], ) target_qf2 = copy.deepcopy(qf2) from rlkit.torch.networks.layers import SplitLayer if variant['trainer_kwargs']['dec_cactor']: input_size = obs_dim + action_dim * (num_agent - 1) else: input_size = obs_dim * num_agent + action_dim * (num_agent - 1) cactor = nn.Sequential( FlattenMlp( input_size=input_size, output_size=variant['cactor_kwargs']['hidden_dim'], hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] * (variant['cactor_kwargs']['num_layer'] - 1), ), SplitLayer(layers=[ nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim), nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim) ])) from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy cactor = TanhGaussianPolicy(module=cactor) policy = nn.Sequential( FlattenMlp( input_size=obs_dim, output_size=variant['policy_kwargs']['hidden_dim'], hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1), ), SplitLayer(layers=[ nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim), nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim) ])) policy = TanhGaussianPolicy(module=policy) qf1_n.append(qf1) qf2_n.append(qf2) cactor_n.append(cactor) policy_n.append(policy) target_qf1_n.append(target_qf1) target_qf2_n.append(target_qf2) from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent) from rlkit.torch.policies.make_deterministic import MakeDeterministic eval_policy_n = [MakeDeterministic(policy) for policy in policy_n] expl_policy_n = policy_n from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n) expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n) from rlkit.torch.r2g.r2g import R2GTrainer trainer = R2GTrainer(env=expl_env, qf1_n=qf1_n, target_qf1_n=target_qf1_n, qf2_n=qf2_n, target_qf2_n=target_qf2_n, policy_n=policy_n, cactor_n=cactor_n, log_alpha_n=log_alpha_n, log_calpha_n=log_calpha_n, qf1_optimizer_n=qf1_optimizer_n, qf2_optimizer_n=qf2_optimizer_n, policy_optimizer_n=policy_optimizer_n, cactor_optimizer_n=cactor_optimizer_n, alpha_optimizer_n=alpha_optimizer_n, calpha_optimizer_n=calpha_optimizer_n, 
**variant['trainer_kwargs']) from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, log_path_function=get_generic_ma_path_information, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): import sys sys.path.append("./multiagent-particle-envs") from make_env import make_env from particle_env_wrapper import ParticleEnv expl_env = ParticleEnv( make_env(args.exp_name, discrete_action_space=True, discrete_action_input=True)) eval_env = ParticleEnv( make_env(args.exp_name, discrete_action_space=True, discrete_action_input=True)) num_agent = expl_env.num_agent obs_dim = eval_env.observation_space.low.size action_dim = eval_env.action_space.n policy_n, target_policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n, eval_policy_n, expl_policy_n = \ [], [], [], [], [], [], [], [] for i in range(num_agent): policy = SoftmaxMlpPolicy(input_size=obs_dim, output_size=action_dim, **variant['policy_kwargs']) target_policy = copy.deepcopy(policy) qf1 = FlattenMlp(input_size=(obs_dim * num_agent + action_dim * (num_agent - 1)), output_size=action_dim, **variant['qf_kwargs']) target_qf1 = copy.deepcopy(qf1) qf2 = FlattenMlp(input_size=(obs_dim * num_agent + action_dim * (num_agent - 1)), output_size=action_dim, **variant['qf_kwargs']) target_qf2 = copy.deepcopy(qf2) eval_policy = ArgmaxDiscretePolicy(policy) expl_policy = PolicyWrappedWithExplorationStrategy( EpsilonGreedy(expl_env.action_space), eval_policy, ) policy_n.append(policy) target_policy_n.append(target_policy) qf1_n.append(qf1) target_qf1_n.append(target_qf1) qf2_n.append(qf2) target_qf2_n.append(target_qf2) eval_policy_n.append(eval_policy) expl_policy_n.append(expl_policy) eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n) expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n) replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent) trainer = PRGDiscreteTrainer(env=expl_env, qf1_n=qf1_n, target_qf1_n=target_qf1_n, qf2_n=qf2_n, target_qf2_n=target_qf2_n, policy_n=policy_n, target_policy_n=target_policy_n, **variant['trainer_kwargs']) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, log_path_function=get_generic_ma_path_information, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): import sys sys.path.append("./multiagent-particle-envs") from make_env import make_env from particle_env_wrapper import ParticleEnv expl_env = ParticleEnv(make_env(args.exp_name,discrete_action_space=False,world_args=variant['world_args'])) eval_env = ParticleEnv(make_env(args.exp_name,discrete_action_space=False,world_args=variant['world_args'])) num_agent = expl_env.num_agent obs_dim = eval_env.observation_space.low.size action_dim = eval_env.action_space.low.size from rlkit.torch.networks.graph_builders import FullGraphBuilder graph_builder_1 = FullGraphBuilder( input_node_dim=obs_dim+action_dim, num_node=num_agent, batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False) from rlkit.torch.networks.graph_context_network import GraphContextNet cg1 = GraphContextNet( graph_builder_1, obs_dim, action_dim, output_activation='lrelu0.2', **variant['graph_kwargs'] ) target_cg1 = copy.deepcopy(cg1) from rlkit.torch.networks.networks import FlattenMlp qf1 = FlattenMlp(input_size=variant['graph_kwargs']['node_dim']+action_dim, output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*(variant['qf_kwargs']['num_layer']-1), hidden_activation=nn.LeakyReLU(negative_slope=0.2), ) target_qf1 = copy.deepcopy(qf1) graph_builder_2 = FullGraphBuilder( input_node_dim=obs_dim+action_dim, num_node=num_agent, batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False) cg2 = GraphContextNet( graph_builder_2, obs_dim, action_dim, output_activation='lrelu0.2', **variant['graph_kwargs'] ) target_cg2 = copy.deepcopy(cg2) qf2 = FlattenMlp(input_size=variant['graph_kwargs']['node_dim']+action_dim, output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*(variant['qf_kwargs']['num_layer']-1), hidden_activation=nn.LeakyReLU(negative_slope=0.2), ) target_qf2 = copy.deepcopy(qf2) graph_builder_ca = FullGraphBuilder( input_node_dim=obs_dim+action_dim, num_node=num_agent, batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False) cgca = GraphContextNet( graph_builder_ca, obs_dim, action_dim, output_activation='lrelu0.2', **variant['graph_kwargs'] ) from rlkit.torch.networks.layers import SplitLayer from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy cactor = nn.Sequential( FlattenMlp(input_size=variant['graph_kwargs']['node_dim'], output_size=variant['cactor_kwargs']['hidden_dim'], hidden_sizes=[variant['cactor_kwargs']['hidden_dim']]*(variant['cactor_kwargs']['num_layer']-1), hidden_activation=nn.LeakyReLU(negative_slope=0.2), output_activation=nn.LeakyReLU(negative_slope=0.2), ), nn.LeakyReLU(negative_slope=0.2), SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim), nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim)]) ) cactor = TanhGaussianPolicy(module=cactor) policy_n, expl_policy_n, eval_policy_n = [], [], [] for i in range(num_agent): policy = nn.Sequential( FlattenMlp(input_size=obs_dim, output_size=variant['policy_kwargs']['hidden_dim'], hidden_sizes=[variant['policy_kwargs']['hidden_dim']]*(variant['policy_kwargs']['num_layer']-1), hidden_activation=nn.LeakyReLU(negative_slope=0.2), output_activation=nn.LeakyReLU(negative_slope=0.2), ), SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim), nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim)]) ) policy = TanhGaussianPolicy(module=policy) from rlkit.torch.policies.make_deterministic import MakeDeterministic eval_policy = MakeDeterministic(policy) expl_policy = policy 
policy_n.append(policy) expl_policy_n.append(expl_policy) eval_policy_n.append(eval_policy) from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n) expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n) from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent) from rlkit.torch.r2g.r2g_gnn7 import R2GGNNTrainer trainer = R2GGNNTrainer( env=expl_env, cg1=cg1, target_cg1=target_cg1, qf1=qf1, target_qf1=target_qf1, cg2=cg2, target_cg2=target_cg2, qf2=qf2, target_qf2=target_qf2, cgca=cgca, cactor=cactor, policy_n=policy_n, **variant['trainer_kwargs'] ) from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, log_path_function=get_generic_ma_path_information, **variant['algorithm_kwargs'] ) algorithm.to(ptu.device) # save init params from rlkit.core import logger snapshot = algorithm._get_snapshot() file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl') torch.save(snapshot, file_name) algorithm.train()
def experiment(variant): eval_env = gym.make(variant['env_name']) expl_env = eval_env obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.low.size M = variant['layer_size'] qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M, M], ) qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M, M], ) target_qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M, M], ) target_qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M, M], ) policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=[M, M, M], ) eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, ) expl_path_collector = CustomMDPPathCollector(eval_env, ) buffer_filename = None if variant['buffer_filename'] is not None: buffer_filename = variant['buffer_filename'] replay_buffer = EnvReplayBuffer( variant['replay_buffer_size'], expl_env, ) if variant['load_buffer'] and buffer_filename is not None: replay_buffer.load_buffer(buffer_filename) else: load_hdf5(d4rl.qlearning_dataset(eval_env), replay_buffer) trainer = CQLTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs']) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, eval_both=True, batch_rl=variant['load_buffer'], **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
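# For reference: the CQL experiment above fills the buffer offline via
# load_hdf5(d4rl.qlearning_dataset(eval_env), replay_buffer), where load_hdf5
# is a helper defined elsewhere in this repo.  The snippet below only inspects
# the dataset dict that helper consumes; the env id is an arbitrary example.
import gym
import d4rl  # noqa: F401  (importing d4rl registers the offline envs)

dataset = d4rl.qlearning_dataset(gym.make('hopper-medium-v0'))
for key in ('observations', 'actions', 'next_observations', 'rewards', 'terminals'):
    print(key, dataset[key].shape)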
def experiment(variant): num_agent = variant['num_agent'] from sequential_differential_game import SequentialDifferentialGame expl_env = SequentialDifferentialGame(**variant['env_kwargs']) eval_env = SequentialDifferentialGame(**variant['env_kwargs']) obs_dim = eval_env.observation_space.low.size action_dim = eval_env.action_space.low.size from rlkit.torch.networks.graph_builders import FullGraphBuilder graph_builder_ca = FullGraphBuilder( input_node_dim=obs_dim+action_dim, num_node=num_agent, batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False) from rlkit.torch.networks.graph_context_network import GraphContextNet cgca = GraphContextNet( graph_builder_ca, obs_dim, action_dim, use_attention=variant['graph_kwargs']['use_attention'], num_layer=variant['graph_kwargs']['num_layer'], node_dim=variant['graph_kwargs']['hidden_dim'], output_activation='relu', ) from rlkit.torch.networks.networks import FlattenMlp from rlkit.torch.networks.layers import SplitLayer from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy cactor = nn.Sequential( cgca, FlattenMlp(input_size=variant['graph_kwargs']['hidden_dim'], output_size=variant['cactor_kwargs']['hidden_dim'], hidden_sizes=[variant['cactor_kwargs']['hidden_dim']]*(variant['cactor_kwargs']['num_layer']-1), ), nn.ReLU(), SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim), nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim)]) ) cactor = TanhGaussianPolicy(module=cactor) policy_n, expl_policy_n, eval_policy_n = [], [], [] qf1_n, qf2_n, target_qf1_n, target_qf2_n = [], [], [], [] for i in range(num_agent): qf1 = FlattenMlp( input_size=(obs_dim*num_agent+action_dim*num_agent), output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*variant['qf_kwargs']['num_layer'], ) target_qf1 = copy.deepcopy(qf1) qf2 = FlattenMlp( input_size=(obs_dim*num_agent+action_dim*num_agent), output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*variant['qf_kwargs']['num_layer'], ) target_qf2 = copy.deepcopy(qf2) policy = nn.Sequential( FlattenMlp(input_size=obs_dim, output_size=variant['policy_kwargs']['hidden_dim'], hidden_sizes=[variant['policy_kwargs']['hidden_dim']]*(variant['policy_kwargs']['num_layer']-1), ), SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim), nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim)]) ) policy = TanhGaussianPolicy(module=policy) from rlkit.torch.policies.make_deterministic import MakeDeterministic eval_policy = MakeDeterministic(policy) expl_policy = policy policy_n.append(policy) expl_policy_n.append(expl_policy) eval_policy_n.append(eval_policy) qf1_n.append(qf1) qf2_n.append(qf2) target_qf1_n.append(target_qf1) target_qf2_n.append(target_qf2) from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n) expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n) from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent) from rlkit.torch.r2g.r2g_gnn3_onlyca import R2GGNNTrainer trainer = R2GGNNTrainer( env=expl_env, qf1_n=qf1_n, target_qf1_n=target_qf1_n, qf2_n=qf2_n, target_qf2_n=target_qf2_n, cactor=cactor, policy_n=policy_n, **variant['trainer_kwargs'] ) from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, 
evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, log_path_function=get_generic_ma_path_information, **variant['algorithm_kwargs'] ) algorithm.to(ptu.device) algorithm.train()
def run_rlkit(env, seed, log_dir): """ Create rlkit model and training. :param seed: Random seed for the trial. :param log_dir: Log dir path. :return result csv file """ reset_execution_environment() gt.reset() setup_logger(log_dir=log_dir) expl_env = NormalizedBoxEnv(env) eval_env = NormalizedBoxEnv(env) obs_dim = expl_env.observation_space.low.size action_dim = expl_env.action_space.low.size qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=params['qf_hidden_sizes']) qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=params['qf_hidden_sizes']) target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=params['qf_hidden_sizes']) target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=params['qf_hidden_sizes']) policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim, hidden_sizes=params['policy_hidden_sizes']) target_policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim, hidden_sizes=params['policy_hidden_sizes']) es = RLkitGaussianStrategy( action_space=expl_env.action_space, max_sigma=params['sigma'], min_sigma=params['sigma'], ) exploration_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) eval_path_collector = MdpPathCollector( eval_env, policy, ) expl_path_collector = MdpPathCollector( expl_env, exploration_policy, ) replay_buffer = EnvReplayBuffer( params['replay_buffer_size'], expl_env, ) trainer = TD3Trainer(policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, target_policy=target_policy, discount=params['discount']) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, num_epochs=params['n_epochs'], num_train_loops_per_epoch=params['steps_per_epoch'], num_trains_per_train_loop=params['n_train_steps'], num_expl_steps_per_train_loop=params['n_rollout_steps'], num_eval_steps_per_epoch=params['n_rollout_steps'], min_num_steps_before_training=params['min_buffer_size'], max_path_length=params['n_rollout_steps'], batch_size=params['buffer_batch_size'], ) algorithm.to(ptu.device) algorithm.train() return osp.join(log_dir, 'progress.csv')
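# run_rlkit() pulls its hyperparameters from a module-level `params` dict.
# The key names below are exactly the ones read by the function above; the
# numeric values are illustrative placeholders, not tuned settings.
params = dict(
    qf_hidden_sizes=[256, 256],
    policy_hidden_sizes=[256, 256],
    sigma=0.1,                 # exploration noise (max_sigma == min_sigma)
    discount=0.99,
    replay_buffer_size=int(1e6),
    n_epochs=500,
    steps_per_epoch=1,         # num_train_loops_per_epoch
    n_train_steps=1000,        # num_trains_per_train_loop
    n_rollout_steps=1000,      # also used as max_path_length and eval steps
    min_buffer_size=1000,      # min_num_steps_before_training
    buffer_batch_size=256,
)
# result_csv = run_rlkit(env, seed=0, log_dir='./data/td3_run')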
def experiment(variant): expl_env = NormalizedBoxEnv(CartPoleEnv(mode=1)) eval_env = NormalizedBoxEnv(CartPoleEnv(mode=1)) obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.low.size M = variant['layer_size'] vf1 = FlattenMlp( input_size=obs_dim, output_size=1, hidden_sizes=[M, M], ) vf2 = FlattenMlp( input_size=obs_dim, output_size=1, hidden_sizes=[M, M], ) target_vf1 = FlattenMlp( input_size=obs_dim, output_size=1, hidden_sizes=[M, M], ) target_vf2 = FlattenMlp( input_size=obs_dim, output_size=1, hidden_sizes=[M, M], ) policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=[M, M], return_raw_action=True, ) eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, ) expl_path_collector = MdpPathCollector( expl_env, policy, ) replay_buffer = EnvReplayBuffer( variant['replay_buffer_size'], expl_env, store_raw_action=True, ) trainer = FlowQTrainer(env=eval_env, policy=policy, vf1=vf1, vf2=vf2, target_vf1=target_vf1, target_vf2=target_vf2, **variant['trainer_kwargs']) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): env_name = variant['env_name'] if env_name in ENVS: eval_env = NormalizedBoxEnv(ENVS[env_name]()) expl_env = eval_env else: eval_env = NormalizedBoxEnv(gym.make(variant['env_name'])) expl_env = eval_env obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.low.size M = variant['layer_size'] qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) target_qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) target_qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=[M, M], ) eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, ) expl_path_collector = MdpPathCollector( expl_env, policy, ) replay_buffer = EnvReplayBuffer( variant['replay_buffer_size'], expl_env, ) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs']) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
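# Example variant for the generic SAC launcher above.  The top-level keys are
# the ones the function reads; the nested names mirror common rlkit
# SACTrainer / TorchBatchRLAlgorithm arguments, and all concrete values are
# placeholders -- verify the kwargs against the rlkit version in use.
variant = dict(
    env_name='HalfCheetah-v2',
    layer_size=256,
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(
        discount=0.99,
        soft_target_tau=5e-3,
        target_update_period=1,
        policy_lr=3e-4,
        qf_lr=3e-4,
        reward_scale=1.0,
        use_automatic_entropy_tuning=True,
    ),
    algorithm_kwargs=dict(
        batch_size=256,
        max_path_length=1000,
        num_epochs=1000,
        num_eval_steps_per_epoch=5000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)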
batch_size=256, num_actions_sample=100, ) trainer = BEARTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, vae=vae) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, batch_rl=True, q_learning_alg=True, **algorithm_kwargs) from flow.controllers.base_controller import BaseController class RLTestConntroller(BaseController): def __init__(self, veh_id, car_following_params): """Instantiate an RL Controller.""" BaseController.__init__(self, veh_id, car_following_params) def get_accel(self, env): action = algorithm.policy_fn(env.states)
def _disentangled_her_twin_sac_experiment_v2( max_path_length, encoder_kwargs, disentangled_qf_kwargs, qf_kwargs, twin_sac_trainer_kwargs, replay_buffer_kwargs, policy_kwargs, evaluation_goal_sampling_mode, exploration_goal_sampling_mode, algo_kwargs, save_video=True, env_id=None, env_class=None, env_kwargs=None, observation_key='state_observation', desired_goal_key='state_desired_goal', achieved_goal_key='state_achieved_goal', # Video parameters latent_dim=2, save_video_kwargs=None, **kwargs ): import rlkit.samplers.rollout_functions as rf import rlkit.torch.pytorch_util as ptu from rlkit.data_management.obs_dict_replay_buffer import \ ObsDictRelabelingBuffer from rlkit.torch.networks import ConcatMlp from rlkit.torch.sac.policies import TanhGaussianPolicy from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm if save_video_kwargs is None: save_video_kwargs = {} if env_kwargs is None: env_kwargs = {} assert env_id or env_class if env_id: import gym import multiworld multiworld.register_all_envs() train_env = gym.make(env_id) eval_env = gym.make(env_id) else: eval_env = env_class(**env_kwargs) train_env = env_class(**env_kwargs) obs_dim = train_env.observation_space.spaces[observation_key].low.size goal_dim = train_env.observation_space.spaces[desired_goal_key].low.size action_dim = train_env.action_space.low.size encoder = ConcatMlp( input_size=goal_dim, output_size=latent_dim, **encoder_kwargs ) qf1 = DisentangledMlpQf( encoder=encoder, preprocess_obs_dim=obs_dim, action_dim=action_dim, qf_kwargs=qf_kwargs, **disentangled_qf_kwargs ) qf2 = DisentangledMlpQf( encoder=encoder, preprocess_obs_dim=obs_dim, action_dim=action_dim, qf_kwargs=qf_kwargs, **disentangled_qf_kwargs ) target_qf1 = DisentangledMlpQf( encoder=Detach(encoder), preprocess_obs_dim=obs_dim, action_dim=action_dim, qf_kwargs=qf_kwargs, **disentangled_qf_kwargs ) target_qf2 = DisentangledMlpQf( encoder=Detach(encoder), preprocess_obs_dim=obs_dim, action_dim=action_dim, qf_kwargs=qf_kwargs, **disentangled_qf_kwargs ) policy = TanhGaussianPolicy( obs_dim=obs_dim + goal_dim, action_dim=action_dim, **policy_kwargs ) replay_buffer = ObsDictRelabelingBuffer( env=train_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **replay_buffer_kwargs ) sac_trainer = SACTrainer( env=train_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **twin_sac_trainer_kwargs ) trainer = HERTrainer(sac_trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, MakeDeterministic(policy), max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, goal_sampling_mode=evaluation_goal_sampling_mode, ) expl_path_collector = GoalConditionedPathCollector( train_env, policy, max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, goal_sampling_mode=exploration_goal_sampling_mode, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=train_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, max_path_length=max_path_length, **algo_kwargs, ) algorithm.to(ptu.device) if save_video: save_vf_heatmap = save_video_kwargs.get('save_vf_heatmap', True) def v_function(obs): action = policy.get_actions(obs) obs, action = ptu.from_numpy(obs), ptu.from_numpy(action) return qf1(obs, action, return_individual_q_vals=True) add_heatmap = partial(add_heatmap_imgs_to_o_dict, v_function=v_function) 
rollout_function = rf.create_rollout_function( rf.multitask_rollout, max_path_length=max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, full_o_postprocess_func=add_heatmap if save_vf_heatmap else None, ) img_keys = ['v_vals'] + [ 'v_vals_dim_{}'.format(dim) for dim in range(latent_dim) ] eval_video_func = get_save_video_function( rollout_function, eval_env, MakeDeterministic(policy), tag="eval", get_extra_imgs=partial(get_extra_imgs, img_keys=img_keys), **save_video_kwargs ) train_video_func = get_save_video_function( rollout_function, train_env, policy, tag="train", get_extra_imgs=partial(get_extra_imgs, img_keys=img_keys), **save_video_kwargs ) decoder = ConcatMlp( input_size=obs_dim, output_size=obs_dim, hidden_sizes=[128, 128], ) decoder.to(ptu.device) # algorithm.post_train_funcs.append(train_decoder(variant, encoder, decoder)) # algorithm.post_train_funcs.append(plot_encoder_function(variant, encoder)) # algorithm.post_train_funcs.append(plot_buffer_function( # save_video_period, 'state_achieved_goal')) # algorithm.post_train_funcs.append(plot_buffer_function( # save_video_period, 'state_desired_goal')) algorithm.post_train_funcs.append(eval_video_func) algorithm.post_train_funcs.append(train_video_func) algorithm.train()
def experiment(variant): eval_env = gym.make('FetchReach-v1') expl_env = gym.make('FetchReach-v1') observation_key = 'observation' desired_goal_key = 'desired_goal' achieved_goal_key = desired_goal_key.replace("desired", "achieved") replay_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs']) obs_dim = eval_env.observation_space.spaces['observation'].low.size action_dim = eval_env.action_space.low.size goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size qf1 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) qf2 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) target_qf1 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) target_qf2 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim, action_dim=action_dim, **variant['policy_kwargs']) eval_policy = MakeDeterministic(policy) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['sac_trainer_kwargs']) trainer = HERTrainer(trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, eval_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( expl_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs']) algorithm.to(ptu.device) algorithm.train()
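# Example variant for the HER + SAC FetchReach launcher above.  Top-level keys
# match what experiment() reads; the nested replay_buffer_kwargs follow rlkit's
# ObsDictRelabelingBuffer arguments as I understand them, and every value is a
# placeholder rather than a recommended setting.
variant = dict(
    algo_kwargs=dict(
        batch_size=128,
        max_path_length=50,
        num_epochs=100,
        num_eval_steps_per_epoch=5000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
    sac_trainer_kwargs=dict(
        discount=0.99,
        soft_target_tau=1e-3,
        use_automatic_entropy_tuning=True,
    ),
    replay_buffer_kwargs=dict(
        max_size=int(1e6),
        # HER relabeling fractions (see ObsDictRelabelingBuffer)
        fraction_goals_rollout_goals=0.2,
        fraction_goals_env_goals=0.0,
    ),
    qf_kwargs=dict(hidden_sizes=[256, 256]),
    policy_kwargs=dict(hidden_sizes=[256, 256]),
)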
def experiment(variant): expl_env = NormalizedBoxEnv(HumanoidEnv()) eval_env = NormalizedBoxEnv(HumanoidEnv()) obs_dim = expl_env.observation_space.low.size action_dim = expl_env.action_space.low.size qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim, **variant['policy_kwargs']) target_policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim, **variant['policy_kwargs']) es = GaussianStrategy( action_space=expl_env.action_space, max_sigma=0.1, min_sigma=0.1, # Constant sigma ) exploration_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) eval_path_collector = MdpPathCollector( eval_env, policy, ) expl_path_collector = MdpPathCollector( expl_env, exploration_policy, ) replay_buffer = EnvReplayBuffer( variant['replay_buffer_size'], expl_env, ) trainer = TD3Trainer(policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, target_policy=target_policy, **variant['trainer_kwargs']) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
def goal_conditioned_sac_experiment( max_path_length, qf_kwargs, sac_trainer_kwargs, replay_buffer_kwargs, policy_kwargs, algo_kwargs, env_id=None, env_class=None, env_kwargs=None, observation_key='state_observation', desired_goal_key='state_desired_goal', achieved_goal_key='state_achieved_goal', exploration_policy_kwargs=None, evaluation_goal_sampling_mode=None, exploration_goal_sampling_mode=None, # Video parameters save_video=True, save_video_kwargs=None, renderer_kwargs=None, ): if exploration_policy_kwargs is None: exploration_policy_kwargs = {} if not save_video_kwargs: save_video_kwargs = {} if not renderer_kwargs: renderer_kwargs = {} context_key = desired_goal_key sample_context_from_obs_dict_fn = RemapKeyFn( {context_key: observation_key}) def contextual_env_distrib_and_reward(env_id, env_class, env_kwargs, goal_sampling_mode): env = get_gym_env(env_id, env_class=env_class, env_kwargs=env_kwargs) env.goal_sampling_mode = goal_sampling_mode goal_distribution = GoalDictDistributionFromMultitaskEnv( env, desired_goal_keys=[desired_goal_key], ) reward_fn = ContextualRewardFnFromMultitaskEnv( env=env, achieved_goal_from_observation=IndexIntoAchievedGoal( observation_key), desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, ) diag_fn = GoalConditionedDiagnosticsToContextualDiagnostics( env.goal_conditioned_diagnostics, desired_goal_key=desired_goal_key, observation_key=observation_key, ) env = ContextualEnv( env, context_distribution=goal_distribution, reward_fn=reward_fn, observation_key=observation_key, contextual_diagnostics_fns=[diag_fn], update_env_info_fn=delete_info, ) return env, goal_distribution, reward_fn expl_env, expl_context_distrib, expl_reward = contextual_env_distrib_and_reward( env_id, env_class, env_kwargs, exploration_goal_sampling_mode) eval_env, eval_context_distrib, eval_reward = contextual_env_distrib_and_reward( env_id, env_class, env_kwargs, evaluation_goal_sampling_mode) obs_dim = (expl_env.observation_space.spaces[observation_key].low.size + expl_env.observation_space.spaces[context_key].low.size) action_dim = expl_env.action_space.low.size def create_qf(): return ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **qf_kwargs) qf1 = create_qf() qf2 = create_qf() target_qf1 = create_qf() target_qf2 = create_qf() policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim, **policy_kwargs) def concat_context_to_obs(batch, *args, **kwargs): obs = batch['observations'] next_obs = batch['next_observations'] context = batch[context_key] batch['observations'] = np.concatenate([obs, context], axis=1) batch['next_observations'] = np.concatenate([next_obs, context], axis=1) return batch replay_buffer = ContextualRelabelingReplayBuffer( env=eval_env, context_keys=[context_key], observation_keys_to_save=[observation_key], context_distribution=eval_context_distrib, sample_context_from_obs_dict_fn=sample_context_from_obs_dict_fn, reward_fn=eval_reward, post_process_batch_fn=concat_context_to_obs, **replay_buffer_kwargs) trainer = SACTrainer(env=expl_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **sac_trainer_kwargs) eval_path_collector = ContextualPathCollector( eval_env, MakeDeterministic(policy), observation_key=observation_key, context_keys_for_policy=[context_key], ) exploration_policy = create_exploration_policy(policy=policy, env=expl_env, **exploration_policy_kwargs) expl_path_collector = ContextualPathCollector( expl_env, exploration_policy, observation_key=observation_key, 
context_keys_for_policy=[context_key], ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, max_path_length=max_path_length, **algo_kwargs) algorithm.to(ptu.device) if save_video: rollout_function = partial( rf.contextual_rollout, max_path_length=max_path_length, observation_key=observation_key, context_keys_for_policy=[context_key], ) renderer = EnvRenderer(**renderer_kwargs) def add_images(env, state_distribution): state_env = env.env image_goal_distribution = AddImageDistribution( env=state_env, base_distribution=state_distribution, image_goal_key='image_desired_goal', renderer=renderer, ) img_env = InsertImageEnv(state_env, renderer=renderer) return ContextualEnv( img_env, context_distribution=image_goal_distribution, reward_fn=eval_reward, observation_key=observation_key, update_env_info_fn=delete_info, ) img_eval_env = add_images(eval_env, eval_context_distrib) img_expl_env = add_images(expl_env, expl_context_distrib) eval_video_func = get_save_video_function( rollout_function, img_eval_env, MakeDeterministic(policy), tag="eval", imsize=renderer.width, image_format=renderer.output_image_format, **save_video_kwargs) expl_video_func = get_save_video_function( rollout_function, img_expl_env, exploration_policy, tag="train", imsize=renderer.width, image_format=renderer.output_image_format, **save_video_kwargs) algorithm.post_train_funcs.append(eval_video_func) algorithm.post_train_funcs.append(expl_video_func) algorithm.train()
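# Sketch of how goal_conditioned_sac_experiment() above might be invoked.  The
# argument names come from its signature; the env id, goal-sampling modes, and
# nested replay-buffer kwargs depend on the multiworld / rlkit version and are
# assumptions to be checked, as are all numeric values.
goal_conditioned_sac_experiment(
    max_path_length=50,
    qf_kwargs=dict(hidden_sizes=[256, 256]),
    policy_kwargs=dict(hidden_sizes=[256, 256]),
    sac_trainer_kwargs=dict(discount=0.99, soft_target_tau=1e-3),
    replay_buffer_kwargs=dict(
        max_size=int(1e6),
        # relabeling fractions; verify these names against
        # ContextualRelabelingReplayBuffer before use
        fraction_future_context=0.4,
        fraction_distribution_context=0.4,
    ),
    algo_kwargs=dict(
        batch_size=128,
        num_epochs=100,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
    env_id='SawyerPushNIPSEasy-v0',
    evaluation_goal_sampling_mode='reset_of_env',
    exploration_goal_sampling_mode='reset_of_env',
    save_video=False,
)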