def experiment(variant):
    """Train the R2G-GNN (``r2g_gnn3``) trainer on a multi-agent particle env.

    Builds separate exploration and evaluation environments, two
    graph-context critics (``cg1``/``qf1`` and ``cg2``/``qf2``) with
    deep-copied targets, the ``cgca`` GNN, the ``cactor`` Tanh-Gaussian
    policy, and one Tanh-Gaussian policy per agent. Everything is handed to
    ``R2GGNNTrainer`` and ``TorchBatchRLAlgorithm``. A snapshot of the freshly
    initialised algorithm is written to ``itr_-1.pkl`` in the logger's
    snapshot directory before training starts.

    The scenario name is read from the module-level ``args.exp_name``.

    Args:
        variant: Nested configuration dict. Keys read here:
            ``world_args``; ``algorithm_kwargs`` (including ``batch_size``);
            ``graph_kwargs`` (including ``node_dim`` and ``num_layer``);
            ``qf_kwargs``, ``cactor_kwargs`` and ``policy_kwargs`` (each with
            ``hidden_dim`` and ``num_layer``); ``replay_buffer_size``;
            ``trainer_kwargs``.
    """
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(
        make_env(args.exp_name, discrete_action_space=False,
                 world_args=variant['world_args']))
    eval_env = ParticleEnv(
        make_env(args.exp_name, discrete_action_space=False,
                 world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    # First critic: graph context network + Q head, plus target copies.
    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    cg1 = GraphContextNet(graph_builder_1, obs_dim, action_dim,
                          output_activation='lrelu0.2',
                          **variant['graph_kwargs'])
    target_cg1 = copy.deepcopy(cg1)
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']]
        * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf1 = copy.deepcopy(qf1)

    # Second critic, built the same way with its own graph builder.
    graph_builder_2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cg2 = GraphContextNet(graph_builder_2, obs_dim, action_dim,
                          output_activation='lrelu0.2',
                          **variant['graph_kwargs'])
    target_cg2 = copy.deepcopy(cg2)
    qf2 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']]
        * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf2 = copy.deepcopy(qf2)

    # GNN passed to the trainer as ``cgca``; here it is separate from cactor.
    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    cgca = GNNNet(
        pre_graph_builder=graph_builder_ca,
        node_dim=variant['graph_kwargs']['node_dim'],
        conv_type='GSage',
        num_conv_layers=variant['graph_kwargs']['num_layer'],
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
    )

    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    # The two output heads consume the MLP output, so they must be sized by
    # the cactor hidden width. The previous code used
    # policy_kwargs['hidden_dim'], which only worked when both were equal.
    cactor_hidden_dim = variant['cactor_kwargs']['hidden_dim']
    cactor = nn.Sequential(
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=cactor_hidden_dim,
            hidden_sizes=[cactor_hidden_dim]
            * (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ),
        # NOTE(review): the MLP above already ends in a LeakyReLU, so the
        # activation is applied twice; kept to preserve existing behaviour.
        nn.LeakyReLU(negative_slope=0.2),
        SplitLayer(layers=[
            nn.Linear(cactor_hidden_dim, action_dim),
            nn.Linear(cactor_hidden_dim, action_dim),
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    # One independent policy per agent; evaluation uses a deterministic
    # wrapper, exploration uses the stochastic policy itself.
    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for _ in range(num_agent):
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']]
                * (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
            ]))
        policy = TanhGaussianPolicy(module=policy)
        eval_policy = MakeDeterministic(policy)
        expl_policy = policy

        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn3 import R2GGNNTrainer
    trainer = R2GGNNTrainer(
        env=expl_env,
        cg1=cg1, target_cg1=target_cg1,
        qf1=qf1, target_qf1=target_qf1,
        cg2=cg2, target_cg2=target_cg2,
        qf2=qf2, target_qf2=target_qf2,
        cgca=cgca,
        cactor=cactor,
        policy_n=policy_n,
        **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)

    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
def experiment(variant):
    """Train the R2G-GNN (``r2g_gnn12``) trainer on the multi-agent cartpole.

    Builds two ``CartPoleEnv(mode=3)`` instances (exploration / evaluation),
    two ``R2GQNet`` critics with deep-copied targets, a ``cactor``
    Tanh-Gaussian policy whose first stage is the ``cgca`` GNN, and one
    Tanh-Gaussian policy per agent. Both path collectors receive
    ``obs_gnn_1`` as ``shared_encoder``. A snapshot of the freshly
    initialised algorithm is written to ``itr_-1.pkl`` in the logger's
    snapshot directory before training starts.

    Args:
        variant: Nested configuration dict. Keys read here:
            ``algorithm_kwargs`` (including ``batch_size``);
            ``graph_kwargs`` (including ``node_dim`` and, when
            ``concat_emb`` is truthy, ``num_conv_layers``); ``concat_emb``;
            ``qf_kwargs``, ``cactor_kwargs`` and ``policy_kwargs`` (each with
            ``hidden_dim`` and ``num_layer``); ``random_exploration``;
            ``replay_buffer_size``; ``trainer_kwargs``.
    """
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=3)
    eval_env = CartPoleEnv(mode=3)
    num_agent = expl_env.num_agents
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_obs = FullGraphBuilder(
        input_node_dim=obs_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    obs_gnn_1 = GNNNet(
        graph_builder_obs,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )

    graph_builder_eval = FullGraphBuilder(
        input_node_dim=graph_builder_obs.output_node_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    # Width of the features fed to the post-MLP of each critic.
    if variant['concat_emb']:
        gnn_out_dim = int(obs_dim + variant['graph_kwargs']['node_dim']
                          * variant['graph_kwargs']['num_conv_layers'])
    else:
        gnn_out_dim = variant['graph_kwargs']['node_dim']

    from rlkit.torch.networks.networks import FlattenMlp
    post_mlp1 = FlattenMlp(
        input_size=gnn_out_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']]
        * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    from rlkit.torch.networks.graph_r2g_qnet2 import R2GQNet
    qf1 = R2GQNet(
        obs_gnn=obs_gnn_1,
        pre_graph_builder=graph_builder_eval,
        obs_dim=obs_dim,
        action_dim=action_dim,
        post_mlp=post_mlp1,
        normalize_emb=False,
        output_activation=None,
        concat_emb=variant['concat_emb'],
        **variant['graph_kwargs'],
    )
    target_qf1 = copy.deepcopy(qf1)

    # Second critic: own observation GNN and post-MLP, same graph builders.
    obs_gnn_2 = GNNNet(
        graph_builder_obs,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    post_mlp2 = FlattenMlp(
        input_size=gnn_out_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']]
        * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    qf2 = R2GQNet(
        obs_gnn=obs_gnn_2,
        pre_graph_builder=graph_builder_eval,
        obs_dim=obs_dim,
        action_dim=action_dim,
        post_mlp=post_mlp2,
        normalize_emb=False,
        output_activation=None,
        concat_emb=variant['concat_emb'],
        **variant['graph_kwargs'],
    )
    target_qf2 = copy.deepcopy(qf2)

    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cgca = GNNNet(
        graph_builder_ca,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )

    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    # The two output heads consume the MLP output, so they must be sized by
    # the cactor hidden width. The previous code used
    # policy_kwargs['hidden_dim'], which only worked when both were equal.
    cactor_hidden_dim = variant['cactor_kwargs']['hidden_dim']
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=cactor_hidden_dim,
            hidden_sizes=[cactor_hidden_dim]
            * (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ),
        # NOTE(review): the MLP above already ends in a LeakyReLU, so the
        # activation is applied twice; kept to preserve existing behaviour.
        nn.LeakyReLU(negative_slope=0.2),
        SplitLayer(layers=[
            nn.Linear(cactor_hidden_dim, action_dim),
            nn.Linear(cactor_hidden_dim, action_dim),
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    # NOTE(review): this builder is never referenced again in this function;
    # it is kept only so object construction matches the previous version.
    graph_builder_policy = FullGraphBuilder(
        input_node_dim=obs_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for _ in range(num_agent):
        # Per-agent policies take node_dim-wide inputs (not raw observations).
        policy = nn.Sequential(
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']]
                * (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
            ]))
        policy = TanhGaussianPolicy(module=policy)
        eval_policy = MakeDeterministic(policy)
        if variant['random_exploration']:
            from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            # prob_random_action=1.0 is what the exploration wrapper is
            # configured with when random exploration is requested.
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy

        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n,
                                             shared_encoder=obs_gnn_1)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n,
                                             shared_encoder=obs_gnn_1)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn12 import R2GGNNTrainer
    trainer = R2GGNNTrainer(
        env=expl_env,
        qf1=qf1, target_qf1=target_qf1,
        qf2=qf2, target_qf2=target_qf2,
        cactor=cactor,
        policy_n=policy_n,
        **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)

    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
def experiment(variant):
    """Run MASAC with graph-context critics on ``MultiDifferentialGame``.

    Two environments (exploration / evaluation) are created from
    ``variant['env_kwargs']``. Two critics are built, each a
    ``GraphContextNet`` followed by a ``FlattenMlp`` Q head, and each is
    deep-copied to obtain its target. One Tanh-Gaussian policy is built per
    agent. The initial algorithm snapshot is saved as ``itr_-1.pkl`` before
    training starts.

    Args:
        variant: Nested configuration dict. Keys read here: ``env_kwargs``,
            ``algorithm_kwargs`` (including ``batch_size``), ``graph_kwargs``
            (including ``node_dim``), ``qf_kwargs`` and ``policy_kwargs``
            (each with ``hidden_dim`` and ``num_layer``),
            ``random_exploration``, ``replay_buffer_size``,
            ``trainer_kwargs``.
    """
    from multi_differential_game import MultiDifferentialGame

    expl_env = MultiDifferentialGame(**variant['env_kwargs'])
    eval_env = MultiDifferentialGame(**variant['env_kwargs'])
    num_agent = expl_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    from rlkit.torch.networks.networks import FlattenMlp

    def leaky_relu():
        # Every activation module in this launcher uses slope 0.2.
        return nn.LeakyReLU(negative_slope=0.2)

    def build_critic():
        # Construction order (builder, context net, target, Q head, target)
        # is the same as in the straight-line version of this code.
        builder = FullGraphBuilder(
            input_node_dim=obs_dim + action_dim,
            num_node=num_agent,
            batch_size=variant['algorithm_kwargs']['batch_size'],
            contain_self_loop=False)
        context_net = GraphContextNet(
            builder, obs_dim, action_dim,
            output_activation='lrelu0.2',
            **variant['graph_kwargs'])
        context_target = copy.deepcopy(context_net)
        q_head = FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'] + action_dim,
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']]
            * (variant['qf_kwargs']['num_layer'] - 1),
            hidden_activation=leaky_relu(),
        )
        q_target = copy.deepcopy(q_head)
        return context_net, context_target, q_head, q_target

    cg1, target_cg1, qf1, target_qf1 = build_critic()
    cg2, target_cg2, qf2, target_qf2 = build_critic()

    def build_agent_policy():
        # MLP trunk followed by two linear heads, wrapped as Tanh-Gaussian.
        from rlkit.torch.networks.layers import SplitLayer
        trunk = FlattenMlp(
            input_size=obs_dim,
            output_size=variant['policy_kwargs']['hidden_dim'],
            hidden_sizes=[variant['policy_kwargs']['hidden_dim']]
            * (variant['policy_kwargs']['num_layer'] - 1),
            hidden_activation=leaky_relu(),
            output_activation=leaky_relu(),
        )
        heads = SplitLayer(layers=[
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
        ])
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        return TanhGaussianPolicy(module=nn.Sequential(trunk, heads))

    policy_n = []
    expl_policy_n = []
    eval_policy_n = []
    for _agent_idx in range(num_agent):
        agent_policy = build_agent_policy()
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        deterministic_policy = MakeDeterministic(agent_policy)
        if not variant['random_exploration']:
            exploring_policy = agent_policy
        else:
            from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            exploring_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=agent_policy,
            )
        policy_n.append(agent_policy)
        expl_policy_n.append(exploring_policy)
        eval_policy_n.append(deterministic_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(
        variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.masac.masac_gnn_gcontext import MASACGNNTrainer
    trainer = MASACGNNTrainer(
        env=expl_env,
        cg1=cg1, target_cg1=target_cg1,
        qf1=qf1, target_qf1=target_qf1,
        cg2=cg2, target_cg2=target_cg2,
        qf2=qf2, target_qf2=target_qf2,
        policy_n=policy_n,
        **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)

    # Persist the untrained parameters as iteration -1.
    from rlkit.core import logger
    initial_snapshot = algorithm._get_snapshot()
    snapshot_path = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(initial_snapshot, snapshot_path)

    algorithm.train()
def experiment(variant):
    """Train the R2G-GNN (``r2g_gnn10``) trainer on a particle-graph env.

    Builds exploration and evaluation environments via ``make_env``, two
    critics (``GNNNet`` + ``SelectLayer`` context nets with ``FlattenMlp`` Q
    heads) and their deep-copied targets, a ``cactor`` Tanh-Gaussian policy
    whose first stage is the ``cgca`` GNN, and one GNN-based Tanh-Gaussian
    policy per agent. Collectors, replay buffer and trainer are all created
    with ``shared_obs=True``. A snapshot of the freshly initialised algorithm
    is written to ``itr_-1.pkl`` in the logger's snapshot directory before
    training starts.

    The scenario name is read from the module-level ``args.exp_name``.

    Args:
        variant: Nested configuration dict. Keys read here:
            ``world_args``; ``algorithm_kwargs`` (including ``batch_size``);
            ``graph_kwargs`` (including ``node_dim``); ``qf_kwargs``,
            ``cactor_kwargs`` and ``policy_kwargs`` (each with ``hidden_dim``
            and ``num_layer``); ``random_exploration``;
            ``replay_buffer_size``; ``trainer_kwargs``.
    """
    import sys
    sys.path.append("./particle-graph-envs")
    from make_env import make_env
    expl_env = make_env(args.exp_name, discrete_action_space=False,
                        world_args=variant['world_args'])
    eval_env = make_env(args.exp_name, discrete_action_space=False,
                        world_args=variant['world_args'])
    num_agent = expl_env.num_agents
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from particle_graph import ParticleGraphBuilder
    graph_builder_cg1 = ParticleGraphBuilder(
        num_agents=expl_env.scenario.num_agents,
        num_landmarks=expl_env.scenario.num_landmarks,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        append_action=True,
        contain_self_loop=False,
    )
    from rlkit.torch.networks.gnn_networks import GNNNet
    from rlkit.torch.networks.layers import SelectLayer
    # SelectLayer keeps indices 0..num_agent-1 along dim 1 of the GNN output.
    cg1 = nn.Sequential(
        GNNNet(
            graph_builder_cg1,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        ),
        SelectLayer(dim=1, index=torch.arange(num_agent)),
    )
    target_cg1 = copy.deepcopy(cg1)
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']]
        * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf1 = copy.deepcopy(qf1)

    graph_builder_cg2 = ParticleGraphBuilder(
        num_agents=expl_env.scenario.num_agents,
        num_landmarks=expl_env.scenario.num_landmarks,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        append_action=True,
        contain_self_loop=False,
    )
    cg2 = nn.Sequential(
        GNNNet(
            graph_builder_cg2,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        ),
        SelectLayer(dim=1, index=torch.arange(num_agent)),
    )
    target_cg2 = copy.deepcopy(cg2)
    qf2 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']]
        * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf2 = copy.deepcopy(qf2)

    graph_builder_ca = ParticleGraphBuilder(
        num_agents=expl_env.scenario.num_agents,
        num_landmarks=expl_env.scenario.num_landmarks,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        append_action=True,
        contain_self_loop=False,
    )
    cgca = nn.Sequential(
        GNNNet(
            graph_builder_ca,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        ),
        SelectLayer(dim=1, index=torch.arange(num_agent)),
    )

    from rlkit.torch.networks.layers import SplitLayer, FlattenLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    # The two output heads consume the MLP output, so they must be sized by
    # the cactor hidden width. The previous code used
    # policy_kwargs['hidden_dim'], which only worked when both were equal.
    cactor_hidden_dim = variant['cactor_kwargs']['hidden_dim']
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=cactor_hidden_dim,
            hidden_sizes=[cactor_hidden_dim]
            * (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ),
        # NOTE(review): the MLP above already ends in a LeakyReLU, so the
        # activation is applied twice; kept to preserve existing behaviour.
        nn.LeakyReLU(negative_slope=0.2),
        SplitLayer(layers=[
            nn.Linear(cactor_hidden_dim, action_dim),
            nn.Linear(cactor_hidden_dim, action_dim),
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for agent in range(num_agent):
        # Each agent gets its own graph builder and GNN (no action appended).
        graph_builder_policy = ParticleGraphBuilder(
            num_agents=expl_env.scenario.num_agents,
            num_landmarks=expl_env.scenario.num_landmarks,
            batch_size=variant['algorithm_kwargs']['batch_size'],
            append_action=False,
            contain_self_loop=False,
        )
        gnn_policy = GNNNet(
            graph_builder_policy,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        )
        policy = nn.Sequential(
            gnn_policy,
            # Pick this agent's entry along dim 1 of the GNN output.
            SelectLayer(dim=1, index=agent),
            FlattenLayer(),
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']]
                * (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
            ]))
        policy = TanhGaussianPolicy(module=policy)
        eval_policy = MakeDeterministic(policy)
        if variant['random_exploration']:
            from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy

        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n,
                                             shared_obs=True)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n,
                                             shared_obs=True)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env,
                                      num_agent=num_agent, shared_obs=True)

    from rlkit.torch.r2g.r2g_gnn10 import R2GGNNTrainer
    trainer = R2GGNNTrainer(
        env=expl_env,
        cg1=cg1, target_cg1=target_cg1,
        qf1=qf1, target_qf1=target_qf1,
        cg2=cg2, target_cg2=target_cg2,
        qf2=qf2, target_qf2=target_qf2,
        cactor=cactor,
        policy_n=policy_n,
        shared_obs=True,
        **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)

    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
def awac_rig_experiment(
        max_path_length,
        qf_kwargs,
        trainer_kwargs,
        replay_buffer_kwargs,
        policy_kwargs,
        algo_kwargs,
        train_vae_kwargs,
        policy_class=TanhGaussianPolicy,
        env_id=None,
        env_class=None,
        env_kwargs=None,
        reward_kwargs=None,
        observation_key='latent_observation',
        desired_goal_key='latent_desired_goal',
        state_observation_key='state_observation',
        state_goal_key='state_desired_goal',
        image_goal_key='image_desired_goal',
        path_loader_class=MDPPathLoader,
        demo_replay_buffer_kwargs=None,
        path_loader_kwargs=None,
        env_demo_path='',
        env_offpolicy_data_path='',
        debug=False,
        epsilon=1.0,
        exploration_policy_kwargs=None,
        evaluation_goal_sampling_mode=None,
        exploration_goal_sampling_mode=None,
        add_env_demos=False,
        add_env_offpolicy_data=False,
        save_paths=False,
        load_demos=False,
        pretrain_policy=False,
        pretrain_rl=False,
        save_pretrained_algorithm=False,
        # Video parameters
        save_video=True,
        save_video_kwargs=None,
        renderer_kwargs=None,
        imsize=84,
        pretrained_vae_path="",
        presampled_goals_path="",
        init_camera=None,
        qf_class=ConcatMlp,
):
    """Set up and run an AWAC experiment on an image-wrapped contextual env.

    Steps, in order: load or train a VAE, build exploration and evaluation
    ``ContextualEnv`` instances, create the main and demo replay buffers,
    build two Q-functions with two targets and a policy, create path
    collectors, build ``AWACTrainer`` and ``TorchBatchRLAlgorithm``,
    optionally register video-saving hooks, optionally load demos and
    pretrain, optionally save the pretrained algorithm, then train.

    Args:
        max_path_length: Passed to the algorithm (forced to 5 when ``debug``).
        qf_kwargs: Keyword arguments for ``qf_class``. Not mutated; the
            size-related entries are added to a per-call copy.
        trainer_kwargs: Keyword arguments for ``AWACTrainer``. Mutated in
            place when ``debug`` is set.
        replay_buffer_kwargs: Keyword arguments for the main replay buffer.
        policy_kwargs: Extra keyword arguments for ``policy_class``.
        algo_kwargs: Keyword arguments for ``TorchBatchRLAlgorithm``. Mutated
            in place when ``debug`` is set.
        train_vae_kwargs: Passed to ``train_vae`` when no pretrained VAE path
            is given.
        policy_class: Policy constructor, called with ``obs_dim``,
            ``action_dim`` and ``policy_kwargs``.
        env_id, env_class, env_kwargs: Forwarded to ``get_gym_env`` (and to
            ``train_vae``).
        reward_kwargs: Currently unused; accepted for interface compatibility.
        observation_key: Observation key used by the env, buffers and
            collectors.
        desired_goal_key: Used as the context key.
        state_observation_key, state_goal_key, image_goal_key: Currently
            unused; accepted for interface compatibility.
        path_loader_class: Constructor of the demo loader used when
            ``load_demos`` is set.
        demo_replay_buffer_kwargs: Overrides applied on top of
            ``replay_buffer_kwargs`` for the two demo buffers.
        path_loader_kwargs: Keyword arguments for ``path_loader_class``.
            ``model_path`` and ``env`` entries are written into it.
        env_demo_path, env_offpolicy_data_path: Appended to
            ``path_loader_kwargs['demo_paths']`` when the matching
            ``add_env_*`` flag is set.
        debug: Shrinks run lengths and pretraining step counts.
        epsilon: Currently unused; accepted for interface compatibility.
        exploration_policy_kwargs: Forwarded to ``create_exploration_policy``.
        evaluation_goal_sampling_mode, exploration_goal_sampling_mode,
        presampled_goals_path: Forwarded to the env factory, which currently
            ignores them.
        add_env_demos, add_env_offpolicy_data: See ``env_demo_path``.
        save_paths: When truthy, the value itself is appended to
            ``algorithm.post_train_funcs``.
        load_demos, pretrain_policy, pretrain_rl: Enable the respective
            demo-loading / pretraining steps.
        save_pretrained_algorithm: Save a snapshot (including the algorithm
            object) to ``pretrain_algorithm.pt`` and ``pretrain_algorithm.p``
            in the logger's snapshot directory.
        save_video, save_video_kwargs: Enable and configure
            ``RIGVideoSaveFunction`` hooks.
        renderer_kwargs, init_camera: Forwarded to ``EnvRenderer``.
        imsize: Image size passed to ``train_vae`` and the video hooks.
        pretrained_vae_path: If non-empty, the VAE is loaded from here
            instead of being trained.
        qf_class: Q-function constructor; ``ConcatMlp`` and ``ConcatCNN``
            receive their input-size argument automatically.
    """
    # Kwarg definitions
    if exploration_policy_kwargs is None:
        exploration_policy_kwargs = {}
    if demo_replay_buffer_kwargs is None:
        demo_replay_buffer_kwargs = {}
    if path_loader_kwargs is None:
        path_loader_kwargs = {}
    if not save_video_kwargs:
        save_video_kwargs = {}
    if not renderer_kwargs:
        renderer_kwargs = {}

    if debug:
        max_path_length = 5
        algo_kwargs['batch_size'] = 5
        algo_kwargs['num_epochs'] = 5
        algo_kwargs['num_eval_steps_per_epoch'] = 100
        algo_kwargs['num_expl_steps_per_train_loop'] = 100
        algo_kwargs['num_trains_per_train_loop'] = 10
        algo_kwargs['min_num_steps_before_training'] = 100
        # Cap pretraining at 10 steps; entries that are absent become 0.
        trainer_kwargs['bc_num_pretrain_steps'] = min(
            10, trainer_kwargs.get('bc_num_pretrain_steps', 0))
        trainer_kwargs['q_num_pretrain1_steps'] = min(
            10, trainer_kwargs.get('q_num_pretrain1_steps', 0))
        trainer_kwargs['q_num_pretrain2_steps'] = min(
            10, trainer_kwargs.get('q_num_pretrain2_steps', 0))

    # Environment wrapping
    # This outer renderer is only used for its output_image_format below.
    renderer = EnvRenderer(init_camera=init_camera, **renderer_kwargs)

    def contextual_env_distrib_and_reward(env_id, env_class, env_kwargs,
                                          goal_sampling_mode,
                                          presampled_goals_path):
        # Build one image-wrapped contextual env with a zero-size context.
        # goal_sampling_mode and presampled_goals_path are accepted but not
        # used: only the "no goal" setup is implemented here.
        state_env = get_gym_env(env_id, env_class=env_class,
                                env_kwargs=env_kwargs)
        renderer = EnvRenderer(init_camera=init_camera, **renderer_kwargs)
        img_env = InsertImageEnv(state_env, renderer=renderer)

        diagnostics = StateImageGoalDiagnosticsFn({}, )
        no_goal_distribution = PriorDistribution(
            representation_size=0,
            key="no_goal",
        )
        reward_fn = GraspingRewardFn()

        env = ContextualEnv(
            img_env,
            context_distribution=no_goal_distribution,
            reward_fn=reward_fn,
            observation_key=observation_key,
            contextual_diagnostics_fns=[diagnostics],
        )
        return env, no_goal_distribution, reward_fn

    # VAE setup
    if pretrained_vae_path:
        model = load_local_or_remote_file(pretrained_vae_path)
    else:
        model = train_vae(train_vae_kwargs, env_kwargs, env_id, env_class,
                          imsize, init_camera)
    path_loader_kwargs['model_path'] = pretrained_vae_path

    # Environment definitions
    expl_env, expl_context_distrib, expl_reward = contextual_env_distrib_and_reward(
        env_id, env_class, env_kwargs, exploration_goal_sampling_mode,
        presampled_goals_path)
    eval_env, eval_context_distrib, eval_reward = contextual_env_distrib_and_reward(
        env_id, env_class, env_kwargs, evaluation_goal_sampling_mode,
        presampled_goals_path)
    path_loader_kwargs['env'] = eval_env

    # AWAC code
    # setdefault avoids a KeyError when the caller supplied no 'demo_paths'.
    if add_env_demos:
        path_loader_kwargs.setdefault("demo_paths", []).append(env_demo_path)
    if add_env_offpolicy_data:
        path_loader_kwargs.setdefault("demo_paths", []).append(
            env_offpolicy_data_path)

    # Key setting
    context_key = desired_goal_key
    obs_dim = (expl_env.observation_space.spaces[observation_key].low.size +
               expl_env.observation_space.spaces[context_key].low.size)
    action_dim = expl_env.action_space.low.size

    mapper = RemapKeyFn({context_key: observation_key})
    obs_keys = [observation_key]
    cont_keys = [context_key]

    # Replay buffers
    def concat_context_to_obs(batch, replay_buffer, obs_dict, next_obs_dict,
                              new_contexts):
        # Append the context to both observation arrays along axis 1.
        obs = batch['observations']
        next_obs = batch['next_observations']
        context = batch[context_key]
        batch['observations'] = np.concatenate([obs, context], axis=1)
        batch['next_observations'] = np.concatenate([next_obs, context], axis=1)
        return batch

    def make_buffer(buffer_kwargs):
        # All three buffers share every setting except buffer_kwargs.
        return ContextualRelabelingReplayBuffer(
            env=eval_env,
            context_keys=cont_keys,
            observation_keys=obs_keys,
            observation_key=observation_key,
            context_distribution=expl_context_distrib,
            sample_context_from_obs_dict_fn=mapper,
            reward_fn=eval_reward,
            post_process_batch_fn=concat_context_to_obs,
            **buffer_kwargs)

    replay_buffer = make_buffer(replay_buffer_kwargs)
    # Merge into a new dict so the caller's replay_buffer_kwargs is untouched.
    demo_buffer_kwargs = {**replay_buffer_kwargs, **demo_replay_buffer_kwargs}
    demo_train_buffer = make_buffer(demo_buffer_kwargs)
    demo_test_buffer = make_buffer(demo_buffer_kwargs)

    # Neural network architecture
    def create_qf():
        # Work on a copy so the caller's qf_kwargs is not modified.
        kwargs = dict(qf_kwargs)
        if qf_class is ConcatMlp:
            kwargs["input_size"] = obs_dim + action_dim
        if qf_class is ConcatCNN:
            kwargs["added_fc_input_size"] = action_dim
        return qf_class(output_size=1, **kwargs)

    qf1 = create_qf()
    qf2 = create_qf()
    target_qf1 = create_qf()
    target_qf2 = create_qf()

    policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **policy_kwargs,
    )

    # Path collectors
    eval_path_collector = ContextualPathCollector(
        eval_env,
        MakeDeterministic(policy),
        observation_key=observation_key,
        context_keys_for_policy=[context_key, ],
    )
    exploration_policy = create_exploration_policy(
        expl_env, policy, **exploration_policy_kwargs)
    expl_path_collector = ContextualPathCollector(
        expl_env,
        exploration_policy,
        observation_key=observation_key,
        context_keys_for_policy=[context_key, ],
    )

    # Algorithm
    trainer = AWACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **trainer_kwargs)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=max_path_length,
        **algo_kwargs)
    algorithm.to(ptu.device)

    # Video saving
    if save_video:
        expl_video_func = RIGVideoSaveFunction(
            model,
            expl_path_collector,
            "train",
            rows=2,
            columns=5,
            unnormalize=True,
            imsize=imsize,
            image_format=renderer.output_image_format,
            **save_video_kwargs)
        algorithm.post_train_funcs.append(expl_video_func)

        eval_video_func = RIGVideoSaveFunction(
            model,
            eval_path_collector,
            "eval",
            num_imgs=4,
            rows=2,
            columns=5,
            unnormalize=True,
            imsize=imsize,
            image_format=renderer.output_image_format,
            **save_video_kwargs)
        algorithm.post_train_funcs.append(eval_video_func)

    # AWAC code
    if save_paths:
        # NOTE(review): inside this function ``save_paths`` is the parameter,
        # so the flag value itself is registered as a post-train function.
        # Presumably a module-level ``save_paths`` callback was intended;
        # confirm before changing, as callers may pass a callable here.
        algorithm.post_train_funcs.append(save_paths)

    if load_demos:
        # reward_fn is deliberately not passed to the loader.
        path_loader = path_loader_class(
            trainer,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            **path_loader_kwargs)
        path_loader.load_demos()
    if pretrain_policy:
        trainer.pretrain_policy_with_bc(
            policy,
            demo_train_buffer,
            demo_test_buffer,
            trainer.bc_num_pretrain_steps,
        )
    if pretrain_rl:
        trainer.pretrain_q_with_bc_data()

    if save_pretrained_algorithm:
        p_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.p')
        pt_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.pt')
        data = algorithm._get_snapshot()
        data['algorithm'] = algorithm
        # Context managers guarantee both files are flushed and closed.
        with open(pt_path, "wb") as pt_file:
            torch.save(data, pt_file)
        with open(p_path, "wb") as p_file:
            torch.save(data, p_file)

    algorithm.train()