def _create_networks(env, config): """ Creates all networks necessary for SAC. These networks have to be created before instantiating the algorithm class and are passed to its constructor. TODO: Maybe this should be reworked one day... Args: env: The environment, used to infer observation and action dimensions. config: A configuration dictionary. Returns: A dictionary which contains the networks. """ obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) net_size = config['rl_algorithm_config']['net_size'] hidden_sizes = [net_size] * config['rl_algorithm_config']['network_depth'] # hidden_sizes = [net_size, net_size, net_size] qf1 = FlattenMlp( hidden_sizes=hidden_sizes, input_size=obs_dim + action_dim, output_size=1, ).to(device=ptu.device) qf2 = FlattenMlp( hidden_sizes=hidden_sizes, input_size=obs_dim + action_dim, output_size=1, ).to(device=ptu.device) qf1_target = FlattenMlp( hidden_sizes=hidden_sizes, input_size=obs_dim + action_dim, output_size=1, ).to(device=ptu.device) qf2_target = FlattenMlp( hidden_sizes=hidden_sizes, input_size=obs_dim + action_dim, output_size=1, ).to(device=ptu.device) policy = TanhGaussianPolicy( hidden_sizes=hidden_sizes, obs_dim=obs_dim, action_dim=action_dim, ).to(device=ptu.device) clip_value = 1.0 for p in qf1.parameters(): p.register_hook(lambda grad: torch.clamp(grad, -clip_value, clip_value)) for p in qf2.parameters(): p.register_hook(lambda grad: torch.clamp(grad, -clip_value, clip_value)) for p in policy.parameters(): p.register_hook(lambda grad: torch.clamp(grad, -clip_value, clip_value)) return {'qf1' : qf1, 'qf2' : qf2, 'qf1_target' : qf1_target, 'qf2_target' : qf2_target, 'policy' : policy}
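A minimal usage sketch for _create_networks, assuming a gym-style env and a config dict whose only relevant keys are the ones read above ('rl_algorithm_config' with 'net_size' and 'network_depth'); the environment id and hyperparameter values are illustrative assumptions, not taken from the source.

import gym

env = gym.make('HalfCheetah-v2')  # hypothetical environment choice
config = {
    'rl_algorithm_config': {
        'net_size': 256,      # width of each hidden layer
        'network_depth': 2,   # number of hidden layers
    },
}
nets = _create_networks(env, config)
qf1, qf2 = nets['qf1'], nets['qf2']                              # Q-networks with gradient clipping hooks
qf1_target, qf2_target = nets['qf1_target'], nets['qf2_target']  # target Q-networks
policy = nets['policy']                                          # TanhGaussianPolicy on ptu.device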
def experiment(variant): env = SawyerXYZEnv(**variant['env_kwargs']) if variant['normalize']: env = NormalizedBoxEnv(env) obs_dim = env.observation_space.low.size action_dim = env.action_space.low.size goal_dim = env.goal_dim qf = ConcatMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) vf = ConcatMlp(input_size=obs_dim + goal_dim, output_size=1, **variant['vf_kwargs']) policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim, action_dim=action_dim, **variant['policy_kwargs']) replay_buffer = SimpleHerReplayBuffer(env=env, **variant['replay_buffer_kwargs']) algorithm = HerSac(env=env, policy=policy, qf=qf, vf=vf, replay_buffer=replay_buffer, **variant['algo_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): env = SawyerXYZEnv(**variant['env_kwargs']) env = MultitaskToFlatEnv(env) if variant['normalize']: env = NormalizedBoxEnv(env) obs_dim = env.observation_space.low.size action_dim = env.action_space.low.size qf = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs'] ) vf = ConcatMlp( input_size=obs_dim, output_size=1, **variant['vf_kwargs'] ) policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, **variant['policy_kwargs'] ) algorithm = SoftActorCritic( env=env, policy=policy, qf=qf, vf=vf, **variant['algo_kwargs'] ) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): #env = NormalizedBoxEnv(HalfCheetahEnv()) # Or for a specific version: # import gym env = NormalizedBoxEnv(gym.make('Pointmass-v1')) obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) net_size = variant['net_size'] qf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size], obs_dim=obs_dim, action_dim=action_dim, ) algorithm = SoftActorCritic(env=env, policy=policy, qf=qf, vf=vf, **variant['algo_params']) if ptu.gpu_enabled(): algorithm.cuda() algorithm.train()
def experiment(variant): wrapped_env = gym.make(variant['env_name']) obs_dim = wrapped_env.observation_space.spaces['observation'].low.size net_size = variant['net_size'] disc = Discriminator(input_size=obs_dim, output_size=variant['disc_kwargs']['num_skills'], hidden_sizes=[net_size, net_size], **variant['disc_kwargs']) env = DiscriminatorWrappedEnv(wrapped_env=wrapped_env, disc=disc, **variant['env_kwargs']) context_dim = env.context_dim action_dim = wrapped_env.action_space.low.size qf1 = FlattenMlp( input_size=obs_dim + context_dim + action_dim, output_size=1, hidden_sizes=[net_size, net_size], ) qf2 = FlattenMlp( input_size=obs_dim + context_dim + action_dim, output_size=1, hidden_sizes=[net_size, net_size], ) vf = FlattenMlp( input_size=obs_dim + context_dim, hidden_sizes=[net_size, net_size], output_size=1, ) policy = TanhGaussianPolicy( obs_dim=obs_dim + context_dim, action_dim=action_dim, hidden_sizes=[net_size, net_size], ) replay_buffer = ObsDictPathReplayBuffer( env=env, max_path_length=variant['algo_kwargs']['max_path_length'], observation_key='observation', context_key='context', **variant['replay_buffer_kwargs']) algorithm = UrlTwinSac(replay_buffer=replay_buffer, url_kwargs=dict(observation_key='observation', context_key='context', fitting_period=1, env_loss_key='discriminator loss'), tsac_kwargs=dict( env=env, policy=policy, qf1=qf1, qf2=qf2, vf=vf, ), **variant['algo_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): env = NormalizedBoxEnv(PointEnv(**variant['task_params'])) ptu.set_gpu_mode(variant['use_gpu'], variant['gpu_id']) tasks = env.get_all_task_idx() obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) latent_dim = 5 task_enc_output_dim = latent_dim * 2 if variant['algo_params']['use_information_bottleneck'] else latent_dim reward_dim = 1 net_size = variant['net_size'] # start with linear task encoding recurrent = variant['algo_params']['recurrent'] encoder_model = RecurrentEncoder if recurrent else MlpEncoder task_enc = encoder_model( hidden_sizes=[200, 200, 200], # deeper net + higher dim space generalize better input_size=obs_dim + action_dim + reward_dim, output_size=task_enc_output_dim, ) qf1 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) qf2 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + latent_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size, net_size], obs_dim=obs_dim + latent_dim, latent_dim=latent_dim, action_dim=action_dim, ) agent = ProtoAgent( latent_dim, [task_enc, policy, qf1, qf2, vf], **variant['algo_params'] ) algorithm = ProtoSoftActorCritic( env=env, train_tasks=list(tasks[:-20]), eval_tasks=list(tasks[-20:]), nets=[agent, task_enc, policy, qf1, qf2, vf], latent_dim=latent_dim, **variant['algo_params'] ) if ptu.gpu_enabled(): algorithm.to() algorithm.train()
def experiment(variant): # we have to generate the combinations for the env_specs env_specs = variant['env_specs'] env_specs_vg = VariantGenerator() env_spec_constants = {} for k, v in env_specs.items(): if isinstance(v, list): env_specs_vg.add(k, v) else: env_spec_constants[k] = v env_specs_list = [] for es in env_specs_vg.variants(): del es['_hidden_keys'] es.update(env_spec_constants) env_specs_list.append(es) print(env_specs_list) print(env_specs_list[0]) env_sampler = EnvSampler(env_specs_list) # set up similar to non-meta version sample_env, _ = env_sampler() if variant['algo_params']['concat_env_params_to_obs']: meta_params_dim = sample_env.env_meta_params.shape[0] else: meta_params_dim = 0 obs_dim = int(np.prod(sample_env.observation_space.shape)) action_dim = int(np.prod(sample_env.action_space.shape)) net_size = variant['net_size'] qf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim + meta_params_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + meta_params_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size], obs_dim=obs_dim + meta_params_dim, action_dim=action_dim, ) algorithm = MetaSoftActorCritic(env_sampler=env_sampler, policy=policy, qf=qf, vf=vf, **variant['algo_params']) if ptu.gpu_enabled(): algorithm.cuda() algorithm.train() return 1
def experiment(variant): num_agent = variant['num_agent'] from cartpole import CartPoleEnv expl_env = CartPoleEnv(mode=3) eval_env = CartPoleEnv(mode=3) obs_dim = eval_env.observation_space.low.size action_dim = eval_env.action_space.low.size policy_n, eval_policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n = \ [], [], [], [], [], [] for i in range(num_agent): policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim, **variant['policy_kwargs']) eval_policy = MakeDeterministic(policy) qf1 = FlattenMlp(input_size=(obs_dim * num_agent + action_dim * num_agent), output_size=1, **variant['qf_kwargs']) target_qf1 = copy.deepcopy(qf1) qf2 = FlattenMlp(input_size=(obs_dim * num_agent + action_dim * num_agent), output_size=1, **variant['qf_kwargs']) target_qf2 = copy.deepcopy(qf2) policy_n.append(policy) eval_policy_n.append(eval_policy) qf1_n.append(qf1) target_qf1_n.append(target_qf1) qf2_n.append(qf2) target_qf2_n.append(target_qf2) eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n) expl_path_collector = MAMdpPathCollector(expl_env, policy_n) replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent) trainer = MASACTrainer(env=expl_env, qf1_n=qf1_n, target_qf1_n=target_qf1_n, qf2_n=qf2_n, target_qf2_n=target_qf2_n, policy_n=policy_n, **variant['trainer_kwargs']) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, log_path_function=get_generic_ma_path_information, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): env = Point2DEnv(**variant['env_kwargs']) env = FlatGoalEnv(env) env = NormalizedBoxEnv(env) action_dim = int(np.prod(env.action_space.shape)) obs_dim = int(np.prod(env.observation_space.shape)) qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) target_qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) target_qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim, **variant['policy_kwargs']) eval_env = expl_env = env eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, ) expl_path_collector = MdpPathCollector( expl_env, policy, ) replay_buffer = EnvReplayBuffer( variant['replay_buffer_size'], expl_env, ) trainer = TwinSACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs']) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs']) algorithm.to(ptu.device) algorithm.train()
def get_sac(evaluation_environment, parameters): """ :param evaluation_environment: environment used to infer observation and action dimensions :param parameters: dict with keys - hidden_sizes_qf, hidden_sizes_policy, trainer_params :return: sac_policy, eval_policy, trainer """ obs_dim = evaluation_environment.observation_space.low.size action_dim = evaluation_environment.action_space.low.size hidden_sizes_qf = parameters['hidden_sizes_qf'] hidden_sizes_policy = parameters['hidden_sizes_policy'] qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes_qf, ) qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes_qf, ) target_qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes_qf, ) target_qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes_qf, ) sac_policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=hidden_sizes_policy, ) eval_policy = MakeDeterministic(sac_policy) trainer = SACTrainer(env=evaluation_environment, policy=sac_policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **parameters['trainer_params']) return sac_policy, eval_policy, trainer
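A hypothetical call to get_sac; the key names mirror the lookups inside the function, while the environment id, layer sizes, and trainer settings are illustrative assumptions.

import gym

eval_env = gym.make('Hopper-v2')  # hypothetical evaluation environment
parameters = {
    'hidden_sizes_qf': [256, 256],
    'hidden_sizes_policy': [256, 256],
    'trainer_params': {          # forwarded to SACTrainer as keyword arguments
        'discount': 0.99,
        'policy_lr': 3e-4,
        'qf_lr': 3e-4,
    },
}
sac_policy, eval_policy, trainer = get_sac(eval_env, parameters)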
def her_twin_sac_experiment(variant): env = variant['env_class'](**variant['env_kwargs']) observation_key = variant.get('observation_key', 'observation') desired_goal_key = variant.get('desired_goal_key', 'desired_goal') replay_buffer = ObsDictRelabelingBuffer(env=env, observation_key=observation_key, desired_goal_key=desired_goal_key, **variant['replay_buffer_kwargs']) obs_dim = env.observation_space.spaces['observation'].low.size action_dim = env.action_space.low.size goal_dim = env.observation_space.spaces['desired_goal'].low.size if variant['normalize']: env = NormalizedBoxEnv(env) qf1 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) qf2 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) vf = ConcatMlp(input_size=obs_dim + goal_dim, output_size=1, **variant['vf_kwargs']) policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim, action_dim=action_dim, **variant['policy_kwargs']) algorithm = HerTwinSac(env, qf1=qf1, qf2=qf2, vf=vf, policy=policy, replay_buffer=replay_buffer, observation_key=observation_key, desired_goal_key=desired_goal_key, **variant['algo_kwargs']) if ptu.gpu_enabled(): qf1.to(ptu.device) qf2.to(ptu.device) vf.to(ptu.device) policy.to(ptu.device) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): # env = NormalizedBoxEnv(MultiGoalEnv( # actuation_cost_coeff=10, # distance_cost_coeff=1, # goal_reward=10, # )) env = NormalizedBoxEnv(HalfCheetahEnv()) obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) # qf = ExpectableQF( # obs_dim=obs_dim, # action_dim=action_dim, # hidden_size=100, # ) net_size = variant['net_size'] qf = ConcatMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim, output_size=1, ) vf = ConcatMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size], obs_dim=obs_dim, action_dim=action_dim, ) # TODO(vitchyr): just creating the plotter crashes EC2 # plotter = QFPolicyPlotter( # qf=qf, # policy=policy, # obs_lst=np.array([[-2.5, 0.0], # [0.0, 0.0], # [2.5, 2.5]]), # default_action=[np.nan, np.nan], # n_samples=100 # ) algorithm = ExpectedSAC( env=env, policy=policy, qf=qf, vf=vf, # plotter=plotter, # render_eval_paths=True, **variant['algo_params']) algorithm.to(ptu.device) algorithm.train()
def run_sac(base_expl_env, base_eval_env, variant): expl_env = FlatGoalEnv(base_expl_env, append_goal_to_obs=True) eval_env = FlatGoalEnv(base_eval_env, append_goal_to_obs=True) obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.low.size M = variant["layer_size"] num_hidden = variant["num_hidden_layers"] qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M] * num_hidden) qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M] * num_hidden) target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M] * num_hidden) target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M] * num_hidden) policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=[M] * num_hidden) eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, ) expl_path_collector = MdpPathCollector( expl_env, policy, ) replay_buffer = EnvReplayBuffer( variant["replay_buffer_size"], expl_env, ) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant["trainer_kwargs"]) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant["algorithm_kwargs"]) algorithm.train()
def experiment(variant): env = NormalizedBoxEnv(CartpoleSwingupSparseEnv()) #env = NormalizedBoxEnv(HalfCheetahEnv()) #env = NormalizedBoxEnv(Continuous_MountainCarEnv()) #env = DIAYNWrappedEnv(NormalizedBoxEnv(HumanoidEnv())) # Or for a specific version: # import gym # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1')) skill_dim = 0 #50 obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) net_size = variant['net_size'] qf1 = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + skill_dim + action_dim, output_size=1, ) qf2 = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + skill_dim + action_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + skill_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size], obs_dim=obs_dim + skill_dim, action_dim=action_dim, #k=4, ) disc = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim, output_size=skill_dim if skill_dim > 0 else 1, ) algorithm = SoftActorCritic( env=env, policy=policy, qf1=qf1, qf2=qf2, vf=vf, #disc=disc, #skill_dim=skill_dim, **variant['algo_params']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): # env = NormalizedBoxEnv(HalfCheetahEnv()) # env = NormalizedBoxEnv(InvertedPendulumEnv()) # --------- # env = NormalizedBoxEnv(get_meta_env(variant['env_specs'])) # training_env = NormalizedBoxEnv(get_meta_env(variant['env_specs'])) env = ReacherEnv() training_env = ReacherEnv() # Or for a specific version: # import gym # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1')) obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) total_meta_variable_dim = 0 for dims in exp_specs['true_meta_variable_dims']: total_meta_variable_dim += sum(dims) net_size = variant['net_size'] qf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim + total_meta_variable_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + total_meta_variable_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size], obs_dim=obs_dim + total_meta_variable_dim, action_dim=action_dim, ) algorithm = SoftActorCritic( env=env, training_env=training_env, policy=policy, qf=qf, vf=vf, **variant['algo_params'] ) if ptu.gpu_enabled(): algorithm.cuda() algorithm.train() return 1
def vanilla_nets(env, n_lay_nodes, n_depth, clip_val=1): hidden = [n_lay_nodes] * n_depth obs_size = env.observation_space.shape[0] act_size = env.action_space.shape[0] q1_net = FlattenMlp( hidden_sizes=hidden, input_size=obs_size + act_size, output_size=1, ).to(device=torch_util.device) q2_net = FlattenMlp( hidden_sizes=hidden, input_size=obs_size + act_size, output_size=1, ).to(device=torch_util.device) policy_net = TanhGaussianPolicy( hidden_sizes=hidden, obs_dim=obs_size, action_dim=act_size).to(device=torch_util.device) target_q1_net = FlattenMlp( hidden_sizes=hidden, input_size=obs_size + act_size, output_size=1, ).to(device=torch_util.device) target_q2_net = FlattenMlp( hidden_sizes=hidden, input_size=obs_size + act_size, output_size=1, ).to(device=torch_util.device) nets = [q1_net, q2_net, policy_net] for n in nets: for p in n.parameters(): p.register_hook( lambda grad: torch.clamp(grad, -clip_val, clip_val)) return dict(policy_net=policy_net, q1_net=q1_net, q2_net=q2_net, target_q1_net=target_q1_net, target_q2_net=target_q2_net)
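A short sketch of how vanilla_nets might be called and unpacked; the environment and hyperparameter values are assumptions, only the signature and the returned dict keys come from the code above.

import gym

env = gym.make('Hopper-v2')  # hypothetical environment
nets = vanilla_nets(env, n_lay_nodes=256, n_depth=2, clip_val=1.0)
policy_net = nets['policy_net']
q1_net, q2_net = nets['q1_net'], nets['q2_net']
target_q1_net, target_q2_net = nets['target_q1_net'], nets['target_q2_net']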
def experiment(variant): env = SawyerHumanControlEnv(action_mode='joint_space_impd', position_action_scale=1, max_speed=0.015) # max_speed has no effect here; speed limiting is now handled inside request_angle_action in sawyer_env_base. training_env = env obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) net_size = variant['net_size'] qf1 = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim, output_size=1, ) qf2 = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size], obs_dim=obs_dim, action_dim=action_dim, ) es = GaussianStrategy( action_space=env.action_space, **variant['es_kwargs'], ) exploration_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) algorithm = TD3BC(env=env, policy=policy, qf1=qf1, qf2=qf2, exploration_policy=exploration_policy, **variant['algo_params']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): expl_env = NormalizedBoxEnv(HalfCheetahEnv()) eval_env = NormalizedBoxEnv(HalfCheetahEnv()) obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.low.size M = variant["layer_size"] qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M]) qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M]) target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M]) target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M]) policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=[M, M]) eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector(eval_env, eval_policy) expl_path_collector = MdpPathCollector(expl_env, policy) replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant["trainer_kwargs"]) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant["algorithm_kwargs"]) algorithm.to(ptu.device) algorithm.train()
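An illustrative variant for the experiment above, assuming the usual rlkit-style hyperparameters; the key names match the lookups in the function, but the values are assumptions rather than settings confirmed by the source.

variant = dict(
    layer_size=256,
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(            # passed through to SACTrainer
        discount=0.99,
        soft_target_tau=5e-3,
        target_update_period=1,
        policy_lr=3e-4,
        qf_lr=3e-4,
        reward_scale=1,
        use_automatic_entropy_tuning=True,
    ),
    algorithm_kwargs=dict(          # passed through to TorchBatchRLAlgorithm
        num_epochs=3000,
        num_eval_steps_per_epoch=5000,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=1000,
        batch_size=256,
    ),
)
experiment(variant)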
def experiment(variant): env = NormalizedBoxEnv( MultiGoalEnv( actuation_cost_coeff=10, distance_cost_coeff=1, goal_reward=10, )) obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) qf = ConcatMlp( hidden_sizes=[100, 100], input_size=obs_dim + action_dim, output_size=1, ) vf = ConcatMlp( hidden_sizes=[100, 100], input_size=obs_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[100, 100], obs_dim=obs_dim, action_dim=action_dim, ) plotter = QFPolicyPlotter(qf=qf, policy=policy, obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]), default_action=[np.nan, np.nan], n_samples=100) algorithm = SoftActorCritic( env=env, policy=policy, qf=qf, vf=vf, # plotter=plotter, # render_eval_paths=True, **variant['algo_params']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): env = NormalizedBoxEnv(gym.make('HalfCheetah-v2')) num_skills = variant['num_skills'] '''observation dim includes dim of latent variable''' obs_dim = int(np.prod(env.observation_space.shape)) + num_skills action_dim = int(np.prod(env.action_space.shape)) net_size = variant['net_size'] qf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim, output_size=1, ) # TODO: VERIFY THIS # num_skills=variant['num_skills'] discrim = FlattenMlp(hidden_sizes=[net_size, net_size], input_size=obs_dim - num_skills, output_size=num_skills, output_activation=nn.Sigmoid()) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size], obs_dim=obs_dim, action_dim=action_dim, ) algorithm = DIAYN(env=env, policy=policy, qf=qf, vf=vf, discrim=discrim, **variant['algo_params']) if ptu.gpu_enabled(): algorithm.cuda() algorithm.train()
def get_sac_trainer(env, hidden_sizes=[256, 256], reward_scale=1): obs_dim = env.observation_space.low.size action_dim = env.action_space.low.size qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes) qf2 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes, ) target_qf1 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes, ) target_qf2 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes, ) policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=hidden_sizes, ) trainer = SACTrainer(env=env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, discount=0.99, soft_target_tau=5e-3, target_update_period=1, policy_lr=3E-4, qf_lr=3E-4, reward_scale=reward_scale, use_automatic_entropy_tuning=True) return trainer
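A hypothetical way of using get_sac_trainer; the environment id is an assumption, and any training loop built around the returned trainer lies outside the snippet above.

import gym

env = gym.make('Pendulum-v0')  # hypothetical environment
trainer = get_sac_trainer(env, hidden_sizes=[256, 256], reward_scale=1)
policy = trainer.policy  # the trainer keeps a reference to the policy it optimizes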
def experiment(variant): env = variant['env_class']() env = NormalizedBoxEnv(env) obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) qf = ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) vf = ConcatMlp(input_size=obs_dim, output_size=1, **variant['vf_kwargs']) policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim, **variant['policy_kwargs']) algorithm = SoftActorCritic(env=env, policy=policy, qf=qf, vf=vf, **variant['algo_kwargs']) algorithm.to(ptu.device) algorithm.train()
def __init__(self, variant, goal, candidate_size=10): ptu.set_gpu_mode(True) torch.set_num_threads(1) import sys sys.argv = [''] del sys self.env = env_producer(variant['env_name'], seed=0, goal=goal) obs_dim = int(np.prod(self.env.observation_space.shape)) action_dim = int(np.prod(self.env.action_space.shape)) reward_dim = 1 # instantiate networks latent_dim = variant['latent_size'] context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant[ 'algo_params'][ 'use_next_obs_in_context'] else obs_dim + action_dim + reward_dim context_encoder_output_dim = latent_dim * 2 if variant['algo_params'][ 'use_information_bottleneck'] else latent_dim net_size = variant['net_size'] recurrent = variant['algo_params']['recurrent'] encoder_model = RecurrentEncoder if recurrent else MlpEncoder context_encoder = encoder_model( hidden_sizes=[200, 200, 200], input_size=context_encoder_input_dim, output_size=context_encoder_output_dim, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size, net_size], obs_dim=obs_dim + latent_dim, latent_dim=latent_dim, action_dim=action_dim, ) self.agent = PEARLAgent(latent_dim, context_encoder, policy, **variant['algo_params']) self.num_evals = variant['num_evals'] self.max_path_length = variant['max_path_length']
def experiment(variant): farmlist_base = [('123.123.123.123', 4)] farmer = Farmer(farmlist_base) environment = acq_remote_env(farmer) env = NormalizedBoxEnv(environment) obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) net_size = variant['net_size'] qf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size], obs_dim=obs_dim, action_dim=action_dim, ) algorithm = SoftActorCritic( env=env, training_env=env, policy=policy, qf=qf, vf=vf, environment_farming=True, farmlist_base=farmlist_base, **variant['algo_params'] ) if ptu.gpu_enabled(): algorithm.cuda() algorithm.train()
def experiment(variant): env = NormalizedBoxEnv(variant['env_class']()) obs_dim = env.observation_space.low.size action_dim = env.action_space.low.size qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) vf = ConcatMlp(input_size=obs_dim, output_size=1, **variant['vf_kwargs']) policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim, **variant['policy_kwargs']) algorithm = TwinSAC(env, policy=policy, qf1=qf1, qf2=qf2, vf=vf, **variant['algo_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): import gym env = NormalizedBoxEnv(gym.make('HalfCheetah-v2')) obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) net_size = variant['net_size'] qf1 = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim, output_size=1, ) qf2 = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size], obs_dim=obs_dim, action_dim=action_dim, ) algorithm = TwinSAC( env=env, policy=policy, qf1=qf1, qf2=qf2, vf=vf, **variant['algo_params'] ) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): logger.add_text_output('./d_text.txt') logger.add_tabular_output('./d_tabular.txt') logger.set_snapshot_dir('./snaps') farmer = Farmer([('0.0.0.0', 1)]) remote_env = farmer.force_acq_env() remote_env.set_spaces() env = NormalizedBoxEnv(remote_env) obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) net_size = variant['net_size'] qf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size], obs_dim=obs_dim, action_dim=action_dim, ) algorithm = SoftActorCritic(env=env, training_env=env, policy=policy, qf=qf, vf=vf, **variant['algo_params']) if ptu.gpu_enabled(): algorithm.cuda() algorithm.train()
def experiment(variant): # env = normalize(GymEnv( # 'HalfCheetah-v1', # force_reset=True, # record_video=False, # record_log=False, # )) env = NormalizedBoxEnv(gym.make('HalfCheetah-v1')) obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) net_size = variant['net_size'] qf = ConcatMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim, output_size=1, ) vf = ConcatMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size], obs_dim=obs_dim, action_dim=action_dim, ) algorithm = SoftActorCritic( env=env, policy=policy, qf=qf, vf=vf, **variant['algo_params'] ) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): # env = NormalizedBoxEnv(HalfCheetahEnv()) # Or for a specific version: # import gym # env = NormalizedBoxEnv(gym.make('HalfCheetah-v2')) # env = gym.make('HalfCheetah-v2') env = MujocoManipEnv("SawyerBinsCanEnv") # wrap as a gym env obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) net_size = variant['net_size'] qf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size], obs_dim=obs_dim, action_dim=action_dim, action_skip=ACTION_SKIP, ) algorithm = SoftActorCritic(env=env, policy=policy, qf=qf, vf=vf, **variant['algo_params']) if ptu.gpu_enabled(): algorithm.cuda() algorithm.train()
def experiment(variant): env = NormalizedBoxEnv(MultiGoalEnv( actuation_cost_coeff=10, distance_cost_coeff=1, goal_reward=10, )) obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) qf = ConcatMlp( hidden_sizes=[100, 100], input_size=obs_dim + action_dim, output_size=1, ) vf = ConcatMlp( hidden_sizes=[100, 100], input_size=obs_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[100, 100], obs_dim=obs_dim, action_dim=action_dim, ) algorithm = SoftActorCritic( env=env, policy=policy, qf=qf, vf=vf, **variant['algo_params'] ) algorithm.to(ptu.device) with torch.autograd.profiler.profile() as prof: algorithm.train() prof.export_chrome_trace("tmp-torch-chrome-trace.prof")