def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=4)
    eval_env = CartPoleEnv(mode=4)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n, eval_policy_n, expl_policy_n = \
        [], [], [], [], [], [], []
    for i in range(num_agent):
        policy = SoftmaxMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            **variant['policy_kwargs']
        )
        qf1 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * (num_agent - 1)),
            output_size=action_dim,
            **variant['qf_kwargs']
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * (num_agent - 1)),
            output_size=action_dim,
            **variant['qf_kwargs']
        )
        target_qf2 = copy.deepcopy(qf2)
        eval_policy = ArgmaxDiscretePolicy(policy)
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space),
            eval_policy,
        )
        policy_n.append(policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(
        variant['replay_buffer_size'], expl_env, num_agent=num_agent)
    trainer = MASACDiscreteTrainer(
        env=expl_env,
        qf1_n=qf1_n,
        target_qf1_n=target_qf1_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        policy_n=policy_n,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    env = NormalizedBoxEnv(gym.make('Pointmass-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def example(variant):
    env = variant['env_class']()
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['vf_params']
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        **variant['vf_params']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_params']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = N3DPG(
        env,
        qf=qf,
        vf=vf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def __init__(
        self,
        env,
        n_layers=3,
        hidden_layer_size=64,
        optimizer_class=optim.Adam,
        learning_rate=1e-3,
        reward_weight=1,
        **kwargs
):
    super().__init__(env=env, **kwargs)
    self.env = env
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    self.input_dim = obs_dim
    self.action_dim = action_dim
    self.next_obs_dim = obs_dim
    self.n_layers = n_layers
    self.hidden_layer_size = hidden_layer_size
    self.learning_rate = learning_rate
    self.reward_weight = reward_weight
    self.reset()

    self.reward_dim = 1
    # terminal_dim = 1
    self.net = FlattenMlp(
        hidden_sizes=[hidden_layer_size] * n_layers,
        input_size=self.input_dim + self.action_dim,
        output_size=self.next_obs_dim + self.reward_dim,
    )
    self.net_optimizer = optimizer_class(self.net.parameters(), lr=learning_rate)
def get_td3pg(evaluation_environment, parameters):
    """
    :param evaluation_environment: environment used to read observation and action shapes
    :param parameters: dict with keys hidden_sizes_qf, hidden_sizes_policy, trainer_params
    :return: exploration_policy, policy, trainer
    """
    obs_dim = evaluation_environment.observation_space.low.size
    action_dim = evaluation_environment.action_space.low.size
    hidden_sizes_qf = parameters['hidden_sizes_qf']
    hidden_sizes_policy = parameters['hidden_sizes_policy']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=hidden_sizes_policy,
    )
    target_policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=hidden_sizes_policy,
    )
    es = GaussianStrategy(
        action_space=evaluation_environment.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    trainer = TD3Trainer(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **parameters['trainer_params']
    )
    return exploration_policy, policy, trainer
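# Usage sketch (not from the original code): one plausible way to wire the
# (exploration_policy, policy, trainer) triple returned by get_td3pg into a
# training run, modeled on the TD3 experiment() further down in this file.
# `make_env` and the parameter keys 'replay_buffer_size' / 'algorithm_kwargs'
# are hypothetical names, not part of get_td3pg itself.
def run_td3pg_example(make_env, parameters):
    expl_env = make_env()
    eval_env = make_env()
    exploration_policy, policy, trainer = get_td3pg(eval_env, parameters)
    # Deterministic policy for evaluation, noisy wrapper for exploration
    eval_path_collector = MdpPathCollector(eval_env, policy)
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(parameters['replay_buffer_size'], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **parameters['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()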
def gen_network(variant, action_dim, layer_size, policy=False):
    return FoodNetworkMedium(
        img_network=CNN(**variant['img_conv_kwargs']),
        full_img_network=CNN(**variant['full_img_conv_kwargs']),
        inventory_network=FlattenMlp(**variant['inventory_network_kwargs']),
        final_network=FlattenMlp(
            input_size=variant['img_conv_kwargs']['output_size']
                       + variant['full_img_conv_kwargs']['output_size']
                       + variant['inventory_network_kwargs']['output_size'],
            output_size=action_dim,
            hidden_sizes=[layer_size, layer_size],
            output_activation=F.softmax if policy else identity,
        ),
        sizes=[
            variant['img_conv_kwargs']['input_width']
            * variant['img_conv_kwargs']['input_height']
            * variant['img_conv_kwargs']['input_channels'],
            variant['full_img_conv_kwargs']['input_width']
            * variant['full_img_conv_kwargs']['input_height']
            * variant['full_img_conv_kwargs']['input_channels'],
            # health dim
            1,
            # pantry dim
            400,
            # shelf dim
            40,
        ],
    )
def experiment(variant):
    env = NormalizedBoxEnv(PointEnv(**variant['task_params']))
    ptu.set_gpu_mode(variant['use_gpu'], variant['gpu_id'])

    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    latent_dim = 5
    task_enc_output_dim = latent_dim * 2 if variant['algo_params']['use_information_bottleneck'] else latent_dim
    reward_dim = 1

    net_size = variant['net_size']
    # start with linear task encoding
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder
    task_enc = encoder_model(
        hidden_sizes=[200, 200, 200],  # deeper net + higher dim space generalize better
        input_size=obs_dim + action_dim + reward_dim,
        output_size=task_enc_output_dim,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = ProtoAgent(
        latent_dim,
        [task_enc, policy, qf1, qf2, vf],
        **variant['algo_params']
    )
    algorithm = ProtoSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:-20]),
        eval_tasks=list(tasks[-20:]),
        nets=[agent, task_enc, policy, qf1, qf2, vf],
        latent_dim=latent_dim,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.to()
    algorithm.train()
def experiment(variant):
    wrapped_env = gym.make(variant['env_name'])
    obs_dim = wrapped_env.observation_space.spaces['observation'].low.size
    net_size = variant['net_size']
    disc = Discriminator(
        input_size=obs_dim,
        output_size=variant['disc_kwargs']['num_skills'],
        hidden_sizes=[net_size, net_size],
        **variant['disc_kwargs']
    )
    env = DiscriminatorWrappedEnv(
        wrapped_env=wrapped_env,
        disc=disc,
        **variant['env_kwargs']
    )
    context_dim = env.context_dim
    action_dim = wrapped_env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + context_dim + action_dim,
        output_size=1,
        hidden_sizes=[net_size, net_size],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + context_dim + action_dim,
        output_size=1,
        hidden_sizes=[net_size, net_size],
    )
    vf = FlattenMlp(
        input_size=obs_dim + context_dim,
        hidden_sizes=[net_size, net_size],
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + context_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    replay_buffer = ObsDictPathReplayBuffer(
        env=env,
        max_path_length=variant['algo_kwargs']['max_path_length'],
        observation_key='observation',
        context_key='context',
        **variant['replay_buffer_kwargs']
    )
    algorithm = UrlTwinSac(
        replay_buffer=replay_buffer,
        url_kwargs=dict(
            observation_key='observation',
            context_key='context',
            fitting_period=1,
            env_loss_key='discriminator loss',
        ),
        tsac_kwargs=dict(
            env=env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            vf=vf,
        ),
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    task_mode = variant['task_mode']  # train, test, eval
    task_idx = variant['task_idx']
    if task_mode == 'train':
        task_sampler = WalkerTrainParamsSampler()
    elif task_mode == 'test':
        task_sampler = WalkerTestParamsSampler()
    else:
        raise NotImplementedError()
    task_params = task_sampler.get_task(task_idx)
    obs_task_params = task_sampler.get_obs_task_params(task_params)
    env = SingleTaskWalkerEnv(task_params, obs_task_params)
    training_env = SingleTaskWalkerEnv(task_params, obs_task_params)

    print(env.observation_space)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    hidden_sizes = [net_size] * variant['num_hidden_layers']
    print('Using simple model')
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = NewSoftActorCritic(
        env=env,
        training_env=training_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
    return 1
def experiment(variant):
    # we have to generate the combinations for the env_specs
    env_specs = variant['env_specs']
    env_specs_vg = VariantGenerator()
    env_spec_constants = {}
    for k, v in env_specs.items():
        if isinstance(v, list):
            env_specs_vg.add(k, v)
        else:
            env_spec_constants[k] = v

    env_specs_list = []
    for es in env_specs_vg.variants():
        del es['_hidden_keys']
        es.update(env_spec_constants)
        env_specs_list.append(es)
    print(env_specs_list)
    print(env_specs_list[0])

    env_sampler = EnvSampler(env_specs_list)

    # set up similar to non-meta version
    sample_env, _ = env_sampler()
    if variant['algo_params']['concat_env_params_to_obs']:
        meta_params_dim = sample_env.env_meta_params.shape[0]
    else:
        meta_params_dim = 0
    obs_dim = int(np.prod(sample_env.observation_space.shape))
    action_dim = int(np.prod(sample_env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim + meta_params_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + meta_params_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + meta_params_dim,
        action_dim=action_dim,
    )
    algorithm = MetaSoftActorCritic(
        env_sampler=env_sampler,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
    return 1
def get_sac(evaluation_environment, parameters):
    """
    :param evaluation_environment: environment used to read observation and action shapes
    :param parameters: dict with keys hidden_sizes_qf, hidden_sizes_policy, trainer_params
    :return: sac_policy, eval_policy, trainer
    """
    obs_dim = evaluation_environment.observation_space.low.size
    action_dim = evaluation_environment.action_space.low.size
    hidden_sizes_qf = parameters['hidden_sizes_qf']
    hidden_sizes_policy = parameters['hidden_sizes_policy']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    sac_policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=hidden_sizes_policy,
    )
    eval_policy = MakeDeterministic(sac_policy)
    trainer = SACTrainer(
        env=evaluation_environment,
        policy=sac_policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **parameters['trainer_params']
    )
    return sac_policy, eval_policy, trainer
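# Note (not from the original code): the (sac_policy, eval_policy, trainer)
# triple returned by get_sac can be wired into MdpPathCollector, EnvReplayBuffer,
# and TorchBatchRLAlgorithm exactly as in the run_td3pg_example sketch above,
# using sac_policy for exploration and eval_policy for evaluation; run_sac()
# later in this file shows the same pattern assembled inline.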
def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=3)
    eval_env = CartPoleEnv(mode=3)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy_n, eval_policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n = \
        [], [], [], [], [], []
    for i in range(num_agent):
        policy = TanhGaussianPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            **variant['policy_kwargs']
        )
        eval_policy = MakeDeterministic(policy)
        qf1 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            **variant['qf_kwargs']
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            **variant['qf_kwargs']
        )
        target_qf2 = copy.deepcopy(qf2)
        policy_n.append(policy)
        eval_policy_n.append(eval_policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, policy_n)
    replay_buffer = MAEnvReplayBuffer(
        variant['replay_buffer_size'], expl_env, num_agent=num_agent)
    trainer = MASACTrainer(
        env=expl_env,
        qf1_n=qf1_n,
        target_qf1_n=target_qf1_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        policy_n=policy_n,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    expl_env = gym.make('GoalGridworld-v0')
    eval_env = gym.make('GoalGridworld-v0')

    obs_dim = expl_env.observation_space.spaces['observation'].low.size
    goal_dim = expl_env.observation_space.spaces['desired_goal'].low.size
    action_dim = expl_env.action_space.n
    qf = FlattenMlp(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    target_qf = FlattenMlp(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    eval_policy = ArgmaxDiscretePolicy(qf)
    exploration_strategy = EpsilonGreedy(
        action_space=expl_env.action_space,
    )
    expl_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=exploration_strategy,
        policy=eval_policy,
    )
    replay_buffer = ObsDictRelabelingBuffer(
        env=eval_env,
        **variant['replay_buffer_kwargs']
    )
    observation_key = 'observation'
    desired_goal_key = 'desired_goal'
    eval_path_collector = GoalConditionedPathCollector(
        eval_env,
        eval_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    expl_path_collector = GoalConditionedPathCollector(
        expl_env,
        expl_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        **variant['trainer_kwargs']
    )
    trainer = HERTrainer(trainer)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def run_sac(base_expl_env, base_eval_env, variant):
    expl_env = FlatGoalEnv(base_expl_env, append_goal_to_obs=True)
    eval_env = FlatGoalEnv(base_eval_env, append_goal_to_obs=True)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant["layer_size"]
    num_hidden = variant["num_hidden_layers"]
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M] * num_hidden,
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M] * num_hidden,
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M] * num_hidden,
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M] * num_hidden,
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M] * num_hidden,
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant["replay_buffer_size"],
        expl_env,
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant["trainer_kwargs"]
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"]
    )
    algorithm.train()
def experiment(variant):
    env = NormalizedBoxEnv(CartpoleSwingupSparseEnv())
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = NormalizedBoxEnv(Continuous_MountainCarEnv())
    # env = DIAYNWrappedEnv(NormalizedBoxEnv(HumanoidEnv()))
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    skill_dim = 0  # 50

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + skill_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + skill_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + skill_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + skill_dim,
        action_dim=action_dim,
        # k=4,
    )
    disc = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=skill_dim if skill_dim > 0 else 1,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        # disc=disc,
        # skill_dim=skill_dim,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant): expl_env = NormalizedBoxEnv(HalfCheetahEnv()) eval_env = NormalizedBoxEnv(HalfCheetahEnv()) obs_dim = expl_env.observation_space.low.size action_dim = expl_env.action_space.low.size qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant["qf_kwargs"]) qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant["qf_kwargs"]) target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant["qf_kwargs"]) target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant["qf_kwargs"]) policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim, **variant["policy_kwargs"]) target_policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim, **variant["policy_kwargs"]) es = GaussianStrategy( action_space=expl_env.action_space, max_sigma=0.1, min_sigma=0.1, # Constant sigma ) exploration_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy) eval_path_collector = MdpPathCollector(eval_env, policy) expl_path_collector = MdpPathCollector(expl_env, exploration_policy) replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env) trainer = TD3Trainer(policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, target_policy=target_policy, **variant["trainer_kwargs"]) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant["algorithm_kwargs"]) algorithm.to(ptu.device) algorithm.train()
def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = NormalizedBoxEnv(InvertedPendulumEnv())
    # ---------
    # env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    # training_env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    env = ReacherEnv()
    training_env = ReacherEnv()
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    total_meta_variable_dim = 0
    for dims in exp_specs['true_meta_variable_dims']:
        total_meta_variable_dim += sum(dims)

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim + total_meta_variable_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + total_meta_variable_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + total_meta_variable_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=training_env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
    return 1
def gen_network(variant, action_dim, layer_size, policy=False):
    return FoodNetworkMediumPartialObsTask(
        img_network=Mlp(**variant['full_img_network_kwargs']),
        inventory_network=FlattenMlp(**variant['inventory_network_kwargs']),
        final_network=FlattenMlp(
            input_size=variant['full_img_network_kwargs']['output_size']
                       + variant['inventory_network_kwargs']['output_size'],
            output_size=action_dim,
            hidden_sizes=[layer_size, layer_size],
            output_activation=F.softmax if policy else identity,
        ),
        sizes=[
            variant['full_img_network_kwargs']['input_size'],
            # shelf dim
            64,
        ],
    )
def experiment(variant):
    env = gym.make('replab-v0')._start_rospy(goal_oriented=False)
    # SIM
    # env = gym.make('replab-v0')._start_sim(goal_oriented=False, render=False)
    env = NormalizedBoxEnv(env)
    es = GaussianStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    farmlist_base = [('123.123.123.123', 4)]
    farmer = Farmer(farmlist_base)
    environment = acq_remote_env(farmer)
    env = NormalizedBoxEnv(environment)

    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    env = NormalizedBoxEnv(CartpoleSwingupSparseEnv())
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    heads = 5

    net_size = variant['net_size']
    qf1 = EnsembleFlattenMlp(
        heads,
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = EnsembleFlattenMlp(
        heads,
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    pqf1 = EnsembleFlattenMlp(
        heads,
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    pqf2 = EnsembleFlattenMlp(
        heads,
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[1],
        input_size=obs_dim,
        output_size=1,
    )
    policy = MultiTanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
        heads=heads,
    )
    algorithm = BigThompsonSoftActorCritic(
        env=env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        pqf1=pqf1,
        pqf2=pqf2,
        prior_coef=10,
        vf=vf,
        # disc=disc,
        # skill_dim=skill_dim,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    env = NormalizedBoxEnv(create_swingup())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def get_ddpg(evaluation_environment, parameters):
    obs_dim = evaluation_environment.observation_space.low.size
    action_dim = evaluation_environment.action_space.low.size
    hidden_sizes_qf = parameters['hidden_sizes_qf']
    hidden_sizes_policy = parameters['hidden_sizes_policy']
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=hidden_sizes_policy,
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(
            action_space=evaluation_environment.action_space),
        policy=policy,
    )
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **parameters['trainer_params']
    )
    return exploration_policy, policy, trainer
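# Note (not from the original code): get_ddpg mirrors get_td3pg/get_sac, so its
# (exploration_policy, policy, trainer) return values can presumably be wired
# into path collectors, a replay buffer, and TorchBatchRLAlgorithm the same way
# as in the run_td3pg_example sketch above.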
def experiment(variant):
    # we have to generate the combinations for the env_specs
    env_specs = variant['env_specs']
    env_sampler = MazeSampler(env_specs)
    sample_env, _ = env_sampler()
    meta_params_dim = 0

    obs_dim = int(np.prod(sample_env.observation_space.shape))
    if isinstance(sample_env.action_space, Discrete):
        action_dim = int(sample_env.action_space.n)
    else:
        action_dim = int(np.prod(sample_env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + meta_params_dim,
        output_size=action_dim,
    )
    policy = DiscreteQWrapperPolicy(qf)
    algorithm = MetaSoftQLearning(
        env_sampler=env_sampler,
        qf=qf,
        policy=policy,
        **variant['algo_params']
    )
    # assert False, "Have not added new sac yet!"
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
    return 1
def experiment(variant, env_name, record_name, record_every_episode):
    # env = CartPoleEnv()
    env = gym.make(env_name)

    # A workaround to give this info later on
    # (Such naughty business...)
    randomize_settings = {
        "turnframes": [10, 10],
        "engagement_distance": [100, 200]
    }
    env.record_name = record_name
    env.record_every_episode = record_every_episode
    env.randomize_settings = randomize_settings
    env = OneHotsToDecimalsAndRecordAndRandomize(env)

    obs_dim = int(np.prod(env.observation_space.shape))
    num_categoricals = len(env.action_space.nvec)
    num_categories = env.action_space.nvec[0]

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        # Action is fed in as a raveled one-hot vector
        input_size=obs_dim + int(np.sum(env.action_space.nvec)),
        output_size=1,
        hidden_activation=F.sigmoid,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
        hidden_activation=F.sigmoid,
    )
    # For multi-discrete action spaces
    policy = MultiCategoricalPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        num_categoricals=num_categoricals,
        num_categories=num_categories,
        hidden_activation=F.sigmoid,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = gym.make('SawyerReachXYZEnv-v0')
    es = GaussianAndEpislonStrategy(
        action_space=env.action_space,
        max_sigma=.2,
        min_sigma=.2,  # constant sigma
        epsilon=.3,
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        achieved_goal_key='state_achieved_goal',
        desired_goal_key='state_desired_goal',
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerTd3(
        env=env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    torch.autograd.set_detect_anomaly(True)
    # expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    # expl_env = NormalizedBoxEnv(PendulumEnv())
    # eval_env = NormalizedBoxEnv(PendulumEnv())
    expl_env = NormalizedBoxEnv(gym.make("BipedalWalker-v2"))
    eval_env = NormalizedBoxEnv(gym.make("BipedalWalker-v2"))
    # expl_env = NormalizedBoxEnv(gym.make("LunarLanderContinuous-v2"))
    # eval_env = NormalizedBoxEnv(gym.make("LunarLanderContinuous-v2"))
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_step_collector = PPOMdpPathCollector(
        eval_env,
        eval_policy,
        calculate_advantages=False,
    )
    expl_step_collector = PPOMdpPathCollector(
        expl_env,
        policy,
        calculate_advantages=True,
        vf=vf,
        gae_lambda=0.97,
        discount=0.995,
    )
    replay_buffer = PPOEnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = PPOTrainer(
        env=eval_env,
        policy=policy,
        vf=vf,
        **variant['trainer_kwargs']
    )
    algorithm = PPOTorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_step_collector,
        evaluation_data_collector=eval_step_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = gym.make('FetchReach-v1')
    es = GaussianAndEpsilonStrategy(
        action_space=env.action_space,
        max_sigma=.2,
        min_sigma=.2,  # constant sigma
        epsilon=.3,
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerTd3(
        her_kwargs={
            "observation_key": "observation",
            "desired_goal_key": "desired_goal",
        },
        td3_kwargs={
            "env": env,
            "qf1": qf1,
            "qf2": qf2,
            "policy": policy,
            "exploration_policy": exploration_policy,
            "replay_buffer": replay_buffer,
        },
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = gym.make('replab-v0')._start_rospy(goal_oriented=True)
    # SIM
    # env = gym.make('replab-v0')._start_sim(goal_oriented=True, render=False)
    env = NormalizedBoxEnv(env)
    es = GaussianAndEpislonStrategy(
        action_space=env.action_space,
        max_sigma=.2,
        min_sigma=.2,  # constant sigma
        epsilon=.3,
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerTd3(
        her_kwargs=dict(
            observation_key='observation',
            desired_goal_key='desired_goal',
        ),
        td3_kwargs=dict(
            env=env,
            qf1=qf1,
            qf2=qf2,
            policy=policy,
            exploration_policy=exploration_policy,
        ),
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()