def __init__(self):
    self.max_iters = 1000            # Number of TRPO iterations
    self.horizon = 100000            # Batch size: time steps per batch
    self.episode_long = 1000         # Maximum episode length
    self.l2_reg = 1E-4               # L2 regularization lambda for value loss function
    self.max_KL = 0.01               # Max KL divergence threshold for TRPO update

    # Environment
    self.env = Walker2dEnv()         # ant_v3.AntEnv(ctrl_cost_weight=1E-6, contact_cost_weight=1E-3, healthy_reward=0.05)
    self.env.seed(seed)
    self.agent = Ant(self.env, self.horizon, self.episode_long)      # Create agent

    self.pi_net = Policy_Net(self.agent.ob_dim, self.agent.ac_dim)   # Create policy network
    self.value_net = Value_Net(self.agent.ob_dim, 1)                 # Create value network
    self.value_net_lr = 1            # Value net learning rate
    self.LBFGS_iters = 20            # Number of value_net parameter updates per TRPO update
    self.cg_iters = 500              # Number of iterations for the conjugate gradient algorithm
    self.cg_threshold = 1e-2         # Early stopping threshold for conjugate gradient
    self.line_search_alpha = 0.5     # Line search decay rate for TRPO
    self.search_iters = 10           # Line search iterations
    self.gamma = 1                   # Discount factor
    self.Lambda = 0.95               # GAE lambda
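# Illustrative sketch only (not this repo's implementation): a minimal
# conjugate-gradient solver of the kind that cg_iters and cg_threshold above
# would configure. It solves A x = b given only a Fisher-vector-product
# callable; the names conjugate_gradient and fisher_vector_product are
# hypothetical placeholders.
import torch

def conjugate_gradient(fisher_vector_product, b, cg_iters=500, cg_threshold=1e-2):
    x = torch.zeros_like(b)               # initial guess x0 = 0
    r = b.clone()                         # residual r0 = b - A x0 = b
    p = b.clone()                         # initial search direction
    r_dot_r = torch.dot(r, r)
    for _ in range(cg_iters):
        Ap = fisher_vector_product(p)
        alpha = r_dot_r / torch.dot(p, Ap)
        x += alpha * p
        r -= alpha * Ap
        new_r_dot_r = torch.dot(r, r)
        if new_r_dot_r.sqrt() < cg_threshold:   # early stop on residual norm
            break
        p = r + (new_r_dot_r / r_dot_r) * p     # update conjugate direction
        r_dot_r = new_r_dot_r
    return x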
def __init__(self, **kwargs):
    Walker2dEnv.__init__(self)
    offline_env.OfflineEnv.__init__(self, **kwargs)
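# Hedged sketch: how a combined Walker2dEnv / OfflineEnv wrapper like the one
# above is typically exposed through gym so that **kwargs reach OfflineEnv.
# The class name OfflineWalker2dEnv, the env id, the module path, and the
# kwarg names/values are illustrative assumptions, not taken from this file.
from gym.envs.registration import register

register(
    id='offline-walker2d-v0',                            # hypothetical id
    entry_point='my_package.envs:OfflineWalker2dEnv',    # hypothetical module path
    max_episode_steps=1000,
    kwargs={
        'dataset_url': 'https://example.com/walker2d_dataset.hdf5',  # placeholder URL
        'ref_min_score': 0.0,       # illustrative score-normalization bounds
        'ref_max_score': 5000.0,
    },
)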
def __init__(self):
    self._base_pos = 1.
    Walker2dEnv.__init__(self)
def experiment(variant):
    # Or for a specific version (Daniel: doesn't work):
    #   import gym
    #   env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    if 'Ant' in args.env:
        expl_env = NormalizedBoxEnv(AntEnv())
        eval_env = NormalizedBoxEnv(AntEnv())
    elif 'InvertedPendulum' in args.env:
        expl_env = NormalizedBoxEnv(InvertedPendulumEnv())
        eval_env = NormalizedBoxEnv(InvertedPendulumEnv())
    elif 'HalfCheetah' in args.env:
        expl_env = NormalizedBoxEnv(HalfCheetahEnv())
        eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    elif 'Hopper' in args.env:
        expl_env = NormalizedBoxEnv(HopperEnv())
        eval_env = NormalizedBoxEnv(HopperEnv())
    elif 'Reacher' in args.env:
        expl_env = NormalizedBoxEnv(ReacherEnv())
        eval_env = NormalizedBoxEnv(ReacherEnv())
    elif 'Swimmer' in args.env:
        expl_env = NormalizedBoxEnv(SwimmerEnv())
        eval_env = NormalizedBoxEnv(SwimmerEnv())
    elif 'Walker2d' in args.env:
        expl_env = NormalizedBoxEnv(Walker2dEnv())
        eval_env = NormalizedBoxEnv(Walker2dEnv())
    else:
        raise ValueError(args.env)

    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)

    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)

    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
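# Hedged usage sketch for experiment(variant) above: a variant dict containing
# the keys the function reads (qf_kwargs, policy_kwargs, replay_buffer_size,
# trainer_kwargs, algorithm_kwargs). The hyperparameter values are illustrative
# assumptions, not this repository's settings, and the call assumes args.env
# has already been parsed at module level.
variant = dict(
    qf_kwargs=dict(hidden_sizes=[400, 300]),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(
        use_soft_update=True,
        tau=1e-2,
        discount=0.99,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
    ),
    algorithm_kwargs=dict(
        num_epochs=200,
        num_eval_steps_per_epoch=5000,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=1000,
        batch_size=128,
    ),
)
experiment(variant)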
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    th = 1.8
    g_max = 0.1
    # delta = 1e-7
    if args.env == 'CartPole':
        env = TfEnv(normalize(CartPoleEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 5000
        max_length = 100
        n_timestep = 5e5
        n_counts = 5
        name = 'CartPole'
        grad_factor = 5
        th = 1.2
        # batch size 1: lr = 0.1, w = 2, c = 50
        # batch size 50:
        lr = 0.75
        c = 3
        w = 2
        discount = 0.995
        path = './init/CartPole_policy.pth'
    if args.env == 'Walker':  # Walker2d
        env = TfEnv(normalize(Walker2dEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 500
        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 2
        c = 12
        grad_factor = 6
        discount = 0.999
        name = 'Walk'
        path = './init/Walk_policy.pth'
    if args.env == 'HalfCheetah':
        env = TfEnv(normalize(HalfCheetahEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 500
        n_timestep = 1e7
        n_counts = 5
        lr = 0.6
        w = 1
        c = 4
        grad_factor = 5
        th = 1.2
        g_max = 0.06
        discount = 0.999
        name = 'HalfCheetah'
        path = './init/HalfCheetah_policy.pth'
    if args.env == 'Hopper':
        env = TfEnv(normalize(HopperEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 1000
        th = 1.5
        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 1
        c = 3
        grad_factor = 6
        g_max = 0.15
        discount = 0.999
        name = 'Hopper'
        path = './init/Hopper_policy.pth'

    for i in range(n_counts):
        # print(env.spec)
        if args.env == 'CartPole':
            policy = CategoricalMLPPolicy(env.spec,
                                          hidden_sizes=[8, 8],
                                          hidden_nonlinearity=torch.tanh,
                                          output_nonlinearity=None)
        else:
            policy = GaussianMLPPolicy(env.spec,
                                       hidden_sizes=[64, 64],
                                       hidden_nonlinearity=torch.tanh,
                                       output_nonlinearity=None)
        policy.load_state_dict(torch.load(path))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = MBPG_HA(env_spec=env.spec,
                       env=env,
                       env_name=name,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=max_length,
                       discount=discount,
                       grad_factor=grad_factor,
                       policy_lr=lr,
                       c=c,
                       w=w,
                       th=th,
                       g_max=g_max,
                       n_timestep=n_timestep,
                       batch_size=batch_size,
                       center_adv=True,
                       # delta=delta,
                       # decay_learning_rate=d_lr,
                       )

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=batch_size)
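# Hedged launch sketch for run_task above, assuming a garage release that
# ships garage.experiment.run_experiment accepting a task callable (the
# LocalRunner-era API). The snapshot_mode and seed values are illustrative.
from garage.experiment import run_experiment

run_experiment(
    run_task,
    snapshot_mode='last',   # keep only the latest snapshot
    seed=1,                 # illustrative seed
)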
def experiment(variant, args):
    # Doesn't work :(
    #   import gym
    #   expl_env = NormalizedBoxEnv(gym.make(args.env))
    #   eval_env = NormalizedBoxEnv(gym.make(args.env))
    if 'Ant' in args.env:
        expl_env = NormalizedBoxEnv(AntEnv())
        eval_env = NormalizedBoxEnv(AntEnv())
    elif 'InvertedPendulum' in args.env:
        expl_env = NormalizedBoxEnv(InvertedPendulumEnv())
        eval_env = NormalizedBoxEnv(InvertedPendulumEnv())
    elif 'HalfCheetah' in args.env:
        expl_env = NormalizedBoxEnv(HalfCheetahEnv())
        eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    elif 'Hopper' in args.env:
        expl_env = NormalizedBoxEnv(HopperEnv())
        eval_env = NormalizedBoxEnv(HopperEnv())
    elif 'Reacher' in args.env:
        expl_env = NormalizedBoxEnv(ReacherEnv())
        eval_env = NormalizedBoxEnv(ReacherEnv())
    elif 'Swimmer' in args.env:
        expl_env = NormalizedBoxEnv(SwimmerEnv())
        eval_env = NormalizedBoxEnv(SwimmerEnv())
    elif 'Walker2d' in args.env:
        expl_env = NormalizedBoxEnv(Walker2dEnv())
        eval_env = NormalizedBoxEnv(Walker2dEnv())
    else:
        raise ValueError(args.env)

    # Back to normal.
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant['qf_kwargs'])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    target_policy = TanhMlpPolicy(input_size=obs_dim,
                                  output_size=action_dim,
                                  **variant['policy_kwargs'])

    es = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    eval_path_collector = MdpPathCollector(eval_env, policy)
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)

    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
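# Hedged usage sketch for the TD3 experiment(variant, args) above: an argparse
# namespace plus a variant dict with the keys the function reads. The specific
# hyperparameter values are illustrative assumptions, not this repo's defaults.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env', type=str, default='HalfCheetah')
args = parser.parse_args()

variant = dict(
    qf_kwargs=dict(hidden_sizes=[400, 300]),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(
        discount=0.99,
        tau=5e-3,
        policy_and_target_update_period=2,   # delayed policy updates
        target_policy_noise=0.2,
        target_policy_noise_clip=0.5,
    ),
    algorithm_kwargs=dict(
        num_epochs=200,
        num_eval_steps_per_epoch=5000,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=1000,
        batch_size=256,
    ),
)
experiment(variant, args)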