def __init__(self, ctrl_cost_coeff=1e-2, *args, **kwargs):
    MultiDirectionBaseEnv.__init__(
        self, ctrl_cost_coeff=ctrl_cost_coeff, *args, **kwargs)
    SwimmerEnv.__init__(
        self, ctrl_cost_coeff=ctrl_cost_coeff, *args, **kwargs)
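# Context note (assumption, not shown in this excerpt): the constructor above
# belongs to a multi-direction swimmer class that inherits from both parents,
# roughly:
#
#     class MultiDirectionSwimmerEnv(MultiDirectionBaseEnv, SwimmerEnv):
#         ...
#
# Both parent __init__ methods are invoked explicitly because the rllab-style
# bases do not share a cooperative super().__init__() chain, and each needs
# the same ctrl_cost_coeff.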
def create_env(which_agent):
    # setup environment
    if (which_agent == 0):
        env = PointEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)
    elif (which_agent == 1):
        env = AntEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)
    elif (which_agent == 2):
        env = SwimmerEnv()  # dt 0.001 and frameskip=150
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)
    elif (which_agent == 3):
        env = ReacherEnv()
        dt_from_xml = env.model.opt.timestep
    elif (which_agent == 4):
        env = HalfCheetahEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)
    # elif (which_agent == 5):
    #     env = RoachEnv()  # this is a personal vrep env
    #     dt_from_xml = env.VREP_DT
    elif (which_agent == 6):
        env = HopperEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)
    elif (which_agent == 7):
        env = Walker2DEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)

    # get dt value from env - DOES NOT WORK after normalize(), so the timestep
    # is read above, before the env is wrapped:
    # if (which_agent == 5):
    #     dt_from_xml = env.VREP_DT
    # else:
    #     dt_from_xml = env.model.opt.timestep
    print("\n\n the dt is: ", dt_from_xml, "\n\n")

    # set vars
    tf.set_random_seed(2)
    gym.logger.setLevel(gym.logging.WARNING)
    dimO = env.observation_space.shape
    dimA = env.action_space.shape
    print('--------------------------------- \nState space dimension: ', dimO)
    print('Action space dimension: ', dimA, "\n -----------------------------------")

    return env, dt_from_xml
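# Example (assumed usage): agent id 2 selects the rllab SwimmerEnv. The raw
# MuJoCo timestep is read before normalize() wraps the env, since the wrapper
# no longer exposes the underlying model handle.
# env, dt_from_xml = create_env(which_agent=2)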
def run_task(*_):
    env = normalize(SwimmerEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=200,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        plot=True,
    )
    algo.train()
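# A minimal launch sketch (assumed; the launcher is not shown in this snippet).
# rllab tasks defined as run_task(*_) are usually handed to run_experiment_lite,
# which takes care of seeding, snapshotting, and optional live plotting:
#
# from rllab.misc.instrument import run_experiment_lite
#
# run_experiment_lite(
#     run_task,
#     n_parallel=1,
#     snapshot_mode="last",
#     seed=1,
#     plot=True,
# )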
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
    )
    df = DFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M])  # discriminator, input is the actions.
    vf = VFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=16,
        kernel_update_ratio=0.5,
        value_n_particles=16,
        td_target_update_interval=1000,
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False,
        df=df,
        vf=vf,
        df_lr=1e-3,
        dist=variant['dist'],
    )

    algorithm.train()
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    elif variant['env_name'] == 'ant-rllab':
        env = normalize(AntEnv())
    elif variant['env_name'] == 'BlocksSimpleXYQ-v0':
        target = [-1.0, 0.0]
        env = bsmp.BlocksSimpleXYQ(multi_goal=variant['blocks_multigoal'],
                                   time_limit=variant['max_path_length'],
                                   env_config=variant['blocks_simple_xml'],
                                   goal=target)
        env = env_wrap.obsTupleWrap(env, add_action_to_obs=False)
        env = gym_env.GymEnv(env,
                             video_schedule=glob.video_scheduler.video_schedule,
                             log_dir=".")
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(env=env,
                              max_replay_buffer_size=variant['max_pool_size'])

    sampler = SimpleSampler(max_path_length=variant['max_path_length'],
                            min_pool_size=variant['max_path_length'],
                            batch_size=variant['batch_size'])

    base_kwargs = dict(epoch_length=variant['epoch_length'],
                       n_epochs=variant['n_epochs'],
                       n_train_repeat=variant['n_train_repeat'],
                       eval_render=False,
                       eval_n_episodes=1,
                       sampler=sampler)

    M = variant['layer_size']
    qf = NNQFunction(env=env, hidden_layer_sizes=(M, M))

    policy = StochasticNNPolicy(env=env, hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        value_n_particles=variant['value_n_particles'],
        td_target_update_interval=variant['td_target_update_interval'],
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False)

    algorithm.train()
def run_task(*_): """ DPG on Swimmer environment """ env = normalize(SwimmerEnv()) """ Initialise the policy as a neural network policy """ policy = DeterministicMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(32, 32)) """ Defining exploration strategy : OUStrategy - """ """ This strategy implements the Ornstein-Uhlenbeck process, which adds time-correlated noise to the actions taken by the deterministic policy. The OU process satisfies the following stochastic differential equation: dxt = theta*(mu - xt)*dt + sigma*dWt where Wt denotes the Wiener process """ es = OUStrategy(env_spec=env.spec) """ Defining the Q network """ qf = ContinuousMLPQFunction(env_spec=env.spec) w = qf.get_param_values(regularizable=True) """ Persistence Length Exploration """ lp = Persistence_Length_Exploration(env=env, qf=qf, policy=policy) """ Using the DDPG algorithm """ algo = DDPG( env=env, policy=policy, es=es, qf=qf, lp=lp, batch_size=32, max_path_length=1000, epoch_length=1000, min_pool_size=10000, n_epochs=15000, discount=0.99, scale_reward=0.01, qf_learning_rate=1e-3, policy_learning_rate=1e-4, #Uncomment both lines (this and the plot parameter below) to enable plotting plot=True, ) """ Training the networks based on the DDPG algorithm """ algo.train()
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    elif variant['env_name'] == 'ant-rllab':
        env = normalize(AntEnv())
    elif variant['env_name'] == 'sawyer-rllab':
        env = normalize(SawyerTestEnv())
    elif variant['env_name'] == 'arm3Ddisc-rllab':
        env = normalize(Arm3dDiscEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'])

    sampler = SimpleSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'])

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    M = variant['layer_size']
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        value_n_particles=variant['value_n_particles'],
        td_target_update_interval=variant['td_target_update_interval'],
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False)

    algorithm.train()
def run_experiment(variant):
    env = normalize(SwimmerEnv())

    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=1e6)

    sampler = SimpleSampler(max_path_length=1000,
                            min_pool_size=1000,
                            batch_size=128)

    base_kwargs = dict(epoch_length=1000,
                       n_epochs=500,
                       n_train_repeat=1,
                       eval_render=False,
                       eval_n_episodes=1,
                       sampler=sampler)

    with tf.Session().as_default():
        data = joblib.load(variant['file'])
        if 'algo' in data.keys():
            saved_qf = data['algo'].qf
            saved_policy = data['algo'].policy
        else:
            saved_qf = data['qf']
            saved_policy = data['policy']

        algorithm = SQL(base_kwargs=base_kwargs,
                        env=env,
                        pool=pool,
                        qf=saved_qf,
                        policy=saved_policy,
                        kernel_fn=adaptive_isotropic_gaussian_kernel,
                        kernel_n_particles=16,
                        kernel_update_ratio=0.5,
                        value_n_particles=16,
                        td_target_update_interval=1000,
                        qf_lr=3E-4,
                        policy_lr=3E-4,
                        discount=0.99,
                        reward_scale=30,
                        use_saved_qf=True,
                        use_saved_policy=True,
                        save_full_state=False)

        algorithm.train()
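# Note (assumption): variant['file'] is expected to point at a snapshot pickle
# written by an earlier training run (for example an itr_N.pkl produced by the
# logger); the branch above accepts both snapshots that store the whole
# algorithm object under 'algo' and ones that store 'qf' and 'policy' directly.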
def __init__(self, args):
    self.args = args
    self.device = torch.device('cuda') \
        if args.cuda and torch.cuda.is_available() else torch.device('cpu')

    if self.args.env_name == 'ant':
        from rllab.envs.mujoco.ant_env import AntEnv
        env = AntEnv()
        # set the target velocity direction (for learning sub-policies)
        env.velocity_dir = self.args.velocity_dir
        env.penalty = self.args.penalty
        # use gym environment observation
        env.use_gym_obs = self.args.use_gym_obs
        # use gym environment reward
        env.use_gym_reward = self.args.use_gym_reward
    elif self.args.env_name == 'swimmer':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = SwimmerEnv()
        env.velocity_dir = self.args.velocity_dir
    else:
        raise NotImplementedError

    self.env = normalize(env)
    self.reset_env()
    self.obs_shape = self.env.observation_space.shape

    self.actor_critic = self.select_network().to(self.device)
    self.optimizer = self.select_optimizer()

    # list of RolloutStorage objects
    self.episodes_rollout = []
    # concatenation of all episodes' rollout
    self.rollouts = RolloutStorage(self.device)

    # this directory is used for tensorboardX only
    self.writer = SummaryWriter(args.log_dir + self.args.velocity_dir)

    self.episodes = 0
    self.episode_steps = []
    self.train_rewards = []
def init(env_name, size, opts):
    global envs
    envs = []
    for i in range(int(size)):
        if env_name == 'SPSwimmer':
            from envs.sp_swimmer_env import SPSwimmerEnv
            envs.append(SPSwimmerEnv())
        elif env_name == 'SPSwimmerGather':
            from envs.sp_swimmer_gather_env import SPSwimmerGatherEnv
            envs.append(SPSwimmerGatherEnv(opts))
        elif env_name == 'SPMountainCar':
            from envs.sp_mountain_car import SPMountainCarEnv
            envs.append(SPMountainCarEnv(opts))
        elif env_name == 'Swimmer':
            from rllab.envs.mujoco.swimmer_env import SwimmerEnv
            envs.append(SwimmerEnv())
        else:
            raise RuntimeError("wrong env name")
        if opts['rllab_normalize_rllab']:
            envs[-1] = NormalizedEnv(env=envs[-1], normalize_obs=True)
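# Example (assumed usage): build four rllab Swimmer instances, wrapping each
# one in rllab's NormalizedEnv with observation normalization enabled.
# init('Swimmer', size=4, opts={'rllab_normalize_rllab': True})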
def _setup_world(self, filename):
    """
    Helper method for handling setup of the MuJoCo world.
    Args:
        filename: Path to XML file containing the world information.
    """
    self._world = []
    self._model = []

    # Initialize MuJoCo worlds. If there's only one xml file, create a single
    # world object, otherwise create a different world for each condition.
    for i in range(self._hyperparams['conditions']):
        self._world.append(SwimmerEnv())

    # Initialize x0.
    self.x0 = []
    self._full_init_state = []
    # pdb.set_trace()
    for i in range(self._hyperparams['conditions']):
        self.x0.append(self._world[i].reset())
        self._full_init_state.append(self._world[i].get_full_state())
def create_env(which_agent):
    # setup environment
    if (which_agent == 0):
        env = normalize(PointEnv())
    elif (which_agent == 1):
        env = normalize(AntEnv())
    elif (which_agent == 2):
        env = normalize(SwimmerEnv())  # dt 0.001 and frameskip=150
    elif (which_agent == 3):
        env = gym.make("modified_gym_env:ReacherPyBulletEnv-v1")
    elif (which_agent == 4):
        env = normalize(HalfCheetahEnv())
    elif (which_agent == 5):
        env = RoachEnv()  # this is a personal vrep env
    elif (which_agent == 6):
        env = normalize(HopperEnv())
    elif (which_agent == 7):
        env = normalize(Walker2DEnv())

    # get dt value from env
    if (which_agent == 5):
        dt_from_xml = env.VREP_DT
    elif (which_agent == 3):
        dt_from_xml = 0.02
    else:
        dt_from_xml = env.model.opt.timestep
    print("\n\n the dt is: ", dt_from_xml, "\n\n")

    # set vars
    tf.set_random_seed(2)
    gym.logger.setLevel(logging.WARN)
    dimO = env.observation_space.shape
    dimA = env.action_space.shape
    print('--------------------------------- \nState space dimension: ', dimO)
    print('Action space dimension: ', dimA, "\n -----------------------------------")

    return env, dt_from_xml
from rllab.algos.ddpg import DDPG
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import run_experiment_lite
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
from rllab.envs.mujoco.swimmer_env import SwimmerEnv

env = normalize(SwimmerEnv())


def run_task(*_):
    """DPG on the Swimmer environment."""
    env = normalize(SwimmerEnv())

    """Initialise the policy as a neural network policy."""
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    """
    Exploration strategy: OUStrategy.

    This strategy implements the Ornstein-Uhlenbeck process, which adds
    time-correlated noise to the actions taken by the deterministic policy.
    The OU process satisfies the following stochastic differential equation:

        dx_t = theta * (mu - x_t) * dt + sigma * dW_t

    where W_t denotes the Wiener process.
    """
def get(perm):
    name = perm["problem"]
    if name.lower() == "cartpole":
        from rllab.envs.box2d.cartpole_env import CartpoleEnv
        return normalize(CartpoleEnv())
    elif name.lower() == "mountain car height bonus":
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        return normalize(MountainCarEnv())
    elif name.lower() == "mountain car":
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        return normalize(MountainCarEnv(height_bonus=0))
    elif name.lower() == "gym mountain car":
        from rllab.envs.gym_env import GymEnv
        return normalize(GymEnv("MountainCarContinuous-v0", record_video=False))
    elif name.lower() == "pendulum":
        from rllab.envs.gym_env import GymEnv
        return normalize(GymEnv("Pendulum-v0", record_video=False))
    elif name.lower() == "mujoco double pendulum":
        from rllab.envs.mujoco.inverted_double_pendulum_env import InvertedDoublePendulumEnv
        return normalize(InvertedDoublePendulumEnv())
    elif name.lower() == "double pendulum":
        from rllab.envs.box2d.double_pendulum_env import DoublePendulumEnv
        return normalize(DoublePendulumEnv())
    elif name.lower() == "hopper":
        from rllab.envs.mujoco.hopper_env import HopperEnv
        return normalize(HopperEnv())
    elif name.lower() == "swimmer":
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        return normalize(SwimmerEnv())
    elif name.lower() == "2d walker":
        from rllab.envs.mujoco.walker2d_env import Walker2DEnv
        return normalize(Walker2DEnv())
    elif name.lower() == "half cheetah":
        from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
        return normalize(HalfCheetahEnv())
    elif name.lower() == "ant":
        from rllab.envs.mujoco.ant_env import AntEnv
        return normalize(AntEnv())
    elif name.lower() == "simple humanoid":
        from rllab.envs.mujoco.simple_humanoid_env import SimpleHumanoidEnv
        return normalize(SimpleHumanoidEnv())
    elif name.lower() == "full humanoid":
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        return normalize(HumanoidEnv())
    else:
        raise NotImplementedError(f"Environment {name} unknown")
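# Example (assumed usage): the factory is keyed on perm["problem"], so a
# normalized rllab swimmer is obtained with:
# env = get({"problem": "swimmer"})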
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    obs_space = env.spec.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)

    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )
    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )
    policy = GMMPolicy(
        env_spec=aug_env_spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )
    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN_BD(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        discriminator=discriminator,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_entropy=variant['scale_entropy'],
        discount=variant['discount'],
        tau=variant['tau'],
        num_skills=variant['num_skills'],
        save_full_state=False,
        include_actions=variant['include_actions'],
        learn_p_z=variant['learn_p_z'],
        add_p_z=variant['add_p_z'],
        # Additional params for behaviour tracking
        metric=variant['metric'],
        env_id=variant['prefix'],
        eval_freq=variant['eval_freq'],
        log_dir=get_logdir(args, variant),
    )

    algorithm.train()
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    env = DelayedEnv(env, delay=0.01)

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    sampler = RemoteSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size']
    )

    base_kwargs = dict(
        sampler=sampler,
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )
    vf = NNVFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )
    policy = GMMPolicy(
        env_spec=env.spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reparameterize=variant['reparameterize'],
        reg=0.001,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],
        reparameterize=variant['reparameterize'],
        save_full_state=False,
    )

    algorithm.train()
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    elif variant["env_name"] == "Point2D-v0":
        import sac.envs.point2d_env
        env = GymEnv(variant["env_name"])
    else:
        env = normalize(GymEnv(variant['env_name']))

    obs_space = env.spec.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)

    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(min_pool_size=variant['max_path_length'],
                       epoch_length=variant['epoch_length'],
                       n_epochs=variant['n_epochs'],
                       max_path_length=variant['max_path_length'],
                       batch_size=variant['batch_size'],
                       n_train_repeat=variant['n_train_repeat'],
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=SimpleSampler(
                           max_path_length=variant["max_path_length"],
                           min_pool_size=variant["max_path_length"],
                           batch_size=variant["batch_size"]))

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )
    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )
    policy = GaussianPolicy(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
        reg=0.001,
    )
    # policy = GMMPolicy(
    #     env_spec=aug_env_spec,
    #     K=variant['K'],
    #     hidden_layer_sizes=[M, M],
    #     qf=qf,
    #     reg=0.001,
    # )
    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN(base_kwargs=base_kwargs,
                      env=env,
                      policy=policy,
                      discriminator=discriminator,
                      pool=pool,
                      qf=qf,
                      vf=vf,
                      lr=variant['lr'],
                      scale_entropy=variant['scale_entropy'],
                      discount=variant['discount'],
                      tau=variant['tau'],
                      num_skills=variant['num_skills'],
                      save_full_state=False,
                      include_actions=variant['include_actions'],
                      learn_p_z=variant['learn_p_z'],
                      add_p_z=variant['add_p_z'],
                      reparametrize=variant["reparametrize"])

    algorithm.train()
def __init__(self):
    super(OccludedSwimmerEnv, self).__init__(SwimmerEnv(), [2, 3, 4])  # joint angles
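# A minimal sketch (assumed; the base class is not shown in this excerpt) of
# the kind of wrapper OccludedSwimmerEnv appears to build on: it proxies the
# wrapped rllab env and restricts the observation to the listed indices
# (here 2, 3, 4, i.e. the swimmer's joint angles).
#
# class OccludedEnv(object):
#     def __init__(self, env, visible_indices):
#         self._env = env
#         self._idx = list(visible_indices)
#
#     def reset(self):
#         return self._env.reset()[self._idx]
#
#     def step(self, action):
#         obs, reward, done, info = self._env.step(action)
#         return obs[self._idx], reward, done, info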
from rllab.envs.mujoco.swimmer_env import SwimmerEnv

swimmer = SwimmerEnv()
swimmer.reset()
swimmer.get_current_obs()
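# Continuing the quick check (assumed usage): rllab envs expose
# action_space.sample() and step(), so a single random transition looks like:
# action = swimmer.action_space.sample()
# next_obs, reward, done, info = swimmer.step(action)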
# from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv

stub(globals())

oracle = False
random = True

if oracle:
    env = TfEnv(normalize(SwimmerRandGoalOracleEnv()))
    batch_size = 200
elif random:
    env = TfEnv(normalize(SwimmerRandGoalEnv()))
    batch_size = 200
else:
    env = TfEnv(normalize(SwimmerEnv()))
    batch_size = 20

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    hidden_sizes=(100, 100),
)

baseline = LinearFeatureBaseline(env_spec=env.spec)
# baseline = ZeroBaseline(env_spec=env.spec)

algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=500 * batch_size,
    max_path_length=500,
    n_itr=500,