def example(variant):
    env = HalfCheetahEnv()
    if variant['normalize']:
        env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
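# A minimal, hypothetical driver for example() above; the original launcher is
# not shown. 'normalize' and 'algo_params' are the keys example() reads, but
# the contents of algo_params below are an illustrative placeholder, not the
# authors' settings (algo_params is forwarded as keyword arguments to DDPG).
if __name__ == "__main__":
    variant = dict(
        normalize=True,
        algo_params=dict(),
    )
    example(variant)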
def test_dead_grads(self):
    self.env = HalfCheetahEnv()
    algo = NAF(
        self.env,
        self.es,
        QuadraticNAF(name_or_scope='qf', env_spec=self.env.spec),
        n_epochs=0,
    )
    qf = algo.qf
    af = qf.advantage_function
    L_param_gen = af._L_computer
    L = af.L
    last_bs = L_param_gen.get_params_internal()[-1]
    grads_ops = tf.gradients(af.output, last_bs)
    a = np.random.rand(1, algo.action_dim)
    o = np.random.rand(1, algo.observation_dim)
    grads = self.sess.run(
        grads_ops,
        {
            qf.action_input: a,
            qf.observation_input: o,
        },
    )[0]
    bs = self.sess.run(last_bs)
    num_elems = bs.size
    length = int(math.sqrt(float(num_elems)))
    # L is lower triangular, so the length*(length-1)/2 parameters feeding the
    # strictly upper-triangular entries are dead and must get zero gradient.
    expected_zero = length * (length - 1) / 2
    num_zero = np.sum(grads == 0.)
    self.assertAlmostEqual(expected_zero, num_zero)
def run_task(*_):
    env = normalize(HalfCheetahEnv())
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # Two hidden layers; the sizes come from the hyperparameter grids
        # H_layer_first and H_layer_second, indexed by h.
        hidden_sizes=(H_layer_first[h], H_layer_second[h]))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=size_of_batch,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=number_of_episodes,
        discount=discount_factor,
        scale_reward=reward_scaling[r],
        qf_learning_rate=critic_learning_rate[c],
        policy_learning_rate=actor_learning_rate[c],
        # Uncomment to enable plotting:
        # plot=True,
    )
    algo.train()
def main():
    stub(globals())
    for seed in range(3):
        env = TfEnv(HalfCheetahEnv())
        es = GaussianStrategy(env.spec)
        policy = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        qf = ContinuousMLPQFunction(
            name="qf",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
        )
        ddpg_params = dict(
            batch_size=4,
            n_epochs=100,
            epoch_length=50,
            eval_samples=50,
            max_path_length=10,
            min_pool_size=5,
        )
        algorithm = DDPG(env, policy, qf, es, **ddpg_params)
        for _ in range(3):
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="check-rllab-ddpg-seed",
                seed=seed,
                variant={"seed": seed},
            )
def main():
    stub(globals())
    env = TfEnv(HalfCheetahEnv())
    for seed in range(3):
        ddpg_params = dict(
            batch_size=128,
            n_epochs=100,
            epoch_length=10000,
            eval_samples=10000,
            discount=0.99,
            policy_learning_rate=1e-4,
            qf_learning_rate=1e-3,
            soft_target_tau=0.01,
            replay_pool_size=1000000,
            min_pool_size=256,
            scale_reward=1.0,
            max_path_length=1000,
            qf_weight_decay=0.0,
        )
        vitchyr_es = OUStrategy(env_spec=env.spec)
        vitchyr_qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        vitchyr_policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        vitchyr_ddpg = DDPG(
            env,
            vitchyr_es,
            vitchyr_policy,
            vitchyr_qf,
            **ddpg_params
        )
        shane_es = GaussianStrategy(env.spec)
        shane_policy = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        shane_qf = ContinuousMLPQFunction(
            name="qf",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
        )
        shane_ddpg = ShaneDDPG(env, shane_policy, shane_qf, shane_es, **ddpg_params)
        names_and_algos = [
            ("Vitchyr_DDPG", vitchyr_ddpg),
            ("Shane_DDPG", shane_ddpg),
        ]
        for name, algorithm in names_and_algos:
            env.reset()
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="ddpg-comparison-cheetah",
                seed=seed,
            )
def example(variant):
    load_policy_file = variant.get('load_policy_file', None)
    if load_policy_file is not None and exists(load_policy_file):
        with tf.Session():
            data = joblib.load(load_policy_file)
            print(data)
            policy = data['policy']
            qf = data['qf']
            replay_buffer = data['pool']
            env = HalfCheetahEnv()
            es = OUStrategy(action_space=env.action_space)
            use_new_version = variant['use_new_version']
            algorithm = DDPG(
                env,
                es,
                policy,
                qf,
                n_epochs=2,
                batch_size=1024,
                replay_pool=replay_buffer,
                use_new_version=use_new_version,
            )
            algorithm.train()
    else:
        env = HalfCheetahEnv()
        es = OUStrategy(action_space=env.action_space)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        use_new_version = variant['use_new_version']
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            n_epochs=2,
            batch_size=1024,
            use_new_version=use_new_version,
        )
        algorithm.train()
def run_task(*_):
    env = normalize(HalfCheetahEnv())
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(32, 32))

    """
    Persistence Length Exploration
    """
    lp = Persistence_Length_Exploration(
        env=env,
        qf=qf,
        policy=policy,
        L_p=L_p_param[l_p_ind],
        b_step_size=b_step_size[b_ind],
        sigma=sigma_param[s_ind],
        max_exploratory_steps=max_exploratory_steps_iters,
        batch_size=batch_size_value,
        n_epochs=num_episodes,
        scale_reward=0.01,
        epoch_length=steps_per_episode,
        qf_learning_rate=0.001,
        policy_learning_rate=0.0001,
    )

    """
    DDPG
    """
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        lp=lp,
        batch_size=batch_size_value,
        max_path_length=100,
        epoch_length=steps_per_episode,
        min_pool_size=10000,
        n_epochs=num_episodes,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=0.001,
        policy_learning_rate=0.0001,
        # Uncomment to enable plotting:
        # plot=True,
    )
    algo.train()
def create_env(which_agent):
    # setup environment
    if which_agent == 0:
        env = PointEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)
    elif which_agent == 1:
        env = AntEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)
    elif which_agent == 2:
        env = SwimmerEnv()  # dt 0.001 and frameskip=150
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)
    elif which_agent == 3:
        env = ReacherEnv()
        dt_from_xml = env.model.opt.timestep
    elif which_agent == 4:
        env = HalfCheetahEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)
    # elif which_agent == 5:
    #     env = RoachEnv()  # this is a personal vrep env
    #     dt_from_xml = env.VREP_DT
    elif which_agent == 6:
        env = HopperEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)
    elif which_agent == 7:
        env = Walker2DEnv()
        dt_from_xml = env.model.opt.timestep
        env = normalize(env)

    # Reading dt from the env after normalize() does not work, so each branch
    # above reads env.model.opt.timestep (or VREP_DT) before wrapping the env.
    print("\n\n the dt is: ", dt_from_xml, "\n\n")

    # set vars
    tf.set_random_seed(2)
    gym.logger.setLevel(gym.logging.WARNING)
    dimO = env.observation_space.shape
    dimA = env.action_space.shape
    print('--------------------------------- \nState space dimension: ', dimO)
    print('Action space dimension: ', dimA,
          "\n -----------------------------------")
    return env, dt_from_xml
def experiment(variant):
    # env = NormalizedBoxEnv(MultiGoalEnv(
    #     actuation_cost_coeff=10,
    #     distance_cost_coeff=1,
    #     goal_reward=10,
    # ))
    env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    # qf = ExpectableQF(
    #     obs_dim=obs_dim,
    #     action_dim=action_dim,
    #     hidden_size=100,
    # )
    net_size = variant['net_size']
    qf = ConcatMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = ConcatMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    # TODO(vitchyr): just creating the plotter crashes EC2
    # plotter = QFPolicyPlotter(
    #     qf=qf,
    #     policy=policy,
    #     obs_lst=np.array([[-2.5, 0.0],
    #                       [0.0, 0.0],
    #                       [2.5, 2.5]]),
    #     default_action=[np.nan, np.nan],
    #     n_samples=100,
    # )
    algorithm = ExpectedSAC(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        # plotter=plotter,
        # render_eval_paths=True,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def run_task(*_): """ DPG on HalfCheetah environment """ env = normalize(HalfCheetahEnv()) """ Initialise the policy as a neural network policy """ policy = DeterministicMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(32, 32)) """ Defining exploration strategy : OUStrategy - """ """ This strategy implements the Ornstein-Uhlenbeck process, which adds time-correlated noise to the actions taken by the deterministic policy. The OU process satisfies the following stochastic differential equation: dxt = theta*(mu - xt)*dt + sigma*dWt where Wt denotes the Wiener process """ es = OUStrategy(env_spec=env.spec) """ Defining the Q network """ qf = ContinuousMLPQFunction(env_spec=env.spec) """ Persistence Length Exploration """ lp = Persistence_Length_Exploration(env=env, qf=qf, policy=policy) algo = DDPG( env=env, policy=policy, es=es, qf=qf, lp=lp, batch_size=32, max_path_length=1000, epoch_length=1000, min_pool_size=10000, n_epochs=15000, discount=0.99, scale_reward=0.01, qf_learning_rate=1e-3, policy_learning_rate=1e-4, #Uncomment both lines (this and the plot parameter below) to enable plotting plot=True, ) algo.train()
def run_task(variant):
    import tensorflow as tf
    from railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv

    env = TfEnv(HalfCheetahEnv())
    algo_name = variant['Algorithm']
    if algo_name == 'Quadratic-DDPG':
        qf = QuadraticNAF(
            name_or_scope="quadratic_qf",
            env_spec=env.spec,
        )
    elif algo_name == 'DDPG':
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
            embedded_hidden_sizes=(100,),
            observation_hidden_sizes=(100,),
            hidden_nonlinearity=tf.nn.relu,
        )
    else:
        raise Exception('Algo name not recognized: {0}'.format(algo_name))
    es = OUStrategy(env_spec=env.spec)
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    ddpg_params = dict(
        batch_size=128,
        n_epochs=20,
        epoch_length=10000,
        eval_samples=10000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)
    algorithm.train()
def main():
    stub(globals())
    ddpg_params = dict(
        batch_size=64,
        n_epochs=2000,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        soft_target_tau=0.001,
        replay_pool_size=1000000,
        min_pool_size=1000,
        scale_reward=0.1,
    )
    env = TfEnv(HalfCheetahEnv())
    es = OUStrategy(env_spec=env.spec)
    policy = DeterministicMLPPolicy(
        name="init_policy",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
    )
    qf = ContinuousMLPQFunction(
        name="qf",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        bn=False,
    )
    algorithm = DDPG(
        env,
        policy,
        qf,
        es,
        **ddpg_params
    )
    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix="ddpg-shane-half-cheetah-script",
        seed=1,
        variant=ddpg_params,
    )
def _setup_world(self, filename):
    """
    Helper method for handling setup of the MuJoCo world.
    Args:
        filename: Path to XML file containing the world information.
    """
    self._world = []
    self._model = []
    # Initialize one MuJoCo world per condition. (Here every condition gets
    # its own HalfCheetahEnv; the filename argument is unused because the
    # env loads its own model XML.)
    for i in range(self._hyperparams['conditions']):
        self._world.append(HalfCheetahEnv())
    # Initialize x0.
    self.x0 = []
    self._full_init_state = []
    # pdb.set_trace()
    for i in range(self._hyperparams['conditions']):
        self.x0.append(self._world[i].reset())
        self._full_init_state.append(self._world[i].get_full_state())
def get_env_settings(env_id="", normalize_env=True, gym_name="", env_params=None):
    if env_params is None:
        env_params = {}

    if env_id == 'cart':
        env = CartpoleEnv()
        name = "Cartpole"
    elif env_id == 'cheetah':
        env = HalfCheetahEnv()
        name = "HalfCheetah"
    elif env_id == 'ant':
        env = AntEnv()
        name = "Ant"
    elif env_id == 'point':
        env = gym_env("OneDPoint-v0")
        name = "OneDPoint"
    elif env_id == 'reacher':
        env = gym_env("Reacher-v1")
        name = "Reacher"
    elif env_id == 'idp':
        env = InvertedDoublePendulumEnv()
        name = "InvertedDoublePendulum"
    elif env_id == 'ocm':
        env = OneCharMemory(**env_params)
        name = "OneCharMemory"
    elif env_id == 'gym':
        if gym_name == "":
            raise Exception("Must provide a gym name")
        env = gym_env(gym_name)
        name = gym_name
    else:
        raise Exception("Unknown env: {0}".format(env_id))
    if normalize_env:
        env = normalize(env)
        name += "-normalized"
    return dict(
        env=env,
        name=name,
        was_env_normalized=normalize_env,
    )
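# Example use of get_env_settings() above: the returned dict carries the
# (optionally normalized) env plus a human-readable name for logging.
settings = get_env_settings(env_id='cheetah', normalize_env=True)
env = settings['env']    # normalize(HalfCheetahEnv())
name = settings['name']  # "HalfCheetah-normalized"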
def example(*_):
    env = HalfCheetahEnv()
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        n_epochs=25,
        batch_size=1024,
        replay_pool_size=10000,
    )
    algorithm.train()
def init(env_name, args):
    if env_name == 'SparseMountainCar':
        from rllab_env.sparse_mountain_car import SparseMountainCarEnv
        env = RLLabWrapper(SparseMountainCarEnv())
    elif env_name == 'Ant':
        from rllab_env.ant_env import AntEnv
        env = RLLabWrapper(AntEnv(args))
    elif env_name == 'AntGather':
        from rllab_env.ant_gather_env import AntGatherEnv
        env = RLLabWrapper(AntGatherEnv(args))
    elif env_name == 'HalfCheetah':
        from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
        env = RLLabWrapper(HalfCheetahEnv())
    elif env_name == 'MountainCar':
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        env = RLLabWrapper(MountainCarEnv())
    elif env_name == 'Cartpole':
        from rllab.envs.box2d.cartpole_env import CartpoleEnv
        env = RLLabWrapper(CartpoleEnv())
    elif env_name == 'SingleGoal':
        from mazebase import single_goal
        from mazebase_env import single_goal as config
        env = MazeBaseWrapper('SingleGoal', single_goal, config)
    elif env_name == 'sp_goal':
        from mazebase_env import sp_goal
        env = MazeBaseWrapper('sp_goal', sp_goal, sp_goal)
    elif env_name == 'sp_switch':
        from mazebase_env import sp_switch
        config = sp_switch.get_opts_with_args(args)
        sp_switch.get_opts = lambda: config
        env = MazeBaseWrapper('sp_switch', sp_switch, sp_switch)
    elif env_name == 'sp_pick':
        from mazebase_env import sp_pick
        env = MazeBaseWrapper('sp_pick', sp_pick, sp_pick)
    elif "MiniGrid" in env_name:
        env = MinigridWrapper(env_name)
    else:
        raise RuntimeError("wrong env name")
    return env
def create_env(which_agent):
    # setup environment
    if which_agent == 0:
        env = normalize(PointEnv())
    elif which_agent == 1:
        env = normalize(AntEnv())
    elif which_agent == 2:
        env = normalize(SwimmerEnv())  # dt 0.001 and frameskip=150
    elif which_agent == 3:
        env = gym.make("modified_gym_env:ReacherPyBulletEnv-v1")
    elif which_agent == 4:
        env = normalize(HalfCheetahEnv())
    elif which_agent == 5:
        env = RoachEnv()  # this is a personal vrep env
    elif which_agent == 6:
        env = normalize(HopperEnv())
    elif which_agent == 7:
        env = normalize(Walker2DEnv())

    # get dt value from env
    if which_agent == 5:
        dt_from_xml = env.VREP_DT
    elif which_agent == 3:
        dt_from_xml = 0.02
    else:
        dt_from_xml = env.model.opt.timestep
    print("\n\n the dt is: ", dt_from_xml, "\n\n")

    # set vars
    tf.set_random_seed(2)
    gym.logger.setLevel(logging.WARN)
    dimO = env.observation_space.shape
    dimA = env.action_space.shape
    print('--------------------------------- \nState space dimension: ', dimO)
    print('Action space dimension: ', dimA,
          "\n -----------------------------------")
    return env, dt_from_xml
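# Example call, following create_env()'s branch table above: agent id 4
# selects the normalized HalfCheetahEnv, and the returned timestep is read
# from the underlying MuJoCo model.
env, dt_from_xml = create_env(which_agent=4)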
def run_task(_):
    for seed in range(3):
        env = TfEnv(HalfCheetahEnv())
        es = OUStrategy(env_spec=env.spec)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        ddpg_params = dict(
            batch_size=16,
            n_epochs=100,
            epoch_length=100,
            eval_samples=100,
            max_path_length=10,
            min_pool_size=2,
        )
        algorithm = DDPG(env, es, policy, qf, **ddpg_params)
        algorithm.train()
def main():
    stub(globals())
    env = TfEnv(HalfCheetahEnv())
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=10000,
        eval_samples=10000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)
    env.reset()
    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix="test-qddpg-cheetah",
        seed=1,
    )
from rllab.algos.ddpg import DDPG
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import run_experiment_lite
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv

env = normalize(HalfCheetahEnv())


def run_task(*_):
    """
    DDPG on the HalfCheetah environment
    """
    env = normalize(HalfCheetahEnv())

    """
    Initialise the policy as a neural network policy
    """
    # policy = DeterministicMLPPolicy(
    #     env_spec=env.spec,
    #     # Two hidden layers, each with 32 hidden units.
    #     hidden_sizes=(32, 32)
    # )
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # Two hidden layers with 400 and 300 hidden units.
        hidden_sizes=(400, 300))
def get(perm):
    name = perm["problem"]
    if name.lower() == "cartpole":
        from rllab.envs.box2d.cartpole_env import CartpoleEnv
        return normalize(CartpoleEnv())
    elif name.lower() == "mountain car height bonus":
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        return normalize(MountainCarEnv())
    elif name.lower() == "mountain car":
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        return normalize(MountainCarEnv(height_bonus=0))
    elif name.lower() == "gym mountain car":
        from rllab.envs.gym_env import GymEnv
        return normalize(GymEnv("MountainCarContinuous-v0", record_video=False))
    elif name.lower() == "pendulum":
        from rllab.envs.gym_env import GymEnv
        return normalize(GymEnv("Pendulum-v0", record_video=False))
    elif name.lower() == "mujoco double pendulum":
        from rllab.envs.mujoco.inverted_double_pendulum_env import InvertedDoublePendulumEnv
        return normalize(InvertedDoublePendulumEnv())
    elif name.lower() == "double pendulum":
        from rllab.envs.box2d.double_pendulum_env import DoublePendulumEnv
        return normalize(DoublePendulumEnv())
    elif name.lower() == "hopper":
        from rllab.envs.mujoco.hopper_env import HopperEnv
        return normalize(HopperEnv())
    elif name.lower() == "swimmer":
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        return normalize(SwimmerEnv())
    elif name.lower() == "2d walker":
        from rllab.envs.mujoco.walker2d_env import Walker2DEnv
        return normalize(Walker2DEnv())
    elif name.lower() == "half cheetah":
        from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
        return normalize(HalfCheetahEnv())
    elif name.lower() == "ant":
        from rllab.envs.mujoco.ant_env import AntEnv
        return normalize(AntEnv())
    elif name.lower() == "simple humanoid":
        from rllab.envs.mujoco.simple_humanoid_env import SimpleHumanoidEnv
        return normalize(SimpleHumanoidEnv())
    elif name.lower() == "full humanoid":
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        return normalize(HumanoidEnv())
    else:
        raise NotImplementedError(f"Environment {name} unknown")
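# Example lookup via get() above; the "problem" value is matched
# case-insensitively against the branch table.
env = get({"problem": "Half Cheetah"})  # -> normalize(HalfCheetahEnv())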
def run_task(*_): """ DPG on Hopper environment """ env = normalize(HalfCheetahEnv()) """ Initialise the policy as a neural network policy """ # policy = DeterministicMLPPolicy( # env_spec=env.spec, # # The neural network policy should have two hidden layers, each with 32 hidden units. # hidden_sizes=(32, 32) # ) policy = DeterministicMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(400, 300)) es = OUStrategy(env_spec=env.spec) qf = ContinuousMLPQFunction(env_spec=env.spec) """ Using the DDPG algorithm """ # algo = DDPG( # env=env, # policy=policy, # es=es, # qf=qf, # batch_size=32, # max_path_length=500, # epoch_length=500, # min_pool_size=10000, # n_epochs=20000, # discount=0.99, # scale_reward=0.01, # qf_learning_rate=1e-3, # policy_learning_rate=1e-4, # #Uncomment both lines (this and the plot parameter below) to enable plotting # plot=True, # ) algo = DDPG( env=env, policy=policy, es=es, qf=qf, batch_size=64, max_path_length=1000, epoch_length=1000, min_pool_size=10000, n_epochs=20000, discount=0.99, scale_reward=0.01, qf_learning_rate=10e-3, policy_learning_rate=10e-4, #Uncomment both lines (this and the plot parameter below) to enable plotting plot=True, ) algo.train()