def main(root_dir, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm, regularize):
    tf.set_random_seed(seed=seed)

    env = GymEnv('MountainCarContinuous-v0')
    env.min_action = env.action_space.low[0]
    env.max_action = env.action_space.high[0]
    env.env.seed(seed)

    max_replay_buffer_size = int(1e6)
    sampler_params = {'max_path_length': 1000, 'min_pool_size': 1000, 'batch_size': 128}
    sampler = SimpleSampler(**sampler_params)

    # Build a log-directory name that encodes the main hyperparameters.
    env_id = 'MountainCarContinuous_RB1e6_entropy{}_epoch{}__Normalize_uniform'.format(
        entropy_coeff, n_epochs)
    env_id = env_id + '_dynamicCoeff' if dynamic_coeff else env_id

    os.makedirs(root_dir, exist_ok=True)
    env_dir = os.path.join(root_dir, env_id)
    os.makedirs(env_dir, exist_ok=True)
    current_log_dir = os.path.join(env_dir, 'seed{}'.format(seed))
    mylogger.make_log_dir(current_log_dir)

    print(env_id)
    print('environment set done')

    # Define the value functions.
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))

    # Use a GMM policy.
    policy = GMMPolicy(
        env_spec=env.spec,
        K=4,
        hidden_layer_sizes=[layer_size, layer_size],
        qf=qf,
        reg=1e-3,
        squash=True,
    )

    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    pool = SimpleReplayBuffer(env_spec=env.spec,
                              max_replay_buffer_size=max_replay_buffer_size)
    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=3e-4,
        scale_reward=1.,
        discount=0.99,
        tau=1e-2,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
        dynamic_coeff=dynamic_coeff,
        entropy_coeff=entropy_coeff,
        clip_norm=clip_norm,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
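# Illustrative only: a minimal sketch of how main() above might be driven from the
# command line with argparse. The flag names and default values are assumptions for
# the example, not settings from the original script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--root-dir', default='logs')   # hypothetical default
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--entropy-coeff', type=float, default=0.0)
    parser.add_argument('--n-epochs', type=int, default=1000)
    parser.add_argument('--dynamic-coeff', action='store_true')
    parser.add_argument('--clip-norm', type=float, default=None)
    parser.add_argument('--regularize', action='store_true')
    args = parser.parse_args()

    main(args.root_dir, args.seed, args.entropy_coeff, args.n_epochs,
         args.dynamic_coeff, args.clip_norm, args.regularize)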
def run_experiment(param): # instructive = 0.5 # decay = 3e-6 decay = 5e-4 instructive = 0.5 random_arm_init = [-0.05, 0.05] render = False reward_shaping = False horizon = 250 env = normalize( CRL4DOFWrapper( # IKWrapper( SawyerPrimitivePick( instructive=instructive, decay=decay, random_arm_init=random_arm_init, has_renderer=render, reward_shaping=reward_shaping, horizon=horizon, has_offscreen_renderer=False, use_camera_obs=False, use_object_obs=True, control_freq=100, ), use_gripper=True)) # ) replay_buffer_params = { 'max_replay_buffer_size': 1e6, } sampler_params = { 'max_path_length': horizon - 1, 'min_pool_size': 1000, 'batch_size': 256, } pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params) sampler = SimpleSampler(**sampler_params) base_kwargs = dict( { 'epoch_length': 1500, 'n_train_repeat': 1, 'n_initial_exploration_steps': 5000, 'eval_render': False, 'eval_n_episodes': 1, 'eval_deterministic': True, 'n_epochs': 3e3 }, sampler=sampler) M = 64 qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1') qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2') vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M)) initial_exploration_policy = UniformPolicy(env_spec=env.spec) policy = GaussianPolicy( env_spec=env.spec, hidden_layer_sizes=(64, 64), reparameterize=True, reg=1e-3, ) algorithm = SAC( base_kwargs=base_kwargs, env=env, policy=policy, initial_exploration_policy=initial_exploration_policy, pool=pool, qf1=qf1, qf2=qf2, vf=vf, lr=3e-4, scale_reward=5, discount=0.99, tau=0.005, reparameterize=True, target_update_interval=1, action_prior='uniform', save_full_state=False, ) algorithm._sess.run(tf.global_variables_initializer()) algorithm.train()
def run_experiment(variant):
    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Session() as sess:
        data = joblib.load(variant['snapshot_filename'])
        policy = data['policy']
        env = data['env']
        num_skills = (data['policy'].observation_space.flat_dim -
                      data['env'].spec.observation_space.flat_dim)
        best_z = get_best_skill(policy, env, num_skills, variant['max_path_length'])
        fixed_z_env = FixedOptionEnv(env, num_skills, best_z)

        tf.logging.info('Finetuning best skill...')
        pool = SimpleReplayBuffer(
            env_spec=fixed_z_env.spec,
            max_replay_buffer_size=variant['max_pool_size'],
        )

        base_kwargs = dict(
            min_pool_size=variant['max_path_length'],
            epoch_length=variant['epoch_length'],
            n_epochs=variant['n_epochs'],
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            n_train_repeat=variant['n_train_repeat'],
            eval_render=False,
            eval_n_episodes=1,
            eval_deterministic=True,
        )

        M = variant['layer_size']
        if variant['use_pretrained_values']:
            qf = data['qf']
            vf = data['vf']
        else:
            del data['qf']
            del data['vf']
            qf = NNQFunction(
                env_spec=fixed_z_env.spec,
                hidden_layer_sizes=[M, M],
                var_scope='qf-finetune',
            )
            vf = NNVFunction(
                env_spec=fixed_z_env.spec,
                hidden_layer_sizes=[M, M],
                var_scope='vf-finetune',
            )

        algorithm = SAC(
            base_kwargs=base_kwargs,
            env=fixed_z_env,
            policy=policy,
            pool=pool,
            qf=qf,
            vf=vf,
            lr=variant['lr'],
            scale_reward=variant['scale_reward'],
            discount=variant['discount'],
            tau=variant['tau'],
            save_full_state=False,
        )

        algorithm.train()
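# Illustrative only: a variant dictionary containing every key that the finetuning
# run_experiment() above reads. The concrete values are assumptions for the sketch;
# 'snapshot_filename' must point at a pickle produced by a previous training run.
example_finetune_variant = {
    'snapshot_filename': '/path/to/itr_final.pkl',  # hypothetical path
    'max_pool_size': int(1e6),
    'max_path_length': 1000,
    'epoch_length': 1000,
    'n_epochs': 100,
    'batch_size': 128,
    'n_train_repeat': 1,
    'layer_size': 128,
    'use_pretrained_values': False,
    'lr': 3e-4,
    'scale_reward': 1.0,
    'discount': 0.99,
    'tau': 0.01,
}
# run_experiment(example_finetune_variant)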
def run_experiment(variant): env_params = variant['env_params'] policy_params = variant['policy_params'] value_fn_params = variant['value_fn_params'] algorithm_params = variant['algorithm_params'] replay_buffer_params = variant['replay_buffer_params'] sampler_params = variant['sampler_params'] task = variant['task'] domain = variant['domain'] env = normalize(ENVIRONMENTS[domain][task](**env_params)) pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params) sampler = SimpleSampler(**sampler_params) base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler) M = value_fn_params['layer_size'] qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1') qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2') vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M)) initial_exploration_policy = UniformPolicy(env_spec=env.spec) if policy_params['type'] == 'gaussian': policy = GaussianPolicy( env_spec=env.spec, hidden_layer_sizes=(M,M), reparameterize=policy_params['reparameterize'], reg=1e-3, ) elif policy_params['type'] == 'lsp': nonlinearity = { None: None, 'relu': tf.nn.relu, 'tanh': tf.nn.tanh }[policy_params['preprocessing_output_nonlinearity']] preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes') if preprocessing_hidden_sizes is not None: observations_preprocessor = MLPPreprocessor( env_spec=env.spec, layer_sizes=preprocessing_hidden_sizes, output_nonlinearity=nonlinearity) else: observations_preprocessor = None policy_s_t_layers = policy_params['s_t_layers'] policy_s_t_units = policy_params['s_t_units'] s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers bijector_config = { 'num_coupling_layers': policy_params['coupling_layers'], 'translation_hidden_sizes': s_t_hidden_sizes, 'scale_hidden_sizes': s_t_hidden_sizes, } policy = LatentSpacePolicy( env_spec=env.spec, squash=policy_params['squash'], bijector_config=bijector_config, reparameterize=policy_params['reparameterize'], q_function=qf1, observations_preprocessor=observations_preprocessor) elif policy_params['type'] == 'gmm': # reparameterize should always be False if using a GMMPolicy policy = GMMPolicy( env_spec=env.spec, K=policy_params['K'], hidden_layer_sizes=(M, M), reparameterize=policy_params['reparameterize'], qf=qf1, reg=1e-3, ) else: raise NotImplementedError(policy_params['type']) algorithm = SAC( base_kwargs=base_kwargs, env=env, policy=policy, initial_exploration_policy=initial_exploration_policy, pool=pool, qf1=qf1, qf2=qf2, vf=vf, lr=algorithm_params['lr'], scale_reward=algorithm_params['scale_reward'], discount=algorithm_params['discount'], tau=algorithm_params['tau'], reparameterize=algorithm_params['reparameterize'], target_update_interval=algorithm_params['target_update_interval'], action_prior=policy_params['action_prior'], save_full_state=False, ) algorithm._sess.run(tf.global_variables_initializer()) algorithm.train()
def run_experiment(variant): if variant['env_name'] == 'humanoid-rllab': from rllab.envs.mujoco.humanoid_env import HumanoidEnv env = normalize(HumanoidEnv()) elif variant['env_name'] == 'swimmer-rllab': from rllab.envs.mujoco.swimmer_env import SwimmerEnv env = normalize(SwimmerEnv()) elif variant["env_name"] == "Point2D-v0": import sac.envs.point2d_env env = GymEnv(variant["env_name"]) else: env = normalize(GymEnv(variant['env_name'])) obs_space = env.spec.observation_space assert isinstance(obs_space, spaces.Box) low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)]) high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)]) aug_obs_space = spaces.Box(low=low, high=high) aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space) pool = SimpleReplayBuffer( env_spec=aug_env_spec, max_replay_buffer_size=variant['max_pool_size'], ) base_kwargs = dict(min_pool_size=variant['max_path_length'], epoch_length=variant['epoch_length'], n_epochs=variant['n_epochs'], max_path_length=variant['max_path_length'], batch_size=variant['batch_size'], n_train_repeat=variant['n_train_repeat'], eval_render=False, eval_n_episodes=1, eval_deterministic=True, sampler=SimpleSampler( max_path_length=variant["max_path_length"], min_pool_size=variant["max_path_length"], batch_size=variant["batch_size"])) M = variant['layer_size'] qf = NNQFunction( env_spec=aug_env_spec, hidden_layer_sizes=[M, M], ) vf = NNVFunction( env_spec=aug_env_spec, hidden_layer_sizes=[M, M], ) policy = GaussianPolicy( env_spec=aug_env_spec, hidden_layer_sizes=[M, M], reg=0.001, ) # policy = GMMPolicy( # env_spec=aug_env_spec, # K=variant['K'], # hidden_layer_sizes=[M, M], # qf=qf, # reg=0.001, # ) discriminator = NNDiscriminatorFunction( env_spec=env.spec, hidden_layer_sizes=[M, M], num_skills=variant['num_skills'], ) algorithm = DIAYN(base_kwargs=base_kwargs, env=env, policy=policy, discriminator=discriminator, pool=pool, qf=qf, vf=vf, lr=variant['lr'], scale_entropy=variant['scale_entropy'], discount=variant['discount'], tau=variant['tau'], num_skills=variant['num_skills'], save_full_state=False, include_actions=variant['include_actions'], learn_p_z=variant['learn_p_z'], add_p_z=variant['add_p_z'], reparametrize=variant["reparametrize"]) algorithm.train()
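# Illustrative only: a variant for the DIAYN runner above with every key it reads.
# The values are assumptions chosen for the sketch, not the original hyperparameters.
example_diayn_variant = {
    'env_name': 'Point2D-v0',
    'num_skills': 20,
    'max_pool_size': int(1e6),
    'max_path_length': 1000,
    'epoch_length': 1000,
    'n_epochs': 1000,
    'batch_size': 128,
    'n_train_repeat': 1,
    'layer_size': 300,
    'lr': 3e-4,
    'scale_entropy': 0.1,
    'discount': 0.99,
    'tau': 0.01,
    'include_actions': False,
    'learn_p_z': False,
    'add_p_z': True,
    'reparametrize': True,
}
# run_experiment(example_diayn_variant)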
def run_experiment(variant): if variant['env_name'] == 'humanoid-rllab': from rllab.envs.mujoco.humanoid_env import HumanoidEnv env = normalize(HumanoidEnv()) elif variant['env_name'] == 'swimmer-rllab': from rllab.envs.mujoco.swimmer_env import SwimmerEnv env = normalize(SwimmerEnv()) else: env = normalize(GymEnv(variant['env_name'])) env = DelayedEnv(env, delay=0.01) pool = SimpleReplayBuffer( env_spec=env.spec, max_replay_buffer_size=variant['max_pool_size'], ) sampler = RemoteSampler( max_path_length=variant['max_path_length'], min_pool_size=variant['max_path_length'], batch_size=variant['batch_size'] ) base_kwargs = dict( sampler=sampler, epoch_length=variant['epoch_length'], n_epochs=variant['n_epochs'], n_train_repeat=variant['n_train_repeat'], eval_render=False, eval_n_episodes=1, eval_deterministic=True, ) M = variant['layer_size'] qf = NNQFunction( env_spec=env.spec, hidden_layer_sizes=[M, M], ) vf = NNVFunction( env_spec=env.spec, hidden_layer_sizes=[M, M], ) policy = GMMPolicy( env_spec=env.spec, K=variant['K'], hidden_layer_sizes=[M, M], qf=qf, reparameterize=variant['reparameterize'], reg=0.001, ) algorithm = SAC( base_kwargs=base_kwargs, env=env, policy=policy, pool=pool, qf=qf, vf=vf, lr=variant['lr'], scale_reward=variant['scale_reward'], discount=variant['discount'], tau=variant['tau'], reparameterize=variant['reparameterize'], save_full_state=False, ) algorithm.train()
class EAC(Serializable): """ CG: the class that implements the EAC algorithm. """ def __init__( self, environment_name, algorithm_name, lr, scale_reward, scale_entropy, discount, tau, max_replay_buffer_size, sampler_params, value_func_layers_number, value_func_layer_size, policy_func_layers_number, policy_func_layer_size, base_ac_alg_params, q_param_list, use_ucb=False, evaluation_strategy='ensemble', ): """ CG: the constructor. :param environment_name: the name of the environment in string. :param algorithm_name: the name of the AC algorithm to be used in the ensemble. :param lr: the learning rate to be used in the ensemble. :param scale_reward: the reward scaling factor. :param scale_entropy: the entropy scaling factor. :param discount: the reward discount factor. :param tau: the target value function updating factor. :param max_replay_buffer_size: the maximum size of the replay buffer. :param sampler_params: extra parameter settings for the random sampler. :param value_func_layers_number: the number of hidden layers for the value network, i.e. V function and Q function. :param value_func_layer_size: the number of neurons of each hidden layer of the value network. :param policy_func_layers_number: th number of hidden layers for the policy network. :param policy_func_layer_size: the number of neurons of each hidden layer of the policy network. :param base_ac_alg_params: base parameters for the AC algorithm. :param q_param_list: the list of q values for the ensemble. Each q value in the list represents one AC instance in the ensemble. :param use_ucb: an indicator regarding the use of ucb for selecting AC instances in the ensemble for exploration. :param evaluation_strategy: the strategy used for evaluation. We have two strategies available, 'ensemble' and 'best-policy'. """ # Set up the environment. self._environment_name = environment_name self._env = GymEnv(self._environment_name) # Set up the algorithm parameters. self._algorithm_name = algorithm_name self._lr = lr self._scale_reward = scale_reward self._scale_entropy = scale_entropy self._discount = discount self._tau = tau self._use_ucb = use_ucb self._evaluation_strategy = evaluation_strategy # Set up the replay buffer. self._max_replay_buffer_size = max_replay_buffer_size self._pool = SimpleReplayBuffer( env_spec=self._env.spec, max_replay_buffer_size=self._max_replay_buffer_size) # Set up the environment sampler. self._sampler_params = sampler_params self._sampler = SimpleSampler(**self._sampler_params) # Set up the required number of AC instances in the ensemble. Each AC instance has its own value network and policy network. self._alg_instances = [] self._base_ac_params = base_ac_alg_params self._base_alg_params = dict(self._base_ac_params, sampler=self._sampler) for id, q_val in enumerate(q_param_list): # Set up the value function network for an AC instance. qf1 = NNQFunction(env_spec=self._env.spec, hidden_layer_sizes=tuple([ value_func_layer_size for _ in range(value_func_layers_number) ]), name=str(id) + 'qf1') qf2 = NNQFunction(env_spec=self._env.spec, hidden_layer_sizes=tuple([ value_func_layer_size for _ in range(value_func_layers_number) ]), name=str(id) + 'qf2') vf = NNVFunction(env_spec=self._env.spec, hidden_layer_sizes=tuple([ value_func_layer_size for _ in range(value_func_layers_number) ]), name=str(id) + 'vf') # Set up the policy network for an AC instance. 
policy = GaussianPolicy( env_spec=self._env.spec, hidden_layer_sizes=tuple([ policy_func_layer_size for _ in range(policy_func_layers_number) ]), squash=True, reparameterize=False, reg=1.e-3, name=str(id) + 'gaussian_policy') initial_exploration_policy = policy # Set up an AC instance. if self._algorithm_name == 'sac': algorithm = SACV1( base_kwargs=self._base_alg_params, env=self._env, policy=policy, initial_exploration_policy=initial_exploration_policy, pool=self._pool, qf1=qf1, qf2=qf2, vf=vf, lr=self._lr, scale_reward=self._scale_reward, scale_entropy=self._scale_entropy, discount=self._discount, tau=self._tau, reparameterize=False, target_update_interval=1, action_prior='uniform', save_full_state=False, ) elif self._algorithm_name == 'tac': algorithm = TAC( base_kwargs=self._base_alg_params, env=self._env, policy=policy, initial_exploration_policy=initial_exploration_policy, pool=self._pool, qf1=qf1, qf2=qf2, vf=vf, lr=self._lr, scale_reward=self._scale_reward, scale_entropy=self._scale_entropy, discount=self._discount, tau=self._tau, reparameterize=False, target_update_interval=1, action_prior='uniform', save_full_state=False, tsallisQ=q_val, ) elif self._algorithm_name == 'rac': algorithm = RAC( base_kwargs=self._base_alg_params, env=self._env, policy=policy, initial_exploration_policy=initial_exploration_policy, pool=self._pool, qf1=qf1, qf2=qf2, vf=vf, lr=self._lr, scale_reward=self._scale_reward, scale_entropy=self._scale_entropy, discount=self._discount, tau=self._tau, reparameterize=False, target_update_interval=1, action_prior='uniform', save_full_state=False, renyiQ=q_val, ) else: raise NotImplementedError # Initialize the AC instance. # algorithm._sess.run(tf.global_variables_initializer()) # Put the initialized AC instance into the algorithm instance list. # Each element of the algorithm instance list is made up of # the algorithm instance, # the moving average performance of the instance, # the number of times the instance has been used for exploration previously, and # the UCB bound. self._alg_instances.append([algorithm, 0.0, 0.0, 0.0]) # Set up the ensemble Q-function for action selection. self._Q_ensemble = NNQFunction( env_spec=self._env.spec, hidden_layer_sizes=tuple([ value_func_layer_size for _ in range(value_func_layers_number) ]), name='ensqf') # ======================================================================== # Set up the training target for the ensemble Q-function for action selection. # ======================================================================== # Create the observation placeholder. self._observations_ens_ph = tf.placeholder( tf.float32, shape=(None, self._env.spec.observation_space.flat_dim), name='obv_ens', ) # Create the next observation placeholder. self._observations_ens_next_ph = tf.placeholder( tf.float32, shape=(None, self._env.spec.observation_space.flat_dim), name='next_obv_ens', ) # Create a list of next action placeholders. self._acts_next_phs = [] for i in range(len(q_param_list)): act_ens_ph = tf.placeholder( tf.float32, shape=(None, self._env.spec.action_space.flat_dim), name=str(i) + '_next_act_ens', ) self._acts_next_phs.append(act_ens_ph) # Create the observed action placeholder. self._obv_act_ph = tf.placeholder( tf.float32, shape=(None, self._env.spec.action_space.flat_dim), name='act_obv_ens', ) # Create the reward placeholder. self._rewards_ph = tf.placeholder( tf.float32, shape=(None, ), name='rew_ens', ) # Create the terminal placeholder. 
self._terminals_ph = tf.placeholder( tf.float32, shape=(None, ), name='ter_ens', ) # Determine the target Q-value for next step. self._q_ens_targets = [] for act_next_ph in self._acts_next_phs: qt = self._Q_ensemble.get_output_for( self._observations_ens_next_ph, act_next_ph, reuse=True) self._q_ens_targets.append(qt) for i, q_t in enumerate(self._q_ens_targets): if i == 0: self._q_ens_next = q_t else: self._q_ens_next = tf.maximum(self._q_ens_next, q_t) # self._q_ens_next = self._q_ens_next + q_t # self._q_ens_next = self._q_ens_next / len(self._q_ens_targets) # Determine the Q-loss. self._q_train = self._Q_ensemble.get_output_for( self._observations_ens_ph, self._obv_act_ph, reuse=True) self._q_ens_loss = 0.5 * tf.reduce_mean( (self._q_train - tf.stop_gradient(self._scale_reward * self._rewards_ph + (1 - self._terminals_ph) * self._discount * self._q_ens_next))**2) # Determine the Q-training operator. self._q_ens_train_operator = tf.train.AdamOptimizer(self._lr).minimize( loss=self._q_ens_loss, var_list=self._Q_ensemble.get_params_internal()) # Set up the tensor flow session. self._sess = tf_utils.get_default_session() self._sess.run(tf.global_variables_initializer()) def train(self): """ CG: the function that conducts ensemble training. :return: """ # Set up parameters for the training process. self._n_epochs = self._base_ac_params['n_epochs'] self._epoch_length = self._base_ac_params['epoch_length'] self._n_train_repeat = self._base_ac_params['n_train_repeat'] self._n_initial_exploration_steps = self._base_ac_params[ 'n_initial_exploration_steps'] self._eval_render = self._base_ac_params['eval_render'] self._eval_n_episodes = self._base_ac_params['eval_n_episodes'] self._eval_deterministic = self._base_ac_params['eval_deterministic'] # Set up the evaluation environment. if self._eval_n_episodes > 0: with tf.variable_scope("low_level_policy", reuse=True): self._eval_env = deep_clone(self._env) # Import required libraries for training. import random import math import operator import numpy as np # Initialize the sampler. alg_ins = random.choice(self._alg_instances) self._sampler.initialize(self._env, alg_ins[0].policy, self._pool) # Perform the training/evaluation process. num_episode = 0. with self._sess.as_default(): gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True): logger.log('Epoch #%d | ' % epoch) for t in range(self._epoch_length): isEpisodeEnd = self._sampler.sample() # If an episode is ended, we need to update performance statistics for each AC instance and # pick randomly another AC instance for next episode of exploration. if isEpisodeEnd: num_episode = num_episode + 1. alg_ins[1] = 0.9 * alg_ins[ 1] + 0.1 * self._sampler._last_path_return alg_ins[2] = alg_ins[2] + 1. if self._use_ucb: # Select an algorithm instance based on UCB. selected = False for ains in self._alg_instances: if ains[2] < 1.: alg_ins = ains selected = True break else: ains[3] = ains[1] + math.sqrt( 2.0 * math.log(num_episode) / ains[2]) if not selected: alg_ins = max(self._alg_instances, key=operator.itemgetter(3)) else: # Select an algorithm instance uniformly at random. alg_ins = random.choice(self._alg_instances) self._sampler.set_policy(alg_ins[0].policy) if not self._sampler.batch_ready(): continue gt.stamp('sample') # ================ # Perform training. 
# ================ for i in range(self._n_train_repeat): batch = self._sampler.random_batch() # ==================================== # Perform training over all AC instances. # ==================================== for ains in self._alg_instances: ains[0]._do_training(iteration=t + epoch * self._epoch_length, batch=batch) # ================================================= # Perform training of the action-selection Q-function. # ================================================= # Set up the feed dictionary. feed_dict = { self._observations_ens_ph: batch['observations'], self._obv_act_ph: batch['actions'], self._observations_ens_next_ph: batch['next_observations'], self._rewards_ph: batch['rewards'], self._terminals_ph: batch['terminals'], } for i, ains in enumerate(self._alg_instances): with ains[0].policy.deterministic( self._eval_deterministic): feed_dict[self._acts_next_phs[i]] = ains[ 0].policy.get_actions( batch['next_observations']) # Perform training on the action-selection Q-function. self._sess.run(self._q_ens_train_operator, feed_dict) gt.stamp('train') # ============================================================ # Perform evaluation after one full epoch of training is completed. # ============================================================ if self._eval_n_episodes < 1: continue if self._evaluation_strategy == 'ensemble': # Use a whole ensemble of AC instances for evaluation. paths = rollouts(self._eval_env, self, self._sampler._max_path_length, self._eval_n_episodes) elif self._evaluation_strategy == 'best-policy': # Choose the AC instance with the highest observed performance so far for evaluation. eval_alg_ins = max(self._alg_instances, key=operator.itemgetter(1)) with eval_alg_ins[0].policy.deterministic( self._eval_deterministic): paths = rollouts(self._eval_env, eval_alg_ins[0].policy, self._sampler._max_path_length, self._eval_n_episodes) else: paths = None if paths is not None: total_returns = [path['rewards'].sum() for path in paths] episode_lengths = [len(p['rewards']) for p in paths] logger.record_tabular('return-average', np.mean(total_returns)) logger.record_tabular('return-min', np.min(total_returns)) logger.record_tabular('return-max', np.max(total_returns)) logger.record_tabular('return-std', np.std(total_returns)) logger.record_tabular('episode-length-avg', np.mean(episode_lengths)) logger.record_tabular('episode-length-min', np.min(episode_lengths)) logger.record_tabular('episode-length-max', np.max(episode_lengths)) logger.record_tabular('episode-length-std', np.std(episode_lengths)) self._eval_env.log_diagnostics(paths) if self._eval_render: self._eval_env.render(paths) # Produce log info after each episode of training and evaluation. times_itrs = gt.get_times().stamps.itrs eval_time = times_itrs['eval'][-1] if epoch > 1 else 0 total_time = gt.get_times().total logger.record_tabular('time-train', times_itrs['train'][-1]) logger.record_tabular('time-eval', eval_time) logger.record_tabular('time-sample', times_itrs['sample'][-1]) logger.record_tabular('time-total', total_time) logger.record_tabular('epoch', epoch) self._sampler.log_diagnostics() logger.dump_tabular() # logger.pop_prefix() gt.stamp('eval') # Terminate the sampler after the training process is completed. self._sampler.terminate() def reset(self, dones=None): """ CG: the function required in order to support evaluation (or rollout) by using EAC as an ensemble of AC instances. 
:param dones: :return: """ pass def get_action(self, observation): """ CG: the function required in order to support evaluation (or rollout) by using EAC as an ensemble of AC instances. :param observation: :return: """ import numpy as np # Collect all recommended actions. recommend_actions = [] for ains in self._alg_instances: with ains[0].policy.deterministic(self._eval_deterministic): recommend_actions.append( ains[0].policy.get_action(observation)) # Determine the value of performing each recommended action. # Option 1: select actions based on average Q-value. # action_values = np.zeros(len(recommend_actions)) # for i, ract in enumerate(recommend_actions): # # for ains in self._alg_instances: # action_values[i] = action_values[i] + ains[0]._qf1.eval([observation], [ract[0]])[0] # action_values[i] = action_values[i] / len(self._alg_instances) # Option 2: select actions based on average rank. # from scipy.stats import rankdata # action_ranks = np.zeros((len(self._alg_instances), len(self._alg_instances))) # for i, ract in enumerate(recommend_actions): # for j, ains in enumerate(self._alg_instances): # action_ranks[j,i] = ains[0]._qf1.eval([observation], [ract[0]])[0] # # action_ranks = np.array([rankdata(x, method='dense') for x in action_ranks]) # action_values = np.sum(action_ranks, axis=0) # Option 3: select actions based on the ensemble action-selection Q-function. action_values = np.zeros(len(recommend_actions)) for i, ract in enumerate(recommend_actions): action_values[i] = self._Q_ensemble.eval([observation], [ract[0]])[0] # Choose the recommended action with the highest value. act_ind = np.argmax(action_values) return recommend_actions[act_ind]
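# Illustrative only: constructing and training the EAC ensemble defined above. All
# numeric settings, the environment name, and the q_param_list below are assumptions
# for the sketch, not values taken from the original experiments.
eac = EAC(
    environment_name='HalfCheetah-v1',   # hypothetical environment
    algorithm_name='tac',                # one of 'sac', 'tac', 'rac'
    lr=3e-4,
    scale_reward=5.0,
    scale_entropy=1.0,
    discount=0.99,
    tau=0.01,
    max_replay_buffer_size=int(1e6),
    sampler_params={'max_path_length': 1000, 'min_pool_size': 1000, 'batch_size': 256},
    value_func_layers_number=2,
    value_func_layer_size=128,
    policy_func_layers_number=2,
    policy_func_layer_size=128,
    base_ac_alg_params=dict(
        epoch_length=1000,
        n_epochs=1000,
        n_train_repeat=1,
        n_initial_exploration_steps=1000,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True),
    q_param_list=[1.0, 1.5, 2.0],        # one ensemble member per q value
    use_ucb=True,
    evaluation_strategy='ensemble',
)
eac.train()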
def run_experiment(*_):
    env = normalize(VoltVarEnv())
    pool = SimpleReplayBuffer(max_replay_buffer_size=1e6, env_spec=env.spec)
    sampler = SimpleSampler(max_path_length=168, min_pool_size=100, batch_size=256)

    base_kwargs = dict(
        sampler=sampler,
        epoch_length=1000,
        n_epochs=50,
        n_initial_exploration_steps=10000,
        n_train_repeat=1,
        eval_n_episodes=10,
        eval_deterministic=True)

    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=[64, 32], name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=[64, 32], name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=[64, 32], name='vf')

    # Additional critics for the constraint signal used by SACD.
    qfc1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=[64, 32], name='qfc1')
    qfc2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=[64, 32], name='qfc2')
    vfc = NNVFunction(env_spec=env.spec, hidden_layer_sizes=[64, 32], name='vfc')

    initial_exploration_policy = UniformPolicy2(env_spec=env.spec)
    policy = CategoricalPolicy(env_spec=env.spec, hidden_layer_sizes=[64, 32])

    algo = SACD(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        qfc1=qfc1,
        qfc2=qfc2,
        vf=vf,
        vfc=vfc,
        lr=1e-3,
        scale_reward=50,    # 50 for the 4-bus feeder; 10 for bus34
        scale_rewardc=50,   # 50 for the 4-bus feeder; 10 for bus34
        alpha=1,
        constraint_lr=1e-5,  # 5e-6 for bus34
        discount=0.99,
        tau=5e-4,            # 5e-4 for bus34; 2.5e-4 for bus123
        target_update_interval=1,
        save_full_state=False)
    algo.train()
def run_experiment(variant): if variant['env_name'] == 'humanoid-rllab': from rllab.envs.mujoco.humanoid_env import HumanoidEnv env = normalize(HumanoidEnv()) elif variant['env_name'] == 'swimmer-rllab': from rllab.envs.mujoco.swimmer_env import SwimmerEnv env = normalize(SwimmerEnv()) else: env = normalize(GymEnv(variant['env_name'])) obs_space = env.spec.observation_space assert isinstance(obs_space, spaces.Box) low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)]) high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)]) aug_obs_space = spaces.Box(low=low, high=high) aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space) pool = SimpleReplayBuffer( env_spec=aug_env_spec, max_replay_buffer_size=variant['max_pool_size'], ) base_kwargs = dict( min_pool_size=variant['max_path_length'], epoch_length=variant['epoch_length'], n_epochs=variant['n_epochs'], max_path_length=variant['max_path_length'], batch_size=variant['batch_size'], n_train_repeat=variant['n_train_repeat'], eval_render=False, eval_n_episodes=1, eval_deterministic=True, ) M = variant['layer_size'] qf = NNQFunction( env_spec=aug_env_spec, hidden_layer_sizes=[M, M], ) vf = NNVFunction( env_spec=aug_env_spec, hidden_layer_sizes=[M, M], ) policy = GMMPolicy( env_spec=aug_env_spec, K=variant['K'], hidden_layer_sizes=[M, M], qf=qf, reg=0.001, ) discriminator = NNDiscriminatorFunction( env_spec=env.spec, hidden_layer_sizes=[M, M], num_skills=variant['num_skills'], ) algorithm = DIAYN_BD( base_kwargs=base_kwargs, env=env, policy=policy, discriminator=discriminator, pool=pool, qf=qf, vf=vf, lr=variant['lr'], scale_entropy=variant['scale_entropy'], discount=variant['discount'], tau=variant['tau'], num_skills=variant['num_skills'], save_full_state=False, include_actions=variant['include_actions'], learn_p_z=variant['learn_p_z'], add_p_z=variant['add_p_z'], # Additional params for behaviour tracking metric=variant['metric'], env_id=variant['prefix'], eval_freq=variant['eval_freq'], log_dir=get_logdir(args, variant), ) algorithm.train()
def run_experiment(variant):
    print('Unity3D environment')
    env = UnityEnv('/home/recharrs/Apps/UnityEnvob3/RollerBall.x86_64',
                   time_state=True, idx=args.idx, no_graphics=args.no_graphics)

    # Augment the observation space with a one-hot skill vector.
    obs_space = env.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low.flatten(), np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high.flatten(), np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)

    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=5000,
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=0,  # must be set to 0, otherwise evaluation raises an error
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )
    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )
    policy = GMMPolicy(
        env_spec=aug_env_spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )
    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        discriminator=discriminator,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_entropy=variant['scale_entropy'],
        discount=variant['discount'],
        tau=variant['tau'],
        num_skills=variant['num_skills'],
        save_full_state=False,
        include_actions=variant['include_actions'],
        learn_p_z=variant['learn_p_z'],
        add_p_z=variant['add_p_z'],
    )
    algorithm.train()
def main(env, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm, normalize_obs, buffer_size, max_path_length, min_pool_size, batch_size, policy_mode): tf.set_random_seed(seed=seed) # define value function layer_size = 100 qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size)) vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size)) if policy_mode == GMMPolicy: # use GMM policy policy = GMMPolicy(env_spec=env.spec, K=4, hidden_layer_sizes=[layer_size, layer_size], qf=qf, reg=1e-3, squash=True) else: _, mode = str(policy_mode).split('-') if _ != "Knack": raise AssertionError( "policy_mode should be GMMPolicy or Knack-p_control or Knack-exploitation or Knack-exploration" ) else: policy = KnackBasedPolicy( a_lim_lows=env.action_space.low, a_lim_highs=env.action_space.high, mode=mode, env_spec=env.spec, K=4, hidden_layer_sizes=[layer_size, layer_size], qf=qf, reg=1e-3, squash=True) # TODO base_kwargs = dict( epoch_length=1000, n_epochs=n_epochs, # scale_reward=1, n_train_repeat=1, eval_render=False, eval_n_episodes=20, eval_deterministic=True, ) max_replay_buffer_size = buffer_size pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=max_replay_buffer_size) sampler_params = { 'max_path_length': max_path_length, 'min_pool_size': min_pool_size, 'batch_size': batch_size } sampler = NormalizeSampler( **sampler_params) if normalize_obs else SimpleSampler(**sampler_params) base_kwargs = dict(base_kwargs, sampler=sampler) algorithm = SAC(base_kwargs=base_kwargs, env=env, policy=policy, pool=pool, qf=qf, vf=vf, lr=3e-4, scale_reward=1., discount=0.99, tau=1e-2, target_update_interval=1, action_prior='uniform', save_full_state=False, dynamic_coeff=dynamic_coeff, entropy_coeff=entropy_coeff, clip_norm=clip_norm) algorithm._sess.run(tf.global_variables_initializer()) algorithm.train()
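# Illustrative only: main() above accepts either the GMMPolicy class itself or a
# string of the form 'Knack-<mode>' as policy_mode; anything else triggers the
# AssertionError. The environment and hyperparameters below are assumptions for
# the sketch, not the original experiment settings.
env = GymEnv('MountainCarContinuous-v0')
main(env=env, seed=1, entropy_coeff=0.0, n_epochs=1000, dynamic_coeff=False,
     clip_norm=None, normalize_obs=True, buffer_size=int(1e6), max_path_length=1000,
     min_pool_size=1000, batch_size=128, policy_mode='Knack-exploration')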
def run_experiment(variant): sub_level_policies_paths = [] args = arg() if args.domain == 'sawyer-reach': print("Composition Reach") goal_size = 0 sub_level_policies_paths.append("ikx") sub_level_policies_paths.append("iky") sub_level_policies_paths.append("ikz") random_arm_init = [-0.1, 0.1] render = False reward_shaping = True horizon = 250 env = normalize( CRLWrapper( IKWrapper( SawyerReach( # playable params random_arm_init=random_arm_init, has_renderer=render, reward_shaping=reward_shaping, horizon=horizon, # constant params has_offscreen_renderer=False, use_camera_obs=False, use_object_obs=True, control_freq=100, )))) ep_length = 1500 elif args.domain == 'sawyer-reach-pick': print("Composition Reach and Pick") goal_size = 3 sub_level_policies_paths.append( "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl") sub_level_policies_paths.append( "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl") render = False random_arm_init = [-0.0001, 0.0001] reward_shaping = False horizon = 1000 env = normalize( CRLWrapper( SawyerReachPick( # playable params random_arm_init=random_arm_init, has_renderer=render, reward_shaping=reward_shaping, horizon=horizon, # constant params has_offscreen_renderer=False, use_camera_obs=False, use_object_obs=True, control_freq=100, ))) ep_length = 1500 elif args.domain == 'sawyer-reach-pick-simple': print("Composition Reach and Pick Simple") goal_size = 3 sub_level_policies_paths.append( "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl") sub_level_policies_paths.append( "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl") render = False random_arm_init = [-0.0001, 0.0001] reward_shaping = False horizon = 500 env = normalize( CRLWrapper( SawyerReachPick( # playable params random_arm_init=random_arm_init, has_renderer=render, reward_shaping=reward_shaping, horizon=horizon, placement_initializer=UniformRandomSampler( x_range=[-0.01, 0.01], y_range=[-0.01, 0.01], ensure_object_boundary_in_range=False, z_rotation=None, ), # constant params has_offscreen_renderer=False, use_camera_obs=False, use_object_obs=True, control_freq=100, ))) ep_length = 3000 else: raise ValueError("Domain not available") if args.demo: pool = DemoReplayBuffer( env_spec=env.spec, max_replay_buffer_size=1e6, seq_len=len(sub_level_policies_paths), ) else: pool = SimpleReplayBuffer( env_spec=env.spec, max_replay_buffer_size=1e6, seq_len=len(sub_level_policies_paths), ) sampler = SimpleSampler( max_path_length=horizon - 1, # should be same as horizon min_pool_size=1000, batch_size=256) base_kwargs = dict( epoch_length=ep_length, n_epochs=5e3, # n_epochs=5, n_train_repeat=1, eval_render=False, eval_n_episodes=1, eval_deterministic=True, sampler=sampler, use_demos=args.demo, ) M = 128 qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1') qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2') vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M)) initial_exploration_policy = UniformPolicy(env_spec=env.spec) policy = GaussianPtrPolicy( env_spec=env.spec, hidden_layer_sizes=(M, M), reparameterize=True, reg=1e-3, ) algorithm = SAC( base_kwargs=base_kwargs, env=env, g=goal_size, policy=policy, sub_level_policies_paths=sub_level_policies_paths, initial_exploration_policy=initial_exploration_policy, pool=pool, qf1=qf1, qf2=qf2, vf=vf, lr=3e-4, scale_reward=5, discount=0.99, tau=0.005, reparameterize=True, target_update_interval=1, action_prior='uniform', save_full_state=False, ) 
    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
def run_experiment(variant): env_params = variant['env_params'] policy_params = variant['policy_params'] value_fn_params = variant['value_fn_params'] algorithm_params = variant['algorithm_params'] replay_buffer_params = variant['replay_buffer_params'] sampler_params = variant['sampler_params'] task = variant['task'] domain = variant['domain'] env = normalize(ENVIRONMENTS[domain][task](**env_params)) pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params) sampler = SimpleSampler(**sampler_params) base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler) M = value_fn_params['layer_size'] if variant['num_hidden'] != 256: M = variant['num_hidden'] qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1', batchnormvf=variant['batchnormvf']) qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2', batchnormvf=variant['batchnormvf']) vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), batchnormvf=variant['batchnormvf'], dropoutvf_keep_prob=variant['dropoutvf']) initial_exploration_policy = UniformPolicy(env_spec=env.spec) if policy_params['type'] == 'gaussian': policy = GaussianPolicy(env_spec=env.spec, hidden_layer_sizes=(M, M), reparameterize=policy_params['reparameterize'], todropoutpi=(variant['dropoutpi'] < 1.0), dropoutpi=variant['dropoutpi'], batchnormpi=variant['batchnormpi']) elif policy_params['type'] == 'lsp': nonlinearity = { None: None, 'relu': tf.nn.relu, 'tanh': tf.nn.tanh }[policy_params['preprocessing_output_nonlinearity']] preprocessing_hidden_sizes = policy_params.get( 'preprocessing_hidden_sizes') if preprocessing_hidden_sizes is not None: observations_preprocessor = MLPPreprocessor( env_spec=env.spec, layer_sizes=preprocessing_hidden_sizes, output_nonlinearity=nonlinearity) else: observations_preprocessor = None policy_s_t_layers = policy_params['s_t_layers'] policy_s_t_units = policy_params['s_t_units'] s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers bijector_config = { 'num_coupling_layers': policy_params['coupling_layers'], 'translation_hidden_sizes': s_t_hidden_sizes, 'scale_hidden_sizes': s_t_hidden_sizes, } policy = LatentSpacePolicy( env_spec=env.spec, squash=policy_params['squash'], bijector_config=bijector_config, reparameterize=policy_params['reparameterize'], q_function=qf1, observations_preprocessor=observations_preprocessor) elif policy_params['type'] == 'gmm': # reparameterize should always be False if using a GMMPolicy policy = GMMPolicy( env_spec=env.spec, K=policy_params['K'], hidden_layer_sizes=(M, M), reparameterize=policy_params['reparameterize'], qf=qf1, reg=1e-3, ) else: raise NotImplementedError(policy_params['type']) if variant['reward_scale'] < 0: scale_rew = algorithm_params['scale_reward'] else: scale_rew = variant['reward_scale'] algorithm = SAC( base_kwargs=base_kwargs, env=env, policy=policy, initial_exploration_policy=initial_exploration_policy, pool=pool, qf1=qf1, qf2=qf2, vf=vf, lr=algorithm_params['lr'] if variant['lr'] < 0 else variant['lr'], scale_reward=scale_rew, discount=algorithm_params['discount'], tau=variant['tau'], reparameterize=algorithm_params['reparameterize'], target_update_interval=algorithm_params['target_update_interval'], action_prior=policy_params['action_prior'], save_full_state=False, l1regpi=variant['l1regpi'], l2regpi=variant['l2regpi'], l1regvf=variant['l1regvf'], l2regvf=variant['l2regvf'], ent_coef=variant['ent_coef'], wclippi=variant['wclippi'], wclipvf=variant['wclipvf'], dropoutpi=variant['dropoutpi'], 
dropoutvf=variant['dropoutvf'], batchnormpi=variant['batchnormpi'], batchnormvf=variant['batchnormvf']) algorithm._sess.run(tf.global_variables_initializer()) for v in tf.trainable_variables(): print(v.name) algorithm.train() if variant['policypath'] != '': save_w_path = os.path.expanduser(variant['policypath']) toexport = [] savesess = algorithm._sess for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='gaussian_policy'): toexport.append(savesess.run(v)) np.savetxt(save_w_path, np.concatenate(toexport, axis=None), delimiter=',') if variant['valuepath'] != '': save_w_path = os.path.expanduser(variant['valuepath']) toexport = [] savesess = algorithm._sess for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='qf1'): toexport.append(savesess.run(v)) for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='qf2'): toexport.append(savesess.run(v)) for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='vf'): toexport.append(savesess.run(v)) np.savetxt(save_w_path, np.concatenate(toexport, axis=None), delimiter=',')
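# Illustrative only: the flat weight vectors written with np.savetxt above can be
# read back with np.loadtxt; splitting them into per-variable tensors would require
# the same variable ordering and shapes as the trained graph. The path below is a
# hypothetical placeholder.
import numpy as np
flat_policy_weights = np.loadtxt('/path/to/policy_weights.csv', delimiter=',')
print(flat_policy_weights.shape)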
def run_experiment(variant): sub_level_policies_paths = [] # args = parse_args() args = arg() if args.domain == 'sawyer-reach': goal_size = 0 sub_level_policies_paths.append("ikx") sub_level_policies_paths.append("iky") sub_level_policies_paths.append("ikz") random_arm_init = [-0.1, 0.1] render = False reward_shaping = True horizon = 250 env = normalize( CRLWrapper( IKWrapper( SawyerReach( # playable params random_arm_init=random_arm_init, has_renderer=render, reward_shaping=reward_shaping, horizon=horizon, # constant params has_offscreen_renderer=False, use_camera_obs=False, use_object_obs=True, control_freq=100, )))) else: raise ValueError("Domain not available") pool = SimpleReplayBuffer( env_spec=env.spec, max_replay_buffer_size=1e6, seq_len=len(sub_level_policies_paths), ) sampler = SimpleSampler( max_path_length=horizon - 1, # should be same as horizon min_pool_size=1000, batch_size=256) base_kwargs = dict( epoch_length=1000, n_epochs=2e3, # n_epochs=5, n_train_repeat=1, eval_render=False, eval_n_episodes=1, eval_deterministic=True, sampler=sampler) M = 128 qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1') qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2') vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M)) initial_exploration_policy = UniformPolicy(env_spec=env.spec) policy = GaussianPtrPolicy( env_spec=env.spec, hidden_layer_sizes=(M, M), reparameterize=True, reg=1e-3, ) algorithm = SAC( base_kwargs=base_kwargs, env=env, g=goal_size, policy=policy, sub_level_policies_paths=sub_level_policies_paths, initial_exploration_policy=initial_exploration_policy, pool=pool, qf1=qf1, qf2=qf2, vf=vf, lr=3e-4, scale_reward=5, discount=0.99, tau=0.005, reparameterize=True, target_update_interval=1, action_prior='uniform', save_full_state=False, ) algorithm._sess.run(tf.global_variables_initializer()) algorithm.train()
def run(variant):
    env = normalize(
        MultiGoalEnv(
            actuation_cost_coeff=1,
            distance_cost_coeff=0.1,
            goal_reward=1,
            init_sigma=0.1,
        ))

    pool = SimpleReplayBuffer(max_replay_buffer_size=1e6, env_spec=env.spec)
    sampler = SimpleSampler(max_path_length=30, min_pool_size=100, batch_size=64)

    base_kwargs = dict(sampler=sampler,
                       epoch_length=1000,
                       n_epochs=1000,
                       n_train_repeat=1,
                       eval_render=True,
                       eval_n_episodes=10,
                       eval_deterministic=False)

    M = 128
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    if variant['policy_type'] == 'gmm':
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[M, M],
                           qf=qf,
                           reg=0.001)
    elif variant['policy_type'] == 'lsp':
        bijector_config = {
            "scale_regularization": 0.0,
            "num_coupling_layers": 2,
            "translation_hidden_sizes": (M, ),
            "scale_hidden_sizes": (M, ),
        }
        policy = LatentSpacePolicy(env_spec=env.spec,
                                   mode="train",
                                   squash=True,
                                   bijector_config=bijector_config,
                                   observations_preprocessor=None)

    plotter = QFPolicyPlotter(qf=qf,
                              policy=policy,
                              obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
                              default_action=[np.nan, np.nan],
                              n_samples=100)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    plotter=plotter,
                    lr=3e-4,
                    scale_reward=3.0,
                    discount=0.99,
                    tau=1e-4,
                    save_full_state=True)
    algorithm.train()
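# Illustrative only: run() above reads a single key from its variant, 'policy_type',
# which selects between the GMM policy and the latent-space policy.
run({'policy_type': 'gmm'})   # or run({'policy_type': 'lsp'})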
def __init__(
        self,
        environment_name,
        algorithm_name,
        lr,
        scale_reward,
        scale_entropy,
        discount,
        tau,
        max_replay_buffer_size,
        sampler_params,
        value_func_layers_number,
        value_func_layer_size,
        policy_func_layers_number,
        policy_func_layer_size,
        base_ac_alg_params,
        q_param_list,
        use_ucb=False,
        evaluation_strategy='ensemble',
):
    """
    CG: the constructor.
    :param environment_name: the name of the environment in string.
    :param algorithm_name: the name of the AC algorithm to be used in the ensemble.
    :param lr: the learning rate to be used in the ensemble.
    :param scale_reward: the reward scaling factor.
    :param scale_entropy: the entropy scaling factor.
    :param discount: the reward discount factor.
    :param tau: the target value function updating factor.
    :param max_replay_buffer_size: the maximum size of the replay buffer.
    :param sampler_params: extra parameter settings for the random sampler.
    :param value_func_layers_number: the number of hidden layers for the value network, i.e. V function and Q function.
    :param value_func_layer_size: the number of neurons of each hidden layer of the value network.
    :param policy_func_layers_number: the number of hidden layers for the policy network.
    :param policy_func_layer_size: the number of neurons of each hidden layer of the policy network.
    :param base_ac_alg_params: base parameters for the AC algorithm.
    :param q_param_list: the list of q values for the ensemble. Each q value in the list represents one AC instance in the ensemble.
    :param use_ucb: an indicator regarding the use of UCB for selecting AC instances in the ensemble for exploration.
    :param evaluation_strategy: the strategy used for evaluation. We have two strategies available, 'ensemble' and 'best-policy'.
    """
    # Set up the environment.
    self._environment_name = environment_name
    self._env = GymEnv(self._environment_name)

    # Set up the algorithm parameters.
    self._algorithm_name = algorithm_name
    self._lr = lr
    self._scale_reward = scale_reward
    self._scale_entropy = scale_entropy
    self._discount = discount
    self._tau = tau
    self._use_ucb = use_ucb
    self._evaluation_strategy = evaluation_strategy

    # Set up the replay buffer.
    self._max_replay_buffer_size = max_replay_buffer_size
    self._pool = SimpleReplayBuffer(
        env_spec=self._env.spec,
        max_replay_buffer_size=self._max_replay_buffer_size)

    # Set up the environment sampler.
    self._sampler_params = sampler_params
    self._sampler = SimpleSampler(**self._sampler_params)

    # Set up the required number of AC instances in the ensemble.
    # Each AC instance has its own value network and policy network.
    self._alg_instances = []
    self._base_ac_params = base_ac_alg_params
    self._base_alg_params = dict(self._base_ac_params, sampler=self._sampler)

    for id, q_val in enumerate(q_param_list):
        # Set up the value function networks for an AC instance.
        qf1 = NNQFunction(
            env_spec=self._env.spec,
            hidden_layer_sizes=tuple(
                [value_func_layer_size for _ in range(value_func_layers_number)]),
            name=str(id) + 'qf1')
        qf2 = NNQFunction(
            env_spec=self._env.spec,
            hidden_layer_sizes=tuple(
                [value_func_layer_size for _ in range(value_func_layers_number)]),
            name=str(id) + 'qf2')
        vf = NNVFunction(
            env_spec=self._env.spec,
            hidden_layer_sizes=tuple(
                [value_func_layer_size for _ in range(value_func_layers_number)]),
            name=str(id) + 'vf')

        # Set up the policy network for an AC instance.
        policy = GaussianPolicy(
            env_spec=self._env.spec,
            hidden_layer_sizes=tuple(
                [policy_func_layer_size for _ in range(policy_func_layers_number)]),
            squash=True,
            reparameterize=False,
            reg=1.e-3,
            name=str(id) + 'gaussian_policy')
        initial_exploration_policy = policy

        # Set up an AC instance.
        if self._algorithm_name == 'sac':
            algorithm = SACV1(
                base_kwargs=self._base_alg_params,
                env=self._env,
                policy=policy,
                initial_exploration_policy=initial_exploration_policy,
                pool=self._pool,
                qf1=qf1,
                qf2=qf2,
                vf=vf,
                lr=self._lr,
                scale_reward=self._scale_reward,
                scale_entropy=self._scale_entropy,
                discount=self._discount,
                tau=self._tau,
                reparameterize=False,
                target_update_interval=1,
                action_prior='uniform',
                save_full_state=False,
            )
        elif self._algorithm_name == 'tac':
            algorithm = TAC(
                base_kwargs=self._base_alg_params,
                env=self._env,
                policy=policy,
                initial_exploration_policy=initial_exploration_policy,
                pool=self._pool,
                qf1=qf1,
                qf2=qf2,
                vf=vf,
                lr=self._lr,
                scale_reward=self._scale_reward,
                scale_entropy=self._scale_entropy,
                discount=self._discount,
                tau=self._tau,
                reparameterize=False,
                target_update_interval=1,
                action_prior='uniform',
                save_full_state=False,
                tsallisQ=q_val,
            )
        elif self._algorithm_name == 'rac':
            algorithm = RAC(
                base_kwargs=self._base_alg_params,
                env=self._env,
                policy=policy,
                initial_exploration_policy=initial_exploration_policy,
                pool=self._pool,
                qf1=qf1,
                qf2=qf2,
                vf=vf,
                lr=self._lr,
                scale_reward=self._scale_reward,
                scale_entropy=self._scale_entropy,
                discount=self._discount,
                tau=self._tau,
                reparameterize=False,
                target_update_interval=1,
                action_prior='uniform',
                save_full_state=False,
                renyiQ=q_val,
            )
        else:
            raise NotImplementedError

        # Initialize the AC instance.
        algorithm._sess.run(tf.global_variables_initializer())

        # Put the initialized AC instance into the algorithm instance list.
        # Each element of the algorithm instance list is made up of
        # the algorithm instance,
        # the moving average performance of the instance,
        # the number of times the instance has been used for exploration previously, and
        # the UCB bound.
        self._alg_instances.append([algorithm, 0.0, 0.0, 0.0])
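# Illustrative instantiation of this ensemble constructor. The enclosing class
# name is not shown in this file, so 'EnsembleRunner' is a placeholder, and all
# hyperparameter values below are examples only (the sampler and base AC
# settings mirror the single-instance TAC script later in this collection).
runner = EnsembleRunner(
    environment_name='LunarLanderContinuous-v2',
    algorithm_name='tac',
    lr=3e-4,
    scale_reward=1.0,
    scale_entropy=1.0,
    discount=0.99,
    tau=0.01,
    max_replay_buffer_size=1000000,
    sampler_params={'max_path_length': 1000, 'min_pool_size': 1000, 'batch_size': 256},
    value_func_layers_number=2,
    value_func_layer_size=128,
    policy_func_layers_number=2,
    policy_func_layer_size=128,
    base_ac_alg_params={'epoch_length': 1000, 'n_train_repeat': 1,
                        'n_initial_exploration_steps': 1000, 'eval_render': False,
                        'eval_n_episodes': 3, 'eval_deterministic': True},
    q_param_list=[1.0, 1.5, 2.0],  # one AC instance per q value
    use_ucb=True,
    evaluation_strategy='ensemble',
)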
def run_experiment(variant):
    domain = None
    goal_size = None
    sub_level_policies_paths = []

    if args.domain == 'ant-cross-maze':
        domain = CrossMazeAntEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/ant/fwrd/fwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/bwrd/bwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/uwrd/uwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/dwrd/dwrd.pkl")
    elif args.domain == 'ant-random-goal':
        domain = RandomGoalAntEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/ant/fwrd/fwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/bwrd/bwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/uwrd/uwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/dwrd/dwrd.pkl")
    elif args.domain == 'cheetah-hurdle':
        domain = HalfCheetahHurdleEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/hc/fwd/fwd.pkl")
        sub_level_policies_paths.append("primitive-policies/hc/jp-longz/jump.pkl")
    elif args.domain == 'pusher':
        domain = PusherEnv
        goal_size = 0
        sub_level_policies_paths.append("primitive-policies/pusher/bottom/bottom.pkl")
        sub_level_policies_paths.append("primitive-policies/pusher/left/left.pkl")

    env = normalize(domain())  # CrossMazeAntEnv()

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
        seq_len=len(sub_level_policies_paths),
    )

    sampler = SimpleSampler(max_path_length=1000,
                            min_pool_size=1000,
                            batch_size=256)

    base_kwargs = dict(epoch_length=1000,
                       n_epochs=5e3,
                       n_train_repeat=1,
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=sampler)

    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
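# run_experiment() above reads a module-level `args.domain`, which is not shown
# in this snippet. A minimal sketch of how that global might be defined,
# restricted to the four domains handled above (assumed, not from the original
# file):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--domain',
                    choices=['ant-cross-maze', 'ant-random-goal',
                             'cheetah-hurdle', 'pusher'],
                    default='ant-cross-maze')
args = parser.parse_args()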
def main(env_id, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm,
         normalize_obs, buffer_size, max_path_length, min_pool_size,
         batch_size, policy_mode, eval_model, e, stochastic):
    tf.set_random_seed(seed=seed)

    env = GymEnv(env_id)
    env.min_action = env.action_space.low[0]
    env.max_action = env.action_space.high[0]
    if hasattr(env, "seed"):
        env.seed(seed)
    else:
        env.env.seed(seed)

    # define value function
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))
    print("here")

    # use GMM policy
    if policy_mode == "GMMPolicy":
        # use GMM policy
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[layer_size, layer_size],
                           qf=qf,
                           reg=1e-3,
                           squash=True)
    elif policy_mode == "EExploitationPolicy":
        policy = EExploitationPolicy(env_spec=env.spec,
                                     K=4,
                                     hidden_layer_sizes=[layer_size, layer_size],
                                     qf=qf,
                                     reg=1e-3,
                                     squash=True,
                                     e=e)
    else:
        _, mode = str(policy_mode).split('-')
        if _ != "Knack":
            raise AssertionError(
                "policy_mode should be GMMPolicy or Knack-p_control or "
                "Knack-exploitation or Knack-exploration")
        else:
            policy = KnackBasedPolicy(a_lim_lows=env.action_space.low,
                                      a_lim_highs=env.action_space.high,
                                      mode=mode,
                                      env_spec=env.spec,
                                      K=4,
                                      hidden_layer_sizes=[layer_size, layer_size],
                                      qf=qf,
                                      vf=vf,
                                      reg=1e-3,
                                      squash=True)

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    max_replay_buffer_size = buffer_size
    pool = SimpleReplayBuffer(env_spec=env.spec,
                              max_replay_buffer_size=max_replay_buffer_size)
    sampler_params = {
        'max_path_length': max_path_length,
        'min_pool_size': min_pool_size,
        'batch_size': batch_size
    }
    sampler = NormalizeSampler(**sampler_params) if normalize_obs else SimpleSampler(**sampler_params)
    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    lr=3e-4,
                    scale_reward=1.,
                    discount=0.99,
                    tau=1e-2,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    dynamic_coeff=dynamic_coeff,
                    entropy_coeff=entropy_coeff,
                    clip_norm=clip_norm)

    algorithm._sess.run(tf.global_variables_initializer())
    # -------------- setting done ------------------------

    # -------------- main process ------------------------
    with algorithm._sess.as_default():
        algorithm._saver.restore(algorithm._sess, eval_model)

        if stochastic:
            knack_file = os.path.join(os.path.dirname(eval_model), "array/epoch0_2001.npz")
            final_knacks = np.load(knack_file)['knack_kurtosis'][-1]

        env = algorithm._env
        if hasattr(env, "env"):
            env = env.env

        # np.random.seed(seed)
        # env.seed(seed)

        num_data = 50  # num_data * nprocess == 1500
        steps_thresh = 1000
        data = {'acs': [], 'ep_rets': [], 'obs': [], 'rews': []}
        for i in range(num_data):
            obs = env.reset()
            done = False
            steps = 0
            ret = 0
            tmp_data = {'acs': [], 'obs': [], 'rews': []}
            if stochastic:
                _min = np.min(final_knacks)
                _max = np.max(final_knacks)
            print("start episode {}".format(i))
            while not done:
                steps += 1
                # env.render()
                if stochastic:
                    if hasattr(algorithm.pi, "knack_thresh"):
                        v, mean, var, kurtosis = algorithm._policy.calc_and_update_knack([obs])
                        knack_value = kurtosis[0]
                        # _min = min(knack_value, _min)
                        # _max = max(knack_value, _max)
                        knack_value = (knack_value - _min) / (_max - _min)
                        if knack_value > 0.8:  # TODO hyper param
                            print("knack {}".format(knack_value))
                            was = algorithm._policy._is_deterministic
                            algorithm._policy._is_deterministic = True
                            action, _ = algorithm.policy.get_action(obs.flatten())
                            algorithm._policy._is_deterministic = was
                        else:
                            action, _ = algorithm.policy.get_action(obs.flatten())
                    else:
                        algorithm._policy._is_deterministic = False
                        action, _ = algorithm.policy.get_action(obs.flatten())
                else:
                    if hasattr(algorithm._policy, "_is_deterministic"):
                        algorithm._policy._is_deterministic = True
                    action, _ = algorithm.policy.get_action(obs.flatten())

                obs_next, rew, done, _ = env.step(action)
                tmp_data['obs'].append(obs)
                tmp_data['acs'].append(action)
                tmp_data['rews'].append(rew)
                ret += rew
                obs = obs_next
                if steps >= steps_thresh:
                    done = True

            data['ep_rets'].append(ret)
            for k, v in tmp_data.items():
                data[k].append(v)

        # np.savez_compressed("a.npz", **data)
        # print("return mean: {}".format(np.mean(data['ep_rets'])))
        return data
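# Usage sketch (assumed entry point, not part of the original file): restore a
# trained model and summarize the evaluation rollouts returned by main(). All
# values below are illustrative; 'path/to/model' stands in for a real
# checkpoint prefix.
data = main(env_id='MountainCarContinuous-v0', seed=0, entropy_coeff=0.0,
            n_epochs=2000, dynamic_coeff=False, clip_norm=None,
            normalize_obs=False, buffer_size=int(1e6), max_path_length=1000,
            min_pool_size=1000, batch_size=128, policy_mode='GMMPolicy',
            eval_model='path/to/model', e=0.0, stochastic=False)
print("return mean: {}".format(np.mean(data['ep_rets'])))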
def run_experiment(variant):
    low_level_policy = load_low_level_policy(
        policy_path=variant['low_level_policy_path'])

    env_name = variant['env_name']
    env_type = env_name.split('-')[-1]

    env_args = {
        name.replace('env_', '', 1): value
        for name, value in variant.items()
        if name.startswith('env_') and name != 'env_name'
    }

    if 'random-goal' in env_name:
        EnvClass = RANDOM_GOAL_ENVS[env_type]
    elif 'rllab' in variant['env_name']:
        EnvClass = RLLAB_ENVS[variant['env_name']]
    else:
        raise NotImplementedError

    base_env = normalize(EnvClass(**env_args))
    env = HierarchyProxyEnv(wrapped_env=base_env,
                            low_level_policy=low_level_policy)

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    sampler = SimpleSampler(max_path_length=variant['max_path_length'],
                            min_pool_size=variant['max_path_length'],
                            batch_size=variant['batch_size'])

    base_kwargs = dict(epoch_length=variant['epoch_length'],
                       n_epochs=variant['n_epochs'],
                       n_train_repeat=variant['n_train_repeat'],
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=sampler)

    M = variant['layer_size']
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    preprocessing_hidden_sizes = variant.get('preprocessing_hidden_sizes')
    observations_preprocessor = (
        MLPPreprocessor(env_spec=env.spec,
                        layer_sizes=preprocessing_hidden_sizes,
                        name='high_level_observations_preprocessor')
        if preprocessing_hidden_sizes is not None else None)

    policy_s_t_layers = variant['policy_s_t_layers']
    policy_s_t_units = variant['policy_s_t_units']
    s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

    bijector_config = {
        "scale_regularization": 0.0,
        "num_coupling_layers": variant['policy_coupling_layers'],
        "translation_hidden_sizes": s_t_hidden_sizes,
        "scale_hidden_sizes": s_t_hidden_sizes,
    }

    policy = LatentSpacePolicy(
        env_spec=env.spec,
        mode="train",
        squash=False,
        bijector_config=bijector_config,
        q_function=qf,
        fix_h_on_reset=variant.get('policy_fix_h_on_reset', False),
        observations_preprocessor=observations_preprocessor,
        name="high_level_policy")

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],
        target_update_interval=variant['target_update_interval'],
        action_prior=variant['action_prior'],
        save_full_state=False,
    )

    algorithm.train()
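# Illustrative variant for the hierarchical setup above. The keys are exactly
# those read by run_experiment(); the environment name, policy path, and
# hyperparameter values are placeholders, not the project's actual defaults.
example_variant = {
    'low_level_policy_path': 'path/to/low_level_policy.pkl',
    'env_name': 'random-goal-ant',   # routed through RANDOM_GOAL_ENVS['ant']
    'max_pool_size': 1e6,
    'max_path_length': 1000,
    'batch_size': 256,
    'epoch_length': 1000,
    'n_epochs': 3000,
    'n_train_repeat': 1,
    'layer_size': 128,
    'preprocessing_hidden_sizes': None,
    'policy_s_t_layers': 1,
    'policy_s_t_units': 128,
    'policy_coupling_layers': 2,
    'policy_fix_h_on_reset': False,
    'lr': 3e-4,
    'scale_reward': 1.0,
    'discount': 0.99,
    'tau': 0.01,
    'target_update_interval': 1,
    'action_prior': 'uniform',
}
# run_experiment(example_variant)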
def run_experiment(env, seed, scale_reward, scale_entropy, tsallisQ, num_of_train):
    tf.set_random_seed(seed)

    environmentName = env
    # environmentName = "LunarLanderContinuous-v2"
    print("Experiment: {}".format(environmentName))

    # Set up the PyBullet environment.
    # env = normalize(gym.make(environmentName))
    env = GymEnv(environmentName)

    # Set up the replay buffer.
    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=1000000)

    # Set up the sampler.
    sampler_params = {
        'max_path_length': 1000,
        'min_pool_size': 1000,
        'batch_size': 256,
    }
    sampler = SimpleSampler(**sampler_params)

    # Set up the value function networks.
    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    # Set up the policy network.
    # initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=False,
        reg=1e-3,
    )
    # policy = GMMPolicy(
    #     env_spec=env.spec,
    #     K=1,
    #     hidden_layer_sizes=(M, M),
    #     reparameterize=False,
    #     qf=qf1,
    #     reg=1.0e-3,
    # )
    initial_exploration_policy = policy

    base_kwargs = {
        'epoch_length': 1000,
        'n_train_repeat': num_of_train,
        'n_initial_exploration_steps': 1000,
        'eval_render': False,
        'eval_n_episodes': 3,
        'eval_deterministic': True,
    }
    base_kwargs = dict(base_kwargs, sampler=sampler)

    # Define a function for reward scaling.
    def incrementor(itr):
        return 0.5 + (0.8 - 0.5) * tf.minimum(itr / 500000., 1.0)

    def decrementor(itr):
        return 0.8 - (0.8 - 0.6) * tf.minimum(itr / 500000., 1.0)

    algorithm = TAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3.0e-4,
        scale_reward=scale_reward,    # CG: default 1.0, 0.5 for the lunar lander problem, 3.0 for the pendulum problem.
        scale_entropy=scale_entropy,  # CG: default 1.0, 0.8 for the lunar lander problem.
        discount=0.99,
        tau=0.01,
        reparameterize=False,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
        tsallisQ=tsallisQ,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
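# Usage sketch with illustrative values: the 0.5/0.8 scalings follow the
# lunar-lander defaults mentioned in the inline comments above, while the
# tsallisQ value is purely a placeholder.
if __name__ == "__main__":
    run_experiment(env='LunarLanderContinuous-v2', seed=0, scale_reward=0.5,
                   scale_entropy=0.8, tsallisQ=2.0, num_of_train=1)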
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    constants.COST_TYPE = variant['algorithm_params']['cost_type']

    # Register the MECS environment variants with Gym.
    register(id='MECS-v1', entry_point='sac.envs.environment_V_sweep:MEC_v1', max_episode_steps=5000)
    register(id='MECS-v2', entry_point='sac.envs.env_V_sweep_v2:MEC_v2', max_episode_steps=5000)
    register(id='MECS-v3', entry_point='sac.envs.env_V_sweep_v3:MEC_v3', max_episode_steps=5000)
    register(id='MECS-v4', entry_point='sac.envs.env_V_sweep_v4:MEC_v4', max_episode_steps=5000)
    register(id='MECS-v5', entry_point='sac.envs.env_V_sweep_v5:MEC_v5', max_episode_steps=5000)
    register(id='MECS-v6', entry_point='sac.envs.env_V_sweep_v6:MEC_v6', max_episode_steps=5000)
    register(id='MECS-v61', entry_point='sac.envs.env_V_sweep_v6_with_a:MEC_v6', max_episode_steps=5000)
    register(id='MECS-v7', entry_point='sac.envs.env_V_sweep_v7_new:MEC_v7', max_episode_steps=5000)
    register(id='MECS-v8', entry_point='sac.envs.env_V_sweep_v8_new:MEC_v8', max_episode_steps=5000)
    register(id='MECS-v9', entry_point='sac.envs.env_V_sweep_v9:MEC_v9', max_episode_steps=5000)
    register(id='MECS-v10', entry_point='sac.envs.env_V_sweep_v10:MEC_v10', max_episode_steps=5000)
    register(id='MECS-v11', entry_point='sac.envs.env_V_sweep_v11:MEC_v11', max_episode_steps=5000)
    register(id='MECS-v12', entry_point='sac.envs.env_V_sweep_v12:MEC_v12', max_episode_steps=5000)
    register(id='MECS-v13', entry_point='sac.envs.env_V_sweep_v13:MEC_v13', max_episode_steps=5000)
    register(id='MECS-v14', entry_point='sac.envs.env_V_sweep_v14:MEC_v14', max_episode_steps=5000)
    register(id='MECS-v15', entry_point='sac.envs.env_V_sweep_v15:MEC_v15', max_episode_steps=5000)
    register(id='MECS-v16', entry_point='sac.envs.env_V_sweep_v16:MEC_v16', max_episode_steps=5000)
    register(id='MECS-v17', entry_point='sac.envs.env_V_sweep_v17:MEC_v17', max_episode_steps=5000)
    register(id='MECS-v18', entry_point='sac.envs.env_V_sweep_v18:MEC_v18', max_episode_steps=5000)
    register(id='MECS-v19', entry_point='sac.envs.env_V_sweep_v19:MEC_v19', max_episode_steps=5000)
    register(id='MECS-v20', entry_point='sac.envs.env_V_sweep_v20:MEC_v20', max_episode_steps=5000)
    register(id='MECS-v21', entry_point='sac.envs.env_V_sweep_v21:MEC_v21', max_episode_steps=5000)
    register(id='MECS-v22', entry_point='sac.envs.env_V_sweep_v22:MEC_v22', max_episode_steps=5000)
    register(id='MECS-v23', entry_point='sac.envs.env_V_sweep_v23:MEC_v23', max_episode_steps=5000)
    register(id='MECS-v24', entry_point='sac.envs.env_V_sweep_v24:MEC_v24', max_episode_steps=5000)
    register(id='MECS-v25', entry_point='sac.envs.env_V_sweep_v25:MEC_v25', max_episode_steps=5000)
    register(id='MECS-v26', entry_point='sac.envs.env_V_sweep_v26:MEC_v26', max_episode_steps=5000)
    register(id='MECS-v27', entry_point='sac.envs.env_V_sweep_v27:MEC_v27', max_episode_steps=5000)
    register(id='MECS-v28', entry_point='sac.envs.env_V_sweep_v28:MEC_v28', max_episode_steps=5000)
    register(id='MECS-v29', entry_point='sac.envs.env_V_sweep_v29:MEC_v29', max_episode_steps=5000)
    register(id='MECS-v30', entry_point='sac.envs.env_V_sweep_v30:MEC_v30', max_episode_steps=5000)

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)
    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
            env_spec=env.spec,
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale'] * algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
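# Illustrative sketch of a variant dict matching the keys read above, shown for
# the 'gaussian' policy branch. The domain/task names and all hyperparameter
# values are placeholders, not the project's actual configuration.
example_variant = {
    'domain': 'mecs', 'task': 'v1',          # must index ENVIRONMENTS[domain][task]
    'env_params': {},
    'policy_params': {'type': 'gaussian', 'reparameterize': True,
                      'action_prior': 'uniform'},
    'value_fn_params': {'layer_size': 128},
    'replay_buffer_params': {'max_replay_buffer_size': 1e6},
    'sampler_params': {'max_path_length': 1000, 'min_pool_size': 1000,
                       'batch_size': 256},
    'algorithm_params': {
        'cost_type': 'default',              # stored in constants.COST_TYPE
        'base_kwargs': {'epoch_length': 1000, 'n_epochs': 2000,
                        'n_train_repeat': 1, 'eval_render': False,
                        'eval_n_episodes': 1, 'eval_deterministic': True},
        'lr': 3e-4, 'scale': 1.0, 'scale_reward': 1.0, 'discount': 0.99,
        'tau': 0.005, 'reparameterize': True, 'target_update_interval': 1,
    },
}
# run_experiment(example_variant)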
def main(root_dir):
    # tf.set_random_seed(seed=seed)
    # env = GymEnv('MountainCarContinuous-v0')
    env = GymEnv('MountainCarContinuousColor-v0')

    max_replay_buffer_size = int(1e6)
    sampler_params = {'max_path_length': 1000, 'min_pool_size': 1000, 'batch_size': 128}
    # TODO Normalize or not
    sampler = SimpleSampler(**sampler_params)

    entropy_coeff = 0.
    dynamic_coeff = True

    # define value function
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))

    # use GMM policy
    policy = GMMPolicy(
        env_spec=env.spec,
        K=4,
        hidden_layer_sizes=[layer_size, layer_size],
        qf=qf,
        reg=1e-3,
        squash=True
    )

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=10,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    pool = SimpleReplayBuffer(env_spec=env.spec,
                              max_replay_buffer_size=max_replay_buffer_size)
    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=3e-4,
        scale_reward=1.,
        discount=0.99,
        tau=1e-2,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
        dynamic_coeff=dynamic_coeff,
        entropy_coeff=entropy_coeff
    )

    algorithm._sess.run(tf.global_variables_initializer())

    # TODO Normalize or not
    # Currently only MountainCar is available
    with algorithm._sess.as_default():
        model_file = os.path.join(root_dir, 'model')
        algorithm._saver.restore(algorithm._sess, model_file)

        for i in range(1):
            obs = env.reset()
            env.env.render()
            sleep(4.0)
            traj = [obs]
            done = False
            while not done:
                env.env.render()
                action, _ = algorithm.policy.get_action(obs.flatten())
                obs, rew, done, _ = env.step(action)
                traj.append(obs.flatten())

            knack, knack_kurtosis = sub_goal_detect(algorithm, traj)
            idxs = np.argsort(knack_kurtosis)
            # idxs = np.argsort(knack)
            print(idxs[::-1])

            COL = MplColorHelper('Blues', np.min(knack_kurtosis), np.max(knack_kurtosis))
            for j, s in enumerate(traj):
                env.env.state = np.array(traj[j])
                rgba = COL.get_rgb(knack_kurtosis[j])
                env.env.render(car_rgba=rgba)
                sleep(1.0)

            for idx in idxs[::-1]:
                obs = env.reset()
                env.env.state = np.array(traj[0])
                rgba = COL.get_rgb(knack_kurtosis[0])
                env.env.render(car_rgba=rgba)
                for j in range(idx + 1):
                    env.env.state = np.array(traj[j])
                    rgba = COL.get_rgb(knack_kurtosis[j])
                    # env.env.viewer.geoms[1].set_color(*(0.0, 0.0, 1.0))
                    env.env.render(car_rgba=rgba)
                    sleep(0.5)
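# Usage sketch (assumed entry point and layout, not from the original file):
# root_dir is expected to contain the TensorFlow checkpoint files saved under
# the prefix '<root_dir>/model' by the corresponding training script.
if __name__ == "__main__":
    import sys
    main(root_dir=sys.argv[1])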