def __init__(self, root_dir):
    self.map_files = glob(os.path.join(root_dir, 'maps/*.npz'))
    data = log_reader(os.path.join(root_dir, 'log.json'))
    self.total_steps = data['total_step']
    self.average_return = data['eval_average_return']
    self.eval_terminal_states = data['eval_terminal_states']
    self.train_terminal_states = data['train_terminal_states']

    from sac.envs import GymEnv
    env = GymEnv('MountainCarContinuousColor-v0')
    low_state = env.env.low_state
    high_state = env.env.high_state
    # the y range is reversed so the map is drawn with the origin at the top left
    extent = [low_state[0], high_state[0], high_state[1], low_state[1]]
    aspect = (high_state[0] - low_state[0]) / (high_state[1] - low_state[1])

    # figure configuration
    plt.style.use('mystyle3')
    self.fig, self.axes = plt.subplots(2, 2, sharex='col', sharey='row')
    title = [
        'average relative Q(s,a)',
        'relative V(s)',
        'relative knack map',
        'relative knack map kurtosis',
    ]
    for t, ax in zip(title, self.axes.flatten()):
        ax.set_title(t)

    # prepare the updatable maps.
    # Setting vmin and vmax is important: each array is normalized into (0., 1.)
    # before visualization, and extent sets the tick-label range.
    tmp = np.zeros([25, 25])
    self.im = np.array([
        ax.imshow(tmp, cmap='Blues', animated=True, vmin=0., vmax=1., extent=extent)
        for ax in self.axes.flatten()
    ]).reshape(2, 2)
    for ax in self.axes.flatten():
        ax.set_aspect(aspect)
        # ax.set_aspect('equal')

    self.frame_skip = 2
def init_figure(self, env_id):
    from sac.envs import GymEnv
    env = GymEnv(env_id)
    low_state = env.env.low_state
    high_state = env.env.high_state
    # range of the x and y coordinates; the y range is reversed for visualization
    self.range = [
        [low_state[0], high_state[0]],
        [low_state[1], high_state[1]],
    ]
    extent = [low_state[0], high_state[0], high_state[1], low_state[1]]
    aspect = (high_state[0] - low_state[0]) / (high_state[1] - low_state[1])

    # figure configuration
    plt.style.use('mystyle3')
    self.fig, self.axes = plt.subplots(2, 2, sharex='col', sharey='row')
    title = [
        'average relative Q(s,a)',
        'relative V(s)',
        'relative knack map',
        'relative knack map kurtosis',
    ]
    for t, ax in zip(title, self.axes.flatten()):
        ax.set_title(t)

    # prepare the updatable maps.
    # Setting vmin and vmax is important: each array is normalized into (0., 1.)
    # before visualization, and extent sets the tick-label range.
    tmp = np.zeros([25, 25])
    self.im = np.array([
        ax.imshow(tmp, cmap='Blues', animated=True, vmin=0., vmax=1., extent=extent)
        for ax in self.axes.flatten()
    ]).reshape(2, 2)
    for ax in self.axes.flatten():
        ax.set_aspect(aspect)
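# --- Hedged sketch (not part of the original class): one way the four panels created in
# init_figure could be refreshed for a single animation frame. The min-max normalization
# into (0., 1.) matches the vmin=0. / vmax=1. passed to imshow above; `images` stands for
# the flattened self.im array and `frame_arrays` for four 2-D maps loaded from maps/*.npz.
# Both names are illustrative, not identifiers from the repo.
import numpy as np

def normalize01(arr):
    """Scale an array into [0, 1]; a constant array maps to all zeros."""
    lo, hi = float(np.min(arr)), float(np.max(arr))
    return np.zeros_like(arr, dtype=float) if hi == lo else (arr - lo) / (hi - lo)

def update_panels(images, frame_arrays):
    """Push one frame of data into the AxesImage artists created by imshow."""
    for im, arr in zip(images, frame_arrays):
        im.set_data(normalize01(arr))
    return list(images)  # the artist list a matplotlib FuncAnimation callback would return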
        data['ep_rets'].append(ret)
        for k, v in tmp_data.items():
            data[k].append(v)

    np.savez_compressed("a.npz", **data)
    print("return mean: {}".format(np.mean(data['ep_rets'])))


if __name__ == '__main__':
    args = parse_args()

    # set environment
    seed = args['seed']
    env_id = args.pop('env_id')
    env = GymEnv(env_id)

    # set log directory
    root_dir = args.pop('root_dir')
    opt_log_name = args.pop('opt_log_name')
    logger2 = mylogger.get_logger()
    if args['eval_model'] is None:
        env_id = env.env_id
        # set log
        current_log_dir = root_dir
        logger2.set_log_dir(current_log_dir, exist_ok=True)
        logger2.set_save_array_flag(args.pop("save_array_flag"))
        if args["use_optuna"]:
            logger.set_level(logger.DISABLED)
        else:
            logger.configure(dir=current_log_dir, enable_std_out=False)
from sac.misc.instrument import run_sac_experiment
from sac.misc.utils import timestamp, unflatten
from sac.policies import GaussianPolicy, LatentSpacePolicy, GMMPolicy, UniformPolicy
from sac.misc.sampler import SimpleSampler
from sac.replay_buffers import SimpleReplayBuffer
from sac.value_functions import NNQFunction, NNVFunction
from sac.preprocessors import MLPPreprocessor
from examples.variants_re_1 import parse_domain_and_task, get_variants, get_variants_delayed
from sac.envs import constants
from gym.envs.registration import register

DELAY_CONST = 20

ENVIRONMENTS = {
    'swimmer-gym': {
        'default': lambda: GymEnv('Swimmer-v1'),
        'delayed': lambda: GymEnvDelayed('Swimmer-v1', delay=DELAY_CONST),
    },
    'swimmer-rllab': {
        'default': SwimmerEnv,
        'multi-direction': MultiDirectionSwimmerEnv,
    },
    'ant': {
        'default': lambda: GymEnv('Ant-v1'),
        'multi-direction': MultiDirectionAntEnv,
        'cross-maze': CrossMazeAntEnv,
        'delayed': lambda: GymEnvDelayed('Ant-v1', delay=DELAY_CONST),
    },
    'humanoid-gym': {
        'default': lambda: GymEnv('Humanoid-v1'),
        'delayed': lambda: GymEnvDelayed('Humanoid-v1', delay=DELAY_CONST),
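# --- Hedged sketch: GymEnvDelayed is referenced above but not defined in this excerpt.
# A common reading of a "delayed" variant is a wrapper that withholds the per-step reward
# and releases the accumulated sum every `delay` steps (and at episode end). The class
# below is an illustrative stand-in under that assumption, not the repo's implementation.
import gym

class DelayedRewardWrapper(gym.Wrapper):
    def __init__(self, env, delay=DELAY_CONST):
        super(DelayedRewardWrapper, self).__init__(env)
        self._delay = delay
        self._acc = 0.0
        self._t = 0

    def reset(self, **kwargs):
        self._acc, self._t = 0.0, 0
        return self.env.reset(**kwargs)

    def step(self, action):
        obs, rew, done, info = self.env.step(action)
        self._acc += rew
        self._t += 1
        if done or self._t % self._delay == 0:
            rew, self._acc = self._acc, 0.0  # release the withheld reward
        else:
            rew = 0.0
        return obs, rew, done, info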
def main(env_id, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm, normalize_obs,
         buffer_size, max_path_length, min_pool_size, batch_size, policy_mode, eval_model,
         e, stochastic):
    tf.set_random_seed(seed=seed)

    # set up the environment
    env = GymEnv(env_id)
    env.min_action = env.action_space.low[0]
    env.max_action = env.action_space.high[0]
    if hasattr(env, "seed"):
        env.seed(seed)
    else:
        env.env.seed(seed)

    # define value functions
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))

    # define the policy
    if policy_mode == "GMMPolicy":
        # use GMM policy
        policy = GMMPolicy(env_spec=env.spec, K=4, hidden_layer_sizes=[layer_size, layer_size],
                           qf=qf, reg=1e-3, squash=True)
    elif policy_mode == "EExploitationPolicy":
        policy = EExploitationPolicy(env_spec=env.spec, K=4,
                                     hidden_layer_sizes=[layer_size, layer_size],
                                     qf=qf, reg=1e-3, squash=True, e=e)
    else:
        prefix, mode = str(policy_mode).split('-')
        if prefix != "Knack":
            raise AssertionError(
                "policy_mode should be GMMPolicy or Knack-p_control or "
                "Knack-exploitation or Knack-exploration")
        policy = KnackBasedPolicy(a_lim_lows=env.action_space.low,
                                  a_lim_highs=env.action_space.high,
                                  mode=mode, env_spec=env.spec, K=4,
                                  hidden_layer_sizes=[layer_size, layer_size],
                                  qf=qf, vf=vf, reg=1e-3, squash=True)

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    max_replay_buffer_size = buffer_size
    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=max_replay_buffer_size)
    sampler_params = {'max_path_length': max_path_length,
                      'min_pool_size': min_pool_size,
                      'batch_size': batch_size}
    sampler = NormalizeSampler(**sampler_params) if normalize_obs else SimpleSampler(**sampler_params)
    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(
        base_kwargs=base_kwargs, env=env, policy=policy, pool=pool, qf=qf, vf=vf,
        lr=3e-4, scale_reward=1., discount=0.99, tau=1e-2, target_update_interval=1,
        action_prior='uniform', save_full_state=False,
        dynamic_coeff=dynamic_coeff, entropy_coeff=entropy_coeff, clip_norm=clip_norm)

    algorithm._sess.run(tf.global_variables_initializer())
    # -------------- setting done ------------------------

    # -------------- main process ------------------------
    with algorithm._sess.as_default():
        algorithm._saver.restore(algorithm._sess, eval_model)

        if stochastic:
            knack_file = os.path.join(os.path.dirname(eval_model), "array/epoch0_2001.npz")
            final_knacks = np.load(knack_file)['knack_kurtosis'][-1]

        env = algorithm._env
        if hasattr(env, "env"):
            env = env.env

        # np.random.seed(seed)
        # env.seed(seed)

        num_data = 50  # num_data * nprocess == 1500
        steps_thresh = 1000
        data = {'acs': [], 'ep_rets': [], 'obs': [], 'rews': []}
        for i in range(num_data):
            obs = env.reset()
            done = False
            steps = 0
            ret = 0
            tmp_data = {'acs': [], 'obs': [], 'rews': []}
            if stochastic:
                _min = np.min(final_knacks)
                _max = np.max(final_knacks)
            print("start episode {}".format(i))
            while not done:
                steps += 1
                # env.render()
                if stochastic:
                    if hasattr(algorithm.pi, "knack_thresh"):
                        v, mean, var, kurtosis = algorithm._policy.calc_and_update_knack([obs])
                        knack_value = kurtosis[0]
                        # _min = min(knack_value, _min)
                        # _max = max(knack_value, _max)
                        knack_value = (knack_value - _min) / (_max - _min)
                        if knack_value > 0.8:  # TODO: hyperparameter
                            print("knack {}".format(knack_value))
                            was = algorithm._policy._is_deterministic
                            algorithm._policy._is_deterministic = True
                            action, _ = algorithm.policy.get_action(obs.flatten())
                            algorithm._policy._is_deterministic = was
                        else:
                            action, _ = algorithm.policy.get_action(obs.flatten())
                    else:
                        algorithm._policy._is_deterministic = False
                        action, _ = algorithm.policy.get_action(obs.flatten())
                else:
                    if hasattr(algorithm._policy, "_is_deterministic"):
                        algorithm._policy._is_deterministic = True
                    action, _ = algorithm.policy.get_action(obs.flatten())

                obs_next, rew, done, _ = env.step(action)
                tmp_data['obs'].append(obs)
                tmp_data['acs'].append(action)
                tmp_data['rews'].append(rew)
                ret += rew
                obs = obs_next
                if steps >= steps_thresh:
                    done = True

            data['ep_rets'].append(ret)
            for k, v in tmp_data.items():
                data[k].append(v)

        # np.savez_compressed("a.npz", **data)
        # print("return mean: {}".format(np.mean(data['ep_rets'])))
        return data
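# --- Hedged sketch: the stochastic branch above reduces to this decision rule. The 0.8
# threshold and the min/max taken from the final epoch's kurtosis values follow the code
# above; the function name itself is illustrative.
def is_knack_state(kurtosis_value, knack_min, knack_max, thresh=0.8):
    """True when the min-max normalized knack kurtosis exceeds the threshold,
    i.e. when the policy should act deterministically (exploit the knack)."""
    normalized = (kurtosis_value - knack_min) / (knack_max - knack_min)
    return normalized > thresh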
def main(root_dir, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm, regularize):
    tf.set_random_seed(seed=seed)

    env = GymEnv('MountainCarContinuous-v0')
    env.min_action = env.action_space.low[0]
    env.max_action = env.action_space.high[0]
    env.env.seed(seed)

    max_replay_buffer_size = int(1e6)
    sampler_params = {'max_path_length': 1000, 'min_pool_size': 1000, 'batch_size': 128}
    sampler = SimpleSampler(**sampler_params)

    # env_id = 'ContinuousSpaceMaze{}_{}_RB{}_entropy_{}__Normalize'.format(goal[0], goal[1], max_replay_buffer_size, entropy_coeff)
    env_id = 'MountainCarContinuous_RB1e6_entropy{}_epoch{}__Normalize_uniform'.format(entropy_coeff, n_epochs)
    env_id = env_id + '_dynamicCoeff' if dynamic_coeff else env_id

    os.makedirs(root_dir, exist_ok=True)
    env_dir = os.path.join(root_dir, env_id)
    os.makedirs(env_dir, exist_ok=True)
    current_log_dir = os.path.join(env_dir, 'seed{}'.format(seed))
    mylogger.make_log_dir(current_log_dir)

    # env_id = 'Test'
    print(env_id)
    print('environment set done')

    # define value functions
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))

    # use GMM policy
    policy = GMMPolicy(
        env_spec=env.spec,
        K=4,
        hidden_layer_sizes=[layer_size, layer_size],
        qf=qf,
        reg=1e-3,
        squash=True
    )

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=max_replay_buffer_size)
    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(
        base_kwargs=base_kwargs, env=env, policy=policy, pool=pool, qf=qf, vf=vf,
        lr=3e-4, scale_reward=1., discount=0.99, tau=1e-2, target_update_interval=1,
        action_prior='uniform', save_full_state=False,
        dynamic_coeff=dynamic_coeff, entropy_coeff=entropy_coeff, clip_norm=clip_norm,
    )

    # name = env_id + datetime.now().strftime("-%m%d-%Hh-%Mm-%ss")
    # mylogger.make_log_dir(name)

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
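# --- Hedged usage sketch (not in the original file): one way main() above might be invoked.
# The argument values are placeholders for illustration, not the repo's recommended settings.
if __name__ == '__main__':
    main(root_dir='./results', seed=1, entropy_coeff=0.0, n_epochs=2000,
         dynamic_coeff=False, clip_norm=None, regularize=False)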
def __init__(self, environment_name, algorithm_name, lr, scale_reward, scale_entropy,
             discount, tau, max_replay_buffer_size, sampler_params,
             value_func_layers_number, value_func_layer_size,
             policy_func_layers_number, policy_func_layer_size,
             base_ac_alg_params, q_param_list, use_ucb=False,
             evaluation_strategy='ensemble'):
    """
    CG: the constructor.
    :param environment_name: the name of the environment, as a string.
    :param algorithm_name: the name of the AC algorithm to be used in the ensemble.
    :param lr: the learning rate to be used in the ensemble.
    :param scale_reward: the reward scaling factor.
    :param scale_entropy: the entropy scaling factor.
    :param discount: the reward discount factor.
    :param tau: the target value function updating factor.
    :param max_replay_buffer_size: the maximum size of the replay buffer.
    :param sampler_params: extra parameter settings for the random sampler.
    :param value_func_layers_number: the number of hidden layers for the value networks, i.e. the V function and the Q function.
    :param value_func_layer_size: the number of neurons in each hidden layer of the value networks.
    :param policy_func_layers_number: the number of hidden layers for the policy network.
    :param policy_func_layer_size: the number of neurons in each hidden layer of the policy network.
    :param base_ac_alg_params: base parameters for the AC algorithm.
    :param q_param_list: the list of q values for the ensemble. Each q value in the list represents one AC instance in the ensemble.
    :param use_ucb: whether UCB is used to select AC instances in the ensemble for exploration.
    :param evaluation_strategy: the strategy used for evaluation. Two strategies are available, 'ensemble' and 'best-policy'.
    """
    # Set up the environment.
    self._environment_name = environment_name
    self._env = GymEnv(self._environment_name)

    # Set up the algorithm parameters.
    self._algorithm_name = algorithm_name
    self._lr = lr
    self._scale_reward = scale_reward
    self._scale_entropy = scale_entropy
    self._discount = discount
    self._tau = tau
    self._use_ucb = use_ucb
    self._evaluation_strategy = evaluation_strategy

    # Set up the replay buffer.
    self._max_replay_buffer_size = max_replay_buffer_size
    self._pool = SimpleReplayBuffer(env_spec=self._env.spec,
                                    max_replay_buffer_size=self._max_replay_buffer_size)

    # Set up the environment sampler.
    self._sampler_params = sampler_params
    self._sampler = SimpleSampler(**self._sampler_params)

    # Set up the required number of AC instances in the ensemble.
    # Each AC instance has its own value networks and policy network.
    self._alg_instances = []
    self._base_ac_params = base_ac_alg_params
    self._base_alg_params = dict(self._base_ac_params, sampler=self._sampler)
    for id, q_val in enumerate(q_param_list):
        # Set up the value function networks for an AC instance.
        qf1 = NNQFunction(env_spec=self._env.spec,
                          hidden_layer_sizes=tuple([value_func_layer_size for _ in range(value_func_layers_number)]),
                          name=str(id) + 'qf1')
        qf2 = NNQFunction(env_spec=self._env.spec,
                          hidden_layer_sizes=tuple([value_func_layer_size for _ in range(value_func_layers_number)]),
                          name=str(id) + 'qf2')
        vf = NNVFunction(env_spec=self._env.spec,
                         hidden_layer_sizes=tuple([value_func_layer_size for _ in range(value_func_layers_number)]),
                         name=str(id) + 'vf')

        # Set up the policy network for an AC instance.
        policy = GaussianPolicy(env_spec=self._env.spec,
                                hidden_layer_sizes=tuple([policy_func_layer_size for _ in range(policy_func_layers_number)]),
                                squash=True, reparameterize=False, reg=1.e-3,
                                name=str(id) + 'gaussian_policy')
        initial_exploration_policy = policy

        # Set up an AC instance.
        if self._algorithm_name == 'sac':
            algorithm = SACV1(
                base_kwargs=self._base_alg_params, env=self._env, policy=policy,
                initial_exploration_policy=initial_exploration_policy, pool=self._pool,
                qf1=qf1, qf2=qf2, vf=vf,
                lr=self._lr, scale_reward=self._scale_reward, scale_entropy=self._scale_entropy,
                discount=self._discount, tau=self._tau, reparameterize=False,
                target_update_interval=1, action_prior='uniform', save_full_state=False,
            )
        elif self._algorithm_name == 'tac':
            algorithm = TAC(
                base_kwargs=self._base_alg_params, env=self._env, policy=policy,
                initial_exploration_policy=initial_exploration_policy, pool=self._pool,
                qf1=qf1, qf2=qf2, vf=vf,
                lr=self._lr, scale_reward=self._scale_reward, scale_entropy=self._scale_entropy,
                discount=self._discount, tau=self._tau, reparameterize=False,
                target_update_interval=1, action_prior='uniform', save_full_state=False,
                tsallisQ=q_val,
            )
        elif self._algorithm_name == 'rac':
            algorithm = RAC(
                base_kwargs=self._base_alg_params, env=self._env, policy=policy,
                initial_exploration_policy=initial_exploration_policy, pool=self._pool,
                qf1=qf1, qf2=qf2, vf=vf,
                lr=self._lr, scale_reward=self._scale_reward, scale_entropy=self._scale_entropy,
                discount=self._discount, tau=self._tau, reparameterize=False,
                target_update_interval=1, action_prior='uniform', save_full_state=False,
                renyiQ=q_val,
            )
        else:
            raise NotImplementedError

        # Initialize the AC instance.
        # algorithm._sess.run(tf.global_variables_initializer())

        # Put the initialized AC instance into the algorithm instance list.
        # Each element of the algorithm instance list is made up of:
        #   the algorithm instance,
        #   the moving-average performance of the instance,
        #   the number of times the instance has been used for exploration previously, and
        #   the UCB bound.
        self._alg_instances.append([algorithm, 0.0, 0.0, 0.0])

    # Set up the ensemble Q-function for action selection.
    self._Q_ensemble = NNQFunction(env_spec=self._env.spec,
                                   hidden_layer_sizes=tuple([value_func_layer_size for _ in range(value_func_layers_number)]),
                                   name='ensqf')

    # ========================================================================
    # Set up the training target for the ensemble Q-function for action selection.
    # ========================================================================
    # Create the observation placeholder.
    self._observations_ens_ph = tf.placeholder(
        tf.float32,
        shape=(None, self._env.spec.observation_space.flat_dim),
        name='obv_ens',
    )

    # Create the next observation placeholder.
    self._observations_ens_next_ph = tf.placeholder(
        tf.float32,
        shape=(None, self._env.spec.observation_space.flat_dim),
        name='next_obv_ens',
    )

    # Create a list of next-action placeholders, one per AC instance.
    self._acts_next_phs = []
    for i in range(len(q_param_list)):
        act_ens_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.action_space.flat_dim),
            name=str(i) + '_next_act_ens',
        )
        self._acts_next_phs.append(act_ens_ph)

    # Create the observed action placeholder.
    self._obv_act_ph = tf.placeholder(
        tf.float32,
        shape=(None, self._env.spec.action_space.flat_dim),
        name='act_obv_ens',
    )

    # Create the reward placeholder.
    self._rewards_ph = tf.placeholder(
        tf.float32,
        shape=(None,),
        name='rew_ens',
    )

    # Create the terminal placeholder.
    self._terminals_ph = tf.placeholder(
        tf.float32,
        shape=(None,),
        name='ter_ens',
    )

    # Determine the target Q-value for the next step:
    # the elementwise maximum over the per-instance next-action targets.
    self._q_ens_targets = []
    for act_next_ph in self._acts_next_phs:
        qt = self._Q_ensemble.get_output_for(
            self._observations_ens_next_ph, act_next_ph, reuse=True)
        self._q_ens_targets.append(qt)

    for i, q_t in enumerate(self._q_ens_targets):
        if i == 0:
            self._q_ens_next = q_t
        else:
            self._q_ens_next = tf.maximum(self._q_ens_next, q_t)
            # self._q_ens_next = self._q_ens_next + q_t
    # self._q_ens_next = self._q_ens_next / len(self._q_ens_targets)

    # Determine the Q-loss.
    self._q_train = self._Q_ensemble.get_output_for(
        self._observations_ens_ph, self._obv_act_ph, reuse=True)
    self._q_ens_loss = 0.5 * tf.reduce_mean(
        (self._q_train - tf.stop_gradient(
            self._scale_reward * self._rewards_ph +
            (1 - self._terminals_ph) * self._discount * self._q_ens_next)) ** 2)

    # Determine the Q-training operator.
    self._q_ens_train_operator = tf.train.AdamOptimizer(self._lr).minimize(
        loss=self._q_ens_loss,
        var_list=self._Q_ensemble.get_params_internal())

    # Set up the TensorFlow session.
    self._sess = tf_utils.get_default_session()
    self._sess.run(tf.global_variables_initializer())
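# --- Hedged sketch: the TD target built for the ensemble Q-function above, written out in
# NumPy for a single batch. q_next_per_instance stands for Q_ens(s', a'_k), the ensemble
# Q-value of the next action proposed by each AC instance k; the loss regresses Q_ens(s, a)
# toward scale_reward * r + (1 - terminal) * discount * max_k Q_ens(s', a'_k).
import numpy as np

def ensemble_q_target(rewards, terminals, q_next_per_instance, scale_reward, discount):
    q_next = np.max(np.stack(q_next_per_instance, axis=0), axis=0)  # max over instances
    return scale_reward * rewards + (1.0 - terminals) * discount * q_next

def ensemble_q_loss(q_pred, target):
    return 0.5 * np.mean((q_pred - target) ** 2)  # mirrors 0.5 * tf.reduce_mean(...) above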
def __init__(self, environment_name, algorithm_name, lr, scale_reward, scale_entropy,
             discount, tau, max_replay_buffer_size, sampler_params,
             value_func_layers_number, value_func_layer_size,
             policy_func_layers_number, policy_func_layer_size,
             base_ac_alg_params, q_param_list, use_ucb=False,
             evaluation_strategy='ensemble'):
    """
    CG: the constructor.
    :param environment_name: the name of the environment, as a string.
    :param algorithm_name: the name of the AC algorithm to be used in the ensemble.
    :param lr: the learning rate to be used in the ensemble.
    :param scale_reward: the reward scaling factor.
    :param scale_entropy: the entropy scaling factor.
    :param discount: the reward discount factor.
    :param tau: the target value function updating factor.
    :param max_replay_buffer_size: the maximum size of the replay buffer.
    :param sampler_params: extra parameter settings for the random sampler.
    :param value_func_layers_number: the number of hidden layers for the value networks, i.e. the V function and the Q function.
    :param value_func_layer_size: the number of neurons in each hidden layer of the value networks.
    :param policy_func_layers_number: the number of hidden layers for the policy network.
    :param policy_func_layer_size: the number of neurons in each hidden layer of the policy network.
    :param base_ac_alg_params: base parameters for the AC algorithm.
    :param q_param_list: the list of q values for the ensemble. Each q value in the list represents one AC instance in the ensemble.
    :param use_ucb: whether UCB is used to select AC instances in the ensemble for exploration.
    :param evaluation_strategy: the strategy used for evaluation. Two strategies are available, 'ensemble' and 'best-policy'.
    """
    # Set up the environment.
    self._environment_name = environment_name
    self._env = GymEnv(self._environment_name)

    # Set up the algorithm parameters.
    self._algorithm_name = algorithm_name
    self._lr = lr
    self._scale_reward = scale_reward
    self._scale_entropy = scale_entropy
    self._discount = discount
    self._tau = tau
    self._use_ucb = use_ucb
    self._evaluation_strategy = evaluation_strategy

    # Set up the replay buffer.
    self._max_replay_buffer_size = max_replay_buffer_size
    self._pool = SimpleReplayBuffer(env_spec=self._env.spec,
                                    max_replay_buffer_size=self._max_replay_buffer_size)

    # Set up the environment sampler.
    self._sampler_params = sampler_params
    self._sampler = SimpleSampler(**self._sampler_params)

    # Set up the required number of AC instances in the ensemble.
    # Each AC instance has its own value networks and policy network.
    self._alg_instances = []
    self._base_ac_params = base_ac_alg_params
    self._base_alg_params = dict(self._base_ac_params, sampler=self._sampler)
    for id, q_val in enumerate(q_param_list):
        # Set up the value function networks for an AC instance.
        qf1 = NNQFunction(env_spec=self._env.spec,
                          hidden_layer_sizes=tuple([value_func_layer_size for _ in range(value_func_layers_number)]),
                          name=str(id) + 'qf1')
        qf2 = NNQFunction(env_spec=self._env.spec,
                          hidden_layer_sizes=tuple([value_func_layer_size for _ in range(value_func_layers_number)]),
                          name=str(id) + 'qf2')
        vf = NNVFunction(env_spec=self._env.spec,
                         hidden_layer_sizes=tuple([value_func_layer_size for _ in range(value_func_layers_number)]),
                         name=str(id) + 'vf')

        # Set up the policy network for an AC instance.
        policy = GaussianPolicy(env_spec=self._env.spec,
                                hidden_layer_sizes=tuple([policy_func_layer_size for _ in range(policy_func_layers_number)]),
                                squash=True, reparameterize=False, reg=1.e-3,
                                name=str(id) + 'gaussian_policy')
        initial_exploration_policy = policy

        # Set up an AC instance.
        if self._algorithm_name == 'sac':
            algorithm = SACV1(
                base_kwargs=self._base_alg_params, env=self._env, policy=policy,
                initial_exploration_policy=initial_exploration_policy, pool=self._pool,
                qf1=qf1, qf2=qf2, vf=vf,
                lr=self._lr, scale_reward=self._scale_reward, scale_entropy=self._scale_entropy,
                discount=self._discount, tau=self._tau, reparameterize=False,
                target_update_interval=1, action_prior='uniform', save_full_state=False,
            )
        elif self._algorithm_name == 'tac':
            algorithm = TAC(
                base_kwargs=self._base_alg_params, env=self._env, policy=policy,
                initial_exploration_policy=initial_exploration_policy, pool=self._pool,
                qf1=qf1, qf2=qf2, vf=vf,
                lr=self._lr, scale_reward=self._scale_reward, scale_entropy=self._scale_entropy,
                discount=self._discount, tau=self._tau, reparameterize=False,
                target_update_interval=1, action_prior='uniform', save_full_state=False,
                tsallisQ=q_val,
            )
        elif self._algorithm_name == 'rac':
            algorithm = RAC(
                base_kwargs=self._base_alg_params, env=self._env, policy=policy,
                initial_exploration_policy=initial_exploration_policy, pool=self._pool,
                qf1=qf1, qf2=qf2, vf=vf,
                lr=self._lr, scale_reward=self._scale_reward, scale_entropy=self._scale_entropy,
                discount=self._discount, tau=self._tau, reparameterize=False,
                target_update_interval=1, action_prior='uniform', save_full_state=False,
                renyiQ=q_val,
            )
        else:
            raise NotImplementedError

        # Initialize the AC instance.
        algorithm._sess.run(tf.global_variables_initializer())

        # Put the initialized AC instance into the algorithm instance list.
        # Each element of the algorithm instance list is made up of:
        #   the algorithm instance,
        #   the moving-average performance of the instance,
        #   the number of times the instance has been used for exploration previously, and
        #   the UCB bound.
        self._alg_instances.append([algorithm, 0.0, 0.0, 0.0])
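# --- Hedged sketch: the list entries appended above are [algorithm, moving-average
# performance, exploration-use count, UCB bound]. One plausible way the UCB bound could be
# refreshed and used to pick the next instance for exploration is shown below; the constant
# c and the exact update rule are assumptions, not code from this repo.
import math

def select_instance_ucb(alg_instances, total_uses, c=1.0):
    for entry in alg_instances:
        _, avg_perf, n_used, _ = entry
        entry[3] = avg_perf + c * math.sqrt(math.log(total_uses + 1.0) / (n_used + 1.0))
    return max(range(len(alg_instances)), key=lambda i: alg_instances[i][3])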
ENVIRONMENTS = {
    'swimmer': {
        'default': SwimmerEnv,
        'multi-direction': MultiDirectionSwimmerEnv,
    },
    'ant': {
        'default': AntEnv,
        'multi-direction': MultiDirectionAntEnv,
        'cross-maze': CrossMazeAntEnv,
    },
    'humanoid': {
        'default': HumanoidEnv,
        'multi-direction': MultiDirectionHumanoidEnv,
    },
    'hopper': {
        'default': lambda: normalize(GymEnv('Hopper-v1')),
    },
    'half-cheetah': {
        'default': lambda: normalize(GymEnv('HalfCheetah-v2')),
    },
    'walker': {
        'default': lambda: normalize(GymEnv('Walker2d-v1')),
    },
}

DEFAULT_DOMAIN = DEFAULT_ENV = 'swimmer'
AVAILABLE_DOMAINS = set(ENVIRONMENTS.keys())
AVAILABLE_TASKS = set(y for x in ENVIRONMENTS.values() for y in x.keys())


def parse_args():
    parser = argparse.ArgumentParser()
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# define the data between 0 and 20
NUM_VALS = 20
x = np.random.uniform(0, NUM_VALS, size=NUM_VALS)
y = np.random.uniform(0, NUM_VALS, size=NUM_VALS)

# define the color chart between 2 and 10 using the 'autumn_r' colormap, so that
#   y <= 2 is yellow,
#   y >= 10 is red, and
#   2 < y < 10 is interpolated from yellow to red according to its value
COL = MplColorHelper('autumn_r', 2, 10)

scat = ax.scatter(x, y, s=300, c=COL.get_rgb(y))
ax.set_title('Well defined discrete colors')
plt.show()


if __name__ == '__main__':
    import gym
    import environments
    from sac.envs import GymEnv
    from time import sleep

    env = GymEnv('MountainCarContinuousColor-v0')
    env.reset()
    env.env.render()
    env.step(0.5)
    env.env.render()
    sleep(3)
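# --- Hedged sketch: MplColorHelper is used above and in the rendering code later in this
# section but is not defined in the excerpt. A minimal version maps scalars to RGBA through
# a matplotlib colormap; the class below is an illustrative stand-in, not the repo's code.
import matplotlib.cm as cm
import matplotlib.colors as mcolors

class MplColorHelperSketch(object):
    def __init__(self, cmap_name, start_val, stop_val):
        self.norm = mcolors.Normalize(vmin=start_val, vmax=stop_val)
        self.scalar_map = cm.ScalarMappable(norm=self.norm, cmap=cmap_name)

    def get_rgb(self, val):
        return self.scalar_map.to_rgba(val)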
    MultiDirectionHumanoidEnv,
    CrossMazeAntEnv,
)
from sac.misc.instrument import run_sac_experiment
from sac.misc.utils import timestamp, unflatten
from sac.policies import GaussianPolicy, LatentSpacePolicy, GMMPolicy, UniformPolicy
from sac.misc.sampler import SimpleSampler
from sac.replay_buffers import SimpleReplayBuffer
from sac.value_functions import NNQFunction, NNVFunction
from sac.preprocessors import MLPPreprocessor
from examples.variants import parse_domain_and_task, get_variants

ENVIRONMENTS = {
    'swimmer-gym': {
        'default': lambda: GymEnv('Swimmer-v1'),
    },
    'swimmer-rllab': {
        'default': SwimmerEnv,
        'multi-direction': MultiDirectionSwimmerEnv,
    },
    'ant': {
        'default': lambda: GymEnv('Ant-v1'),
        'multi-direction': MultiDirectionAntEnv,
        'cross-maze': CrossMazeAntEnv,
    },
    'humanoid-gym': {
        'default': lambda: GymEnv('Humanoid-v2'),
    },
    'humanoid-rllab': {
        'default': HumanoidEnv,
    CrossMazeAntEnv,
)
from sac.misc.instrument import run_sac_experiment
from sac.misc.utils import timestamp, unflatten
from sac.policies import GaussianPolicy, LatentSpacePolicy, GMMPolicy, UniformPolicy
from sac.misc.sampler import SimpleSampler
from sac.replay_buffers import SimpleReplayBuffer
from sac.value_functions import NNQFunction, NNVFunction
from sac.preprocessors import MLPPreprocessor
from examples.variants import parse_domain_and_task, get_variants
import copy

ENVIRONMENTS = {
    'swimmer-gym': {
        'default': lambda: GymEnv('Swimmer-v2'),
    },
    'swimmer-rllab': {
        'default': SwimmerEnv,
        'multi-direction': MultiDirectionSwimmerEnv,
    },
    'ant': {
        'default': lambda: GymEnv('Ant-v2'),
        'multi-direction': MultiDirectionAntEnv,
        'cross-maze': CrossMazeAntEnv,
    },
    'humanoid-gym': {
        'default': lambda: GymEnv('Humanoid-v2'),
    },
    'humanoid-rllab': {
        'default': HumanoidEnv,
def main(root_dir):
    # tf.set_random_seed(seed=seed)

    # env = GymEnv('MountainCarContinuous-v0')
    env = GymEnv('MountainCarContinuousColor-v0')

    max_replay_buffer_size = int(1e6)
    sampler_params = {'max_path_length': 1000, 'min_pool_size': 1000, 'batch_size': 128}
    sampler = SimpleSampler(**sampler_params)  # TODO: normalize or not

    entropy_coeff = 0.
    dynamic_coeff = True

    # define value functions
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))

    # use GMM policy
    policy = GMMPolicy(
        env_spec=env.spec,
        K=4,
        hidden_layer_sizes=[layer_size, layer_size],
        qf=qf,
        reg=1e-3,
        squash=True
    )

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=10,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=max_replay_buffer_size)
    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(
        base_kwargs=base_kwargs, env=env, policy=policy, pool=pool, qf=qf, vf=vf,
        lr=3e-4, scale_reward=1., discount=0.99, tau=1e-2, target_update_interval=1,
        action_prior='uniform', save_full_state=False,
        dynamic_coeff=dynamic_coeff, entropy_coeff=entropy_coeff
    )

    algorithm._sess.run(tf.global_variables_initializer())

    # TODO: normalize or not
    # Currently only MountainCar is available
    with algorithm._sess.as_default():
        model_file = os.path.join(root_dir, 'model')
        algorithm._saver.restore(algorithm._sess, model_file)

        for i in range(1):
            obs = env.reset()
            env.env.render()
            sleep(4.0)
            traj = [obs]
            done = False
            while not done:
                env.env.render()
                action, _ = algorithm.policy.get_action(obs.flatten())  # get_action returns (action, info)
                obs, rew, done, _ = env.step(action)
                traj.append(obs.flatten())

            knack, knack_kurtosis = sub_goal_detect(algorithm, traj)
            idxs = np.argsort(knack_kurtosis)
            # idxs = np.argsort(knack)
            print(idxs[::-1])

            COL = MplColorHelper('Blues', np.min(knack_kurtosis), np.max(knack_kurtosis))

            # replay the trajectory, coloring the car by its knack kurtosis
            for j, s in enumerate(traj):
                env.env.state = np.array(traj[j])
                rgba = COL.get_rgb(knack_kurtosis[j])
                env.env.render(car_rgba=rgba)
            sleep(1.0)

            # replay the trajectory up to each knack state, starting from the highest-kurtosis state
            for idx in idxs[::-1]:
                obs = env.reset()
                env.env.state = np.array(traj[0])
                rgba = COL.get_rgb(knack_kurtosis[0])
                env.env.render(car_rgba=rgba)
                for j in range(idx + 1):
                    env.env.state = np.array(traj[j])
                    rgba = COL.get_rgb(knack_kurtosis[j])
                    # env.env.viewer.geoms[1].set_color(*(0.0, 0.0, 1.0))
                    env.env.render(car_rgba=rgba)
                sleep(0.5)
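# --- Hedged sketch: sub_goal_detect() is called above but not defined in this excerpt. A
# plausible reading, given how its outputs are used, is that each state of the trajectory is
# scored by the spread and kurtosis of the policy's Q-values over sampled actions. The
# helper below illustrates that idea only; the function name, its inputs, and the use of
# scipy.stats.kurtosis are assumptions, not the repo's implementation.
import numpy as np
from scipy.stats import kurtosis as sp_kurtosis

def knack_scores_sketch(q_values_per_state):
    """q_values_per_state: list of 1-D arrays, Q(s, a_i) over sampled actions per state."""
    knack = np.array([np.var(q) for q in q_values_per_state])
    knack_kurtosis = np.array([sp_kurtosis(q, fisher=False) for q in q_values_per_state])
    return knack, knack_kurtosis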