def run(*_):
    env = normalize(
        MultiGoalEnv(
            actuation_cost_coeff=1,
            distance_cost_coeff=0.1,
            goal_reward=1,
            init_sigma=0.1,
        ))

    pool = SimpleReplayBuffer(
        max_replay_buffer_size=1e6,
        env_spec=env.spec,
    )

    base_kwargs = dict(min_pool_size=30,
                       epoch_length=1000,
                       n_epochs=1000,
                       max_path_length=30,
                       batch_size=64,
                       n_train_repeat=1,
                       eval_render=True,
                       eval_n_episodes=10,
                       eval_deterministic=True)

    M = 100
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    policy = GMMPolicy(env_spec=env.spec,
                       K=4,
                       hidden_layer_sizes=[M, M],
                       qf=qf,
                       reg=0.001)

    plotter = QFPolicyPlotter(qf=qf,
                              policy=policy,
                              obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
                              default_action=[np.nan, np.nan],
                              n_samples=100)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    plotter=plotter,
                    lr=3e-4,
                    scale_reward=3,
                    discount=0.99,
                    tau=0.001,
                    save_full_state=True)

    algorithm.train()
def run_experiment(variant):
    env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=env.spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],
        save_full_state=False,
    )

    algorithm.train()
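# A minimal sketch of the `variant` dict the launcher above expects. The key
# names are taken directly from the function body; every concrete value below
# is an illustrative assumption, not a setting used by any particular run.
example_variant = {
    'env_name': 'HalfCheetah-v1',   # any GymEnv-compatible id (assumed)
    'max_pool_size': int(1e6),
    'max_path_length': 1000,
    'epoch_length': 1000,
    'n_epochs': 1000,
    'batch_size': 128,
    'n_train_repeat': 1,
    'layer_size': 128,
    'K': 4,
    'lr': 3e-4,
    'scale_reward': 1.0,
    'discount': 0.99,
    'tau': 0.01,
}
# run_experiment(example_variant)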
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']  # TODO: what do task and domain refer to?
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))  # TODO: define the baxter environment

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)  # TODO: need a HER pool here

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
            env_spec=env.spec,
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
def run_experiment(variant):
    sub_level_policies_paths = []

    # args = parse_args()
    args = arg()
    domain = ENVIRONMENTS[args.domain][args.task]

    if args.domain == 'sawyer-reach':
        goal_size = 0
        sub_level_policies_paths.append("ikx")
        sub_level_policies_paths.append("iky")
        sub_level_policies_paths.append("ikz")

        random_arm_init = [-0.1, 0.1]
        lower_goal_range = [-0.1, -0.1, -0.1]
        upper_goal_range = [0.1, 0.1, 0.1]
        render = False
        reward_shaping = True
        horizon = 250

        env = normalize(
            CRLWrapper(
                IKWrapper(
                    domain(
                        # playable params
                        random_arm_init=random_arm_init,
                        lower_goal_range=lower_goal_range,
                        upper_goal_range=upper_goal_range,
                        has_renderer=render,
                        reward_shaping=reward_shaping,
                        horizon=horizon,
                        # constant params
                        has_offscreen_renderer=False,
                        use_camera_obs=False,
                        use_object_obs=True,
                        control_freq=100,
                    ))))
    else:
        raise ValueError("Domain not available")

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
        seq_len=len(sub_level_policies_paths),
    )

    sampler = SimpleSampler(
        max_path_length=horizon - 1,  # should be same as horizon
        min_pool_size=1000,
        batch_size=256)

    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=2e3,
        # n_epochs=5,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=sampler)

    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
def main(trial, use_optuna, env, seed, entropy_coeff, n_epochs, dynamic_coeff,
         clip_norm, normalize_obs, buffer_size, max_path_length, min_pool_size,
         batch_size, policy_mode, eval_model, eval_n_episodes, eval_n_frequency,
         exploitation_ratio, return_queue=None, scale_reward=1.):
    if use_optuna:
        logger.configure(logger.get_dir(),
                         log_suffix="_optune{}".format(trial.number),
                         enable_std_out=False)
        logger.set_level(logger.DISABLED)

    tf.set_random_seed(seed=seed)

    env.min_action = env.action_space.low[0]
    env.max_action = env.action_space.high[0]
    if hasattr(env, "seed"):
        env.seed(seed)
    else:
        env.env.seed(seed)

    # define value function
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))

    if policy_mode == "GMMPolicy":
        # use GMM policy
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[layer_size, layer_size],
                           qf=qf,
                           reg=1e-3,
                           squash=True)
    elif policy_mode == "EExploitation":
        policy = EExploitationPolicy(env_spec=env.spec,
                                     K=4,
                                     hidden_layer_sizes=[layer_size, layer_size],
                                     qf=qf,
                                     reg=1e-3,
                                     squash=True,
                                     e=exploitation_ratio)
    else:
        if policy_mode == "Knack-exploration" or policy_mode == "kurtosis":
            metric = "kurtosis"
        elif policy_mode in [
                "signed_variance", "negative_signed_variance",
                "small_variance", "large_variance"
        ]:
            metric = policy_mode
        elif "kurtosis-" in policy_mode:
            metric = policy_mode
        else:
            raise AssertionError(
                "policy_mode should be GMMPolicy or Knack-exploration or "
                "signed_variance or variance")

        policy = KnackBasedPolicy(
            env_spec=env.spec,
            K=4,
            hidden_layer_sizes=[layer_size, layer_size],
            qf=qf,
            vf=vf,
            reg=1e-3,
            squash=True,
            metric=metric,
            exploitation_ratio=exploitation_ratio,
            optuna_trial=trial,
        )

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=eval_n_episodes,
        eval_deterministic=True,
        eval_n_frequency=eval_n_frequency)

    max_replay_buffer_size = buffer_size
    pool = SimpleReplayBuffer(env_spec=env.spec,
                              max_replay_buffer_size=max_replay_buffer_size)
    sampler_params = {
        'max_path_length': max_path_length,
        'min_pool_size': min_pool_size,
        'batch_size': batch_size
    }
    sampler = NormalizeSampler(**sampler_params) if normalize_obs else SimpleSampler(**sampler_params)
    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    lr=3e-4,
                    scale_reward=scale_reward,
                    discount=0.99,
                    tau=1e-2,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    dynamic_coeff=dynamic_coeff,
                    entropy_coeff=entropy_coeff,
                    clip_norm=clip_norm)
    algorithm._sess.run(tf.global_variables_initializer())

    if eval_model is None:
        avg_return = algorithm.train()
        if return_queue is not None:
            return_queue.put(avg_return)
        tf.reset_default_graph()
        algorithm._sess.close()
        del algorithm
        return avg_return
    else:
        return algorithm
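# Hedged usage sketch for main() above. The GymEnv import path is the rllab
# one these launchers appear to rely on; the environment id and every numeric
# value below are illustrative assumptions, not recommended settings.
from rllab.envs.gym_env import GymEnv

if __name__ == '__main__':
    example_env = GymEnv('MountainCarContinuous-v0')  # assumed task
    main(trial=None, use_optuna=False, env=example_env, seed=1,
         entropy_coeff=0.0, n_epochs=2000, dynamic_coeff=False, clip_norm=None,
         normalize_obs=False, buffer_size=int(1e6), max_path_length=1000,
         min_pool_size=1000, batch_size=128, policy_mode="GMMPolicy",
         eval_model=None, eval_n_episodes=10, eval_n_frequency=1,
         exploitation_ratio=0.8)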
def run_experiment(param):
    random_arm_init = [-0.1, 0.1]
    lower_goal_range = [-0.1, -0.1, -0.1]
    upper_goal_range = [0.1, 0.1, 0.1]
    render = False
    reward_shaping = True
    horizon = 250

    env = normalize(
        CRLWrapper(
            # IKWrapper(
            SawyerReach(
                # playable params
                random_arm_init=random_arm_init,
                lower_goal_range=lower_goal_range,
                upper_goal_range=upper_goal_range,
                has_renderer=render,
                reward_shaping=reward_shaping,
                horizon=horizon,
                # constant params
                has_offscreen_renderer=False,
                use_camera_obs=False,
                use_object_obs=True,
                control_freq=100,
            )
            # )
        )
    )

    replay_buffer_params = {
        'max_replay_buffer_size': 1e6,
    }
    sampler_params = {
        'max_path_length': horizon - 1,
        'min_pool_size': 1000,
        'batch_size': 256,
    }

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)
    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(
        {
            'epoch_length': 1500,
            'n_train_repeat': 1,
            'n_initial_exploration_steps': 5000,
            'eval_render': False,
            'eval_n_episodes': 1,
            'eval_deterministic': True,
            'n_epochs': 2e3
        },
        sampler=sampler)

    M = 64
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    policy = GaussianPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(64, 64),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=20,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
def run_experiment(variant):
    # low_level_policy = load_low_level_policy(
    #     policy_path='/home/rcorona/sac/data/humanoid-rllab/default-humanoid_base-00/itr_0.pkl')  # variant['low_level_policy_path']

    env_name = variant['env_name']
    env_type = env_name.split('-')[-1]

    env_args = {
        name.replace('env_', '', 1): value
        for name, value in variant.items()
        if name.startswith('env_') and name != 'env_name'
    }

    if 'random-goal' in env_name:
        EnvClass = RANDOM_GOAL_ENVS[env_type]
    elif 'rllab' in variant['env_name']:
        EnvClass = RLLAB_ENVS[variant['env_name']]
    else:
        raise NotImplementedError

    base_env = normalize(EnvClass(**env_args))
    env = base_env
    # env = HierarchyProxyEnv(wrapped_env=base_env,
    #                         low_level_policy=low_level_policy)

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    sampler = SimpleSampler(max_path_length=variant['max_path_length'],
                            min_pool_size=variant['max_path_length'],
                            batch_size=variant['batch_size'])

    base_kwargs = dict(epoch_length=variant['epoch_length'],
                       n_epochs=variant['n_epochs'],
                       n_train_repeat=variant['n_train_repeat'],
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=sampler)

    M = variant['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    preprocessing_hidden_sizes = variant.get('preprocessing_hidden_sizes')
    observations_preprocessor = (
        MLPPreprocessor(env_spec=env.spec,
                        layer_sizes=preprocessing_hidden_sizes,
                        name='high_level_observations_preprocessor')
        if preprocessing_hidden_sizes is not None else None)

    policy_s_t_layers = variant['policy_s_t_layers']
    policy_s_t_units = variant['policy_s_t_units']
    s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

    bijector_config = {
        "scale_regularization": 0.0,
        "num_coupling_layers": variant['policy_coupling_layers'],
        "translation_hidden_sizes": s_t_hidden_sizes,
        "scale_hidden_sizes": s_t_hidden_sizes,
    }

    policy = LatentSpacePolicy(
        env_spec=env.spec,
        mode="train",
        squash=False,
        bijector_config=bijector_config,
        q_function=qf1,
        fix_h_on_reset=variant.get('policy_fix_h_on_reset', False),
        observations_preprocessor=observations_preprocessor,
        name="high_level_policy")

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf1=qf1,
        vf=vf,
        qf2=qf2,
        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],
        target_update_interval=variant['target_update_interval'],
        action_prior=variant['action_prior'],
        initial_exploration_policy=initial_exploration_policy,
        save_full_state=False,
    )

    algorithm.train()
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']
    task = variant['task']
    domain = variant['domain']

    constants.COST_TYPE = variant['algorithm_params']['cost_type']

    # Register every MECS environment variant; all of them use
    # max_episode_steps=5000 and differ only in id and entry point.
    mecs_entry_points = {
        'MECS-v1': 'sac.envs.environment_V_sweep:MEC_v1',
        'MECS-v2': 'sac.envs.env_V_sweep_v2:MEC_v2',
        'MECS-v3': 'sac.envs.env_V_sweep_v3:MEC_v3',
        'MECS-v4': 'sac.envs.env_V_sweep_v4:MEC_v4',
        'MECS-v5': 'sac.envs.env_V_sweep_v5:MEC_v5',
        'MECS-v6': 'sac.envs.env_V_sweep_v6:MEC_v6',
        'MECS-v61': 'sac.envs.env_V_sweep_v6_with_a:MEC_v6',
        'MECS-v7': 'sac.envs.env_V_sweep_v7_new:MEC_v7',
        'MECS-v8': 'sac.envs.env_V_sweep_v8_new:MEC_v8',
        'MECS-v9': 'sac.envs.env_V_sweep_v9:MEC_v9',
        'MECS-v10': 'sac.envs.env_V_sweep_v10:MEC_v10',
        'MECS-v11': 'sac.envs.env_V_sweep_v11:MEC_v11',
        'MECS-v12': 'sac.envs.env_V_sweep_v12:MEC_v12',
        'MECS-v13': 'sac.envs.env_V_sweep_v13:MEC_v13',
        'MECS-v14': 'sac.envs.env_V_sweep_v14:MEC_v14',
        'MECS-v15': 'sac.envs.env_V_sweep_v15:MEC_v15',
        'MECS-v16': 'sac.envs.env_V_sweep_v16:MEC_v16',
        'MECS-v17': 'sac.envs.env_V_sweep_v17:MEC_v17',
        'MECS-v18': 'sac.envs.env_V_sweep_v18:MEC_v18',
        'MECS-v19': 'sac.envs.env_V_sweep_v19:MEC_v19',
        'MECS-v20': 'sac.envs.env_V_sweep_v20:MEC_v20',
        'MECS-v21': 'sac.envs.env_V_sweep_v21:MEC_v21',
        'MECS-v22': 'sac.envs.env_V_sweep_v22:MEC_v22',
        'MECS-v23': 'sac.envs.env_V_sweep_v23:MEC_v23',
        'MECS-v24': 'sac.envs.env_V_sweep_v24:MEC_v24',
        'MECS-v25': 'sac.envs.env_V_sweep_v25:MEC_v25',
        'MECS-v26': 'sac.envs.env_V_sweep_v26:MEC_v26',
        'MECS-v27': 'sac.envs.env_V_sweep_v27:MEC_v27',
        'MECS-v28': 'sac.envs.env_V_sweep_v28:MEC_v28',
        'MECS-v29': 'sac.envs.env_V_sweep_v29:MEC_v29',
        'MECS-v30': 'sac.envs.env_V_sweep_v30:MEC_v30',
    }
    for env_id, entry_point in mecs_entry_points.items():
        register(
            id=env_id,
            entry_point=entry_point,
            max_episode_steps=5000,
        )

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)
    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
            env_spec=env.spec,
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale'] * algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
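# Sketch of the nested `variant` layout the MECS launcher above unpacks. The
# key names come from the function body; the domain/task strings, the contents
# of base_kwargs, and all numeric values are assumptions for illustration only.
example_variant = {
    'domain': 'mecs',       # placeholder key into ENVIRONMENTS
    'task': 'default',      # placeholder key into ENVIRONMENTS[domain]
    'env_params': {},
    'policy_params': {
        'type': 'gaussian',
        'reparameterize': True,
        'action_prior': 'uniform',
    },
    'value_fn_params': {'layer_size': 256},
    'replay_buffer_params': {'max_replay_buffer_size': int(1e6)},
    'sampler_params': {
        'max_path_length': 1000,
        'min_pool_size': 1000,
        'batch_size': 256,
    },
    'algorithm_params': {
        'base_kwargs': {
            'epoch_length': 1000,
            'n_epochs': 1000,
            'n_train_repeat': 1,
            'eval_render': False,
            'eval_n_episodes': 1,
            'eval_deterministic': True,
        },
        'cost_type': 0,     # consumed via constants.COST_TYPE (assumed value)
        'lr': 3e-4,
        'scale': 1.0,
        'scale_reward': 1.0,
        'discount': 0.99,
        'tau': 0.005,
        'reparameterize': True,
        'target_update_interval': 1,
    },
}
# run_experiment(example_variant)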
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    env = DelayedEnv(env, delay=0.01)

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    sampler = RemoteSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size']
    )

    base_kwargs = dict(
        sampler=sampler,
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=env.spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reparameterize=variant['reparameterize'],
        reg=0.001,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],
        reparameterize=variant['reparameterize'],
        save_full_state=False,
    )

    algorithm.train()
def run_experiment(variant):
    # NOTE: this launcher reads `args` (the parsed command-line options), which
    # is assumed to be defined at module level, e.g. by argparse.
    domain = None
    goal_size = None
    sub_level_policies_paths = []

    if args.domain == 'ant-cross-maze':
        domain = CrossMazeAntEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/ant/fwrd/fwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/bwrd/bwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/uwrd/uwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/dwrd/dwrd.pkl")
    elif args.domain == 'ant-random-goal':
        domain = RandomGoalAntEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/ant/fwrd/fwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/bwrd/bwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/uwrd/uwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/dwrd/dwrd.pkl")
    elif args.domain == 'cheetah-hurdle':
        domain = HalfCheetahHurdleEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/hc/fwd/fwd.pkl")
        sub_level_policies_paths.append("primitive-policies/hc/jp-longz/jump.pkl")
    elif args.domain == 'pusher':
        domain = PusherEnv
        goal_size = 0
        sub_level_policies_paths.append("primitive-policies/pusher/bottom/bottom.pkl")
        sub_level_policies_paths.append("primitive-policies/pusher/left/left.pkl")

    env = normalize(domain())  # CrossMazeAntEnv()

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
        seq_len=len(sub_level_policies_paths),
    )

    sampler = SimpleSampler(max_path_length=1000,
                            min_pool_size=1000,
                            batch_size=256)

    base_kwargs = dict(epoch_length=1000,
                       n_epochs=5e3,
                       n_train_repeat=1,
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=sampler)

    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
def run(variant):
    env = normalize(
        MultiGoalEnv(
            actuation_cost_coeff=1,
            distance_cost_coeff=0.1,
            goal_reward=1,
            init_sigma=0.1,
        ))

    pool = SimpleReplayBuffer(max_replay_buffer_size=1e6, env_spec=env.spec)

    sampler = SimpleSampler(max_path_length=30,
                            min_pool_size=100,
                            batch_size=64)

    base_kwargs = dict(sampler=sampler,
                       epoch_length=1000,
                       n_epochs=1000,
                       n_train_repeat=1,
                       eval_render=True,
                       eval_n_episodes=10,
                       eval_deterministic=False)

    M = 128
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    if variant['policy_type'] == 'gmm':
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[M, M],
                           qf=qf,
                           reg=0.001)
    elif variant['policy_type'] == 'lsp':
        bijector_config = {
            "scale_regularization": 0.0,
            "num_coupling_layers": 2,
            "translation_hidden_sizes": (M, ),
            "scale_hidden_sizes": (M, ),
        }
        policy = LatentSpacePolicy(env_spec=env.spec,
                                   mode="train",
                                   squash=True,
                                   bijector_config=bijector_config,
                                   observations_preprocessor=None)

    plotter = QFPolicyPlotter(qf=qf,
                              policy=policy,
                              obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
                              default_action=[np.nan, np.nan],
                              n_samples=100)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    plotter=plotter,
                    lr=3e-4,
                    scale_reward=3.0,
                    discount=0.99,
                    tau=1e-4,
                    save_full_state=True)

    algorithm.train()
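# The multigoal launcher above only inspects variant['policy_type']; the two
# supported settings below are read straight from its branches, nothing else
# is assumed:
# run({'policy_type': 'gmm'})   # GMMPolicy with K=4 components
# run({'policy_type': 'lsp'})   # LatentSpacePolicy with 2 coupling layers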
def main(root_dir, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm, regularize):
    tf.set_random_seed(seed=seed)

    env = GymEnv('MountainCarContinuous-v0')
    env.min_action = env.action_space.low[0]
    env.max_action = env.action_space.high[0]
    env.env.seed(seed)

    max_replay_buffer_size = int(1e6)
    sampler_params = {'max_path_length': 1000, 'min_pool_size': 1000, 'batch_size': 128}
    sampler = SimpleSampler(**sampler_params)

    entropy_coeff = entropy_coeff
    dynamic_coeff = dynamic_coeff

    # env_id = 'ContinuousSpaceMaze{}_{}_RB{}_entropy_{}__Normalize'.format(goal[0], goal[1], max_replay_buffer_size, entropy_coeff)
    env_id = 'MountainCarContinuous_RB1e6_entropy{}_epoch{}__Normalize_uniform'.format(entropy_coeff, n_epochs)
    env_id = env_id + '_dynamicCoeff' if dynamic_coeff else env_id

    os.makedirs(root_dir, exist_ok=True)
    env_dir = os.path.join(root_dir, env_id)
    os.makedirs(env_dir, exist_ok=True)
    current_log_dir = os.path.join(env_dir, 'seed{}'.format(seed))
    mylogger.make_log_dir(current_log_dir)

    # env_id = 'Test'
    print(env_id)
    print('environment set done')

    # define value function
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))

    # use GMM policy
    policy = GMMPolicy(
        env_spec=env.spec,
        K=4,
        hidden_layer_sizes=[layer_size, layer_size],
        qf=qf,
        reg=1e-3,
        squash=True
    )

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    pool = SimpleReplayBuffer(env_spec=env.spec,
                              max_replay_buffer_size=max_replay_buffer_size)
    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=3e-4,
        scale_reward=1.,
        discount=0.99,
        tau=1e-2,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
        dynamic_coeff=dynamic_coeff,
        entropy_coeff=entropy_coeff,
        clip_norm=clip_norm,
    )

    # name = env_id + datetime.now().strftime("-%m%d-%Hh-%Mm-%ss")
    # mylogger.make_log_dir(name)

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
def main(env, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm, normalize_obs,
         buffer_size, max_path_length, min_pool_size, batch_size, policy_mode):
    tf.set_random_seed(seed=seed)

    # define value function
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))

    if policy_mode == GMMPolicy:
        # use GMM policy
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[layer_size, layer_size],
                           qf=qf,
                           reg=1e-3,
                           squash=True)
    else:
        _, mode = str(policy_mode).split('-')
        if _ != "Knack":
            raise AssertionError(
                "policy_mode should be GMMPolicy or Knack-p_control or "
                "Knack-exploitation or Knack-exploration")
        else:
            policy = KnackBasedPolicy(
                a_lim_lows=env.action_space.low,
                a_lim_highs=env.action_space.high,
                mode=mode,
                env_spec=env.spec,
                K=4,
                hidden_layer_sizes=[layer_size, layer_size],
                qf=qf,
                reg=1e-3,
                squash=True)

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    max_replay_buffer_size = buffer_size
    pool = SimpleReplayBuffer(env_spec=env.spec,
                              max_replay_buffer_size=max_replay_buffer_size)
    sampler_params = {
        'max_path_length': max_path_length,
        'min_pool_size': min_pool_size,
        'batch_size': batch_size
    }
    sampler = NormalizeSampler(**sampler_params) if normalize_obs else SimpleSampler(**sampler_params)
    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    lr=3e-4,
                    scale_reward=1.,
                    discount=0.99,
                    tau=1e-2,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    dynamic_coeff=dynamic_coeff,
                    entropy_coeff=entropy_coeff,
                    clip_norm=clip_norm)

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
def run_experiment(variant):
    sub_level_policies_paths = []
    args = arg()

    if args.domain == 'sawyer-reach':
        print("Composition Reach")
        goal_size = 0
        sub_level_policies_paths.append("ikx")
        sub_level_policies_paths.append("iky")
        sub_level_policies_paths.append("ikz")

        random_arm_init = [-0.1, 0.1]
        render = False
        reward_shaping = True
        horizon = 250

        env = normalize(
            CRLWrapper(
                IKWrapper(
                    SawyerReach(
                        # playable params
                        random_arm_init=random_arm_init,
                        has_renderer=render,
                        reward_shaping=reward_shaping,
                        horizon=horizon,
                        # constant params
                        has_offscreen_renderer=False,
                        use_camera_obs=False,
                        use_object_obs=True,
                        control_freq=100,
                    ))))
        ep_length = 1500
    elif args.domain == 'sawyer-reach-pick':
        print("Composition Reach and Pick")
        goal_size = 3
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")

        render = False
        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 1000

        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,
                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
        ep_length = 1500
    elif args.domain == 'sawyer-reach-pick-simple':
        print("Composition Reach and Pick Simple")
        goal_size = 3
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")

        render = False
        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 500

        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,
                    placement_initializer=UniformRandomSampler(
                        x_range=[-0.01, 0.01],
                        y_range=[-0.01, 0.01],
                        ensure_object_boundary_in_range=False,
                        z_rotation=None,
                    ),
                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
        ep_length = 3000
    else:
        raise ValueError("Domain not available")

    if args.demo:
        pool = DemoReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=1e6,
            seq_len=len(sub_level_policies_paths),
        )
    else:
        pool = SimpleReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=1e6,
            seq_len=len(sub_level_policies_paths),
        )

    sampler = SimpleSampler(
        max_path_length=horizon - 1,  # should be same as horizon
        min_pool_size=1000,
        batch_size=256)

    base_kwargs = dict(
        epoch_length=ep_length,
        n_epochs=5e3,
        # n_epochs=5,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=sampler,
        use_demos=args.demo,
    )

    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']
    task = variant['task']
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)
    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    if variant['num_hidden'] != 256:
        M = variant['num_hidden']

    qf1 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=(M, M),
                      name='qf1',
                      batchnormvf=variant['batchnormvf'])
    qf2 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=(M, M),
                      name='qf2',
                      batchnormvf=variant['batchnormvf'])
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(M, M),
                     batchnormvf=variant['batchnormvf'],
                     dropoutvf_keep_prob=variant['dropoutvf'])
    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(env_spec=env.spec,
                                hidden_layer_sizes=(M, M),
                                reparameterize=policy_params['reparameterize'],
                                todropoutpi=(variant['dropoutpi'] < 1.0),
                                dropoutpi=variant['dropoutpi'],
                                batchnormpi=variant['batchnormpi'])
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    if variant['reward_scale'] < 0:
        scale_rew = algorithm_params['scale_reward']
    else:
        scale_rew = variant['reward_scale']

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'] if variant['lr'] < 0 else variant['lr'],
        scale_reward=scale_rew,
        discount=algorithm_params['discount'],
        tau=variant['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
        l1regpi=variant['l1regpi'],
        l2regpi=variant['l2regpi'],
        l1regvf=variant['l1regvf'],
        l2regvf=variant['l2regvf'],
        ent_coef=variant['ent_coef'],
        wclippi=variant['wclippi'],
        wclipvf=variant['wclipvf'],
        dropoutpi=variant['dropoutpi'],
        dropoutvf=variant['dropoutvf'],
        batchnormpi=variant['batchnormpi'],
        batchnormvf=variant['batchnormvf'])

    algorithm._sess.run(tf.global_variables_initializer())

    for v in tf.trainable_variables():
        print(v.name)

    algorithm.train()

    if variant['policypath'] != '':
        save_w_path = os.path.expanduser(variant['policypath'])
        toexport = []
        savesess = algorithm._sess
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='gaussian_policy'):
            toexport.append(savesess.run(v))
        np.savetxt(save_w_path,
                   np.concatenate(toexport, axis=None),
                   delimiter=',')

    if variant['valuepath'] != '':
        save_w_path = os.path.expanduser(variant['valuepath'])
        toexport = []
        savesess = algorithm._sess
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='qf1'):
            toexport.append(savesess.run(v))
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='qf2'):
            toexport.append(savesess.run(v))
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='vf'):
            toexport.append(savesess.run(v))
        np.savetxt(save_w_path,
                   np.concatenate(toexport, axis=None),
                   delimiter=',')
def run_experiment(variant):
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Session() as sess:
        data = joblib.load(variant['snapshot_filename'])
        policy = data['policy']
        env = data['env']
        num_skills = (data['policy'].observation_space.flat_dim -
                      data['env'].spec.observation_space.flat_dim)
        best_z = get_best_skill(policy, env, num_skills,
                                variant['max_path_length'])
        fixed_z_env = FixedOptionEnv(env, num_skills, best_z)

        tf.logging.info('Finetuning best skill...')

        pool = SimpleReplayBuffer(
            env_spec=fixed_z_env.spec,
            max_replay_buffer_size=variant['max_pool_size'],
        )

        base_kwargs = dict(
            min_pool_size=variant['max_path_length'],
            epoch_length=variant['epoch_length'],
            n_epochs=variant['n_epochs'],
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            n_train_repeat=variant['n_train_repeat'],
            eval_render=False,
            eval_n_episodes=1,
            eval_deterministic=True,
        )

        M = variant['layer_size']
        if variant['use_pretrained_values']:
            qf = data['qf']
            vf = data['vf']
        else:
            del data['qf']
            del data['vf']

            qf = NNQFunction(
                env_spec=fixed_z_env.spec,
                hidden_layer_sizes=[M, M],
                var_scope='qf-finetune',
            )

            vf = NNVFunction(
                env_spec=fixed_z_env.spec,
                hidden_layer_sizes=[M, M],
                var_scope='vf-finetune',
            )

        algorithm = SAC(
            base_kwargs=base_kwargs,
            env=fixed_z_env,
            policy=policy,
            pool=pool,
            qf=qf,
            vf=vf,
            lr=variant['lr'],
            scale_reward=variant['scale_reward'],
            discount=variant['discount'],
            tau=variant['tau'],
            save_full_state=False,
        )

        algorithm.train()
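# Hedged sketch of a `variant` for the skill-finetuning launcher above. The
# key names are read from the function body; the snapshot path and all values
# below are placeholders/assumptions. With use_pretrained_values=False, fresh
# qf/vf networks are built instead of reusing the ones in the snapshot.
example_variant = {
    'snapshot_filename': '/path/to/itr_999.pkl',  # snapshot from a prior skill-discovery run (placeholder)
    'max_path_length': 1000,
    'max_pool_size': int(1e6),
    'epoch_length': 1000,
    'n_epochs': 500,
    'batch_size': 128,
    'n_train_repeat': 1,
    'layer_size': 128,
    'use_pretrained_values': False,
    'lr': 3e-4,
    'scale_reward': 1.0,
    'discount': 0.99,
    'tau': 0.01,
}
# run_experiment(example_variant)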