def test_q():
    """Smoke-test nn.QFunction call conventions: two tensor arguments vs. a single tuple."""
    q_function_params = {
        'hidden_layer_sizes': (128, 128),
    }
    q_function = nn.QFunction(name='q_function', **q_function_params)

    observation_ph = tf.placeholder(tf.float32, shape=(None, 5))
    action_ph = tf.placeholder(tf.float32, shape=(None, 3))

    try:
        print(q_function(observation_ph, action_ph))
    except Exception:
        print('q_function(observation_ph, action_ph) failed')

    try:
        print(q_function((observation_ph, action_ph)))
    except Exception:
        print('q_function((observation_ph, action_ph)) failed')
def train_SAC(env_name, exp_name, seed, reparametrize, two_qf, old_funct,
              logdir, debug, gpu):
    """Train SAC on env_name (TF1 graph mode): builds the networks, pins the
    session to the given `gpu`, and optionally wraps it in tfdbg when `debug`
    is set."""
    # Per-environment entropy temperature; default to 0.2 for unlisted envs.
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
    }.get(env_name, 0.2)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': 256,
        'discount': 0.99,
        'learning_rate': 1e-3,
        'reparameterize': reparametrize,
        'tau': 0.01,
        'epoch_length': 1000,
        'n_epochs': 500,
        'two_qf': two_qf,
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': 1000,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }
    value_function_params = {
        'hidden_layer_sizes': (128, 128),
    }
    q_function_params = {
        'hidden_layer_sizes': (128, 128),
    }
    policy_params = {
        'hidden_layer_sizes': (128, 128),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params,
    }
    logz.save_params(params)

    env = gym.envs.make(env_name)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        **replay_pool_params)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(
        name='value_function', **value_function_params)
    target_value_function = nn.ValueFunction(
        name='target_value_function', **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=env.action_space.shape[0],
        reparameterize=algorithm_params['reparameterize'],
        old_funct=old_funct,
        **policy_params)

    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=gpu)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    with tf.Session(config=tf_config) as sess:
        if debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        algorithm.build(
            env=env,
            policy=policy,
            q_function=q_function,
            q_function2=q_function2,
            value_function=value_function,
            target_value_function=target_value_function)

        for epoch in algorithm.train(
                sampler,
                session=sess,
                n_epochs=algorithm_params.get('n_epochs', 1000)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
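# A minimal, hypothetical invocation of the variant above. All argument values
# (including old_funct) are illustrative assumptions, not settings taken from
# the original experiments; `gpu` is a string because it feeds
# tf.GPUOptions(visible_device_list=...).
if __name__ == '__main__':
    train_SAC(env_name='HalfCheetah-v2', exp_name='sac', seed=1,
              reparametrize=True, two_qf=False, old_funct=False,
              logdir='data/sac_HalfCheetah-v2_1', debug=False, gpu='0')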
def train_SAC(env_name, exp_name, n_iter, ep_len, seed, logdir, alpha,
              prefill_steps, discount, batch_size, learning_rate, tau, two_qf):
    """Train SAC on env_name (TF1 graph mode), with the hyperparameters passed
    in as arguments; the policy is always reparameterized."""
    # Per-environment entropy temperature; fall back to the `alpha` argument
    # for environments not listed here.
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
    }.get(env_name, alpha)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': batch_size,
        'discount': discount,
        'learning_rate': learning_rate,
        'reparameterize': True,
        'tau': tau,
        'epoch_length': ep_len,
        'n_epochs': n_iter,
        'two_qf': two_qf,
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': prefill_steps,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }
    value_function_params = {
        'hidden_layer_sizes': (64, 64),
    }
    q_function_params = {
        'hidden_layer_sizes': (64, 64),
    }
    policy_params = {
        'hidden_layer_sizes': (64, 64),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params,
    }
    logz.save_params(params)

    env = gym.envs.make(env_name)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        **replay_pool_params)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(
        name='value_function', **value_function_params)
    target_value_function = nn.ValueFunction(
        name='target_value_function', **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=env.action_space.shape[0],
        reparameterize=algorithm_params['reparameterize'],
        **policy_params)

    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True  # may need if using GPU

    with tf.Session(config=tf_config):
        algorithm.build(
            env=env,
            policy=policy,
            q_function=q_function,
            q_function2=q_function2,
            value_function=value_function,
            target_value_function=target_value_function)

        for epoch in algorithm.train(
                sampler, n_epochs=algorithm_params.get('n_epochs', 1000)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
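# A minimal command-line entry point for the variant above, sketched under the
# assumption that its hyperparameters are exposed as flags. The flag names,
# defaults, and logdir layout are illustrative, not taken from the original
# script; only standard-library argparse/os/time are used.
if __name__ == '__main__':
    import argparse
    import os
    import time

    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str)
    parser.add_argument('--exp_name', type=str, default='sac')
    parser.add_argument('--n_iter', type=int, default=500)
    parser.add_argument('--ep_len', type=int, default=1000)
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--alpha', type=float, default=0.2)
    parser.add_argument('--prefill_steps', type=int, default=1000)
    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--learning_rate', type=float, default=1e-3)
    parser.add_argument('--tau', type=float, default=0.01)
    parser.add_argument('--two_qf', action='store_true')
    args = parser.parse_args()

    # One timestamped output directory per run.
    logdir = os.path.join(
        'data', '%s_%s_%d' % (args.exp_name, args.env_name, int(time.time())))
    train_SAC(args.env_name, args.exp_name, args.n_iter, args.ep_len,
              args.seed, logdir, args.alpha, args.prefill_steps,
              args.discount, args.batch_size, args.learning_rate,
              args.tau, args.two_qf)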
def train_SAC(env_name, exp_name, seed, logdir, extra_params=None):
    """Train SAC on env_name (TF2 eager mode). Optional settings such as
    'reparameterize' and 'two_qf' are read from the extra_params dict."""
    # Per-environment entropy temperature; default to 0.2 for unlisted envs.
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
    }.get(env_name, 0.2)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': 256,
        'discount': 0.99,
        'learning_rate': 1e-3,
        'reparameterize': get_extra_param(extra_params, 'reparameterize', False),
        'tau': 0.01,
        'epoch_length': 1000,
        'n_epochs': 500,
        'two_qf': get_extra_param(extra_params, 'two_qf', False),
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': 1000,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }
    value_function_params = {
        'hidden_layer_sizes': (128, 128),
    }
    q_function_params = {
        'hidden_layer_sizes': (128, 128),
    }
    policy_params = {
        'hidden_layer_sizes': (128, 128),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params,
    }
    logz.save_params(params)

    env = gym.envs.make(env_name)

    # Set random seeds
    tf.random.set_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        **replay_pool_params)

    # Run each network once on dummy inputs so its variables are created before
    # the first gradient step: the GradientTape is opened with
    # watch_accessed_variables=False so we control which variables get
    # gradients, and that requires trainable_variables to exist already in the
    # first training loop.
    q_function = nn.QFunction(name='q_function', **q_function_params)
    # q_function.build(input_shape=[env.observation_space.shape, env.action_space.shape])
    q_function.call(
        (tf.random.normal(shape=(1, env.observation_space.shape[0])),
         tf.random.normal(shape=(1, env.action_space.shape[0]))))
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
        q_function2.build(
            input_shape=[env.observation_space.shape, env.action_space.shape])
    else:
        q_function2 = None

    value_function = nn.ValueFunction(
        name='value_function', **value_function_params)
    # value_function.build(input_shape=env.observation_space.shape)
    value_function(tf.random.normal(shape=(1, env.observation_space.shape[0])))

    target_value_function = nn.ValueFunction(
        name='target_value_function', **value_function_params)
    target_value_function(
        tf.random.normal(shape=(1, env.observation_space.shape[0])))

    policy = nn.GaussianPolicy(
        action_dim=env.action_space.shape[0],
        reparameterize=algorithm_params['reparameterize'],
        **policy_params)
    policy.build(input_shape=env.observation_space.shape)

    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    # No session configuration is needed in eager mode:
    # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    # tf_config.gpu_options.allow_growth = True  # may need if using GPU

    algorithm.build(
        env=env,
        policy=policy,
        q_function=q_function,
        q_function2=q_function2,
        value_function=value_function,
        target_value_function=target_value_function)

    for epoch in algorithm.train(
            sampler, n_epochs=algorithm_params.get('n_epochs', 1000)):
        logz.log_tabular('Iteration', epoch)
        for k, v in algorithm.get_statistics().items():
            logz.log_tabular(k, v)
        for k, v in replay_pool.get_statistics().items():
            logz.log_tabular(k, v)
        for k, v in sampler.get_statistics().items():
            logz.log_tabular(k, v)
        logz.dump_tabular()
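# The TF2 variant above reads optional settings through a get_extra_param
# helper that is not shown in this section. A plausible sketch (an assumption,
# not the original implementation): treat extra_params as an optional dict and
# fall back to a default when the key is absent.
def get_extra_param(extra_params, key, default):
    """Return extra_params[key] if present, otherwise default."""
    if extra_params is None:
        return default
    return extra_params.get(key, default)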