Example #1
# Assumed module-level imports; logz, nn, utils, and SAC are project-local
# modules from the surrounding repo, not PyPI packages.
import gym
import numpy as np
import tensorflow as tf

import logz
import nn
import utils
from sac import SAC


def train_SAC(env_name, exp_name, n_iter, ep_len, seed, logdir, alpha,
              prefill_steps, discount, batch_size, learning_rate, tau, two_qf):
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
    }.get(env_name, alpha)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': batch_size,
        'discount': discount,
        'learning_rate': learning_rate,
        'reparameterize': True,
        'tau': tau,
        'epoch_length': ep_len,
        'n_epochs': n_iter,
        'two_qf': two_qf,
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': prefill_steps,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }

    value_function_params = {
        'hidden_layer_sizes': (64, 64),
    }

    q_function_params = {
        'hidden_layer_sizes': (64, 64),
    }

    policy_params = {
        'hidden_layer_sizes': (64, 64),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params
    }
    logz.save_params(params)

    env = gym.envs.make(env_name)
    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        **replay_pool_params)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(
        name='value_function', **value_function_params)
    target_value_function = nn.ValueFunction(
        name='target_value_function', **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=env.action_space.shape[0],
        reparameterize=algorithm_params['reparameterize'],
        **policy_params)

    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True  # may be needed when using a GPU
    with tf.Session(config=tf_config):
        algorithm.build(
            env=env,
            policy=policy,
            q_function=q_function,
            q_function2=q_function2,
            value_function=value_function,
            target_value_function=target_value_function)

        for epoch in algorithm.train(sampler, n_epochs=algorithm_params.get('n_epochs', 1000)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
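
A hypothetical command-line wrapper for Example #1 (the flag names, defaults, and data/ log-directory layout are illustrative assumptions, not part of the original snippet):

import argparse
import os
import time

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str)
    parser.add_argument('--exp_name', type=str, default='sac')
    parser.add_argument('--n_iter', type=int, default=500)
    parser.add_argument('--ep_len', type=int, default=1000)
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--alpha', type=float, default=0.2)
    parser.add_argument('--prefill_steps', type=int, default=1000)
    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--learning_rate', type=float, default=1e-3)
    parser.add_argument('--tau', type=float, default=0.01)
    parser.add_argument('--two_qf', action='store_true')
    args = parser.parse_args()

    # One unique log directory per run, e.g. data/sac_HalfCheetah-v2_1700000000
    logdir = os.path.join('data', '%s_%s_%d' % (args.exp_name, args.env_name,
                                                int(time.time())))
    train_SAC(args.env_name, args.exp_name, args.n_iter, args.ep_len,
              args.seed, logdir, args.alpha, args.prefill_steps,
              args.discount, args.batch_size, args.learning_rate,
              args.tau, args.two_qf)
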
Example #2
def train_SAC(env_name, exp_name, seed, reparametrize, two_qf, old_funct,
              logdir, debug, gpu):
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
    }.get(env_name, 0.2)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': 256,
        'discount': 0.99,
        'learning_rate': 1e-3,
        'reparameterize': reparametrize,
        'tau': 0.01,
        'epoch_length': 1000,
        'n_epochs': 500,
        'two_qf': two_qf,
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': 1000,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }

    value_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    q_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    policy_params = {
        'hidden_layer_sizes': (128, 128),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params
    }
    logz.save_params(params)

    env = gym.envs.make(env_name)
    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        **replay_pool_params)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(name='value_function',
                                      **value_function_params)
    target_value_function = nn.ValueFunction(name='target_value_function',
                                             **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=env.action_space.shape[0],
        reparameterize=algorithm_params['reparameterize'],
        old_funct=old_funct,
        **policy_params)

    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=gpu)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)
    with tf.Session(config=tf_config) as sess:

        if debug:
            # Wrap the session in the interactive CLI debugger; requires
            # `from tensorflow.python import debug as tf_debug` (TF 1.x).
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)

        algorithm.build(env=env,
                        policy=policy,
                        q_function=q_function,
                        q_function2=q_function2,
                        value_function=value_function,
                        target_value_function=target_value_function)

        for epoch in algorithm.train(sampler,
                                     session=sess,
                                     n_epochs=algorithm_params.get(
                                         'n_epochs', 1000)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
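
All three examples maintain a target_value_function alongside value_function; the SAC class (not shown here) is expected to update it by Polyak averaging with coefficient tau. A minimal sketch of that soft update in the eager style of Example #3, with a hypothetical helper name:

import tensorflow as tf

def soft_update(source_vars, target_vars, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target.
    # A small tau (0.01 in the examples above) makes the target network
    # trail the value network slowly, which stabilizes the Q-function
    # regression targets.
    for src, tgt in zip(source_vars, target_vars):
        tgt.assign(tau * src + (1.0 - tau) * tgt)
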
Example #3
def train_SAC(env_name, exp_name, seed, logdir, extra_params=None):
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
    }.get(env_name, 0.2)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': 256,
        'discount': 0.99,
        'learning_rate': 1e-3,
        'reparameterize': get_extra_param(extra_params, 'reparameterize',
                                          False),
        'tau': 0.01,
        'epoch_length': 1000,
        'n_epochs': 500,
        'two_qf': get_extra_param(extra_params, 'two_qf', False),
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': 1000,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }

    value_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    q_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    policy_params = {
        'hidden_layer_sizes': (128, 128),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params
    }
    logz.save_params(params)

    env = gym.envs.make(env_name)
    # Set random seeds
    tf.random.set_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        **replay_pool_params)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    # Run one dummy forward pass so Keras creates the network's variables
    # (subclassed models build lazily on their first call).
    q_function(
        (tf.random.normal(shape=(1, env.observation_space.shape[0])),
         tf.random.normal(shape=(1, env.action_space.shape[0]))))

    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
        # Same dummy forward pass as above; Model.build() with a list of
        # un-batched shapes is unreliable for subclassed models.
        q_function2(
            (tf.random.normal(shape=(1, env.observation_space.shape[0])),
             tf.random.normal(shape=(1, env.action_space.shape[0]))))
    else:
        q_function2 = None

    value_function = nn.ValueFunction(name='value_function',
                                      **value_function_params)
    # Call the network once on dummy inputs so its variables exist before the
    # first gradient step: the GradientTape is created with
    # watch_accessed_variables=False, so trainable_variables must already be
    # populated when we tell the tape what to watch.
    value_function(tf.random.normal(shape=(1, env.observation_space.shape[0])))

    target_value_function = nn.ValueFunction(name='target_value_function',
                                             **value_function_params)
    target_value_function(
        tf.random.normal(shape=(1, env.observation_space.shape[0])))

    policy = nn.GaussianPolicy(
        action_dim=env.action_space.shape[0],
        reparameterize=algorithm_params['reparameterize'],
        **policy_params)
    policy.build(input_shape=env.observation_space.shape)

    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    # No Session/ConfigProto here: this example runs eagerly (TF 2.x).
    algorithm.build(env=env,
                    policy=policy,
                    q_function=q_function,
                    q_function2=q_function2,
                    value_function=value_function,
                    target_value_function=target_value_function)

    for epoch in algorithm.train(sampler,
                                 n_epochs=algorithm_params.get(
                                     'n_epochs', 1000)):
        logz.log_tabular('Iteration', epoch)
        for k, v in algorithm.get_statistics().items():
            logz.log_tabular(k, v)
        for k, v in replay_pool.get_statistics().items():
            logz.log_tabular(k, v)
        for k, v in sampler.get_statistics().items():
            logz.log_tabular(k, v)
        logz.dump_tabular()
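
Example #3 is the eager-mode (TF 2.x) port of the same routine: there is no Session, and each network is run once on dummy inputs so that Keras creates its variables before the first GradientTape pass. A self-contained illustration of that lazy-build behavior (TinyValueFunction is a made-up stand-in for nn.ValueFunction):

import tensorflow as tf

class TinyValueFunction(tf.keras.Model):
    # Subclassed Keras models create their variables on the first
    # forward pass, not at construction time.
    def __init__(self):
        super().__init__()
        self.out = tf.keras.layers.Dense(1)

    def call(self, obs):
        return self.out(obs)

vf = TinyValueFunction()
print(len(vf.trainable_variables))   # 0: nothing is built yet
vf(tf.random.normal(shape=(1, 4)))   # the dummy call builds the layers
print(len(vf.trainable_variables))   # 2: kernel and bias now exist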