Example #1
import json
import os
import time

import gym
import numpy as np
import tensorflow as tf

# Project-local helpers assumed to be importable in this module: logz and nn
# (policy/network definitions).


def test_run(args, dir, max_steps=2000):
    with open(os.path.join(dir, 'params.json')) as f:
        json_params = json.load(f)
    env = gym.make(json_params['env_name'])
    # Observation and action sizes
    ac_dim = env.action_space.n \
        if isinstance(env.action_space, gym.spaces.Discrete) \
        else env.action_space.shape[0]
    obs_dim = env.observation_space.shape[0]

    tf.reset_default_graph()

    policy = nn.GaussianPolicy(
        action_dim=ac_dim,
        reparameterize=json_params['algorithm_params']['reparameterize'],
        **json_params['policy_params'])
    policy.build([None, obs_dim])
    # saver = tf.train.Saver()

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True  # may need if using GPU
    sess = tf.Session(config=tf_config)
    with sess.as_default():
        # policy = saver.restore(sess, os.path.join(dir, 'policy.h5'))
        # policy = load_model(os.path.join(dir, 'policy.h5'),
        #                    custom_objects={'GaussianPolicy': nn.GaussianPolicy,
        #                                    'DistributionLayer': nn.DistributionLayer})
        policy.load_weights(os.path.join(dir, 'policy.h5'))
        for e in range(args.n_experiments):
            seed = args.seed + 10 * e
            print('Running experiment with seed %d' % seed)

            # Seed with the per-experiment seed computed above
            tf.set_random_seed(seed)
            np.random.seed(seed)
            env.seed(seed)

            uid = 'seed_' + str(seed) + '_' + time.strftime(
                "%d-%m-%Y_%H-%M-%S")
            logz.configure_output_dir(dir,
                                      file='test-run' + '_' + uid + '.txt',
                                      check=False)

            # Wrap a fresh Monitor around the base env for each experiment
            # instead of re-wrapping the already monitored env in place
            monitored_env = gym.wrappers.Monitor(env, args.exp_name,
                                                 force=True, uid=uid)
            obs = monitored_env.reset()
            for istep in range(max_steps):
                action = policy.eval(obs)
                obs, reward, done, _ = monitored_env.step(action)
                if args.render:
                    monitored_env.render()
                time.sleep(1e-3)
                logz.log_tabular('step', istep)
                for i, ob in enumerate(obs):
                    logz.log_tabular('observation_' + str(i), ob)
                for j, act in enumerate(action):
                    logz.log_tabular('action_' + str(j), act)
                logz.log_tabular('reward', reward)
                logz.dump_tabular()
                if done:
                    break
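
A minimal driver for test_run might look like the sketch below. It is only an illustration: the flag names (exp_name, n_experiments, seed, render) mirror the attributes the function reads from args, and the positional logdir is assumed to contain the params.json and policy.h5 files loaded above.

# Hypothetical driver for test_run; the directory is assumed to hold the
# params.json and policy.h5 files that the function loads.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('logdir', type=str)
    parser.add_argument('--exp_name', type=str, default='sac_test')
    parser.add_argument('--n_experiments', '-e', type=int, default=1)
    parser.add_argument('--seed', type=int, default=3)
    parser.add_argument('--render', action='store_true')
    args = parser.parse_args()

    test_run(args, args.logdir, max_steps=2000)
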
Example #2
import gym
import numpy as np
import tensorflow as tf

# Project-local helpers assumed to be in scope: logz, nn, utils, the SAC
# algorithm class, CustomHumanoidEnv, and LunarLanderContinuous.


def train_SAC(env_name,
              exp_name,
              seed,
              logdir,
              two_qf=False,
              reparam=False,
              nepochs=100,
              paras={}):
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
        'Toddler': 0.05,
        'Adult': 0.05,
        'LunarLander': 0.1
    }.get(env_name, 0.2)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': 256,
        'discount': 0.99,
        'learning_rate': 1e-3,
        'reparameterize': reparam,
        'tau': 0.01,
        'epoch_length': 1000,
        'n_epochs': nepochs,  # 500
        'two_qf': two_qf,
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': 1000,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }

    value_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    q_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    policy_params = {
        'hidden_layer_sizes': (128, 128),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params
    }
    logz.save_params(params)

    if env_name in ('Toddler', 'Adult'):
        env = CustomHumanoidEnv(template=env_name)
    elif env_name == 'LunarLander':
        env = LunarLanderContinuous(**paras)
    else:
        env = gym.envs.make(env_name)

    # Observation and action sizes
    ac_dim = env.action_space.n \
        if isinstance(env.action_space, gym.spaces.Discrete) \
        else env.action_space.shape[0]

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(name='value_function',
                                      **value_function_params)
    target_value_function = nn.ValueFunction(name='target_value_function',
                                             **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=ac_dim,
        reparameterize=algorithm_params['reparameterize'],
        **policy_params)

    samplers = []
    replay_pools = []

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=(ac_dim, ),
        **replay_pool_params)
    sampler.initialize(env, policy, replay_pool)
    samplers.append(sampler)
    replay_pools.append(replay_pool)

    algorithm = SAC(**algorithm_params)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True  # may need if using GPU
    with tf.Session(config=tf_config):
        algorithm.build(env=env,
                        policy=policy,
                        q_function=q_function,
                        q_function2=q_function2,
                        value_function=value_function,
                        target_value_function=target_value_function)
        # algorithm_params.get('n_epochs', 1000)
        for epoch in algorithm.train(sampler,
                                     n_epochs=algorithm_params.get(
                                         'n_epochs', 100)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
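
For completeness, a command-line entry point around train_SAC could be sketched as follows; the flag names and the logdir layout are illustrative assumptions, not taken from the original repository.

# Hypothetical CLI wrapper for train_SAC; flag names and the logdir layout
# are illustrative assumptions.
import argparse
import os
import time

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', type=str, default='HalfCheetah-v2')
    parser.add_argument('--exp_name', type=str, default='sac')
    parser.add_argument('--seed', type=int, default=3)
    parser.add_argument('--two_qf', action='store_true')
    parser.add_argument('--reparam', action='store_true')
    parser.add_argument('--nepochs', type=int, default=100)
    args = parser.parse_args()

    logdir = os.path.join('data',
                          args.exp_name + '_' + args.env_name + '_' +
                          time.strftime('%d-%m-%Y_%H-%M-%S'))
    train_SAC(env_name=args.env_name,
              exp_name=args.exp_name,
              seed=args.seed,
              logdir=logdir,
              two_qf=args.two_qf,
              reparam=args.reparam,
              nepochs=args.nepochs)
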