def test_run(args, dir, max_steps=2000):
    with open(os.path.join(dir, 'params.json')) as f:
        json_params = json.load(f)

    env = gym.make(json_params['env_name'])

    # Observation and action sizes
    ac_dim = env.action_space.n \
        if isinstance(env.action_space, gym.spaces.Discrete) \
        else env.action_space.shape[0]
    obs_dim = env.observation_space.shape[0]

    # Rebuild the policy network with the same hyperparameters used in training
    tf.reset_default_graph()
    policy = nn.GaussianPolicy(
        action_dim=ac_dim,
        reparameterize=json_params['algorithm_params']['reparameterize'],
        **json_params['policy_params'])
    policy.build([None, obs_dim])

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True  # may need if using GPU
    sess = tf.Session(config=tf_config)
    with sess.as_default():
        # Restore the trained policy weights saved by train_SAC
        policy.load_weights(os.path.join(dir, 'policy.h5'))

        for e in range(args.n_experiments):
            seed = args.seed + 10 * e
            print('Running experiment with seed %d' % seed)
            tf.set_random_seed(seed)
            np.random.seed(seed)
            env.seed(seed)

            uid = 'seed_' + str(seed) + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
            logz.configure_output_dir(dir,
                                      file='test-run_' + uid + '.txt',
                                      check=False)
            env = gym.wrappers.Monitor(env, args.exp_name, force=True, uid=uid)

            obs = env.reset()
            for istep in range(max_steps):
                action = policy.eval(obs)
                obs, reward, done, _ = env.step(action)
                if args.render:
                    env.render()
                    time.sleep(1e-3)

                # Log one row per environment step
                logz.log_tabular('step', istep)
                for i, ob in enumerate(obs):
                    logz.log_tabular('observation_' + str(i), ob)
                for j, act in enumerate(action):
                    logz.log_tabular('action_' + str(j), act)
                logz.log_tabular('reward', reward)
                logz.dump_tabular()

                if done:
                    break
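# A minimal usage sketch (not part of the original script): test_run only
# needs `args` to expose n_experiments, seed, render, and exp_name, and `dir`
# to contain the params.json / policy.h5 pair written by train_SAC. The
# argparse flag names below are hypothetical.
def _example_test_run_cli():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('exp_name', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--n_experiments', '-e', type=int, default=1)
    args = parser.parse_args()
    # Point at the log directory produced by train_SAC for this experiment
    test_run(args, dir=os.path.join('data', args.exp_name), max_steps=2000)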
def train_SAC(env_name, exp_name, seed, logdir,
              two_qf=False, reparam=False, nepochs=100, paras=None):
    # Avoid a mutable default argument for the extra environment kwargs
    paras = paras if paras is not None else {}

    # Per-environment entropy temperature; fall back to 0.2 for unknown envs
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
        'Toddler': 0.05,
        'Adult': 0.05,
        'LunarLander': 0.1,
    }.get(env_name, 0.2)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': 256,
        'discount': 0.99,
        'learning_rate': 1e-3,
        'reparameterize': reparam,
        'tau': 0.01,
        'epoch_length': 1000,
        'n_epochs': nepochs,
        'two_qf': two_qf,
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': 1000,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }
    value_function_params = {
        'hidden_layer_sizes': (128, 128),
    }
    q_function_params = {
        'hidden_layer_sizes': (128, 128),
    }
    policy_params = {
        'hidden_layer_sizes': (128, 128),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params,
    }
    logz.save_params(params)

    if env_name == 'Toddler' or env_name == 'Adult':
        env = CustomHumanoidEnv(template=env_name)
    elif env_name == 'LunarLander':
        env = LunarLanderContinuous(**paras)
    else:
        env = gym.envs.make(env_name)

    # Observation and action sizes
    ac_dim = env.action_space.n \
        if isinstance(env.action_space, gym.spaces.Discrete) \
        else env.action_space.shape[0]

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(name='value_function',
                                      **value_function_params)
    target_value_function = nn.ValueFunction(name='target_value_function',
                                             **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=ac_dim,
        reparameterize=algorithm_params['reparameterize'],
        **policy_params)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=(ac_dim, ),
        **replay_pool_params)
    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True  # may need if using GPU

    with tf.Session(config=tf_config):
        algorithm.build(env=env,
                        policy=policy,
                        q_function=q_function,
                        q_function2=q_function2,
                        value_function=value_function,
                        target_value_function=target_value_function)

        for epoch in algorithm.train(
                sampler, n_epochs=algorithm_params.get('n_epochs', 100)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
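# A minimal launch sketch (assumed, not from the original file): builds a log
# directory under data/<exp_name>_<env_name>/<seed> and calls train_SAC once.
# The flag names and directory layout are illustrative assumptions.
def _example_train_sac_cli():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', type=str, default='HalfCheetah-v2')
    parser.add_argument('--exp_name', type=str, default='sac')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--two_qf', action='store_true')
    parser.add_argument('--reparam', action='store_true')
    parser.add_argument('--n_epochs', type=int, default=100)
    args = parser.parse_args()

    logdir = os.path.join('data', args.exp_name + '_' + args.env_name,
                          str(args.seed))
    train_SAC(env_name=args.env_name,
              exp_name=args.exp_name,
              seed=args.seed,
              logdir=logdir,
              two_qf=args.two_qf,
              reparam=args.reparam,
              nepochs=args.n_epochs)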