def main():
    # Hyperparameters for DDPG on HalfCheetah-v2.
    params = {
        'actor_learning_rate': 1e-4,
        'critic_learning_rate': 1e-3,
        'gamma': 0.99,               # discount factor
        'tau': 0.001,                # soft target-update rate
        'sigma': 0.2,                # exploration noise std
        'num_epochs': 500,
        'num_episodes': 20,
        'replay_size': 1000000,
        'num_train_steps': 50,
        'replay_init_size': 1000,
        'batch_size': 64,
        'render_train': False,
        'restore': False,
        'env': 'HalfCheetah-v2'
    }
    agent = DDPG(params)
    # agent.train()
    agent.test()
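# Illustrative sketch only (not this repo's DDPG implementation): the 'tau' and
# 'sigma' entries above typically drive the two updates shown below. 'tau'
# controls the Polyak (soft) target-network update, and 'sigma' is the std of
# the Gaussian exploration noise added to the actor's action. The weight-list
# representation and the [-1, 1] clip range are assumptions for the example.
import numpy as np

def soft_update(target_weights, online_weights, tau=0.001):
    """Blend online-network weights into the target network (Polyak averaging)."""
    return [tau * w + (1.0 - tau) * tw
            for w, tw in zip(online_weights, target_weights)]

def noisy_action(action, sigma=0.2, low=-1.0, high=1.0):
    """Add Gaussian exploration noise and keep the action inside the env bounds."""
    return np.clip(action + sigma * np.random.randn(*action.shape), low, high)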
def test(agent, trial_dir, test_episode, visual_flag, submit_flag):
    pid = os.getpid()
    logger, _ = prepare_for_logging("pid_{}".format(pid), False)
    logger.info("trial_dir={}".format(trial_dir))
    if not os.path.exists(trial_dir):
        logger.info("trial_dir does not exist")
        return

    # create environment
    env = NIPS(visualize=visual_flag)

    # load config
    with open(os.path.join(trial_dir, "config.pk"), "rb") as f:
        config = pickle.load(f)

    if agent == 'DDPG':
        config["scale_action"] = scale_action

        # observation processor
        if "ob_processor" not in config or config["ob_processor"] == "dummy":
            ob_processor = ObservationProcessor()
        elif config["ob_processor"] == "2ndorder":
            ob_processor = SecondOrderAugmentor()
        else:
            ob_processor = BodySpeedAugmentor()
        config["ob_aug_dim"] = ob_processor.get_aug_dim()
        util.print_settings(logger, config, env)

        # create random process
        oup = create_rand_process(env, config)

        # create replay buffer
        memory = create_memory(env, config)

        # create ddpg agent
        agent = DDPG(env, memory, oup, ob_processor, config)
        agent.build_nets(actor_hiddens=config["actor_hiddens"],
                         scale_action=config["scale_action"],
                         critic_hiddens=config["critic_hiddens"])

        # load weights
        paths = {}
        if test_episode > 0:
            paths["actor"] = "actor_{}.h5".format(test_episode)
            paths["critic"] = "critic_{}.h5".format(test_episode)
            paths["target"] = "target_{}.h5".format(test_episode)
        else:
            paths["actor"] = "actor.h5"
            paths["critic"] = "critic.h5"
            paths["target"] = "target.h5"
        paths = {k: os.path.join(trial_dir, v) for k, v in paths.iteritems()}
        logger.info("Paths to models: {}".format(paths))
        agent.load_models(paths)

    elif agent == 'TRPO':
        def ob_processor_maker():
            if config["ob_processor"] == "normal":
                return ObservationProcessor()
            elif config["ob_processor"] == "2ndorder":
                return SecondOrderAugmentor()
            elif config['ob_processor'] == 'bodyspeed':
                return BodySpeedAugmentor()
            else:
                raise ValueError('invalid ob processor type')

        config = {
            "agent": 'TRPO',
            "batch_size": 5000,
            "n_envs": 16,
            "n_iters": 5000,
            "ob_processor": "bodyspeed",
            # "hidden_nonlinearity": "relu",
            # "action_nonlinearity": "tanh",
            # "policy_hiddens": [128, 128, 64, 64],
            # "baseline_hiddens": [128, 128, 64, 64],
            "policy_hiddens": [256, 128, 64],
            "baseline_hiddens": [256, 128, 64],
            "hidden_nonlinearity": "tanh",
            "action_nonlinearity": None,
        }
        agent = TRPO(
            env,
            env_maker=None,
            logger=logger,
            log_dir=None,
            ob_processor_maker=ob_processor_maker,
            policy_hiddens=config['policy_hiddens'],
            baseline_hiddens=config['baseline_hiddens'],
            hidden_nonlinearity=config['hidden_nonlinearity'],
            action_nonlinearity=config['action_nonlinearity'],
            n_envs=config['n_envs'],
            batch_size=config['batch_size'],
            n_iters=config['n_iters'],
        )
        agent.load_models(trial_dir)

    else:
        raise ValueError('invalid agent type')

    if submit_flag:
        submit(agent, logger)
    else:
        rewards = []
        for i in xrange(10):
            steps, reward = agent.test(max_steps=1000)
            logger.info("episode={}, steps={}, reward={}".format(
                i, steps, reward))
            rewards.append(reward)
        logger.info("avg_reward={}".format(np.mean(rewards)))
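# Hypothetical command-line entry point for test() above; the flag names and
# defaults are assumptions for illustration and are not part of the original
# script.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Evaluate a trained agent.")
    parser.add_argument("--agent", choices=["DDPG", "TRPO"], default="DDPG")
    parser.add_argument("--trial_dir", required=True,
                        help="directory holding config.pk and saved weights")
    parser.add_argument("--episode", type=int, default=0,
                        help="checkpoint episode to load (0 = final weights)")
    parser.add_argument("--visualize", action="store_true")
    parser.add_argument("--submit", action="store_true")
    args = parser.parse_args()

    test(args.agent, args.trial_dir, args.episode, args.visualize, args.submit)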