def func(algo=algo, locals=locals):
    """Run a short model-free experiment with a scheduled learning rate.

    Sets the global ``DEFAULT_EXPERIMENT_END_POINT`` to 500 agent training
    samples, builds an agent and an experiment around ``algo``, attaches a
    ``LinearScheduler`` to the algorithm's ``LEARNING_RATE`` parameter, runs
    the experiment and then checks the sample counters.

    ``self`` is not a parameter; it is taken from the enclosing scope.

    :param algo: algorithm under test. When falsy, both ``algo`` and
        ``locals`` are replaced by the pair returned from
        ``self.create_dqn()``.
    :param locals: mapping that provides the ``'env_spec'`` and ``'env'``
        entries. The name shadows the builtin but is kept so existing
        keyword callers continue to work.
    """
    expected_train_samples = 500
    GlobalConfig().set(
        'DEFAULT_EXPERIMENT_END_POINT',
        dict(TOTAL_AGENT_TRAIN_SAMPLE_COUNT=expected_train_samples,
             TOTAL_AGENT_TEST_SAMPLE_COUNT=None,
             TOTAL_AGENT_UPDATE_COUNT=None))
    if not algo:
        algo, locals = self.create_dqn()
    env_spec = locals['env_spec']
    env = locals['env']
    agent = self.create_agent(env=env,
                              algo=algo,
                              name='agent',
                              eps=self.create_eps(env_spec)[0],
                              env_spec=env_spec)[0]
    exp = self.create_exp(name='model_free', env=env, agent=agent)
    # The schedule length is read back from the global end point set above,
    # so the learning rate schedule spans the configured training budget.
    algo.parameters.set_scheduler(
        param_key='LEARNING_RATE',
        to_tf_ph_flag=True,
        scheduler=LinearScheduler(
            t_fn=exp.TOTAL_ENV_STEP_TRAIN_SAMPLE_COUNT,
            schedule_timesteps=GlobalConfig().DEFAULT_EXPERIMENT_END_POINT[
                'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'],
            final_p=0.0001,
            initial_p=0.01))
    exp.run()
    self.assertEqual(exp.TOTAL_AGENT_TEST_SAMPLE_COUNT(),
                     exp.TOTAL_ENV_STEP_TEST_SAMPLE_COUNT())
    # assertEqual(first, second, msg): a third positional argument is only the
    # failure message, so the expected total must be asserted on its own.
    self.assertEqual(exp.TOTAL_AGENT_TRAIN_SAMPLE_COUNT(),
                     exp.TOTAL_ENV_STEP_TRAIN_SAMPLE_COUNT())
    self.assertEqual(exp.TOTAL_AGENT_TRAIN_SAMPLE_COUNT(),
                     expected_train_samples)
def acrobot_task_fn():
    """Build and run the DQN benchmark experiment on ``Acrobot-v1``.

    All hyper-parameters come from ``ACROBOT_BENCHMARK_CONFIG_DICT``: the
    experiment end point, the Q-network and DQN settings, the epsilon-greedy
    schedule and the train/test flow configuration.
    """
    cfg = ACROBOT_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       cfg['DEFAULT_EXPERIMENT_END_POINT'])

    def train_sample_count():
        # Shared clock for both the exploration schedule and the flow.
        return get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT')

    env = make('Acrobot-v1')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    q_name = name + '_mlp_q'
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=q_name,
                              name=q_name,
                              **cfg['MLPQValueFunction'])
    dqn = DQN(env_spec=env_spec,
              name=name + '_dqn',
              value_func=mlp_q,
              **cfg['DQN'])

    eps_cfg = cfg['EpsilonGreedy']
    prob_scheduler = LinearScheduler(t_fn=train_sample_count,
                                     **eps_cfg['LinearScheduler'])
    exploration = EpsilonGreedy(action_space=env_spec.action_space,
                                prob_scheduler=prob_scheduler,
                                **eps_cfg['config_or_config_dict'])
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=exploration)

    flow_cfg = cfg['TrainTestFlow']
    func_dict = {
        'test': {
            'func': agent.test,
            'args': [],
            'kwargs': {'sample_count': flow_cfg['TEST_SAMPLES_COUNT']},
        },
        'train': {
            'func': agent.train,
            'args': [],
            'kwargs': {},
        },
        'sample': {
            'func': agent.sample,
            'args': [],
            'kwargs': {
                'sample_count': flow_cfg['TRAIN_SAMPLES_COUNT'],
                'env': agent.env,
                'in_which_status': 'TRAIN',
                'store_flag': True,
            },
        },
    }
    flow = TrainTestFlow(
        train_sample_count_func=train_sample_count,
        config_or_config_dict=flow_cfg['config_or_config_dict'],
        func_dict=func_dict)

    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow,
                            name=name)
    experiment.run()
def test_eps_with_scheduler(self):
    """Check that EpsilonGreedy follows its LinearScheduler step by step.

    The scheduler's time function returns the module-level counter ``x``.
    On each of ten iterations the strategy predicts an action, the
    environment is stepped with it, the random-action probability is compared
    with the expected linear value for the current ``x``, and ``x`` is
    incremented.
    """
    # Applies to the whole function body, exactly as the declaration inside
    # the loop did.
    global x

    initial_p = 1.0
    final_p = 0.0
    n_steps = 10

    dqn, dqn_locals = self.create_dqn()
    env = dqn_locals['env']

    def current_step():
        global x
        return x

    dqn.init()
    scheduler = LinearScheduler(initial_p=initial_p,
                                t_fn=current_step,
                                schedule_timesteps=n_steps,
                                final_p=final_p)
    eps = EpsilonGreedy(action_space=dqn.env_spec.action_space,
                        prob_scheduler=scheduler,
                        init_random_prob=1.0)

    obs = env.reset()
    for _ in range(n_steps):
        action = eps.predict(obs=obs, sess=self.sess, batch_flag=False,
                             algo=dqn)
        # The transition is not used; the observation fed to predict stays
        # the one returned by reset().
        env.step(action=action)
        expected_prob = initial_p - (initial_p - final_p) / n_steps * x
        self.assertAlmostEqual(eps.parameters('random_prob_func')(),
                               expected_prob)
        x += 1
def task_fn():
    """Run a DQN example on ``Acrobot-v1`` that exercises several schedulers.

    The agent receives a ``PeriodicalEventSchedule`` as its
    ``algo_saving_scheduler`` and an ``EpsilonGreedy`` strategy driven by a
    ``PiecewiseScheduler``. After the experiment object is created, a
    ``LinearScheduler`` is attached to the DQN ``LEARNING_RATE`` parameter,
    and the experiment is then run.
    """

    def train_sample_count():
        # Clock shared by the saving schedule and the exploration schedule.
        return get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT')

    env = make('Acrobot-v1')
    name = 'example_scheduler_'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    hidden_layer = {
        "ACT": "RELU",
        "B_INIT_VALUE": 0.0,
        "NAME": "1",
        "N_UNITS": 16,
        "TYPE": "DENSE",
        "W_NORMAL_STDDEV": 0.03,
    }
    output_layer = {
        "ACT": "LINEAR",
        "B_INIT_VALUE": 0.0,
        "NAME": "OUPTUT",
        "N_UNITS": 1,
        "TYPE": "DENSE",
        "W_NORMAL_STDDEV": 0.03,
    }
    q_name = name + '_mlp_q'
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=q_name,
                              name=q_name,
                              mlp_config=[hidden_layer, output_layer])

    dqn_config = dict(REPLAY_BUFFER_SIZE=1000,
                      GAMMA=0.99,
                      BATCH_SIZE=10,
                      LEARNING_RATE=0.001,
                      TRAIN_ITERATION=1,
                      DECAY=0.5)
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dqn_config,
              name=name + '_dqn',
              value_func=mlp_q)

    saving_schedule = PeriodicalEventSchedule(t_fn=train_sample_count,
                                              trigger_every_step=20,
                                              after_t=10)
    exploration_schedule = PiecewiseScheduler(
        t_fn=train_sample_count,
        endpoints=((10, 0.3), (100, 0.1), (200, 0.0)),
        outside_value=0.0)
    exploration = EpsilonGreedy(action_space=env_spec.action_space,
                                prob_scheduler=exploration_schedule,
                                init_random_prob=0.5)
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  algo_saving_scheduler=saving_schedule,
                  exploration_strategy=exploration)

    sample_kwargs = {'sample_count': 100, 'env': agent.env,
                     'store_flag': True}
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), {}),
        test_func_and_args=(agent.test, (), {'sample_count': 10}),
        sample_func_and_args=(agent.sample, (), sample_kwargs))

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + 'experiment_debug')

    # The learning-rate schedule needs the experiment's counter, so it is
    # attached only after the experiment exists.
    end_point = GlobalConfig().DEFAULT_EXPERIMENT_END_POINT
    lr_schedule = LinearScheduler(
        t_fn=experiment.TOTAL_AGENT_TRAIN_SAMPLE_COUNT,
        schedule_timesteps=end_point['TOTAL_AGENT_TRAIN_SAMPLE_COUNT'],
        final_p=0.0001,
        initial_p=0.01)
    dqn.parameters.set_scheduler(param_key='LEARNING_RATE',
                                 scheduler=lr_schedule)
    experiment.run()
def task_fn():
    """Run the ``demo_exp`` DQN experiment on ``Acrobot-v1``.

    Builds a four-layer MLP Q-function, a DQN algorithm, an epsilon-greedy
    agent whose random probability follows a ``LinearScheduler``, and a
    train/test flow, then runs the resulting experiment.
    """

    def dense_layer(layer_name, activation, n_units):
        # Every layer shares the same type, bias init and weight stddev.
        return {
            "ACT": activation,
            "B_INIT_VALUE": 0.0,
            "NAME": layer_name,
            "N_UNITS": n_units,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03,
        }

    env = make('Acrobot-v1')
    name = 'demo_exp'
    # The spec is stored on the environment and then read back from it.
    env.env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)
    env_spec = env.env_spec

    layer_specs = (
        ("1", "TANH", 64),
        ("2", "TANH", 64),
        ("3", "RELU", 256),
        ("OUPTUT", "LINEAR", 1),
    )
    q_name = name + '_mlp_q'
    mlp_q = MLPQValueFunction(
        env_spec=env_spec,
        name_scope=q_name,
        name=q_name,
        mlp_config=[dense_layer(*spec) for spec in layer_specs])

    dqn_config = dict(REPLAY_BUFFER_SIZE=50000,
                      GAMMA=0.99,
                      BATCH_SIZE=32,
                      LEARNING_RATE=0.001,
                      TRAIN_ITERATION=1,
                      DECAY=0)
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dqn_config,
              name=name + '_dqn',
              value_func=mlp_q)

    prob_scheduler = LinearScheduler(
        t_fn=lambda: get_global_status_collect()(
            'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        schedule_timesteps=int(0.1 * 100000),
        initial_p=1.0,
        final_p=0.02)
    epsilon_greedy = EpsilonGreedy(action_space=env_spec.action_space,
                                   prob_scheduler=prob_scheduler,
                                   init_random_prob=0.1)
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=epsilon_greedy,
                  noise_adder=None)

    sample_kwargs = {'sample_count': 1, 'env': agent.env, 'store_flag': True}
    flow = create_train_test_flow(
        test_every_sample_count=1000,
        train_every_sample_count=1,
        start_test_after_sample_count=0,
        start_train_after_sample_count=10000,
        sample_func_and_args=(agent.sample, (), sample_kwargs),
        train_func_and_args=(agent.train, (), {}),
        test_func_and_args=(agent.test, (), {'sample_count': 1}))

    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow,
                            name=name)
    experiment.run()