def create_agent(self, algo, env, env_spec, eps=None, name='agent'):
    agent = Agent(
        env=env, env_spec=env_spec, algo=algo,
        noise_adder=AgentActionNoiseWrapper(
            noise=OUNoise(),
            action_weight_scheduler=LinearScheduler(
                t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                schedule_timesteps=100, final_p=1.0, initial_p=0.0),
            noise_weight_scheduler=LinearScheduler(
                t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                schedule_timesteps=100, final_p=0.0, initial_p=1.0)),
        name=name,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()('TOTAL_ENV_STEP_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20, after_t=10),
        exploration_strategy=eps)
    return agent, locals()
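# The two LinearSchedulers above describe complementary ramps: over the first 100 training
# samples the action weight rises 0 -> 1 while the injected OU-noise weight decays 1 -> 0.
# The sketch below is a plain-Python illustration of that interpolation only (it is not the
# library's scheduler implementation); 't' stands for TOTAL_AGENT_TRAIN_SAMPLE_COUNT.
def linear_value(t, schedule_timesteps, initial_p, final_p):
    # Clamp the progress fraction to [0, 1] and interpolate between initial_p and final_p.
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

for t in (0, 50, 100, 200):
    action_w = linear_value(t, schedule_timesteps=100, initial_p=0.0, final_p=1.0)
    noise_w = linear_value(t, schedule_timesteps=100, initial_p=1.0, final_p=0.0)
    print(t, round(action_w, 2), round(noise_w, 2))  # e.g. t=50 -> 0.5 and 0.5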
def mountaincar_task_fn():
    exp_config = MOUNTAINCAR_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT'])
    env = make('MountainCar-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    mlp_q = MLPQValueFunction(
        env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q',
        **exp_config['MLPQValueFunction'])
    dqn = DQN(
        env_spec=env_spec, name=name + '_dqn', value_func=mlp_q,
        **exp_config['DQN'])
    agent = Agent(
        env=env, env_spec=env_spec, algo=dqn, name=name + '_agent',
        exploration_strategy=EpsilonGreedy(
            action_space=env_spec.action_space,
            prob_scheduler=LinearScheduler(
                t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                **exp_config['EpsilonGreedy']['LinearScheduler']),
            **exp_config['EpsilonGreedy']['config_or_config_dict']))
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['TrainTestFlow']['config_or_config_dict'],
        func_dict={
            'test': {'func': agent.test, 'args': list(),
                     'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TEST_SAMPLES_COUNT'])},
            'train': {'func': agent.train, 'args': list(), 'kwargs': dict()},
            'sample': {'func': agent.sample, 'args': list(),
                       'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                                      env=agent.env, in_which_status='TRAIN', store_flag=True)},
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def validate(self, agent: Agent, *args, **kwargs):
    # Snapshot the previous per-model scores; without a copy the comparison below would read
    # the values that were just overwritten (assumes 'from copy import deepcopy' is available).
    old_result = deepcopy(self.result)
    self.validation_result = 0
    for a in range(len(self._dynamics_model)):
        individual_model = self._dynamics_model.model[a]
        env = individual_model.return_as_env()
        batch_data = agent.sample(
            env=env,
            sample_count=self.parameters('validation_trajectory_count'),
            sample_type='trajectory',
            store_flag=False)
        self.result[a] = batch_data.get_mean_of('reward')
        if self.result[a] > old_result[a]:
            self.validation_result += 1
    self.validation_result = self.validation_result / len(self._dynamics_model)
    return self.validation_result
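# validate() above scores the ensemble by the fraction of member models whose mean trajectory
# reward improved since the previous validation pass. A minimal, library-free illustration of
# that metric (the reward numbers here are made up for the example):
old_scores = [10.0, 12.0, 8.0]   # previous mean reward per ensemble member
new_scores = [11.0, 11.5, 9.0]   # mean reward sampled from each member this pass
improved = sum(1 for old, new in zip(old_scores, new_scores) if new > old)
validation_result = improved / len(new_scores)   # 2 of 3 members improved -> 0.666...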
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp_'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    mlp_v = MLPVValueFunc(
        env_spec=env_spec, name_scope=name + 'mlp_v', name=name + 'mlp_v',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16,
             "L1_NORM": 0.01, "L2_NORM": 0.01, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": 1,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}])
    policy = NormalDistributionMLPPolicy(
        env_spec=env_spec, name_scope=name + 'mlp_policy', name=name + 'mlp_policy',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "L1_NORM": 0.01, "L2_NORM": 0.01,
             "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT",
             "N_UNITS": env_spec.flat_action_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}],
        reuse=False)
    ppo = PPO(
        env_spec=env_spec,
        config_or_config_dict={
            "gamma": 0.995, "lam": 0.98, "policy_train_iter": 10, "value_func_train_iter": 10,
            "clipping_range": None, "beta": 1.0, "eta": 50, "log_var_init": -1.0,
            "kl_target": 0.003, "policy_lr": 0.01, "value_func_lr": 0.01,
            "value_func_train_batch_size": 10, "lr_multiplier": 1.0},
        value_func=mlp_v, stochastic_policy=policy, name=name + 'ppo')
    agent = Agent(
        env=env, env_spec=env_spec, algo=ppo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20, after_t=10),
        name=name + 'agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space, init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (),
                              dict(sample_count=100, env=agent.env, in_which_status='TRAIN', store_flag=True)))
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def task_fn():
    env = make('Acrobot-v1')
    name = 'demo_exp'
    env.env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    env_spec = env.env_spec
    mlp_q = MLPQValueFunction(
        env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q',
        mlp_config=[
            {"ACT": "TANH", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 64,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "TANH", "B_INIT_VALUE": 0.0, "NAME": "2", "N_UNITS": 64,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "3", "N_UNITS": 256,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": 1,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}])
    dqn = DQN(
        env_spec=env_spec,
        config_or_config_dict=dict(REPLAY_BUFFER_SIZE=50000, GAMMA=0.99, BATCH_SIZE=32,
                                   LEARNING_RATE=0.001, TRAIN_ITERATION=1, DECAY=0),
        name=name + '_dqn', value_func=mlp_q)
    epsilon_greedy = EpsilonGreedy(
        action_space=env_spec.action_space,
        prob_scheduler=LinearScheduler(
            t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            schedule_timesteps=int(0.1 * 100000), initial_p=1.0, final_p=0.02),
        init_random_prob=0.1)
    agent = Agent(env=env, env_spec=env_spec, algo=dqn, name=name + '_agent',
                  exploration_strategy=epsilon_greedy, noise_adder=None)
    flow = create_train_test_flow(
        test_every_sample_count=1000,
        train_every_sample_count=1,
        start_test_after_sample_count=0,
        start_train_after_sample_count=10000,
        sample_func_and_args=(agent.sample, (), dict(sample_count=1, env=agent.env, store_flag=True)),
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=1)))
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def pendulum_task_fn():
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT'])
    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec, name_scope=name + '_mlp_dyna', name=name + '_mlp_dyna',
        **exp_config['DynamicsModel'])
    algo = ModelPredictiveControl(
        dynamics_model=mlp_dyna,
        env_spec=env_spec,
        config_or_config_dict=exp_config['MPC'],
        name=name + '_mpc',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy'))
    algo.set_terminal_reward_function_for_dynamics_env(
        reward_func=REWARD_FUNC_DICT['Pendulum-v0'](),
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn))
    agent = Agent(env=env, env_spec=env_spec, algo=algo,
                  exploration_strategy=None, noise_adder=None, name=name + '_agent')
    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict={
            'train_dynamics': {'func': agent.train, 'args': list(), 'kwargs': dict()},
            'train_algo': None,
            'test_algo': {'func': agent.test, 'args': list(),
                          'kwargs': dict(sample_count=1, sample_trajectory_flag=True)},
            'test_dynamics': {'func': agent.algo.test_dynamics, 'args': list(),
                              'kwargs': dict(sample_count=100, env=env)},
            'sample_from_real_env': {'func': agent.sample, 'args': list(),
                                     'kwargs': dict(sample_count=10, env=agent.env,
                                                    in_which_status='TRAIN', store_flag=True)},
            'sample_from_dynamics_env': None,
            'train_algo_from_synthesized_data': None
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    mlp_q = MLPQValueFunction(
        env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": 1,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}])
    policy = DeterministicMLPPolicy(
        env_spec=env_spec, name_scope=name + '_mlp_policy', name=name + '_mlp_policy',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT",
             "N_UNITS": env_spec.flat_action_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}],
        reuse=False)
    ddpg = DDPG(
        env_spec=env_spec,
        config_or_config_dict={
            "REPLAY_BUFFER_SIZE": 10000, "GAMMA": 0.999,
            "CRITIC_LEARNING_RATE": 0.001, "ACTOR_LEARNING_RATE": 0.001,
            "DECAY": 0.5, "BATCH_SIZE": 50, "TRAIN_ITERATION": 1,
            "critic_clip_norm": 0.1, "actor_clip_norm": 0.1},
        value_func=mlp_q, policy=policy, name=name + '_ddpg', replay_buffer=None)
    agent = Agent(
        env=env, env_spec=env_spec, algo=ddpg,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20, after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space, init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (),
                              dict(sample_count=100, env=agent.env, in_which_status='TRAIN', store_flag=True)))
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec, name_scope=name + '_mlp_dyna', name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low, output_high=env_spec.obs_space.high,
        learning_rate=0.01,
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "L1_NORM": 0.0, "L2_NORM": 0.0,
             "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "L1_NORM": 0.0, "L2_NORM": 0.0,
             "N_UNITS": env_spec.flat_obs_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}])
    algo = ModelPredictiveControl(
        dynamics_model=mlp_dyna,
        env_spec=env_spec,
        config_or_config_dict=dict(SAMPLED_HORIZON=2, SAMPLED_PATH_NUM=5, dynamics_model_train_iter=10),
        name=name + '_mpc',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy'))
    algo.set_terminal_reward_function_for_dynamics_env(
        reward_func=RandomRewardFunc(name='reward_func'),
        terminal_func=RandomTerminalFunc(name='random_terminal'))
    agent = Agent(
        env=env, env_spec=env_spec, algo=algo, name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space, init_random_prob=0.5))
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict={
            "TEST_EVERY_SAMPLE_COUNT": 10,
            "TRAIN_EVERY_SAMPLE_COUNT": 10,
            "START_TRAIN_AFTER_SAMPLE_COUNT": 5,
            "START_TEST_AFTER_SAMPLE_COUNT": 5,
        },
        func_dict={
            'test': {'func': agent.test, 'args': list(), 'kwargs': dict(sample_count=10)},
            'train': {'func': agent.train, 'args': list(), 'kwargs': dict()},
            'sample': {'func': agent.sample, 'args': list(),
                       'kwargs': dict(sample_count=100, env=agent.env,
                                      in_which_status='TRAIN', store_flag=True)},
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    mlp_q = MLPQValueFunction(
        env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": 1,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}])
    policy = DeterministicMLPPolicy(
        env_spec=env_spec, name_scope=name + '_mlp_policy', name=name + '_mlp_policy',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT",
             "N_UNITS": env_spec.flat_action_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}],
        reuse=False)
    ddpg = DDPG(
        env_spec=env_spec,
        config_or_config_dict={
            "REPLAY_BUFFER_SIZE": 10000, "GAMMA": 0.999,
            "CRITIC_LEARNING_RATE": 0.001, "ACTOR_LEARNING_RATE": 0.001,
            "DECAY": 0.5, "BATCH_SIZE": 50, "TRAIN_ITERATION": 1,
            "critic_clip_norm": 0.1, "actor_clip_norm": 0.1},
        value_func=mlp_q, policy=policy, name=name + '_ddpg', replay_buffer=None)
    mlp_dyna_list = []
    for i in range(10):
        mlp_dyna = ContinuousMLPGlobalDynamicsModel(
            env_spec=env_spec,
            name_scope=name + '_mlp_dyna_{}'.format(i),
            name=name + '_mlp_dyna_{}'.format(i),
            learning_rate=0.01,
            state_input_scaler=RunningStandardScaler(dims=env_spec.flat_obs_dim),
            action_input_scaler=RunningStandardScaler(dims=env_spec.flat_action_dim),
            output_delta_state_scaler=RunningStandardScaler(dims=env_spec.flat_obs_dim),
            mlp_config=[
                {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "L1_NORM": 0.0, "L2_NORM": 0.0,
                 "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
                {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "L1_NORM": 0.0, "L2_NORM": 0.0,
                 "N_UNITS": env_spec.flat_obs_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}])
        mlp_dyna_list.append(mlp_dyna)
    dyna_ensemble_model = ModelEnsemble(n_models=10, model=mlp_dyna_list,
                                        prediction_type='random', env_spec=env_spec)
    algo = ModelEnsembleAlgo(
        env_spec=env_spec, model_free_algo=ddpg, dynamics_model=dyna_ensemble_model,
        config_or_config_dict=dict(
            dynamics_model_train_iter=10,
            model_free_algo_train_iter=10,
            validation_trajectory_count=2))
    # For example purposes only: we use the Pendulum reward function and a terminal function
    # with a fixed episode length so the learned dynamics can act as an environment.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=PendulumRewardFunc())
    agent = Agent(
        env=env, env_spec=env_spec, algo=algo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=200, after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space, init_random_prob=0.5))
    # We can easily reuse the Dyna training flow to implement the model-ensemble training flow.
    flow = create_dyna_flow(
        train_algo_func=(agent.train, (), dict(state='state_agent_training')),
        train_algo_from_synthesized_data_func=(agent.train, (), dict(state='state_agent_training')),
        train_dynamics_func=(agent.train, (), dict(state='state_dynamics_training')),
        test_algo_func=(agent.test, (), dict(sample_count=10)),
        test_dynamics_func=(agent.algo.test_dynamics, (), dict(sample_count=10, env=env)),
        sample_from_real_env_func=(agent.sample, (),
                                   dict(sample_count=10, env=agent.env, store_flag=True)),
        sample_from_dynamics_env_func=(agent.sample, (),
                                       dict(sample_count=10, env=agent.algo.dynamics_env, store_flag=True)),
        # Set these large enough that the agent only trains on data from the dynamics env.
        train_algo_every_real_sample_count_by_data_from_real_env=100,
        train_algo_every_real_sample_count_by_data_from_dynamics_env=100,
        test_algo_every_real_sample_count=100,
        test_dynamics_every_real_sample_count=100,
        train_dynamics_ever_real_sample_count=100,
        start_train_algo_after_sample_count=1,
        start_train_dynamics_after_sample_count=1,
        start_test_algo_after_sample_count=1,
        start_test_dynamics_after_sample_count=1,
        warm_up_dynamics_samples=100)
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name + '_exp')
    experiment.run()
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    mlp_q = MLPQValueFunction(
        env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": 1,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}])
    policy = DeterministicMLPPolicy(
        env_spec=env_spec, name_scope=name + '_mlp_policy', name=name + '_mlp_policy',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT",
             "N_UNITS": env_spec.flat_action_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}],
        reuse=False)
    ddpg = DDPG(
        env_spec=env_spec,
        config_or_config_dict={
            "REPLAY_BUFFER_SIZE": 10000, "GAMMA": 0.999,
            "Q_NET_L1_NORM_SCALE": 0.01, "Q_NET_L2_NORM_SCALE": 0.01,
            "CRITIC_LEARNING_RATE": 0.001, "ACTOR_LEARNING_RATE": 0.001,
            "DECAY": 0.5, "BATCH_SIZE": 50, "TRAIN_ITERATION": 1,
            "critic_clip_norm": 0.1, "actor_clip_norm": 0.1},
        value_func=mlp_q, policy=policy, name=name + '_ddpg', replay_buffer=None)
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec, name_scope=name + '_mlp_dyna', name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low, output_high=env_spec.obs_space.high,
        learning_rate=0.01,
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "L1_NORM": 0.0, "L2_NORM": 0.0,
             "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "L1_NORM": 0.0, "L2_NORM": 0.0,
             "N_UNITS": env_spec.flat_obs_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}])
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    # For examples only, we use a random reward function and a terminal function with a fixed episode length.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=RandomRewardFunc())
    agent = Agent(
        env=env, env_spec=env_spec, algo=algo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20, after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space, init_random_prob=0.5))
    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict={
            "TRAIN_ALGO_EVERY_REAL_SAMPLE_COUNT_FROM_REAL_ENV": 10,
            "TRAIN_ALGO_EVERY_REAL_SAMPLE_COUNT_FROM_DYNAMICS_ENV": 10,
            "TEST_ALGO_EVERY_REAL_SAMPLE_COUNT": 10,
            "TEST_DYNAMICS_EVERY_REAL_SAMPLE_COUNT": 10,
            "TRAIN_DYNAMICS_EVERY_REAL_SAMPLE_COUNT": 10,
            "START_TRAIN_ALGO_AFTER_SAMPLE_COUNT": 1,
            "START_TRAIN_DYNAMICS_AFTER_SAMPLE_COUNT": 1,
            "START_TEST_ALGO_AFTER_SAMPLE_COUNT": 1,
            "START_TEST_DYNAMICS_AFTER_SAMPLE_COUNT": 1,
            "WARM_UP_DYNAMICS_SAMPLES": 1
        },
        func_dict={
            'train_algo': {'func': agent.train, 'args': list(),
                           'kwargs': dict(state='state_agent_training')},
            'train_algo_from_synthesized_data': {'func': agent.train, 'args': list(),
                                                 'kwargs': dict(state='state_agent_training')},
            'train_dynamics': {'func': agent.train, 'args': list(),
                               'kwargs': dict(state='state_dynamics_training')},
            'test_algo': {'func': agent.test, 'args': list(), 'kwargs': dict(sample_count=10)},
            'test_dynamics': {'func': agent.algo.test_dynamics, 'args': list(),
                              'kwargs': dict(sample_count=10, env=env)},
            'sample_from_real_env': {'func': agent.sample, 'args': list(),
                                     'kwargs': dict(sample_count=10, env=agent.env,
                                                    in_which_status='TRAIN', store_flag=True)},
            'sample_from_dynamics_env': {'func': agent.sample, 'args': list(),
                                         'kwargs': dict(sample_count=10, env=agent.algo.dynamics_env,
                                                        in_which_status='TRAIN', store_flag=True)}
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name + '_exp')
    experiment.run()
def mountiancar_task_fn():
    exp_config = MOUNTAIN_CAR_CONTINUOUS_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT'])
    env = make('MountainCarContinuous-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    mlp_q = MLPQValueFunction(
        env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q',
        **exp_config['MLPQValueFunction'])
    policy = DeterministicMLPPolicy(
        env_spec=env_spec, name_scope=name + '_mlp_policy', name=name + '_mlp_policy',
        output_low=env_spec.action_space.low, output_high=env_spec.action_space.high,
        **exp_config['DeterministicMLPPolicy'], reuse=False)
    ddpg = DDPG(env_spec=env_spec, policy=policy, value_func=mlp_q, name=name + '_ddpg',
                **exp_config['DDPG'])
    n_actions = env.action_space.shape[0]
    agent = Agent(
        env=env, env_spec=env_spec, algo=ddpg, exploration_strategy=None,
        noise_adder=AgentActionNoiseWrapper(
            noise=OrnsteinUhlenbeckActionNoise(mu=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions)),
            noise_weight_scheduler=ConstantScheduler(value=1),
            action_weight_scheduler=ConstantScheduler(value=1.0)),
        reset_noise_every_terminal_state=True,
        name=name + '_agent')
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['TrainTestFlow']['config_or_config_dict'],
        func_dict={
            'test': {'func': agent.test, 'args': list(),
                     'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TEST_SAMPLES_COUNT'])},
            'train': {'func': agent.train, 'args': list(), 'kwargs': dict()},
            'sample': {'func': agent.sample, 'args': list(),
                       'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                                      env=agent.env, in_which_status='TRAIN', store_flag=True)},
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def task_fn():
    env = make('Acrobot-v1')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    mlp_q = MLPQValueFunction(
        env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": 1,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}])
    dqn = DQN(
        env_spec=env_spec,
        config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000, GAMMA=0.99, BATCH_SIZE=10,
                                   Q_NET_L1_NORM_SCALE=0.001, Q_NET_L2_NORM_SCALE=0.001,
                                   LEARNING_RATE=0.01, TRAIN_ITERATION=1, DECAY=0.5),
        name=name + '_dqn', value_func=mlp_q)
    agent = Agent(
        env=env, env_spec=env_spec, algo=dqn, name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space, init_random_prob=0.5))
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict={
            "TEST_EVERY_SAMPLE_COUNT": 10,
            "TRAIN_EVERY_SAMPLE_COUNT": 10,
            "START_TRAIN_AFTER_SAMPLE_COUNT": 5,
            "START_TEST_AFTER_SAMPLE_COUNT": 5,
        },
        func_dict={
            'test': {'func': agent.test, 'args': list(), 'kwargs': dict(sample_count=10)},
            'train': {'func': agent.train, 'args': list(), 'kwargs': dict()},
            'sample': {'func': agent.sample, 'args': list(),
                       'kwargs': dict(sample_count=100, env=agent.env,
                                      in_which_status='TRAIN', store_flag=True)},
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def task_fn():
    # create the gym environment with the make function
    env = make('Pendulum-v0')
    # give your experiment a name, which is used to generate the log path etc.
    name = 'demo_exp'
    # construct the environment specification
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    # construct the neural network to approximate the Q function of DDPG
    mlp_q = MLPQValueFunction(
        env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": 1,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}])
    # construct the neural network to approximate the policy of DDPG
    policy = DeterministicMLPPolicy(
        env_spec=env_spec, name_scope=name + '_mlp_policy', name=name + '_mlp_policy',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT",
             "N_UNITS": env_spec.flat_action_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}],
        reuse=False)
    # construct the DDPG algorithm
    ddpg = DDPG(
        env_spec=env_spec,
        config_or_config_dict={
            "REPLAY_BUFFER_SIZE": 10000, "GAMMA": 0.999,
            "CRITIC_LEARNING_RATE": 0.001, "ACTOR_LEARNING_RATE": 0.001,
            "DECAY": 0.5, "BATCH_SIZE": 50, "TRAIN_ITERATION": 1,
            "critic_clip_norm": 0.1, "actor_clip_norm": 0.1},
        value_func=mlp_q, policy=policy, name=name + '_ddpg', replay_buffer=None)
    # construct a neural-network-based global dynamics model to approximate the state
    # transition of the environment
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec, name_scope=name + '_mlp_dyna', name=name + '_mlp_dyna',
        learning_rate=0.01,
        state_input_scaler=RunningStandardScaler(dims=env_spec.flat_obs_dim),
        action_input_scaler=RunningStandardScaler(dims=env_spec.flat_action_dim),
        output_delta_state_scaler=RunningStandardScaler(dims=env_spec.flat_obs_dim),
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "L1_NORM": 0.0, "L2_NORM": 0.0,
             "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "L1_NORM": 0.0, "L2_NORM": 0.0,
             "N_UNITS": env_spec.flat_obs_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}])
    # finally, construct the Dyna algorithm from the model-free algorithm DDPG and the NN dynamics model
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    # To make the NN-based dynamics model a proper environment that can serve as a sampling
    # source for DDPG, a reward function and a terminal function need to be set.
    # For examples only, we use a random reward function and a terminal function with a fixed episode length.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=RandomRewardFunc())
    # construct the agent, with an additional exploration strategy if needed
    agent = Agent(
        env=env, env_spec=env_spec, algo=algo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20, after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space, init_random_prob=0.5))
    # construct the training flow, called the Dyna flow.
    # It defines how the training proceeds, and the terminal condition.
    flow = create_dyna_flow(
        train_algo_func=(agent.train, (), dict(state='state_agent_training')),
        train_algo_from_synthesized_data_func=(agent.train, (), dict(state='state_agent_training')),
        train_dynamics_func=(agent.train, (), dict(state='state_dynamics_training')),
        test_algo_func=(agent.test, (), dict(sample_count=1)),
        test_dynamics_func=(agent.algo.test_dynamics, (), dict(sample_count=10, env=env)),
        sample_from_real_env_func=(agent.sample, (),
                                   dict(sample_count=10, env=agent.env, store_flag=True)),
        sample_from_dynamics_env_func=(agent.sample, (),
                                       dict(sample_count=10, env=agent.algo.dynamics_env, store_flag=True)),
        train_algo_every_real_sample_count_by_data_from_real_env=40,
        train_algo_every_real_sample_count_by_data_from_dynamics_env=40,
        test_algo_every_real_sample_count=40,
        test_dynamics_every_real_sample_count=40,
        train_dynamics_ever_real_sample_count=20,
        start_train_algo_after_sample_count=1,
        start_train_dynamics_after_sample_count=1,
        start_test_algo_after_sample_count=1,
        start_test_dynamics_after_sample_count=1,
        warm_up_dynamics_samples=1)
    # construct the experiment
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name + '_exp')
    # run!
    experiment.run()
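# A task function like the one above is normally not called directly; it is handed to an
# experiment runner so that logging and the global status collector are set up first. The
# entry point sketched below (single_exp_runner from baconian.core.experiment_runner, plus
# the 'DEFAULT_LOG_PATH' GlobalConfig key) follows the usual Baconian example layout but is
# an assumption here; check it against your installed version. The end-point key
# 'TOTAL_AGENT_TRAIN_SAMPLE_COUNT' is the same one the flows above poll.
from baconian.core.experiment_runner import single_exp_runner
from baconian.config.global_config import GlobalConfig

if __name__ == '__main__':
    GlobalConfig().set('DEFAULT_LOG_PATH', './log_path')
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       dict(TOTAL_AGENT_TRAIN_SAMPLE_COUNT=500))
    single_exp_runner(task_fn, del_if_log_path_existed=True)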
def pendulum_task_fn():
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT'])
    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec, name_scope=name + '_mlp_dyna', name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low, output_high=env_spec.obs_space.high,
        **exp_config['DynamicsModel'])
    dyna_env = DynamicsEnvWrapper(mlp_dyna)
    dyna_env.set_terminal_reward_func(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=dyna_env.total_step_count_fn),
        reward_func=REWARD_FUNC_DICT['Pendulum-v0']())
    policy = iLQRPolicy(
        env_spec=env_spec, **exp_config['ILQR'], dynamics=dyna_env,
        cost_fn=RewardFuncCostWrapper(reward_func=REWARD_FUNC_DICT['Pendulum-v0']()))
    algo = iLQRAlogWrapper(policy=policy, env_spec=env_spec, dynamics_env=dyna_env)
    agent = Agent(env=env, env_spec=env_spec, algo=algo,
                  exploration_strategy=None, noise_adder=None, name=name + '_agent')
    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict={
            'train_dynamics': {'func': agent.train, 'args': list(),
                               'kwargs': dict(state='state_dynamics_training')},
            'train_algo': None,
            'test_algo': {'func': agent.test, 'args': list(),
                          'kwargs': dict(sample_count=1, sample_trajectory_flag=True)},
            'test_dynamics': {'func': agent.algo.test_dynamics, 'args': list(),
                              'kwargs': dict(sample_count=100, env=env)},
            'sample_from_real_env': {'func': agent.sample, 'args': list(),
                                     'kwargs': dict(sample_count=10, env=agent.env,
                                                    in_which_status='TRAIN', store_flag=True)},
            'sample_from_dynamics_env': None,
            'train_algo_from_synthesized_data': None
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def func():
    env = make(env_id)
    exp_config = make_config(obs_dim=env.env_spec.flat_obs_dim,
                             action_dim=env.env_spec.flat_action_dim,
                             policy_hid1_multi=10,
                             value_hid3_size=5,
                             value_hid1_multi=10,
                             total_episode=total_episode,
                             episode_length=1000,
                             episode_per_sample=episode_per_sample)
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT'])
    env.reset()
    env = StepObservationWrapper(env, step_limit=env.unwrapped._max_episode_steps)
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    mlp_v = MLPVValueFunc(env_spec=env_spec, name_scope=name + 'mlp_v', name=name + 'mlp_v',
                          **exp_config['MLP_V'])
    policy = NormalDistributionMLPPolicy(
        env_spec=env_spec, name_scope=name + 'mlp_policy', name=name + 'mlp_policy',
        **exp_config['POLICY'],
        output_low=env_spec.action_space.low, output_high=env_spec.action_space.high,
        reuse=False)
    ppo = PPO(env_spec=env_spec, **exp_config['PPO'], value_func=mlp_v,
              stochastic_policy=policy, name=name + '_ppo', use_time_index_flag=True)
    agent = Agent(env=env, env_spec=env_spec, algo=ppo,
                  exploration_strategy=None, noise_adder=None, name=name + '_agent')
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_FUNC_COUNT'),
        config_or_config_dict=exp_config['TrainTestFlow']['config_or_config_dict'],
        func_dict={
            'test': {'func': agent.test, 'args': list(),
                     'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TEST_SAMPLES_COUNT'])},
            'train': {'func': agent.train, 'args': list(), 'kwargs': dict()},
            'sample': {'func': agent.sample, 'args': list(),
                       'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                                      env=agent.env, sample_type='trajectory',
                                      in_which_status='TRAIN', store_flag=True)},
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def pendulum_task_fn():
    # NOTE: exp_config is assumed to be defined at module level
    # (e.g., the Pendulum benchmark config dict used by this script).
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT'])
    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    mlp_q = MLPQValueFunction(
        env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q',
        **exp_config['MLPQValueFunction'])
    policy = DeterministicMLPPolicy(
        env_spec=env_spec, name_scope=name + '_mlp_policy', name=name + '_mlp_policy',
        output_low=env_spec.action_space.low, output_high=env_spec.action_space.high,
        **exp_config['DeterministicMLPPolicy'], reuse=False)
    ddpg = DDPG(env_spec=env_spec, policy=policy, value_func=mlp_q, name=name + '_ddpg',
                **exp_config['DDPG'])
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec, name_scope=name + '_mlp_dyna', name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low, output_high=env_spec.obs_space.high,
        **exp_config['DynamicsModel'])
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=REWARD_FUNC_DICT['Pendulum-v0']())
    agent = Agent(
        env=env, env_spec=env_spec, algo=algo, exploration_strategy=None,
        noise_adder=AgentActionNoiseWrapper(
            noise=NormalActionNoise(),
            noise_weight_scheduler=ConstantSchedule(value=0.3),
            action_weight_scheduler=ConstantSchedule(value=1.0)),
        name=name + '_agent')
    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict={
            'train_algo': {'func': agent.train, 'args': list(),
                           'kwargs': dict(state='state_agent_training')},
            'train_algo_from_synthesized_data': {'func': agent.train, 'args': list(),
                                                 'kwargs': dict(state='state_agent_training', train_iter=1)},
            'train_dynamics': {'func': agent.train, 'args': list(),
                               'kwargs': dict(state='state_dynamics_training')},
            'test_algo': {'func': agent.test, 'args': list(),
                          'kwargs': dict(sample_count=1, sample_trajectory_flag=True)},
            'test_dynamics': {'func': agent.algo.test_dynamics, 'args': list(),
                              'kwargs': dict(sample_count=10, env=env)},
            'sample_from_real_env': {'func': agent.sample, 'args': list(),
                                     'kwargs': dict(sample_count=10, env=agent.env,
                                                    in_which_status='TRAIN', store_flag=True)},
            'sample_from_dynamics_env': {'func': agent.sample, 'args': list(),
                                         'kwargs': dict(sample_count=50, sample_type='transition',
                                                        env=agent.algo.dynamics_env,
                                                        in_which_status='TRAIN', store_flag=False)}
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def pendulum_task_fn():
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT'])
    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    mlp_v = MLPVValueFunc(env_spec=env_spec, name_scope=name + 'mlp_v', name=name + 'mlp_v',
                          **exp_config['MLP_V'])
    policy = NormalDistributionMLPPolicy(
        env_spec=env_spec, name_scope=name + 'mlp_policy', name=name + 'mlp_policy',
        **exp_config['POLICY'],
        output_low=env_spec.action_space.low, output_high=env_spec.action_space.high,
        reuse=False)
    ppo = PPO(env_spec=env_spec, **exp_config['PPO'], value_func=mlp_v,
              stochastic_policy=policy, name=name + 'ppo')
    agent = Agent(env=env, env_spec=env_spec, algo=ppo,
                  exploration_strategy=None, noise_adder=None, name=name + '_agent')
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['TrainTestFlow']['config_or_config_dict'],
        func_dict={
            'test': {'func': agent.test, 'args': list(),
                     'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TEST_SAMPLES_COUNT'],
                                    sample_trajectory_flag=True)},
            'train': {'func': agent.train, 'args': list(), 'kwargs': dict()},
            'sample': {'func': agent.sample, 'args': list(),
                       'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                                      env=agent.env, sample_type='trajectory',
                                      in_which_status='TRAIN', store_flag=True)},
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = env.env_spec
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec, name_scope=name + '_mlp_dyna', name=name + '_mlp_dyna',
        learning_rate=0.01,
        mlp_config=[
            {"ACT": "TANH", "B_INIT_VALUE": 0.0, "NAME": "1", "L1_NORM": 0.0, "L2_NORM": 0.0,
             "N_UNITS": 128, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "L1_NORM": 0.0, "L2_NORM": 0.0,
             "N_UNITS": env_spec.flat_obs_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}])
    algo = ModelPredictiveControl(
        dynamics_model=mlp_dyna,
        env_spec=env_spec,
        config_or_config_dict=dict(SAMPLED_HORIZON=2, SAMPLED_PATH_NUM=5, dynamics_model_train_iter=10),
        name=name + '_mpc',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy'))
    algo.set_terminal_reward_function_for_dynamics_env(
        reward_func=RandomRewardFunc(name='reward_func'),
        terminal_func=RandomTerminalFunc(name='random_terminal'))
    agent = Agent(
        env=env, env_spec=env_spec, algo=algo, name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space, init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (),
                              dict(sample_count=100, env=agent.env, store_flag=True)))
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def task_fn():
    env = make('Acrobot-v1')
    name = 'example_scheduler_'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    mlp_q = MLPQValueFunction(
        env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": 1,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}])
    dqn = DQN(
        env_spec=env_spec,
        config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000, GAMMA=0.99, BATCH_SIZE=10,
                                   LEARNING_RATE=0.001, TRAIN_ITERATION=1, DECAY=0.5),
        name=name + '_dqn', value_func=mlp_q)
    agent = Agent(
        env=env, env_spec=env_spec, algo=dqn, name=name + '_agent',
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20, after_t=10),
        exploration_strategy=EpsilonGreedy(
            action_space=env_spec.action_space,
            prob_scheduler=PiecewiseScheduler(
                t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                endpoints=((10, 0.3), (100, 0.1), (200, 0.0)),
                outside_value=0.0),
            init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (),
                              dict(sample_count=100, env=agent.env, store_flag=True)))
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow,
                            name=name + 'experiment_debug')
    dqn.parameters.set_scheduler(
        param_key='LEARNING_RATE',
        scheduler=LinearScheduler(
            t_fn=experiment.TOTAL_AGENT_TRAIN_SAMPLE_COUNT,
            schedule_timesteps=GlobalConfig().DEFAULT_EXPERIMENT_END_POINT['TOTAL_AGENT_TRAIN_SAMPLE_COUNT'],
            final_p=0.0001,
            initial_p=0.01))
    experiment.run()
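# For intuition, the PiecewiseScheduler configured above interpolates epsilon linearly
# between the (sample_count, value) endpoints and falls back to outside_value beyond them.
# This is a plain-Python illustration of that schedule only, not the library's implementation.
def piecewise(t, endpoints, outside_value):
    # Walk consecutive endpoint pairs and interpolate inside the matching interval.
    for (l_t, l_v), (r_t, r_v) in zip(endpoints[:-1], endpoints[1:]):
        if l_t <= t < r_t:
            alpha = (t - l_t) / (r_t - l_t)
            return l_v + alpha * (r_v - l_v)
    return outside_value

endpoints = ((10, 0.3), (100, 0.1), (200, 0.0))
for t in (10, 55, 150, 300):
    print(t, round(piecewise(t, endpoints, outside_value=0.0), 3))
# 10 -> 0.3, 55 -> 0.2, 150 -> 0.05, 300 -> 0.0 (outside the endpoints)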
def task_fn():
    env = make('Acrobot-v1')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    mlp_q = MLPQValueFunction(
        env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": 1,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}])
    dqn = DQN(
        env_spec=env_spec,
        config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000, GAMMA=0.99, BATCH_SIZE=10,
                                   Q_NET_L1_NORM_SCALE=0.001, Q_NET_L2_NORM_SCALE=0.001,
                                   LEARNING_RATE=0.01, TRAIN_ITERATION=1, DECAY=0.5),
        name=name + '_dqn', value_func=mlp_q)
    agent = Agent(
        env=env, env_spec=env_spec, algo=dqn, name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space, init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (),
                              dict(sample_count=100, env=agent.env, in_which_status='TRAIN', store_flag=True)))
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def pendulum_task_fn():
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT'])
    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    mlp_q = MLPQValueFunction(
        env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q',
        **exp_config['MLPQValueFunction'])
    policy = DeterministicMLPPolicy(
        env_spec=env_spec, name_scope=name + '_mlp_policy', name=name + '_mlp_policy',
        output_low=env_spec.action_space.low, output_high=env_spec.action_space.high,
        **exp_config['DeterministicMLPPolicy'], reuse=False)
    ddpg = DDPG(env_spec=env_spec, policy=policy, value_func=mlp_q, name=name + '_ddpg',
                **exp_config['DDPG'])
    agent = Agent(
        env=env, env_spec=env_spec, algo=ddpg, exploration_strategy=None,
        noise_adder=AgentActionNoiseWrapper(
            noise=NormalActionNoise(),
            noise_weight_scheduler=ConstantSchedule(value=0.3),
            action_weight_scheduler=ConstantSchedule(value=1.0)),
        name=name + '_agent')
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['TrainTestFlow']['config_or_config_dict'],
        func_dict={
            'test': {'func': agent.test, 'args': list(),
                     'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TEST_SAMPLES_COUNT'],
                                    sample_trajectory_flag=True)},
            'train': {'func': agent.train, 'args': list(), 'kwargs': dict()},
            'sample': {'func': agent.sample, 'args': list(),
                       'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                                      env=agent.env, in_which_status='TRAIN', store_flag=True)},
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def test_integration_with_dqn(self):
    env = make('Acrobot-v1')
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
    mlp_q = MLPQValueFunction(
        env_spec=env_spec, name='mlp_q', name_scope='mlp_q',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": 1,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}])
    dqn = DQN(
        env_spec=env_spec, name='dqn_test',
        config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000, GAMMA=0.99, BATCH_SIZE=10,
                                   LEARNING_RATE=0.001, TRAIN_ITERATION=1, DECAY=0.5),
        value_func=mlp_q)
    agent = Agent(env=env, env_spec=env_spec, algo=dqn, name='agent')
    agent.init()
    # dqn.init()
    st = env.reset()
    from baconian.common.sampler.sample_data import TransitionData
    a = TransitionData(env_spec)
    res = []
    agent.sample(env=env, sample_count=100, in_which_status='TRAIN',
                 store_flag=True, sample_type='transition')
    agent.sample(env=env, sample_count=100, in_which_status='TRAIN',
                 store_flag=True, sample_type='transition')
    res.append(dqn.train(batch_data=a, train_iter=10, sess=None, update_target=True)['average_loss'])
    res.append(dqn.train(batch_data=None, train_iter=10, sess=None, update_target=True)['average_loss'])
    self.assertTrue(dqn in dqn.recorder._obj_log)
    self.assertTrue('average_loss' in dqn.recorder._obj_log[dqn])
    self.assertTrue(len(dqn.recorder._obj_log[dqn]['average_loss']) == 2)
    self.assertTrue(
        np.equal(np.array(res), [x['value'] for x in dqn.recorder._obj_log[dqn]['average_loss']]).all())
    self.assertTrue(len(Logger()._registered_recorders) > 0)
    self.assertTrue(dqn.recorder in Logger()._registered_recorders)
    res = dqn.recorder.get_log(attr_name='average_loss', filter_by_status=dict())
    self.assertEqual(len(res), 2)
    res = agent.recorder.get_log(attr_name='sum_reward', filter_by_status={'status': 'TRAIN'})
    self.assertEqual(len(res), 2)
    res = agent.recorder.get_log(attr_name='sum_reward', filter_by_status={'status': 'TEST'})
    self.assertEqual(len(res), 0)
    Logger().flush_recorder()