def create_mlp_deterministic_policy(self, env_spec, name='mlp_policy'):
    policy = DeterministicMLPPolicy(
        env_spec=env_spec,
        name=name,
        name_scope=name,
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1",
             "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT",
             "N_UNITS": env_spec.flat_action_dim, "TYPE": "DENSE",
             "W_NORMAL_STDDEV": 0.03}
        ],
        output_high=None,
        output_low=None,
        output_norm=None,
        input_norm=None,
        reuse=False)
    return policy, locals()
def create_ddpg(self, env_id='Pendulum-v0', name='ddpg'):
    env = make(env_id)
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_q = MLPQValueFunction(
        env_spec=env_spec,
        name_scope=name + 'mlp_q',
        name=name + 'mlp_q',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1",
             "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT",
             "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}
        ])
    self.assertTrue(len(mlp_q.parameters('tf_var_list')) == 4)
    policy = DeterministicMLPPolicy(
        env_spec=env_spec,
        name_scope=name + 'mlp_policy',
        name=name + 'mlp_policy',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1",
             "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT",
             "N_UNITS": env_spec.flat_action_dim, "TYPE": "DENSE",
             "W_NORMAL_STDDEV": 0.03}
        ],
        reuse=False)
    self.assertTrue(len(policy.parameters('tf_var_list')) == 4)
    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name,
                replay_buffer=None)
    return ddpg, locals()
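# The two assertTrue checks above count four trainable TF variables per network
# because each DENSE layer contributes exactly two trainables (a kernel and a
# bias), and both networks have two layers. A minimal, framework-free sketch of
# that bookkeeping; the 3-dim observation / 1-dim action shapes below are
# illustrative assumptions for Pendulum-v0, not values read from env_spec.
import numpy as np

def dense_layer_params(n_in, n_out, stddev=0.03, b_init=0.0):
    """One DENSE layer holds two trainable variables: kernel W and bias b."""
    w = np.random.normal(0.0, stddev, size=(n_in, n_out))
    b = np.full((n_out,), b_init)
    return [w, b]

# Hidden layer (16 units) plus linear output layer, mirroring the configs above.
obs_dim, act_dim = 3, 1
params = dense_layer_params(obs_dim, 16) + dense_layer_params(16, act_dim)
assert len(params) == 4  # same count as len(policy.parameters('tf_var_list'))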
def task_fn():
    env = make('Pendulum-v0')
    name = 'mb_test'
    env_spec = env.env_spec
    model_path = '/home/yitongx/Documents/baconian-project/experiments/log'
    cyber = PendulumnCyber(env=env, epoch_to_use=60, use_traj_input=False,
                           use_mbmf=True, model_path=model_path)
    mlp_config = [
        {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1",
         "N_UNITS": 32, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
        {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "2",
         "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
        {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "3",
         "N_UNITS": 8, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
        {"ACT": "TANH", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT",
         "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}
    ]
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name=name + '_mlp_q',
                              name_scope=name + '_mlp_q',
                              output_high=env.action_space.high,
                              mlp_config=mlp_config)
    mlp_policy = DeterministicMLPPolicy(env_spec=env_spec,
                                        name=name + '_mlp_policy',
                                        name_scope=name + '_mlp_policy',
                                        output_high=env.observation_space.high,
                                        mlp_config=mlp_config,
                                        reuse=False)
    polyak = 0.995
    gamma = 0.99
    noise_scale = 0.5
    noise_decay = 0.999  # default 0.995
    batch_size = 128
    actor_lr = 0.001  # default 0.001
    critic_lr = 0.001  # default 0.001
    buffer_size = 100000
    total_steps = 500000  # default 1000000
    max_step_per_episode = 500  # reset env when counter > max_step_per_episode
    train_after_step = 10000  # default 10000
    train_every_step = 1
    train_iter_per_call = 1
    test_after_step = 10000
    test_every_step = 1000
    num_test = 10
    algo = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": buffer_size,
                    "GAMMA": gamma,
                    "CRITIC_LEARNING_RATE": critic_lr,
                    "ACTOR_LEARNING_RATE": actor_lr,
                    "DECAY": polyak,
                    "BATCH_SIZE": batch_size,
                    "TRAIN_ITERATION": train_iter_per_call,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=mlp_policy,
                name=name + '_ddpg',
                replay_buffer=None)
    step_counter = SinglentonStepCounter(-1)
    noise_adder = AgentActionNoiseWrapper(
        noise=UniformNoise(scale=noise_scale),
        action_weight_scheduler=ConstantScheduler(1.),
        noise_weight_scheduler=DDPGNoiseScheduler(
            train_every_step=train_every_step,
            train_after_step=train_after_step,
            noise_decay=noise_decay,
            step_counter=step_counter))
    agent = DDPG_Agent(env=env,
                       algo=algo,
                       env_spec=env_spec,
                       noise_adder=noise_adder,
                       name=name + '_agent')
    flow = create_train_test_flow(env=env,
                                  cyber=cyber,
                                  agent=agent,
                                  num_test=num_test,
                                  total_steps=total_steps,
                                  max_step_per_episode=max_step_per_episode,
                                  train_after_step=train_after_step,
                                  test_after_step=test_after_step,
                                  train_every_step=train_every_step,
                                  test_every_step=test_every_step,
                                  train_func_and_args=(agent.train, (), dict()),
                                  test_func_and_args=(agent.test, (), dict()),
                                  sample_func_and_args=(agent.sample, (), dict()),
                                  flow_type='DDPG_TrainTestFlow')
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
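# DDPGNoiseScheduler above is configured with train_after_step, train_every_step
# and noise_decay; its exact internals are not shown here. A minimal sketch of
# one plausible schedule of that shape (an assumption, not Baconian's
# implementation): keep full exploration noise until training starts, then decay
# the noise weight geometrically once per training step.
def noise_weight(step, train_after_step=10000, train_every_step=1, noise_decay=0.999):
    """Hypothetical noise-weight schedule matching the parameters used above."""
    if step <= train_after_step:
        return 1.0
    n_train_calls = (step - train_after_step) // train_every_step
    return noise_decay ** n_train_calls

# The exploratory action would then be mixed roughly as
#   action = 1.0 * policy_action + noise_weight(step) * uniform_noise
print(noise_weight(5000), noise_weight(20000))  # 1.0, ~4.5e-05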
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_q = MLPQValueFunction(
        env_spec=env_spec,
        name_scope=name + '_mlp_q',
        name=name + '_mlp_q',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1",
             "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT",
             "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}
        ])
    policy = DeterministicMLPPolicy(
        env_spec=env_spec,
        name_scope=name + '_mlp_policy',
        name=name + '_mlp_policy',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1",
             "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT",
             "N_UNITS": env_spec.flat_action_dim, "TYPE": "DENSE",
             "W_NORMAL_STDDEV": 0.03}
        ],
        reuse=False)
    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=ddpg,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20,
            after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (),
                              dict(sample_count=100,
                                   env=agent.env,
                                   in_which_status='TRAIN',
                                   store_flag=True)))
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
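# EpsilonGreedy above is built with init_random_prob=0.5. The underlying idea,
# sketched in plain numpy without relying on Baconian's internals (the helper
# name and the fixed epsilon below are assumptions for illustration only):
import numpy as np

def epsilon_greedy_action(policy_action, action_low, action_high, epsilon, rng=np.random):
    """With probability epsilon take a uniformly random action from the action
    space, otherwise follow the deterministic policy action."""
    if rng.random() < epsilon:
        return rng.uniform(low=action_low, high=action_high)
    return policy_action

# e.g. Pendulum-v0 has a single torque action bounded in [-2, 2]
act = epsilon_greedy_action(policy_action=np.array([0.3]),
                            action_low=np.array([-2.0]),
                            action_high=np.array([2.0]),
                            epsilon=0.5)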
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_q = MLPQValueFunction(
        env_spec=env_spec,
        name_scope=name + '_mlp_q',
        name=name + '_mlp_q',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1",
             "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT",
             "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}
        ])
    policy = DeterministicMLPPolicy(
        env_spec=env_spec,
        name_scope=name + '_mlp_policy',
        name=name + '_mlp_policy',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1",
             "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT",
             "N_UNITS": env_spec.flat_action_dim, "TYPE": "DENSE",
             "W_NORMAL_STDDEV": 0.03}
        ],
        reuse=False)
    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)
    mlp_dyna_list = []
    for i in range(10):
        mlp_dyna = ContinuousMLPGlobalDynamicsModel(
            env_spec=env_spec,
            name_scope=name + '_mlp_dyna_{}'.format(i),
            name=name + '_mlp_dyna_{}'.format(i),
            learning_rate=0.01,
            state_input_scaler=RunningStandardScaler(dims=env_spec.flat_obs_dim),
            action_input_scaler=RunningStandardScaler(dims=env_spec.flat_action_dim),
            output_delta_state_scaler=RunningStandardScaler(dims=env_spec.flat_obs_dim),
            mlp_config=[
                {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1",
                 "L1_NORM": 0.0, "L2_NORM": 0.0, "N_UNITS": 16,
                 "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
                {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT",
                 "L1_NORM": 0.0, "L2_NORM": 0.0, "N_UNITS": env_spec.flat_obs_dim,
                 "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}
            ])
        mlp_dyna_list.append(mlp_dyna)
    dyna_ensemble_model = ModelEnsemble(n_models=10,
                                        model=mlp_dyna_list,
                                        prediction_type='random',
                                        env_spec=env_spec)
    algo = ModelEnsembleAlgo(env_spec=env_spec,
                             model_free_algo=ddpg,
                             dynamics_model=dyna_ensemble_model,
                             config_or_config_dict=dict(
                                 dynamics_model_train_iter=10,
                                 model_free_algo_train_iter=10,
                                 validation_trajectory_count=2,
                             ))
    # For this example, we use the Pendulum reward function and a terminal
    # function with a fixed episode length.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=PendulumRewardFunc())
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=200,
            after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
    # We can easily reuse the Dyna training flow to implement the model-ensemble training flow.
    flow = create_dyna_flow(
        train_algo_func=(agent.train, (), dict(state='state_agent_training')),
        train_algo_from_synthesized_data_func=(agent.train, (),
                                               dict(state='state_agent_training')),
        train_dynamics_func=(agent.train, (), dict(state='state_dynamics_training')),
        test_algo_func=(agent.test, (), dict(sample_count=10)),
        test_dynamics_func=(agent.algo.test_dynamics, (),
                            dict(sample_count=10, env=env)),
        sample_from_real_env_func=(agent.sample, (),
                                   dict(sample_count=10,
                                        env=agent.env,
                                        store_flag=True)),
        sample_from_dynamics_env_func=(agent.sample, (),
                                       dict(sample_count=10,
                                            env=agent.algo.dynamics_env,
                                            store_flag=True)),
        # Set these large enough so the agent only uses data from the dynamics env.
        train_algo_every_real_sample_count_by_data_from_real_env=100,
        train_algo_every_real_sample_count_by_data_from_dynamics_env=100,
        test_algo_every_real_sample_count=100,
        test_dynamics_every_real_sample_count=100,
        train_dynamics_ever_real_sample_count=100,
        start_train_algo_after_sample_count=1,
        start_train_dynamics_after_sample_count=1,
        start_test_algo_after_sample_count=1,
        start_test_dynamics_after_sample_count=1,
        warm_up_dynamics_samples=100)
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + '_exp')
    experiment.run()
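# ModelEnsemble with prediction_type='random' bundles the ten dynamics models;
# conceptually, each query is answered by one randomly chosen member. A minimal,
# library-independent sketch of that idea (the predict interface and the toy
# callables below are assumptions for illustration, not Baconian's API):
import random

class RandomMemberEnsemble:
    """Toy ensemble: delegate each prediction to a randomly chosen member."""

    def __init__(self, members):
        self.members = list(members)

    def predict(self, state, action):
        member = random.choice(self.members)
        return member(state, action)

# Stand-in "dynamics models" (plain callables) so the sketch runs on its own:
models = [lambda s, a, k=k: [x + 0.01 * k for x in s] for k in range(10)]
ensemble = RandomMemberEnsemble(models)
next_state = ensemble.predict([0.0, 1.0, 0.0], [0.5])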
def hopper_task_fn():
    exp_config = HOPPER_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])
    env = make('Hopper-v2')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLP_V'])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    output_low=env_spec.action_space.low,
                                    output_high=env_spec.action_space.high,
                                    **exp_config['POLICY'],
                                    reuse=False)
    ddpg = DDPG(env_spec=env_spec,
                policy=policy,
                value_func=mlp_q,
                name=name + '_ddpg',
                **exp_config['DDPG'])
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ddpg,
                  exploration_strategy=None,
                  noise_adder=AgentActionNoiseWrapper(
                      noise=OUNoise(theta=0.15, sigma=0.3),
                      noise_weight_scheduler=ConstantScheduler(1),
                      action_weight_scheduler=ConstantScheduler(1),
                  ),
                  name=name + '_agent')
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['TrainTestFlow']['config_or_config_dict'],
        func_dict={
            'test': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TEST_SAMPLES_COUNT']),
            },
            'train': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(),
            },
            'sample': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                               env=agent.env,
                               in_which_status='TRAIN',
                               store_flag=True),
            },
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def mountiancar_task_fn():
    exp_config = MOUNTAIN_CAR_CONTINUOUS_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])
    env = make('MountainCarContinuous-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    output_low=env_spec.action_space.low,
                                    output_high=env_spec.action_space.high,
                                    **exp_config['DeterministicMLPPolicy'],
                                    reuse=False)
    ddpg = DDPG(env_spec=env_spec,
                policy=policy,
                value_func=mlp_q,
                name=name + '_ddpg',
                **exp_config['DDPG'])
    n_actions = env.action_space.shape[0]
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ddpg,
                  exploration_strategy=None,
                  noise_adder=AgentActionNoiseWrapper(
                      noise=OrnsteinUhlenbeckActionNoise(
                          mu=np.zeros(n_actions),
                          sigma=0.5 * np.ones(n_actions)),
                      noise_weight_scheduler=ConstantScheduler(value=1),
                      action_weight_scheduler=ConstantScheduler(value=1.0)),
                  reset_noise_every_terminal_state=True,
                  name=name + '_agent')
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['TrainTestFlow']['config_or_config_dict'],
        func_dict={
            'test': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TEST_SAMPLES_COUNT']),
            },
            'train': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(),
            },
            'sample': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                               env=agent.env,
                               in_which_status='TRAIN',
                               store_flag=True),
            },
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
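# OrnsteinUhlenbeckActionNoise above adds temporally correlated exploration
# noise following x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1).
# A minimal numpy sketch of that process; the dt value and class shape are
# illustrative assumptions, not Baconian's implementation.
import numpy as np

class OUProcess:
    """Ornstein-Uhlenbeck process: mean-reverting noise with temporal correlation."""

    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.reset()

    def reset(self):
        # Typically reset at episode end, cf. reset_noise_every_terminal_state=True above.
        self.x = np.zeros_like(self.mu)

    def sample(self):
        self.x = (self.x
                  + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape))
        return self.x

noise = OUProcess(mu=np.zeros(1), sigma=0.5 * np.ones(1))
samples = [noise.sample() for _ in range(5)]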
def pendulum_task_fn():
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])
    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    output_low=env_spec.action_space.low,
                                    output_high=env_spec.action_space.high,
                                    **exp_config['DeterministicMLPPolicy'],
                                    reuse=False)
    ddpg = DDPG(env_spec=env_spec,
                policy=policy,
                value_func=mlp_q,
                name=name + '_ddpg',
                **exp_config['DDPG'])
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(env_spec=env_spec,
                                                name_scope=name + '_mlp_dyna',
                                                name=name + '_mlp_dyna',
                                                **exp_config['DynamicsModel'])
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=REWARD_FUNC_DICT['Pendulum-v0']())
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  exploration_strategy=None,
                  noise_adder=AgentActionNoiseWrapper(
                      noise=NormalActionNoise(),
                      noise_weight_scheduler=ConstantSchedule(value=0.3),
                      action_weight_scheduler=ConstantSchedule(value=1.0)),
                  name=name + '_agent')
    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict={
            'train_algo': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training')
            },
            'train_algo_from_synthesized_data': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training', train_iter=1)
            },
            'train_dynamics': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_dynamics_training')
            },
            'test_algo': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=1, sample_trajectory_flag=True)
            },
            'test_dynamics': {
                'func': agent.algo.test_dynamics,
                'args': list(),
                'kwargs': dict(sample_count=10, env=env)
            },
            'sample_from_real_env': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=10,
                               env=agent.env,
                               in_which_status='TRAIN',
                               store_flag=True)
            },
            'sample_from_dynamics_env': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=50,
                               sample_type='transition',
                               env=agent.algo.dynamics_env,
                               in_which_status='TRAIN',
                               store_flag=False)
            }
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def task_fn():
    # create the gym environment with the make function
    env = make('Pendulum-v0')
    # give your experiment a name, which is used to generate the log path, etc.
    name = 'demo_exp'
    # construct the environment specification
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    # construct the neural network to approximate the Q function of DDPG
    mlp_q = MLPQValueFunction(
        env_spec=env_spec,
        name_scope=name + '_mlp_q',
        name=name + '_mlp_q',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1",
             "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT",
             "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}
        ])
    # construct the neural network to approximate the policy of DDPG
    policy = DeterministicMLPPolicy(
        env_spec=env_spec,
        name_scope=name + '_mlp_policy',
        name=name + '_mlp_policy',
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1",
             "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT",
             "N_UNITS": env_spec.flat_action_dim, "TYPE": "DENSE",
             "W_NORMAL_STDDEV": 0.03}
        ],
        reuse=False)
    # construct the DDPG algorithm
    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)
    # construct a neural-network-based global dynamics model to approximate
    # the state transition of the environment
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        learning_rate=0.01,
        state_input_scaler=RunningStandardScaler(dims=env_spec.flat_obs_dim),
        action_input_scaler=RunningStandardScaler(dims=env_spec.flat_action_dim),
        output_delta_state_scaler=RunningStandardScaler(dims=env_spec.flat_obs_dim),
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1",
             "L1_NORM": 0.0, "L2_NORM": 0.0, "N_UNITS": 16,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT",
             "L1_NORM": 0.0, "L2_NORM": 0.0, "N_UNITS": env_spec.flat_obs_dim,
             "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}
        ])
    # finally, construct the Dyna algorithm with the model-free algorithm DDPG and the NN model
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    # To make the NN-based dynamics model a proper environment, so it can serve
    # as a sampling source for DDPG, a reward function and a terminal function
    # need to be set.
    # For example purposes only, we use a random reward function and a terminal
    # function with a fixed episode length.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=RandomRewardFunc())
    # construct the agent with an additional exploration strategy if needed
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20,
            after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
    # construct the training flow, called the Dyna flow.
    # It defines how the training proceeds and the terminal condition.
    flow = create_dyna_flow(
        train_algo_func=(agent.train, (), dict(state='state_agent_training')),
        train_algo_from_synthesized_data_func=(agent.train, (),
                                               dict(state='state_agent_training')),
        train_dynamics_func=(agent.train, (), dict(state='state_dynamics_training')),
        test_algo_func=(agent.test, (), dict(sample_count=1)),
        test_dynamics_func=(agent.algo.test_dynamics, (),
                            dict(sample_count=10, env=env)),
        sample_from_real_env_func=(agent.sample, (),
                                   dict(sample_count=10,
                                        env=agent.env,
                                        store_flag=True)),
        sample_from_dynamics_env_func=(agent.sample, (),
                                       dict(sample_count=10,
                                            env=agent.algo.dynamics_env,
                                            store_flag=True)),
        train_algo_every_real_sample_count_by_data_from_real_env=40,
        train_algo_every_real_sample_count_by_data_from_dynamics_env=40,
        test_algo_every_real_sample_count=40,
        test_dynamics_every_real_sample_count=40,
        train_dynamics_ever_real_sample_count=20,
        start_train_algo_after_sample_count=1,
        start_train_dynamics_after_sample_count=1,
        start_test_algo_after_sample_count=1,
        start_test_dynamics_after_sample_count=1,
        warm_up_dynamics_samples=1)
    # construct the experiment
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + '_exp')
    # run!
    experiment.run()
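# For context, the flow above alternates real-environment sampling, dynamics
# model training, synthetic sampling, and policy training. A minimal,
# self-contained sketch of that Dyna-style outer loop; every callable below is a
# stand-in for illustration, not a Baconian function.
def dyna_loop(sample_real, train_dynamics, sample_model, train_agent,
              total_real_samples=200, train_dynamics_every=20, train_agent_every=40):
    """Skeleton of a Dyna-style loop: real data trains the dynamics model,
    synthetic data from the model (plus real data) trains the model-free agent."""
    real_samples = 0
    while real_samples < total_real_samples:
        real_batch = sample_real(10)
        real_samples += len(real_batch)
        if real_samples % train_dynamics_every == 0:
            train_dynamics(real_batch)
        if real_samples % train_agent_every == 0:
            synthetic_batch = sample_model(10)
            train_agent(real_batch + synthetic_batch)

# Stand-in callables so the skeleton runs on its own:
dyna_loop(sample_real=lambda n: [('s', 'a', 'r', 's2')] * n,
          train_dynamics=lambda batch: None,
          sample_model=lambda n: [('s', 'a', 'r_hat', 's2_hat')] * n,
          train_agent=lambda batch: None)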
def test_DDPG_1(self):
    self.LogSetup()
    env = make('Pendulum-v0')
    name = 'mb_test'
    env_spec = env.env_spec
    cyber = PendulumnCyber(env=env, epoch_to_use=60, use_traj_input=False,
                           use_mbmf=True,
                           model_path='/home/yitongx/Documents/baconian-project/experiments/log')
    actor_policy_mlp_config = [
        {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1",
         "N_UNITS": 32, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
        {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "2",
         "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
        {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "3",
         "N_UNITS": 8, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
        {"ACT": "TANH", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT",
         "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}
    ]
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name=name + '_mlp_q',
                              name_scope=name + '_mlp_q',
                              output_high=env.action_space.high,
                              mlp_config=actor_policy_mlp_config)
    mlp_policy = DeterministicMLPPolicy(env_spec=env_spec,
                                        name=name + '_mlp_policy',
                                        name_scope=name + '_mlp_policy',
                                        output_high=env.observation_space.high,
                                        mlp_config=actor_policy_mlp_config,
                                        reuse=False)
    polyak = 0.995
    gamma = 0.99
    batch_size = 128
    actor_lr = 0.001
    critic_lr = 0.001
    train_iter_per_call = 1
    buffer_size = 100000
    total_steps = 100000  # default 1000000
    max_step_per_episode = 500  # reset env when counter > max_step_per_episode
    train_after_step = 10000
    test_after_step = 10000
    train_every_step = 1
    test_every_step = 1000
    num_test = 10
    algo = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": buffer_size,
                    "GAMMA": gamma,
                    "CRITIC_LEARNING_RATE": critic_lr,
                    "ACTOR_LEARNING_RATE": actor_lr,
                    "DECAY": polyak,
                    "BATCH_SIZE": batch_size,
                    "TRAIN_ITERATION": train_iter_per_call,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=mlp_policy,
                name=name + '_ddpg',
                replay_buffer=None)
    algo.init()
    buffer = TransitionData(env_spec=env_spec,
                            obs_shape=env_spec.obs_shape,
                            action_shape=env_spec.action_shape)
    for i in range(100):  # num_trajectory
        obs = env.reset()
        for j in range(1000):
            action = env.action_space.sample()
            obs_, reward, done, info = env.step(action)
            buffer.append(obs, action, obs_, done, reward)
            if done:
                break
            else:
                obs = obs_
    algo.append_to_memory(buffer)
    for i in range(10):
        res = algo.train()
        print(res)
    obs = env.reset()
    act = env.action_space.sample()
    obs_, reward, done, info = cyber.step(obs, act)
    print('obs_', obs_)
    obs = env.observation_space.sample()
    obs_batch = np.array([])
    for i in range(5):
        obs_batch = np.concatenate((obs_batch, env.observation_space.sample()), axis=0)
    # print(obs_batch.shape)
    act_batch = algo.predict(obs_batch)
    print(act_batch.shape)
    print(act_batch)
    print('====> Test')
    act = algo.predict(obs)
    print(act.shape)
    obs = env.reset()
    print(obs.shape)
    obs_, reward, done, info = cyber.step(obs, act)
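# The DECAY entry (polyak = 0.995 above) controls how slowly DDPG's target
# networks track the online networks. A minimal numpy sketch of the standard
# soft ("polyak averaging") target update; treating DECAY this way is an
# assumption about the config key, not a reading of Baconian's source.
import numpy as np

def polyak_update(target_params, online_params, decay=0.995):
    """Soft target update: target <- decay * target + (1 - decay) * online."""
    return [decay * t + (1.0 - decay) * o
            for t, o in zip(target_params, online_params)]

target = [np.zeros((3, 16)), np.zeros(16)]
online = [np.ones((3, 16)), np.ones(16)]
target = polyak_update(target, online)  # each entry moves 0.5% toward the online value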
def test_DDPG_2(self):
    self.LogSetup()
    env = make('Pendulum-v0')
    name = 'mb_test'
    env_spec = env.env_spec
    cyber = PendulumnCyber(env=env, epoch_to_use=60, use_traj_input=False,
                           use_mbmf=True,
                           model_path='/home/yitongx/Documents/baconian-project/experiments/log')
    actor_policy_mlp_config = [
        {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1",
         "N_UNITS": 32, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
        {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "2",
         "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
        {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "3",
         "N_UNITS": 8, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
        {"ACT": "TANH", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT",
         "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}
    ]
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name=name + '_mlp_q',
                              name_scope=name + '_mlp_q',
                              output_high=env.action_space.high,
                              mlp_config=actor_policy_mlp_config)
    mlp_policy = DeterministicMLPPolicy(env_spec=env_spec,
                                        name=name + '_mlp_policy',
                                        name_scope=name + '_mlp_policy',
                                        output_high=env.observation_space.high,
                                        mlp_config=actor_policy_mlp_config,
                                        reuse=False)
    polyak = 0.995
    gamma = 0.99
    noise_scale = 0.5
    noise_decay = 0.995
    batch_size = 128
    actor_lr = 0.001
    critic_lr = 0.001
    train_iter_per_call = 1
    buffer_size = 100000
    total_steps = 100000  # default 1000000
    max_step_per_episode = 500  # reset env when counter > max_step_per_episode
    train_after_step = 10000  # default 10000
    train_every_step = 1
    test_after_step = 10000
    test_every_step = 1000
    num_test = 10
    algo = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": buffer_size,
                    "GAMMA": gamma,
                    "CRITIC_LEARNING_RATE": critic_lr,
                    "ACTOR_LEARNING_RATE": actor_lr,
                    "DECAY": polyak,
                    "BATCH_SIZE": batch_size,
                    "TRAIN_ITERATION": train_iter_per_call,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=mlp_policy,
                name=name + '_ddpg',
                replay_buffer=None)
    step_counter = SinglentonStepCounter(-1)
    noise_adder = AgentActionNoiseWrapper(
        noise=UniformNoise(scale=noise_scale),
        action_weight_scheduler=ConstantScheduler(1.),
        noise_weight_scheduler=DDPGNoiseScheduler(
            train_every_step=train_every_step,
            train_after_step=train_after_step,
            noise_decay=noise_decay,
            step_counter=step_counter))
    agent = DDPG_Agent(env=env,
                       algo=algo,
                       env_spec=env_spec,
                       noise_adder=noise_adder,
                       name=name + '_agent')
    agent.init()
    test_reward = []
    data_sample = []
    obs, ep_ret, ep_len = env.reset(), 0, 0
    for step in range(total_steps):
        step_counter.increase(1)
        act = agent.predict(obs=obs)
        obs_, reward, done, _ = cyber.step(obs, act)
        _buffer = TransitionData(env_spec=env_spec,
                                 obs_shape=env_spec.obs_shape,
                                 action_shape=env_spec.action_shape)
        _buffer.append(obs, act, obs_, done, reward)
        agent.algo.append_to_memory(_buffer)
        ep_ret += reward
        ep_len += 1
        if done or ep_len > max_step_per_episode:
            obs, ep_ret, ep_len = env.reset(), 0, 0
        else:
            obs = obs_
        if step > train_after_step and step % train_every_step == 0:
            agent.train()
        if step > test_after_step and step % test_every_step == 0:
            data_sample, test_reward = agent.test(
                env=env,
                cyber=cyber,
                data_sample=data_sample,
                test_reward=test_reward,
                num_test=num_test,
                max_step_per_episode=max_step_per_episode)