def test_init(self):
    ddpg, locals = self.create_ddpg()
    env_spec = locals['env_spec']
    env = locals['env']
    mlp_dyna = self.create_continuous_mlp_global_dynamics_model(env_spec=env_spec)[0]
    algo = self.create_dyna(env_spec=env_spec, model_free_algo=ddpg, dyanmics_model=mlp_dyna)[0]
    algo.init()
    st = env.reset()
    data = TransitionData(env_spec)
    for _ in range(100):
        ac = algo.predict(st)
        new_st, re, done, _ = env.step(action=ac)
        data.append(state=st, new_state=new_st, reward=re, action=ac, done=done)
    algo.append_to_memory(samples=data)
    pre_res = 10000
    for i in range(20):
        print(algo.train(batch_data=data))
        print(algo.train(batch_data=data, state='state_dynamics_training'))
        print(algo.train(batch_data=data, state='state_agent_training'))
    res = algo.test_dynamics(env=env, sample_count=100)
    self.assertLess(list(res.values())[0], pre_res)
    self.assertLess(list(res.values())[1], pre_res)
    print(res)
    algo.test()
def test_with_dqn(self):
    dqn, local = self.create_dqn()
    env = local['env']
    env_spec = local['env_spec']
    dqn.init()
    st = env.reset()
    from baconian.common.sampler.sample_data import TransitionData
    a = TransitionData(env_spec)
    res = []
    for i in range(100):
        ac = dqn.predict(obs=st, sess=self.sess, batch_flag=False)
        st_new, re, done, _ = env.step(action=ac)
        a.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
    dqn.append_to_memory(a)
    res.append(dqn.train(batch_data=a, train_iter=10, sess=None, update_target=True)['average_loss'])
    res.append(dqn.train(batch_data=None, train_iter=10, sess=None, update_target=True)['average_loss'])
    print(dqn._status())
    print(dqn._status._info_dict_with_sub_info)
def test_trajectory_data(self):
    env = make('Acrobot-v1')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    a = TrajectoryData(env_spec)
    tmp_traj = TransitionData(env_spec)
    st = env.reset()
    re_list = []
    st_list = []
    for i in range(100):
        ac = env_spec.action_space.sample()
        st_new, re, done, _ = env.step(action=ac)
        st_list.append(st_new)
        re_list.append(re)
        if (i + 1) % 10 == 0:
            done = True
        else:
            done = False
        tmp_traj.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
        if done:
            a.append(tmp_traj.get_copy())
            tmp_traj.reset()
    self.assertEqual(a.trajectories.__len__(), 10)
    for traj in a.trajectories:
        self.assertEqual(len(traj), 10)
def DynaMLP_get_action(self, mlp_dyna: DynamicsModel, env: Env, state, cost_fn, num_simulated_paths, horizon):
    """
    Random-shooting action selection, mirroring mpc.ModelBasedModelPredictiveControl.predict().

    :param mlp_dyna: learned dynamics model used to roll out simulated paths
    :param env: environment providing env_spec and the action space
    :param state: current state to plan from
    :param cost_fn: cost function evaluated on (obs, action, next_obs)
    :param num_simulated_paths: number of random action sequences to simulate
    :param horizon: length of each simulated path
    :return: first action of the lowest-cost simulated path
    """
    rollout = TrajectoryData(env_spec=env.env_spec)
    for i in range(num_simulated_paths):
        path = TransitionData(env_spec=env.env_spec)
        obs = state
        for j in range(horizon):
            action = env.action_space.sample()
            obs_ = mlp_dyna.step(action=action, state=obs)
            cost = cost_fn(obs, action, obs_)
            path.append(obs, action, obs_, False, -cost)
            obs = obs_
        rollout.append(path)
    rollout.trajectories.sort(key=lambda x: x.cumulative_reward, reverse=True)
    optimal_action = rollout.trajectories[0].action_set[0]
    return optimal_action
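# Hedged usage sketch for the helper above, inside the same test class: plan one action per
# real step via random shooting. 'mlp_dyna' and 'env' are assumed to be created as in the
# ModifiedHalfCheetah snippets in this section, and 'cost_fn' is an assumed callable
# cost_fn(obs, act, obs_) -> scalar cost (it is not defined in the original snippets).
obs = env.reset()
for _ in range(5):
    act = self.DynaMLP_get_action(mlp_dyna=mlp_dyna, env=env, state=obs, cost_fn=cost_fn,
                                  num_simulated_paths=50, horizon=10)
    obs, reward, done, _ = env.step(act)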
def test_sample_batch(self):
    env = make('ModifiedHalfCheetah')
    env.init()
    env_spec = env.env_spec
    random_buffer = TransitionData(env_spec=env_spec,
                                   obs_shape=env_spec.obs_shape,
                                   action_shape=env_spec.action_shape,
                                   size=100)
    print("====> Random Sample")
    num_trajectory = 1
    max_step = 30
    for i in range(num_trajectory):
        ep_len = 0
        obs = env.reset()
        while ep_len < max_step:
            act = self.RandomController_get_action(env=env, state=obs)
            obs_, reward, done, _ = env.step(act)
            random_buffer.append(obs, act, obs_, done, reward)
            assert not done
            obs = obs_
            ep_len += 1
    batch_data_1 = random_buffer.sample_batch(batch_size=16, shuffle_flag=True)
    assert isinstance(batch_data_1, dict)
    print(batch_data_1.keys())
    self.assertEqual(len(batch_data_1['action_set']), 16)
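# A minimal standalone sketch of the fill-and-sample pattern exercised by test_sample_batch
# above, outside the unittest harness. The TransitionData API (append, sample_batch) is taken
# from the snippets in this section; the import paths for make and EnvSpec are assumptions
# based on the Baconian package layout.
from baconian.common.sampler.sample_data import TransitionData
from baconian.core.core import EnvSpec
from baconian.envs.gym_env import make

env = make('Pendulum-v0')
env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
buffer = TransitionData(env_spec=env_spec)

obs = env.reset()
for _ in range(32):
    act = env.action_space.sample()
    obs_, reward, done, _ = env.step(act)
    buffer.append(state=obs, new_state=obs_, action=act, reward=reward, done=done)
    obs = obs_ if not done else env.reset()

# sample_batch returns a dict keyed by '*_set' arrays, as asserted in the test above
batch = buffer.sample_batch(batch_size=16, shuffle_flag=True)
print(len(batch['action_set']))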
def sample(self, batch_size) -> SampleData:
    if self.nb_entries < batch_size:
        raise MemoryBufferLessThanBatchSizeError()
    # todo This will be changed to prioritised
    batch_idxs = np.random.randint(self.nb_entries - 2, size=batch_size)
    obs0_batch = self.observations0.get_batch(batch_idxs)
    obs1_batch = self.observations1.get_batch(batch_idxs)
    action_batch = self.actions.get_batch(batch_idxs)
    reward_batch = self.rewards.get_batch(batch_idxs)
    terminal1_batch = self.terminals1.get_batch(batch_idxs)
    result = {
        'obs0': array_min2d(obs0_batch),
        'obs1': array_min2d(obs1_batch),
        'rewards': array_min2d(reward_batch),
        'actions': array_min2d(action_batch),
        'terminals1': array_min2d(terminal1_batch),
    }
    res = TransitionData(obs_shape=self.obs_shape, action_shape=self.action_shape)
    for obs0, obs1, action, terminal, re in zip(result['obs0'], result['obs1'], result['actions'],
                                                result['terminals1'], result['rewards']):
        res.append(state=obs0, new_state=obs1, action=action, done=terminal, reward=re)
    return res
def test_trajectory_data(self):
    env = make('Acrobot-v1')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    a = TrajectoryData(env_spec)
    tmp_traj = TransitionData(env_spec)
    st = env.reset()
    re_list = []
    st_list = []
    for i in range(100):
        ac = env_spec.action_space.sample()
        st_new, re, done, _ = env.step(action=ac)
        st_list.append(st_new)
        re_list.append(re)
        if (i + 1) % 10 == 0:
            done = True
        else:
            done = False
        tmp_traj.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
        if done is True:
            a.append(tmp_traj)
            tmp_traj.reset()
    self.assertEqual(a.trajectories.__len__(), 10)
    for traj in a.trajectories:
        self.assertEqual(len(traj), 10)
    data = a.return_as_transition_data()
    data_gen = data.return_generator()
    for d, re, st in zip(data_gen, re_list, st_list):
        self.assertEqual(d[3], re)
        self.assertTrue(np.equal(st, d[1]).all())
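# Note on the generator used above: TransitionData.return_generator() yields tuples ordered as
# (state, new_state, action, reward, done), which is why the test checks d[1] against the
# recorded next states and d[3] against the recorded rewards; PPO.append_to_memory() further
# down in this section unpacks the same ordering. Minimal sketch, reusing 'data' from the test:
for state, new_state, action, reward, done in data.return_generator():
    print(new_state.shape, reward, done)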
def wrap_func():
    mlp_dyna, local = self.create_continue_dynamics_model(env_id='Pendulum-v0')
    env_spec = local['env_spec']
    env = local['env']
    policy = func(env_spec=env_spec)[0]
    algo, locals = self.create_mpc(env_spec=env_spec,
                                   mlp_dyna=mlp_dyna,
                                   policy=policy,
                                   env=env)
    algo.init()
    for _ in range(100):
        assert env_spec.action_space.contains(algo.predict(env_spec.obs_space.sample()))
    st = env.reset()
    data = TransitionData(env_spec)
    for _ in range(10):
        ac = algo.predict(st)
        new_st, re, done, _ = env.step(action=ac)
        data.append(state=st, new_state=new_st, reward=re, action=ac, done=done)
    print(algo.train(batch_data=data))
def test_StandScaler(self):
    env = make('ModifiedHalfCheetah')
    env_spec = env.env_spec
    self.assertEqual(env_spec.flat_obs_dim, 18)
    self.assertEqual(env_spec.flat_action_dim, 6)
    buffer_size = 10
    buffer = TransitionData(env_spec=env_spec,
                            obs_shape=env_spec.obs_shape,
                            action_shape=env_spec.action_shape,
                            size=buffer_size)
    obs = env.reset()
    for i in range(buffer_size):
        act = env.action_space.sample()
        obs_, rew, done, _ = env.step(act)
        buffer.append(obs, act, obs_, done, rew)
    batch_list = buffer.sample_batch_as_Transition(4, all_as_batch=True)
    state_input_scaler_1 = RunningStandardScaler(env_spec.flat_action_dim)
    for batch_data in batch_list:
        state_input_scaler_1.update_scaler(batch_data.action_set)
    mean_1 = state_input_scaler_1._mean
    var_1 = state_input_scaler_1._var
    print(mean_1)
    print(var_1)
    state_input_scaler_2 = RunningStandardScaler(env_spec.flat_action_dim)
    state_input_scaler_2.update_scaler(buffer.action_set)
    mean_2 = state_input_scaler_2._mean
    var_2 = state_input_scaler_2._var
    print(mean_2)
    print(var_2)
def test_apply_normalization(self):
    """
    Test normalization & denormalization in TransitionData.apply_(de)normalization.
    """
    mlp_dyna, local = self.create_continue_dynamics_model(env_id='ModifiedHalfCheetah',
                                                          name='mlp_dyna_model')
    mlp_dyna.init()
    print(mlp_dyna.state_input_scaler)
    env = local['env']
    assert isinstance(env, ModifiedHalfCheetahEnv)
    env_spec = env.env_spec
    buffer_size = 50
    random_buffer = TransitionData(env_spec=env_spec,
                                   obs_shape=env_spec.obs_shape,
                                   action_shape=env_spec.action_shape,
                                   size=buffer_size)
    obs = env.reset()
    for i in range(buffer_size):
        act = env.action_space.sample()
        obs_, reward, done, info = env.step(act)
        random_buffer.append(obs, act, obs_, done, reward)
    normalized_random_buffer, mean_dict, var_dict = random_buffer.apply_normalization()
    denormalized_random_buffer = normalized_random_buffer.apply_denormalization(None, mean_dict, var_dict)
    self.assertEqual(random_buffer.action_set.any(), denormalized_random_buffer.action_set.any())
    self.assertEqual(random_buffer.state_set.any(), denormalized_random_buffer.state_set.any())
def _launch(self) -> bool:
    env = self.env
    env_spec = self.env_spec
    cyber = self.cyber
    obs, ep_ret, ep_len = env.reset(), 0, 0
    for step in range(self.total_steps):
        self.step_counter.increase(1)
        act = self.agent.predict(obs=obs)
        obs_, reward, done, _ = cyber.step(obs, act)
        _buffer = TransitionData(env_spec=env_spec,
                                 obs_shape=env_spec.obs_shape,
                                 action_shape=env_spec.action_shape)
        _buffer.append(obs, act, obs_, done, reward)
        self.agent.algo.append_to_memory(_buffer)
        ep_ret += reward
        ep_len += 1
        if done or ep_len > self.max_step_per_episode:
            obs, ep_ret, ep_len = env.reset(), 0, 0
        else:
            obs = obs_
        if step > self.train_after_step and step % self.train_every_step == 0:
            self.agent.train()
        if step > self.test_after_step and step % self.test_every_step == 0:
            self.data_sample, self.test_reward = self.agent.test(
                env=env,
                cyber=cyber,
                data_sample=self.data_sample,
                test_reward=self.test_reward,
                num_test=self.num_test,
                max_step_per_episode=self.max_step_per_episode)
    env.close()
    self.plot_test_reward(self.data_sample, self.test_reward)
    return True
def test_prior_eval(self):
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    data = TransitionData(env_spec=env_spec)
    policy = UniformRandomPolicy(env_spec=env_spec)
    # Do some initial sampling here to train the GMM model
    st = env.reset()
    for i in range(100):
        ac = policy.forward(st)
        new_st, re, _, _ = env.step(ac)
        data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
        st = new_st
    gmm = GaussianMixtureDynamicsPrior(env_spec=env_spec, batch_data=data)
    gmm.init()
    gmm.update(batch_data=data)
    mu0, Phi, m, n0 = gmm.eval(batch_data=data)
    state_shape = data.state_set.shape[1]
    action_shape = data.action_set.shape[1]
    self.assertEqual(state_shape + action_shape + state_shape, mu0.shape[0])
    self.assertEqual(state_shape + action_shape + state_shape, Phi.shape[0])
    self.assertEqual(state_shape + action_shape + state_shape, Phi.shape[1])
def test_Transition_union(self):
    """
    Useless testcase.
    """
    algo, locals = self.create_mpc(name='test_Transition_union')
    env_spec = locals['env_spec']
    env = locals['env']
    env.env_spec = env_spec
    algo.init()
    for _ in range(100):
        assert env_spec.action_space.contains(algo.predict(env_spec.obs_space.sample()))
    st = env.reset()
    data = TransitionData(env_spec)
    for _ in range(10):
        ac = algo.predict(st)
        new_st, re, done, _ = env.step(action=ac)
        data.append(state=st, new_state=new_st, reward=re, action=ac, done=done)
    print(algo.train(batch_data=data))
def test_init(self):
    dqn, locals = self.create_dqn()
    env = locals['env']
    env_spec = locals['env_spec']
    dqn.init()
    st = env.reset()
    a = TransitionData(env_spec)
    for i in range(100):
        ac = dqn.predict(obs=st, sess=self.sess, batch_flag=False)
        st_new, re, done, _ = env.step(action=ac)
        a.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
        st = st_new
    dqn.append_to_memory(a)
    new_dqn, _ = self.create_dqn(name='new_dqn')
    new_dqn.copy_from(dqn)
    self.assert_var_list_id_no_equal(dqn.q_value_func.parameters('tf_var_list'),
                                     new_dqn.q_value_func.parameters('tf_var_list'))
    self.assert_var_list_id_no_equal(dqn.target_q_value_func.parameters('tf_var_list'),
                                     new_dqn.target_q_value_func.parameters('tf_var_list'))
    self.assert_var_list_equal(dqn.q_value_func.parameters('tf_var_list'),
                               new_dqn.q_value_func.parameters('tf_var_list'))
    self.assert_var_list_equal(dqn.target_q_value_func.parameters('tf_var_list'),
                               new_dqn.target_q_value_func.parameters('tf_var_list'))
    dqn.save(save_path=GlobalConfig().DEFAULT_LOG_PATH + '/dqn_test',
             global_step=0,
             name=dqn.name)
    for i in range(10):
        print(dqn.train(batch_data=a, train_iter=10, sess=None, update_target=True))
        print(dqn.train(batch_data=None, train_iter=10, sess=None, update_target=True))
    self.assert_var_list_at_least_not_equal(dqn.q_value_func.parameters('tf_var_list'),
                                            new_dqn.q_value_func.parameters('tf_var_list'))
    self.assert_var_list_at_least_not_equal(dqn.target_q_value_func.parameters('tf_var_list'),
                                            new_dqn.target_q_value_func.parameters('tf_var_list'))
    dqn.load(path_to_model=GlobalConfig().DEFAULT_LOG_PATH + '/dqn_test',
             model_name=dqn.name,
             global_step=0)
    self.assert_var_list_equal(dqn.q_value_func.parameters('tf_var_list'),
                               new_dqn.q_value_func.parameters('tf_var_list'))
    self.assert_var_list_equal(dqn.target_q_value_func.parameters('tf_var_list'),
                               new_dqn.target_q_value_func.parameters('tf_var_list'))
    for i in range(10):
        self.sess.run(dqn.update_target_q_value_func_op,
                      feed_dict=dqn.parameters.return_tf_parameter_feed_dict())
    var1 = self.sess.run(dqn.q_value_func.parameters('tf_var_list'))
    var2 = self.sess.run(dqn.target_q_value_func.parameters('tf_var_list'))
    import numpy as np
    total_diff = 0.0
    for v1, v2 in zip(var1, var2):
        total_diff += np.mean(np.abs(np.array(v1) - np.array(v2)))
    print('update target, difference mean', total_diff)
def get_some_samples(env, num, env_spec, policy):
    data = TransitionData(env_spec=env_spec)
    st = env.reset()
    for i in range(num):
        ac = policy.forward(st)
        new_st, re, _, _ = env.step(ac)
        data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
        st = new_st
    return data
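# Hedged usage sketch for the helper above: collect warm-up transitions with a uniform random
# policy and feed them to a dynamics model, mirroring the GP/GMM snippets in this section.
# 'env' and 'env_spec' are assumed to be set up as in those snippets, and UniformRandomPolicy /
# GaussianProcessDyanmicsModel are assumed to be imported the same way.
policy = UniformRandomPolicy(env_spec=env_spec)
warmup_data = get_some_samples(env=env, num=100, env_spec=env_spec, policy=policy)
gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=warmup_data)
gp.init()
gp.train()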
def sample_transition(self, env, count=100):
    data = TransitionData(env.env_spec)
    st = env.get_state()
    for i in range(count):
        ac = env.env_spec.action_space.sample()
        new_st, re, done, info = env.step(action=ac)
        data.append(state=st, action=ac, new_state=new_st, done=done, reward=re)
    return data
def test_mlp_dynamics_model(self):
    mlp_dyna, local = self.create_continue_dynamics_model(name='mlp_dyna_model')
    env = local['env']
    env_spec = local['env_spec']
    env.reset()
    mlp_dyna.init()
    for i in range(100):
        mlp_dyna.step(action=np.array(env_spec.action_space.sample()),
                      state=env_spec.obs_space.sample())
    data = TransitionData(env_spec)
    st = env.get_state()
    for i in range(10):
        ac = env_spec.action_space.sample()
        new_st, re, done, info = env.step(action=ac)
        data.append(state=st, action=ac, new_state=new_st, done=done, reward=re)
        st = new_st
    print(mlp_dyna.train(batch_data=data, train_iter=10))
    mlp_dyna_2, _ = self.create_continue_dynamics_model(name='model_2')
    mlp_dyna_2.init()
    self.assert_var_list_at_least_not_equal(var_list1=mlp_dyna.parameters('tf_var_list'),
                                            var_list2=mlp_dyna_2.parameters('tf_var_list'))
    self.assert_var_list_id_no_equal(var_list1=mlp_dyna.parameters('tf_var_list'),
                                     var_list2=mlp_dyna_2.parameters('tf_var_list'))
    mlp_dyna_2.init(source_obj=mlp_dyna)
    self.assert_var_list_equal(var_list1=mlp_dyna.parameters('tf_var_list'),
                               var_list2=mlp_dyna_2.parameters('tf_var_list'))
    self.assert_var_list_id_no_equal(var_list1=mlp_dyna.parameters('tf_var_list'),
                                     var_list2=mlp_dyna_2.parameters('tf_var_list'))
    mlp_dyna_2.copy_from(mlp_dyna)
    self.assert_var_list_equal(var_list1=mlp_dyna.parameters('tf_var_list'),
                               var_list2=mlp_dyna_2.parameters('tf_var_list'))
    self.assert_var_list_id_no_equal(var_list1=mlp_dyna.parameters('tf_var_list'),
                                     var_list2=mlp_dyna_2.parameters('tf_var_list'))
def test_dynamics_model_in_pendulum(self):
    env = self.create_env('Pendulum-v0')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    policy, _ = self.create_uniform_policy(env_spec=env_spec)
    data = TransitionData(env_spec=env_spec)
    st = env.reset()
    for i in range(100):
        ac = policy.forward(st)
        new_st, re, _, _ = env.step(ac)
        data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
        st = new_st
    gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=data)
    gp.init()
    gp.train()
    for i in range(len(data.state_set)):
        res = gp.step(action=data.action_set[i], state=data.state_set[i], allow_clip=True)
        _, var = gp._state_transit(action=data.action_set[i],
                                   state=data.state_set[i],
                                   required_var=True)
        print(res)
        print(data.new_state_set[i])
        print(np.sqrt(var))
        # self.assertTrue(np.isclose(res, data.new_state_set[i], atol=1e-3).all())
        self.assertTrue(np.greater(data.new_state_set[i] + 1.96 * np.sqrt(var), res).all())
        self.assertTrue(np.less(data.new_state_set[i] - 1.96 * np.sqrt(var), res).all())
    lengthscales = {}
    variances = {}
    noises = {}
    for i, model in enumerate(gp.mgpr_model.models):
        lengthscales['GP' + str(i)] = model.kern.lengthscales.value
        variances['GP' + str(i)] = np.array([model.kern.variance.value])
        noises['GP' + str(i)] = np.array([model.likelihood.variance.value])
    print('-----Learned models------')
    pd.set_option('precision', 3)
    print('---Lengthscales---')
    print(pd.DataFrame(data=lengthscales))
    print('---Variances---')
    print(pd.DataFrame(data=variances))
    print('---Noises---')
    print(pd.DataFrame(data=noises))
def test_dynamics_model_basic(self):
    env = self.create_env('Pendulum-v0')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    policy, _ = self.create_uniform_policy(env_spec=env_spec)
    data = TransitionData(env_spec=env_spec)
    st = env.reset()
    ac = policy.forward(st)
    for i in range(10):
        re = 0.0
        data.append(state=np.ones_like(st) * 0.5,
                    new_state=np.ones_like(st),
                    reward=re,
                    done=False,
                    action=np.ones_like(ac) * 0.1)
        data.append(state=np.ones_like(st),
                    new_state=np.ones_like(st) * 0.5,
                    reward=re,
                    done=False,
                    action=np.ones_like(ac) * -0.1)
    gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=data)
    gp.init()
    gp.train()
    lengthscales = {}
    variances = {}
    noises = {}
    i = 0
    for model in gp.mgpr_model.models:
        lengthscales['GP' + str(i)] = model.kern.lengthscales.value
        variances['GP' + str(i)] = np.array([model.kern.variance.value])
        noises['GP' + str(i)] = np.array([model.likelihood.variance.value])
        i += 1
    print('-----Learned models------')
    pd.set_option('precision', 3)
    print('---Lengthscales---')
    print(pd.DataFrame(data=lengthscales))
    print('---Variances---')
    print(pd.DataFrame(data=variances))
    print('---Noises---')
    print(pd.DataFrame(data=noises))
    for i in range(5):
        self.assertTrue(np.isclose(gp.step(action=np.ones_like(ac) * -0.1, state=np.ones_like(st)),
                                   np.ones_like(st) * 0.5).all())
    for i in range(5):
        self.assertTrue(np.isclose(gp.step(action=np.ones_like(ac) * 0.1, state=np.ones_like(st) * 0.5),
                                   np.ones_like(st)).all())
    for i in range(5):
        print(gp.step(action=np.ones_like(ac) * -0.1, state=np.ones_like(st) * 0.5))
def _sample_transitions(self, env: Env, agent, sample_count, init_state):
    state = init_state
    sample_record = TransitionData(env_spec=self.env_spec)
    for i in range(sample_count):
        action = agent.predict(obs=state)
        new_state, re, done, info = env.step(action)
        if not isinstance(done, bool):
            raise TypeError()
        sample_record.append(state=state,
                             action=action,
                             reward=re,
                             new_state=new_state,
                             done=done)
        if done:
            state = env.reset()
        else:
            state = new_state
    return sample_record
def predict(self, obs, **kwargs):
    if self.is_training is True:
        return self.env_spec.action_space.sample()
    rollout = TrajectoryData(env_spec=self.env_spec)
    state = obs
    for i in range(self.parameters('SAMPLED_PATH_NUM')):
        path = TransitionData(env_spec=self.env_spec)
        # todo terminal_func signal problem to be considered?
        for _ in range(self.parameters('SAMPLED_HORIZON')):
            ac = self.policy.forward(obs=state)
            new_state, re, done, _ = self.dynamics_env.step(action=ac, state=state)  # step() as an Env
            path.append(state=state, action=ac, new_state=new_state, reward=re, done=done)
            state = new_state
        rollout.append(path)
    rollout.trajectories.sort(key=lambda x: x.cumulative_reward, reverse=True)
    ac = rollout.trajectories[0].action_set[0]
    assert self.env_spec.action_space.contains(ac)
    return ac
def test_random_buffer_1(self):
    env = make('ModifiedHalfCheetah')
    # env.init()
    env_spec = env.env_spec
    random_buffer = TransitionData(env_spec=env_spec,
                                   obs_shape=env_spec.obs_shape,
                                   action_shape=env_spec.action_shape,
                                   size=5)
    rl_buffer = TransitionData(env_spec=env_spec,
                               obs_shape=env_spec.obs_shape,
                               action_shape=env_spec.action_shape,
                               size=10)
    max_step = 10
    ep_len = 0
    obs = env.reset()
    while ep_len < max_step:
        act = self.RandomController_get_action(env=env, state=obs)
        obs_, reward, done, _ = env.step(act)
        random_buffer.append(obs, act, obs_, done, reward)
        assert not done
        obs = obs_
        ep_len += 1
def _sample_trajectories(self, env, agent, sample_count, init_state):
    state = init_state
    sample_record = TrajectoryData(self.env_spec)
    for i in range(sample_count):
        traj_record = TransitionData(self.env_spec)
        done = False  # reset the terminal flag for each new trajectory
        while done is not True:
            action = agent.predict(obs=state)
            new_state, re, done, info = env.step(action)
            if not isinstance(done, bool):
                raise TypeError()
            traj_record.append(state=state,
                               action=action,
                               reward=re,
                               new_state=new_state,
                               done=done)
            state = new_state
        state = env.reset()
        sample_record.append(traj_record)
    return sample_record
def test_init_continuous(self):
    algo, locals = self.create_mpc(env_id='Pendulum-v0')
    env_spec = locals['env_spec']
    env = locals['env']
    algo.init()
    for _ in range(100):
        assert env_spec.action_space.contains(algo.predict(env_spec.obs_space.sample()))
    st = env.reset()
    data = TransitionData(env_spec)
    for _ in range(10):
        ac = algo.predict(st)
        new_st, re, done, _ = env.step(action=ac)
        data.append(state=st, new_state=new_st, reward=re, action=ac, done=done)
    print(algo.train(batch_data=data))
def _test_1(self):
    mlp_dyna, local = self.create_continue_dynamics_model(env_id='ModifiedHalfCheetah',
                                                          name='mlp_dyna_model')
    print(local.items())
    env = local['env']
    assert isinstance(env, ModifiedHalfCheetahEnv)
    env_spec = env.env_spec
    batch_data = TransitionData(env_spec=env_spec,
                                obs_shape=env_spec.obs_shape,
                                action_shape=env_spec.action_shape)
    batch_size = 32
    obs = env.reset()
    for i in range(batch_size):
        act = self.RandomController_get_action(env=env, state=obs)
        obs_, reward, done, info = env.step(action=act)
        batch_data.append(obs, act, obs_, done, reward)
    self.assertEqual(len(batch_data), batch_size)
    mlp_dyna.init()
    train_epoch = 20
    for i in range(train_epoch):
        res = mlp_dyna.train(batch_data, train_iter=10)
        print('iter:{} loss:{}'.format(i, res))
def predict(self, obs, is_reward_func=True):
    """
    Sample SAMPLED_PATH_NUM simulated trajectories starting from 'obs' and return the optimal action.

    :param obs: initial state to plan from.
    :param is_reward_func: sort direction of the sampled trajectories; set to True when the
                           dynamics env returns a reward, False when it returns a cost.
    :return: optimal action for 'obs'.
    """
    rollout = TrajectoryData(env_spec=self.env_spec)
    for i in range(self.parameters('SAMPLED_PATH_NUM')):
        path = TransitionData(env_spec=self.env_spec)
        state = obs
        for j in range(self.parameters('SAMPLED_HORIZON')):
            act = self.policy.forward(obs=state)  # e.g. env.action_space.sample()
            new_state, cost, _, _ = self.dynamics_env.step(action=act, state=state)  # step() as an Env
            path.append(state=state, action=act, new_state=new_state, reward=cost, done=False)
            state = new_state
        rollout.append(path)
    rollout.trajectories.sort(key=lambda x: x.cumulative_reward, reverse=is_reward_func)
    optimal_act = rollout.trajectories[0].action_set[0]
    assert self.env_spec.action_space.contains(optimal_act)
    return optimal_act
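# The docstring above notes that is_reward_func only flips the sort direction. A hypothetical
# stand-alone helper illustrating the same selection step, assuming trajectory objects that
# expose cumulative_reward and action_set as in the snippets above: with a reward function the
# best trajectory has the largest return (reverse=True), with a cost function the smallest.
def select_first_action(trajectories, is_reward_func=True):
    trajectories.sort(key=lambda x: x.cumulative_reward, reverse=is_reward_func)
    return trajectories[0].action_set[0]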
from baconian.algo.dynamics.dynamics_model import DynamicsEnvWrapper
from baconian.algo.dynamics.terminal_func.terminal_func import RandomTerminalFunc
from baconian.algo.dynamics.reward_func.reward_func import RandomRewardFunc

env = make('Pendulum-v0')
name = 'demo_exp'
env_spec = EnvSpec(obs_space=env.observation_space,
                   action_space=env.action_space)
data = TransitionData(env_spec=env_spec)
policy = UniformRandomPolicy(env_spec=env_spec)

# Do some initial sampling here to train the GP model
st = env.reset()
for i in range(100):
    ac = policy.forward(st)
    new_st, re, _, _ = env.step(ac)
    data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
    st = new_st

gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=data)
gp.init()
gp.train()

dyna_env = DynamicsEnvWrapper(dynamics=gp)
# Since we only care about the prediction here, pass in a random terminal function and a
# random reward function.
dyna_env.set_terminal_reward_func(terminal_func=RandomTerminalFunc(),
                                  reward_func=RandomRewardFunc())

st = env.reset()
real_state_list = []
dynamics_state_list = []
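# A hedged continuation of the demo fragment above (not the original script's exact ending):
# roll the real environment and the wrapped dynamics environment forward from the same states
# and collect one-step predictions for comparison. dyna_env.step(action=..., state=...) follows
# the DynamicsEnvWrapper usage shown in the MPC snippets in this section.
for i in range(20):
    ac = policy.forward(st)
    real_next_st, _, _, _ = env.step(ac)
    pred_next_st, _, _, _ = dyna_env.step(action=ac, state=st)
    real_state_list.append(real_next_st)
    dynamics_state_list.append(pred_next_st)
    st = real_next_st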
class PPO(ModelFreeAlgo, OnPolicyAlgo, MultiPlaceholderInput):
    required_key_dict = DictConfig.load_json(file_path=GlobalConfig().DEFAULT_PPO_REQUIRED_KEY_LIST)

    @typechecked
    def __init__(self,
                 env_spec: EnvSpec,
                 stochastic_policy: StochasticPolicy,
                 config_or_config_dict: (DictConfig, dict),
                 value_func: VValueFunction,
                 name='ppo'):
        ModelFreeAlgo.__init__(self, env_spec=env_spec, name=name)
        self.config = construct_dict_config(config_or_config_dict, self)
        self.policy = stochastic_policy
        self.value_func = value_func
        to_ph_parameter_dict = dict()
        self.trajectory_memory = TrajectoryData(env_spec=env_spec)
        self.transition_data_for_trajectory = TransitionData(env_spec=env_spec)
        self.value_func_train_data_buffer = None
        # self.scaler = Scaler(obs_dim=self.env_spec.flat_obs_dim)
        self.scaler = RunningStandardScaler(dims=self.env_spec.flat_obs_dim)

        with tf.variable_scope(name):
            self.advantages_ph = tf.placeholder(tf.float32, (None,), 'advantages')
            self.v_func_val_ph = tf.placeholder(tf.float32, (None,), 'val_valfunc')
            dist_info_list = self.policy.get_dist_info()
            self.old_dist_tensor = [
                (tf.placeholder(**dict(dtype=dist_info['dtype'],
                                       shape=dist_info['shape'],
                                       name=dist_info['name'])), dist_info['name'])
                for dist_info in dist_info_list
            ]
            self.old_policy = self.policy.make_copy(
                reuse=False,
                name_scope='old_{}'.format(self.policy.name),
                name='old_{}'.format(self.policy.name),
                distribution_tensors_tuple=tuple(self.old_dist_tensor))
            to_ph_parameter_dict['beta'] = tf.placeholder(tf.float32, (), 'beta')
            to_ph_parameter_dict['eta'] = tf.placeholder(tf.float32, (), 'eta')
            to_ph_parameter_dict['kl_target'] = tf.placeholder(tf.float32, (), 'kl_target')
            to_ph_parameter_dict['lr_multiplier'] = tf.placeholder(tf.float32, (), 'lr_multiplier')

        self.parameters = ParametersWithTensorflowVariable(
            tf_var_list=[],
            rest_parameters=dict(
                advantages_ph=self.advantages_ph,
                v_func_val_ph=self.v_func_val_ph,
            ),
            to_ph_parameter_dict=to_ph_parameter_dict,
            name='ppo_param',
            save_rest_param_flag=False,
            source_config=self.config,
            require_snapshot=False)

        with tf.variable_scope(name):
            with tf.variable_scope('train'):
                self.kl = tf.reduce_mean(self.old_policy.kl(other=self.policy))
                self.policy_loss, self.policy_optimizer, self.policy_update_op = self._setup_policy_loss()
                self.value_func_loss, self.value_func_optimizer, self.value_func_update_op = \
                    self._setup_value_func_loss()

        var_list = get_tf_collection_var_list('{}/train'.format(name)) + \
            self.policy_optimizer.variables() + self.value_func_optimizer.variables()
        self.parameters.set_tf_var_list(tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name))
        MultiPlaceholderInput.__init__(self,
                                       sub_placeholder_input_list=[
                                           dict(obj=self.value_func, attr_name='value_func'),
                                           dict(obj=self.policy, attr_name='policy')
                                       ],
                                       parameters=self.parameters)

    @register_counter_info_to_status_decorator(increment=1, info_key='init', under_status='JUST_INITED')
    def init(self, sess=None, source_obj=None):
        self.policy.init()
        self.value_func.init()
        self.parameters.init()
        if source_obj:
            self.copy_from(source_obj)
        super().init()

    @record_return_decorator(which_recorder='self')
    @register_counter_info_to_status_decorator(increment=1, info_key='train', under_status='TRAIN')
    @typechecked
    def train(self, trajectory_data: TrajectoryData = None, train_iter=None, sess=None) -> dict:
        super(PPO, self).train()
        if trajectory_data is None:
            trajectory_data = self.trajectory_memory
        if len(trajectory_data) == 0:
            raise MemoryBufferLessThanBatchSizeError('not enough trajectory data')
        tf_sess = sess if sess else tf.get_default_session()
        SampleProcessor.add_estimated_v_value(trajectory_data, value_func=self.value_func)
        SampleProcessor.add_discount_sum_reward(trajectory_data, gamma=self.parameters('gamma'))
        SampleProcessor.add_gae(trajectory_data,
                                gamma=self.parameters('gamma'),
                                name='advantage_set',
                                lam=self.parameters('lam'),
                                value_func=self.value_func)
        train_data = trajectory_data.return_as_transition_data(shuffle_flag=False)
        SampleProcessor.normalization(train_data, key='advantage_set')
        policy_res_dict = self._update_policy(
            train_data=train_data,
            train_iter=train_iter if train_iter else self.parameters('policy_train_iter'),
            sess=tf_sess)
        value_func_res_dict = self._update_value_func(
            train_data=train_data,
            train_iter=train_iter if train_iter else self.parameters('value_func_train_iter'),
            sess=tf_sess)
        self.trajectory_memory.reset()
        return {**policy_res_dict, **value_func_res_dict}

    @register_counter_info_to_status_decorator(increment=1, info_key='test', under_status='TEST')
    def test(self, *arg, **kwargs) -> dict:
        return super().test(*arg, **kwargs)

    @register_counter_info_to_status_decorator(increment=1, info_key='predict')
    @typechecked
    def predict(self, obs: np.ndarray, sess=None, batch_flag: bool = False):
        tf_sess = sess if sess else tf.get_default_session()
        obs = make_batch(obs, original_shape=self.env_spec.obs_shape)
        obs = self.scaler.process(data=obs)
        ac = self.policy.forward(obs=obs,
                                 sess=tf_sess,
                                 feed_dict=self.parameters.return_tf_parameter_feed_dict())
        return ac

    @typechecked
    def append_to_memory(self, samples: SampleData):
        # todo how to make sure the data's time sequential
        iter_samples = samples.return_generator()
        # scale, offset = self.scaler.get()
        obs_list = []
        for state, new_state, action, reward, done in iter_samples:
            obs_list.append(state)
            self.transition_data_for_trajectory.append(state=self.scaler.process(state),
                                                       new_state=self.scaler.process(new_state),
                                                       action=action,
                                                       reward=reward,
                                                       done=done)
            if done is True:
                self.trajectory_memory.append(self.transition_data_for_trajectory)
                self.transition_data_for_trajectory.reset()
        self.scaler.update_scaler(data=np.array(obs_list))

    @record_return_decorator(which_recorder='self')
    def save(self, global_step, save_path=None, name=None, **kwargs):
        save_path = save_path if save_path else GlobalConfig().DEFAULT_MODEL_CHECKPOINT_PATH
        name = name if name else self.name
        MultiPlaceholderInput.save(self,
                                   save_path=save_path,
                                   global_step=global_step,
                                   name=name,
                                   **kwargs)
        return dict(check_point_save_path=save_path,
                    check_point_save_global_step=global_step,
                    check_point_save_name=name)

    @record_return_decorator(which_recorder='self')
    def load(self, path_to_model, model_name, global_step=None, **kwargs):
        MultiPlaceholderInput.load(self, path_to_model, model_name, global_step, **kwargs)
        return dict(check_point_load_path=path_to_model,
                    check_point_load_global_step=global_step,
                    check_point_load_name=model_name)

    def _setup_policy_loss(self):
        """
        Code clip from pat-cody.
        Three loss terms:
            1) standard policy gradient
            2) D_KL(pi_old || pi_new)
            3) Hinge loss on [D_KL - kl_targ]^2
        See: https://arxiv.org/pdf/1707.02286.pdf
        """
        if self.parameters('clipping_range') is not None:
            pg_ratio = tf.exp(self.policy.log_prob() - self.old_policy.log_prob())
            clipped_pg_ratio = tf.clip_by_value(pg_ratio,
                                                1 - self.parameters('clipping_range')[0],
                                                1 + self.parameters('clipping_range')[1])
            surrogate_loss = tf.minimum(self.advantages_ph * pg_ratio,
                                        self.advantages_ph * clipped_pg_ratio)
            loss = -tf.reduce_mean(surrogate_loss)
        else:
            loss1 = -tf.reduce_mean(self.advantages_ph *
                                    tf.exp(self.policy.log_prob() - self.old_policy.log_prob()))
            loss2 = tf.reduce_mean(self.parameters('beta') * self.kl)
            loss3 = self.parameters('eta') * tf.square(
                tf.maximum(0.0, self.kl - 2.0 * self.parameters('kl_target')))
            loss = loss1 + loss2 + loss3
            self.loss1 = loss1
            self.loss2 = loss2
            self.loss3 = loss3
        if isinstance(self.policy, PlaceholderInput):
            reg_list = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
                                         scope=self.policy.name_scope)
            if len(reg_list) > 0:
                reg_loss = tf.reduce_sum(reg_list)
                loss += reg_loss
        optimizer = tf.train.AdamOptimizer(
            learning_rate=self.parameters('policy_lr') * self.parameters('lr_multiplier'))
        train_op = optimizer.minimize(loss, var_list=self.policy.parameters('tf_var_list'))
        return loss, optimizer, train_op

    def _setup_value_func_loss(self):
        # todo update the value_func design
        loss = tf.reduce_mean(tf.square(self.value_func.v_tensor - self.v_func_val_ph))
        reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
                                     scope=self.value_func.name_scope)
        if len(reg_loss) > 0:
            loss += tf.reduce_sum(reg_loss)
        optimizer = tf.train.AdamOptimizer(self.parameters('value_func_lr'))
        train_op = optimizer.minimize(loss, var_list=self.value_func.parameters('tf_var_list'))
        return loss, optimizer, train_op

    def _update_policy(self, train_data: TransitionData, train_iter, sess):
        old_policy_feed_dict = dict()
        res = sess.run([getattr(self.policy, tensor[1]) for tensor in self.old_dist_tensor],
                       feed_dict={
                           self.policy.parameters('state_input'): train_data('state_set'),
                           self.policy.parameters('action_input'): train_data('action_set'),
                           **self.parameters.return_tf_parameter_feed_dict()
                       })
        for tensor, val in zip(self.old_dist_tensor, res):
            old_policy_feed_dict[tensor[0]] = val
        feed_dict = {
            self.policy.parameters('action_input'): train_data('action_set'),
            self.old_policy.parameters('action_input'): train_data('action_set'),
            self.policy.parameters('state_input'): train_data('state_set'),
            self.advantages_ph: train_data('advantage_set'),
            **self.parameters.return_tf_parameter_feed_dict(),
            **old_policy_feed_dict
        }
        average_loss, average_kl, average_entropy = 0.0, 0.0, 0.0
        total_epoch = 0
        kl = None
        for i in range(train_iter):
            loss, kl, entropy, _ = sess.run(
                [self.policy_loss, self.kl, tf.reduce_mean(self.policy.entropy()), self.policy_update_op],
                feed_dict=feed_dict)
            average_loss += loss
            average_kl += kl
            average_entropy += entropy
            total_epoch = i + 1
            if kl > self.parameters('kl_target', require_true_value=True) * 4:
                # early stopping if D_KL diverges badly
                break
        average_loss, average_kl, average_entropy = (average_loss / total_epoch,
                                                     average_kl / total_epoch,
                                                     average_entropy / total_epoch)
        if kl > self.parameters('kl_target', require_true_value=True) * 2:
            # servo beta to reach D_KL target
            self.parameters.set(key='beta',
                                new_val=np.minimum(35, 1.5 * self.parameters('beta', require_true_value=True)))
            if self.parameters('beta', require_true_value=True) > 30 and \
                    self.parameters('lr_multiplier', require_true_value=True) > 0.1:
                self.parameters.set(key='lr_multiplier',
                                    new_val=self.parameters('lr_multiplier', require_true_value=True) / 1.5)
        elif kl < self.parameters('kl_target', require_true_value=True) / 2:
            self.parameters.set(key='beta',
                                new_val=np.maximum(1 / 35,
                                                   self.parameters('beta', require_true_value=True) / 1.5))
            if self.parameters('beta', require_true_value=True) < (1 / 30) and \
                    self.parameters('lr_multiplier', require_true_value=True) < 10:
                self.parameters.set(key='lr_multiplier',
                                    new_val=self.parameters('lr_multiplier', require_true_value=True) * 1.5)
        return dict(policy_average_loss=average_loss,
                    policy_average_kl=average_kl,
                    policy_average_entropy=average_entropy,
                    policy_total_train_epoch=total_epoch)

    def _update_value_func(self, train_data: TransitionData, train_iter, sess):
        if self.value_func_train_data_buffer is None:
            self.value_func_train_data_buffer = train_data
        else:
            self.value_func_train_data_buffer.union(train_data)
        y_hat = self.value_func.forward(obs=train_data('state_set'))
        old_exp_var = 1 - np.var(train_data('advantage_set') - y_hat) / np.var(train_data('advantage_set'))
        for i in range(train_iter):
            data_gen = self.value_func_train_data_buffer.return_generator(
                batch_size=self.parameters('value_func_train_batch_size'),
                infinite_run=False,
                shuffle_flag=True,
                assigned_keys=('state_set', 'new_state_set', 'action_set',
                               'reward_set', 'done_set', 'advantage_set'))
            for batch in data_gen:
                loss, _ = sess.run([self.value_func_loss, self.value_func_update_op],
                                   feed_dict={
                                       self.value_func.state_input: batch[0],
                                       self.v_func_val_ph: batch[5],
                                       **self.parameters.return_tf_parameter_feed_dict()
                                   })
        y_hat = self.value_func.forward(obs=train_data('state_set'))
        loss = np.mean(np.square(y_hat - train_data('advantage_set')))
        exp_var = 1 - np.var(train_data('advantage_set') - y_hat) / np.var(train_data('advantage_set'))
        self.value_func_train_data_buffer = train_data
        return dict(value_func_loss=loss,
                    value_func_policy_exp_var=exp_var,
                    value_func_policy_old_exp_var=old_exp_var)
def test_l1_l2_norm(self):
    env = make('Acrobot-v1')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    name = 'dqn'
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp',
                              name=name + '_mlp',
                              mlp_config=[
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03,
                                      "L1_NORM": 1000.0,
                                      "L2_NORM": 1000.0
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUTPUT",
                                      "N_UNITS": 1,
                                      "L1_NORM": 1000.0,
                                      "L2_NORM": 1000.0,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=10,
                                         LEARNING_RATE=0.01,
                                         TRAIN_ITERATION=1,
                                         DECAY=0.5),
              name=name,
              value_func=mlp_q)
    dqn2, _ = self.create_dqn(name='dqn_2')
    a = TransitionData(env_spec)
    st = env.reset()
    dqn.init()
    dqn2.init()
    for i in range(100):
        ac = dqn.predict(obs=st, sess=self.sess, batch_flag=False)
        st_new, re, done, _ = env.step(action=ac)
        a.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
        st = st_new
    dqn.append_to_memory(a)
    for i in range(20):
        print('dqn1 loss: ',
              dqn.train(batch_data=a, train_iter=10, sess=None, update_target=True))
        print('dqn2 loss: ',
              dqn2.train(batch_data=a, train_iter=10, sess=None, update_target=True))
    var_list = self.sess.run(dqn.q_value_func.parameters('tf_var_list'))
    print(var_list)
    var_list2 = self.sess.run(dqn2.q_value_func.parameters('tf_var_list'))
    print(var_list2)
    for var, var2 in zip(var_list, var_list2):
        diff = np.abs(var2) - np.abs(var)
        self.assertTrue(np.greater(np.mean(diff), 0.0).all())
def test_init(self):
    ppo, locals = self.create_ppo()
    env = locals['env']
    env_spec = locals['env_spec']
    ppo.init()
    new_ppo, _ = self.create_ppo(name='new_ppo')
    new_ppo.copy_from(ppo)
    self.assert_var_list_id_no_equal(ppo.value_func.parameters('tf_var_list'),
                                     new_ppo.value_func.parameters('tf_var_list'))
    self.assert_var_list_id_no_equal(ppo.policy.parameters('tf_var_list'),
                                     new_ppo.policy.parameters('tf_var_list'))
    self.assert_var_list_equal(ppo.value_func.parameters('tf_var_list'),
                               new_ppo.value_func.parameters('tf_var_list'))
    self.assert_var_list_equal(ppo.policy.parameters('tf_var_list'),
                               new_ppo.policy.parameters('tf_var_list'))
    data = TransitionData(env_spec)
    st = env.reset()
    for i in range(100):
        ac = ppo.predict(st)
        assert ac.shape[0] == 1
        self.assertTrue(env_spec.action_space.contains(ac[0]))
        new_st, re, done, _ = env.step(ac)
        if i % 9 == 0 and i > 0:
            done = True
        else:
            done = False
        data.append(state=st, new_state=new_st, action=ac, reward=re, done=done)
    traj = TrajectoryData(env_spec=env_spec)
    traj.append(data)
    ppo.append_to_memory(traj)
    ppo.save(save_path=GlobalConfig().DEFAULT_LOG_PATH + '/ppo_test',
             global_step=0,
             name=ppo.name)
    for i in range(5):
        ppo.append_to_memory(traj)
        res = ppo.train()
        print('value_func_loss {}, policy_average_loss: {}'.format(res['value_func_loss'],
                                                                   res['policy_average_loss']))
    traj_data = TrajectoryData(env_spec=env_spec)
    traj_data.append(data)
    res = ppo.train(trajectory_data=traj_data, train_iter=5, sess=self.sess)
    print('value_func_loss {}, policy_average_loss: {}'.format(res['value_func_loss'],
                                                               res['policy_average_loss']))
    self.assert_var_list_at_least_not_equal(ppo.value_func.parameters('tf_var_list'),
                                            new_ppo.value_func.parameters('tf_var_list'))
    self.assert_var_list_at_least_not_equal(ppo.policy.parameters('tf_var_list'),
                                            new_ppo.policy.parameters('tf_var_list'))
    ppo.load(path_to_model=GlobalConfig().DEFAULT_LOG_PATH + '/ppo_test',
             model_name=ppo.name,
             global_step=0)
    self.assert_var_list_equal(ppo.value_func.parameters('tf_var_list'),
                               new_ppo.value_func.parameters('tf_var_list'))
    self.assert_var_list_equal(ppo.policy.parameters('tf_var_list'),
                               new_ppo.policy.parameters('tf_var_list'))