def test_Transition_union(self):
    """Smoke test: MPC predict() stays in the action space and train() runs on a TransitionData batch."""
    algo, locals = self.create_mpc(name='test_Transition_union')
    env_spec = locals['env_spec']
    env = locals['env']
    env.env_spec = env_spec
    algo.init()
    # Every predicted action must lie inside the declared action space.
    for _ in range(100):
        sampled_obs = env_spec.obs_space.sample()
        assert env_spec.action_space.contains(algo.predict(sampled_obs))
    st = env.reset()
    data = TransitionData(env_spec)
    # NOTE(review): st is never advanced to new_st here — confirm intended.
    for _ in range(10):
        ac = algo.predict(st)
        new_st, re, done, _ = env.step(action=ac)
        data.append(state=st, new_state=new_st, reward=re, action=ac, done=done)
    print(algo.train(batch_data=data))
def test_trajectory_data(self):
    """Build 10 forced ten-step trajectories from a 100-step rollout and verify
    they round-trip through return_as_transition_data()."""
    env = make('Acrobot-v1')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    a = TrajectoryData(env_spec)
    tmp_traj = TransitionData(env_spec)
    st = env.reset()
    re_list = []
    st_list = []
    for i in range(100):
        ac = env_spec.action_space.sample()
        st_new, re, done, _ = env.step(action=ac)
        st_list.append(st_new)
        re_list.append(re)
        # Override the env's done flag: force an episode boundary every 10 steps.
        done = (i + 1) % 10 == 0
        tmp_traj.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
        if done:
            # BUGFIX/consistency: append a copy. Appending tmp_traj itself and then
            # reset()-ing it risks aliasing the stored trajectory with the cleared
            # buffer; the sibling test_trajectory_data uses get_copy() for this reason.
            a.append(tmp_traj.get_copy())
            tmp_traj.reset()
    self.assertEqual(a.trajectories.__len__(), 10)
    for traj in a.trajectories:
        self.assertEqual(len(traj), 10)
    data = a.return_as_transition_data()
    data_gen = data.return_generator()
    # Generator order must match the recorded rewards and next-states.
    for d, re, st in zip(data_gen, re_list, st_list):
        self.assertEqual(d[3], re)
        self.assertTrue(np.equal(st, d[1]).all())
def test_sample_batch(self):
    """Fill a sized TransitionData buffer from a random rollout, then draw a 16-sample batch."""
    env = make('ModifiedHalfCheetah')
    env.init()
    env_spec = env.env_spec
    random_buffer = TransitionData(env_spec=env_spec,
                                   obs_shape=env_spec.obs_shape,
                                   action_shape=env_spec.action_shape,
                                   size=100)
    print("====> Random Sample")
    num_trajectory = 1
    max_step = 30
    for _ in range(num_trajectory):
        step_count = 0
        obs = env.reset()
        while step_count < max_step:
            act = self.RandomController_get_action(env=env, state=obs)
            next_obs, reward, done, _ = env.step(act)
            random_buffer.append(obs, act, next_obs, done, reward)
            assert not done
            obs = next_obs
            step_count += 1
    batch_data_1 = random_buffer.sample_batch(batch_size=16, shuffle_flag=True)
    assert isinstance(batch_data_1, dict)
    print(batch_data_1.keys())
    self.assertEqual(len(batch_data_1['action_set']), 16)
def wrap_func():
    """Closure: run the MPC smoke test on Pendulum-v0 with a policy built by the wrapped factory."""
    mlp_dyna, local = self.create_continue_dynamics_model(env_id='Pendulum-v0')
    env_spec = local['env_spec']
    env = local['env']
    policy = func(env_spec=env_spec)[0]
    algo, locals = self.create_mpc(env_spec=env_spec, mlp_dyna=mlp_dyna,
                                   policy=policy, env=env)
    algo.init()
    # Predicted actions must always be valid for the action space.
    for _ in range(100):
        obs_sample = env_spec.obs_space.sample()
        assert env_spec.action_space.contains(algo.predict(obs_sample))
    st = env.reset()
    data = TransitionData(env_spec)
    for _ in range(10):
        ac = algo.predict(st)
        new_st, re, done, _ = env.step(action=ac)
        data.append(state=st, new_state=new_st, reward=re, action=ac, done=done)
    print(algo.train(batch_data=data))
def test_StandScaler(self):
    """Compare RunningStandardScaler stats from mini-batch updates vs one bulk update."""
    env = make('ModifiedHalfCheetah')
    env_spec = env.env_spec
    self.assertEqual(env_spec.flat_obs_dim, 18)
    self.assertEqual(env_spec.flat_action_dim, 6)
    buffer_size = 10
    buffer = TransitionData(env_spec=env_spec,
                            obs_shape=env_spec.obs_shape,
                            action_shape=env_spec.action_shape,
                            size=buffer_size)
    obs = env.reset()
    for _ in range(buffer_size):
        act = env.action_space.sample()
        next_obs, rew, done, _ = env.step(act)
        buffer.append(obs, act, next_obs, done, rew)
    batch_list = buffer.sample_batch_as_Transition(4, all_as_batch=True)
    # Scaler updated incrementally over mini-batches ...
    state_input_scaler_1 = RunningStandardScaler(env_spec.flat_action_dim)
    for batch_data in batch_list:
        state_input_scaler_1.update_scaler(batch_data.action_set)
    mean_1 = state_input_scaler_1._mean
    var_1 = state_input_scaler_1._var
    print(mean_1)
    print(var_1)
    # ... versus a scaler updated once with the whole action set.
    state_input_scaler_2 = RunningStandardScaler(env_spec.flat_action_dim)
    state_input_scaler_2.update_scaler(buffer.action_set)
    mean_2 = state_input_scaler_2._mean
    var_2 = state_input_scaler_2._var
    print(mean_2)
    print(var_2)
def _launch(self) -> bool:
    """Main experiment loop: step the cyber model, buffer each transition into the
    agent's memory, and train/test on their configured schedules.

    :return: True once all `self.total_steps` steps have completed.
    """
    env = self.env
    env_spec = self.env_spec
    cyber = self.cyber
    obs, ep_ret, ep_len = env.reset(), 0, 0
    for step in range(self.total_steps):
        self.step_counter.increase(1)
        act = self.agent.predict(obs=obs)
        # The cyber model (not the raw env) produces the next observation/reward.
        obs_, reward, done, _ = cyber.step(obs, act)
        # Fresh one-transition buffer handed to the agent's replay memory each step.
        _buffer = TransitionData(env_spec=env_spec,
                                 obs_shape=env_spec.obs_shape,
                                 action_shape=env_spec.action_shape)
        _buffer.append(obs, act, obs_, done, reward)
        self.agent.algo.append_to_memory(_buffer)
        ep_ret += reward
        ep_len += 1
        if done or ep_len > self.max_step_per_episode:
            # Episode terminated (or exceeded the step cap): restart from a fresh reset.
            obs, ep_ret, ep_len = env.reset(), 0, 0
        else:
            obs = obs_
        # Train/test only after their warm-up counts, at their configured periods.
        if step > self.train_after_step and step % self.train_every_step == 0:
            self.agent.train()
        if step > self.test_after_step and step % self.test_every_step == 0:
            self.data_sample, self.test_reward = self.agent.test(env=env, cyber=cyber,
                                                                 data_sample=self.data_sample,
                                                                 test_reward=self.test_reward,
                                                                 num_test=self.num_test,
                                                                 max_step_per_episode=self.max_step_per_episode)
    env.close()
    self.plot_test_reward(self.data_sample, self.test_reward)
    return True
def DynaMLP_get_action(self, mlp_dyna: DynamicsModel, env: Env, state, cost_fn,
                       num_simulated_paths, horizon):
    """Random-shooting MPC: roll random action sequences through the learned
    dynamics, score each path by accumulated negative cost, and return the first
    action of the best path (mirrors mpc.ModelBasedModelPredictiveControl.predict()).

    :param mlp_dyna: learned dynamics model used for simulated steps
    :param env: environment supplying env_spec and the action space to sample
    :param state: current state the simulated rollouts start from
    :param cost_fn: cost_fn(obs, action, next_obs) -> scalar cost
    :param num_simulated_paths: number of random candidate paths
    :param horizon: steps per simulated path
    :return: first action of the highest-reward simulated path
    """
    rollout = TrajectoryData(env_spec=env.env_spec)
    for _ in range(num_simulated_paths):
        path = TransitionData(env_spec=env.env_spec)
        obs = state
        for _ in range(horizon):
            action = env.action_space.sample()
            next_obs = mlp_dyna.step(action=action, state=obs)
            cost = cost_fn(obs, action, next_obs)
            # Store negative cost so cumulative_reward sorting picks the cheapest path.
            path.append(obs, action, next_obs, False, -cost)
            obs = next_obs
        rollout.append(path)
    rollout.trajectories.sort(key=lambda traj: traj.cumulative_reward, reverse=True)
    return rollout.trajectories[0].action_set[0]
def test_prior_eval(self):
    """GMM dynamics prior: eval() moments must match the (s, a, s') joint dimension."""
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    data = TransitionData(env_spec=env_spec)
    policy = UniformRandomPolicy(env_spec=env_spec)
    # Collect random-policy transitions so the GMM has data to fit.
    st = env.reset()
    for _ in range(100):
        ac = policy.forward(st)
        new_st, re, _, _ = env.step(ac)
        data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
        st = new_st
    gmm = GaussianMixtureDynamicsPrior(env_spec=env_spec, batch_data=data)
    gmm.init()
    gmm.update(batch_data=data)
    mu0, Phi, m, n0 = gmm.eval(batch_data=data)
    state_shape = data.state_set.shape[1]
    action_shape = data.action_set.shape[1]
    # Prior moments live in the concatenated (state, action, next_state) space.
    joint_dim = state_shape + action_shape + state_shape
    self.assertEqual(joint_dim, mu0.shape[0])
    self.assertEqual(joint_dim, Phi.shape[0])
    self.assertEqual(joint_dim, Phi.shape[1])
def test_apply_normalization(self):
    """Normalize then denormalize a buffer and check the data round-trips.

    Test normalization & denormalization in Transition.apply_(de)normalization.
    """
    mlp_dyna, local = self.create_continue_dynamics_model(
        env_id='ModifiedHalfCheetah', name='mlp_dyna_model')
    mlp_dyna.init()
    print(mlp_dyna.state_input_scaler)
    env = local['env']
    assert isinstance(env, ModifiedHalfCheetahEnv)
    env_spec = env.env_spec
    buffer_size = 50
    random_buffer = TransitionData(env_spec=env_spec,
                                   obs_shape=env_spec.obs_shape,
                                   action_shape=env_spec.action_shape,
                                   size=buffer_size)
    obs = env.reset()
    for _ in range(buffer_size):
        act = env.action_space.sample()
        obs_, reward, done, info = env.step(act)
        random_buffer.append(obs, act, obs_, done, reward)
    normalized_random_buffer, mean_dict, var_dict = random_buffer.apply_normalization()
    denormalized_random_buffer = normalized_random_buffer.apply_denormalization(
        None, mean_dict, var_dict)
    # BUGFIX: the old assertions compared `.any()` results — two booleans that
    # are equal for any non-zero arrays, so the test asserted nothing.
    # Compare element-wise (up to float rounding from the round trip) instead.
    self.assertTrue(np.allclose(random_buffer.action_set,
                                denormalized_random_buffer.action_set))
    self.assertTrue(np.allclose(random_buffer.state_set,
                                denormalized_random_buffer.state_set))
def test_init(self):
    """Dyna: train on sampled transitions, then dynamics test error must beat a loose bound."""
    ddpg, locals = self.create_ddpg()
    env_spec = locals['env_spec']
    env = locals['env']
    mlp_dyna = self.create_continuous_mlp_global_dynamics_model(env_spec=env_spec)[0]
    algo = self.create_dyna(env_spec=env_spec,
                            model_free_algo=ddpg,
                            dyanmics_model=mlp_dyna)[0]
    algo.init()
    st = env.reset()
    data = TransitionData(env_spec)
    for _ in range(100):
        ac = algo.predict(st)
        new_st, re, done, _ = env.step(action=ac)
        data.append(state=st, new_state=new_st, reward=re, action=ac, done=done)
    algo.append_to_memory(samples=data)
    pre_res = 10000
    # Exercise all three training entry points repeatedly.
    for _ in range(20):
        print(algo.train(batch_data=data))
        print(algo.train(batch_data=data, state='state_dynamics_training'))
        print(algo.train(batch_data=data, state='state_agent_training'))
    res = algo.test_dynamics(env=env, sample_count=100)
    self.assertLess(list(res.values())[0], pre_res)
    self.assertLess(list(res.values())[1], pre_res)
    print(res)
    algo.test()
def test_with_dqn(self):
    """DQN smoke test: fill memory, then train from an explicit batch and from internal memory."""
    dqn, local = self.create_dqn()
    env = local['env']
    env_spec = local['env_spec']
    dqn.init()
    st = env.reset()
    from baconian.common.sampler.sample_data import TransitionData
    a = TransitionData(env_spec)
    res = []
    for _ in range(100):
        ac = dqn.predict(obs=st, sess=self.sess, batch_flag=False)
        st_new, re, done, _ = env.step(action=ac)
        a.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
    dqn.append_to_memory(a)
    # batch_data=a trains on the explicit batch; batch_data=None uses replay memory.
    for batch in (a, None):
        loss = dqn.train(batch_data=batch, train_iter=10, sess=None,
                         update_target=True)['average_loss']
        res.append(loss)
    print(dqn._status())
    print(dqn._status._info_dict_with_sub_info)
def test_trajectory_data(self):
    """Split a 100-step rollout into 10 forced ten-step trajectories."""
    env = make('Acrobot-v1')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    a = TrajectoryData(env_spec)
    tmp_traj = TransitionData(env_spec)
    st = env.reset()
    re_list = []
    st_list = []
    for i in range(100):
        ac = env_spec.action_space.sample()
        st_new, re, done, _ = env.step(action=ac)
        st_list.append(st_new)
        re_list.append(re)
        # Override the env's done flag: cut an episode every 10 steps.
        done = (i + 1) % 10 == 0
        tmp_traj.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
        if done:
            # get_copy() so the stored trajectory survives the reset() below.
            a.append(tmp_traj.get_copy())
            tmp_traj.reset()
    self.assertEqual(a.trajectories.__len__(), 10)
    for traj in a.trajectories:
        self.assertEqual(len(traj), 10)
def sample(self, batch_size) -> SampleData:
    """Draw a uniformly random batch from the replay buffer as TransitionData.

    :param batch_size: number of transitions to draw
    :raises MemoryBufferLessThanBatchSizeError: if the buffer holds fewer entries
    :return: TransitionData with `batch_size` sampled transitions
    """
    if self.nb_entries < batch_size:
        raise MemoryBufferLessThanBatchSizeError()
    # todo This will be changed to prioritised
    # NOTE(review): indices are drawn from [0, nb_entries - 2) — presumably to keep
    # the successor entry valid; confirm the off-by-two bound is intended.
    batch_idxs = np.random.randint(self.nb_entries - 2, size=batch_size)
    result = {
        'obs0': array_min2d(self.observations0.get_batch(batch_idxs)),
        'obs1': array_min2d(self.observations1.get_batch(batch_idxs)),
        'rewards': array_min2d(self.rewards.get_batch(batch_idxs)),
        'actions': array_min2d(self.actions.get_batch(batch_idxs)),
        'terminals1': array_min2d(self.terminals1.get_batch(batch_idxs)),
    }
    res = TransitionData(obs_shape=self.obs_shape, action_shape=self.action_shape)
    for obs0, obs1, action, terminal, re in zip(result['obs0'], result['obs1'],
                                                result['actions'],
                                                result['terminals1'],
                                                result['rewards']):
        res.append(state=obs0, new_state=obs1, action=action, done=terminal, reward=re)
    return res
def test_init(self):
    """End-to-end DQN lifecycle: collect data, copy_from, save/load checkpoints,
    and repeated target-network updates."""
    dqn, locals = self.create_dqn()
    env = locals['env']
    env_spec = locals['env_spec']
    dqn.init()
    st = env.reset()
    a = TransitionData(env_spec)
    # Collect 100 on-policy transitions into the buffer.
    for i in range(100):
        ac = dqn.predict(obs=st, sess=self.sess, batch_flag=False)
        st_new, re, done, _ = env.step(action=ac)
        a.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
        st = st_new
    dqn.append_to_memory(a)
    # A copied DQN must share parameter VALUES but not TF variable objects.
    new_dqn, _ = self.create_dqn(name='new_dqn')
    new_dqn.copy_from(dqn)
    self.assert_var_list_id_no_equal(dqn.q_value_func.parameters('tf_var_list'),
                                     new_dqn.q_value_func.parameters('tf_var_list'))
    self.assert_var_list_id_no_equal(dqn.target_q_value_func.parameters('tf_var_list'),
                                     new_dqn.target_q_value_func.parameters('tf_var_list'))
    self.assert_var_list_equal(dqn.q_value_func.parameters('tf_var_list'),
                               new_dqn.q_value_func.parameters('tf_var_list'))
    self.assert_var_list_equal(dqn.target_q_value_func.parameters('tf_var_list'),
                               new_dqn.target_q_value_func.parameters('tf_var_list'))
    # Snapshot before training so load() below can restore the pre-training weights.
    dqn.save(save_path=GlobalConfig().DEFAULT_LOG_PATH + '/dqn_test', global_step=0,
             name=dqn.name)
    for i in range(10):
        print(dqn.train(batch_data=a, train_iter=10, sess=None, update_target=True))
        print(dqn.train(batch_data=None, train_iter=10, sess=None, update_target=True))
    # Training must have moved the weights away from the earlier copy.
    self.assert_var_list_at_least_not_equal(dqn.q_value_func.parameters('tf_var_list'),
                                            new_dqn.q_value_func.parameters('tf_var_list'))
    self.assert_var_list_at_least_not_equal(dqn.target_q_value_func.parameters('tf_var_list'),
                                            new_dqn.target_q_value_func.parameters('tf_var_list'))
    # Restoring the step-0 checkpoint brings the weights back to the copied values.
    dqn.load(path_to_model=GlobalConfig().DEFAULT_LOG_PATH + '/dqn_test',
             model_name=dqn.name, global_step=0)
    self.assert_var_list_equal(dqn.q_value_func.parameters('tf_var_list'),
                               new_dqn.q_value_func.parameters('tf_var_list'))
    self.assert_var_list_equal(dqn.target_q_value_func.parameters('tf_var_list'),
                               new_dqn.target_q_value_func.parameters('tf_var_list'))
    # Run the target-update op repeatedly and report how far target weights
    # remain from the online weights.
    for i in range(10):
        self.sess.run(dqn.update_target_q_value_func_op,
                      feed_dict=dqn.parameters.return_tf_parameter_feed_dict())
    var1 = self.sess.run(dqn.q_value_func.parameters('tf_var_list'))
    var2 = self.sess.run(dqn.target_q_value_func.parameters('tf_var_list'))
    import numpy as np
    total_diff = 0.0
    for v1, v2 in zip(var1, var2):
        total_diff += np.mean(np.abs(np.array(v1) - np.array(v2)))
    print('update target, difference mean', total_diff)
def get_some_samples(env, num, env_spec, policy):
    """Roll the policy for `num` steps in env and return the transitions (done is always False)."""
    data = TransitionData(env_spec=env_spec)
    st = env.reset()
    for _ in range(num):
        ac = policy.forward(st)
        new_st, re, _, _ = env.step(ac)
        data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
        st = new_st
    return data
def test_random_buffer_1(self):
    """Append 10 transitions into a size-5 TransitionData buffer."""
    env = make('ModifiedHalfCheetah')
    # env.init()
    env_spec = env.env_spec
    random_buffer = TransitionData(env_spec=env_spec,
                                   obs_shape=env_spec.obs_shape,
                                   action_shape=env_spec.action_shape,
                                   size=5)
    # NOTE(review): rl_buffer is created but never used in this test — confirm intended.
    rl_buffer = TransitionData(env_spec=env_spec,
                               obs_shape=env_spec.obs_shape,
                               action_shape=env_spec.action_shape,
                               size=10)
    max_step = 10
    step_count = 0
    obs = env.reset()
    while step_count < max_step:
        act = self.RandomController_get_action(env=env, state=obs)
        next_obs, reward, done, _ = env.step(act)
        random_buffer.append(obs, act, next_obs, done, reward)
        assert not done
        obs = next_obs
        step_count += 1
def sample_transition(self, env, count=100):
    """Sample `count` random-action transitions from env, tracking the evolving state.

    :param env: environment exposing env_spec, get_state() and step()
    :param count: number of transitions to record (default 100)
    :return: TransitionData holding the sampled transitions
    """
    data = TransitionData(env.env_spec)
    st = env.get_state()
    for i in range(count):
        ac = env.env_spec.action_space.sample()
        new_st, re, done, info = env.step(action=ac)
        data.append(state=st, action=ac, new_state=new_st, done=done, reward=re)
        # BUGFIX: advance the tracked state. Previously every recorded transition
        # reused the initial state while the env itself moved on, producing
        # inconsistent (state, new_state) pairs; the sibling sampler in
        # test_mlp_dynamics_model updates st the same way.
        st = new_st
    return data
def test_mlp_dynamics_model(self):
    """MLP dynamics model: step/train smoke test plus init/copy parameter semantics."""
    mlp_dyna, local = self.create_continue_dynamics_model(name='mlp_dyna_model')
    env = local['env']
    env_spec = local['env_spec']
    env.reset()
    mlp_dyna.init()
    # step() should accept arbitrary sampled state/action pairs.
    for _ in range(100):
        mlp_dyna.step(action=np.array(env_spec.action_space.sample()),
                      state=env_spec.obs_space.sample())
    data = TransitionData(env_spec)
    st = env.get_state()
    for _ in range(10):
        ac = env_spec.action_space.sample()
        new_st, re, done, info = env.step(action=ac)
        data.append(state=st, action=ac, new_state=new_st, done=done, reward=re)
        st = new_st
    print(mlp_dyna.train(batch_data=data, train_iter=10))
    mlp_dyna_2, _ = self.create_continue_dynamics_model(name='model_2')
    mlp_dyna_2.init()
    # Freshly-initialised models differ in values and never share variable objects.
    self.assert_var_list_at_least_not_equal(
        var_list1=mlp_dyna.parameters('tf_var_list'),
        var_list2=mlp_dyna_2.parameters('tf_var_list'))
    self.assert_var_list_id_no_equal(
        var_list1=mlp_dyna.parameters('tf_var_list'),
        var_list2=mlp_dyna_2.parameters('tf_var_list'))
    # init(source_obj=...) copies values, not variable identities.
    mlp_dyna_2.init(source_obj=mlp_dyna)
    self.assert_var_list_equal(
        var_list1=mlp_dyna.parameters('tf_var_list'),
        var_list2=mlp_dyna_2.parameters('tf_var_list'))
    self.assert_var_list_id_no_equal(
        var_list1=mlp_dyna.parameters('tf_var_list'),
        var_list2=mlp_dyna_2.parameters('tf_var_list'))
    # copy_from(...) behaves the same way.
    mlp_dyna_2.copy_from(mlp_dyna)
    self.assert_var_list_equal(
        var_list1=mlp_dyna.parameters('tf_var_list'),
        var_list2=mlp_dyna_2.parameters('tf_var_list'))
    self.assert_var_list_id_no_equal(
        var_list1=mlp_dyna.parameters('tf_var_list'),
        var_list2=mlp_dyna_2.parameters('tf_var_list'))
def __init__(self,
             env_spec,
             dynamics_model: DynamicsModel,
             config_or_config_dict: (DictConfig, dict),
             policy: Policy,
             name='mpc'):
    """MPC controller over a learned dynamics model.

    :param env_spec: environment specification
    :param dynamics_model: learned dynamics used for planning rollouts
    :param config_or_config_dict: DictConfig or plain dict of settings
    :param policy: policy used to propose candidate actions
    :param name: base name for this module and its parameter scope
    """
    super().__init__(env_spec, dynamics_model, name)
    self.config = construct_dict_config(config_or_config_dict, self)
    self.policy = policy
    self.parameters = Parameters(parameters=dict(),
                                 source_config=self.config,
                                 name=name + '_' + 'mpc_param')
    # Rollout memory; starts empty.
    self.memory = TransitionData(env_spec=env_spec)
def __init__(self,
             env_spec,
             dynamics_model: DynamicsModel,
             config_or_config_dict: (DictConfig, dict),
             policy: Policy,
             name='mpc'):
    """MPC controller: plans through `dynamics_model`, using `policy` for proposals.

    :param env_spec: environment specification
    :param dynamics_model: learned dynamics used for planning rollouts
    :param config_or_config_dict: DictConfig or plain dict of settings
    :param policy: policy used to propose candidate actions
    :param name: base name for this module and its parameter scope
    """
    super().__init__(env_spec, dynamics_model, name)
    self.config = construct_dict_config(config_or_config_dict, self)
    self.parameters = Parameters(parameters=dict(),
                                 source_config=self.config,
                                 name=name + '_' + 'mpc_param')
    self.policy = policy
    # TODO: 9.18 should also make memory served as init parameter in __init__,
    # and set default value as Transition in init()
    self.memory = TransitionData(env_spec=env_spec)
def test_dynamics_model_in_pendulum(self):
    """Fit the GP dynamics model on Pendulum data and check its predictions stay
    within the 95% confidence band around the observed next states."""
    env = self.create_env('Pendulum-v0')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    policy, _ = self.create_uniform_policy(env_spec=env_spec)
    data = TransitionData(env_spec=env_spec)
    st = env.reset()
    # Collect 100 random-policy transitions to fit the GP on.
    for i in range(100):
        ac = policy.forward(st)
        new_st, re, _, _ = env.step(ac)
        data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
        st = new_st
    gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=data)
    gp.init()
    gp.train()
    for i in range(len(data.state_set)):
        res = gp.step(action=data.action_set[i], state=data.state_set[i],
                      allow_clip=True)
        _, var = gp._state_transit(action=data.action_set[i],
                                   state=data.state_set[i], required_var=True)
        print(res)
        print(data.new_state_set[i])
        print(np.sqrt(var))
        # self.assertTrue(np.isclose(res,
        #                            data.new_state_set[i], atol=1e-3).all())
        # The prediction must lie inside observed next state +/- 1.96 sigma.
        self.assertTrue(np.greater(data.new_state_set[i] + 1.96 * np.sqrt(var), res).all())
        self.assertTrue(np.less(data.new_state_set[i] - 1.96 * np.sqrt(var), res).all())
    # Dump the learned kernel hyperparameters for each per-dimension GP.
    lengthscales = {}
    variances = {}
    noises = {}
    for i, model in enumerate(gp.mgpr_model.models):
        lengthscales['GP' + str(i)] = model.kern.lengthscales.value
        variances['GP' + str(i)] = np.array([model.kern.variance.value])
        noises['GP' + str(i)] = np.array([model.likelihood.variance.value])
    print('-----Learned models------')
    pd.set_option('precision', 3)
    print('---Lengthscales---')
    print(pd.DataFrame(data=lengthscales))
    print('---Variances---')
    print(pd.DataFrame(data=variances))
    print('---Noises---')
    print(pd.DataFrame(data=noises))
def test_dynamics_model_basic(self):
    """Fit the GP dynamics model on a tiny two-mapping synthetic dataset and
    check it reproduces both training transitions."""
    env = self.create_env('Pendulum-v0')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    policy, _ = self.create_uniform_policy(env_spec=env_spec)
    data = TransitionData(env_spec=env_spec)
    st = env.reset()
    ac = policy.forward(st)
    # Deterministic dataset: action +0.1 maps state 0.5 -> 1.0,
    # action -0.1 maps state 1.0 -> 0.5 (each repeated 10 times).
    for i in range(10):
        re = 0.0
        data.append(state=np.ones_like(st) * 0.5, new_state=np.ones_like(st),
                    reward=re, done=False, action=np.ones_like(ac) * 0.1)
        data.append(state=np.ones_like(st), new_state=np.ones_like(st) * 0.5,
                    reward=re, done=False, action=np.ones_like(ac) * -0.1)
    gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=data)
    gp.init()
    gp.train()
    # Dump learned kernel hyperparameters for each per-dimension GP.
    lengthscales = {}
    variances = {}
    noises = {}
    i = 0
    for model in gp.mgpr_model.models:
        lengthscales['GP' + str(i)] = model.kern.lengthscales.value
        variances['GP' + str(i)] = np.array([model.kern.variance.value])
        noises['GP' + str(i)] = np.array([model.likelihood.variance.value])
        i += 1
    print('-----Learned models------')
    pd.set_option('precision', 3)
    print('---Lengthscales---')
    print(pd.DataFrame(data=lengthscales))
    print('---Variances---')
    print(pd.DataFrame(data=variances))
    print('---Noises---')
    print(pd.DataFrame(data=noises))
    # The GP must reproduce both training mappings...
    for i in range(5):
        self.assertTrue(np.isclose(gp.step(action=np.ones_like(ac) * -0.1,
                                           state=np.ones_like(st)),
                                   np.ones_like(st) * 0.5).all())
    for i in range(5):
        self.assertTrue(np.isclose(gp.step(action=np.ones_like(ac) * 0.1,
                                           state=np.ones_like(st) * 0.5),
                                   np.ones_like(st)).all())
    # ...and here we only inspect an off-training-data query (no assertion).
    for i in range(5):
        print(gp.step(action=np.ones_like(ac) * -0.1, state=np.ones_like(st) * 0.5))
def _sample_transitions(self, env: Env, agent, sample_count, init_state):
    """Collect `sample_count` transitions from env under the agent's policy,
    resetting the env whenever an episode terminates.

    :raises TypeError: if the env's done flag is not a bool.
    """
    state = init_state
    sample_record = TransitionData(env_spec=self.env_spec)
    for _ in range(sample_count):
        action = agent.predict(obs=state)
        new_state, re, done, info = env.step(action)
        if not isinstance(done, bool):
            raise TypeError()
        sample_record.append(state=state, action=action, reward=re,
                             new_state=new_state, done=done)
        state = env.reset() if done else new_state
    return sample_record
def predict(self, obs, **kwargs):
    """Return an action for `obs`: random while training, otherwise plan by
    sampling SAMPLED_PATH_NUM imagined paths through the dynamics env and
    taking the first action of the highest-reward path."""
    if self.is_training is True:
        return self.env_spec.action_space.sample()
    rollout = TrajectoryData(env_spec=self.env_spec)
    state = obs
    for _ in range(self.parameters('SAMPLED_PATH_NUM')):
        path = TransitionData(env_spec=self.env_spec)
        # todo terminal_func signal problem to be consider?
        for _ in range(self.parameters('SAMPLED_HORIZON')):
            ac = self.policy.forward(obs=state)
            # dynamics_env.step() simulates the transition via the learned model.
            new_state, re, done, _ = self.dynamics_env.step(action=ac, state=state)
            path.append(state=state, action=ac, new_state=new_state,
                        reward=re, done=done)
            state = new_state
        rollout.append(path)
    rollout.trajectories.sort(key=lambda traj: traj.cumulative_reward, reverse=True)
    best_first_action = rollout.trajectories[0].action_set[0]
    assert self.env_spec.action_space.contains(best_first_action)
    return best_first_action
def _sample_trajectories(self, env, agent, sample_count, init_state):
    """Collect `sample_count` full episodes from env under the agent's policy.

    :param env: environment to step
    :param agent: agent providing predict(obs=...)
    :param sample_count: number of trajectories (episodes) to collect
    :param init_state: state the first trajectory starts from
    :raises TypeError: if the env's done flag is not a bool
    :return: TrajectoryData with one TransitionData per episode
    """
    state = init_state
    sample_record = TrajectoryData(self.env_spec)
    for i in range(sample_count):
        # BUGFIX: `done` was initialised once before the loop, so after the first
        # episode every subsequent iteration saw done == True, skipped the while
        # loop entirely and appended an empty trajectory. Reset it per episode.
        done = False
        traj_record = TransitionData(self.env_spec)
        while done is not True:
            action = agent.predict(obs=state)
            new_state, re, done, info = env.step(action)
            if not isinstance(done, bool):
                raise TypeError()
            traj_record.append(state=state, action=action, reward=re,
                               new_state=new_state, done=done)
            state = new_state
        state = env.reset()
        sample_record.append(traj_record)
    return sample_record
def test_init_continuous(self):
    """MPC on continuous Pendulum: predictions stay in-space, then one train() call."""
    algo, locals = self.create_mpc(env_id='Pendulum-v0')
    env_spec = locals['env_spec']
    env = locals['env']
    algo.init()
    for _ in range(100):
        candidate = algo.predict(env_spec.obs_space.sample())
        assert env_spec.action_space.contains(candidate)
    st = env.reset()
    data = TransitionData(env_spec)
    # NOTE(review): st is never advanced to new_st here — confirm intended.
    for _ in range(10):
        ac = algo.predict(st)
        new_st, re, done, _ = env.step(action=ac)
        data.append(state=st, new_state=new_st, reward=re, action=ac, done=done)
    print(algo.train(batch_data=data))
def _test_1(self):
    """Train the MLP dynamics model for a few epochs on one random 32-step batch."""
    mlp_dyna, local = self.create_continue_dynamics_model(
        env_id='ModifiedHalfCheetah', name='mlp_dyna_model')
    print(local.items())
    env = local['env']
    assert isinstance(env, ModifiedHalfCheetahEnv)
    env_spec = env.env_spec
    batch_data = TransitionData(env_spec=env_spec,
                                obs_shape=env_spec.obs_shape,
                                action_shape=env_spec.action_shape)
    batch_size = 32
    obs = env.reset()
    for _ in range(batch_size):
        act = self.RandomController_get_action(env=env, state=obs)
        next_obs, reward, done, info = env.step(action=act)
        batch_data.append(obs, act, next_obs, done, reward)
    self.assertEqual(len(batch_data), batch_size)
    mlp_dyna.init()
    train_epoch = 20
    for epoch in range(train_epoch):
        res = mlp_dyna.train(batch_data, train_iter=10)
        print('iter:{} loss:{}'.format(epoch, res))
def predict(self, obs, is_reward_func=True):
    """Sample SAMPLED_PATH_NUM imagined trajectories starting from `obs` and
    return the first action of the best one.

    :param obs: initial state for the simulated rollouts.
    :param is_reward_func: sort direction for trajectories; set True when the
           dynamics env emits rewards (higher is better), False for costs.
    :return: optimal action for `obs`.
    """
    rollout = TrajectoryData(env_spec=self.env_spec)
    for _ in range(self.parameters('SAMPLED_PATH_NUM')):
        path = TransitionData(env_spec=self.env_spec)
        state = obs
        for _ in range(self.parameters('SAMPLED_HORIZON')):
            act = self.policy.forward(obs=state)  # env.action_space.sample()
            # step() through the learned dynamics acting as an Env.
            new_state, cost, _, _ = self.dynamics_env.step(action=act, state=state)
            path.append(state=state, action=act, new_state=new_state,
                        reward=cost, done=False)
            state = new_state
        rollout.append(path)
    rollout.trajectories.sort(key=lambda traj: traj.cumulative_reward,
                              reverse=is_reward_func)
    optimal_act = rollout.trajectories[0].action_set[0]
    assert self.env_spec.action_space.contains(optimal_act)
    return optimal_act
def test_l1_l2_norm(self):
    """With huge L1/L2 penalties the regularised DQN's weights should end up
    smaller in magnitude than an identical DQN trained without regularisation."""
    env = make('Acrobot-v1')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    name = 'dqn'
    # Q network with extreme L1/L2 regularisation (1000.0) on both layers.
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp',
                              name=name + '_mlp',
                              mlp_config=[
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03,
                                      "L1_NORM": 1000.0,
                                      "L2_NORM": 1000.0
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUPTUT",
                                      "N_UNITS": 1,
                                      "L1_NORM": 1000.0,
                                      "L2_NORM": 1000.0,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=10,
                                         LEARNING_RATE=0.01,
                                         TRAIN_ITERATION=1,
                                         DECAY=0.5),
              name=name,
              value_func=mlp_q)
    # Baseline DQN built by the fixture, without the regularisation penalties.
    dqn2, _ = self.create_dqn(name='dqn_2')
    a = TransitionData(env_spec)
    st = env.reset()
    dqn.init()
    dqn2.init()
    # Collect a shared batch of 100 transitions under dqn's policy.
    for i in range(100):
        ac = dqn.predict(obs=st, sess=self.sess, batch_flag=False)
        st_new, re, done, _ = env.step(action=ac)
        a.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
        st = st_new
    dqn.append_to_memory(a)
    # Train both DQNs on the same data for 20 rounds.
    for i in range(20):
        print(
            'dqn1 loss: ',
            dqn.train(batch_data=a, train_iter=10, sess=None, update_target=True))
        print(
            'dqn2 loss: ',
            dqn2.train(batch_data=a, train_iter=10, sess=None, update_target=True))
    var_list = self.sess.run(dqn.q_value_func.parameters('tf_var_list'))
    print(var_list)
    var_list2 = self.sess.run(dqn2.q_value_func.parameters('tf_var_list'))
    print(var_list2)
    # On average, the unregularised weights must have larger magnitude.
    for var, var2 in zip(var_list, var_list2):
        diff = np.abs(var2) - np.abs(var)
        self.assertTrue(np.greater(np.mean(diff), 0.0).all())
def __init__(self,
             env_spec: EnvSpec,
             stochastic_policy: StochasticPolicy,
             config_or_config_dict: (DictConfig, dict),
             value_func: VValueFunction,
             warm_up_trajectories_number=5,
             use_time_index_flag=False,
             name='ppo'):
    """PPO algorithm: stochastic policy plus V-value baseline with KL-based updates.

    :param env_spec: environment specification
    :param stochastic_policy: policy to optimise
    :param config_or_config_dict: DictConfig or plain dict of hyperparameters
    :param value_func: state-value function used as the baseline
    :param warm_up_trajectories_number: trajectories gathered before training
    :param use_time_index_flag: if True, the last observation dim is treated as a
           time index and excluded from observation normalisation
    :param name: TF variable-scope / module name
    """
    ModelFreeAlgo.__init__(
        self,
        env_spec=env_spec,
        name=name,
        warm_up_trajectories_number=warm_up_trajectories_number)
    self.use_time_index_flag = use_time_index_flag
    self.config = construct_dict_config(config_or_config_dict, self)
    self.policy = stochastic_policy
    self.value_func = value_func
    to_ph_parameter_dict = dict()
    self.trajectory_memory = TrajectoryData(env_spec=env_spec)
    self.transition_data_for_trajectory = TransitionData(env_spec=env_spec)
    self.value_func_train_data_buffer = None
    self.scaler = RunningStandardScaler(dims=self.env_spec.flat_obs_dim)
    if use_time_index_flag:
        # Pass the time-index dim through the scaler unchanged: mean 0 and a
        # huge variance make the normalisation a near no-op for that dim.
        scale_last_time_index_mean = self.scaler._mean
        scale_last_time_index_mean[-1] = 0
        scale_last_time_index_var = self.scaler._var
        scale_last_time_index_var[-1] = 1000 * 1000
        self.scaler.set_param(mean=scale_last_time_index_mean,
                              var=scale_last_time_index_var)
    with tf.variable_scope(name):
        self.advantages_ph = tf.placeholder(tf.float32, (None, ), 'advantages')
        self.v_func_val_ph = tf.placeholder(tf.float32, (None, ), 'val_val_func')
        dist_info_list = self.policy.get_dist_info()
        # Placeholders mirroring the policy's distribution tensors; they feed the
        # frozen "old" policy used in the KL terms.
        self.old_dist_tensor = [
            (tf.placeholder(**dict(dtype=dist_info['dtype'],
                                   shape=dist_info['shape'],
                                   name=dist_info['name'])), dist_info['name'])
            for dist_info in dist_info_list
        ]
        self.old_policy = self.policy.make_copy(
            reuse=False,
            name_scope='old_{}'.format(self.policy.name),
            name='old_{}'.format(self.policy.name),
            distribution_tensors_tuple=tuple(self.old_dist_tensor))
        # Adaptive-KL coefficients are fed per step, hence placeholders.
        to_ph_parameter_dict['beta'] = tf.placeholder(
            tf.float32, (), 'beta')
        to_ph_parameter_dict['eta'] = tf.placeholder(tf.float32, (), 'eta')
        to_ph_parameter_dict['kl_target'] = tf.placeholder(
            tf.float32, (), 'kl_target')
        to_ph_parameter_dict['lr_multiplier'] = tf.placeholder(
            tf.float32, (), 'lr_multiplier')
    self.parameters = ParametersWithTensorflowVariable(
        tf_var_list=[],
        rest_parameters=dict(
            advantages_ph=self.advantages_ph,
            v_func_val_ph=self.v_func_val_ph,
        ),
        to_ph_parameter_dict=to_ph_parameter_dict,
        name='ppo_param',
        save_rest_param_flag=False,
        source_config=self.config,
        require_snapshot=False)
    with tf.variable_scope(name):
        with tf.variable_scope('train'):
            # KL(old || new) and entropy diagnostics plus both training ops.
            self.kl = tf.reduce_mean(self.old_policy.kl(self.policy))
            self.average_entropy = tf.reduce_mean(self.policy.entropy())
            self.policy_loss, self.policy_optimizer, self.policy_update_op = self._setup_policy_loss()
            self.value_func_loss, self.value_func_optimizer, self.value_func_update_op = self._setup_value_func_loss()
    # Register every variable created under <name>/train plus the optimizer slots.
    var_list = get_tf_collection_var_list(
        '{}/train'.format(name)) + self.policy_optimizer.variables() + self.value_func_optimizer.variables()
    self.parameters.set_tf_var_list(
        tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name))
    MultiPlaceholderInput.__init__(self,
                                   sub_placeholder_input_list=[
                                       dict(
                                           obj=self.value_func,
                                           attr_name='value_func',
                                       ),
                                       dict(obj=self.policy, attr_name='policy')
                                   ],
                                   parameters=self.parameters)