Example No. 1
    def __init__(self, gym_env_id: str, name: str = None):
        """

        :param gym_env_id: gym environment id
        :type gym_env_id: str
        :param name: name of the gym environment instance
        :type name: str
        """
        super().__init__(name=name if name else gym_env_id)
        self.env_id = gym_env_id
        try:
            self._gym_env = gym.make(gym_env_id)
        except gym_error.UnregisteredEnv:
            raise ValueError(
                'Env id: {} is not supported currently'.format(gym_env_id))
        self.action_space = space_converter(self._gym_env.action_space)
        self.observation_space = space_converter(
            self._gym_env.observation_space)
        if isinstance(self.action_space, garage_space.Box):
            self.action_space.low = np.nan_to_num(self.action_space.low)
            self.action_space.high = np.nan_to_num(self.action_space.high)
            self.action_space.sample = types.MethodType(
                self._sample_with_nan, self.action_space)
        if isinstance(self.observation_space, garage_space.Box):
            self.observation_space.low = np.nan_to_num(
                self.observation_space.low)
            self.observation_space.high = np.nan_to_num(
                self.observation_space.high)
            self.observation_space.sample = types.MethodType(
                self._sample_with_nan, self.observation_space)
        self.env_spec = EnvSpec(obs_space=self.observation_space,
                                action_space=self.action_space)

        self.reward_range = self._gym_env.reward_range
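
The snippet above only shows the constructor. A minimal usage sketch, assuming it belongs to the library's gym environment wrapper class (the name GymEnv below is a placeholder for illustration and is not confirmed by the snippet):

# Hypothetical usage: GymEnv stands in for whatever class owns the __init__ above.
# The constructor already builds env_spec and the NaN-safe action/observation spaces.
env = GymEnv('Pendulum-v0')
print(env.env_spec.flat_obs_dim, env.env_spec.flat_action_dim)
ac = env.action_space.sample()  # uses the patched, NaN-safe sampler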
Example No. 2
    def test_correctness(self):
        env_id = 'Pendulum-v0'

        env = make(env_id)
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)
        dyna = DebugDynamics(env_spec=env_spec)
        dyna = DynamicsEnvWrapper(dynamics=dyna)
        dyna.set_terminal_reward_func(terminal_func=RandomTerminalFunc(),
                                      reward_func=DebuggingCostFunc())
        policy = iLQRPolicy(env_spec=env_spec,
                            T=10,
                            delta=0.05,
                            iteration=2,
                            dynamics=dyna,
                            dynamics_model_train_iter=10,
                            cost_fn=DebuggingCostFunc())
        st = env.reset()
        dyna.st = np.zeros_like(st)
        for i in range(10):
            ac = policy.forward(st)
            st, _, _, _ = env.step(ac)
            # st = dyna.step(action=ac, state=st)
            print("analytical optimal action -0.5, cost -0.25")
            print('state: {}, action: {}, cost {}'.format(
                st, ac,
                policy.iLqr_instance.cost_fn(state=st,
                                             action=ac,
                                             new_state=None)))
Example No. 3
 def test_trajectory_data(self):
     env = make('Acrobot-v1')
     env_spec = EnvSpec(obs_space=env.observation_space,
                        action_space=env.action_space)
     a = TrajectoryData(env_spec)
     tmp_traj = TransitionData(env_spec)
     st = env.reset()
     re_list = []
     st_list = []
     for i in range(100):
         ac = env_spec.action_space.sample()
         st_new, re, done, _ = env.step(action=ac)
         st_list.append(st_new)
         re_list.append(re)
         done = (i + 1) % 10 == 0  # force fixed-length trajectories of 10 steps
         tmp_traj.append(state=st,
                         new_state=st_new,
                         action=ac,
                         done=done,
                         reward=re)
         if done:
             a.append(tmp_traj.get_copy())
             tmp_traj.reset()
     self.assertEqual(len(a.trajectories), 10)
     for traj in a.trajectories:
         self.assertEqual(len(traj), 10)
Example No. 4
    def create_mlp_v(self, env_id='Pendulum-v0', name='mlp_v'):
        env = make(env_id)
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        mlp_v = MLPVValueFunc(env_spec=env_spec,
                              name_scope=name + 'mlp_v',
                              name=name + 'mlp_v',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "L1_NORM": 0.01,
                                  "L2_NORM": 0.01,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
        return mlp_v, locals()
Example No. 5
	def test_prior_eval(self):
		env = make('Pendulum-v0')
		name = 'demo_exp'
		env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
		data = TransitionData(env_spec=env_spec)
		policy = UniformRandomPolicy(env_spec=env_spec)

		# Do some initial sampling here to train gmm model
		st = env.reset()
		for i in range(100):
			ac = policy.forward(st)
			new_st, re, _, _ = env.step(ac)
			data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
			st = new_st

		gmm = GaussianMixtureDynamicsPrior(env_spec=env_spec, batch_data=data)
		gmm.init()
		gmm.update(batch_data=data)
		mu0, Phi, m, n0 = gmm.eval(batch_data=data)

		state_shape = data.state_set.shape[1]
		action_shape = data.action_set.shape[1]
		self.assertEqual(state_shape + action_shape + state_shape, mu0.shape[0])
		self.assertEqual(state_shape + action_shape + state_shape, Phi.shape[0])
		self.assertEqual(state_shape + action_shape + state_shape, Phi.shape[1])
Example No. 6
    def test_mlp_deterministic_policy(self):
        env = make('Pendulum-v0')
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        policy, _ = self.create_mlp_deterministic_policy(
            name='mlp_policy', env_spec=env_spec)
        policy.init()
        for _ in range(10):
            ac = policy.forward(obs=env.observation_space.sample())
            self.assertTrue(env.action_space.contains(ac[0]))
        p2 = policy.make_copy(name='test', name_scope='test', reuse=False)
        p2.init()
        self.assertGreater(len(policy.parameters('tf_var_list')), 0)
        self.assertGreater(len(p2.parameters('tf_var_list')), 0)
        for var1, var2 in zip(policy.parameters('tf_var_list'),
                              p2.parameters('tf_var_list')):
            self.assertEqual(var1.shape, var2.shape)
            self.assertNotEqual(id(var1), id(var2))

        p3 = policy.make_copy(name='mlp_policy_2',
                              name_scope='mlp_policy',
                              reuse=True)
        p3.init()
        self.assertGreater(len(p3.parameters('tf_var_list')), 0)
        for var1, var2 in zip(policy.parameters('tf_var_list'),
                              p3.parameters('tf_var_list')):
            self.assertEqual(var1.shape, var2.shape)
            self.assertEqual(id(var1), id(var2))
Example No. 7
    def create_continue_dynamics_model(self,
                                       env_id='Acrobot-v1',
                                       name='mlp_dyna'):
        env = make(env_id)
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        mlp_dyna = ContinuousMLPGlobalDynamicsModel(
            env_spec=env_spec,
            name_scope=name + 'mlp_dyna',
            name=name + 'mlp_dyna',
            output_low=env_spec.obs_space.low,
            output_high=env_spec.obs_space.high,
            learning_rate=0.01,
            mlp_config=[{
                "ACT": "RELU",
                "B_INIT_VALUE": None,
                "NAME": "1",
                "N_UNITS": 16,
                "TYPE": "DENSE",
                "W_NORMAL_STDDEV": 0.03
            }, {
                "ACT": "LINEAR",
                "B_INIT_VALUE": 0.0,
                "NAME": "OUPTUT",
                "N_UNITS": env_spec.flat_obs_dim,
                "TYPE": "DENSE",
                "W_NORMAL_STDDEV": 0.03
            }])
        return mlp_dyna, locals()
Example No. 8
    def create_mlp_q_func(self, env_id='Acrobot-v1', name='mlp_q'):
        env = make(env_id)
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        mlp_q = MLPQValueFunction(env_spec=env_spec,
                                  name_scope=name,
                                  name=name,
                                  mlp_config=[{
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03,
                                      "L1_NORM": 0.2,
                                      "L2_NORM": 0.1
                                  }, {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUPTUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }])
        return mlp_q, locals()
Example No. 9
    def test_correctness(self):
        env_id = 'Pendulum-v0'
        env = make(env_id)
        n = env.observation_space.flat_dim
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)
        F = np.ones([env.observation_space.flat_dim,
                     env.observation_space.flat_dim + env.action_space.flat_dim]) * 0.00001
        # F[n:, n:] = 0.0001
        dyna = LinearDynamicsModel(env_spec=env_spec,
                                   state_transition_matrix=F,
                                   bias=np.zeros([env.observation_space.flat_dim]))
        C = np.ones([env.observation_space.flat_dim + env.action_space.flat_dim,
                     env.observation_space.flat_dim + env.action_space.flat_dim]) * 0.00001
        c = np.ones([env.observation_space.flat_dim + env.action_space.flat_dim])
        c[n:] = -1000
        # C[:n, :] = 0.
        # C[:, :n] = 0.
        # c[:n] = 0.0
        cost_fn = QuadraticCostFunc(C=C, c=c)

        policy = LQRPolicy(env_spec=env_spec,
                           T=5,
                           dynamics=dyna,
                           cost_fn=cost_fn)
        st = env.reset() * 0.0
        for i in range(10):
            ac = policy.forward(st)
            st = dyna.step(action=ac, state=st, allow_clip=True)
            print(cost_fn(state=st, action=ac, new_state=None))
            print(st, ac)
Example No. 10
    def test_init(self):
        env = make('Pendulum-v0')

        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)
        state_input = tf.placeholder(shape=[None, env_spec.flat_obs_dim],
                                     dtype=tf.float32,
                                     name='state_ph')

        mlp_v = MLPVValueFunc(env_spec=env_spec,
                              name_scope='mlp_q',
                              name='mlp_q',
                              state_input=state_input,
                              output_low=None,
                              output_high=None,
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
        mlp_v.init()
        mlp_v.forward(obs=env.observation_space.sample())
Example No. 11
 def test_trajectory_data(self):
     env = make('Acrobot-v1')
     env_spec = EnvSpec(obs_space=env.observation_space,
                        action_space=env.action_space)
     a = TrajectoryData(env_spec)
     tmp_traj = TransitionData(env_spec)
     st = env.reset()
     re_list = []
     st_list = []
     for i in range(100):
         ac = env_spec.action_space.sample()
         st_new, re, done, _ = env.step(action=ac)
         st_list.append(st_new)
         re_list.append(re)
         done = (i + 1) % 10 == 0  # force fixed-length trajectories of 10 steps
         tmp_traj.append(state=st,
                         new_state=st_new,
                         action=ac,
                         done=done,
                         reward=re)
         if done:
             a.append(tmp_traj)
             tmp_traj.reset()
     self.assertEqual(len(a.trajectories), 10)
     for traj in a.trajectories:
         self.assertEqual(len(traj), 10)
     data = a.return_as_transition_data()
     data_gen = data.return_generator()
     for d, re, st in zip(data_gen, re_list, st_list):
         self.assertEqual(d[3], re)
         self.assertTrue(np.equal(st, d[1]).all())
Example No. 12
    def __init__(self, dmcs_env_id: str, name: str = None):
        """

        :param dmcs_env_id:
        :param name:
        """
        super().__init__(name=name if name else dmcs_env_id)
        self.env_id = dmcs_env_id
        self.timestep = {}
        try:
            self.env = suite.load(dmcs_env_id, name)
        except ValueError:
            raise ValueError('Env id: {} and task: {} is not supported currently'.format(dmcs_env_id, name))

        self.metadata = {'render.modes': ['human', 'rgb_array'],
                         'video.frames_per_second': int(np.round(1.0 / self.env.control_timestep()))}

        self.action_space = convert_dm_control_to_gym_space(self.env.action_spec())
        self.observation_space = convert_dm_control_to_gym_space(self.env.observation_spec())
        if isinstance(self.action_space, garage_space.Box):
            self.action_space.low = np.nan_to_num(self.action_space.low)
            self.action_space.high = np.nan_to_num(self.action_space.high)
            self.action_space.sample = types.MethodType(self._sample_with_nan, self.action_space)
        if isinstance(self.observation_space, garage_space.Box):
            self.observation_space.low = np.nan_to_num(self.observation_space.low)
            self.observation_space.high = np.nan_to_num(self.observation_space.high)
            self.observation_space.sample = types.MethodType(self._sample_with_nan, self.observation_space)

        self.env_spec = EnvSpec(obs_space=self.observation_space,
                                action_space=self.action_space)

        self.viewer = None
Example No. 13
    def test_copy(self):
        env = make('Pendulum-v0')

        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)
        state_input = tf.placeholder(shape=[None, env_spec.flat_obs_dim],
                                     dtype=tf.float32,
                                     name='state_ph')

        mlp_v = MLPVValueFunc(env_spec=env_spec,
                              name_scope='mlp_v',
                              name='mlp_v',
                              state_input=state_input,
                              output_low=None,
                              output_high=None,
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
        mlp_v.init()

        new_mlp = mlp_v.make_copy(name='new_mlp',
                                  name_scope='mlp_v',
                                  reuse=True)

        new_mlp.init()

        self.assertGreater(len(mlp_v.parameters('tf_var_list')), 0)
        self.assertGreater(len(new_mlp.parameters('tf_var_list')), 0)

        for var1, var2 in zip(mlp_v.parameters('tf_var_list'),
                              new_mlp.parameters('tf_var_list')):
            self.assertEqual(var1.shape, var2.shape)
            self.assertEqual(id(var1), id(var2))

        not_reuse_mlp = mlp_v.make_copy(name='no-reuse-mlp',
                                        name_scope='mlp_no_reuse',
                                        reuse=False)
        not_reuse_mlp.init()
        self.assertGreater(len(not_reuse_mlp.parameters('tf_var_list')), 0)

        for var1, var2 in zip(mlp_v.parameters('tf_var_list'),
                              not_reuse_mlp.parameters('tf_var_list')):
            self.assertEqual(var1.shape, var2.shape)
            self.assertNotEqual(id(var1), id(var2))
Example No. 14
    def create_continue_dynamics_model(self,
                                       env_id='Acrobot-v1',
                                       name='mlp_dyna'):
        env = make(env_id)
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        mlp_dyna, _ = self.create_continuous_mlp_global_dynamics_model(
            env_spec=env_spec, name=name)
        return mlp_dyna, locals()
Example No. 15
def mountaincar_task_fn():
    exp_config = MOUNTAINCAR_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('MountainCar-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    dqn = DQN(env_spec=env_spec,
              name=name + '_dqn',
              value_func=mlp_q,
              **exp_config['DQN'])
    agent = Agent(env=env, env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     prob_scheduler=LinearScheduler(
                                                         t_fn=lambda: get_global_status_collect()(
                                                             'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                                                         **exp_config['EpsilonGreedy']['LinearScheduler']),
                                                     **exp_config['EpsilonGreedy']['config_or_config_dict']))
    flow = TrainTestFlow(train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                         config_or_config_dict=exp_config['TrainTestFlow']['config_or_config_dict'],
                         func_dict={
                             'test': {'func': agent.test,
                                      'args': list(),
                                      'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TEST_SAMPLES_COUNT']),
                                      },
                             'train': {'func': agent.train,
                                       'args': list(),
                                       'kwargs': dict(),
                                       },
                             'sample': {'func': agent.sample,
                                        'args': list(),
                                        'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                                                       env=agent.env,
                                                       in_which_status='TRAIN',
                                                       store_flag=True),
                                        },
                         })

    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name
    )
    experiment.run()
Example No. 16
    def test_dynamics_model_in_pendulum(self):
        env = self.create_env('Pendulum-v0')
        env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
        policy, _ = self.create_uniform_policy(env_spec=env_spec)
        data = TransitionData(env_spec=env_spec)
        st = env.reset()
        for i in range(100):
            ac = policy.forward(st)
            new_st, re, _, _ = env.step(ac)
            data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
            st = new_st

        gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=data)
        gp.init()
        gp.train()
        for i in range(len(data.state_set)):
            res = gp.step(action=data.action_set[i],
                          state=data.state_set[i],
                          allow_clip=True)
            _, var = gp._state_transit(action=data.action_set[i],
                                       state=data.state_set[i],
                                       required_var=True)
            print(res)
            print(data.new_state_set[i])
            print(np.sqrt(var))
            # self.assertTrue(np.isclose(res,
            #                            data.new_state_set[i], atol=1e-3).all())
            self.assertTrue(np.greater(data.new_state_set[i] + 1.96 * np.sqrt(var), res).all())
            self.assertTrue(np.less(data.new_state_set[i] - 1.96 * np.sqrt(var), res).all())

        lengthscales = {}
        variances = {}
        noises = {}
        for i, model in enumerate(gp.mgpr_model.models):
            lengthscales['GP' + str(i)] = model.kern.lengthscales.value
            variances['GP' + str(i)] = np.array([model.kern.variance.value])
            noises['GP' + str(i)] = np.array([model.likelihood.variance.value])
        print('-----Learned models------')
        pd.set_option('precision', 3)
        print('---Lengthscales---')
        print(pd.DataFrame(data=lengthscales))
        print('---Variances---')
        print(pd.DataFrame(data=variances))
        print('---Noises---')
        print(pd.DataFrame(data=noises))
Example No. 17
 def test_dynamics_model_basic(self):
     env = self.create_env('Pendulum-v0')
     env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
     policy, _ = self.create_uniform_policy(env_spec=env_spec)
     data = TransitionData(env_spec=env_spec)
     st = env.reset()
     ac = policy.forward(st)
     for i in range(10):
         re = 0.0
         data.append(state=np.ones_like(st) * 0.5, new_state=np.ones_like(st),
                     reward=re, done=False, action=np.ones_like(ac) * 0.1)
         data.append(state=np.ones_like(st), new_state=np.ones_like(st) * 0.5,
                     reward=re, done=False, action=np.ones_like(ac) * -0.1)
     gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=data)
     gp.init()
     gp.train()
     lengthscales = {}
     variances = {}
     noises = {}
     for i, model in enumerate(gp.mgpr_model.models):
         lengthscales['GP' + str(i)] = model.kern.lengthscales.value
         variances['GP' + str(i)] = np.array([model.kern.variance.value])
         noises['GP' + str(i)] = np.array([model.likelihood.variance.value])
     print('-----Learned models------')
     pd.set_option('precision', 3)
     print('---Lengthscales---')
     print(pd.DataFrame(data=lengthscales))
     print('---Variances---')
     print(pd.DataFrame(data=variances))
     print('---Noises---')
     print(pd.DataFrame(data=noises))
     for i in range(5):
         self.assertTrue(np.isclose(gp.step(action=np.ones_like(ac) * -0.1,
                                            state=np.ones_like(st)),
                                    np.ones_like(st) * 0.5).all())
     for i in range(5):
         self.assertTrue(np.isclose(gp.step(action=np.ones_like(ac) * 0.1,
                                            state=np.ones_like(st) * 0.5),
                                    np.ones_like(st)).all())
     for i in range(5):
         print(gp.step(action=np.ones_like(ac) * -0.1,
                       state=np.ones_like(st) * 0.5))
Example No. 18
    def init(self):
        if self._inited_flag:
            print('Warning: Current env has already been initialized. Check if env.init() has been called multiple times')
            print('Warning: Duplicated env initialization has been ignored')
            return
        self._status.set_status('INITED')


        self.action_space = space_converter(self.action_space)
        self.observation_space = space_converter(self.observation_space)
        if isinstance(self.action_space, garage_space.Box):
            self.action_space.low = np.nan_to_num(self.action_space.low)
            self.action_space.high = np.nan_to_num(self.action_space.high)
            self.action_space.sample = types.MethodType(self._sample_with_nan, self.action_space)
        if isinstance(self.observation_space, garage_space.Box):
            self.observation_space.low = np.nan_to_num(self.observation_space.low)
            self.observation_space.high = np.nan_to_num(self.observation_space.high)
            self.observation_space.sample = types.MethodType(self._sample_with_nan, self.observation_space)

        self.env_spec = EnvSpec(obs_space=self.observation_space, action_space=self.action_space)
        self._inited_flag = True
Example No. 19
    def create_ilqr_policy(self, env_id='Pendulum-v0'):
        class DebuggingCostFunc(CostFunc):
            def __call__(self,
                         state=None,
                         action=None,
                         new_state=None,
                         **kwargs) -> float:
                return float(np.sum(action * action))

        env = make(env_id)
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)
        dyna = UniformRandomDynamicsModel(env_spec=env_spec)
        dyna.init()
        policy = iLQRPolicy(env_spec=env_spec,
                            T=50,
                            delta=0.0005,
                            iteration=5,
                            dynamics=dyna,
                            cost_fn=DebuggingCostFunc())
        return policy, locals()
Example No. 20
 def test_transition_data(self):
     env = make('Acrobot-v1')
     env_spec = EnvSpec(obs_space=env.observation_space,
                        action_space=env.action_space)
     a = UniformRandomReplayBuffer(limit=10000,
                                   action_shape=env_spec.action_shape,
                                   observation_shape=env_spec.obs_shape)
     st = env.reset()
     for i in range(100):
         ac = env_spec.action_space.sample()
         st_new, re, done, _ = env.step(action=ac)
         a.append(obs0=st,
                  obs1=st_new,
                  action=ac,
                  reward=re,
                  terminal1=done)
         st = st_new
     batch = a.sample(batch_size=10)
     self.assertTrue(batch.state_set.shape[0] == 10)
     self.assertTrue(batch.action_set.shape[0] == 10)
     self.assertTrue(batch.reward_set.shape[0] == 10)
     self.assertTrue(batch.done_set.shape[0] == 10)
     self.assertTrue(batch.new_state_set.shape[0] == 10)
Example No. 21
 def create_env_spec(self, env):
     return EnvSpec(action_space=env.action_space,
                    obs_space=env.observation_space)
Example No. 22
def pendulum_task_fn():
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_v = MLPVValueFunc(env_spec=env_spec,
                          name_scope=name + 'mlp_v',
                          name=name + 'mlp_v',
                          **exp_config['MLP_V'])
    policy = NormalDistributionMLPPolicy(
        env_spec=env_spec,
        name_scope=name + 'mlp_policy',
        name=name + 'mlp_policy',
        **exp_config['POLICY'],
        output_low=env_spec.action_space.low,
        output_high=env_spec.action_space.high,
        reuse=False)

    ppo = PPO(env_spec=env_spec,
              **exp_config['PPO'],
              value_func=mlp_v,
              stochastic_policy=policy,
              name=name + 'ppo')
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ppo,
                  exploration_strategy=None,
                  noise_adder=None,
                  name=name + '_agent')

    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['TrainTestFlow']
        ['config_or_config_dict'],
        func_dict={
            'test': {
                'func':
                agent.test,
                'args':
                list(),
                'kwargs':
                dict(sample_count=exp_config['TrainTestFlow']
                     ['TEST_SAMPLES_COUNT'],
                     sample_trajectory_flag=True),
            },
            'train': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(),
            },
            'sample': {
                'func':
                agent.sample,
                'args':
                list(),
                'kwargs':
                dict(sample_count=exp_config['TrainTestFlow']
                     ['TRAIN_SAMPLES_COUNT'],
                     env=agent.env,
                     sample_type='trajectory',
                     in_which_status='TRAIN',
                     store_flag=True),
            },
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
Example No. 23
def pendulum_task_fn():
    # NOTE: exp_config is assumed to be defined at module level (a benchmark config
    # dict), as in the previous example; its definition is not shown in this snippet.
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    output_low=env_spec.action_space.low,
                                    output_high=env_spec.action_space.high,
                                    **exp_config['DeterministicMLPPolicy'],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                policy=policy,
                value_func=mlp_q,
                name=name + '_ddpg',
                **exp_config['DDPG'])

    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        **exp_config['DynamicsModel'])
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=REWARD_FUNC_DICT['Pendulum-v0']())
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  exploration_strategy=None,
                  noise_adder=AgentActionNoiseWrapper(
                      noise=NormalActionNoise(),
                      noise_weight_scheduler=ConstantSchedule(value=0.3),
                      action_weight_scheduler=ConstantSchedule(value=1.0)),
                  name=name + '_agent')

    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict={
            'train_algo': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training')
            },
            'train_algo_from_synthesized_data': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training', train_iter=1)
            },
            'train_dynamics': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_dynamics_training')
            },
            'test_algo': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=1, sample_trajectory_flag=True)
            },
            'test_dynamics': {
                'func': agent.algo.test_dynamics,
                'args': list(),
                'kwargs': dict(sample_count=10, env=env)
            },
            'sample_from_real_env': {
                'func':
                agent.sample,
                'args':
                list(),
                'kwargs':
                dict(sample_count=10,
                     env=agent.env,
                     in_which_status='TRAIN',
                     store_flag=True)
            },
            'sample_from_dynamics_env': {
                'func':
                agent.sample,
                'args':
                list(),
                'kwargs':
                dict(sample_count=50,
                     sample_type='transition',
                     env=agent.algo.dynamics_env,
                     in_which_status='TRAIN',
                     store_flag=False)
            }
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
Example No. 24
    def create_ddpg(self, env_id='Pendulum-v0', name='ddpg'):
        env = make(env_id)
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        mlp_q = MLPQValueFunction(env_spec=env_spec,
                                  name_scope=name + 'mlp_q',
                                  name=name + 'mlp_q',
                                  mlp_config=[{
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }, {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUPTUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }])
        self.assertTrue(len(mlp_q.parameters('tf_var_list')) == 4)
        policy = DeterministicMLPPolicy(env_spec=env_spec,
                                        name_scope=name + 'mlp_policy',
                                        name=name + 'mlp_policy',
                                        mlp_config=[{
                                            "ACT": "RELU",
                                            "B_INIT_VALUE": 0.0,
                                            "NAME": "1",
                                            "N_UNITS": 16,
                                            "TYPE": "DENSE",
                                            "W_NORMAL_STDDEV": 0.03
                                        }, {
                                            "ACT": "LINEAR",
                                            "B_INIT_VALUE": 0.0,
                                            "NAME": "OUPTUT",
                                            "N_UNITS":
                                            env_spec.flat_action_dim,
                                            "TYPE": "DENSE",
                                            "W_NORMAL_STDDEV": 0.03
                                        }],
                                        reuse=False)
        self.assertTrue(len(policy.parameters('tf_var_list')) == 4)

        ddpg = DDPG(env_spec=env_spec,
                    config_or_config_dict={
                        "REPLAY_BUFFER_SIZE": 10000,
                        "GAMMA": 0.999,
                        "CRITIC_LEARNING_RATE": 0.001,
                        "ACTOR_LEARNING_RATE": 0.001,
                        "DECAY": 0.5,
                        "BATCH_SIZE": 50,
                        "TRAIN_ITERATION": 1,
                        "critic_clip_norm": 0.1,
                        "actor_clip_norm": 0.1,
                    },
                    value_func=mlp_q,
                    policy=policy,
                    name=name,
                    replay_buffer=None)
        return ddpg, locals()
Example No. 25
"""
This gives a simple example of how to use a Gaussian Process (GP) to approximate the Gym environment Pendulum-v0.
We use the gpflow package to build the Gaussian Process.
"""
from baconian.core.core import EnvSpec
from baconian.envs.gym_env import make
import numpy as np
from baconian.common.sampler.sample_data import TransitionData
from baconian.algo.policy import UniformRandomPolicy
from baconian.algo.dynamics.gaussian_process_dynamiocs_model import GaussianProcessDyanmicsModel
from baconian.algo.dynamics.dynamics_model import DynamicsEnvWrapper
from baconian.algo.dynamics.terminal_func.terminal_func import RandomTerminalFunc
from baconian.algo.dynamics.reward_func.reward_func import RandomRewardFunc

env = make('Pendulum-v0')
name = 'demo_exp'
env_spec = EnvSpec(obs_space=env.observation_space,
                   action_space=env.action_space)
data = TransitionData(env_spec=env_spec)
policy = UniformRandomPolicy(env_spec=env_spec)
# Do some initial sampling here to train GP model
st = env.reset()
for i in range(100):
    ac = policy.forward(st)
    new_st, re, _, _ = env.step(ac)
    data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
    st = new_st

gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=data)
gp.init()
gp.train()

dyna_env = DynamicsEnvWrapper(dynamics=gp)
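
The demo stops after wrapping the trained GP in a DynamicsEnvWrapper. The imported RandomTerminalFunc and RandomRewardFunc suggest the intended next step; a hedged sketch of that wiring, mirroring the set_terminal_reward_func call (and the commented-out wrapper step) shown in Example No. 2:

# Sketch only: give the dynamics-backed env a terminal and reward function so it can
# be stepped like a normal environment; the step() usage follows Example No. 2.
dyna_env.set_terminal_reward_func(terminal_func=RandomTerminalFunc(),
                                  reward_func=RandomRewardFunc())
predicted_st = dyna_env.step(action=env_spec.action_space.sample(), state=st)
print(predicted_st)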
Example No. 26
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[{
                                        "ACT": "RELU",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "1",
                                        "N_UNITS": 16,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }, {
                                        "ACT": "LINEAR",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "OUPTUT",
                                        "N_UNITS": env_spec.flat_action_dim,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=ddpg,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()
            ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20,
            after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))

    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (),
                              dict(sample_count=100,
                                   env=agent.env,
                                   in_which_status='TRAIN',
                                   store_flag=True)))

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
Example No. 27
def task_fn():
    env = make('Acrobot-v1')
    name = 'example_scheduler_'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUPTUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=10,
                                         LEARNING_RATE=0.001,
                                         TRAIN_ITERATION=1,
                                         DECAY=0.5),
              name=name + '_dqn',
              value_func=mlp_q)
    agent = Agent(env=env, env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  algo_saving_scheduler=PeriodicalEventSchedule(
                      t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                      trigger_every_step=20,
                      after_t=10),
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     prob_scheduler=PiecewiseScheduler(
                                                         t_fn=lambda: get_global_status_collect()(
                                                             'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                                                         endpoints=((10, 0.3), (100, 0.1), (200, 0.0)),
                                                         outside_value=0.0
                                                     ),
                                                     init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (), dict(sample_count=100,
                                                     env=agent.env,
                                                     store_flag=True))
    )
    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name + 'experiment_debug'
    )

    dqn.parameters.set_scheduler(param_key='LEARNING_RATE',
                                 scheduler=LinearScheduler(
                                     t_fn=experiment.TOTAL_AGENT_TRAIN_SAMPLE_COUNT,
                                     schedule_timesteps=GlobalConfig().DEFAULT_EXPERIMENT_END_POINT[
                                         'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'],
                                     final_p=0.0001,
                                     initial_p=0.01))
    experiment.run()
Example No. 28
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        learning_rate=0.01,
        mlp_config=[{
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }, {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": env_spec.flat_obs_dim,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }])
    algo = ModelPredictiveControl(
        dynamics_model=mlp_dyna,
        env_spec=env_spec,
        config_or_config_dict=dict(SAMPLED_HORIZON=2,
                                   SAMPLED_PATH_NUM=5,
                                   dynamics_model_train_iter=10),
        name=name + '_mpc',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy'))
    algo.set_terminal_reward_function_for_dynamics_env(
        reward_func=RandomRewardFunc(name='reward_func'),
        terminal_func=RandomTerminalFunc(name='random_terminal'),
    )
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict={
            "TEST_EVERY_SAMPLE_COUNT": 10,
            "TRAIN_EVERY_SAMPLE_COUNT": 10,
            "START_TRAIN_AFTER_SAMPLE_COUNT": 5,
            "START_TEST_AFTER_SAMPLE_COUNT": 5,
        },
        func_dict={
            'test': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=10),
            },
            'train': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(),
            },
            'sample': {
                'func':
                agent.sample,
                'args':
                list(),
                'kwargs':
                dict(sample_count=100,
                     env=agent.env,
                     in_which_status='TRAIN',
                     store_flag=True),
            },
        })
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
Example No. 29
    def test_l1_l2_norm(self):
        env = make('Acrobot-v1')
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)
        name = 'dqn'

        mlp_q = MLPQValueFunction(env_spec=env_spec,
                                  name_scope=name + '_mlp',
                                  name=name + '_mlp',
                                  mlp_config=[{
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03,
                                      "L1_NORM": 1000.0,
                                      "L2_NORM": 1000.0
                                  }, {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUPTUT",
                                      "N_UNITS": 1,
                                      "L1_NORM": 1000.0,
                                      "L2_NORM": 1000.0,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }])
        dqn = DQN(env_spec=env_spec,
                  config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                             GAMMA=0.99,
                                             BATCH_SIZE=10,
                                             LEARNING_RATE=0.01,
                                             TRAIN_ITERATION=1,
                                             DECAY=0.5),
                  name=name,
                  value_func=mlp_q)
        dqn2, _ = self.create_dqn(name='dqn_2')
        a = TransitionData(env_spec)
        st = env.reset()
        dqn.init()
        dqn2.init()
        for i in range(100):
            ac = dqn.predict(obs=st, sess=self.sess, batch_flag=False)
            st_new, re, done, _ = env.step(action=ac)
            a.append(state=st,
                     new_state=st_new,
                     action=ac,
                     done=done,
                     reward=re)
            st = st_new
            dqn.append_to_memory(a)
        for i in range(20):
            print(
                'dqn1 loss: ',
                dqn.train(batch_data=a,
                          train_iter=10,
                          sess=None,
                          update_target=True))
            print(
                'dqn2 loss: ',
                dqn2.train(batch_data=a,
                           train_iter=10,
                           sess=None,
                           update_target=True))
        var_list = self.sess.run(dqn.q_value_func.parameters('tf_var_list'))
        print(var_list)
        var_list2 = self.sess.run(dqn2.q_value_func.parameters('tf_var_list'))
        print(var_list2)
        for var, var2 in zip(var_list, var_list2):
            diff = np.abs(var2) - np.abs(var)
            self.assertTrue(np.greater(np.mean(diff), 0.0).all())
Example No. 30
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp_'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_v = MLPVValueFunc(env_spec=env_spec,
                          name_scope=name + 'mlp_v',
                          name=name + 'mlp_v',
                          mlp_config=[{
                              "ACT": "RELU",
                              "B_INIT_VALUE": 0.0,
                              "NAME": "1",
                              "N_UNITS": 16,
                              "L1_NORM": 0.01,
                              "L2_NORM": 0.01,
                              "TYPE": "DENSE",
                              "W_NORMAL_STDDEV": 0.03
                          }, {
                              "ACT": "LINEAR",
                              "B_INIT_VALUE": 0.0,
                              "NAME": "OUPTUT",
                              "N_UNITS": 1,
                              "TYPE": "DENSE",
                              "W_NORMAL_STDDEV": 0.03
                          }])

    policy = NormalDistributionMLPPolicy(env_spec=env_spec,
                                         name_scope=name + 'mlp_policy',
                                         name=name + 'mlp_policy',
                                         mlp_config=[{
                                             "ACT": "RELU",
                                             "B_INIT_VALUE": 0.0,
                                             "NAME": "1",
                                             "L1_NORM": 0.01,
                                             "L2_NORM": 0.01,
                                             "N_UNITS": 16,
                                             "TYPE": "DENSE",
                                             "W_NORMAL_STDDEV": 0.03
                                         }, {
                                             "ACT": "LINEAR",
                                             "B_INIT_VALUE": 0.0,
                                             "NAME": "OUPTUT",
                                             "N_UNITS":
                                             env_spec.flat_action_dim,
                                             "TYPE": "DENSE",
                                             "W_NORMAL_STDDEV": 0.03
                                         }],
                                         reuse=False)

    ppo = PPO(env_spec=env_spec,
              config_or_config_dict={
                  "gamma": 0.995,
                  "lam": 0.98,
                  "policy_train_iter": 10,
                  "value_func_train_iter": 10,
                  "clipping_range": None,
                  "beta": 1.0,
                  "eta": 50,
                  "log_var_init": -1.0,
                  "kl_target": 0.003,
                  "policy_lr": 0.01,
                  "value_func_lr": 0.01,
                  "value_func_train_batch_size": 10,
                  "lr_multiplier": 1.0
              },
              value_func=mlp_v,
              stochastic_policy=policy,
              name=name + 'ppo')
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=ppo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()
            ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20,
            after_t=10),
        name=name + 'agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (),
                              dict(sample_count=100,
                                   env=agent.env,
                                   in_which_status='TRAIN',
                                   store_flag=True)))

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
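
The task functions in these examples do nothing until they are launched. In Baconian demos they are normally handed to an experiment runner; a hedged sketch of that launch step (the exact import path and signature are an assumption and may differ between library versions):

# Assumed launcher: pass the task function defined above to Baconian's experiment runner.
from baconian.core.experiment_runner import single_exp_runner

single_exp_runner(task_fn)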