Esempio n. 1
0
 def test_name_register(self):
     a = RandomRewardFunc()
     self.assertTrue(a.allow_duplicate_name)
     b = RandomRewardFunc()
     a = Basic(name='s')
     try:
         b = Basic(name='s')
     except GlobalNameExistedError as e:
         pass
     else:
         raise NotCatchCorrectExceptionError()
    def create_mpc(self,
                   env_id='Acrobot-v1',
                   name='mpc',
                   policy=None,
                   mlp_dyna=None,
                   env_spec=None,
                   env=None):
        if mlp_dyna is None:
            mlp_dyna, local = self.create_continue_dynamics_model(
                env_id, name + 'mlp_dyna')
            env_spec = local['env_spec']
            env = local['env']

        policy = policy if policy else UniformRandomPolicy(env_spec=env_spec,
                                                           name='unp')

        algo = ModelPredictiveControl(dynamics_model=mlp_dyna,
                                      env_spec=env_spec,
                                      config_or_config_dict=dict(
                                          SAMPLED_HORIZON=2,
                                          SAMPLED_PATH_NUM=5,
                                          dynamics_model_train_iter=10),
                                      name=name,
                                      policy=policy)
        algo.set_terminal_reward_function_for_dynamics_env(
            terminal_func=RandomTerminalFunc(name='random_p'),
            reward_func=RandomRewardFunc('re_fun'))
        return algo, locals()
 def create_dyna(self,
                 env_spec=None,
                 model_free_algo=None,
                 dyanmics_model=None,
                 name='dyna'):
     if not env_spec:
         model_free_algo, local = self.create_ddpg()
         dyanmics_model, _ = self.create_continuous_mlp_global_dynamics_model(
             env_spec=local['env_spec'])
         env_spec = local['env_spec']
         env = local['env']
     algo = Dyna(env_spec=env_spec,
                 name=name,
                 model_free_algo=model_free_algo,
                 dynamics_model=dyanmics_model,
                 config_or_config_dict=dict(dynamics_model_train_iter=1,
                                            model_free_algo_train_iter=1))
     algo.set_terminal_reward_function_for_dynamics_env(
         terminal_func=RandomTerminalFunc(), reward_func=RandomRewardFunc())
     return algo, locals()
Esempio n. 4
0
    def test_dynamics_as_env(self):
        env = self.create_env('Pendulum-v0')
        env_spec = self.create_env_spec(env)

        mlp_dyna = self.create_continuous_mlp_global_dynamics_model(env_spec=env_spec)[0]
        env = mlp_dyna.return_as_env()
        env.init()
        env.set_terminal_reward_func(terminal_func=FixedEpisodeLengthTerminalFunc(max_step_length=10,
                                                                                  step_count_fn=lambda: env.total_step_count_fn() - env._last_reset_point),
                                     reward_func=RandomRewardFunc())
        env.reset()
        self.assertEqual(env._last_reset_point, 0)
        for i in range(11):
            new_st, re, done, _ = env.step(action=env_spec.action_space.sample())
            self.assertEqual(env.total_step_count_fn(), i + 1)
            if done is True:
                self.assertEqual(i, 9)
                env.reset()
                self.assertEqual(env._last_reset_point, env.total_step_count_fn())
                self.assertEqual(env._last_reset_point, i + 1)
Esempio n. 5
0
st = env.reset()
for i in range(100):
    ac = policy.forward(st)
    new_st, re, _, _ = env.step(ac)
    data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
    st = new_st

gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=data)
gp.init()
gp.train()

dyna_env = DynamicsEnvWrapper(dynamics=gp)
# Since we only care about the prediction here, so we pass the terminal function and reward function setting with
# random one
dyna_env.set_terminal_reward_func(terminal_func=RandomTerminalFunc(),
                                  reward_func=RandomRewardFunc())

st = env.reset()
real_state_list = []
dynamics_state_list = []
test_sample_count = 100
for i in range(test_sample_count):
    ac = env_spec.action_space.sample()
    gp.reset_state(state=st)
    new_state_dynamics, _, _, _ = dyna_env.step(action=ac, allow_clip=True)
    new_state_real, _, done, _ = env.step(action=ac)
    real_state_list.append(new_state_real)
    dynamics_state_list.append(new_state_dynamics)
    st = new_state_real
    if done is True:
        env.reset()
Esempio n. 6
0
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        learning_rate=0.01,
        mlp_config=[{
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }, {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": env_spec.flat_obs_dim,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }])
    algo = ModelPredictiveControl(
        dynamics_model=mlp_dyna,
        env_spec=env_spec,
        config_or_config_dict=dict(SAMPLED_HORIZON=2,
                                   SAMPLED_PATH_NUM=5,
                                   dynamics_model_train_iter=10),
        name=name + '_mpc',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy'))
    algo.set_terminal_reward_function_for_dynamics_env(
        reward_func=RandomRewardFunc(name='reward_func'),
        terminal_func=RandomTerminalFunc(name='random_terminal'),
    )
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict={
            "TEST_EVERY_SAMPLE_COUNT": 10,
            "TRAIN_EVERY_SAMPLE_COUNT": 10,
            "START_TRAIN_AFTER_SAMPLE_COUNT": 5,
            "START_TEST_AFTER_SAMPLE_COUNT": 5,
        },
        func_dict={
            'test': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=10),
            },
            'train': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(),
            },
            'sample': {
                'func':
                agent.sample,
                'args':
                list(),
                'kwargs':
                dict(sample_count=100,
                     env=agent.env,
                     in_which_status='TRAIN',
                     store_flag=True),
            },
        })
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
Esempio n. 7
0
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = env.env_spec
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(env_spec=env_spec,
                                                name_scope=name + '_mlp_dyna',
                                                name=name + '_mlp_dyna',
                                                learning_rate=0.01,
                                                mlp_config=[{
                                                    "ACT":
                                                    "TANH",
                                                    "B_INIT_VALUE":
                                                    0.0,
                                                    "NAME":
                                                    "1",
                                                    "L1_NORM":
                                                    0.0,
                                                    "L2_NORM":
                                                    0.0,
                                                    "N_UNITS":
                                                    128,
                                                    "TYPE":
                                                    "DENSE",
                                                    "W_NORMAL_STDDEV":
                                                    0.03
                                                }, {
                                                    "ACT":
                                                    "LINEAR",
                                                    "B_INIT_VALUE":
                                                    0.0,
                                                    "NAME":
                                                    "OUPTUT",
                                                    "L1_NORM":
                                                    0.0,
                                                    "L2_NORM":
                                                    0.0,
                                                    "N_UNITS":
                                                    env_spec.flat_obs_dim,
                                                    "TYPE":
                                                    "DENSE",
                                                    "W_NORMAL_STDDEV":
                                                    0.03
                                                }])
    algo = ModelPredictiveControl(
        dynamics_model=mlp_dyna,
        env_spec=env_spec,
        config_or_config_dict=dict(SAMPLED_HORIZON=2,
                                   SAMPLED_PATH_NUM=5,
                                   dynamics_model_train_iter=10),
        name=name + '_mpc',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy'))
    algo.set_terminal_reward_function_for_dynamics_env(
        reward_func=RandomRewardFunc(name='reward_func'),
        terminal_func=RandomTerminalFunc(name='random_terminal'),
    )
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
    flow = create_train_test_flow(test_every_sample_count=10,
                                  train_every_sample_count=10,
                                  start_test_after_sample_count=5,
                                  start_train_after_sample_count=5,
                                  train_func_and_args=(agent.train, (),
                                                       dict()),
                                  test_func_and_args=(agent.test, (),
                                                      dict(sample_count=10)),
                                  sample_func_and_args=(agent.sample, (),
                                                        dict(sample_count=100,
                                                             env=agent.env,
                                                             store_flag=True)))
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
Esempio n. 8
0
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[{
                                        "ACT": "RELU",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "1",
                                        "N_UNITS": 16,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }, {
                                        "ACT": "LINEAR",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "OUPTUT",
                                        "N_UNITS": env_spec.flat_action_dim,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)

    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        learning_rate=0.01,
        state_input_scaler=RunningStandardScaler(dims=env_spec.flat_obs_dim),
        action_input_scaler=RunningStandardScaler(
            dims=env_spec.flat_action_dim),
        output_delta_state_scaler=RunningStandardScaler(
            dims=env_spec.flat_obs_dim),
        mlp_config=[{
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }, {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": env_spec.flat_obs_dim,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }])
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    # For examples only, we use random reward function and terminal function with fixed episode length.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=RandomRewardFunc())
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()
            ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20,
            after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))

    flow = create_dyna_flow(
        train_algo_func=(agent.train, (), dict(state='state_agent_training')),
        train_algo_from_synthesized_data_func=(
            agent.train, (), dict(state='state_agent_training')),
        train_dynamics_func=(agent.train, (),
                             dict(state='state_dynamics_training')),
        test_algo_func=(agent.test, (), dict(sample_count=10)),
        test_dynamics_func=(agent.algo.test_dynamics, (),
                            dict(sample_count=10, env=env)),
        sample_from_real_env_func=(agent.sample, (),
                                   dict(sample_count=10,
                                        env=agent.env,
                                        in_which_status='TRAIN',
                                        store_flag=True)),
        sample_from_dynamics_env_func=(agent.sample, (),
                                       dict(sample_count=10,
                                            env=agent.algo.dynamics_env,
                                            in_which_status='TRAIN',
                                            store_flag=True)),
        train_algo_every_real_sample_count_by_data_from_real_env=10,
        train_algo_every_real_sample_count_by_data_from_dynamics_env=10,
        test_algo_every_real_sample_count=10,
        test_dynamics_every_real_sample_count=10,
        train_dynamics_ever_real_sample_count=10,
        start_train_algo_after_sample_count=1,
        start_train_dynamics_after_sample_count=1,
        start_test_algo_after_sample_count=1,
        start_test_dynamics_after_sample_count=1,
        warm_up_dynamics_samples=1)

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + '_exp')
    experiment.run()
Esempio n. 9
0
def task_fn():
    # create the gym environment by make function
    env = make('Pendulum-v0')
    # give your experiment a name which is used to generate the log path etc.
    name = 'demo_exp'
    # construct the environment specification
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    # construct the neural network to approximate q function of DDPG
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    # construct the neural network to approximate policy for DDPG
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[{
                                        "ACT": "RELU",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "1",
                                        "N_UNITS": 16,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }, {
                                        "ACT": "LINEAR",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "OUPTUT",
                                        "N_UNITS": env_spec.flat_action_dim,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }],
                                    reuse=False)
    # construct the DDPG algorithms
    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)
    # construct a neural network based global dynamics model to approximate the state transition of environment
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        learning_rate=0.01,
        state_input_scaler=RunningStandardScaler(dims=env_spec.flat_obs_dim),
        action_input_scaler=RunningStandardScaler(
            dims=env_spec.flat_action_dim),
        output_delta_state_scaler=RunningStandardScaler(
            dims=env_spec.flat_obs_dim),
        mlp_config=[{
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }, {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": env_spec.flat_obs_dim,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }])
    # finally, construct the Dyna algorithms with a model free algorithm DDGP, and a NN model.
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    # To make the NN based dynamics model a proper environment so be a sampling source for DDPG, reward function and
    # terminal function need to be set.

    # For examples only, we use random reward function and terminal function with fixed episode length.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=RandomRewardFunc())
    # construct agent with additional exploration strategy if needed.
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()
            ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20,
            after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
    # construct the training flow, called Dyna flow. It defines how the training proceed, and the terminal condition
    flow = create_dyna_flow(
        train_algo_func=(agent.train, (), dict(state='state_agent_training')),
        train_algo_from_synthesized_data_func=(
            agent.train, (), dict(state='state_agent_training')),
        train_dynamics_func=(agent.train, (),
                             dict(state='state_dynamics_training')),
        test_algo_func=(agent.test, (), dict(sample_count=1)),
        test_dynamics_func=(agent.algo.test_dynamics, (),
                            dict(sample_count=10, env=env)),
        sample_from_real_env_func=(agent.sample, (),
                                   dict(sample_count=10,
                                        env=agent.env,
                                        store_flag=True)),
        sample_from_dynamics_env_func=(agent.sample, (),
                                       dict(sample_count=10,
                                            env=agent.algo.dynamics_env,
                                            store_flag=True)),
        train_algo_every_real_sample_count_by_data_from_real_env=40,
        train_algo_every_real_sample_count_by_data_from_dynamics_env=40,
        test_algo_every_real_sample_count=40,
        test_dynamics_every_real_sample_count=40,
        train_dynamics_ever_real_sample_count=20,
        start_train_algo_after_sample_count=1,
        start_train_dynamics_after_sample_count=1,
        start_test_algo_after_sample_count=1,
        start_test_dynamics_after_sample_count=1,
        warm_up_dynamics_samples=1)
    # construct the experiment
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + '_exp')
    # run!
    experiment.run()