Esempio n. 1
0
def pendulum_task_fn():
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_dyna = ContinuousMLPGlobalDynamicsModel(env_spec=env_spec,
                                                name_scope=name + '_mlp_dyna',
                                                name=name + '_mlp_dyna',
                                                **exp_config['DynamicsModel'])
    dyna_env = DynamicsEnvWrapper(mlp_dyna)
    dyna_env.set_terminal_reward_func(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=dyna_env.total_step_count_fn),
        reward_func=REWARD_FUNC_DICT['Pendulum-v0']())

    policy = iLQRPolicy(env_spec=env_spec,
                        **exp_config['ILQR'],
                        dynamics=dyna_env,
                        cost_fn=RewardFuncCostWrapper(
                            reward_func=REWARD_FUNC_DICT['Pendulum-v0']()))

    algo = iLQRAlogWrapper(policy=policy,
                           env_spec=env_spec,
                           dynamics_env=dyna_env)

    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  exploration_strategy=None,
                  noise_adder=None,
                  name=name + '_agent')

    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict={
            'train_dynamics': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_dynamics_training')
            },
            'train_algo': None,
            'test_algo': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=1)
            },
            'test_dynamics': {
                'func': agent.algo.test_dynamics,
                'args': list(),
                'kwargs': dict(sample_count=100, env=env)
            },
            'sample_from_real_env': {
                'func':
                agent.sample,
                'args':
                list(),
                'kwargs':
                dict(sample_count=10,
                     env=agent.env,
                     in_which_status='TRAIN',
                     store_flag=True)
            },
            'sample_from_dynamics_env': None,
            'train_algo_from_synthesized_data': None
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
Esempio n. 2
0
                   action_space=env.action_space)
data = TransitionData(env_spec=env_spec)
policy = UniformRandomPolicy(env_spec=env_spec)
# Do some initial sampling here to train GP model
st = env.reset()
for i in range(100):
    ac = policy.forward(st)
    new_st, re, _, _ = env.step(ac)
    data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
    st = new_st

gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=data)
gp.init()
gp.train()

dyna_env = DynamicsEnvWrapper(dynamics=gp)
# Since we only care about the prediction here, so we pass the terminal function and reward function setting with
# random one
dyna_env.set_terminal_reward_func(terminal_func=RandomTerminalFunc(),
                                  reward_func=RandomRewardFunc())

st = env.reset()
real_state_list = []
dynamics_state_list = []
test_sample_count = 100
for i in range(test_sample_count):
    ac = env_spec.action_space.sample()
    gp.reset_state(state=st)
    new_state_dynamics, _, _, _ = dyna_env.step(action=ac, allow_clip=True)
    new_state_real, _, done, _ = env.step(action=ac)
    real_state_list.append(new_state_real)
Esempio n. 3
0
 def return_as_env(self) -> Env:
     return DynamicsEnvWrapper(dynamics=self, name=self._name + '_env')