Example #1
    def test_dagger_1(self):
        mlp_dyna, local = self.create_continue_dynamics_model(
            env_id='ModifiedHalfCheetah', name='mlp_dyna_model')
        mlp_dyna.init()
        env = local['env']
        assert isinstance(env, ModifiedHalfCheetahEnv)
        env_spec = env.env_spec

        random_buffer = MPC_TransitionData(env_spec=env_spec,
                                           obs_shape=env_spec.obs_shape,
                                           action_shape=env_spec.action_shape,
                                           size=5)
        rl_buffer = MPC_TransitionData(env_spec=env_spec,
                                       obs_shape=env_spec.obs_shape,
                                       action_shape=env_spec.action_shape,
                                       size=10)

        obs = np.zeros((18,), dtype=np.float32)
        for i in range(5):
            act = np.zeros((6,), dtype=np.float32)
            obs_, reward, done, _ = np.zeros((18,), dtype=np.float32), 0., False, 0.
            random_buffer.append(obs, act, obs_, done, reward)
            obs = obs_
        dagger_buffer = random_buffer.union(rl_buffer, rand_rl_ratio=0.1)
        self.assertEqual(dagger_buffer, random_buffer)

        obs = env.reset()
        for i in range(10):
            act = self.RandomController_get_action(env=env, state=obs)
            obs_, reward, done, _ = env.step(act)
            rl_buffer.append(obs, act, obs_, done, reward)
            assert not done
            obs = obs_

        dagger_buffer = random_buffer.union(rl_buffer, rand_rl_ratio=0.1)
        self.assertEqual(len(dagger_buffer), 11)
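
The assertions above pin down the behaviour this test expects from MPC_TransitionData.union: when the on-policy buffer is empty the union is simply the random buffer, and with 10 on-policy transitions and rand_rl_ratio=0.1 the result holds 10 + 1 = 11 transitions. The snippet below is a minimal sketch of that mixing rule in plain Python (a hypothetical mix_buffers helper, not Baconian's implementation), assuming the ratio controls how many random samples are appended per on-policy sample.

    def mix_buffers(random_samples, rl_samples, rand_rl_ratio):
        # Hypothetical sketch of the mixing rule implied by the assertions above,
        # not Baconian's actual union() implementation.
        if not rl_samples:                       # no on-policy data yet -> random data only
            return list(random_samples)
        n_random = int(rand_rl_ratio * len(rl_samples))
        return list(rl_samples) + list(random_samples[:n_random])

    # 5 random / 0 on-policy samples  -> the random data alone
    # 5 random / 10 on-policy samples -> 10 + int(0.1 * 10) = 11 entries
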
Example #2
    def test_algo_ModelBasedModelPredictiveControl(self):
        '''
        Test algo = ModelBasedModelPredictiveControl().
        '''
        name = 'mbmpc'
        mlp_dyna, local = self.create_continue_dynamics_model(
            env_id='ModifiedHalfCheetah', name='mlp_dyna_model')
        env = local['env']
        env_spec = env.env_spec

        policy = UniformRandomPolicy(env_spec=env_spec, name='urp')

        algo = ModelBasedModelPredictiveControl(
            dynamics_model=mlp_dyna,
            env_spec=env_spec,
            config_or_config_dict=dict(SAMPLED_HORIZON=20,
                                       SAMPLED_PATH_NUM=50,
                                       dynamics_model_train_iter=10),
            name=name,
            policy=policy)

        algo.set_terminal_reward_function_for_dynamics_env(
            reward_func=MBMPC_HalfCheetah_CostFunc(name='cost_fn'),
            terminal_func=MBMPC_HalfCheetah_CostFunc(name='terminal_fn'))
        algo.init()

        num_trajectory = 1  # default 10
        max_step = 50  # default 1000
        on_policy_iter = 10
        batch_size = 16
        num_simulated_paths = 100  # default 1000

        random_buffer_size = 500  # default 1000
        rl_buffer_size = 500  # default 1000
        random_buffer = MPC_TransitionData(env_spec=env_spec,
                                           obs_shape=env_spec.obs_shape,
                                           action_shape=env_spec.action_shape,
                                           size=max_step)
        rl_buffer = MPC_TransitionData(env_spec=env_spec,
                                       obs_shape=env_spec.obs_shape,
                                       action_shape=env_spec.action_shape,
                                       size=max_step)

        print("====> Prepare random_buffer")
        random_buffer = self.random_buffer_sample(env, random_buffer,
                                                  num_trajectory, max_step)
        normalized_random_buffer, mean_dict, var_dict = random_buffer.apply_normalization()

        print("====> Start Training")
        for iter in range(on_policy_iter):
            data = normalized_random_buffer.union(rl_buffer, rand_rl_ratio=0.1)
            batch_data_list = data.sample_batch_as_Transition(
                batch_size=batch_size, shuffle_flag=True, all_as_batch=True)
            for batch_data in batch_data_list:
                print(algo.train(batch_data=batch_data, train_iter=10))
            rl_buffer = self.rl_buffer_sample_in_algo(algo, env, rl_buffer,
                                                      num_trajectory, max_step,
                                                      num_simulated_paths,
                                                      iter)
Example #3
    def test_dagger_3(self):
        '''
        MPC Training
        Add normalization and denormalization
        '''
        mlp_dyna, local = self.create_continue_dynamics_model(
            env_id='ModifiedHalfCheetah', name='mlp_dyna_model')
        mlp_dyna.init()
        print(mlp_dyna.state_input_scaler)

        env = local['env']
        assert isinstance(env, ModifiedHalfCheetahEnv)
        env_spec = env.env_spec

        num_trajectory = 1  # default 10
        max_step = 50  # default 1000
        on_policy_iter = 10
        train_iter = 10
        batch_size = 16
        rand_rl_ratio = 0.1
        num_simulated_paths = 100  # default 1000

        random_buffer_size = 500  # default 1000
        rl_buffer_size = 500  # default 1000

        random_buffer = MPC_TransitionData(env_spec=env_spec,
                                           obs_shape=env_spec.obs_shape,
                                           action_shape=env_spec.action_shape,
                                           size=max_step)
        rl_buffer = MPC_TransitionData(env_spec=env_spec,
                                       obs_shape=env_spec.obs_shape,
                                       action_shape=env_spec.action_shape,
                                       size=max_step)

        print("====> Prepare random_buffer")
        random_buffer = self.random_buffer_sample(env, random_buffer,
                                                  num_trajectory, max_step)
        normalized_random_buffer, mean_dict, var_dict = random_buffer.apply_normalization()

        print("====> Start Training")
        for iter in range(on_policy_iter):
            data = normalized_random_buffer.union(rl_buffer,
                                                  rand_rl_ratio=rand_rl_ratio)
            batch_data_list = data.sample_batch_as_Transition(
                batch_size=batch_size, shuffle_flag=True, all_as_batch=True)
            for batch_data in batch_data_list:
                print(
                    mlp_dyna.train(batch_data=batch_data,
                                   train_iter=train_iter))
            rl_buffer = self.rl_buffer_sample(mlp_dyna, env, rl_buffer,
                                              num_trajectory, max_step,
                                              num_simulated_paths, iter)
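
apply_normalization() also hands back the per-dimension statistics (mean_dict, var_dict) needed to map normalized model predictions back to raw observation scale, even though this test only consumes the normalized buffer. Below is a hedged NumPy sketch of that inverse step; the key name 'state_set' and the exact layout of mean_dict/var_dict are assumptions for illustration, not taken from Baconian.

    import numpy as np

    def denormalize(normalized_obs, mean_dict, var_dict, key='state_set'):
        # Sketch: undo z-score normalization for one field of the buffer,
        # assuming mean_dict/var_dict hold per-dimension mean and variance.
        mean = np.asarray(mean_dict[key])
        std = np.sqrt(np.asarray(var_dict[key]))
        return normalized_obs * std + mean
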
Example #4
    def __init__(self,
                 train_sample_count_func,
                 config_or_config_dict: (DictConfig, dict),
                 func_dict: dict
                 ):
        super(MBMPC_TrainFlow, self).__init__(func_dict=func_dict)
        config = construct_dict_config(config_or_config_dict, obj=self)
        self.parameters = Parameters(source_config=config, parameters=dict())  # hyperparameter instance
        if train_sample_count_func:
            assert callable(train_sample_count_func)    # return TOTAL_AGENT_TRAIN_SAMPLE_COUNT

        from baconian.common.sampler.sample_data import MPC_TransitionData
        self.env = self.parameters('env')
        self.env_spec = self.env.env_spec
        env_spec = self.env_spec
        self.random_buffer = MPC_TransitionData(env_spec=env_spec,
                                                obs_shape=env_spec.obs_shape,
                                                action_shape=env_spec.action_shape,
                                                size=self.parameters('random_size'))
        self.rl_buffer = MPC_TransitionData(env_spec=env_spec,
                                            obs_shape=env_spec.obs_shape,
                                            action_shape=env_spec.action_shape,
                                            size=self.parameters('rl_size'))
Example #5
    def test_agent_MPC(self):
        '''
        Test the MPC algorithm encapsulated in an agent.
        '''

        name = 'mb_mpc'
        mlp_dyna, local = self.create_continue_dynamics_model(
            env_id='ModifiedHalfCheetah', name='mlp_dyna_model')
        env = local['env']
        env_spec = env.env_spec

        policy = UniformRandomPolicy(env_spec=env_spec, name='urp')

        algo = ModelBasedModelPredictiveControl(
            dynamics_model=mlp_dyna,
            env_spec=env_spec,
            config_or_config_dict=dict(SAMPLED_HORIZON=20,
                                       SAMPLED_PATH_NUM=50,
                                       dynamics_model_train_iter=10),
            name=name,
            policy=policy)

        algo.set_terminal_reward_function_for_dynamics_env(
            reward_func=MBMPC_HalfCheetah_CostFunc(name='cost_fn'),
            terminal_func=MBMPC_HalfCheetah_CostFunc(name='terminal_fn'))
        agent = MB_MPC_Agent(name=name + '_agent',
                             env=env,
                             env_spec=env_spec,
                             algo=algo,
                             exploration_strategy=None,
                             algo_saving_scheduler=None)
        agent.init()

        num_trajectory = 1  # default 10
        max_step = 50  # default 1000
        on_policy_iter = 10
        batch_size = 10
        num_simulated_paths = 100  # default 1000

        # TODO: 9.22 Is there a relation between buffer_size and max_step?
        #  Besides, why multiply num_simulated_paths?

        random_size = 500  # default 1000
        rl_size = 500  # default 1000
        random_size = max_step
        rl_size = max_step

        random_buffer = MPC_TransitionData(env_spec=env_spec,
                                           obs_shape=env_spec.obs_shape,
                                           action_shape=env_spec.action_shape,
                                           size=random_size)
        rl_buffer = MPC_TransitionData(env_spec=env_spec,
                                       obs_shape=env_spec.obs_shape,
                                       action_shape=env_spec.action_shape,
                                       size=max_step)

        print("====> Prepare random_buffer")
        random_buffer = self.random_buffer_sample(env, random_buffer,
                                                  num_trajectory, max_step)
        normalized_random_buffer, mean_dict, var_dict = random_buffer.apply_normalization()

        print("====> Start Training")
        for iter in range(on_policy_iter):
            data = normalized_random_buffer.union(rl_buffer, rand_rl_ratio=0.1)
            batch_data_list = data.sample_batch_as_Transition(
                batch_size=batch_size, shuffle_flag=True, all_as_batch=True)
            for batch_data in batch_data_list:
                agent.train(batch_data=batch_data, train_iter=60)

            rl_buffer = agent.sample(env=env,
                                     sample_count=iter,
                                     buffer=rl_buffer,
                                     num_trajectory=num_trajectory,
                                     max_step=max_step,
                                     num_simulated_paths=num_simulated_paths,
                                     in_which_status='TRAIN',
                                     store_flag=False)
Example #6
class MBMPC_TrainFlow(Flow):
    '''
    TrainFlow for MBMPC.
    '''
    required_func = ('train', 'sample')
    required_key_dict = {
        "TRAIN_EVERY_SAMPLE_COUNT": 1,
        "TEST_EVERY_SAMPLE_COUNT": None,
        "START_TRAIN_AFTER_SAMPLE_COUNT": None,
        "START_TEST_AFTER_SAMPLE_COUNT": None,
    }

    def __init__(self,
                 train_sample_count_func,
                 config_or_config_dict: (DictConfig, dict),
                 func_dict: dict
                 ):
        super(MBMPC_TrainFlow, self).__init__(func_dict=func_dict)
        config = construct_dict_config(config_or_config_dict, obj=self)
        self.parameters = Parameters(source_config=config, parameters=dict())  # hyperparameter instance
        if train_sample_count_func:
            assert callable(train_sample_count_func)    # return TOTAL_AGENT_TRAIN_SAMPLE_COUNT

        from baconian.common.sampler.sample_data import MPC_TransitionData
        self.env = self.parameters('env')
        self.env_spec = self.env.env_spec
        env_spec = self.env_spec
        self.random_buffer = MPC_TransitionData(env_spec=env_spec,
                                                obs_shape=env_spec.obs_shape,
                                                action_shape=env_spec.action_shape,
                                                size=self.parameters('random_size'))
        self.rl_buffer = MPC_TransitionData(env_spec=env_spec,
                                            obs_shape=env_spec.obs_shape,
                                            action_shape=env_spec.action_shape,
                                            size=self.parameters('rl_size'))

    def _launch(self) -> bool:
        env = self.parameters('env')
        max_step = self.parameters('max_step')
        train_iter = self.parameters('train_iter')
        batch_size = self.parameters('batch_size')
        rand_rl_ratio = self.parameters('rand_rl_ratio')
        random_trajectory = self.parameters('random_trajectory')
        on_policy_trajectory = self.parameters('on_policy_trajectory')
        on_policy_iter = self.parameters('on_policy_iter')
        num_simulated_paths = self.parameters('num_simulated_paths')

        print("====> Preprocessing Data")   # normalization
        self.random_buffer, self.mean_dict, self.var_dict = self.data_preprocess(env, self.random_buffer, random_trajectory, max_step)

        print("====> Start Training")
        for iter in range(on_policy_iter):
            data = self.random_buffer.union(self.rl_buffer, rand_rl_ratio=rand_rl_ratio)
            batch_data_list = data.sample_batch_as_Transition(batch_size=batch_size, shuffle_flag=True, all_as_batch=True)
            for batch_data in batch_data_list:
                self._call_func('train', batch_data=batch_data) # dyna_epoch defined in agent
            self.rl_buffer = self._call_func('sample',
                                             env=env,
                                             sample_count=iter,
                                             buffer=self.rl_buffer,
                                             num_trajectory=on_policy_trajectory,
                                             max_step=max_step,
                                             num_simulated_paths=num_simulated_paths,
                                             in_which_status='TRAIN',
                                             store_flag=False)
        return True


    def random_buffer_sample(self, env, buffer, num_trajectory, max_step):
        '''RandomController.sample()'''
        for i in range(num_trajectory):
            obs = env.reset()
            ep_len = 0
            for j in range(max_step):
                act = self.env.action_space.sample()
                obs_, rew, done, _ = env.step(act)
                buffer.append(obs, act, obs_, done, rew)
                if done:
                    break
                else:
                    obs = obs_
        return buffer

    def data_preprocess(self, env, buffer, num_trajectory, max_step):
        buffer = self.random_buffer_sample(env, buffer, num_trajectory, max_step)
        normalized_buffer, mean_dict, var_dict = buffer.apply_normalization()
        return normalized_buffer, mean_dict, var_dict
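
Taken together, required_key_dict and the self.parameters(...) lookups in __init__ and _launch spell out roughly what the config has to carry, and required_func says func_dict must provide callables for 'train' and 'sample'. The wiring sketch below is illustrative only: the func_dict entry format (assumed here to be {'func': ..., 'args': ..., 'kwargs': ...}, as in Baconian's other flows), the launch() call, and every concrete value are assumptions rather than code from these examples.

    # Hypothetical wiring sketch, not taken verbatim from the examples above.
    flow = MBMPC_TrainFlow(
        train_sample_count_func=lambda: agent.total_train_sample_count,  # assumed counter
        config_or_config_dict=dict(
            TRAIN_EVERY_SAMPLE_COUNT=1,
            TEST_EVERY_SAMPLE_COUNT=1,
            START_TRAIN_AFTER_SAMPLE_COUNT=0,
            START_TEST_AFTER_SAMPLE_COUNT=0,
            env=env,                                   # read back via self.parameters('env')
            random_size=500, rl_size=500,              # buffer sizes used in __init__
            max_step=50, train_iter=10, batch_size=16,
            rand_rl_ratio=0.1,
            random_trajectory=1, on_policy_trajectory=1,
            on_policy_iter=10, num_simulated_paths=100,
        ),
        func_dict={
            'train': {'func': agent.train, 'args': (), 'kwargs': dict()},    # assumed entry format
            'sample': {'func': agent.sample, 'args': (), 'kwargs': dict()},
        })
    flow.launch()   # assuming the Flow base class exposes launch(), which runs _launch()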