Example #1
# Minimal skeleton of a custom environment written against the pre-0.9 gym
# API, where the public reset()/step() entry points dispatch to the
# underscore-prefixed hooks below.
import numpy as np
from gym import Env
from gym.spaces import Box


class MyEnvironment(Env):
    def __init__(self):
        self.action_space = Box(low=np.array([-1.]), high=np.array([1.]))
        self.observation_space = Box(low=np.array([-1.]), high=np.array([1.]))
        self.reward_range = (-1., 1.)

    def _step(self, action):
        # the old gym API expects an (observation, reward, done, info) tuple
        return self.observation_space.sample(), 0., False, {}

    def _reset(self):
        return self.observation_space.sample()

    def _render(self, mode='human', close=False):
        pass

    def _configure(self):
        pass

    def _seed(self, seed=None):
        pass
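
A minimal usage sketch for the skeleton above, assuming a pre-0.9 gym where
env.reset() and env.step() dispatch to _reset() and _step(); the rollout
length of 10 is arbitrary:

# Hypothetical driver loop for MyEnvironment (old gym API assumed).
env = MyEnvironment()
obs = env.reset()
for _ in range(10):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
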
# BaselineTrainerEnv coordinates training of a learned dynamics ("cyber")
# environment and of a target agent against the real environment.
# BasicEnv, Config, CONFIG_KEY, RandomAgent, Sampler, BaselineTrainerEnvStep
# and DynamicsEnvironmentMemory come from the surrounding project; Box
# (gym.spaces), deque (collections), numpy as np and tensorflow as tf are the
# external dependencies used below.
class BaselineTrainerEnv(BasicEnv):
    key_list = Config.load_json(file_path=CONFIG_KEY +
                                '/baselineTrainerEnvKey.json')

    def __init__(self, config, cyber_env, real_env, target_agent, test_env):
        super(BaselineTrainerEnv, self).__init__(config=config)
        self.action_space = Box(low=0,
                                high=1e10,
                                shape=np.array(
                                    self.config.config_dict['ACTION_SPACE']))

        self.observation_space = Box(
            low=-1e10,
            high=1e10,
            shape=np.array(self.config.config_dict['STATE_SPACE']))

        self.cyber_env = cyber_env
        self.real_env = real_env
        self.target_agent = target_agent
        self.test_env = test_env
        self.random_agent = RandomAgent(config=None,
                                        env=test_env,
                                        sampler=Sampler())
        self.stepper = BaselineTrainerEnvStep(
            config=None,
            registred_type=self.config.config_dict['BASELINE_ENV_STEP_TYPE'])

        self.real_env_sample_memory = DynamicsEnvironmentMemory()

        self.target_agent_real_env_reward_deque = \
            deque(maxlen=self.config.config_dict['TARGET_AGENT_REAL_ENV_REWARD_QUEUE_MAX_LENGTH'])

        self.target_agent_cyber_env_reward_deque = \
            deque(maxlen=self.config.config_dict['TARGET_AGENT_CYBER_ENV_REWARD_QUEUE_MAX_LENGTH'])
        self.dyna_error_deque = \
            deque(maxlen=self.config.config_dict['TARGET_AGENT_CYBER_ENV_REWARD_QUEUE_MAX_LENGTH'])

        self.critic_change = 0.
        self.actor_change = 0.
        self.critic_loss = 0.
        self.actor_loss = 0.
        self.sample_count = 0
        self.last_test = 0
        self.last_train = 0

    def step(self, action):
        return self.stepper.step(env=self, action=action)

    def reset(self):
        super().reset()
        return self.observation_space.sample()

    def init(self):
        self.cyber_env.init()
        self.cyber_env.reset()
        if hasattr(self.real_env, 'init') and callable(self.real_env.init):
            self.real_env.init()
        if hasattr(self.test_env, 'init') and callable(self.test_env.init):
            self.test_env.init()
        self.real_env.reset()
        self.test_env.reset()
        self.target_agent.init()
        self.init_train()
        super().init()

    def init_train(self):
        # Warm-up: collect random real-environment samples, fit the learned
        # dynamics (cyber) model on them, then calibrate the target agent's
        # observation scaling from a short on-policy rollout.
        print("\nInit train----------------------")
        self.random_agent.sampler.env_status = self.random_agent.sampler.config.config_dict[
            'REAL_ENVIRONMENT_STATUS']
        sample_data = self.random_agent.sample(
            env=self.real_env,
            sample_count=self.config.config_dict['CYBER_INIT_TRAIN_SAMPLE_COUNT'],
            store_flag=False,
            agent_print_log_flag=False)
        for j in range(len(sample_data.state_set)):
            data_dict = {
                'obs0': sample_data.state_set[j],
                'obs1': sample_data.new_state_set[j],
                'action': sample_data.action_set[j],
                'reward': sample_data.reward_set[j],
                'terminal1': sample_data.done_set[j],
                'delta_state': sample_data.new_state_set[j] - sample_data.state_set[j]
            }
            self.real_env_sample_memory.append(data_dict)
        self.cyber_env.model.update_mean_var(
            state_input=np.array(sample_data.state_set),
            action_input=np.array(sample_data.action_set),
            delta_state_label=np.array(sample_data.new_state_set) -
            np.array(sample_data.state_set))
        for i in range(self.config.config_dict['DYNAMICS_TRAIN_ITERATION']):
            data = self.real_env_sample_memory.sample(
                batch_size=self.cyber_env.model.config.config_dict['BATCH_SIZE'])
            self.cyber_env.status = self.status_key['TRAIN']
            self.cyber_env.fit(state_set=data['obs0'],
                               action_set=data['action'],
                               delta_state_label_set=data['delta_state'],  # memory stores this field as 'delta_state'
                               sess=tf.get_default_session())
            self.cyber_env.print_log_queue(status=self.status_key['TRAIN'])
        self.real_env.reset()
        sample_data = self.target_agent.sample(env=self.real_env,
                                               sample_count=250,
                                               store_flag=False,
                                               agent_print_log_flag=False)
        unscaled_data = np.concatenate([[state.tolist() + [0]]
                                        for state in sample_data.state_set],
                                       axis=0)
        self.target_agent.model.update_scale(unscaled_data=unscaled_data)
        self.real_env.reset()

    def close(self):
        print('Close the trainer environment')

    def configure(self):
        pass

    def seed(self, seed=None):
        pass

    def get_state(self, env):
        return 0.

    def _sample_from_real_env(self, sample_count, sample_step):
        # Roll out the target agent in the real environment, store the
        # transitions in the dynamics memory and refresh the dynamics model's
        # input normalisation statistics.
        self.target_agent.status = self.target_agent.status_key['TRAIN']
        self.target_agent.env_status = self.target_agent.config.config_dict[
            'REAL_ENVIRONMENT_STATUS']
        real_reward_data_this_step = []
        for i in range(sample_step):
            sample_data = self.target_agent.sample(env=self.real_env,
                                                   sample_count=sample_count,
                                                   store_flag=True,
                                                   agent_print_log_flag=True)

            for j in range(len(sample_data.state_set)):
                real_reward_data_this_step.append(sample_data.reward_set[j])

                data_dict = {
                    'obs0': sample_data.state_set[j],
                    'obs1': sample_data.new_state_set[j],
                    'action': sample_data.action_set[j],
                    'reward': sample_data.reward_set[j],
                    'terminal1': sample_data.done_set[j],
                    'delta_state': sample_data.new_state_set[j] - sample_data.state_set[j]
                }
                self.real_env_sample_memory.append(data_dict)
            self.cyber_env.model.update_mean_var(
                state_input=np.array(sample_data.state_set),
                action_input=np.array(sample_data.action_set),
                delta_state_label=np.array(sample_data.new_state_set) -
                np.array(sample_data.state_set))
        self.target_agent_real_env_reward_deque.append(
            np.mean(real_reward_data_this_step))

    def _sample_from_cyber_env(self, sample_count, sample_step):
        # Roll out the target agent in the learned (cyber) environment and
        # track its average reward.
        cyber_reward_data_this_step = []
        self.target_agent.env_status = self.target_agent.config.config_dict[
            'CYBER_ENVIRONMENT_STATUS']
        self.target_agent.status = self.target_agent.status_key['TRAIN']

        for i in range(sample_step):
            sample_data = self.target_agent.sample(env=self.cyber_env,
                                                   sample_count=sample_count,
                                                   store_flag=True,
                                                   agent_print_log_flag=True)
            for j in range(len(sample_data.state_set)):
                cyber_reward_data_this_step.append(sample_data.reward_set[j])
        self.target_agent_cyber_env_reward_deque.append(
            np.mean(cyber_reward_data_this_step))
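
A hedged wiring sketch for the trainer environment above. Only the constructor
signature and the init()/reset()/step() calls come from the class itself; the
make_* factory helpers and trainer_config are placeholders for whatever the
surrounding project provides, and the step() return format is defined by the
configured BaselineTrainerEnvStep:

# Hypothetical setup of BaselineTrainerEnv; every make_* helper and
# trainer_config is a placeholder, not part of the project API.
real_env = make_real_env()          # placeholder real environment
test_env = make_real_env()          # placeholder evaluation environment
cyber_env = make_cyber_env()        # placeholder learned dynamics environment
target_agent = make_target_agent()  # placeholder agent under training
trainer_env = BaselineTrainerEnv(config=trainer_config,
                                 cyber_env=cyber_env,
                                 real_env=real_env,
                                 target_agent=target_agent,
                                 test_env=test_env)
trainer_env.init()   # also runs the init_train() warm-up sampling and fitting
obs = trainer_env.reset()
for _ in range(10):
    # the stepper decides how the action is interpreted and what is returned
    result = trainer_env.step(trainer_env.action_space.sample())
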
Example #3
# DynamicsEnv wraps a learned dynamics model as a gym-style environment:
# step() predicts the next state with the model instead of stepping a
# simulator. BasicEnv, Config, CONFIG_KEY and model_util are project modules;
# Box (gym.spaces), numpy as np and copy are the external dependencies used
# below.
class DynamicsEnv(BasicEnv):
    key_list = Config.load_json(file_path=CONFIG_KEY + '/dynamicsEnvKey.json')

    # TODO Modify cost function

    def __init__(self, config, model, sess, max_episode_steps=1000, init_env=None, cost=None, done=None, reset=None):
        super(DynamicsEnv, self).__init__(config=config)

        # Optional callables supplied by the caller: cost(state, action, next_state)
        # is required before step() is used; done and reset fall back to a
        # horizon-only termination check and a random-state reset respectively.
        self.cost_fn = cost
        self.done_fn = done
        self.reset_fn = reset

        if init_env:
            self.action_space = copy.deepcopy(init_env.action_space)
            self.observation_space = copy.deepcopy(init_env.observation_space)
        else:
            low = np.array(
                [self.config.config_dict['ACTION_LOW'] for i in range(self.config.config_dict['ACTION_SPACE'][0])])
            high = np.array(
                [self.config.config_dict['ACTION_HIGH'] for i in range(self.config.config_dict['ACTION_SPACE'][0])])
            self.action_space = Box(low, high)

            low = np.array(
                [self.config.config_dict['STATE_LOW'] for i in range(self.config.config_dict['STATE_SPACE'][0])])
            high = np.array(
                [self.config.config_dict['STATE_HIGH'] for i in range(self.config.config_dict['STATE_SPACE'][0])])
            self.observation_space = Box(low, high)

        self.model = model
        self.sess = sess
        # 'cfg' is assumed to be a module-level experiment config loaded elsewhere.
        if 'SWIMMER_HORIZON' in cfg.config_dict:
            self.config.config_dict["MAX_SAMPLE_HORIZON"] = cfg.config_dict['SWIMMER_HORIZON']
        self._max_episode_steps = self.config.config_dict["MAX_SAMPLE_HORIZON"]

        self._max_episode_steps_fixed = self._max_episode_steps
        self._elapsed_steps = 0
        self.state = self.reset()

    @property
    def status(self):
        return self._status

    @property
    def max_step(self):
        return self.config.config_dict["MAX_SAMPLE_HORIZON"]

    def set_max_step(self, val):
        assert 0 < val <= self.config.config_dict["MAX_SAMPLE_HORIZON"]
        self._max_episode_steps = val

    @status.setter
    def status(self, new_value):
        if new_value != self.status_key['TRAIN'] and new_value != self.status_key['TEST']:
            raise KeyError('New status %d does not exist' % new_value)

        if self._status == new_value:
            return
        self._status = new_value
        self.model.status = new_value

    def step(self, action):
        super().step(action=action)
        prev_state = self._get_obs()
        # Predict the next state with the learned dynamics model and clip it
        # into the observation space below.
        state = self.model.predict(sess=self.sess,
                                   state_input=prev_state,
                                   action_input=action)
        state = model_util.squeeze_array(state, dim=1)

        state = np.clip(state, a_min=self.observation_space.low, a_max=self.observation_space.high)

        reward = self.cost_fn(state=prev_state, action=action, next_state=state)
        # gym conventionally returns a dict for info
        info = {}

        self._elapsed_steps += 1
        done = self.done_check(prev_state=prev_state, state=state, action=action)

        if done:  # 'is True' would miss numpy bools returned by done_fn
            state = self.reset()

        self.state = state
        return state, reward, done, info

    def done_check(self, prev_state, state, action):
        if self.done_fn:
            done = self.done_fn(prev_state, state, action)
        else:
            done = False

        if self._elapsed_steps >= self._max_episode_steps:
            done = True
        return done

    def fit(self, state_set, action_set, delta_state_label_set, sess):
        loss = self.model.update(sess=sess,
                                 state_input=state_set,
                                 action_input=action_set,
                                 delta_state_label=delta_state_label_set)
        return loss

    def test(self, state_set, action_set, delta_state_label_set, sess):
        self.model.test(sess=sess,
                        state_input=state_set,
                        action_input=action_set,
                        delta_state_label=delta_state_label_set)

    def print_log_queue(self, status):
        self.status = status
        self.model.print_log_queue(status)

    def _get_obs(self):
        return model_util.squeeze_array(np.array(self.state), dim=1)

    def get_state(self, env):
        return env._get_obs()

    def set_state(self, state):
        self.state = model_util.squeeze_array(state, dim=1)

    def close(self):
        print("Close Dynamics Env")

    def _configure(self):
        pass

    def seed(self, seed=None):
        pass

    def reset(self):
        # TODO MODIFY THE RANGE OF
        super().reset()
        self._elapsed_steps = 0
        if self.reset_fn:
            self.state = self.reset_fn()
        else:
            self.state = self.observation_space.sample()
        return self.state

    def init(self):
        super().init()
        if hasattr(self.model, 'init') and callable(self.model.init):
            self.model.init()
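
A hedged rollout sketch for DynamicsEnv. dyn_config, dynamics_model and sess
are placeholders for the Config instance, trained dynamics model and
TensorFlow session created elsewhere in the project; the quadratic cost below
is purely illustrative:

# Hypothetical rollout in the learned dynamics environment.
def l2_cost(state, action, next_state):
    # simple illustrative cost: negative squared norm of the next state
    return -float(np.sum(np.square(next_state)))

dyn_env = DynamicsEnv(config=dyn_config,      # placeholder Config instance
                      model=dynamics_model,   # placeholder trained model
                      sess=sess,              # placeholder TF session
                      cost=l2_cost,
                      done=None)              # horizon-only termination
obs = dyn_env.reset()
for _ in range(dyn_env.max_step):
    obs, reward, done, info = dyn_env.step(dyn_env.action_space.sample())
    if done:
        break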