Example #1
# Standard-library / third-party imports assumed by this example; the
# project-specific names (TensorflowBasedModel, Config, CONFIG_KEY, Scaler,
# NNValueFunction, Policy, Memory, NetworkCreator, trpo_main, cfg, utl)
# come from the host repository and are not shown here.
from collections import deque

import numpy as np
import tensorflow as tf


class TrpoModel(TensorflowBasedModel):
    key_list = Config.load_json(file_path=CONFIG_KEY + '/trpoModelKey.json')

    def __init__(self, config, action_bound, obs_bound):
        super().__init__(config=config)
        self.obs_dim = self.config.config_dict['STATE_SPACE'][0] + 1  # +1 for the appended time-step feature
        self.act_dim = self.config.config_dict['ACTION_SPACE'][0]
        with tf.variable_scope(name_or_scope=self.config.config_dict['NAME']):
            self.scaler = Scaler(self.obs_dim)
            self.val_func = NNValueFunction(self.obs_dim,
                                            hid1_mult=self.config.config_dict['HIDDEN_MULTIPLE'],
                                            name_scope=self.config.config_dict['NAME'])
            self.policy = Policy(self.obs_dim,
                                 self.act_dim,
                                 kl_targ=self.config.config_dict['KL_TARG'],
                                 hid1_mult=self.config.config_dict['HIDDEN_MULTIPLE'],
                                 policy_logvar=self.config.config_dict['POLICY_LOGVAR'],
                                 name_scope=self.config.config_dict['NAME'])

        self._real_trajectories = {'observes': [],
                                   'actions': [],
                                   'rewards': [],
                                   'unscaled_obs': []}

        self._cyber_trajectories = {'observes': [],
                                    'actions': [],
                                    'rewards': [],
                                    'unscaled_obs': []}
        self._real_trajectories_memory = deque(maxlen=self.config.config_dict['EPISODE_REAL_MEMORY_SIZE'])
        self._cyber_trajectories_memory = deque(maxlen=self.config.config_dict['EPISODE_CYBER_MEMORY_SIZE'])
        self._real_step_count = 0.0
        self._cyber_step_count = 0.0
        self.action_low = action_bound[0]
        self.action_high = action_bound[1]
        self._env_status = self.config.config_dict['REAL_ENVIRONMENT_STATUS']

        self.real_data_memory = Memory(limit=10000,
                                       action_shape=self.config.config_dict['ACTION_SPACE'],
                                       observation_shape=self.config.config_dict['STATE_SPACE'])
        self.simulation_data_memory = Memory(limit=10000,
                                             action_shape=self.config.config_dict['ACTION_SPACE'],
                                             observation_shape=self.config.config_dict['STATE_SPACE'])

    @property
    def env_status(self):
        return self._env_status

    @env_status.setter
    def env_status(self, new):
        assert (new == self.config.config_dict['REAL_ENVIRONMENT_STATUS']
                or new == self.config.config_dict['CYBER_ENVIRONMENT_STATUS'])
        self._env_status = new
        # TODO change
        if new == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            self.memory = self.real_data_memory
        elif new == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            self.memory = self.simulation_data_memory
        else:
            raise KeyError('Environment status does not exist')

    @property
    def memory_length(self):
        count = 0
        # self._save_trajectories_to_memory(reset_step_count=False)
        for episode in self.trajectories_memory:
            count += len(episode['observes'])
        return count

    @property
    def current_env_status(self):
        if self._env_status == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            return 'REAL_ENVIRONMENT_STATUS'
        elif self._env_status == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            return 'CYBER_ENVIRONMENT_STATUS'

    @property
    def trajectories_memory(self):
        if self._env_status == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            return self._real_trajectories_memory
        elif self._env_status == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            return self._cyber_trajectories_memory

    @property
    def trajectories(self):
        if self._env_status == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            return self._real_trajectories
        elif self._env_status == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            return self._cyber_trajectories

    @trajectories.setter
    def trajectories(self, new_val):
        if self._env_status == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            self._real_trajectories = new_val
        elif self._env_status == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            self._cyber_trajectories = new_val

    @property
    def step_count(self):
        if self._env_status == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            return self._real_step_count
        elif self._env_status == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            return self._cyber_step_count
        else:
            raise KeyError('Environment status does not exist')

    @step_count.setter
    def step_count(self, new_val):
        if self._env_status == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            self._real_step_count = new_val
        elif self._env_status == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            self._cyber_step_count = new_val
        else:
            raise KeyError('Environment status does not exist')

    def copy_model(self, new_model):
        assert isinstance(new_model, type(self))
        self.policy.copy_weight(new_model.policy)
        self.val_func.copy_weight(new_model.val_func)
        from copy import deepcopy
        self.scaler = deepcopy(new_model.scaler)

    def update(self):

        observes, actions, advantages, disc_sum_rew = self._return_train_data()
        # TODO: fix logger and update log data
        loss, entropy, kl, beta, lr_multiplier = self.policy.update(observes=observes,
                                                                    actions=actions,
                                                                    advantages=advantages,
                                                                    logger=None)

        loss_val, exp_var, old_exp_var = self.val_func.fit(x=observes,
                                                           y=disc_sum_rew,
                                                           logger=None)
        res_dict = {
            self.name + '_POLICY_LOSS': loss,
            self.name + '_ENTROPY': entropy,
            self.name + '_KL': kl,
            self.name + '_BETA': beta,
            self.name + '_LR_MULTIPLIER': lr_multiplier,
            self.name + '_VAL_FUNCTION_LOSS': loss_val,
            self.name + '_EXP_VAR': exp_var,
            self.name + '_OLD_EXP_VAR': old_exp_var,
            self.name + '_ENV_STATUS': self.current_env_status,
            self.name + '_TRAIN_SAMPLE_COUNT': len(observes)
        }
        self.log_queue.put(res_dict)

        return {
            'VALUE_FUNCTION_LOSS': loss_val,
            'CONTROLLER_LOSS': loss
        }

    def predict(self, obs, step_count=None):
        obs = np.reshape(obs, [1, -1])

        if step_count is not None:
            obs = np.append(obs, [[step_count * self.config.config_dict['INCREMENT_ENV_STEP']]],
                            axis=1)
        else:
            obs = np.append(obs, [[self.step_count * self.config.config_dict['INCREMENT_ENV_STEP']]],
                            axis=1)
        scale, offset = self.scaler.get()
        scale[-1] = 1.0  # don't scale time step feature
        offset[-1] = 0.0  # don't offset time step feature
        obs = (obs - offset) * scale
        action = self.policy.sample(np.reshape(obs, [1, -1])).reshape((1, -1)).astype(np.float32)
        action = np.clip(action, a_min=self.action_low, a_max=self.action_high)
        return action

    def print_log_queue(self, status):
        self.status = status
        while self.log_queue.qsize() > 0:
            log = self.log_queue.get()
            print("%s: Policy Loss %f, Entropy %f, Kl %f, Beta %f, Lr multiplier %f, Val function loss %f, "
                  "Exp var %f, Old exp var %f" % (self.name,
                                                  log[self.name + '_POLICY_LOSS'],
                                                  log[self.name + '_ENTROPY'],
                                                  log[self.name + '_KL'],
                                                  log[self.name + '_BETA'],
                                                  log[self.name + '_LR_MULTIPLIER'],
                                                  log[self.name + '_VAL_FUNCTION_LOSS'],
                                                  log[self.name + '_EXP_VAR'],
                                                  log[self.name + '_OLD_EXP_VAR']
                                                  ))
            log['INDEX'] = self.log_print_count
            self.log_print_count += 1
            self.log_file_content.append(log)

    def reset(self):
        self.trajectories = {'observes': [],
                             'actions': [],
                             'rewards': [],
                             'unscaled_obs': []}
        self.step_count = 0

    def init(self):
        self.var_list = self.val_func.var_list + self.policy.var_list
        self.val_func.init()
        self.policy.init()
        self.trajectories = {'observes': [],
                             'actions': [],
                             'rewards': [],
                             'unscaled_obs': []}
        self.step_count = 0
        self.env_status = self.config.config_dict['REAL_ENVIRONMENT_STATUS']
        super().init()

    def store_one_sample(self, state, next_state, action, reward, done, *arg, **kwargs):
        # TODO HOW TO SET AND RESET STEP
        self.memory.append(obs0=state,
                           obs1=next_state,
                           action=action,
                           reward=reward,
                           terminal1=done)
        obs = state.astype(np.float32).reshape((1, -1))
        obs = np.append(obs, [[self.step_count * self.config.config_dict['INCREMENT_ENV_STEP']]],
                        axis=1)  # add time step feature
        self.trajectories['unscaled_obs'].append(obs)

        scale, offset = self.scaler.get()
        scale[-1] = 1.0  # don't scale time step feature
        offset[-1] = 0.0  # don't offset time step feature
        obs = (obs - offset) * scale  # center and scale observations
        self.trajectories['observes'].append(np.reshape(obs, [-1]))
        self.trajectories['actions'].append(np.reshape(action, [-1]))
        self.trajectories['rewards'].append(reward)
        self.step_count += 1
        if done:
            self._save_trajectories_to_memory(reset_step_count=True)

    def _return_train_data(self):
        trajectories = list(self.trajectories_memory)
        trpo_main.add_value(trajectories, val_func=self.val_func)
        trpo_main.add_disc_sum_rew(trajectories=trajectories,
                                   gamma=self.config.config_dict['GAMMA'])
        trpo_main.add_gae(trajectories=trajectories,
                          gamma=self.config.config_dict['GAMMA'],
                          lam=self.config.config_dict['LAM'])
        observes, actions, advantages, disc_sum_rew = trpo_main.build_train_set(trajectories=trajectories)
        # Clear the trajectory memory unless the config explicitly disables it.
        if cfg.config_dict.get('NOT_TRPO_CLEAR_MEMORY', False) is not True:
            self.trajectories_memory.clear()
        return observes, actions, advantages, disc_sum_rew

    def _save_trajectories_to_memory(self, reset_step_count=True):
        if len(self.trajectories['observes']) > 0:
            self.update_scale(unscaled_data=np.array(self.trajectories['unscaled_obs']).squeeze())
            if reset_step_count:
                self.step_count = 0
            for key, val in self.trajectories.items():
                self.trajectories[key] = np.array(val)
            self.trajectories_memory.append(self.trajectories)
            self.trajectories = {'observes': [],
                                 'actions': [],
                                 'rewards': [],
                                 'unscaled_obs': []}

    def update_scale(self, unscaled_data):
        self.scaler.update(x=unscaled_data)

    def q_value(self, state, step=0):
        return self.val_func.predict(x=np.array(state), step=step * self.config.config_dict['INCREMENT_ENV_STEP'])

    def return_most_recent_sample(self, sample_count, env_status, *args, **kwargs):
        if env_status == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            memory = self.real_data_memory
        elif env_status == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            memory = self.simulation_data_memory
        else:
            raise ValueError('Wrong Environment status')

        length = memory.nb_entries
        enough_flag = True
        if length < sample_count:
            enough_flag = False
        from src.util.sampler.sampler import SamplerData
        sample_data = SamplerData()
        for i in range(max(0, length - sample_count), length):
            sample_data.append(state=memory.observations0[i],
                               action=memory.actions[i],
                               new_state=memory.observations1[i],
                               done=memory.terminals1[i],
                               reward=memory.rewards[i])
        return sample_data, enough_flag

    def enough_data(self, sample_count, env_status):
        if env_status == self.config.config_dict['REAL_ENVIRONMENT_STATUS']:
            memory = self.real_data_memory
        elif env_status == self.config.config_dict['CYBER_ENVIRONMENT_STATUS']:
            memory = self.simulation_data_memory
        else:
            raise ValueError('Wrong Environment status')

        length = memory.nb_entries
        return length >= sample_count
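
# ---------------------------------------------------------------------------
# Usage sketch (added for illustration; not part of the original example).
# It assumes a Gym-style `env` and a `trpo_config` Config object built from
# trpoModelKey.json -- both are hypothetical stand-ins for the objects the
# host repository normally constructs.
# ---------------------------------------------------------------------------
def _trpo_model_usage_sketch(trpo_config, env, episodes=10, horizon=200):
    model = TrpoModel(config=trpo_config,
                      action_bound=(env.action_space.low, env.action_space.high),
                      obs_bound=(env.observation_space.low, env.observation_space.high))
    model.init()  # builds policy / value nets and selects the real-environment memory
    for _ in range(episodes):
        obs = env.reset()
        for _ in range(horizon):
            action = model.predict(obs)  # shape (1, act_dim), clipped to action_bound
            next_obs, reward, done, _ = env.step(action[0])
            model.store_one_sample(obs, next_obs, action, reward, done)
            if done:
                break  # store_one_sample archives the finished episode internally
            obs = next_obs
    return model.update()  # {'VALUE_FUNCTION_LOSS': ..., 'CONTROLLER_LOSS': ...}
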
class DynamicsEnvMlpModel(TensorflowBasedModel):
    key_list = Config.load_json(file_path=CONFIG_KEY +
                                '/dynamicsEnvMlpModelKey.json')

    def __init__(self, config, output_bound):
        # TODO: the placeholders should be created by the agent and passed in as arguments to __init__
        super(DynamicsEnvMlpModel, self).__init__(config)

        with tf.variable_scope(name_or_scope=self.config.config_dict['NAME']):
            self.state_means = tf.placeholder(shape=list(self.config.config_dict['STATE_SPACE']),
                                              dtype=tf.float32,
                                              name='state_means')
            self.state_vars = tf.placeholder(shape=list(self.config.config_dict['STATE_SPACE']),
                                             dtype=tf.float32,
                                             name='state_vars')
            self.action_means = tf.placeholder(shape=list(self.config.config_dict['ACTION_SPACE']),
                                               dtype=tf.float32)
            self.action_vars = tf.placeholder(shape=list(self.config.config_dict['ACTION_SPACE']),
                                              dtype=tf.float32)
            self.output_means = tf.placeholder(shape=list(self.config.config_dict['STATE_SPACE']),
                                               dtype=tf.float32,
                                               name='delta_means')
            self.output_vars = tf.placeholder(shape=list(self.config.config_dict['STATE_SPACE']),
                                              dtype=tf.float32,
                                              name='delta_vars')

            self.state_input = tf.placeholder(
                shape=[None] + list(self.config.config_dict['STATE_SPACE']),
                dtype=tf.float32)
            self.action_input = tf.placeholder(
                shape=[None] + list(self.config.config_dict['ACTION_SPACE']),
                dtype=tf.float32)
            self.state_delta_label = tf.placeholder(
                shape=[None] + list(self.config.config_dict['STATE_SPACE']),
                dtype=tf.float32)

            # NOTE: the *_vars placeholders are fed with standard deviations
            # (np.sqrt of the scaler variances), so these are z-score normalisations.
            self.norm_state_input = (self.state_input - self.state_means) / self.state_vars
            self.norm_action_input = (self.action_input - self.action_means) / self.action_vars
            self.norm_state_delta_label = (self.state_delta_label - self.output_means) / self.output_vars

            self.input = tf.concat(
                values=[self.norm_state_input, self.norm_action_input], axis=1)

            self.action_scalar = Scaler(
                obs_dim=self.config.config_dict['ACTION_SPACE'])
            self.state_scalar = Scaler(
                obs_dim=self.config.config_dict['STATE_SPACE'])
            self.delta_scalar = Scaler(
                obs_dim=self.config.config_dict['STATE_SPACE'])

            self.net, self.delta_state_output, self.trainable_var_list = \
                NetworkCreator.create_network(input=self.input,
                                              network_config=self.config.config_dict['NET_CONFIG'],
                                              net_name=self.config.config_dict['NAME'])
            # output_low=output_bound[0] - output_bound[1],
            # output_high=output_bound[1] - output_bound[0])

            self.loss, self.optimizer, self.optimize = self.create_training_method()
            self.denorm_delta_state_output = self.delta_state_output * self.output_vars + self.output_means
            self.denorm_state_input = self.norm_state_input * self.state_vars + self.state_means

            self.output = self.state_input + self.denorm_delta_state_output

        self.var_list = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=self.config.config_dict['NAME'])

        self.variables_initializer = tf.variables_initializer(
            var_list=self.var_list)

    def create_training_method(self):
        # Note: the L2 regularisation term is currently disabled (coefficient 0.0).
        l2_loss = tf.reduce_sum(
            [tf.nn.l2_loss(var) for var in self.trainable_var_list])

        loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(self.norm_state_delta_label - self.delta_state_output),
                          axis=[1])) + 0.0 * l2_loss

        optimizer = tf.train.AdamOptimizer(
            learning_rate=self.config.config_dict['LEARNING_RATE'])
        optimize = optimizer.minimize(loss=loss,
                                      var_list=self.trainable_var_list)
        return loss, optimizer, optimize

    def update_mean_var(self, state_input, action_input, delta_state_label):
        self.state_scalar.update(x=state_input)
        self.action_scalar.update(x=action_input)
        self.delta_scalar.update(x=delta_state_label)

    def update(self, sess, state_input, action_input, delta_state_label):

        state_input = np.reshape(state_input,
                                 newshape=[-1] +
                                 list(self.config.config_dict['STATE_SPACE']))
        action_input = np.reshape(
            action_input,
            newshape=[-1] + list(self.config.config_dict['ACTION_SPACE']))
        delta_state_label = np.reshape(
            delta_state_label,
            newshape=[-1] + list(self.config.config_dict['STATE_SPACE']))

        total_loss = 0.0
        batch_count = len(state_input) // self.config.config_dict['BATCH_SIZE']
        if batch_count <= 0:
            raise ValueError(
                'Batch count is zero, input data size: %d, batch size %d' %
                (len(state_input), self.config.config_dict['BATCH_SIZE']))
        for j in range(batch_count):
            start = self.config.config_dict['BATCH_SIZE'] * j
            end = self.config.config_dict['BATCH_SIZE'] * (j + 1)
            state_input_j = state_input[start:end, :]
            action_input_j = action_input[start:end, :]
            delta_state_label_j = delta_state_label[start:end, :]

            _, loss = sess.run(fetches=[self.optimize, self.loss],
                               feed_dict={
                                   self.state_input: state_input_j,
                                   self.action_input: action_input_j,
                                   self.state_delta_label: delta_state_label_j,
                                   self.state_vars: np.sqrt(self.state_scalar.vars),
                                   self.state_means: self.state_scalar.means,
                                   self.action_vars: np.sqrt(self.action_scalar.vars),
                                   self.action_means: self.action_scalar.means,
                                   self.output_means: self.delta_scalar.means,
                                   self.output_vars: np.sqrt(self.delta_scalar.vars)
                               })
            total_loss += loss
        average_loss = total_loss / batch_count
        self.log_queue.put({self.name + '_LOSS': average_loss})
        return average_loss

    def test(self, sess, state_input, action_input, delta_state_label):
        state_input = np.reshape(state_input,
                                 newshape=[-1] +
                                 list(self.config.config_dict['STATE_SPACE']))
        action_input = np.reshape(
            action_input,
            newshape=[-1] + list(self.config.config_dict['ACTION_SPACE']))
        delta_state_label = np.reshape(
            delta_state_label,
            newshape=[-1] + list(self.config.config_dict['STATE_SPACE']))

        loss = sess.run(fetches=self.loss,
                        feed_dict={
                            self.state_input: state_input,
                            self.action_input: action_input,
                            self.state_delta_label: delta_state_label,
                            self.state_vars: np.sqrt(self.state_scalar.vars),
                            self.state_means: self.state_scalar.means,
                            self.action_vars: np.sqrt(self.action_scalar.vars),
                            self.action_means: self.action_scalar.means,
                            self.output_means: self.delta_scalar.means,
                            self.output_vars: np.sqrt(self.delta_scalar.vars)
                        })

        self.log_queue.put({self.name + '_LOSS': np.mean(loss)})

    def predict(self, sess, state_input, action_input):
        state_input = np.reshape(state_input,
                                 newshape=[-1] +
                                 list(self.config.config_dict['STATE_SPACE']))
        action_input = np.reshape(
            action_input,
            newshape=[-1] + list(self.config.config_dict['ACTION_SPACE']))

        res = sess.run(fetches=[self.output],
                       feed_dict={
                           self.state_input: state_input,
                           self.action_input: action_input,
                           self.state_vars: np.sqrt(self.state_scalar.vars),
                           self.state_means: self.state_scalar.means,
                           self.action_vars: np.sqrt(self.action_scalar.vars),
                           self.action_means: self.action_scalar.means,
                           self.output_means: self.delta_scalar.means,
                           self.output_vars: np.sqrt(self.delta_scalar.vars)
                       })

        return utl.squeeze_array(res,
                                 dim=1 +
                                 len(self.config.config_dict['STATE_SPACE']))

    def init(self):
        sess = tf.get_default_session()
        sess.run(self.variables_initializer)
        super().init()
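
# ---------------------------------------------------------------------------
# Usage sketch (added for illustration; not part of the original example).
# DynamicsEnvMlpModel fits a normalised state-delta model, i.e.
#     next_state ~= state + denorm(net(norm(state), norm(action))).
# `dyn_config`, `states`, `actions` and `next_states` are hypothetical
# placeholders for the data the host repository normally supplies.
# ---------------------------------------------------------------------------
def _dynamics_model_usage_sketch(dyn_config, states, actions, next_states, epochs=5):
    # output_bound is unused by __init__ in this snippet, so None is passed here.
    dyn = DynamicsEnvMlpModel(config=dyn_config, output_bound=None)
    deltas = next_states - states
    with tf.Session() as sess:  # init() relies on the default session
        dyn.init()
        dyn.update_mean_var(states, actions, deltas)  # refresh the running scalers first
        avg_loss = None
        for _ in range(epochs):
            avg_loss = dyn.update(sess, states, actions, deltas)
        predicted_next_states = dyn.predict(sess, states, actions)
    return avg_loss, predicted_next_states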