def getAdvantageGrads(self, states, next_states, alreadyNormed=False):
     """
         The states should be normalized
     """
     if (alreadyNormed == False):
         states = norm_state(states, self._state_bounds)
         next_states = norm_state(next_states, self._state_bounds)
     states = np.array(states, dtype=self._settings['float_type'])
     self._model.setStates(states)
     self._model.setResultStates(next_states)
     return self._get_grad()
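These excerpts call framework helpers such as norm_state, scale_action, and scale_reward that are defined elsewhere in the codebase. Below is a minimal sketch of the assumed semantics (a linear map between raw values and a normalized [-1, 1] range given per-dimension [min, max] bounds); the _sketch_* names are hypothetical and the real implementations may differ.

import numpy as np

def _sketch_norm_state(state, bounds):
    # bounds assumed to be [mins, maxs], one entry per state dimension
    low, high = np.array(bounds[0]), np.array(bounds[1])
    return (np.array(state) - low) / (high - low) * 2.0 - 1.0

def _sketch_scale_action(norm_action, bounds):
    # inverse map: a normalized [-1, 1] action back to the raw action range
    low, high = np.array(bounds[0]), np.array(bounds[1])
    return (np.array(norm_action) + 1.0) / 2.0 * (high - low) + low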
Example #2
 def predict(self,
             state,
             deterministic_=True,
             evaluation_=False,
             p=None,
             sim_index=None,
             bootstrapping=False):
     # states = np.zeros((self._batch_size, self._state_length), dtype=self._settings['float_type'])
     # states[0, ...] = state
     # state = np.array(state, dtype=self._settings['float_type'])
     state = norm_state(state, self._state_bounds)
     state = np.array(state, dtype=self._settings['float_type'])
     self._model.setStates(state)
     # action_ = lasagne.layers.get_output(self._model.getActorNetwork(), state, deterministic=deterministic_).mean()
     # action_ = scale_action(self._q_action()[0], self._action_bounds)
     # if deterministic_:
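     # The actor network output is assumed to concatenate the action mean and
     # std, so the first self._action_length columns are taken as the mean
     # (predict_std in later examples slices the remaining columns). This is
     # inferred from the slicing, not documented behaviour.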
     action_ = scale_action(
         self._model.getActorNetwork().predict(
             state, batch_size=1)[:, :self._action_length],
         self._action_bounds)
     # action_ = scale_action(self._q_action_target()[0], self._action_bounds)
     # else:
     # action_ = scale_action(self._q_action()[0], self._action_bounds)
     # action_ = q_valsActA[0]
     return action_
Example #3
    def q_value(self, state):
        """
            For returning a vector of q values, state should NOT be normalized
        """
        # states = np.zeros((self._batch_size, self._state_length), dtype=self._settings['float_type'])
        # states[0, ...] = state
        """
        if ( ('disable_parameter_scaling' in self._settings) and (self._settings['disable_parameter_scaling'])):
            pass
        else:
        """
        # print ("Agent state bounds: ", self._state_bounds)
        state = norm_state(state, self._state_bounds)
        # print ("Agent normalized state: ", state)
        state = np.array(state, dtype=self._settings['float_type'])
        self._model.setStates(state)
        self._modelTarget.setStates(state)
        if (('disable_parameter_scaling' in self._settings)
                and (self._settings['disable_parameter_scaling'])):
            value = scale_reward(self._q_val(), self.getRewardBounds()) * (
                1.0 / (1.0 - self.getSettings()['discount_factor']))
            # return (self._q_val())[0]
        else:
            value = scale_reward(self._q_val(), self.getRewardBounds()) * (
                1.0 / (1.0 - self.getSettings()['discount_factor']))
#         print ("Agent scaled value: ", value)
        return value
Example #4
 def q_value(self, state):
     state = norm_state(state, self._state_bounds)
     state = np.array(state, dtype=self._settings['float_type'])
     value = scale_reward(
         self._value([state, 0])[0], self.getRewardBounds()) * (
             1.0 / (1.0 - self.getSettings()['discount_factor']))
     return value
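The q_value variants above all rescale the network output the same way: the value is mapped back from the normalized reward range via scale_reward and then multiplied by 1 / (1 - discount_factor), which stretches a per-step reward magnitude to the magnitude of a discounted return. A minimal worked sketch, with _sketch_scale_reward as an assumed stand-in for the framework's scale_reward:

def _sketch_scale_reward(norm_reward, reward_bounds):
    # assumed linear map from the normalized [-1, 1] range to [min, max]
    low, high = reward_bounds[0][0], reward_bounds[1][0]
    return (norm_reward + 1.0) / 2.0 * (high - low) + low

discount_factor = 0.99
norm_q = 0.5  # example network output on the normalized scale
value = _sketch_scale_reward(norm_q, [[-1.0], [1.0]]) * (1.0 / (1.0 - discount_factor))
print(value)  # 50.0: the map is the identity for [-1, 1] bounds, then scaled by 100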
Example #5
 def predict_std(self, state, deterministic_=True):
     # states = np.zeros((self._batch_size, self._state_length), dtype=self._settings['float_type'])
     # states[0, ...] = state
     """
     if ( ('disable_parameter_scaling' in self._settings) and (self._settings['disable_parameter_scaling'])):
         pass
     else:
     """
     state = norm_state(state, self._state_bounds)
     state = np.array(state, dtype=self._settings['float_type'])
     self._model.setStates(state)
     # action_ = lasagne.layers.get_output(self._model.getActorNetwork(), state, deterministic=deterministic_).mean()
     # action_ = scale_action(self._q_action()[0], self._action_bounds)
     # if deterministic_:
     # action_std = scale_action(self._q_action_std()[0], self._action_bounds)
     if (('disable_parameter_scaling' in self._settings)
             and (self._settings['disable_parameter_scaling'])):
         action_std = self._q_action_std()
         # action_std = self._q_action_std()[0] * (action_bound_std(self._action_bounds))
     else:
         action_std = self._q_action_std() * (action_bound_std(
             self._action_bounds))
     # else:
     # action_ = scale_action(self._q_action()[0], self._action_bounds)
     # action_ = q_valsActA[0]
     return action_std
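predict_std multiplies the network's normalized standard deviation by action_bound_std(self._action_bounds). A plausible sketch of that helper, assuming it returns half the width of each action dimension's range (the factor that maps a unit std in normalized space to the raw action space); this is an assumption rather than the framework's verified code.

import numpy as np

def _sketch_action_bound_std(bounds):
    # half-range of each action dimension, given bounds = [mins, maxs]
    low, high = np.array(bounds[0]), np.array(bounds[1])
    return (high - low) / 2.0

print(_sketch_action_bound_std([[-2.0, 0.0], [2.0, 1.0]]))  # [2.  0.5]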
Example #6
 def predict(self,
             state,
             deterministic_=True,
             evaluation_=False,
             p=None,
             sim_index=None,
             bootstrapping=False):
     # states = np.zeros((self._batch_size, self._state_length), dtype=self._settings['float_type'])
     # states[0, ...] = state
     """
     if ( ('disable_parameter_scaling' in self._settings) and (self._settings['disable_parameter_scaling'])):
         pass
     else:
     """
     # print ("Agent state bounds: ", self._state_bounds)
     state = norm_state(state, self._state_bounds)
     # print ("Agent normalized state: ", state)
     state = np.array(state, dtype=self._settings['float_type'])
     self._model.setStates(state)
     # action_ = lasagne.layers.get_output(self._model.getActorNetwork(), state, deterministic=deterministic_).mean()
     # action_ = scale_action(self._q_action()[0], self._action_bounds)
     # if deterministic_:
     if (('disable_parameter_scaling' in self._settings)
             and (self._settings['disable_parameter_scaling'])):
         action_ = self._q_action()
         # action_ = scale_action(self._q_action()[0], self._action_bounds)
     else:
         action_ = scale_action(self._q_action(), self._action_bounds)
     # print ("Agent Scaled action: ", action_)
     # action_ = scale_action(self._q_action_target()[0], self._action_bounds)
     # else:
     # action_ = scale_action(self._q_action()[0], self._action_bounds)
     # action_ = q_valsActA[0]
     return action_
Example #7
    def q_values(self, state):
        """
            For returning a vector of q values, state should already be normalized
        """
        state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=theano.config.floatX)
        self._model.setStates(state)
        self._modelTarget.setStates(state)
        action = self._q_action()
        self._model.setActions(action)
        self._modelTarget.setActions(action)

        if ('train_extra_value_function' in self.getSettings() and
            (self.getSettings()['train_extra_value_function'] == True)):
            q_vals = self._vals_extra()
        else:
            q_vals = self._q_val()
        if (('disable_parameter_scaling' in self._settings)
                and (self._settings['disable_parameter_scaling'])):
            return scale_reward(q_vals, self.getRewardBounds()) * (
                1.0 / (1.0 - self.getSettings()['discount_factor']))
            # return (self._q_val())[0]
        else:
            return scale_reward(q_vals, self.getRewardBounds()) * (
                1.0 / (1.0 - self.getSettings()['discount_factor']))
Example #8
 def q_valueWithDropout(self, state):
     # states = np.zeros((self._batch_size, self._state_length), dtype=self._settings['float_type'])
     # states[0, ...] = state
     state = np.array(state, dtype=self._settings['float_type'])
     state = norm_state(state, self._state_bounds)
     self._model.setStates(state)
     return scale_reward(self._q_val_drop(), self.getRewardBounds())
Example #9
    def predict_std(self, state, deterministic_=True):
        state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=self._settings['float_type'])

        # action_std = self._model.getActorNetwork().predict(state, batch_size=1)[:,self._action_length:] * (action_bound_std(self._action_bounds))
        action_std = self._q_action_std([state])[0] * action_bound_std(
            self._action_bounds)
        # print ("Policy std: ", repr(action_std))
        return action_std
Example #10
 def getGrads(self, states, alreadyNormed=False):
     """
         The states should be normalized
     """
     # self.setData(states, actions, rewards, result_states)
     if (alreadyNormed == False):
         states = norm_state(states, self._state_bounds)
     states = np.array(states, dtype=self._settings['float_type'])
     self._model.setStates(states)
     return self._get_grad()
 def predict_std(self, state, deterministic_=True):
     state = norm_state(state, self._state_bounds)
     state = np.array(state, dtype=self._settings['float_type'])
     self._model.setStates(state)
     if (('disable_parameter_scaling' in self._settings)
             and (self._settings['disable_parameter_scaling'])):
         action_std = self._q_action_std()[0]
     else:
         action_std = self._q_action_std()[0] * (action_bound_std(
             self._action_bounds))
     return action_std
Example #12
 def getGrads(self, states, actions=None, alreadyNormed=False):
     """
         The states should be normalized
     """
     # self.setData(states, actions, rewards, result_states)
     if (alreadyNormed == False):
         states = norm_state(states, self._state_bounds)
     states = np.array(states, dtype=theano.config.floatX)
     self._model.setStates(states)
     if (actions is None):
         actions = self.predict_batch(states)
     self._model.setActions(actions)
     return self._get_state_grad()
Example #13
 def predictWithDropout(self, state, deterministic_=True):
     # states = np.zeros((self._batch_size, self._state_length), dtype=self._settings['float_type'])
     # states[0, ...] = state
     state = np.array(state, dtype=self._settings['float_type'])
     state = norm_state(state, self._state_bounds)
     action_ = scale_action(
         self._model.getActorNetwork().predict(
             state, batch_size=1)[:, :self._action_length],
         self._action_bounds)
     # else:
     # action_ = scale_action(self._q_action()[0], self._action_bounds)
     # action_ = q_valsActA[0]
     return action_
Example #14
 def q_value(self, state):
     # states = np.zeros((self._batch_size, self._state_length), dtype=self._settings['float_type'])
     # states[0, ...] = state
     state = norm_state(state, self._state_bounds)
     state = np.array(state, dtype=self._settings['float_type'])
     self._model.setStates(state)
     self._modelTarget.setStates(state)
     # return scale_reward(self._q_valTarget(), self.getRewardBounds())[0]
     value = scale_reward(
         self._model.getCriticNetwork().predict(state, batch_size=1),
         self.getRewardBounds()) * (
             1.0 / (1.0 - self.getSettings()['discount_factor']))
     return value
Example #15
 def predict(self,
             state,
             deterministic_=True,
             evaluation_=False,
             p=None,
             sim_index=None,
             bootstrapping=False):
     state = norm_state(state, self._state_bounds)
     state = np.array(state, dtype=self._settings['float_type'])
     action_ = scale_action(
         self._model.getActorNetwork().predict(
             state, batch_size=1)[:, :self._action_length],
         self._action_bounds)
     return action_
Example #16
 def getGrads(self, states, alreadyNormed=False):
     """
         The states should be normalized
     """
     # self.setData(states, actions, rewards, result_states)
     if (alreadyNormed == False):
         states = norm_state(states, self._state_bounds)
     states = np.array(states, dtype=self._settings['float_type'])
     # grads = np.reshape(np.array(self._get_gradients([states])[0], dtype=self._settings['float_type']), (states.shape[0],states.shape[1]))
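     # The trailing 0 passed to self._get_gradients is presumably the Keras
     # learning-phase flag (0 = test mode); treating it as such is an assumption.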
     grads = np.array(self._get_gradients([states, 0]),
                      dtype=self._settings['float_type'])
     # print ("State grads: ", grads.shape)
     # print ("State grads: ", repr(grads))
     return grads
Example #17
 def predict_std(self, state, deterministic_=True):
     state = norm_state(state, self._state_bounds)
     state = np.array(state, dtype=self._settings['float_type'])
     self._model.setStates(state)
     if (('disable_parameter_scaling' in self._settings)
             and (self._settings['disable_parameter_scaling'])):
         action_std = self._model.getActorNetwork().predict(
             state, batch_size=1)[:, self._action_length:]
         # action_std = self._q_action_std()[0] * (action_bound_std(self._action_bounds))
     else:
         action_std = self._model.getActorNetwork().predict(
             state, batch_size=1)[:, self._action_length:] * (
                 action_bound_std(self._action_bounds))
     return action_std
 def q_values(self, state):
     """
         For returning a vector of q values, state should already be normalized
     """
     state = norm_state(state, self._state_bounds)
     state = np.array(state, dtype=self._settings['float_type'])
     self._model.setStates(state)
     self._modelTarget.setStates(state)
     if (('disable_parameter_scaling' in self._settings)
             and (self._settings['disable_parameter_scaling'])):
         return scale_reward(self._q_val(), self.getRewardBounds()) * (
             1.0 / (1.0 - self.getSettings()['discount_factor']))
     else:
         return scale_reward(self._q_val(), self.getRewardBounds()) * (
             1.0 / (1.0 - self.getSettings()['discount_factor']))
 def predict(self,
             state,
             deterministic_=True,
             evaluation_=False,
             p=None,
             sim_index=None,
             bootstrapping=False):
     state = norm_state(state, self._state_bounds)
     state = np.array(state, dtype=self._settings['float_type'])
     self._model.setStates(state)
     if (('disable_parameter_scaling' in self._settings)
             and (self._settings['disable_parameter_scaling'])):
         action_ = self._q_action()[0]
     else:
         action_ = scale_action(
             self._q_action()[0],
             self._action_bounds)  # transform the action value to a range
     return action_
    def q_valueWithDropout(self, state):
        if (('disable_parameter_scaling' in self._settings)
                and (self._settings['disable_parameter_scaling'])):
            pass
        else:
            state = norm_state(state, self._state_bounds)

        state = np.array(state, dtype=self._settings['float_type'])
        self._model.setStates(state)
        if (('disable_parameter_scaling' in self._settings)
                and (self._settings['disable_parameter_scaling'])):
            return scale_reward(
                self._q_val_drop(), self.getRewardBounds())[0] * (
                    1.0 / (1.0 - self.getSettings()['discount_factor']))
        else:
            return scale_reward(
                self._q_val_drop(), self.getRewardBounds())[0] * (
                    1.0 / (1.0 - self.getSettings()['discount_factor']))
Example #21
    def q_valueWithDropout(self, state):
        # states = np.zeros((self._batch_size, self._state_length), dtype=self._settings['float_type'])
        # states[0, ...] = state
        if (('disable_parameter_scaling' in self._settings)
                and (self._settings['disable_parameter_scaling'])):
            pass
        else:
            state = norm_state(state, self._state_bounds)

        state = np.array(state, dtype=self._settings['float_type'])
        self._model.setStates(state)
        if (('disable_parameter_scaling' in self._settings)
                and (self._settings['disable_parameter_scaling'])):
            return scale_reward(self._q_val_drop(), self.getRewardBounds()) * (
                1.0 / (1.0 - self.getSettings()['discount_factor']))
            # return (self._q_val_drop())[0]
        else:
            return scale_reward(self._q_val_drop(), self.getRewardBounds()) * (
                1.0 / (1.0 - self.getSettings()['discount_factor']))
 def predictWithDropout(self, state, deterministic_=True):
     state = np.array(state, dtype=self._settings['float_type'])
     state = norm_state(state, self._state_bounds)
     self._model.setStates(state)
     action_ = scale_action(self._q_action_drop()[0], self._action_bounds)
     return action_