Esempio n. 1
0
File: DPG.py Progetto: skylbc/SMBAE
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):
        """
            Build the symbolic expressions and update rules for the DPG learner,
            then call ``DPG.compile(self)``.

            All constructor arguments are forwarded unchanged to the superclass
            constructor; ``model`` is additionally deep-copied into
            ``self._modelTarget``.

            In order to get this to work we need to be careful not to update the actor parameters
            when updating the critic. This can be an issue when concatenating networks together.
            The first network becomes a part of the second. You can still access the first
            network by itself, but any update on the second network will affect the first network.
            Care needs to be taken to make sure only the parameters of the second network are updated.
        """

        super(DPG, self).__init__(model, n_in, n_out, state_bounds,
                                  action_bounds, reward_bound, settings_)

        # Symbolic int8 column named "Fallen" plus a matching shared buffer.
        # Neither is wired into any of the givens dictionaries built below
        # (the corresponding entries are commented out).
        self._Fallen = T.bcol("Fallen")
        ## because float64 <= float32 * int32, need to use int16 or int8
        self._Fallen.tag.test_value = np.zeros((self._batch_size, 1),
                                               dtype=np.dtype('int8'))

        self._fallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                     dtype='int8'),
                                            broadcastable=(False, True))

        # Symbolic action matrix with a random test value of shape
        # (batch_size, action_length).
        self._Action = T.matrix("Action2")
        self._Action.tag.test_value = np.random.rand(self._batch_size,
                                                     self._action_length)

        # The critic regresses towards this externally supplied target column;
        # it is bound to self._tmp_target_shared through self._givens_ below.
        self._Tmp_Target = T.col("Tmp_Target")
        self._Tmp_Target.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._tmp_target_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                                broadcastable=(False, True))

        # Independent deep copy of the model, used for the "target" outputs below.
        self._modelTarget = copy.deepcopy(model)

        # print ("Initial W " + str(self._w_o.get_value()) )

        # Hyper-parameters read from the settings dictionary.
        self._learning_rate = self.getSettings()['learning_rate']
        self._discount_factor = self.getSettings()['discount_factor']
        self._rho = self.getSettings()['rho']
        self._rms_epsilon = self.getSettings()['rms_epsilon']

        self._weight_update_steps = self.getSettings(
        )['steps_until_target_network_update']
        self._updates = 0
        self._decay_weight = self.getSettings()['regularization_weight']
        self._critic_regularization_weight = self.getSettings(
        )["critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]

        # self._q_valsA = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)
        # self._q_valsA_drop = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)
        # self._q_valsNextState = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True)
        # self._q_valsTargetNextState = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True)
        # self._q_valsTarget = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)
        # self._q_valsTarget_drop = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)

        # Actor output for the current state (primary model) and for the
        # result state (copied model), both with deterministic=True.
        self._q_valsActA = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsActTarget = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        # self._q_valsActA_drop = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)

        # Critic output for the stored (state, action) pairs. No `deterministic`
        # flag is passed for this call.
        inputs_1 = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions()
        }
        self._q_valsA = lasagne.layers.get_output(
            self._model.getCriticNetwork(), inputs_1)
        # Critic output with the action input replaced by the actor's output,
        # i.e. the critic evaluated on the policy's own action.
        inputs_1_policy = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._q_valsActA
        }
        self._q_vals_train_policy = lasagne.layers.get_output(
            self._model.getCriticNetwork(), inputs_1_policy)
        # Copied-model critic evaluated on the result states.
        # NOTE(review): the action fed here is the stored action
        # (self._model.getActions()), not self._q_valsActTarget -- confirm
        # that this is intended.
        inputs_2 = {
            self._modelTarget.getStateSymbolicVariable():
            self._model.getResultStates(),
            self._modelTarget.getActionSymbolicVariable():
            self._model.getActions()
        }
        self._q_valsB_ = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(), inputs_2, deterministic=True)

        # Aliases for the expressions built above.
        self._q_func = self._q_valsA
        self._q_funcB = self._q_valsB_
        # self._q_funcTarget = self._q_valsTarget
        # self._q_func_drop = self._q_valsA_drop
        # self._q_funcTarget_drop = self._q_valsTarget_drop
        self._q_funcAct = self._q_valsActA
        # self._q_funcAct_drop = self._q_valsActA_drop

        # self._q_funcAct = theano.function(inputs=[State], outputs=self._q_valsActA, allow_input_downcast=True)

        # Critic loss: mean squared difference between the supplied target and
        # the critic output (no 0.5 factor).
        # self._target = T.mul(T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsB )), self._Fallen)
        self._diff = self._Tmp_Target - self._q_func
        # self._diff_drop = self._target - self._q_func_drop
        # loss = 0.5 * self._diff ** 2
        loss = T.pow(self._diff, 2)
        self._loss = T.mean(loss)
        # self._loss_drop = T.mean(0.5 * self._diff_drop ** 2)

        # assert len(lasagne.layers.helper.get_all_params(self._l_outA)) == 16
        # Need to remove the action layers from these params
        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        # The two messages below say "Layers" but the numbers printed are the
        # parameter counts returned by get_all_params.
        print("******Number of Layers is: " + str(
            len(
                lasagne.layers.helper.get_all_params(
                    self._model.getCriticNetwork()))))
        print("******Number of Action Layers is: " + str(
            len(
                lasagne.layers.helper.get_all_params(
                    self._model.getActorNetwork()))))
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getActorNetwork())
        # Substitutions for the critic update: state, action and target column.
        self._givens_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            # self._Action:  self._q_valsActTarget,
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._Fallen: self._fallen_shared
            self._Tmp_Target:
            self._tmp_target_shared
        }
        # Substitutions for actor-related functions: only the state is bound.
        self._actGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model.getActionSymbolicVariable(): self._model.getActions(),
            # self._Fallen: self._fallen_shared
            # self._tmp_diff: self._tmp_diff_shared
        }

        # L2 penalty over the critic network parameters, scaled by the
        # critic regularization weight.
        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))

        ## MSE update
        self._value_grad = T.grad(self._loss + self._critic_regularization,
                                  self._params)
        # NOTE(review): the message reports settings['optimizer'], but the
        # update rule constructed below is always Adam.
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_ = lasagne.updates.adam(self._value_grad,
                                              self._params,
                                              self._critic_learning_rate,
                                              beta1=0.9,
                                              beta2=0.9,
                                              epsilon=self._rms_epsilon)

        self._givens_grad = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
        }

        ## Some cool stuff to backprop action gradients

        # Symbolic placeholder for an action gradient. It is not referenced
        # again in this method; the shared buffer below is what is used.
        self._action_grad = T.matrix("Action_Grad")
        self._action_grad.tag.test_value = np.zeros(
            (self._batch_size, self._action_length),
            dtype=np.dtype(self.getSettings()['float_type']))

        # Shared buffer holding the gradient with respect to the actor output;
        # it is injected into the actor graph through known_grads below.
        self._action_grad_shared = theano.shared(
            np.zeros((self._batch_size, self._action_length),
                     dtype=self.getSettings()['float_type']))

        ### Maximize wrt q function

        # Gradients of the actor parameters given an externally supplied
        # gradient at the actor output. The trailing comma after the call wraps
        # the result in a 1-tuple, which is why element [0] is taken below.
        self._action_mean_grads = T.grad(
            cost=None,
            wrt=self._actionParams,
            known_grads={self._q_valsActA: self._action_grad_shared}),
        print("Action grads: ", self._action_mean_grads[0])
        ## When passing in gradients it needs to be a proper list of gradient expressions
        self._action_mean_grads = list(self._action_mean_grads[0])
        # print ("isinstance(self._action_mean_grads, list): ", isinstance(self._action_mean_grads, list))
        # print ("Action grads: ", self._action_mean_grads)
        self._actionGRADUpdates = lasagne.updates.adam(
            self._action_mean_grads,
            self._actionParams,
            self._learning_rate,
            beta1=0.9,
            beta2=0.9,
            epsilon=self._rms_epsilon)

        self._actGradGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model.getActionSymbolicVariable(): self._model.getActions(),
            # self._Fallen: self._fallen_shared,
            # self._advantage: self._advantage_shared,
            # self._KL_Weight: self._kl_weight_shared
        }

        # theano.gradient.grad_clip(x, lower_bound, upper_bound) # // TODO
        # self._actionUpdates = lasagne.updates.adam(-T.mean(self._q_vals_train_policy) +
        #   (self._decay_weight * lasagne.regularization.regularize_network_params(
        #       self._model.getActorNetwork(), lasagne.regularization.l2)), self._actionParams,
        #           self._learning_rate, beta1=0.9, beta2=0.9, epsilon=self._rms_epsilon)

        # Optional separate state-value function, enabled by the
        # 'train_extra_value_function' setting.
        if ('train_extra_value_function' in self.getSettings() and
            (self.getSettings()['train_extra_value_function'] == True)):
            self._valsA = lasagne.layers.get_output(
                self._model._value_function,
                self._model.getStateSymbolicVariable(),
                deterministic=True)
            self._valsA_drop = lasagne.layers.get_output(
                self._model._value_function,
                self._model.getStateSymbolicVariable(),
                deterministic=False)
            self._valsNextState = lasagne.layers.get_output(
                self._model._value_function,
                self._model.getResultStateSymbolicVariable(),
                deterministic=True)
            self._valsTargetNextState = lasagne.layers.get_output(
                self._modelTarget._value_function,
                self._model.getResultStateSymbolicVariable(),
                deterministic=True)
            self._valsTarget = lasagne.layers.get_output(
                self._modelTarget._value_function,
                self._model.getStateSymbolicVariable(),
                deterministic=True)

            # One-step target: reward plus the discounted copied-model value of
            # the result state.
            # self._target = T.mul(T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsB )), self._Fallen)
            # self._target = self._model.getRewardSymbolicVariable() + ((self._discount_factor * self._q_valsTargetNextState ) * self._NotFallen) + (self._NotFallen - 1)
            self._v_target = self._model.getRewardSymbolicVariable() + (
                self._discount_factor * self._valsTargetNextState)
            self._v_diff = self._v_target - self._valsA
            # loss = 0.5 * self._diff ** 2
            loss_v = T.pow(self._v_diff, 2)
            self._v_loss = T.mean(loss_v)

            self._params_value = lasagne.layers.helper.get_all_params(
                self._model._value_function)
            self._givens_value = {
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getResultStateSymbolicVariable():
                self._model.getResultStates(),
                self._model.getRewardSymbolicVariable():
                self._model.getRewards(),
                # self._NotFallen: self._NotFallen_shared
                # self._model.getActionSymbolicVariable(): self._actions_shared,
            }
            self._value_regularization = (
                self._critic_regularization_weight *
                lasagne.regularization.regularize_network_params(
                    self._model._value_function, lasagne.regularization.l2))

            # NOTE(review): this rebinds self._value_grad, which held the
            # critic gradients computed earlier. self._updates_ was already
            # built from the old value, so it is unaffected, but any later
            # reader of self._value_grad sees the value-function gradients.
            self._value_grad = T.grad(
                self._v_loss + self._value_regularization, self._params_value)
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_value = lasagne.updates.adam(
                self._value_grad,
                self._params_value,
                self._critic_learning_rate,
                beta1=0.9,
                beta2=0.9,
                epsilon=self._rms_epsilon)
            ## TD update
        # Explicit class call: runs DPG's own compile even if a subclass
        # overrides compile.
        DPG.compile(self)
Esempio n. 2
0
    def __init__(self, num_actions):
        """Build the Q-network graph and compile the Theano functions.

        Hyper-parameters are taken from module-level constants
        (``BATCH_SIZE``, ``DISCOUNT_RATE``, ``UPDATE_RULE``, ...). The method
        creates shared buffers for one minibatch, builds the primary network
        (and a secondary one when ``TARGET_STEPS > 0``), defines the clipped
        TD loss and compiles ``self._train`` and ``self._q_vals``.

        Args:
            num_actions: stored as ``self.num_actions``; not otherwise used in
                this method (presumably read by ``build_network`` -- confirm).

        Raises:
            ValueError: if ``self.update_rule`` is neither
                ``'deepmind_rmsprop'`` nor ``'rmsprop'``.
        """
        # remember parameters
        self.num_actions = num_actions
        self.batch_size = BATCH_SIZE
        self.discount_rate = DISCOUNT_RATE
        self.history_length = HISTORY_LENGTH
        self.screen_dim = DIMS
        self.img_height = SCREEN_HEIGHT
        self.img_width = SCREEN_WIDTH
        self.clip_error = CLIP_ERROR
        self.input_color_scale = COLOR_SCALE

        self.target_steps = TARGET_STEPS
        self.train_iterations = TRAIN_STEPS
        self.train_counter = 0
        self.momentum = MOMENTUM
        self.update_rule = UPDATE_RULE
        self.learning_rate = LEARNING_RATE
        self.rms_decay = RMS_DECAY
        self.rms_epsilon = RMS_EPSILON

        self.rng = np.random.RandomState(RANDOM_SEED)

        # Seed Lasagne's weight initialisation with the same generator.
        lasagne.random.set_rng(self.rng)

        # prepare tensors once and reuse them
        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        # terminals are stored as int8 (0/1 flags)
        terminals = T.bcol('terminals')

        # Shared buffers holding one minibatch; they are substituted for the
        # symbolic inputs through `givens` when the functions are compiled.
        self.states_shared = theano.shared(
            np.zeros((self.batch_size, self.history_length, self.img_height, self.img_width),
                     dtype=theano.config.floatX))

        self.next_states_shared = theano.shared(
            np.zeros((self.batch_size, self.history_length, self.img_height, self.img_width),
                     dtype=theano.config.floatX))

        # The column buffers are declared broadcastable along axis 1 so that
        # their type matches the T.col / T.icol / T.bcol variables above.
        self.rewards_shared = theano.shared(
            np.zeros((self.batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((self.batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((self.batch_size, 1), dtype='int8'),
            broadcastable=(False, True))

        # can add multiple nets here
        self.l_primary = self.build_network()

        if self.target_steps > 0:
            self.l_secondary = self.build_network()
            self.copy_to_secondary()

        # The division by input_color_scale normalises the input; it could
        # equally be applied to the data before it reaches the network.

        # define output symbols
        q_vals = lasagne.layers.get_output(self.l_primary, states / self.input_color_scale)

        if self.target_steps > 0:
            q_vals_secondary = lasagne.layers.get_output(self.l_secondary, next_states / self.input_color_scale)
        else:
            # Without a separate network the next-state values come from the
            # primary network; disconnected_grad keeps the loss gradient from
            # flowing through the target term.
            q_vals_secondary = lasagne.layers.get_output(self.l_primary, next_states / self.input_color_scale)
            q_vals_secondary = theano.gradient.disconnected_grad(q_vals_secondary)

        # target = r + (1 - terminal) * discount * max_a Q(next_state, a)
        target = (rewards + (T.ones_like(terminals) - terminals) * self.discount_rate * T.max(q_vals_secondary, axis=1, keepdims=True))

        # For every row pick the Q-value at that row's action index, then
        # reshape to a column so it lines up with `target`.
        diff = target - q_vals[T.arange(self.batch_size),
                               actions.reshape((-1,))].reshape((-1, 1))

        if self.clip_error > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            #
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_error)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_error * linear_part
        else:
            loss = 0.5 * diff ** 2

        # The loss is summed (not averaged) over the minibatch.
        loss = T.sum(loss)

        params = lasagne.layers.helper.get_all_params(self.l_primary)

        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }

        g_time = time.time()
        logger.info("graph compiling")

        if self.update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.learning_rate, self.rms_decay,
                                       self.rms_epsilon)
        elif self.update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.learning_rate, self.rms_decay,
                                              self.rms_epsilon)
        else:
            # Use the attribute: there is no local named `update_rule`, so
            # the bare name raised NameError instead of this ValueError.
            raise ValueError("Unrecognized update: {}".format(self.update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        # One training step: returns [loss, q_vals] and applies `updates`.
        self._train = theano.function([], [loss, q_vals], updates=updates,
                                      givens=givens)
        # Forward pass on whatever is currently stored in states_shared.
        self._q_vals = theano.function([], q_vals,
                                       givens={states: self.states_shared})

        logger.info("Theano Graph Compiled !! %f", time.time() - g_time)
Esempio n. 3
0
    def __init__(self, n_actions, replay_memory, build_network, updates, screen_size, initial_weights_file=None):
        """Set up agent state, build both networks and compile the Theano functions.

        Args:
            n_actions: number of network outputs; also the size of the identity
                lookup table stored in ``self.a_lookup``.
            replay_memory: stored as ``self.replay_memory``.
            build_network: callable invoked as
                ``build_network(n_actions=..., input_var=..., screen_size=...)``;
                called once for the live network and once for the stale copy.
            updates: callable invoked as ``updates(loss, params)``; its result is
                passed as the ``updates`` argument of the training function.
            screen_size: two-element iterable unpacked as ``(width, height)``.
            initial_weights_file: not used in this method.
        """
        self.screen_width, self.screen_height = screen_size

        # --- bookkeeping ---
        self.mood_q = None
        self.last_q = 0
        self.n_parameter_updates = 0
        self.i_action = 0
        self.state = None
        self.replay_memory = replay_memory

        # --- hyper-parameters ---
        # Open questions kept from the original author:
        #   update frequency ?
        #   gradient momentum ? 0.95
        #   squared gradient momentum ? 0.95
        #   min squared gradient ? 0.01
        self.alpha = 0.00025
        self.gamma = 0.99
        self.minibatch_size = 32
        # self.replay_memory_size = 1000000
        self.replay_start_size = 50000
        self.final_exploration_frame = 1000000
        self.target_network_update_frequency = 10000

        # Exploration rate starts at its initial value.
        self.initial_epsilon = 1
        self.final_epsilon = 0.1
        self.epsilon = self.initial_epsilon

        # --- persistence / logging cadence ---
        self.save_every_n_frames = 100000  # ~ once per hour
        self.log_frequency = 1

        # --- symbolic inputs ---
        state_now = T.tensor4("s0", dtype=theano.config.floatX)
        action_now = T.bmatrix("a0")
        reward_now = T.wcol("r0")
        state_next = T.tensor4("s1", dtype=theano.config.floatX)
        future_flag = T.bcol("future_reward_indicator")

        self.n_actions = n_actions
        self.a_lookup = np.eye(self.n_actions, dtype=np.int8)

        frame_shape = (self.screen_height, self.screen_width)

        # --- live network, fed with s0 cast to float32 and divided by 256 ---
        scaled_now = T.cast(state_now, 'float32') / np.float32(256)
        self.network = build_network(n_actions=self.n_actions,
                                     input_var=scaled_now,
                                     screen_size=frame_shape)
        print("Compiling forward.")
        live_eval = lasagne.layers.get_output(self.network, deterministic=True)
        self.forward = theano.function([state_now], live_eval)

        # --- stale network, fed with s1 scaled the same way ---
        scaled_next = T.cast(state_next, 'float32') / np.float32(256)
        self.network_stale = build_network(n_actions=self.n_actions,
                                           input_var=scaled_next,
                                           screen_size=frame_shape)
        print("Compiling forward_stale.")
        stale_eval = lasagne.layers.get_output(self.network_stale, deterministic=True)
        self.forward_stale = theano.function([state_next], stale_eval)

        self._update_network_stale()

        # --- loss over the outputs of both networks ---
        out = lasagne.layers.get_output(self.network)
        out_stale = lasagne.layers.get_output(self.network_stale)
        loss_parts = build_loss(out=out,
                                out_stale=out_stale,
                                a0_var=action_now,
                                r0_var=reward_now,
                                future_reward_indicator_var=future_flag,
                                gamma=self.gamma)
        self.loss, self.err, y_expr, q_expr = loss_parts

        trainable = lasagne.layers.get_all_params(self.network, trainable=True)

        # Both compiled functions take the same five inputs, in this order.
        batch_inputs = (state_now, action_now, reward_now, state_next, future_flag)

        print("Compiling train_fn.")
        train_outputs = [self.loss, self.err, T.transpose(y_expr), T.transpose(q_expr), out, out_stale]
        self.train_fn = theano.function(list(batch_inputs),
                                        train_outputs,
                                        updates=updates(self.loss, trainable))
        print("Compiling loss_fn.")
        self.loss_fn = theano.function(list(batch_inputs), self.loss)
Esempio n. 4
0
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_):
        """
        Build the symbolic expressions and update rules for the CACLA learner,
        then call ``CACLA.compile(self)``.

        All constructor arguments are forwarded unchanged to the superclass
        constructor; ``model`` is stored as ``self._model`` and deep-copied
        into ``self._modelTarget``.

        The method reads ``self._batch_size``, ``self._discount_factor``,
        ``self._regularization_weight``, ``self._learning_rate``, ``self._rho``
        and ``self._rms_epsilon`` without assigning them here (presumably they
        are set by the superclass constructor -- confirm).
        """

        super(CACLA,self).__init__(model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_)
        
        # create a small convolutional neural network
        
        # Symbolic int8 column that multiplies the critic target below, plus
        # the shared buffer bound to it through self._givens_.
        self._Fallen = T.bcol("Fallen")
        self._Fallen.tag.test_value = np.zeros((self._batch_size,1),dtype=np.dtype('int8'))
        
        self._fallen_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype='int8'),
            broadcastable=(False, True))
        self._usingDropout = True
        # The string literal below is disabled code kept by the original author.
        """
        self._target_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype='float64'),
            broadcastable=(False, True))
        """
        self._critic_regularization_weight = self.getSettings()["critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
        # primary network
        self._model = model
        # Target network
        self._modelTarget = copy.deepcopy(model)
        
        # Critic outputs. The "_drop" variants are built with
        # deterministic=False, the others with deterministic=True.
        self._q_valsA = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)
        self._q_valsA_drop = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)
        self._q_valsTargetNextState = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True)
        self._q_valsTarget = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)
        self._q_valsTarget_drop = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)
        
        # Actor outputs for the current state, from the primary and the target model.
        self._q_valsActA = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)
        self._q_valsActTarget = lasagne.layers.get_output(self._modelTarget.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)
        self._q_valsActA_drop = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)
        
        # Aliases for the expressions built above.
        self._q_func = self._q_valsA
        self._q_funcTarget = self._q_valsTarget
        self._q_func_drop = self._q_valsA_drop
        self._q_funcTarget_drop = self._q_valsTarget_drop
        self._q_funcAct = self._q_valsActA
        self._q_funcAct_drop = self._q_valsActA_drop
        # self._q_funcAct = theano.function(inputs=[self._model.getStateSymbolicVariable()], outputs=self._q_valsActA, allow_input_downcast=True)
        
        # self._target = (self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState )) * theano.tensor.maximum(1.0, theano.tensor.ceil(self._model.getRewardSymbolicVariable())) # Did not understand how the maximum was working
        # self._target = (self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState )) * theano.tensor.ceil(self._model.getRewardSymbolicVariable())
        ## Don't need to use dropout for the target network
        # Target: (reward + discount * target-critic value of the result state),
        # multiplied element-wise by the Fallen column.
        self._target = (self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState )) * self._Fallen
        # self._target = self._model.getTargetSymbolicVariable()
        ## When there is no dropout in the network it will have no effect here
        self._diff = self._target - self._q_func
        self._diff_drop = self._target - self._q_func_drop 
        loss = 0.5 * self._diff ** 2 
        self._loss = T.mean(loss)
        # self._loss_drop = T.mean(0.5 * (self._diff_drop ** 2))
        
        self._params = lasagne.layers.helper.get_all_params(self._model.getCriticNetwork())
        self._actionParams = lasagne.layers.helper.get_all_params(self._model.getActorNetwork())
        # Substitutions for the critic update: state, result state, reward and Fallen.
        self._givens_ = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            self._Fallen: self._fallen_shared
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }
        # Substitutions for the actor update: state and action.
        self._actGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._next_states_shared,
            # self._model.getRewardSymbolicVariable(): self._rewards_shared,
            self._model.getActionSymbolicVariable(): self._model.getActions()
        }
        
        # L2 penalties over the critic and actor parameters, each scaled by
        # its own weight.
        self._critic_regularization = (self._critic_regularization_weight * lasagne.regularization.regularize_network_params(
        self._model.getCriticNetwork(), lasagne.regularization.l2))
        self._actor_regularization = (self._regularization_weight * lasagne.regularization.regularize_network_params(
                self._model.getActorNetwork(), lasagne.regularization.l2))
        # SGD update
        # self._updates_ = lasagne.updates.rmsprop(self._loss + self._critic_regularization, self._params, 
        #                         self._learning_rate, self._rho, self._rms_epsilon)
        # TD update
        # RMSprop on (mean critic output + regularization), with the learning
        # rate multiplied by the negated mean of self._diff. self._loss itself
        # is not used by this update.
        self._updates_ = lasagne.updates.rmsprop(T.mean(self._q_func) + self._critic_regularization, self._params, 
                    self._critic_learning_rate * -T.mean(self._diff), self._rho, self._rms_epsilon)
        
        
        # actDiff1 = (self._model.getActionSymbolicVariable() - self._q_valsActTarget) #TODO is this correct?
        # actDiff = (actDiff1 - (self._model.getActionSymbolicVariable() - self._q_valsActA))
        # self._actDiff = ((self._model.getActionSymbolicVariable() - self._q_valsActA)) # Target network does not work well here?
        # Actor loss: squared difference between the supplied action and the
        # actor output built with deterministic=False.
        self._actDiff = ((self._model.getActionSymbolicVariable() - self._q_valsActA_drop)) # Target network does not work well here?
        # self._actLoss = 0.5 * (self._actDiff ** 2) 
        ## Should produce a single column vector of costs for each sample in the batch
        self._actLoss_ = T.mean(T.pow(self._actDiff, 2),axis=1) 
        # self._actLoss = T.sum(self._actLoss)/float(self._batch_size) 
        self._actLoss = T.mean(self._actLoss_)
        # self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop ** 2)/float(self._batch_size)) # because the number of rows can shrink
        # self._actLoss_drop = (T.mean(0.5 * self._actDiff_drop ** 2))
        ## Computes the distance between actions weighted by the distances between the states that result in those actions
        # The string literal below is disabled code kept by the original author.
        """
        state_sum = T.mean(T.pow(self._model.getStateSymbolicVariable(),2), axis=1)
        Distance = ((((state_sum + T.reshape(state_sum, (1,-1)).T) - 2*T.dot(self._model.getStateSymbolicVariable(), self._model.getStateSymbolicVariable().T))))
        action_sum = T.mean(T.pow(self._q_valsActA_drop,2), axis=1)
        Distance_action = ((((action_sum + T.reshape(action_sum, (1,-1)).T) - 2*T.dot(self._q_valsActA_drop, self._q_valsActA_drop.T))))
        weighted_dist = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(Distance, Distance_action)
        self._weighted_mean_dist = T.mean(weighted_dist, axis=1)
        """
        ## Entropy from A3C, make sure network is not producing same action for everything..
        # self.entropy = -T.mean(T.sum(self._q_valsActA_drop, axis=0))
        # self._weighted_entropy = -T.mean(self._weighted_mean_dist)
        # The entropy term is fixed at 0, so it adds nothing to the actor
        # objective below.
        self._weighted_entropy = 0
        
        
        self._actionUpdates = lasagne.updates.rmsprop(self._actLoss + self._actor_regularization + (0.00001 * self._weighted_entropy), 
                                self._actionParams, self._learning_rate , self._rho, self._rms_epsilon)
        
        # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) + 
        #   (self._regularization_weight * lasagne.regularization.regularize_network_params(
        #       self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams, 
        #           self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon)
        self._givens_grad = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }
        
        ## Bellman error
        # Difference between the target and the target critic's own output for
        # the current state.
        self._bellman = self._target - self._q_funcTarget
        # Explicit class call: runs CACLA's own compile even if a subclass
        # overrides compile.
        CACLA.compile(self)
    def __init__(self,
                 input_size,
                 output_size,
                 build_network=simple_network2,
                 discount=0.99,
                 learningRate=0.001,
                 frozen_network_update_time=1000):
        """Build the online and frozen Q networks and compile Theano functions.

        Parameters
        ----------
        input_size, output_size
            Forwarded unchanged to ``build_network`` for both networks.
        build_network : callable
            Called as ``build_network(input_size, output_size)``; the result
            is handed to Lasagne as a network.
        discount
            Stored on ``self.discount`` (not used directly in this method;
            presumably consumed by ``build_cost_function`` -- TODO confirm).
        learningRate
            Learning rate passed to the RMSProp update rule.
        frozen_network_update_time
            Stored on ``self.frozen_network_update_time``; not otherwise used
            in this method.

        Side effects: sets ``self.network``, ``self.frozen_network``,
        ``self.forward_pass``, ``self.frozen_forward_pass``, ``self._train``
        and the logging dictionary ``self.log``.
        """
        # Single-argument print(...) produces the same output on Python 2 and
        # Python 3, whereas the bare print statement is a SyntaxError on 3.
        print("Initializing new Q network")

        self.input_size = input_size
        self.output_size = output_size
        self.discount = discount
        self.learningRate = learningRate

        self.frozen_network_update_time = frozen_network_update_time
        self.frozen_timer = 0
        self.epoch = 0

        # logging variables
        self.log = {
            "batchMeanQValue": [],
            "batchMeanTargetQValue": [],
            "cost": [],
            'performance': [],
            'epoch': []
        }

        # symbolic inputs
        sym_state = T.tensor4('state')  #Batchsize, channels, X, Y
        sym_action = T.icol('action')
        sym_reward = T.col('reward')
        sym_isDone = T.bcol('isDone')
        sym_nextState = T.tensor4('nextState')

        # networks: both are built by the same factory, then the frozen copy
        # is synchronised through update_frozen_network().
        self.network = build_network(input_size, output_size)
        self.frozen_network = build_network(input_size, output_size)
        self.update_frozen_network()

        # forward pass (deterministic=True is passed through to Lasagne)
        print("Compiling forward passes")
        self.forward_pass = theano.function([sym_state],
                                            lasagne.layers.get_output(
                                                self.network,
                                                sym_state,
                                                deterministic=True))

        self.frozen_forward_pass = theano.function([sym_state],
                                                   lasagne.layers.get_output(
                                                       self.frozen_network,
                                                       sym_state,
                                                       deterministic=True))

        #clipped_reward = T.clip(sym_reward,-1,1)
        #cost function definition
        cost, error, q_action, q_target = self.build_cost_function(
            sym_state, sym_action, sym_reward, sym_isDone, sym_nextState)

        # Only the online network's trainable parameters are updated.
        params = lasagne.layers.get_all_params(self.network, trainable=True)
        update_function = lasagne.updates.rmsprop(
            cost, params, learning_rate=self.learningRate)

        # training function: returns the cost plus diagnostics and applies the
        # RMSProp updates on every call.
        print("Compiling training function")
        self._train = theano.function(
            [sym_state, sym_action, sym_reward, sym_isDone, sym_nextState],
            [cost, error, q_action, q_target],
            updates=update_function)
Esempio n. 6
0
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):
        """Build the symbolic graph, losses and update rules for MBPG.

        All constructor arguments are forwarded unchanged to the parent
        class. ``model`` additionally becomes ``self._model`` and a deep copy
        of it becomes the target network ``self._modelTarget``.

        This method then defines, in order:

        * symbolic inputs and matching shared buffers (not-fallen flag,
          advantage, DYNA target, KL weight);
        * critic outputs for the online and target networks;
        * actor outputs, split at ``self._action_length`` into the leading
          columns (used as the mean) and the remaining columns (used as the
          std after scaling by ``exploration_rate`` and adding ``2e-2``);
        * the critic loss and its update rule;
        * a clipped probability-ratio policy loss and its update rule;
        * a combined loss over critic + actor (+ optional encoder) parameters;
        * a DYNA critic update on the result-state input;
        * an update that back-propagates externally supplied action gradients.

        Settings keys read here: ``float_type``,
        ``previous_value_regularization_weight``,
        ``critic_regularization_weight``, ``critic_learning_rate``,
        ``exploration_rate``, ``optimizer``, ``kl_divergence_threshold``,
        ``std_entropy_weight``, ``policy_loss_weight`` and, optionally,
        ``train_state_encoding``.

        Attributes such as ``self._batch_size``, ``self._action_length``,
        ``self._discount_factor``, ``self._regularization_weight``,
        ``self._learning_rate``, ``self._rho`` and ``self._rms_epsilon`` are
        read but not assigned here; presumably the parent constructor sets
        them -- TODO confirm.

        The process exits with status -1 if ``optimizer`` is not one of
        ``rmsprop``, ``momentum``, ``adam`` or ``adagrad``.
        """

        super(MBPG, self).__init__(model, n_in, n_out, state_bounds,
                                   action_bounds, reward_bound, settings_)
        # scale = (bounds[1][i]-bounds[0][i])/2.0
        # create a small convolutional neural network

        # self._action_std_scaling = (self._action_bounds[1] - self._action_bounds[0]) / 2.0

        self._NotFallen = T.bcol("Not_Fallen")
        ## because float64 <= float32 * int32, need to use int16 or int8
        self._NotFallen.tag.test_value = np.zeros((self._batch_size, 1),
                                                  dtype=np.dtype('int8'))

        self._NotFallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                        dtype='int8'),
                                               broadcastable=(False, True))

        self._advantage = T.col("Advantage")
        self._advantage.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._advantage_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                               broadcastable=(False, True))

        self._dyna_target = T.col("DYNA_Target")
        self._dyna_target.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._dyna_target_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                                 broadcastable=(False, True))

        self._KL_Weight = T.scalar("KL_Weight")
        self._KL_Weight.tag.test_value = np.zeros(
            (1), dtype=np.dtype(self.getSettings()['float_type']))[0]

        self._kl_weight_shared = theano.shared(
            np.ones((1), dtype=self.getSettings()['float_type'])[0])
        self._kl_weight_shared.set_value(
            self.getSettings()['previous_value_regularization_weight'])
        """
        self._target_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype='float64'),
            broadcastable=(False, True))
        """
        self._critic_regularization_weight = self.getSettings(
        )["critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
        # primary network
        self._model = model
        # Target network
        self._modelTarget = copy.deepcopy(model)

        # Critic outputs. The "_drop" variants are built with
        # deterministic=False; the others with deterministic=True.
        self._q_valsA = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsA_drop = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)
        self._q_valsNextState = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTargetNextState = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTarget = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTarget_drop = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)

        # Actor output is split column-wise at self._action_length: the
        # leading columns are used as the mean, the rest as the std.
        self._q_valsActA = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)[:, :self._action_length]
        # self._q_valsActA = scale_action(self._q_valsActA, self._action_bounds)
        self._q_valsActASTD = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)[:, self._action_length:]

        ## prevent value from being 0
        """
        if ( 'use_fixed_std' in self.getSettings() and ( self.getSettings()['use_fixed_std'])): 
            self._q_valsActASTD = ( T.ones_like(self._q_valsActA)) * self.getSettings()['exploration_rate']
            # self._q_valsActASTD = ( T.ones_like(self._q_valsActA)) * self.getSettings()['exploration_rate']
        else:
        """
        self._q_valsActASTD = ((self._q_valsActASTD) *
                               self.getSettings()['exploration_rate']) + 2e-2

        self._q_valsActTarget = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getStateSymbolicVariable())[:, :self._action_length]
        # self._q_valsActTarget = scale_action(self._q_valsActTarget, self._action_bounds)
        self._q_valsActTargetSTD = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getStateSymbolicVariable())[:, self._action_length:]
        """
        if ( 'use_fixed_std' in self.getSettings() and ( self.getSettings()['use_fixed_std'])): 
            self._q_valsActTargetSTD = (T.ones_like(self._q_valsActTarget)) * self.getSettings()['exploration_rate']
            # self._q_valsActTargetSTD = (self._action_std_scaling * T.ones_like(self._q_valsActTarget)) * self.getSettings()['exploration_rate']
        else:
        """
        self._q_valsActTargetSTD = (
            (self._q_valsActTargetSTD) *
            self.getSettings()['exploration_rate']) + 2e-2
        self._q_valsActA_drop = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)

        # Aliases for the expressions defined above.
        self._q_func = self._q_valsA
        self._q_funcTarget = self._q_valsTarget
        self._q_func_drop = self._q_valsA_drop
        self._q_funcTarget_drop = self._q_valsTarget_drop
        self._q_funcAct = self._q_valsActA
        self._q_funcAct_drop = self._q_valsActA_drop

        # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._NotFallen
        # self._target = T.mul(T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsTargetNextState )), self._NotFallen) + (self._NotFallen - 1)
        # One-step target: reward + discount * target-critic(result state).
        self._target = self._model.getRewardSymbolicVariable() + (
            self._discount_factor * self._q_valsTargetNextState)
        self._diff = self._target - self._q_func
        self._diff_drop = self._target - self._q_func_drop
        # loss = 0.5 * self._diff ** 2
        loss = 0.5 * T.pow(self._diff, 2)
        self._loss = T.mean(loss)
        self._loss_drop = T.mean(0.5 * self._diff_drop**2)

        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getActorNetwork())
        # Substitution maps from symbolic inputs to the model's stored values.
        self._givens_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards(),
            # self._NotFallen: self._NotFallen_shared
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }
        self._actGivens = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            # self._NotFallen: self._NotFallen_shared,
            self._advantage:
            self._advantage_shared,
            # self._KL_Weight: self._kl_weight_shared
        }

        self._allGivens = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            # self._NotFallen: self._NotFallen_shared,
            self._advantage:
            self._advantage_shared,
            # self._KL_Weight: self._kl_weight_shared
        }

        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))
        self._actor_regularization = (
            self._regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getActorNetwork(), lasagne.regularization.l2))
        self._kl_firstfixed = T.mean(
            kl(self._q_valsActTarget, self._q_valsActTargetSTD,
               self._q_valsActA, self._q_valsActASTD, self._action_length))
        # self._actor_regularization = (( self.getSettings()['previous_value_regularization_weight']) * self._kl_firstfixed )
        # self._actor_regularization = (( self._KL_Weight ) * self._kl_firstfixed ) + (10*(self._kl_firstfixed>self.getSettings()['kl_divergence_threshold'])*
        #                                                                              T.square(self._kl_firstfixed-self.getSettings()['kl_divergence_threshold']))
        # NOTE(review): this term is linear in the std (no logarithm), so it
        # is not the closed-form Gaussian entropy -- confirm this is intended.
        self._actor_entropy = 0.5 * T.mean((2 * np.pi * self._q_valsActASTD))
        # SGD update
        # self._updates_ = lasagne.updates.rmsprop(self._loss + (self._regularization_weight * lasagne.regularization.regularize_network_params(
        # self._model.getCriticNetwork(), lasagne.regularization.l2)), self._params, self._learning_rate, self._rho,
        #                                    self._rms_epsilon)
        self._value_grad = T.grad(self._loss + self._critic_regularization,
                                  self._params)
        ## Clipping the max gradient
        """
        for x in range(len(self._value_grad)): 
            self._value_grad[x] = T.clip(self._value_grad[x] ,  -0.1, 0.1)
        """
        # NOTE(review): the rmsprop branch uses self._learning_rate while the
        # other critic branches use self._critic_learning_rate, and adam uses
        # beta2=0.9 here but 0.999 elsewhere in this method -- confirm.
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.rmsprop(self._value_grad,
                                                     self._params,
                                                     self._learning_rate,
                                                     self._rho,
                                                     self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.momentum(
                self._value_grad,
                self._params,
                self._critic_learning_rate,
                momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.adam(self._value_grad,
                                                  self._params,
                                                  self._critic_learning_rate,
                                                  beta1=0.9,
                                                  beta2=0.9,
                                                  epsilon=self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'adagrad'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.adagrad(
                self._value_grad,
                self._params,
                self._critic_learning_rate,
                epsilon=self._rms_epsilon)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])
            sys.exit(-1)
        ## Need to perform an element wise operation or replicate _diff for this to work properly.
        # self._actDiff = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._model.getActionSymbolicVariable() - self._q_valsActA),
        #                                                                    theano.tensor.tile((self._advantage * (1.0/(1.0-self._discount_factor))), self._action_length)) # Target network does not work well here?

        ## advantage = Q(a,s) - V(s) = (r + gamma*V(s')) - V(s)
        # self._advantage = (((self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState)) * self._NotFallen)) - self._q_func

        self._Advantage = self._advantage  #  * (1.0/(1.0-self._discount_factor)) ## scale back to same as rewards
        # self._log_prob = loglikelihood(self._model.getActionSymbolicVariable(), self._q_valsActA, self._q_valsActASTD, self._action_length)
        # self._log_prob_target = loglikelihood(self._model.getActionSymbolicVariable(), self._q_valsActTarget, self._q_valsActTargetSTD, self._action_length)
        ### Only change the std
        # Both likelihoods use the target network's mean; only the std
        # argument differs between them.
        self._prob = likelihood(self._model.getActionSymbolicVariable(),
                                self._q_valsActTarget, self._q_valsActASTD,
                                self._action_length)
        self._prob_target = likelihood(self._model.getActionSymbolicVariable(),
                                       self._q_valsActTarget,
                                       self._q_valsActTargetSTD,
                                       self._action_length)
        ## This does the sum already
        self._r = (self._prob / self._prob_target)
        self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            (self._r), self._Advantage)
        ppo_epsilon = self.getSettings()['kl_divergence_threshold']
        # The clipped ratio and the advantage must be passed as two separate
        # operands. Previously they were wrapped in one tuple, so the op
        # received a single input and the advantage was never multiplied in.
        self._actLoss_2 = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            theano.tensor.clip(self._r, 1.0 - ppo_epsilon, 1 + ppo_epsilon),
            self._Advantage)
        # Element-wise minimum of the unclipped and clipped surrogate terms.
        self._actLoss_ = theano.tensor.minimum((self._actLoss_),
                                               (self._actLoss_2))

        self._actLoss = (-1.0 *
                         (T.mean(self._actLoss_) +
                          (self.getSettings()['std_entropy_weight'] *
                           self._actor_entropy))) + self._actor_regularization

        self._policy_grad = T.grad(self._actLoss, self._actionParams)
        self._policy_grad = lasagne.updates.total_norm_constraint(
            self._policy_grad, 5)
        # NOTE(review): 'adagrad' is accepted for the critic above but has no
        # branch here, so with that setting this method only prints the
        # message below and does not assign self._actionUpdates.
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._actionUpdates = lasagne.updates.rmsprop(
                self._policy_grad, self._actionParams, self._learning_rate,
                self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                           self._actionParams,
                                                           self._learning_rate,
                                                           momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._actionUpdates = lasagne.updates.adam(self._policy_grad,
                                                       self._actionParams,
                                                       self._learning_rate,
                                                       beta1=0.9,
                                                       beta2=0.999,
                                                       epsilon=1e-08)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])

        # Combined loss: critic loss + critic regularization, the weighted
        # policy objective, actor regularization and (optionally) the state
        # encoding reconstruction error.
        if (('train_state_encoding' in self.getSettings())
                and (self.getSettings()['train_state_encoding'])):
            self._encoded_state = lasagne.layers.get_output(
                self._model.getEncodeNet(),
                self._model.getStateSymbolicVariable(),
                deterministic=True)
            self._encoding_loss = T.mean(
                T.pow(self._encoded_state - self._model.getStates(), 2))
            self._full_loss = (
                self._loss + self._critic_regularization +
                (-1.0 * self.getSettings()['policy_loss_weight'] *
                 (T.mean(self._actLoss_) +
                  (self.getSettings()['std_entropy_weight'] *
                   self._actor_entropy))) +
                (self._actor_regularization + self._encoding_loss))
        else:
            self._full_loss = (
                self._loss + self._critic_regularization +
                (-1.0 * self.getSettings()['policy_loss_weight'] *
                 (T.mean(self._actLoss_) +
                  (self.getSettings()['std_entropy_weight'] *
                   self._actor_entropy))) + self._actor_regularization)

        if (('train_state_encoding' in self.getSettings())
                and (self.getSettings()['train_state_encoding'])):
            self._encodeParams = lasagne.layers.helper.get_all_params(
                self._model.getEncodeNet())
            self._all_Params = self._params + self._actionParams + self._encodeParams
        else:
            # self._all_Params = self._params + self._actionParams[-3:]
            self._all_Params = self._params + self._actionParams
        print("Num params: ", len(self._all_Params), " params: ",
              len(self._params), " act params: ", len(self._actionParams))
        self._both_grad = T.grad(self._full_loss, self._all_Params)
        self._both_grad = lasagne.updates.total_norm_constraint(
            self._both_grad, 5)
        # NOTE(review): as with the actor update, there is no 'adagrad'
        # branch, so self._collectiveUpdates is not assigned for that setting.
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._collectiveUpdates = lasagne.updates.rmsprop(
                self._both_grad, self._all_Params, self._learning_rate,
                self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._collectiveUpdates = lasagne.updates.momentum(
                self._both_grad,
                self._all_Params,
                self._learning_rate,
                momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._collectiveUpdates = lasagne.updates.adam(self._both_grad,
                                                           self._all_Params,
                                                           self._learning_rate,
                                                           beta1=0.9,
                                                           beta2=0.999,
                                                           epsilon=1e-08)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])

        # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) +
        #   (self._regularization_weight * lasagne.regularization.regularize_network_params(
        #       self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams,
        #           self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon)
        self._givens_grad = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }

        ### _q_valsA because the predicted state is stored in self._model.getStateSymbolicVariable()
        # NOTE(review): the expression below actually uses
        # self._q_valsNextState (critic on the result-state input), and
        # _givens_dyna substitutes only the result-state variable.
        self._diff_dyna = self._dyna_target - self._q_valsNextState
        # loss = 0.5 * self._diff ** 2
        loss = 0.5 * T.pow(self._diff_dyna, 2)
        self._loss_dyna = T.mean(loss)

        self._dyna_grad = T.grad(self._loss_dyna + self._critic_regularization,
                                 self._params)

        self._givens_dyna = {
            # self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._NotFallen: self._NotFallen_shared
            # self._model.getActionSymbolicVariable(): self._actions_shared,
            self._dyna_target:
            self._dyna_target_shared
        }
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._DYNAUpdates = lasagne.updates.rmsprop(
                self._dyna_grad, self._params, self._learning_rate, self._rho,
                self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._DYNAUpdates = lasagne.updates.momentum(self._dyna_grad,
                                                         self._params,
                                                         self._learning_rate,
                                                         momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._DYNAUpdates = lasagne.updates.adam(self._dyna_grad,
                                                     self._params,
                                                     self._learning_rate,
                                                     beta1=0.9,
                                                     beta2=0.999,
                                                     epsilon=self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'adagrad'):
            self._DYNAUpdates = lasagne.updates.adagrad(
                self._dyna_grad,
                self._params,
                self._learning_rate,
                epsilon=self._rms_epsilon)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])

        ## Bellman error
        self._bellman = self._target - self._q_funcTarget

        ## Some cool stuff to backprop action gradients

        self._action_grad = T.matrix("Action_Grad")
        self._action_grad.tag.test_value = np.zeros(
            (self._batch_size, self._action_length),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._action_grad_shared = theano.shared(
            np.zeros((self._batch_size, self._action_length),
                     dtype=self.getSettings()['float_type']))

        ## When passing in gradients it needs to be a proper list of gradient expressions
        # The gradient at the actor's mean output is supplied through
        # known_grads (from the shared buffer) and propagated to the actor
        # parameters. This is the same list the original obtained by wrapping
        # the result in a one-element tuple and unwrapping it again.
        self._action_mean_grads = list(
            T.grad(cost=None,
                   wrt=self._actionParams,
                   known_grads={self._q_valsActA: self._action_grad_shared}))
        # print ("isinstance(self._action_mean_grads, list): ", isinstance(self._action_mean_grads, list))
        # print ("Action grads: ", self._action_mean_grads)
        self._actionGRADUpdates = lasagne.updates.adagrad(
            self._action_mean_grads,
            self._actionParams,
            self._learning_rate,
            epsilon=self._rms_epsilon)

        self._actGradGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model.getActionSymbolicVariable(): self._model.getActions(),
            # self._NotFallen: self._NotFallen_shared,
            # self._advantage: self._advantage_shared,
            # self._KL_Weight: self._kl_weight_shared
        }
        """
        self._get_grad = theano.function([], outputs=T.grad(cost=None, wrt=[self._model._actionInputVar] + self._params,
                                                            known_grads={self._forward: self._fd_grad_target_shared}), 
                                         allow_input_downcast=True, 
                                         givens= {
            self._model.getStateSymbolicVariable() : self._model.getStates(),
            # self._model.getResultStateSymbolicVariable() : self._model.getResultStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions(),
            # self._fd_grad_target : self._fd_grad_target_shared
        })
        """
        MBPG.compile(self)
Esempio n. 7
0
    def fit(self, data, sample_store=10000000, store_type='gpu'):
        '''
        Trains the network.

        Parameters
        --------
        data : pandas.DataFrame
            Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
            It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
        sample_store : int
            If additional negative samples are used (n_sample > 0), the efficiency of GPU utilization can be sped up, by precomputing a large batch of negative samples (and recomputing when necessary).
            This parameter regulizes the size of this precomputed ID set. Its value is the maximum number of int values (IDs) to be stored. Precomputed IDs are stored in the RAM.
            For the most efficient computation, a balance must be found between storing few examples and constantly interrupting GPU computations for a short time vs. computing many examples and interrupting GPU computations for a long time (but rarely).
        store_type : 'cpu', 'gpu'
            Where to store the negative sample buffer (sample store). The cpu mode is legacy and is no longer supported.

        '''
        self.predict = None
        self.error_during_train = False
        itemids = data[self.item_key].unique()
        self.n_items = len(itemids)
        self.itemidmap = pd.Series(data=np.arange(self.n_items),
                                   index=itemids,
                                   name='ItemIdx')
        data['ItemIdx'] = self.itemidmap[data[self.item_key].values].values
        offset_sessions = self.init(data)
        pop = data.groupby(self.item_key).size()
        if self.logq:
            self.P0 = theano.shared(
                pop[self.itemidmap.index.values].values.astype(
                    theano.config.floatX),
                name='P0',
                borrow=False)
        if self.n_sample:
            pop = pop[self.itemidmap.index.values].values**self.sample_alpha
            pop = pop.cumsum() / pop.sum()
            pop[-1] = 1
            if sample_store:
                generate_length = sample_store // self.n_sample
                if generate_length <= 1:
                    sample_store = 0
                    print('No example store was used')
                elif store_type == 'cpu':
                    neg_samples = self.generate_neg_samples(
                        pop, generate_length)
                    sample_pointer = 0
                    print(
                        'Created sample store with {} batches of samples (type=CPU)'
                        .format(generate_length))
                elif store_type == 'gpu':
                    P = theano.shared(pop.astype(theano.config.floatX),
                                      name='P')
                    self.ST = theano.shared(
                        np.zeros((generate_length, self.n_sample),
                                 dtype='int64'))
                    self.STI = theano.shared(np.asarray(0, dtype='int64'))
                    X = mrng.uniform((generate_length * self.n_sample, ))
                    updates_st = OrderedDict()
                    updates_st[self.ST] = gpu_searchsorted(
                        P, X, dtype_int64=True).reshape(
                            (generate_length, self.n_sample))
                    updates_st[self.STI] = np.asarray(0, dtype='int64')
                    generate_samples = theano.function([], updates=updates_st)
                    generate_samples()
                    sample_pointer = 0
                    print(
                        'Created sample store with {} batches of samples (type=GPU)'
                        .format(generate_length))
                else:
                    print('Invalid store type {}'.format(store_type))
                    raise NotImplementedError
            else:
                print('No example store was used')
        X = T.ivector(name='X')
        Y = T.ivector(name='Y')
        M = T.iscalar(name='M')
        R = T.bcol(name='R')
        H_new, Y_pred, sparams, full_params, sidxs = self.model(
            X, self.H, M, R, Y, self.dropout_p_hidden, self.dropout_p_embed)
        cost = self.loss_function(Y_pred, M) / self.batch_size
        params = [
            self.Wx if self.embedding or self.constrained_embedding else
            self.Wx[1:], self.Wh, self.Wrz, self.Bh
        ]
        updates = self.RMSprop(cost, params, full_params, sparams, sidxs)
        for i in range(len(self.H)):
            updates[self.H[i]] = H_new[i]
        if hasattr(self, 'STI'):
            updates[self.STI] = self.STI + 1
        train_function = function(inputs=[X, Y, M, R],
                                  outputs=cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
        base_order = np.argsort(
            data.groupby(self.session_key)[self.time_key].min().values
        ) if self.time_sort else np.arange(len(offset_sessions) - 1)
        data_items = data.ItemIdx.values
        for epoch in range(self.n_epochs):
            t0 = time.time()
            for i in range(len(self.layers)):
                self.H[i].set_value(np.zeros((self.batch_size, self.layers[i]),
                                             dtype=theano.config.floatX),
                                    borrow=True)
            c = []
            cc = []
            session_idx_arr = np.random.permutation(
                len(offset_sessions) -
                1) if self.train_random_order else base_order
            iters = np.arange(self.batch_size)
            maxiter = iters.max()
            start = offset_sessions[session_idx_arr[iters]]
            end = offset_sessions[session_idx_arr[iters] + 1]
            finished = False
            while not finished:
                minlen = (end - start).min()
                out_idx = data_items[start]
                for i in range(minlen - 1):
                    in_idx = out_idx
                    out_idx = data_items[start + i + 1]
                    if self.n_sample and store_type == 'cpu':
                        if sample_store:
                            if sample_pointer == generate_length:
                                neg_samples = self.generate_neg_samples(
                                    pop, generate_length)
                                sample_pointer = 0
                            sample = neg_samples[sample_pointer]
                            sample_pointer += 1
                        else:
                            sample = self.generate_neg_samples(pop, 1)
                        y = np.hstack([out_idx, sample])
                    else:
                        y = out_idx
                        if self.n_sample:
                            if sample_pointer == generate_length:
                                generate_samples()
                                sample_pointer = 0
                            sample_pointer += 1
                    reset = (start + i + 1 == end - 1)
                    cost = train_function(in_idx, y, len(iters),
                                          reset.reshape(len(reset), 1))
                    c.append(cost)
                    cc.append(len(iters))
                    if np.isnan(cost):
                        print(str(epoch) + ': NaN error!')
                        self.error_during_train = True
                        return
                start = start + minlen - 1
                finished_mask = (end - start <= 1)
                n_finished = finished_mask.sum()
                iters[finished_mask] = maxiter + np.arange(1, n_finished + 1)
                maxiter += n_finished
                valid_mask = (iters < len(offset_sessions) - 1)
                n_valid = valid_mask.sum()
                if (n_valid == 0) or (n_valid < 2 and self.n_sample == 0):
                    finished = True
                    break
                mask = finished_mask & valid_mask
                sessions = session_idx_arr[iters[mask]]
                start[mask] = offset_sessions[sessions]
                end[mask] = offset_sessions[sessions + 1]
                iters = iters[valid_mask]
                start = start[valid_mask]
                end = end[valid_mask]
                if n_valid < len(valid_mask):
                    for i in range(len(self.H)):
                        tmp = self.H[i].get_value(borrow=True)
                        tmp = tmp[valid_mask]
                        self.H[i].set_value(tmp, borrow=True)
            c = np.array(c)
            cc = np.array(cc)
            avgc = np.sum(c * cc) / np.sum(cc)
            if np.isnan(avgc):
                print('Epoch {}: NaN error!'.format(str(epoch)))
                self.error_during_train = True
                return
            t1 = time.time()
            dt = t1 - t0
            print(
                'Epoch{} --> loss: {:.6f} \t({:.2f}s) \t[{:.2f} mb/s | {:.0f} e/s]'
                .format(epoch + 1, avgc, dt,
                        len(c) / dt,
                        np.sum(cc) / dt))
        if hasattr(self, 'ST'):
            del (self.ST)
            del (self.STI)
Esempio n. 8
0
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):
        """Build the symbolic critic/actor losses and their update rules.

        Every argument is forwarded unchanged to the base-class constructor;
        ``model`` is additionally deep-copied to serve as the target network.
        The constructor then, in order:

        * creates empty actor experience buffers,
        * declares the extra symbolic inputs (``_NotFallen``, ``_tmp_diff``,
          ``_dyna_target``, ``_KL_Weight``) with test values and matching
          shared variables,
        * builds critic and actor outputs for the online and target models,
        * defines the TD target/loss, the advantage-weighted actor loss and
          the DYNA critic loss, together with their gradients, ``givens``
          dictionaries and Lasagne update rules,
        * calls ``A_CACLA.compile(self)``.

        Settings keys read here: ``float_type``,
        ``critic_regularization_weight``, ``critic_learning_rate``,
        ``use_previous_value_regularization``,
        ``previous_value_regularization_weight``, ``regularization_type``
        (optional), ``exploration_rate``, ``kl_divergence_threshold`` and
        ``optimizer`` (one of ``rmsprop``, ``momentum``, ``adam``,
        ``adagrad``).

        ``_batch_size``, ``_action_length``, ``_discount_factor``,
        ``_learning_rate``, ``_rho``, ``_rms_epsilon``,
        ``_regularization_weight`` and ``_model`` are read but never assigned
        here; presumably the base-class constructor sets them.

        An unrecognised ``optimizer`` prints a message and terminates the
        process with ``sys.exit(-1)``.
        """

        super(A_CACLA, self).__init__(model, n_in, n_out, state_bounds,
                                      action_bounds, reward_bound, settings_)

        settings = self.getSettings()
        float_type = settings['float_type']

        def zero_column(dtype):
            # A fresh (batch_size, 1) array per call so that test values and
            # shared variables never alias the same buffer.
            return np.zeros((self._batch_size, 1), dtype=np.dtype(dtype))

        # Experience buffers; they are only initialised (empty) here.
        self._actor_buffer_states = []
        self._actor_buffer_result_states = []
        self._actor_buffer_actions = []
        self._actor_buffer_rewards = []
        self._actor_buffer_falls = []
        self._actor_buffer_diff = []

        self._NotFallen = T.bcol("Not_Fallen")
        ## because float64 <= float32 * int32, need to use int16 or int8
        self._NotFallen.tag.test_value = zero_column('int8')
        self._NotFallen_shared = theano.shared(zero_column('int8'),
                                               broadcastable=(False, True))

        # Per-sample weight applied to the actor loss below.
        self._tmp_diff = T.col("Tmp_Diff")
        self._tmp_diff.tag.test_value = zero_column(float_type)
        self._tmp_diff_shared = theano.shared(zero_column(float_type),
                                              broadcastable=(False, True))

        # Regression target for the DYNA critic update.
        self._dyna_target = T.col("DYNA_Target")
        self._dyna_target.tag.test_value = zero_column(float_type)
        self._dyna_target_shared = theano.shared(zero_column(float_type),
                                                 broadcastable=(False, True))

        self._KL_Weight = T.scalar("KL_Weight")
        self._KL_Weight.tag.test_value = np.zeros(
            (1), dtype=np.dtype(float_type))[0]
        self._kl_weight_shared = theano.shared(
            np.ones((1), dtype=float_type)[0])
        self._kl_weight_shared.set_value(1.0)

        self._critic_regularization_weight = settings[
            "critic_regularization_weight"]
        self._critic_learning_rate = settings["critic_learning_rate"]

        ## Target network
        self._modelTarget = copy.deepcopy(model)

        state_var = self._model.getStateSymbolicVariable()
        result_state_var = self._model.getResultStateSymbolicVariable()
        reward_var = self._model.getRewardSymbolicVariable()
        action_var = self._model.getActionSymbolicVariable()

        critic = self._model.getCriticNetwork()
        critic_target = self._modelTarget.getCriticNetwork()
        actor = self._model.getActorNetwork()
        actor_target = self._modelTarget.getActorNetwork()

        # Critic outputs; the "_drop" variants are built with
        # deterministic=False.
        self._q_valsA = lasagne.layers.get_output(critic,
                                                  state_var,
                                                  deterministic=True)
        self._q_valsA_drop = lasagne.layers.get_output(critic,
                                                       state_var,
                                                       deterministic=False)
        self._q_valsNextState = lasagne.layers.get_output(
            critic, result_state_var, deterministic=True)
        self._q_valsTargetNextState = lasagne.layers.get_output(
            critic_target, result_state_var, deterministic=True)
        self._q_valsTarget = lasagne.layers.get_output(critic_target,
                                                       state_var,
                                                       deterministic=True)
        self._q_valsTarget_drop = lasagne.layers.get_output(
            critic_target, state_var, deterministic=False)

        # Actor outputs.
        self._q_valsActA = lasagne.layers.get_output(actor,
                                                     state_var,
                                                     deterministic=True)
        self._q_valsActTarget = lasagne.layers.get_output(actor_target,
                                                          state_var,
                                                          deterministic=True)
        self._q_valsActA_drop = lasagne.layers.get_output(
            actor, state_var, deterministic=False)

        self._q_func = self._q_valsA
        self._q_funcTarget = self._q_valsTarget
        self._q_func_drop = self._q_valsA_drop
        self._q_funcTarget_drop = self._q_valsTarget_drop
        self._q_funcAct = self._q_valsActA
        self._q_funcAct_drop = self._q_valsActA_drop

        # TD target: r + discount * V_target(s').
        # NOTE(review): _NotFallen is declared above but does not enter the
        # target, so terminal transitions are not masked here -- confirm
        # this is intended.
        self._target = reward_var + (self._discount_factor *
                                     self._q_valsTargetNextState)
        self._diff = self._target - self._q_func
        self._diff_drop = self._target - self._q_func_drop
        self._loss = T.mean(T.pow(self._diff, 2))
        self._loss_drop = T.mean(0.5 * self._diff_drop**2)

        self._params = lasagne.layers.helper.get_all_params(critic)
        self._actionParams = lasagne.layers.helper.get_all_params(actor)
        self._givens_ = {
            state_var: self._model.getStates(),
            result_state_var: self._model.getResultStates(),
            reward_var: self._model.getRewards(),
        }
        self._actGivens = {
            state_var: self._model.getStates(),
            action_var: self._model.getActions(),
            self._tmp_diff: self._tmp_diff_shared,
        }

        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                critic, lasagne.regularization.l2))
        self._actor_regularization = (
            self._regularization_weight *
            lasagne.regularization.regularize_network_params(
                actor, lasagne.regularization.l2))
        if settings['use_previous_value_regularization']:
            # Adds a penalty on the change between online and target actor.
            self._actor_regularization = self._actor_regularization + (
                settings['previous_value_regularization_weight'] *
                change_penalty(actor, actor_target))
        elif ('regularization_type' in settings
              and settings['regularization_type'] == 'KL_Divergence'):
            exploration_rate = settings['exploration_rate']
            self._kl_firstfixed = T.mean(
                kl(self._q_valsActTarget,
                   T.ones_like(self._q_valsActTarget) * exploration_rate,
                   self._q_valsActA,
                   T.ones_like(self._q_valsActA) * exploration_rate,
                   self._action_length))
            # NOTE(review): this REPLACES the L2 term rather than adding to
            # it, and scales the KL term by 'kl_divergence_threshold'.
            self._actor_regularization = (self._kl_firstfixed) * (
                settings['kl_divergence_threshold'])

            print("Using regularization type : ",
                  settings['regularization_type'])

        self._value_grad = T.grad(self._loss + self._critic_regularization,
                                  self._params)

        # One dispatch table for all three update rules (critic, actor,
        # DYNA).  The optimizer is validated once so that no later stage can
        # silently leave its updates attribute undefined.
        optimizer = settings['optimizer']
        update_rules = {
            'rmsprop':
            lambda grads, params, rate, beta2: lasagne.updates.rmsprop(
                grads, params, rate, self._rho, self._rms_epsilon),
            'momentum':
            lambda grads, params, rate, beta2: lasagne.updates.momentum(
                grads, params, rate, momentum=self._rho),
            'adam':
            lambda grads, params, rate, beta2: lasagne.updates.adam(
                grads,
                params,
                rate,
                beta1=0.9,
                beta2=beta2,
                epsilon=self._rms_epsilon),
            'adagrad':
            lambda grads, params, rate, beta2: lasagne.updates.adagrad(
                grads, params, rate, epsilon=self._rms_epsilon),
        }
        if optimizer not in update_rules:
            print("Unknown optimization method: ", optimizer)
            sys.exit(-1)
        build_updates = update_rules[optimizer]

        print("Optimizing Value Function with ", optimizer, " method")
        # NOTE(review): behaviour kept from the original -- with rmsprop the
        # critic steps with the actor learning rate (_learning_rate) while
        # every other optimizer uses _critic_learning_rate, and Adam uses
        # beta2=0.9 for the critic only.  Confirm both are deliberate.
        critic_rate = (self._learning_rate if optimizer == 'rmsprop' else
                       self._critic_learning_rate)
        self._updates_ = build_updates(self._value_grad, self._params,
                                       critic_rate, 0.9)

        # Actor loss: per-sample mean squared distance between the stored
        # action and the actor output, weighted by _tmp_diff.
        self._actDiff = (action_var - self._q_valsActA_drop)
        # keepdims=True keeps this a (batch, 1) column.  Without it the
        # (batch,) vector is broadcast against the (batch, 1) column
        # _tmp_diff into a (batch, batch) outer product, whose mean is
        # mean(sq_err) * mean(tmp_diff) and loses the per-sample weighting.
        per_sample_sq_err = T.mean(T.pow(self._actDiff, 2),
                                   axis=1,
                                   keepdims=True)
        self._actLoss_ = per_sample_sq_err * (
            self._tmp_diff * (1.0 / (1.0 - self._discount_factor)))
        self._actLoss = T.mean(self._actLoss_)
        self._policy_grad = T.grad(self._actLoss + self._actor_regularization,
                                   self._actionParams)
        self._actionUpdates = build_updates(self._policy_grad,
                                            self._actionParams,
                                            self._learning_rate, 0.999)

        self._givens_grad = {
            state_var: self._model.getStates(),
        }

        # DYNA critic update: regress V(result state) onto _dyna_target.
        self._diff_dyna = self._dyna_target - self._q_valsNextState
        self._loss_dyna = T.mean(T.pow(self._diff_dyna, 2))
        self._dyna_grad = T.grad(self._loss_dyna + self._critic_regularization,
                                 self._params)
        self._givens_dyna = {
            result_state_var: self._model.getResultStates(),
            self._dyna_target: self._dyna_target_shared,
        }
        # NOTE(review): the DYNA step updates critic parameters with
        # _learning_rate, not _critic_learning_rate (kept from the original).
        self._DYNAUpdates = build_updates(self._dyna_grad, self._params,
                                          self._learning_rate, 0.999)

        ## Bellman error
        self._bellman = self._target - self._q_funcTarget

        ### Give v(s') the next state and v(s) (target) the current state:
        ### the givens below deliberately swap the two state inputs.
        self._diff_adv = (self._discount_factor *
                          self._q_func) - (self._q_valsTargetNextState)
        self._diff_adv_givens = {
            state_var: self._model.getResultStates(),
            result_state_var: self._model.getStates(),
        }

        A_CACLA.compile(self)
Esempio n. 9
0
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_):

        super(PPOCritic,self).__init__(model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_)
        
        # create a small convolutional neural network
        
        self._Fallen = T.bcol("Fallen")
        ## because float64 <= float32 * int32, need to use int16 or int8
        self._Fallen.tag.test_value = np.zeros((self._batch_size,1),dtype=np.dtype('int8'))
        
        self._fallen_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype='int8'),
            broadcastable=(False, True))
        
        self._advantage = T.col("Advantage")
        self._advantage.tag.test_value = np.zeros((self._batch_size,1),dtype=np.dtype(self.getSettings()['float_type']))
        
        self._advantage_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype=self.getSettings()['float_type']),
            broadcastable=(False, True))
        
        self._dyna_target = T.col("DYNA_Target")
        self._dyna_target.tag.test_value = np.zeros((self._batch_size,1),dtype=np.dtype(self.getSettings()['float_type']))
        
        self._dyna_target_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype=self.getSettings()['float_type']),
            broadcastable=(False, True))
        
        self._KL_Weight = T.scalar("KL_Weight")
        self._KL_Weight.tag.test_value = np.zeros((1),dtype=np.dtype(self.getSettings()['float_type']))[0]
        
        self._kl_weight_shared = theano.shared(
            np.ones((1), dtype=self.getSettings()['float_type'])[0])
        self._kl_weight_shared.set_value(self.getSettings()['previous_value_regularization_weight'])
        
        """
        self._target_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype='float64'),
            broadcastable=(False, True))
        """
        self._critic_regularization_weight = self.getSettings()["critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
        # primary network
        self._model = model
        # Target network
        self._modelTarget = copy.deepcopy(model)
        
        self._q_valsA = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)
        self._q_valsA_drop = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)
        self._q_valsNextState = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True)
        self._q_valsTargetNextState = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True)
        self._q_valsTarget = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)
        self._q_valsTarget_drop = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)
        
        self._q_valsActA = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)[:,:self._action_length]
        self._q_valsActASTD = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)[:,self._action_length:]
        
        ## prevent value from being 0
        self._q_valsActASTD = (self._q_valsActASTD * self.getSettings()['exploration_rate']) + 1e-1
        self._q_valsActTarget = lasagne.layers.get_output(self._modelTarget.getActorNetwork(), self._model.getStateSymbolicVariable())[:,:self._action_length]
        self._q_valsActTargetSTD = lasagne.layers.get_output(self._modelTarget.getActorNetwork(), self._model.getStateSymbolicVariable())[:,self._action_length:]
        self._q_valsActTargetSTD = (self._q_valsActTargetSTD  * self.getSettings()['exploration_rate']) + 1e-1
        self._q_valsActA_drop = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)
        
        self._q_func = self._q_valsA
        self._q_funcTarget = self._q_valsTarget
        self._q_func_drop = self._q_valsA_drop
        self._q_funcTarget_drop = self._q_valsTarget_drop
        self._q_funcAct = self._q_valsActA
        self._q_funcAct_drop = self._q_valsActA_drop
        
        # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._Fallen
        self._target = T.mul(T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsTargetNextState )), self._Fallen)
        self._diff = self._target - self._q_func
        self._diff_drop = self._target - self._q_func_drop 
        # loss = 0.5 * self._diff ** 2 
        loss = T.pow(self._diff, 2)
        self._loss = T.mean(loss)
        self._loss_drop = T.mean(0.5 * self._diff_drop ** 2)
        
        self._params = lasagne.layers.helper.get_all_params(self._model.getCriticNetwork())
        self._actionParams = lasagne.layers.helper.get_all_params(self._model.getActorNetwork())
        self._givens_ = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            self._Fallen: self._fallen_shared
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }
        self._actGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            self._model.getActionSymbolicVariable(): self._model.getActions(),
            # self._Fallen: self._fallen_shared,
            self._advantage: self._advantage_shared,
            # self._KL_Weight: self._kl_weight_shared
        }
        
        self._critic_regularization = (self._critic_regularization_weight * lasagne.regularization.regularize_network_params(
        self._model.getCriticNetwork(), lasagne.regularization.l2))
        self._actor_regularization = (self._regularization_weight * lasagne.regularization.regularize_network_params(
                self._model.getActorNetwork(), lasagne.regularization.l2))
        self._kl_firstfixed = T.mean(kl(self._q_valsActTarget, self._q_valsActTargetSTD, self._q_valsActA, self._q_valsActASTD, self._action_length))
        # self._actor_regularization = (( self.getSettings()['previous_value_regularization_weight']) * self._kl_firstfixed )
        # self._actor_regularization = (( self._KL_Weight ) * self._kl_firstfixed ) + (10*(self._kl_firstfixed>self.getSettings()['kl_divergence_threshold'])*
        #                                                                              T.square(self._kl_firstfixed-self.getSettings()['kl_divergence_threshold']))
        self._actor_entropy = 0.5 * T.mean(T.log(2 * np.pi * self._q_valsActASTD ) + 1 )
        # SGD update
        # self._updates_ = lasagne.updates.rmsprop(self._loss + (self._regularization_weight * lasagne.regularization.regularize_network_params(
        # self._model.getCriticNetwork(), lasagne.regularization.l2)), self._params, self._learning_rate, self._rho,
        #                                    self._rms_epsilon)
        # TD update
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._updates_ = lasagne.updates.rmsprop(T.mean(self._q_func) + self._critic_regularization, self._params, 
                        self._critic_learning_rate * -T.mean(self._diff), self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._updates_ = lasagne.updates.momentum(T.mean(self._q_func) + self._critic_regularization, self._params, 
                        self._critic_learning_rate * -T.mean(self._diff), momentum=self._rho)
        elif ( self.getSettings()['optimizer'] == 'adam'):
            self._updates_ = lasagne.updates.adam(T.mean(self._q_func) + self._critic_regularization, self._params, 
                        self._critic_learning_rate * -T.mean(self._diff), beta1=0.9, beta2=0.999, epsilon=1e-08)
        else:
            print ("Unknown optimization method: ", self.getSettings()['optimizer'])
            sys.exit(-1)
        ## Need to perform an element wise operation or replicate _diff for this to work properly.
        # self._actDiff = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._model.getActionSymbolicVariable() - self._q_valsActA), 
        #                                                                    theano.tensor.tile((self._advantage * (1.0/(1.0-self._discount_factor))), self._action_length)) # Target network does not work well here?
        
        ## advantage = Q(a,s) - V(s) = (r + gamma*V(s')) - V(s) 
        # self._advantage = (((self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState)) * self._Fallen)) - self._q_func
        
        # self._Advantage = self._diff # * (1.0/(1.0-self._discount_factor)) ## scale back to same as rewards
        self._Advantage = self._advantage * (1.0/(1.0-self._discount_factor)) ## scale back to same as rewards
        # self._log_prob = loglikelihood(self._model.getActionSymbolicVariable(), self._q_valsActA, self._q_valsActASTD, self._action_length)
        # self._log_prob_target = loglikelihood(self._model.getActionSymbolicVariable(), self._q_valsActTarget, self._q_valsActTargetSTD, self._action_length)
        self._prob = likelihood(self._model.getActionSymbolicVariable(), self._q_valsActA, self._q_valsActASTD, self._action_length)
        self._prob_target = likelihood(self._model.getActionSymbolicVariable(), self._q_valsActTarget, self._q_valsActTargetSTD, self._action_length)
        # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target).dot(self._Advantage)) )
        # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target) * (self._Advantage)) )
        # self._actLoss_ = ( ((self._log_prob) * self._Advantage) )
        # self._actLoss_ = ( ((self._log_prob)) )
        ## This does the sum already
        # self._actLoss_ =  ( (self._log_prob).dot( self._Advantage) )
        self._r = (self._prob / self._prob_target)
        self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._r), self._Advantage)
        ppo_epsilon = self.getSettings()['kl_divergence_threshold']
        self._actLoss_2 = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((theano.tensor.clip(self._r, 1.0 - ppo_epsilon, 1+ppo_epsilon), self._Advantage))
        self._actLoss_ = theano.tensor.minimum((self._actLoss_), (self._actLoss_2))
        # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(T.exp(self._log_prob - self._log_prob_target), self._Advantage)
        
        # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._log_prob), self._Advantage)
        # self._actLoss_ = T.mean(self._log_prob) 
        # self._policy_entropy = 0.5 * T.mean(T.log(2 * np.pi * self._q_valsActASTD ) + 1 )
        ## - because update computes gradient DESCENT updates
        # self._actLoss = -1.0 * ((T.mean(self._actLoss_)) + (self._actor_regularization ))
        # self._entropy = -1. * T.sum(T.log(self._q_valsActA + 1e-8) * self._q_valsActA, axis=1, keepdims=True)
        ## - because update computes gradient DESCENT updates
        self._actLoss = (-1.0 * (T.mean(self._actLoss_) + (1e-2 * self._actor_entropy))) + self._actor_regularization
        # self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop ** 2)/float(self._batch_size)) # because the number of rows can shrink
        # self._actLoss_drop = (T.mean(0.5 * self._actDiff_drop ** 2))
        self._policy_grad = T.grad(self._actLoss ,  self._actionParams)
        self._policy_grad = lasagne.updates.total_norm_constraint(self._policy_grad, 5)
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._actionUpdates = lasagne.updates.rmsprop(self._policy_grad, self._actionParams, 
                    self._learning_rate , self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._actionUpdates = lasagne.updates.momentum(self._policy_grad, self._actionParams, 
                    self._learning_rate , momentum=self._rho)
        elif ( self.getSettings()['optimizer'] == 'adam'):
            self._actionUpdates = lasagne.updates.adam(self._policy_grad, self._actionParams, 
                    self._learning_rate , beta1=0.9, beta2=0.999, epsilon=1e-08)
        else:
            print ("Unknown optimization method: ", self.getSettings()['optimizer'])
            
            
        
        # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) + 
        #   (self._regularization_weight * lasagne.regularization.regularize_network_params(
        #       self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams, 
        #           self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon)
        self._givens_grad = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }
        
        ### _q_valsA because the predicted state is stored in self._model.getStateSymbolicVariable()
        self._diff_dyna = self._dyna_target - self._q_valsNextState
        # loss = 0.5 * self._diff ** 2 
        loss = T.pow(self._diff_dyna, 2)
        self._loss_dyna = T.mean(loss)
        
        self._dyna_grad = T.grad(self._loss_dyna + self._critic_regularization ,  self._params)
        
        self._givens_dyna = {
            # self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._Fallen: self._fallen_shared
            # self._model.getActionSymbolicVariable(): self._actions_shared,
            self._dyna_target: self._dyna_target_shared
        }
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._DYNAUpdates = lasagne.updates.rmsprop(self._dyna_grad, self._params, 
                    self._learning_rate , self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._DYNAUpdates = lasagne.updates.momentum(self._dyna_grad, self._params, 
                    self._learning_rate , momentum=self._rho)
        elif ( self.getSettings()['optimizer'] == 'adam'):
            self._DYNAUpdates = lasagne.updates.adam(self._dyna_grad, self._params, 
                    self._learning_rate , beta1=0.9, beta2=0.999, epsilon=self._rms_epsilon)
        elif ( self.getSettings()['optimizer'] == 'adagrad'):
            self._DYNAUpdates = lasagne.updates.adagrad(self._dyna_grad, self._params, 
                    self._learning_rate, epsilon=self._rms_epsilon)
        else:
            print ("Unknown optimization method: ", self.getSettings()['optimizer'])
        
        ## Bellman error
        self._bellman = self._target - self._q_funcTarget
        
        PPOCritic.compile(self)
Esempio n. 10
0
    def __init__(self, n_actions, replay_memory, initial_weights_file=None):
        """Configure the agent and compile its Theano forward/train/loss functions.

        Two networks are created with ``build_cnn``: ``self.network`` reads the
        ``s0`` input and ``self.network_stale`` reads the ``s1`` input.  Both
        inputs are cast to float32 and divided by 256 before entering the
        network.  After the optional weight load, ``_update_network_stale`` is
        called once, and the loss returned by ``build_loss`` is optimised with
        Lasagne's RMSProp over the trainable parameters of ``self.network``.

        Args:
            n_actions: Passed to ``build_cnn`` and used as the size of the
                identity matrix stored in ``self.a_lookup``.
            replay_memory: Stored on ``self.replay_memory``; not otherwise
                touched in this method.
            initial_weights_file: Optional path handed to ``np.load``.  When
                given, arrays ``arr_0 .. arr_{k-1}`` are loaded into
                ``self.network``.
        """
        # ---- run-time state and counters -----------------------------------
        self.mood_q = None
        self.last_q = 0
        self.n_parameter_updates = 0
        self.ignore_feedback = False
        self.i_frames = 0
        self.state = None
        self.replay_memory = replay_memory

        # ---- action space --------------------------------------------------
        self.n_actions = n_actions
        # Row i of the identity matrix is the one-hot encoding of action i.
        self.a_lookup = np.eye(self.n_actions, dtype=np.int8)

        # ---- hyper-parameters ----------------------------------------------
        # NOTE(review): ``alpha`` is stored but the RMSProp call below uses its
        # own hard-coded learning rate.
        self.alpha = 0.00025
        self.gamma = 0.99
        self.minibatch_size = 32
        self.initial_epsilon = 1
        self.final_epsilon = 0.1
        self.epsilon = self.initial_epsilon
        self.final_exploration_frame = 1000000
        self.replay_start_size = 50000
        self.target_network_update_frequency = 10000

        # ---- logging / checkpointing ---------------------------------------
        self.log_frequency = 50
        self.save_every_n_frames = 100000  # ~ once per hour

        # ---- symbolic inputs -----------------------------------------------
        s0_var = T.tensor4("s0", dtype=theano.config.floatX)
        a0_var = T.bmatrix("a0")
        r0_var = T.wcol("r0")
        s1_var = T.tensor4("s1", dtype=theano.config.floatX)
        future_reward_indicator_var = T.bcol("future_reward_indicator")

        # ---- online network (fed from s0) ----------------------------------
        online_input = T.cast(s0_var, 'float32') / np.float32(256)
        self.network = build_cnn(n_actions=self.n_actions, input_var=online_input)
        print("Compiling forward.")
        online_q_eval = lasagne.layers.get_output(self.network, deterministic=True)
        self.forward = theano.function([s0_var], online_q_eval)

        # ---- stale network (fed from s1) -----------------------------------
        stale_input = T.cast(s1_var, 'float32') / np.float32(256)
        self.network_stale = build_cnn(n_actions=self.n_actions, input_var=stale_input)
        print("Compiling forward stale.")
        stale_q_eval = lasagne.layers.get_output(self.network_stale, deterministic=True)
        self.forward_stale = theano.function([s1_var], stale_q_eval)

        # ---- optional warm start -------------------------------------------
        if initial_weights_file is not None:
            with np.load(initial_weights_file) as initial_weights:
                param_values = []
                for idx in range(len(initial_weights.files)):
                    param_values.append(initial_weights['arr_%d' % idx])
                lasagne.layers.set_all_param_values(self.network, param_values)

        self._update_network_stale()

        # ---- loss and parameter updates ------------------------------------
        q_online = lasagne.layers.get_output(self.network)
        q_stale = lasagne.layers.get_output(self.network_stale)
        loss_parts = build_loss(out=q_online,
                                out_stale=q_stale,
                                a0_var=a0_var,
                                r0_var=r0_var,
                                future_reward_indicator_var=future_reward_indicator_var,
                                gamma=self.gamma)
        self.loss, self.err, target_y, predicted_q = loss_parts

        trainable_params = lasagne.layers.get_all_params(self.network, trainable=True)
        # TODO RMSPROP in the paper has slightly different definition (see Lua)
        rmsprop_updates = lasagne.updates.rmsprop(self.loss,
                                                  trainable_params,
                                                  learning_rate=0.0002,
                                                  rho=0.95,
                                                  epsilon=1e-6)

        # ---- compiled functions --------------------------------------------
        batch_inputs = (s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var)

        print("Compiling train_fn.")
        train_outputs = [
            self.loss,
            self.err,
            T.transpose(target_y),
            T.transpose(predicted_q),
            q_online,
            q_stale,
        ]
        self.train_fn = theano.function(list(batch_inputs),
                                        train_outputs,
                                        updates=rmsprop_updates)

        print("Compiling loss_fn.")
        self.loss_fn = theano.function(list(batch_inputs), self.loss)
Esempio n. 11
0
File: A3C2.py Progetto: skylbc/SMBAE
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_):
        """Build the symbolic critic/actor losses and optimizer updates, then compile.

        The method
          * forwards all arguments unchanged to the parent constructor,
          * creates the ``Fallen`` and advantage symbolic columns with matching
            shared buffers,
          * keeps ``model`` as the primary network and a deep copy of it as the
            target network,
          * builds the critic target ``(reward + discount * V_target(s')) * Fallen``
            and the squared-error losses derived from it,
          * builds an actor loss from ``exp(log_prob - log_prob_target) * diff``
            plus a small entropy term,
          * selects rmsprop / momentum / adam from ``settings['optimizer']``,
          * and finally calls ``A3C2.compile(self)``.

        Attributes such as ``_batch_size``, ``_action_length``,
        ``_discount_factor``, ``_learning_rate``, ``_regularization_weight``,
        ``_rho`` and ``_rms_epsilon`` are read but not assigned here;
        presumably the parent constructor sets them -- TODO confirm.
        """

        super(A3C2,self).__init__(model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_)
        
        # Symbolic inputs and their shared-variable backing stores.
        
        # Byte column that is multiplied into the TD target further down.
        # NOTE(review): presumably a terminal/"fallen" mask -- confirm against
        # the code that fills `_fallen_shared`.
        self._Fallen = T.bcol("Fallen")
        ## because float64 <= float32 * int32, need to use int16 or int8
        self._Fallen.tag.test_value = np.zeros((self._batch_size,1),dtype=np.dtype('int8'))
        
        self._fallen_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype='int8'),
            broadcastable=(False, True))
        
        # Advantage column and its shared buffer.  Neither is referenced by the
        # losses or givens built in this method (the `_actGivens` entry for it
        # is commented out below).
        self._advantage = T.col("Tmp_Diff")
        self._advantage.tag.test_value = np.zeros((self._batch_size,1),dtype=np.dtype(self.getSettings()['float_type']))
        
        self._advantage_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype=self.getSettings()['float_type']),
            broadcastable=(False, True))
        
        # The string literal below is disabled code kept as a no-op expression.
        """
        self._target_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype='float64'),
            broadcastable=(False, True))
        """
        self._critic_regularization_weight = self.getSettings()["critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
        # primary network
        self._model = model
        # Target network: an independent deep copy of the primary model
        self._modelTarget = copy.deepcopy(model)
        
        # Critic outputs.  The `_drop` variants are built with
        # deterministic=False (Lasagne keeps stochastic layers such as dropout
        # active in that mode).
        self._q_valsA = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)
        self._q_valsA_drop = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)
        # Target-critic outputs for the result state and for the current state.
        self._q_valsTargetNextState = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getResultStateSymbolicVariable())
        self._q_valsTarget = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable())
        self._q_valsTarget_drop = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)
        
        # Actor output is split by column: the first `_action_length` columns
        # and the remaining columns.  The names suggest action mean and
        # standard deviation -- TODO confirm against `loglikelihood`.
        self._q_valsActA = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)[:,:self._action_length]
        self._q_valsActASTD = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)[:,self._action_length:]
        
        ## prevent value from being 0
        self._q_valsActASTD = self._q_valsActASTD + 1e-3
        # Same split and offset for the target actor.
        self._q_valsActTarget = lasagne.layers.get_output(self._modelTarget.getActorNetwork(), self._model.getStateSymbolicVariable())[:,:self._action_length]
        self._q_valsActTargetSTD = lasagne.layers.get_output(self._modelTarget.getActorNetwork(), self._model.getStateSymbolicVariable())[:,self._action_length:]
        self._q_valsActTargetSTD = self._q_valsActTargetSTD + 1e-3
        # Full (unsliced) actor output with deterministic=False.
        self._q_valsActA_drop = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)
        
        # Aliases for the expressions above.
        self._q_func = self._q_valsA
        self._q_funcTarget = self._q_valsTarget
        self._q_func_drop = self._q_valsA_drop
        self._q_funcTarget_drop = self._q_valsTarget_drop
        self._q_funcAct = self._q_valsActA
        self._q_funcAct_drop = self._q_valsActA_drop
        
        # NOTE(review): `N` is not used anywhere else in this method.
        N = self._model.getStateSymbolicVariable().shape[0]
        
        # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._Fallen
        # TD target: (reward + discount * target critic on result state) * Fallen
        self._target = T.mul(T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsTargetNextState )), self._Fallen)
        # TD error against the deterministic critic output
        self._diff = self._target - self._q_func
        # self._Advantage = self._diff
        self._diff_drop = self._target - self._q_func_drop 
        # loss = 0.5 * self._diff ** 2 
        # `_loss` is the plain mean squared TD error; `_loss_drop` keeps the 0.5 factor.
        loss = T.pow(self._diff, 2)
        self._loss = T.mean(loss)
        self._loss_drop = T.mean(0.5 * self._diff_drop ** 2)
        
        # Parameter lists the critic and actor optimizers update.
        self._params = lasagne.layers.helper.get_all_params(self._model.getCriticNetwork())
        self._actionParams = lasagne.layers.helper.get_all_params(self._model.getActorNetwork())
        # Substitution map: symbolic inputs -> the model's stored states/rewards
        # and the shared Fallen buffer.  Not consumed in this method; presumably
        # used by `compile` -- TODO confirm.
        self._givens_ = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            self._Fallen: self._fallen_shared
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }
        # Same map extended with the action input.
        self._actGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            self._model.getActionSymbolicVariable(): self._model.getActions(),
            self._Fallen: self._fallen_shared
            # self._advantage: self._advantage_shared
        }
        
        # L2 penalties on the critic and actor network parameters.
        # NOTE(review): `_actor_regularization` is computed here but is not added
        # to `_actLoss` anywhere in this method.
        self._critic_regularization = (self._critic_regularization_weight * lasagne.regularization.regularize_network_params(
        self._model.getCriticNetwork(), lasagne.regularization.l2))
        self._actor_regularization = (self._regularization_weight * lasagne.regularization.regularize_network_params(
                self._model.getActorNetwork(), lasagne.regularization.l2))
        # SGD update
        # self._updates_ = lasagne.updates.rmsprop(self._loss + (self._regularization_weight * lasagne.regularization.regularize_network_params(
        # self._model.getCriticNetwork(), lasagne.regularization.l2)), self._params, self._learning_rate, self._rho,
        #                                    self._rms_epsilon)
        # TD update
        # The optimizer receives mean(Q) + L2 as its objective and
        # critic_learning_rate * -mean(diff) as its step size, instead of
        # minimising `_loss` directly.
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._updates_ = lasagne.updates.rmsprop(T.mean(self._q_func) + self._critic_regularization, self._params, 
                        self._critic_learning_rate * -T.mean(self._diff), self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._updates_ = lasagne.updates.momentum(T.mean(self._q_func) + self._critic_regularization, self._params, 
                        self._critic_learning_rate * -T.mean(self._diff), momentum=self._rho)
        elif ( self.getSettings()['optimizer'] == 'adam'):
            self._updates_ = lasagne.updates.adam(T.mean(self._q_func) + self._critic_regularization, self._params, 
                        self._critic_learning_rate * -T.mean(self._diff), beta1=0.9, beta2=0.999, epsilon=1e-08)
        else:
            # Unsupported optimizer name: report it and terminate the process.
            print ("Unknown optimization method: ", self.getSettings()['optimizer'])
            sys.exit(-1)
        ## Need to perform an element wise operation or replicate _diff for this to work properly.
        # self._actDiff = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._model.getActionSymbolicVariable() - self._q_valsActA), 
        #                                                                theano.tensor.tile((self._advantage * (1.0/(1.0-self._discount_factor))), self._action_length)) # Target network does not work well here?
        # self._actLoss
        # self._actDiff = (self._model.getActionSymbolicVariable() - self._q_valsActA)
        # self._actDiff = ((self._model.getActionSymbolicVariable() - self._q_valsActA)) # Target network does not work well here?
        # self._actDiff_drop = ((self._model.getActionSymbolicVariable() - self._q_valsActA_drop)) # Target network does not work well here?
        ## This should be a single column vector
        # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(( (T.mean(T.pow(self._actDiff, 2),axis=1) )), (self._diff * (1.0/(1.0-self._discount_factor))))
        # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(( T.reshape(T.sum(T.pow(self._actDiff, 2),axis=1), (self._batch_size, 1) )), 
        #                                                                        (self._advantage * (1.0/(1.0-self._discount_factor)))
        # self._actLoss_ = (T.mean(T.pow(self._actDiff, 2),axis=1))
        # self._Advantage = theano.tensor.tile(theano.gradient.disconnected_grad(self._diff), self._action_length)
        # self._Advantage = theano.gradient.disconnected_grad(self._diff)
        # self._Advantage = theano.tensor.clip(self._diff * (1.0/(1.0-self._discount_factor)), 0, 100000.0) ## scale back to same as rewards
        # The actor's advantage is the raw TD error (the rescaling is disabled).
        self._Advantage = self._diff # * (1.0/(1.0-self._discount_factor)) ## scale back to same as rewards
        # Log-likelihood of the action input under the current and the target actor.
        self._log_prob = loglikelihood(self._model.getActionSymbolicVariable(), self._q_valsActA, self._q_valsActASTD, self._action_length)
        self._log_prob_target = loglikelihood(self._model.getActionSymbolicVariable(), self._q_valsActTarget, self._q_valsActTargetSTD, self._action_length)
        # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target).dot(self._Advantage)) )
        # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target) * (self._Advantage)) )
        # self._actLoss_ = ( ((self._log_prob) * self._Advantage) )
        # self._actLoss_ = ( ((self._log_prob)) )
        ## This does the sum already
        # self._actLoss_ =  ( (self._log_prob).dot( self._Advantage) )
        # Element-wise product of the likelihood ratio exp(log_prob - log_prob_target)
        # with the advantage.
        self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(T.exp(self._log_prob - self._log_prob_target), self._Advantage)
        # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._log_prob), self._Advantage)
        # self._actLoss_ = T.mean(self._log_prob) 
        # Entropy-style term computed from the offset actor std output.
        self._policy_entropy = 0.5 * T.mean(T.log(2 * np.pi * self._q_valsActASTD ) + 1 )
        ## - because update computes gradient DESCENT updates
        # Negated so that descent maximises mean(ratio * advantage) + 1e-2 * entropy.
        self._actLoss = -1.0 * ((T.mean(self._actLoss_)) + (self._policy_entropy * 1e-2))
        # self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop ** 2)/float(self._batch_size)) # because the number of rows can shrink
        # self._actLoss_drop = (T.mean(0.5 * self._actDiff_drop ** 2))
        # self._policy_grad = T.grad(self._actLoss ,  self._actionParams)
        # self._policy_grad = self._actLoss
        # self._policy_grad = lasagne.updates.total_norm_constraint(self._policy_grad, 5)
        # steps, self._actionUpdates = get_adam_steps_and_updates(self._policy_grad, self._actionParams, self._learning_rate)
        # self._actionUpdates = adam_updates(self._actLoss, self._actionParams, self._learning_rate)
        
        # Actor updates: the selected optimizer minimises `_actLoss` over `_actionParams`.
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._actionUpdates = lasagne.updates.rmsprop(self._actLoss , self._actionParams, 
                    self._learning_rate , self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._actionUpdates = lasagne.updates.momentum(self._actLoss , self._actionParams, 
                    self._learning_rate , momentum=self._rho)
        elif ( self.getSettings()['optimizer'] == 'adam'):
            self._actionUpdates = lasagne.updates.adam(self._actLoss , self._actionParams, 
                    self._learning_rate , beta1=0.9, beta2=0.999, epsilon=1e-08)
        else:
            # NOTE(review): unlike the critic branch this only prints and leaves
            # `_actionUpdates` unset; the critic branch above already exits on
            # the same condition.
            print ("Unknown optimization method: ", self.getSettings()['optimizer'])
            
        
        # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) + 
        #   (self._regularization_weight * lasagne.regularization.regularize_network_params(
        #       self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams, 
        #           self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon)
        # Substitution map containing only the state input.
        self._givens_grad = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }
        
        ## Bellman error
        # Measured against the target critic rather than the primary critic.
        self._bellman = self._target - self._q_funcTarget
        # Explicit class-qualified call, so a subclass override of `compile` is
        # bypassed here.
        A3C2.compile(self)
Esempio n. 12
0
    def __init__(self,
                 n_actions,
                 replay_memory,
                 build_network,
                 updates,
                 screen_size,
                 initial_weights_file=None):
        """Configure the agent and compile its Theano forward/train/loss functions.

        ``build_network`` is called twice: once on the ``s0`` input to create
        ``self.network`` and once on the ``s1`` input to create
        ``self.network_stale``.  Both inputs are cast to float32 and divided
        by 256 first.  ``_update_network_stale`` is then called once, and the
        loss from ``build_loss`` is trained with whatever update dictionary
        ``updates(loss, params)`` returns.

        Args:
            n_actions: Passed to ``build_network`` and used as the size of the
                identity matrix stored in ``self.a_lookup``.
            replay_memory: Stored on ``self.replay_memory``.
            build_network: Callable accepting ``n_actions``, ``input_var`` and
                ``screen_size`` keyword arguments and returning a Lasagne
                network.
            updates: Callable ``(loss, params) -> updates`` given to
                ``theano.function``.
            screen_size: Two-element iterable unpacked as ``(width, height)``;
                it is handed to ``build_network`` as ``(height, width)``.
            initial_weights_file: Accepted but not read anywhere in this
                method.
        """
        width, height = screen_size
        self.screen_width = width
        self.screen_height = height

        # ---- run-time state and counters -----------------------------------
        self.mood_q = None
        self.last_q = 0
        self.n_parameter_updates = 0
        self.i_action = 0
        self.state = None
        self.replay_memory = replay_memory

        # ---- action space --------------------------------------------------
        self.n_actions = n_actions
        # Row i of the identity matrix is the one-hot encoding of action i.
        self.a_lookup = np.eye(self.n_actions, dtype=np.int8)

        # ---- hyper-parameters ----------------------------------------------
        # NOTE(review): ``alpha`` is stored but not used in this method; the
        # optimizer settings come from the ``updates`` callable.
        self.alpha = 0.00025
        self.gamma = 0.99
        self.minibatch_size = 32
        self.initial_epsilon = 1
        self.final_epsilon = 0.1
        self.epsilon = self.initial_epsilon
        self.final_exploration_frame = 1000000
        self.replay_start_size = 50000
        self.target_network_update_frequency = 10000

        # ---- logging / checkpointing ---------------------------------------
        self.log_frequency = 1
        self.save_every_n_frames = 100000  # ~ once per hour

        # ---- symbolic inputs -----------------------------------------------
        s0_var = T.tensor4("s0", dtype=theano.config.floatX)
        a0_var = T.bmatrix("a0")
        r0_var = T.wcol("r0")
        s1_var = T.tensor4("s1", dtype=theano.config.floatX)
        future_reward_indicator_var = T.bcol("future_reward_indicator")

        # ---- online network (fed from s0) ----------------------------------
        online_input = T.cast(s0_var, 'float32') / np.float32(256)
        self.network = build_network(n_actions=self.n_actions,
                                     input_var=online_input,
                                     screen_size=(self.screen_height,
                                                  self.screen_width))
        print("Compiling forward.")
        online_q_eval = lasagne.layers.get_output(self.network,
                                                  deterministic=True)
        self.forward = theano.function([s0_var], online_q_eval)

        # ---- stale network (fed from s1) -----------------------------------
        stale_input = T.cast(s1_var, 'float32') / np.float32(256)
        self.network_stale = build_network(n_actions=self.n_actions,
                                           input_var=stale_input,
                                           screen_size=(self.screen_height,
                                                        self.screen_width))
        print("Compiling forward_stale.")
        stale_q_eval = lasagne.layers.get_output(self.network_stale,
                                                 deterministic=True)
        self.forward_stale = theano.function([s1_var], stale_q_eval)

        self._update_network_stale()

        # ---- loss ----------------------------------------------------------
        q_online = lasagne.layers.get_output(self.network)
        q_stale = lasagne.layers.get_output(self.network_stale)
        loss_parts = build_loss(
            out=q_online,
            out_stale=q_stale,
            a0_var=a0_var,
            r0_var=r0_var,
            future_reward_indicator_var=future_reward_indicator_var,
            gamma=self.gamma)
        self.loss, self.err, target_y, predicted_q = loss_parts

        trainable_params = lasagne.layers.get_all_params(self.network,
                                                         trainable=True)

        # ---- compiled functions --------------------------------------------
        batch_inputs = (s0_var, a0_var, r0_var, s1_var,
                        future_reward_indicator_var)

        print("Compiling train_fn.")
        train_outputs = [
            self.loss,
            self.err,
            T.transpose(target_y),
            T.transpose(predicted_q),
            q_online,
            q_stale,
        ]
        update_rules = updates(self.loss, trainable_params)
        self.train_fn = theano.function(list(batch_inputs),
                                        train_outputs,
                                        updates=update_rules)

        print("Compiling loss_fn.")
        self.loss_fn = theano.function(list(batch_inputs), self.loss)
Esempio n. 13
0
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):

        super(TRPOCritic,
              self).__init__(model, n_in, n_out, state_bounds, action_bounds,
                             reward_bound, settings_)

        # create a small convolutional neural network

        self._Fallen = T.bcol("Fallen")
        ## because float64 <= float32 * int32, need to use int16 or int8
        self._Fallen.tag.test_value = np.zeros((self._batch_size, 1),
                                               dtype=np.dtype('int8'))

        self._fallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                     dtype='int8'),
                                            broadcastable=(False, True))

        self._advantage = T.col("Advantage")
        self._advantage.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._advantage_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                               broadcastable=(False, True))

        self._KL_Weight = T.scalar("KL_Weight")
        self._KL_Weight.tag.test_value = np.zeros(
            (1), dtype=np.dtype(self.getSettings()['float_type']))[0]

        self._kl_weight_shared = theano.shared(
            np.ones((1), dtype=self.getSettings()['float_type'])[0])
        self._kl_weight_shared.set_value(
            self.getSettings()['previous_value_regularization_weight'])
        """
        self._target_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype='float64'),
            broadcastable=(False, True))
        """
        self._critic_regularization_weight = self.getSettings(
        )["critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
        # primary network
        self._model = model
        # Target network
        self._modelTarget = copy.deepcopy(model)

        self._q_valsA = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsA_drop = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)
        self._q_valsTargetNextState = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTarget = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsTarget_drop = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)

        self._q_valsActA = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)[:, :self._action_length]
        self._q_valsActASTD = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)[:, self._action_length:]

        ## prevent value from being 0
        self._q_valsActASTD = (self._q_valsActASTD *
                               self.getSettings()['exploration_rate']) + 1e-3
        self._q_valsActTarget_ = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getStateSymbolicVariable())
        self._q_valsActTarget = self._q_valsActTarget_[:, :self._action_length]
        self._q_valsActTargetSTD = self._q_valsActTarget_[:,
                                                          self._action_length:]
        self._q_valsActTargetSTD = (
            self._q_valsActTargetSTD *
            self.getSettings()['exploration_rate']) + 1e-3
        self._q_valsActA_drop = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)

        self._q_func = self._q_valsA
        self._q_funcTarget = self._q_valsTarget
        self._q_func_drop = self._q_valsA_drop
        self._q_funcTarget_drop = self._q_valsTarget_drop
        self._q_funcAct = self._q_valsActA
        self._q_funcAct_drop = self._q_valsActA_drop

        # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._Fallen
        self._target = T.mul(
            T.add(self._model.getRewardSymbolicVariable(),
                  T.mul(self._discount_factor, self._q_valsTargetNextState)),
            self._Fallen)
        self._diff = self._target - self._q_func
        self._diff_drop = self._target - self._q_func_drop
        # loss = 0.5 * self._diff ** 2
        loss = T.pow(self._diff, 2)
        self._loss = T.mean(loss)
        self._loss_drop = T.mean(0.5 * self._diff_drop**2)

        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getActorNetwork())
        self._givens_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards(),
            self._Fallen:
            self._fallen_shared
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }
        self._actGivens = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            self._Fallen:
            self._fallen_shared,
            # self._advantage: self._advantage_shared,
            # self._KL_Weight: self._kl_weight_shared
        }

        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))
        # self._actor_regularization = ( (self._regularization_weight * lasagne.regularization.regularize_network_params(
        #         self._model.getActorNetwork(), lasagne.regularization.l2)) )
        self._kl_firstfixed = kl(self._q_valsActTarget,
                                 self._q_valsActTargetSTD, self._q_valsActA,
                                 self._q_valsActASTD,
                                 self._action_length).mean()
        # self._actor_regularization = (( self.getSettings()['previous_value_regularization_weight']) * self._kl_firstfixed )
        self._actor_regularization = (
            (self._KL_Weight) * self._kl_firstfixed) + (
                (self._kl_firstfixed >
                 self.getSettings()['kl_divergence_threshold']) *
                T.square(self._kl_firstfixed -
                         self.getSettings()['kl_divergence_threshold']))

        # SGD update
        # self._updates_ = lasagne.updates.rmsprop(self._loss + (self._regularization_weight * lasagne.regularization.regularize_network_params(
        # self._model.getCriticNetwork(), lasagne.regularization.l2)), self._params, self._learning_rate, self._rho,
        #                                    self._rms_epsilon)
        # TD update
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._updates_ = lasagne.updates.rmsprop(
                T.mean(self._q_func) + self._critic_regularization,
                self._params, self._critic_learning_rate * -T.mean(self._diff),
                self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._updates_ = lasagne.updates.momentum(
                T.mean(self._q_func) + self._critic_regularization,
                self._params,
                self._critic_learning_rate * -T.mean(self._diff),
                momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._updates_ = lasagne.updates.adam(
                T.mean(self._q_func) + self._critic_regularization,
                self._params,
                self._critic_learning_rate * -T.mean(self._diff),
                beta1=0.9,
                beta2=0.999,
                epsilon=1e-08)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])
            sys.exit(-1)
        ## Need to perform an element wise operation or replicate _diff for this to work properly.
        # self._actDiff = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._model.getActionSymbolicVariable() - self._q_valsActA),
        #                                                                    theano.tensor.tile((self._advantage * (1.0/(1.0-self._discount_factor))), self._action_length)) # Target network does not work well here?

        ## advantage = Q(a,s) - V(s) = (r + gamma*V(s')) - V(s)
        # self._advantage = (((self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState)) * self._Fallen)) - self._q_func

        self._Advantage = self._diff  # * (1.0/(1.0-self._discount_factor)) ## scale back to same as rewards
        self._log_prob = loglikelihood(self._model.getActionSymbolicVariable(),
                                       self._q_valsActA, self._q_valsActASTD,
                                       self._action_length)
        self._log_prob_target = loglikelihood(
            self._model.getActionSymbolicVariable(), self._q_valsActTarget,
            self._q_valsActTargetSTD, self._action_length)
        # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target).dot(self._Advantage)) )
        # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target) * (self._Advantage)) )
        # self._actLoss_ = ( ((self._log_prob) * self._Advantage) )
        # self._actLoss_ = ( ((self._log_prob)) )
        ## This does the sum already
        # self._actLoss_ =  ( (self._log_prob).dot( self._Advantage) )
        self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            T.exp(self._log_prob - self._log_prob_target), self._Advantage)
        # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(T.exp(self._log_prob - self._log_prob_target), self._advantage)

        # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._log_prob), self._Advantage)
        # self._actLoss_ = T.mean(self._log_prob)
        # self._policy_entropy = 0.5 * T.mean(T.log(2 * np.pi * self._q_valsActASTD ) + 1 )
        ## - because update computes gradient DESCENT updates
        # self._actLoss = -1.0 * ((T.mean(self._actLoss_)) + (self._actor_regularization ))
        # self._entropy = -1. * T.sum(T.log(self._q_valsActA + 1e-8) * self._q_valsActA, axis=1, keepdims=True)
        ## - because update computes gradient DESCENT updates
        self._actLoss = (T.mean(self._actLoss_))
        # self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop ** 2)/float(self._batch_size)) # because the number of rows can shrink
        # self._actLoss_drop = (T.mean(0.5 * self._actDiff_drop ** 2))
        self._policy_grad = T.grad(self._actLoss, self._actionParams)
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._actionUpdates = lasagne.updates.rmsprop(
                self._policy_grad, self._actionParams, self._learning_rate,
                self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                           self._actionParams,
                                                           self._learning_rate,
                                                           momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._actionUpdates = lasagne.updates.adam(self._policy_grad,
                                                       self._actionParams,
                                                       self._learning_rate,
                                                       beta1=0.9,
                                                       beta2=0.999,
                                                       epsilon=1e-08)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])
        N = self._model.getStateSymbolicVariable().shape[0]
        params = self._actionParams
        surr = self._actLoss * (-1.0)
        self.pg = flatgrad(surr, params)

        prob_mean_fixed = theano.gradient.disconnected_grad(self._q_valsActA)
        prob_std_fixed = theano.gradient.disconnected_grad(self._q_valsActASTD)
        kl_firstfixed = kl(prob_mean_fixed, prob_std_fixed, self._q_valsActA,
                           self._q_valsActASTD, self._action_length).sum() / N
        grads = T.grad(kl_firstfixed, params)
        self.flat_tangent = T.vector(name="flat_tan")
        shapes = [var.get_value(borrow=True).shape for var in params]
        start = 0
        tangents = []
        for shape in shapes:
            size = np.prod(shape)
            tangents.append(
                T.reshape(self.flat_tangent[start:start + size], shape))
            start += size
        self.gvp = T.add(
            *[T.sum(g * tangent) for (g, tangent) in zipsame(grads, tangents)])  #pylint: disable=E1111
        # Fisher-vector product
        self.fvp = flatgrad(self.gvp, params)

        self.ent = entropy(self._q_valsActASTD).mean()
        self.kl = kl(self._q_valsActTarget, self._q_valsActTargetSTD,
                     self._q_valsActA, self._q_valsActASTD,
                     self._action_length).mean()

        self.losses = [surr, self.kl, self.ent]
        self.loss_names = ["surr", "kl", "ent"]

        self.args = [
            self._model.getStateSymbolicVariable(),
            self._model.getActionSymbolicVariable(),
            self._model.getResultStateSymbolicVariable(),
            self._model.getRewardSymbolicVariable(), self._Fallen
            # self._advantage
            # self._q_valsActTarget_
        ]

        self.args_fvp = [
            self._model.getStateSymbolicVariable(),
            # self._model.getActionSymbolicVariable()
            # self._advantage,
            # self._q_valsActTarget_
        ]

        # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) +
        #   (self._regularization_weight * lasagne.regularization.regularize_network_params(
        #       self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams,
        #           self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon)
        self._givens_grad = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }

        ## Bellman error
        self._bellman = self._target - self._q_funcTarget

        TRPOCritic.compile(self)
Esempio n. 14
0
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):
        """Load the expert policies and build the symbolic training graph.

        All arguments are forwarded unchanged to the parent constructor;
        ``model`` is additionally deep-copied to create the target network.
        The method then:

        1. loads one pickled expert policy per entry of the
           ``expert_policy_files`` setting (consecutive identical files are
           loaded only once and shared),
        2. creates the actor-side replay buffers and the symbolic/shared
           variables used as extra inputs,
        3. builds the critic (TD) loss and the actor loss together with
           their regularisation terms and optimizer updates,
        4. calls ``Distillation.compile`` to compile the functions.

        Attributes such as ``self._model``, ``self._batch_size``,
        ``self._discount_factor``, ``self._learning_rate``, ``self._rho``,
        ``self._rms_epsilon``, ``self._regularization_weight`` and
        ``self._action_length`` are read but not assigned here, so they are
        expected to be provided by the parent constructor.

        Args:
            model: Network container exposing actor/critic networks and the
                symbolic/shared input variables.
            n_in: Forwarded to the parent (presumably the state size).
            n_out: Forwarded to the parent (presumably the action size).
            state_bounds: Forwarded to the parent.
            action_bounds: Forwarded to the parent.
            reward_bound: Forwarded to the parent.
            settings_: Forwarded to the parent; the settings are afterwards
                read back through ``self.getSettings()``.

        Note:
            An unknown ``optimizer`` setting terminates the process via
            ``sys.exit(-1)``.
        """

        super(Distillation,
              self).__init__(model, n_in, n_out, state_bounds, action_bounds,
                             reward_bound, settings_)

        ### Load expert policy files
        self._expert_policies = []
        previous_file_name = ""
        expert_model = None
        for policy_dir in self.getSettings()['expert_policy_files']:
            file_name = policy_dir + '/' + self.getSettings(
            )['model_type'] + '/' + getAgentName() + '.pkl'
            if (file_name != previous_file_name):
                print("Loading pre compiled network: ", file_name)
                ## SECURITY: dill.load can execute arbitrary code, only point
                ## 'expert_policy_files' at trusted locations.
                ## The context manager closes the file even if loading fails.
                with open(file_name, 'rb') as f:
                    expert_model = dill.load(f)
            ## When consecutive experts use the same file the already loaded
            ## object is re-used, to help save memory.
            self._expert_policies.append(expert_model)
            previous_file_name = file_name

        ## Buffers for the actor's collected experience
        self._actor_buffer_states = []
        self._actor_buffer_result_states = []
        self._actor_buffer_actions = []
        self._actor_buffer_rewards = []
        self._actor_buffer_falls = []
        self._actor_buffer_diff = []

        self._NotFallen = T.bcol("Not_Fallen")
        ## because float64 <= float32 * int32, need to use int16 or int8
        self._NotFallen.tag.test_value = np.zeros((self._batch_size, 1),
                                                  dtype=np.dtype('int8'))

        self._NotFallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                        dtype='int8'),
                                               broadcastable=(False, True))

        ## Per-sample weight applied to the actor loss below
        self._tmp_diff = T.col("Tmp_Diff")
        self._tmp_diff.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        ## Shared variable backing _tmp_diff, initialised to zeros
        self._tmp_diff_shared = theano.shared(
            np.zeros((self._batch_size, 1),
                     dtype=self.getSettings()['float_type']),
            broadcastable=(False, True))

        self._critic_regularization_weight = self.getSettings(
        )["critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
        ## Target network: an independent deep copy of the given model
        self._modelTarget = copy.deepcopy(model)

        ## Critic outputs. deterministic=False keeps stochastic layers
        ## (e.g. dropout) active, deterministic=True disables them.
        ## Primary critic on the current state
        self._q_valsA = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        ## Primary critic on the current state, stochastic pass
        self._q_valsA_drop = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)
        ## Primary critic on the resulting (next) state
        self._q_valsNextState = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        ## Target critic on the resulting (next) state
        self._q_valsTargetNextState = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        ## Target critic on the current state
        self._q_valsTarget = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        ## Target critic on the current state, stochastic pass
        self._q_valsTarget_drop = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)

        ## Actor outputs on the current state
        self._q_valsActA = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        ## Target actor, deterministic pass
        self._q_valsActTarget = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        ## Primary actor, stochastic pass (used by the actor loss)
        self._q_valsActA_drop = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)

        self._q_func = self._q_valsA
        self._q_funcTarget = self._q_valsTarget
        self._q_func_drop = self._q_valsA_drop
        self._q_funcTarget_drop = self._q_valsTarget_drop
        self._q_funcAct = self._q_valsActA
        self._q_funcAct_drop = self._q_valsActA_drop

        ## TD target: r + gamma * V_target(s')
        self._target = self._model.getRewardSymbolicVariable() + (
            self._discount_factor * self._q_valsTargetNextState)
        ## TD error of the primary critic (deterministic and stochastic pass)
        self._diff = self._target - self._q_func
        self._diff_drop = self._target - self._q_func_drop
        ## Mean squared TD error
        loss = T.pow(self._diff, 2)
        self._loss = T.mean(loss)
        self._loss_drop = T.mean(0.5 * self._diff_drop**2)

        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getActorNetwork())
        ## Substitutions of symbolic inputs by the model's shared data
        self._givens_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards()
        }
        self._actGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions(),
            self._tmp_diff: self._tmp_diff_shared
        }

        ## L2 weight penalties for critic and actor
        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))
        self._actor_regularization = (
            (self._regularization_weight *
             lasagne.regularization.regularize_network_params(
                 self._model.getActorNetwork(), lasagne.regularization.l2)))
        if (self.getSettings()['use_previous_value_regularization']):
            ## Add a penalty on the change between actor and target actor
            self._actor_regularization = self._actor_regularization + (
                (self.getSettings()['previous_value_regularization_weight']) *
                change_penalty(self._model.getActorNetwork(),
                               self._modelTarget.getActorNetwork()))
        elif ('regularization_type' in self.getSettings() and
              (self.getSettings()['regularization_type'] == 'KL_Divergence')):
            ## KL term between target actor and actor outputs, both using a
            ## constant std equal to 'exploration_rate'.
            self._kl_firstfixed = T.mean(
                kl(
                    self._q_valsActTarget,
                    T.ones_like(self._q_valsActTarget) *
                    self.getSettings()['exploration_rate'], self._q_valsActA,
                    T.ones_like(self._q_valsActA) *
                    self.getSettings()['exploration_rate'],
                    self._action_length))
            ## This REPLACES the L2 actor penalty computed above
            self._actor_regularization = (self._kl_firstfixed) * (
                self.getSettings()['kl_divergence_threshold'])

            print("Using regularization type : ",
                  self.getSettings()['regularization_type'])
        # SGD update
        self._value_grad = T.grad(self._loss + self._critic_regularization,
                                  self._params)
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            ## NOTE(review): this branch uses self._learning_rate while the
            ## other critic branches use self._critic_learning_rate; confirm
            ## whether that is intentional.
            self._updates_ = lasagne.updates.rmsprop(self._value_grad,
                                                     self._params,
                                                     self._learning_rate,
                                                     self._rho,
                                                     self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.momentum(
                self._value_grad,
                self._params,
                self._critic_learning_rate,
                momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            ## NOTE(review): beta2=0.9 here, the actor below uses 0.999;
            ## confirm the difference is deliberate.
            self._updates_ = lasagne.updates.adam(self._value_grad,
                                                  self._params,
                                                  self._critic_learning_rate,
                                                  beta1=0.9,
                                                  beta2=0.9,
                                                  epsilon=self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'adagrad'):
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_ = lasagne.updates.adagrad(
                self._value_grad,
                self._params,
                self._critic_learning_rate,
                epsilon=self._rms_epsilon)
        else:
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])
            sys.exit(-1)
        ## TD update

        ## Need to perform an element wise operation or replicate _diff for this to work properly.
        ## Difference between the supplied actions and the actor's
        ## (stochastic pass) output.
        self._actDiff = (self._model.getActionSymbolicVariable() -
                         self._q_valsActA_drop)

        ## Mean squared action error per sample, weighted by _tmp_diff.
        ## NOTE(review): T.mean(..., axis=1) yields a vector while _tmp_diff
        ## is a column, so this product looks like it broadcasts to
        ## (batch, batch) instead of staying per-sample; confirm whether
        ## keepdims=True was intended.
        self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            (T.mean(T.pow(self._actDiff, 2), axis=1)), (self._tmp_diff))
        self._actLoss = T.mean(self._actLoss_)
        self._policy_grad = T.grad(self._actLoss + self._actor_regularization,
                                   self._actionParams)
        ## Actor updates, same optimizer family as the critic
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._actionUpdates = lasagne.updates.rmsprop(
                self._policy_grad, self._actionParams, self._learning_rate,
                self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                           self._actionParams,
                                                           self._learning_rate,
                                                           momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._actionUpdates = lasagne.updates.adam(
                self._policy_grad,
                self._actionParams,
                self._learning_rate,
                beta1=0.9,
                beta2=0.999,
                epsilon=self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'adagrad'):
            self._actionUpdates = lasagne.updates.adagrad(
                self._policy_grad,
                self._actionParams,
                self._learning_rate,
                epsilon=self._rms_epsilon)
        else:
            ## Normally not reached: the critic section above already exits
            ## for an unknown optimizer.
            print("Unknown optimization method: ",
                  self.getSettings()['optimizer'])

        self._givens_grad = {
            self._model.getStateSymbolicVariable(): self._model.getStates()
        }

        ## Bellman error
        self._bellman = self._target - self._q_funcTarget

        ### Give v(s') the next state and v(s) (target) the current state
        ## gamma * critic(state input) - target critic(result-state input);
        ## the givens below swap the two inputs, so the primary critic is fed
        ## the result states and the target critic the current states.
        self._diff_adv = (self._discount_factor * self._q_func) - (
            self._q_valsTargetNextState)
        self._diff_adv_givens = {
            self._model.getStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getStates(),
        }

        ## Called on the class explicitly, so a subclass override of
        ## compile() is not used here.
        Distillation.compile(self)
Esempio n. 15
0
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):
        """
            In order to get this to work we need to be careful not to update the actor parameters
            when updating the critic. This can be an issue when the Concatenating networks together.
            The first first network becomes a part of the second. However you can still access the first
            network by itself but an updates on the second network will effect the first network.
            Care needs to be taken to make sure only the parameters of the second network are updated.
        """

        super(QProp, self).__init__(model, n_in, n_out, state_bounds,
                                    action_bounds, reward_bound, settings_)

        # if ('train_extra_value_function' in self.getSettings() and (self.getSettings()['train_extra_value_function'] == True)):
        self._experience = ExperienceMemory(
            n_in,
            n_out,
            self.getSettings()['expereince_length'],
            continuous_actions=True,
            settings=self.getSettings())

        self._experience.setStateBounds(copy.deepcopy(self.getStateBounds()))
        self._experience.setRewardBounds(copy.deepcopy(self.getRewardBounds()))
        self._experience.setActionBounds(copy.deepcopy(self.getActionBounds()))

        self._use_basic_polcy_grad = False

        self._Fallen = T.bcol("Fallen")
        ## because float64 <= float32 * int32, need to use int16 or int8
        self._Fallen.tag.test_value = np.zeros((self._batch_size, 1),
                                               dtype=np.dtype('int8'))

        self._fallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                     dtype='int8'),
                                            broadcastable=(False, True))

        self._Action = T.matrix("Action2")
        self._Action.tag.test_value = np.random.rand(self._batch_size,
                                                     self._action_length)

        self._Tmp_Target = T.col("Tmp_Target")
        self._Tmp_Target.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._tmp_target_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                                broadcastable=(False, True))

        self._Advantage = T.col("Advantage")
        self._Advantage.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))
        self._advantage_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                               broadcastable=(False, True))
        self._QProp_N = T.col("QProp_N")
        self._QProp_N.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))
        self._QProp_N_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                             broadcastable=(False, True))

        self._modelTarget = copy.deepcopy(model)
        self._modelTarget2 = copy.deepcopy(model)

        self._learning_rate = self.getSettings()['learning_rate']
        self._discount_factor = self.getSettings()['discount_factor']
        self._rho = self.getSettings()['rho']
        self._rms_epsilon = self.getSettings()['rms_epsilon']

        self._weight_update_steps = self.getSettings(
        )['steps_until_target_network_update']
        self._updates = 0
        self._decay_weight = self.getSettings()['regularization_weight']
        self._critic_regularization_weight = self.getSettings(
        )["critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]

        self._q_valsActA = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsActTarget = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._q_valsActTarget_State = lasagne.layers.get_output(
            self._modelTarget2.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)

        self._q_valsActASTD = (T.ones_like(
            self._q_valsActA)) * self.getSettings()['exploration_rate']
        self._q_valsActTargetSTD = (T.ones_like(self._q_valsActTarget_State)
                                    ) * self.getSettings()['exploration_rate']

        inputs_1 = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions()
        }
        self._q_valsA = lasagne.layers.get_output(
            self._model.getCriticNetwork(), inputs_1)
        inputs_2 = {
            self._modelTarget.getStateSymbolicVariable():
            self._model.getResultStates(),
            self._modelTarget.getActionSymbolicVariable():
            self._model.getActions()
        }
        self._q_valsB_ = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(), inputs_2, deterministic=True)

        self._q_func = self._q_valsA
        self._q_funcB = self._q_valsB_
        self._q_funcAct = self._q_valsActA

        self._diff = self._Tmp_Target - self._q_func
        loss = T.pow(self._diff, 2)
        self._loss = T.mean(loss)

        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        print("******Number of Layers is: " + str(
            len(
                lasagne.layers.helper.get_all_params(
                    self._model.getCriticNetwork()))))
        print("******Number of Action Layers is: " + str(
            len(
                lasagne.layers.helper.get_all_params(
                    self._model.getActorNetwork()))))
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getActorNetwork())
        self._givens_ = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions(),
            self._Tmp_Target: self._tmp_target_shared
        }
        self._actGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
        }

        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))

        ## MSE update
        self._value_grad = T.grad(self._loss + self._critic_regularization,
                                  self._params)
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_ = lasagne.updates.adam(self._value_grad,
                                              self._params,
                                              self._critic_learning_rate,
                                              beta1=0.9,
                                              beta2=0.9,
                                              epsilon=self._rms_epsilon)

        self._givens_grad = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions(),
        }

        ## Some cool stuff to backprop action gradients
        self._action_grad = T.matrix("Action_Grad")
        self._action_grad.tag.test_value = np.zeros(
            (self._batch_size, self._action_length),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._action_grad_shared = theano.shared(
            np.zeros((self._batch_size, self._action_length),
                     dtype=self.getSettings()['float_type']))

        ### Maximize wrt q function
        self._action_mean_grads = T.grad(
            cost=None,
            wrt=self._actionParams,
            known_grads={self._q_valsActA: self._action_grad_shared}),
        print("Action grads: ", self._action_mean_grads[0])
        ## The trailing comma on the T.grad call above wraps its result in a 1-tuple, so take
        ## element [0]; the optimizer needs a proper list of gradient expressions.
        self._action_mean_grads = list(self._action_mean_grads[0])
        self._actionGRADUpdates = lasagne.updates.adam(
            self._action_mean_grads,
            self._actionParams,
            self._learning_rate,
            beta1=0.9,
            beta2=0.9,
            epsilon=self._rms_epsilon)

        self._actGradGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
        }
        self._actor_regularization = (
            self._regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getActorNetwork(), lasagne.regularization.l2))

        ### Update the actor with respect to the Q function
        """
        inputs_1_ = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._q_valsActA: self._model.getActions()
        }
        q = self._model.getCriticNetwork()(self._model.getStateSymbolicVariable(), self._q_valsActA)
        self._q_valsA_ = lasagne.layers.get_output(self._model.getCriticNetwork(), inputs_1_)
        # self._q_valsA_ = lasagne.layers.get_output(self._model.getCriticNetwork(), self._q_valsActA)
        self._q_val2 = theano.function([self._model.getStateSymbolicVariable()], self._q_valsA_)
        self._actionUpdates = lasagne.updates.adam(-T.mean(self._q_valsA_), self._actionParams, 
                    self._learning_rate,  beta1=0.9, beta2=0.9, epsilon=self._rms_epsilon)
        """
        ## Compute on-policy policy gradient
        self._prob = likelihood(self._model.getActionSymbolicVariable(),
                                self._q_valsActA, self._q_valsActASTD,
                                self._action_length)
        ### How should this work if the target network is very different, i.e. not just a slightly outdated copy?
        self._prob_target = likelihood(self._model.getActionSymbolicVariable(),
                                       self._q_valsActTarget_State,
                                       self._q_valsActTargetSTD,
                                       self._action_length)
        ## This does the sum already
        self._r = (self._prob / self._prob_target)
        self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            (self._r), self._Advantage)
        ppo_epsilon = self.getSettings()['kl_divergence_threshold']
        self._actLoss_2 = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            (theano.tensor.clip(self._r, 1.0 - ppo_epsilon,
                                1 + ppo_epsilon), self._Advantage))
        self._actLoss_ = theano.tensor.minimum((self._actLoss_),
                                               (self._actLoss_2))
        self._actLoss = (
            (T.mean(self._actLoss_))) + -self._actor_regularization

        self._policy_grad = T.grad(-1.0 * self._actLoss, self._actionParams)
        self._policy_grad = lasagne.updates.total_norm_constraint(
            self._policy_grad, 5)
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._actionUpdates = lasagne.updates.rmsprop(
                self._policy_grad, self._actionParams, self._learning_rate,
                self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                           self._actionParams,
                                                           self._learning_rate,
                                                           momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._actionUpdates = lasagne.updates.adam(self._policy_grad,
                                                       self._actionParams,
                                                       self._learning_rate,
                                                       beta1=0.9,
                                                       beta2=0.999,
                                                       epsilon=1e-08)

        self._qprop_loss = self._actLoss + T.mean(
            (self._QProp_N * self._q_func))
        self._policy_grad_loss = self._actLoss
        # if ('train_extra_value_function' in self.getSettings() and (self.getSettings()['train_extra_value_function'] == True)):
        self._valsA = lasagne.layers.get_output(
            self._model._value_function,
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._valsA_drop = lasagne.layers.get_output(
            self._model._value_function,
            self._model.getStateSymbolicVariable(),
            deterministic=False)
        self._valsNextState = lasagne.layers.get_output(
            self._model._value_function,
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._valsTargetNextState = lasagne.layers.get_output(
            self._modelTarget._value_function,
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._valsTarget = lasagne.layers.get_output(
            self._modelTarget._value_function,
            self._model.getStateSymbolicVariable(),
            deterministic=True)

        self._v_target = self._model.getRewardSymbolicVariable() + (
            self._discount_factor * self._valsTargetNextState)
        self._v_diff = self._v_target - self._valsA
        loss_v = T.pow(self._v_diff, 2)
        self._v_loss = T.mean(loss_v)

        self._params_value = lasagne.layers.helper.get_all_params(
            self._model._value_function)
        self._givens_value = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards(),
        }
        self._value_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model._value_function, lasagne.regularization.l2))

        self._value_grad = T.grad(self._v_loss + self._value_regularization,
                                  self._params_value)
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_value = lasagne.updates.adam(self._value_grad,
                                                   self._params_value,
                                                   self._critic_learning_rate,
                                                   beta1=0.9,
                                                   beta2=0.9,
                                                   epsilon=self._rms_epsilon)

        self._actGivens_PPO = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            # self._NotFallen: self._NotFallen_shared,
            self._Advantage:
            self._advantage_shared,
            # self._KL_Weight: self._kl_weight_shared
        }
        QProp.compile(self)