Example #1
File: GAN.py Project: skylbc/SMBAE
class GAN(AlgorithmInterface):
    """
        0 is a generated sample
        1 is a true sample
        maximize D while minimizing G
    """
    def __init__(self, model, state_length, action_length, state_bounds,
                 action_bounds, settings_):

        print("Building GAN Model")
        super(GAN, self).__init__(model, state_length, action_length,
                                  state_bounds, action_bounds, 0, settings_)
        self._noise_mean = 0.0
        self._noise_std = 1.0
        self._noise_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                           broadcastable=(False, True))

        # if settings['action_space_continuous']:
        if ('size_of_result_state' in self.getSettings()):
            self._experience = ExperienceMemory(
                state_length,
                action_length,
                self.getSettings()['expereince_length'],
                continuous_actions=True,
                settings=self.getSettings(),
                result_state_length=self.getSettings()['size_of_result_state'])
        else:
            self._experience = ExperienceMemory(
                state_length,
                action_length,
                self.getSettings()['expereince_length'],
                continuous_actions=True,
                settings=self.getSettings())

        self._experience.setStateBounds(copy.deepcopy(self.getStateBounds()))
        self._experience.setRewardBounds(copy.deepcopy(self.getRewardBounds()))
        self._experience.setActionBounds(copy.deepcopy(self.getActionBounds()))

        self._modelTarget = copy.deepcopy(model)

        # print ("Initial W " + str(self._w_o.get_value()) )

        self._learning_rate = self.getSettings()["fd_learning_rate"]
        self._regularization_weight = 1e-5
        self._discount_factor = self.getSettings()['discount_factor']
        self._rho = self.getSettings()['rho']
        self._rms_epsilon = self.getSettings()['rms_epsilon']

        self._weight_update_steps = self.getSettings(
        )['steps_until_target_network_update']
        self._updates = 0
        self._decay_weight = self.getSettings()['regularization_weight']
        self._critic_regularization_weight = self.getSettings(
        )["critic_regularization_weight"]

        # self._q_valsA = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)
        # self._q_valsA_drop = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)
        # self._q_valsNextState = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True)
        # self._q_valsTargetNextState = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True)
        # self._q_valsTarget = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)
        # self._q_valsTarget_drop = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)

        if ("train_gan_with_gaussian_noise" in self.getSettings()
                and (self.getSettings()["train_gan_with_gaussian_noise"])):
            inputs_1 = {
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getActionSymbolicVariable():
                self._model.getActions(),
                self._model.getResultStateSymbolicVariable():
                self._model.getResultStates(),
                self._model._Noise:
                self._noise_shared
            }
            self._generator_drop = lasagne.layers.get_output(
                self._model.getForwardDynamicsNetwork(),
                inputs_1,
                deterministic=True)
            self._generator = lasagne.layers.get_output(
                self._model.getForwardDynamicsNetwork(),
                inputs_1,
                deterministic=True)
        else:
            inputs_1 = {
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getActionSymbolicVariable():
                self._model.getActions(),
                self._model.getResultStateSymbolicVariable():
                self._model.getResultStates(),
                # self._model._Noise: self._noise_shared
            }
            self._generator = lasagne.layers.get_output(
                self._model.getForwardDynamicsNetwork(),
                inputs_1,
                deterministic=True)
            self._generator_drop = lasagne.layers.get_output(
                self._model.getForwardDynamicsNetwork(),
                inputs_1,
                deterministic=False)
        # self._q_valsActTarget = lasagne.layers.get_output(self._modelTarget.getForwardDynamicsNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True)
        # self._q_valsActA_drop = lasagne.layers.get_output(self._model.getForwardDynamicsNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)

        inputs_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model._Noise: self._noise_shared
        }
        self._discriminator = lasagne.layers.get_output(
            self._model.getCriticNetwork(), inputs_, deterministic=True)
        self._discriminator_drop = lasagne.layers.get_output(
            self._model.getCriticNetwork(), inputs_, deterministic=False)
        """
        inputs_2 = {
            self._modelTarget.getStateSymbolicVariable(): self._model.getResultStates(),
            self._modelTarget.getActionSymbolicVariable(): self._model.getActions()
        }
        """

        self._diff = self._model.getRewardSymbolicVariable(
        ) - self._discriminator_drop
        loss = T.pow(self._diff, 2)
        self._loss = T.mean(loss)

        self._diff_g = self._model.getResultStateSymbolicVariable(
        ) - self._generator_drop
        loss_g = T.pow(self._diff_g, 2)
        self._loss_g = T.mean(loss_g)

        # assert len(lasagne.layers.helper.get_all_params(self._l_outA)) == 16
        # Need to remove the action layers from these params
        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        print("******Number of Layers is: " + str(
            len(
                lasagne.layers.helper.get_all_params(
                    self._model.getCriticNetwork()))))
        print("******Number of Action Layers is: " + str(
            len(
                lasagne.layers.helper.get_all_params(
                    self._model.getForwardDynamicsNetwork()))))
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getForwardDynamicsNetwork())
        self._givens_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards(),
            # self._model._Noise: self._noise_shared
        }

        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))

        ## MSE update
        self._value_grad = T.grad(self._loss + self._critic_regularization,
                                  self._params)
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_ = lasagne.updates.adam(self._value_grad,
                                              self._params,
                                              self._learning_rate,
                                              beta1=0.9,
                                              beta2=0.9,
                                              epsilon=self._rms_epsilon)

        if ("train_gan_with_gaussian_noise" in settings_
                and (settings_["train_gan_with_gaussian_noise"])):
            self._actGivens = {
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getActionSymbolicVariable():
                self._model.getActions(),
                # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
                self._model._Noise:
                self._noise_shared
            }
            self._actGivens_MSE = {
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getActionSymbolicVariable():
                self._model.getActions(),
                self._model.getResultStateSymbolicVariable():
                self._model.getResultStates(),
                self._model._Noise:
                self._noise_shared
            }
        else:
            self._actGivens = {
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getActionSymbolicVariable():
                self._model.getActions(),
                # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
                # self._model._Noise: self._noise_shared
            }
            self._actGivens_MSE = {
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getActionSymbolicVariable():
                self._model.getActions(),
                # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
                # self._model._Noise: self._noise_shared
            }

        self._actor_regularization = (
            self._regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getForwardDynamicsNetwork(),
                lasagne.regularization.l2))
        ## MSE update
        self._gen_grad = T.grad(self._loss_g + self._actor_regularization,
                                self._actionParams)
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_generator = lasagne.updates.adam(
            self._gen_grad,
            self._actionParams,
            self._learning_rate,
            beta1=0.9,
            beta2=0.9,
            epsilon=self._rms_epsilon)

        ## Some cool stuff to backprop action gradients

        self._result_state_grad = T.matrix("Action_Grad")
        self._result_state_grad.tag.test_value = np.zeros(
            (self._batch_size, self._state_length),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._result_state_grad_shared = theano.shared(
            np.zeros((self._batch_size, self._state_length),
                     dtype=self.getSettings()['float_type']))

        ### Maximize wrt q function
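        ## T.grad(cost=None, known_grads=...) backpropagates an externally supplied
        ## gradient (here the gradient of the discriminator w.r.t. the generated
        ## result state, written into self._result_state_grad_shared by
        ## setGradTarget()) through the generator parameters.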

        self._result_state_mean_grads = T.grad(
            cost=None,
            wrt=self._actionParams,
            known_grads={self._generator: self._result_state_grad_shared})
        print("Result state grads: ", self._result_state_mean_grads)
        ## lasagne.updates expects a proper list of gradient expressions
        self._result_state_mean_grads = list(self._result_state_mean_grads)
        # print ("isinstance(self._action_mean_grads, list): ", isinstance(self._action_mean_grads, list))
        # print ("Action grads: ", self._action_mean_grads)
        self._generatorGRADUpdates = lasagne.updates.adam(
            self._result_state_mean_grads,
            self._actionParams,
            self._learning_rate * 0.1,
            beta1=0.9,
            beta2=0.9,
            epsilon=self._rms_epsilon)

        self._givens_grad = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
        }

        ### Some other stuff to learn a reward function
        self._inputs_reward_ = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions(),
        }
        self._reward = lasagne.layers.get_output(
            self._model.getRewardNetwork(),
            self._inputs_reward_,
            deterministic=True)
        self._reward_drop = lasagne.layers.get_output(
            self._model.getRewardNetwork(),
            self._inputs_reward_,
            deterministic=False)
        ## Rewards are normalized and then scaled by the discount factor so the value stays between -1 and 1.
        self._reward_diff = (self._model.getRewardSymbolicVariable() *
                             (1.0 /
                              (1.0 - self.getSettings()['discount_factor']))
                             ) - self._reward_drop
        self.__Reward = self._model.getRewardSymbolicVariable()
        print("self.__Reward", self.__Reward)
        # self._reward_diff = (self._model.getRewardSymbolicVariable()) - self._reward_drop
        self._reward_loss_ = T.mean(T.pow(self._reward_diff, 2), axis=1)
        self._reward_loss = T.mean(self._reward_loss_)

        self._reward_diff_NoDrop = (
            self._model.getRewardSymbolicVariable() *
            (1.0 /
             (1.0 - self.getSettings()['discount_factor']))) - self._reward
        # self._reward_diff_NoDrop = (self._model.getRewardSymbolicVariable()) - self._reward
        self._reward_loss_NoDrop_ = T.mean(T.pow(self._reward_diff_NoDrop, 2),
                                           axis=1)
        self._reward_loss_NoDrop = T.mean(self._reward_loss_NoDrop_)
        self._reward_params = lasagne.layers.helper.get_all_params(
            self._model.getRewardNetwork())
        self._reward_givens_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            # self._model.getResultStateSymbolicVariable() : self._model.getResultStates(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards(),
        }
        self._reward_updates_ = lasagne.updates.adam(
            self._reward_loss +
            (self._regularization_weight *
             lasagne.regularization.regularize_network_params(
                 self._model.getRewardNetwork(), lasagne.regularization.l2)),
            self._reward_params,
            self._learning_rate,
            beta1=0.9,
            beta2=0.999,
            epsilon=self._rms_epsilon)

        GAN.compile(self)

    def compile(self):
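        ## None of the compiled Theano functions below take explicit inputs; their
        ## data is supplied through `givens`, which read from the model's stored
        ## batch variables populated beforehand by setData().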

        self._train = theano.function([], [self._loss, self._discriminator],
                                      updates=self._updates_,
                                      givens=self._givens_)

        # self._trainActor = theano.function([], [actLoss, self._q_valsActA], updates=actionUpdates, givens=actGivens)
        # self._trainActor = theano.function([], [self._q_func], updates=self._actionUpdates, givens=self._actGivens)
        self._trainGenerator = theano.function(
            [], [], updates=self._generatorGRADUpdates, givens=self._actGivens)
        self._trainGenerator_MSE = theano.function(
            [], [],
            updates=self._updates_generator,
            givens=self._actGivens_MSE)
        self._discriminate = theano.function(
            [],
            self._discriminator,
            givens={
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getActionSymbolicVariable():
                self._model.getActions(),
                self._model.getResultStateSymbolicVariable():
                self._model.getResultStates(),
            })

        #self._q_val_Target = theano.function([], self._q_valsB_, givens=self._givens_grad)
        if ("train_gan_with_gaussian_noise" in self.getSettings()
                and (self.getSettings()["train_gan_with_gaussian_noise"])):
            self._generate = theano.function(
                [],
                self._generator,
                givens={
                    self._model.getStateSymbolicVariable():
                    self._model.getStates(),
                    self._model.getActionSymbolicVariable():
                    self._model.getActions(),
                    # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
                    self._model._Noise:
                    self._noise_shared
                })
        else:
            self._generate = theano.function(
                [],
                self._generator,
                givens={
                    self._model.getStateSymbolicVariable():
                    self._model.getStates(),
                    self._model.getActionSymbolicVariable():
                    self._model.getActions(),
                    # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
                    self._model._Noise:
                    self._noise_shared
                })
        """
        inputs_ = [
                   self._model.getStateSymbolicVariable(), 
                   self._model.getRewardSymbolicVariable(), 
                   # ResultState
                   ]
        self._bellman_error = theano.function(inputs=inputs_, outputs=self._diff, allow_input_downcast=True)
        """
        # self._diffs = theano.function(input=[State])
        self._bellman_error = theano.function(
            inputs=[],
            outputs=self._loss_g,
            allow_input_downcast=True,
            givens={
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getActionSymbolicVariable():
                self._model.getActions(),
                self._model.getResultStateSymbolicVariable():
                self._model.getResultStates(),
                self._model._Noise:
                self._noise_shared
            })

        # self._get_action_grad = theano.function([], outputs=lasagne.updates.get_or_compute_grads(T.mean(self._discriminator), [self._model._actionInputVar] + self._params), allow_input_downcast=True, givens=self._givens_grad)
        self._get_state_grad = theano.function(
            [],
            outputs=lasagne.updates.get_or_compute_grads(
                T.mean(self._discriminator),
                [self._model._stateInputVar] + self._params),
            allow_input_downcast=True,
            givens=self._givens_grad)
        self._get_result_state_grad = theano.function(
            [],
            outputs=lasagne.updates.get_or_compute_grads(
                T.mean(self._discriminator),
                [self._model._resultStateInputVar] + self._params),
            allow_input_downcast=True,
            givens=self._givens_grad)
        self._get_action_grad = theano.function(
            [],
            outputs=T.grad(
                cost=None,
                wrt=[self._model._actionInputVar] + self._actionParams,
                known_grads={self._generator: self._result_state_grad_shared}),
            allow_input_downcast=True,
            givens=self._actGivens)

        # self._get_grad_reward = theano.function([], outputs=lasagne.updates.get_or_compute_grads((self._reward_loss_NoDrop), [lasagne.layers.get_all_layers(self._model.getRewardNetwork())[0].input_var] + self._reward_params), allow_input_downcast=True,
        self._get_grad_reward = theano.function(
            [],
            outputs=lasagne.updates.get_or_compute_grads(
                T.mean(self._reward),
                [self._model._actionInputVar] + self._reward_params),
            allow_input_downcast=True,
            givens=self._inputs_reward_)

        self._train_reward = theano.function([], [self._reward_loss],
                                             updates=self._reward_updates_,
                                             givens=self._reward_givens_)
        self._predict_reward = theano.function([],
                                               self._reward,
                                               givens=self._inputs_reward_)
        self._reward_error = theano.function(inputs=[],
                                             outputs=self._reward_diff,
                                             allow_input_downcast=True,
                                             givens=self._reward_givens_)
        self._reward_values = theano.function(
            inputs=[],
            outputs=self.__Reward,
            allow_input_downcast=True,
            givens={
                # self._model.getStateSymbolicVariable() : self._model.getStates(),
                # self._model.getResultStateSymbolicVariable() : self._model.getResultStates(),
                # self._model.getActionSymbolicVariable(): self._model.getActions(),
                self._model.getRewardSymbolicVariable():
                self._model.getRewards(),
            })

    def getStateGrads(self, states, actions=None, alreadyNormed=False):
        """
            The states should be normalized
        """
        # self.setData(states, actions, rewards, result_states)
        if (alreadyNormed == False):
            states = norm_state(states, self._state_bounds)
        states = np.array(states, dtype=theano.config.floatX)
        self._model.setStates(states)

        return self._get_state_grad()

    def getResultStateGrads(self,
                            result_states,
                            actions=None,
                            alreadyNormed=False):
        """
            The states should be normalized
        """
        # self.setData(states, actions, rewards, result_states)
        if (alreadyNormed == False):
            result_states = norm_state(result_states, self._state_bounds)
        result_states = np.array(result_states, dtype=theano.config.floatX)
        self._model.setResultStates(result_states)

        return self._get_result_state_grad()

    def setGradTarget(self, grad):
        self._result_state_grad_shared.set_value(grad)

    def getGrads(self,
                 states,
                 actions,
                 result_states,
                 v_grad=None,
                 alreadyNormed=False):
        if (alreadyNormed == False):
            states = np.array(norm_state(states, self._state_bounds),
                              dtype=self.getSettings()['float_type'])
            actions = np.array(norm_action(actions, self._action_bounds),
                               dtype=self.getSettings()['float_type'])
            result_states = np.array(norm_state(result_states,
                                                self._state_bounds),
                                     dtype=self.getSettings()['float_type'])
        # result_states = np.array(result_states, dtype=self.getSettings()['float_type'])
        self.setData(states, actions, result_states)
        # if (v_grad != None):
        self.setGradTarget(v_grad)
        return self._get_action_grad()

    def getRewardGrads(self, states, actions, alreadyNormed=False):
        # states = np.array(states, dtype=self.getSettings()['float_type'])
        # actions = np.array(actions, dtype=self.getSettings()['float_type'])
        if (alreadyNormed is False):
            states = np.array(norm_state(states, self._state_bounds),
                              dtype=self.getSettings()['float_type'])
            actions = np.array(norm_action(actions, self._action_bounds),
                               dtype=self.getSettings()['float_type'])
            # rewards = np.array(norm_state(rewards, self._reward_bounds), dtype=self.getSettings()['float_type'])
        self.setData(states, actions)
        return self._get_grad_reward()

    def getNetworkParameters(self):
        params = []
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._model.getCriticNetwork()))
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._model.getForwardDynamicsNetwork()))
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._model.getRewardNetwork()))
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._modelTarget.getCriticNetwork()))
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._modelTarget.getForwardDynamicsNetwork()))
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._modelTarget.getRewardNetwork()))
        return params

    def setNetworkParameters(self, params):
        lasagne.layers.helper.set_all_param_values(
            self._model.getCriticNetwork(), params[0])
        lasagne.layers.helper.set_all_param_values(
            self._model.getForwardDynamicsNetwork(), params[1])
        lasagne.layers.helper.set_all_param_values(
            self._model.getRewardNetwork(), params[2])
        lasagne.layers.helper.set_all_param_values(
            self._modelTarget.getCriticNetwork(), params[3])
        lasagne.layers.helper.set_all_param_values(
            self._modelTarget.getForwardDynamicsNetwork(), params[4])
        lasagne.layers.helper.set_all_param_values(
            self._modelTarget.getRewardNetwork(), params[5])

    def setData(self, states, actions, result_states=None, rewards=None):
        self._model.setStates(states)
        self._model.setActions(actions)
        if not (result_states is None):
            self._model.setResultStates(result_states)
        if not (rewards is None):
            self._model.setRewards(rewards)
        noise = np.random.normal(self._noise_mean,
                                 self._noise_std,
                                 size=(states.shape[0], 1))
        self._noise_shared.set_value(noise)
        # noise = np.zeros((states.shape[0],1))
        # self._noise_shared.set_value(noise)

    def trainCritic(self, states, actions, result_states, rewards):

        self.setData(states, actions, result_states, rewards)
        noise = np.random.normal(self._noise_mean,
                                 self._noise_std,
                                 size=(states.shape[0], 1))
        # print ("Shapes: ", states.shape, actions.shape, rewards.shape, result_states.shape, falls.shape, noise.shape)
        self._noise_shared.set_value(noise)
        self._updates += 1
        ## Generate predicted next states with the current generator
        generated_samples = self._generate()
        ### Put generated samples in memory
        for i in range(generated_samples.shape[0]):
            next_state__ = scale_state(generated_samples[i],
                                       self._state_bounds)
            tup = ([states[i]], [actions[i]], [next_state__], [rewards[i]],
                   [0], [0], [0])
            self._experience.insertTuple(tup)
        tmp_result_states = copy.deepcopy(result_states)
        tmp_rewards = copy.deepcopy(rewards)

        ## Pull out a batch of generated samples
        states__, actions__, generated_samples, rewards__, falls__, G_ts__, exp_actions__ = self._experience.get_batch(
            min(states.shape[0], self._experience.samples()))
        """
        print("generated_samples: ", generated_samples.shape)
        print("tmp_result_states: ", tmp_result_states.shape)
        print("tmp_rewards: ", tmp_rewards.shape)
        print("states: ", states.shape)
        print("actions: ", actions.shape)
        """

        ## replace half of the samples with generated ones...
        for i in range(int(states.shape[0] / 2)):

            tmp_result_states[i] = generated_samples[i]
            tmp_rewards[i] = [0]

        # print("Discriminator targets: ", tmp_rewards)

        self.setData(states, actions, tmp_result_states, tmp_rewards)

        loss, _ = self._train()
        # print("Discriminator loss: ", loss)
        return loss

    def trainActor(self, states, actions, result_states, rewards):
        self.setData(states, actions, result_states, rewards)

        # self._noise_shared.set_value(np.random.normal(self._noise_mean,self._noise_std, size=(states.shape[0],1)))
        ## Add MSE term
        if ('train_gan_mse' in self.getSettings()
                and (self.getSettings()['train_gan_mse'] == False)):
            pass
        else:
            self._trainGenerator_MSE()
        # print("Policy mean: ", np.mean(self._q_action(), axis=0))
        loss = 0
        # print("******** Not learning actor right now *****")
        # return loss
        generated_samples = self.predict_batch(states, actions)
        result_state_grads = self.getResultStateGrads(generated_samples,
                                                      actions,
                                                      alreadyNormed=True)[0]
        discriminator_value = self._discriminate()
        """
            From DEEP REINFORCEMENT LEARNING IN PARAMETERIZED ACTION SPACE
            Hausknecht, Matthew and Stone, Peter
            
            actions.shape == result_state_grads.shape
        """
        use_parameter_grad_inversion = True
        if (use_parameter_grad_inversion):
            for i in range(result_state_grads.shape[0]):
                for j in range(result_state_grads.shape[1]):
                    if (result_state_grads[i, j] > 0):
                        inversion = (1.0 - generated_samples[i, j]) / 2.0
                    else:
                        inversion = (generated_samples[i, j] - (-1.0)) / 2.0
                    result_state_grads[i,
                                       j] = result_state_grads[i,
                                                               j] * inversion

        if (self.getSettings()["print_levels"][self.getSettings(
        )["print_level"]] >= self.getSettings()["print_levels"]['debug']):
            print("Policy mean: ", np.mean(self._generate(), axis=0))
            print("Mean action grad: ", np.mean(result_state_grads, axis=0),
                  " std ", np.std(result_state_grads, axis=0))

        ## Set data for gradient
        self._model.setResultStates(result_states)
        self._modelTarget.setResultStates(result_states)

        # self._noise_shared.set_value(np.random.normal(self._noise_mean,self._noise_std, size=(states.shape[0],1)))
        error_MSE = self._bellman_error()
        ## The -1.0 flips the sign: the optimizer always performs minimization,
        ## while we want to ascend the discriminator's gradient.
        self._result_state_grad_shared.set_value(-1.0 * result_state_grads)
        self._trainGenerator()
        # self._noise_shared.set_value(np.random.normal(self._noise_mean,self._noise_std, size=(states.shape[0],1)))
        error_MSE = self._bellman_error()
        return (np.mean(discriminator_value), error_MSE)

    def train(self, states, actions, result_states, rewards):
        loss = self.trainCritic(states, actions, result_states, rewards)
        # loss = 0
        lossActor = self.trainActor(states, actions, result_states, rewards)
        if (self.getSettings()['train_reward_predictor']):
            # print ("self._reward_bounds: ", self._reward_bounds)
            # print( "Rewards, predicted_reward, difference, model diff, model rewards: ", np.concatenate((rewards, self._predict_reward(), self._predict_reward() - rewards, self._reward_error(), self._reward_values()), axis=1))
            self.setData(states, actions, result_states, rewards)
            lossReward = self._train_reward()
            if (self.getSettings()["print_levels"][self.getSettings(
            )["print_level"]] >= self.getSettings()["print_levels"]['train']):
                print("Loss Reward: ", lossReward)
        return (loss, lossActor)

    ## NOTE: the two stubs below are shadowed by the (state, action) versions defined
    ## further down; Python keeps only the last definition of each method name.
    def predict(self, state, deterministic_=True):
        pass

    def predict_batch(self, states, deterministic_=True):
        pass

    def predict(self, state, action):
        # states = np.zeros((self._batch_size, self._self._state_length), dtype=theano.config.floatX)
        # states[0, ...] = state
        state = np.array(norm_state(state, self._state_bounds),
                         dtype=self.getSettings()['float_type'])
        # print ("fd state: ", state)
        action = np.array(norm_action(action, self._action_bounds),
                          dtype=self.getSettings()['float_type'])
        # self._model.setStates(state)
        # self._model.setActions(action)
        self.setData(state, action)
        # self._noise_shared.set_value(np.random.normal(self._noise_mean,self._noise_std, size=(1,1)))
        # print ("State bounds: ", self._state_bounds)
        # print ("gen output: ", self._generate()[0])
        state_ = scale_state(self._generate(), self._state_bounds)
        # print( "self._state_bounds: ", self._state_bounds)
        # print ("scaled output: ", state_)
        return state_

    def predict_batch(self, states, actions):
        ## These inputs should already be normalized.
        # self._model.setStates(states)
        # self._model.setActions(actions)
        self.setData(states, actions)
        # self._noise_shared.set_value(np.random.normal(self._noise_mean,self._noise_std, size=(states.shape[0],1)))
        # print ("State bounds: ", self._state_bounds)
        # print ("fd output: ", self._forwardDynamics()[0])
        # state_ = scale_state(self._generate(), self._state_bounds)
        state_ = self._generate()
        return state_

    ## NOTE: this q_value is shadowed by the (state, action, next_state) version below;
    ## it relies on self._q_action(), and q_values() on self._q_action()/self._q_val(),
    ## none of which are defined in this class.
    def q_value(self, state):
        """
            For returning a vector of q values, state should NOT be normalized
        """
        # states = np.zeros((self._batch_size, self._state_length), dtype=theano.config.floatX)
        # states[0, ...] = state
        state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=theano.config.floatX)
        self._model.setStates(state)
        self._modelTarget.setStates(state)
        action = self._q_action()
        self._model.setActions(action)
        self._modelTarget.setActions(action)
        return scale_reward(self._discriminate(), self.getRewardBounds()) * (
            1.0 / (1.0 - self.getSettings()['discount_factor']))
        # return self._q_valTarget()[0]
        # return self._q_val()[0]

    def q_value(self, state, action, next_state):
        """
            For returning a vector of q values, state should NOT be normalized
        """
        # states = np.zeros((self._batch_size, self._state_length), dtype=theano.config.floatX)
        # states[0, ...] = state
        state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=theano.config.floatX)
        self._model.setStates(state)
        self._modelTarget.setStates(state)
        # action = self._q_action()
        action = norm_state(action, self.getActionBounds())
        self._model.setActions(action)
        self._modelTarget.setActions(action)
        nextState = norm_state(next_state, self.getStateBounds())
        # nextState = np.reshape(nextState, (1,20))
        self._model.setResultStates(nextState)
        self._modelTarget.setResultStates(nextState)

        # return scale_reward(self._discriminate(), self.getRewardBounds())[0] * (1.0 / (1.0- self.getSettings()['discount_factor']))
        return self._discriminate()
        # return self._q_valTarget()[0]
        # return self._q_val()[0]

    def q_values(self, state):
        """
            For returning a vector of q values, state should already be normalized
        """
        state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=theano.config.floatX)
        self._model.setStates(state)
        self._modelTarget.setStates(state)
        action = self._q_action()
        self._model.setActions(action)
        self._modelTarget.setActions(action)
        return scale_reward(self._q_val(), self.getRewardBounds()) * (
            1.0 / (1.0 - self.getSettings()['discount_factor']))
        # return self._q_valTarget()
        # return self._q_val()

    def predict_std(self, state, deterministic_=True):
        """
            This does nothing for a GAN...
        """
        # states = np.zeros((self._batch_size, self._state_length), dtype=theano.config.floatX)
        # states[0, ...] = state
        action_std = np.array([0] * len(self._action_bounds))
        # np.zeros((state.shape[0], len(self._action_bounds)))
        # else:
        # action_ = scale_action(self._q_action()[0], self._action_bounds)
        # action_ = q_valsActA[0]
        return action_std

    def predict_reward(self, state, action):
        # states = np.zeros((self._batch_size, self._self._state_length), dtype=theano.config.floatX)
        # states[0, ...] = state
        state = np.array(norm_state(state, self._state_bounds),
                         dtype=self.getSettings()['float_type'])
        action = np.array(norm_action(action, self._action_bounds),
                          dtype=self.getSettings()['float_type'])
        self._model.setStates(state)
        self._model.setActions(action)
        predicted_reward = self._predict_reward()
        reward_ = scale_reward(predicted_reward, self.getRewardBounds(
        ))  # * (1.0 / (1.0- self.getSettings()['discount_factor']))
        # reward_ = scale_reward(predicted_reward, self.getRewardBounds())[0] * (1.0 / (1.0- self.getSettings()['discount_factor']))
        # reward_ = scale_state(predicted_reward, self._reward_bounds)
        # print ("reward, predicted reward: ", reward_, predicted_reward)
        return reward_

    def predict_reward_batch(self, states, actions):

        # states = np.zeros((self._batch_size, self._self._state_length), dtype=theano.config.floatX)
        # states[0, ...] = state
        # state = np.array(norm_state(state, self._state_bounds), dtype=self.getSettings()['float_type'])
        # action = np.array(norm_action(action, self._action_bounds), dtype=self.getSettings()['float_type'])
        self._model.setStates(states)
        self._model.setActions(actions)
        predicted_reward = self._predict_reward()
        # reward_ = scale_reward(predicted_reward, self.getRewardBounds())[0] # * (1.0 / (1.0- self.getSettings()['discount_factor']))
        # reward_ = scale_reward(predicted_reward, self.getRewardBounds())[0] * (1.0 / (1.0- self.getSettings()['discount_factor']))
        # reward_ = scale_state(predicted_reward, self._reward_bounds)
        # print ("reward, predicted reward: ", reward_, predicted_reward)
        return predicted_reward

    def bellman_error(self, states, actions, result_states, rewards):
        self.setData(states, actions, result_states, rewards)
        return self._bellman_error()

    def reward_error(self, states, actions, result_states, rewards):
        # rewards = rewards * (1.0/(1.0-self.getSettings()['discount_factor'])) # scale rewards
        self.setData(states, actions, result_states, rewards)
        return self._reward_error()

    def setStateBounds(self, state_bounds):
        super(GAN, self).setStateBounds(state_bounds)
        """
        print ("")
        print("Setting GAN state bounds: ", state_bounds)
        print("self.getStateBounds(): ", self.getStateBounds())
        print ("")
        """
        self._experience.setStateBounds(copy.deepcopy(self.getStateBounds()))

    def setActionBounds(self, action_bounds):
        super(GAN, self).setActionBounds(action_bounds)
        self._experience.setActionBounds(copy.deepcopy(self.getActionBounds()))

    def setRewardBounds(self, reward_bounds):
        super(GAN, self).setRewardBounds(reward_bounds)
        self._experience.setRewardBounds(copy.deepcopy(self.getRewardBounds()))
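A note on the discriminator targets used above: trainCritic() mixes generated next
states into half of each batch and sets the discriminator target for those rows to 0,
while the remaining rows keep the batch's reward targets. The following is a minimal
NumPy sketch of that target construction only; the array shapes and values are made
up for illustration and this is not the project's API.

import numpy as np

# Hypothetical batch (shapes chosen for illustration only).
batch_size, state_dim = 8, 4
result_states = np.random.uniform(-1, 1, size=(batch_size, state_dim))
generated_samples = np.random.uniform(-1, 1, size=(batch_size, state_dim))
rewards = np.ones((batch_size, 1))  # stand-in targets for the true samples

# Replace half of the batch with generated next states, as trainCritic() does,
# and set the discriminator target for those rows to 0.
half = batch_size // 2
mixed_result_states = result_states.copy()
mixed_targets = rewards.copy()
mixed_result_states[:half] = generated_samples[:half]
mixed_targets[:half] = 0.0

# The discriminator output D(state, action, result_state) is then regressed toward
# mixed_targets with a mean squared error (self._loss in the class above).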
Example #2
    old_states = states
    # print states
    actions = np.array(list(map(f, states)))
    actionsNoNoise = np.array(list(map(f, states)))

    # states2 = np.transpose(np.repeat([states], 2, axis=0))
    # print states2
    model = createForwardDynamicsModel(settings, state_bounds, action_bounds,
                                       None, None, None)

    experience = ExperienceMemory(len(state_bounds[0]),
                                  len(action_bounds[0]),
                                  experience_length,
                                  continuous_actions=True,
                                  settings=settings)
    experience.setStateBounds(state_bounds)
    experience.setRewardBounds(reward_bounds)
    experience.setActionBounds(action_bounds)
    experience.setSettings(settings)
    arr = list(range(experience_length))
    random.shuffle(arr)
    num_samples_to_keep = 300
    given_actions = []
    given_states = []
    for i in range(num_samples_to_keep):
        a = actions[arr[i]]
        action_ = np.array([a])
        given_actions.append(action_)
        state_ = np.array([states[arr[i]]])
        given_states.append(state_)
        # print "Action: " + str([actions[i]])
Example #3
def fitModelToData(settingsFileName):
    """
    State is the input state and Action is the desired output (y).
    """
    # from model.ModelUtil import *
    
    file = open(settingsFileName)
    settings = json.load(file)
    print ("Settings: " + str(json.dumps(settings)))
    file.close()
    import os    
    os.environ['THEANO_FLAGS'] = "mode=FAST_RUN,device="+settings['training_processor_type']+",floatX="+settings['float_type']
    
    from ModelEvaluation import SimWorker, evalModelParrallel, collectExperience, simEpoch, evalModel
    from model.ModelUtil import validBounds
    from model.LearningAgent import LearningAgent, LearningWorker
    from util.SimulationUtil import validateSettings, createEnvironment, createRLAgent, createActor
    from util.SimulationUtil import getDataDirectory, createForwardDynamicsModel, createSampler, getAgentName
    
    
    from util.ExperienceMemory import ExperienceMemory
    from RLVisualize import RLVisualize
    from NNVisualize import NNVisualize
    
    from sim.PendulumEnvState import PendulumEnvState
    from sim.PendulumEnv import PendulumEnv
    from sim.BallGame2DEnv import BallGame2DEnv 
    import time  
    
    settings = validateSettings(settings)

    train_forward_dynamics = True
    model_type = settings["model_type"]
    directory = getDataDirectory(settings)
    discrete_actions = np.array(settings['discrete_actions'])
    num_actions = discrete_actions.shape[0]  # number of rows
    rounds = settings["rounds"]
    epochs = settings["epochs"]
    epsilon = settings["epsilon"]
    discount_factor = settings["discount_factor"]
    reward_bounds = np.array(settings["reward_bounds"])
    batch_size = settings["batch_size"]
    train_on_validation_set = settings["train_on_validation_set"]
    state_bounds = np.array(settings['state_bounds'])
    discrete_actions = np.array(settings['discrete_actions'])
    print ("Sim config file name: ", str(settings["sim_config_file"]))
    action_space_continuous = settings['action_space_continuous']
    if action_space_continuous:
        action_bounds = np.array(settings["action_bounds"], dtype=float)
    
    if action_space_continuous:
        experience = ExperienceMemory(len(state_bounds[0]), len(action_bounds[0]), settings['expereince_length'], continuous_actions=True, settings=settings)
    else:
        experience = ExperienceMemory(len(state_bounds[0]), 1, settings['expereince_length'])
    file_name=directory+getAgentName()+"expBufferInit.hdf5"
    experience.loadFromFile(file_name)
    state_bounds = experience._state_bounds
    action_bounds = experience._action_bounds
    reward_bounds = experience._reward_bounds
    
    output_experience_queue = multiprocessing.Queue(settings['queue_size_limit'])
    mgr = multiprocessing.Manager()
    namespace = mgr.Namespace()
    learning_workers = []
    for process in range(1):
        # this is the process that selects which game to play
        agent = LearningAgent(n_in=len(state_bounds[0]), n_out=len(action_bounds[0]), state_bounds=state_bounds, 
                          action_bounds=action_bounds, reward_bound=reward_bounds, settings_=settings)
        
        agent.setSettings(settings)
        
        lw = LearningWorker(output_experience_queue, agent, namespace)
        learning_workers.append(lw)  
    masterAgent = agent
    masterAgent.setExperience(experience)
    
    if action_space_continuous:
        model = createRLAgent(settings['agent_name'], state_bounds, action_bounds, reward_bounds, settings)
    else:
        model = createRLAgent(settings['agent_name'], state_bounds, discrete_actions, reward_bounds, settings)
    if ( not settings['load_saved_model'] ):
        model.setStateBounds(state_bounds)
        model.setActionBounds(action_bounds)
        model.setRewardBounds(reward_bounds)
    else: # continuation learning
        experience.setStateBounds(model.getStateBounds())
        experience.setRewardBounds(model.getRewardBounds())
        experience.setActionBounds(model.getActionBounds())
        
    
    if (settings['train_forward_dynamics']):
        print ("Created forward dynamics network")
        forwardDynamicsModel = createForwardDynamicsModel(settings, state_bounds, action_bounds, None, None)
        masterAgent.setForwardDynamics(forwardDynamicsModel)
        forwardDynamicsModel.setActor(actor)  # NOTE: `actor` is not defined anywhere in this function
        forwardDynamicsModel.init(len(state_bounds[0]), len(action_bounds[0]), state_bounds, action_bounds, actor, None, settings)
        namespace.forwardNN = masterAgent.getForwardDynamics().getNetworkParameters()
        namespace.forwardDynamicsModel = forwardDynamicsModel
    
    ## Now everything related to the exp memory needs to be updated
    bellman_errors=[]
    masterAgent.setPolicy(model)
    namespace.agentPoly = masterAgent.getPolicy().getNetworkParameters()
    namespace.model = model
    
    
    if (settings['visualize_learning']):
        rlv = NNVisualize(title=str(directory), settings=settings)
        rlv.setInteractive()
        rlv.init()
            
    if (settings['debug_critic']):
        criticLosses = []
        criticRegularizationCosts = [] 
        if (settings['visualize_learning']):
            critic_loss_viz = NNVisualize(title=str("Critic Loss") + " with " + str(settings["model_type"]))
            critic_loss_viz.setInteractive()
            critic_loss_viz.init()
            critic_regularization_viz = NNVisualize(title=str("Critic Regularization Cost") + " with " + str(settings["model_type"]))
            critic_regularization_viz.setInteractive()
            critic_regularization_viz.init()
        
    if (settings['debug_actor']):
        actorLosses = []
        actorRegularizationCosts = []            
        if (settings['visualize_learning']):
            actor_loss_viz = NNVisualize(title=str("Actor Loss") + " with " + str(settings["model_type"]))
            actor_loss_viz.setInteractive()
            actor_loss_viz.init()
            actor_regularization_viz = NNVisualize(title=str("Actor Regularization Cost") + " with " + str(settings["model_type"]))
            actor_regularization_viz.setInteractive()
            actor_regularization_viz.init()
                
    trainData = {}
    trainData["mean_reward"]=[]
    trainData["std_reward"]=[]
    trainData["mean_bellman_error"]=[]
    trainData["std_bellman_error"]=[]
    trainData["mean_discount_error"]=[]
    trainData["std_discount_error"]=[]
    trainData["mean_forward_dynamics_loss"]=[]
    trainData["std_forward_dynamics_loss"]=[]
    trainData["mean_eval"]=[]
    trainData["std_eval"]=[]
    trainData["mean_critic_loss"]=[]
    trainData["std_critic_loss"]=[]
    trainData["mean_critic_regularization_cost"]=[]
    trainData["std_critic_regularization_cost"]=[]
    trainData["mean_actor_loss"]=[]
    trainData["std_actor_loss"]=[]
    trainData["mean_actor_regularization_cost"]=[]
    trainData["std_actor_regularization_cost"]=[]
        
    best_dynamicsLosses = 1000000
    dynamicsLosses = []  # accumulates forward-dynamics losses below; was never initialized
    _states, _actions, _result_states, _rewards, _falls, _G_ts = experience.get_batch(batch_size)
    for round_ in range(rounds):
        t0 = time.time()
        __states, __actions, __result_states, __rewards, __falls, __G_ts = experience.get_batch(100)
        for i in range(1):
            masterAgent.train(_states=__states, _actions=__actions, _rewards=__rewards, _result_states=__result_states, _falls=__falls)
        t1 = time.time()
        time_taken = t1 - t0
        if masterAgent.getExperience().samples() > batch_size:
            states, actions, result_states, rewards, falls, G_ts = masterAgent.getExperience().get_batch(batch_size)
            print ("Batch size: " + str(batch_size))
            error = masterAgent.bellman_error(states, actions, rewards, result_states, falls)
            bellman_errors.append(error)
            if (settings['debug_critic']):
                loss__ = masterAgent.getPolicy()._get_critic_loss() # uses previous call batch data
                criticLosses.append(loss__)
                regularizationCost__ = masterAgent.getPolicy()._get_critic_regularization()
                criticRegularizationCosts.append(regularizationCost__)
                
            if (settings['debug_actor']):
                loss__ = masterAgent.getPolicy()._get_actor_loss() # uses previous call batch data
                actorLosses.append(loss__)
                regularizationCost__ = masterAgent.getPolicy()._get_actor_regularization()
                actorRegularizationCosts.append(regularizationCost__)
            
            if not all(np.isfinite(error)):
                print ("States: " + str(states) + " ResultsStates: " + str(result_states) + " Rewards: " + str(rewards) + " Actions: " + str(actions) + " Falls: ", str(falls))
                print ("Bellman Error is Nan: " + str(error) + str(np.isfinite(error)))
                sys.exit()
            
            error = np.mean(np.fabs(error))
            if error > 10000:
                print ("Error to big: ")
                print (states, actions, rewards, result_states)
                
            if (settings['train_forward_dynamics']):
                dynamicsLoss = masterAgent.getForwardDynamics().bellman_error(states, actions, result_states, rewards)
                dynamicsLoss = np.mean(np.fabs(dynamicsLoss))
                dynamicsLosses.append(dynamicsLoss)
            if (settings['train_forward_dynamics']):
                print ("Round: " + str(round_) + " bellman error: " + str(error) + " ForwardPredictionLoss: " + str(dynamicsLoss) + " in " + str(time_taken) + " seconds")
            else:
                print ("Round: " + str(round_) + " bellman error: " + str(error) + " in " + str(time_taken) + " seconds")
           

        print ("Master agent experience size: " + str(masterAgent.getExperience().samples()))
        trainData["mean_bellman_error"].append(np.mean(np.fabs(bellman_errors)))
        trainData["std_bellman_error"].append(np.std(bellman_errors))
        if (settings['visualize_learning']):
            rlv.updateLoss(np.array(trainData["mean_bellman_error"]), np.array(trainData["std_bellman_error"]))
            rlv.redraw()
            rlv.setInteractiveOff()
            rlv.saveVisual(directory+"trainingGraphNN")
            rlv.setInteractive()
        # print "Error: " + str(error)
        if (settings['debug_critic']):
            mean_criticLosses = np.mean(criticLosses)
            std_criticLosses = np.std(criticLosses)
            trainData["mean_critic_loss"].append(mean_criticLosses)
            trainData["std_critic_loss"].append(std_criticLosses)
            criticLosses = []
            if (settings['visualize_learning']):
                critic_loss_viz.updateLoss(np.array(trainData["mean_critic_loss"]), np.array(trainData["std_critic_loss"]))
                critic_loss_viz.redraw()
                critic_loss_viz.setInteractiveOff()
                critic_loss_viz.saveVisual(directory+"criticLossGraph")
                critic_loss_viz.setInteractive()
            
            mean_criticRegularizationCosts = np.mean(criticRegularizationCosts)
            std_criticRegularizationCosts = np.std(criticRegularizationCosts)
            trainData["mean_critic_regularization_cost"].append(mean_criticRegularizationCosts)
            trainData["std_critic_regularization_cost"].append(std_criticRegularizationCosts)
            criticRegularizationCosts = []
            if (settings['visualize_learning']):
                critic_regularization_viz.updateLoss(np.array(trainData["mean_critic_regularization_cost"]), np.array(trainData["std_critic_regularization_cost"]))
                critic_regularization_viz.redraw()
                critic_regularization_viz.setInteractiveOff()
                critic_regularization_viz.saveVisual(directory+"criticRegularizationGraph")
                critic_regularization_viz.setInteractive()
            
        if (settings['debug_actor']):
            
            mean_actorLosses = np.mean(actorLosses)
            std_actorLosses = np.std(actorLosses)
            trainData["mean_actor_loss"].append(mean_actorLosses)
            trainData["std_actor_loss"].append(std_actorLosses)
            actorLosses = []
            if (settings['visualize_learning']):
                actor_loss_viz.updateLoss(np.array(trainData["mean_actor_loss"]), np.array(trainData["std_actor_loss"]))
                actor_loss_viz.redraw()
                actor_loss_viz.setInteractiveOff()
                actor_loss_viz.saveVisual(directory+"actorLossGraph")
                actor_loss_viz.setInteractive()
            
            mean_actorRegularizationCosts = np.mean(actorRegularizationCosts)
            std_actorRegularizationCosts = np.std(actorRegularizationCosts)
            trainData["mean_actor_regularization_cost"].append(mean_actorRegularizationCosts)
            trainData["std_actor_regularization_cost"].append(std_actorRegularizationCosts)
            actorRegularizationCosts = []
            if (settings['visualize_learning']):
                actor_regularization_viz.updateLoss(np.array(trainData["mean_actor_regularization_cost"]), np.array(trainData["std_actor_regularization_cost"]))
                actor_regularization_viz.redraw()
                actor_regularization_viz.setInteractiveOff()
                actor_regularization_viz.saveVisual(directory+"actorRegularizationGraph")
                actor_regularization_viz.setInteractive()
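Each round above logs the mean and standard deviation of the absolute Bellman errors
into trainData and aborts if any error is non-finite. A compact sketch of that
bookkeeping with made-up error values (not the project's agent or data):

import numpy as np

trainData = {"mean_bellman_error": [], "std_bellman_error": []}
bellman_errors = []

for round_ in range(3):
    # Stand-in for masterAgent.bellman_error(...)
    error = np.random.uniform(-0.5, 0.5, size=(32, 1))
    if not np.all(np.isfinite(error)):
        raise RuntimeError("Bellman error is NaN or inf")
    bellman_errors.append(np.mean(np.fabs(error)))
    trainData["mean_bellman_error"].append(np.mean(np.fabs(bellman_errors)))
    trainData["std_bellman_error"].append(np.std(bellman_errors))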
Example #4
def trainModelParallel(inputData):
    settingsFileName = inputData[0]
    settings = inputData[1]
    np.random.seed(int(settings['random_seed']))
    import os
    if ('THEANO_FLAGS' in os.environ):
        ## Append to any existing flags; note the leading comma separator.
        os.environ['THEANO_FLAGS'] = os.environ[
            'THEANO_FLAGS'] + ",mode=FAST_RUN,device=" + settings[
                'training_processor_type'] + ",floatX=" + settings['float_type']
    else:
        os.environ['THEANO_FLAGS'] = "mode=FAST_RUN,device=" + settings[
            'training_processor_type'] + ",floatX=" + settings['float_type']
    import keras.backend
    keras.backend.set_floatx(settings['float_type'])
    print("K.floatx()", keras.backend.floatx())

    from ModelEvaluation import SimWorker, evalModelParrallel, collectExperience, simEpoch, evalModel, simModelParrallel
    from model.ModelUtil import validBounds, fixBounds, anneal_value
    from model.LearningAgent import LearningAgent, LearningWorker
    from util.SimulationUtil import validateSettings
    from util.SimulationUtil import createEnvironment
    from util.SimulationUtil import createRLAgent
    from util.SimulationUtil import createActor, getAgentName
    from util.SimulationUtil import getDataDirectory, createForwardDynamicsModel, createSampler

    from util.ExperienceMemory import ExperienceMemory
    from RLVisualize import RLVisualize
    from NNVisualize import NNVisualize

    #from sim.PendulumEnvState import PendulumEnvState
    #from sim.PendulumEnv import PendulumEnv
    #from sim.BallGame2DEnv import BallGame2DEnv
    settings = validateSettings(settings)

    model_type = settings["model_type"]
    directory = getDataDirectory(settings)

    if not os.path.exists(directory):
        os.makedirs(directory)

    # copy settings file
    out_file_name = directory + os.path.basename(settingsFileName)
    print("Saving settings file with data: ", out_file_name)
    out_file = open(out_file_name, 'w')
    out_file.write(json.dumps(settings, indent=4))
    out_file.close()
    ### Try to save the algorithm and model files for reference
    if "." in settings['model_type']:
        ### convert . to / and copy file over
        file_name = settings['model_type']
        k = file_name.rfind(".")
        file_name = file_name[:k]
        file_name_read = file_name.replace(".", "/")
        file_name_read = file_name_read + ".py"
        print("model file name:", file_name)
        print("os.path.basename(file_name): ", os.path.basename(file_name))
        file = open(file_name_read, 'r')
        out_file = open(directory + file_name + ".py", 'w')
        out_file.write(file.read())
        file.close()
        out_file.close()
    if "." in settings['agent_name']:
        ### convert . to / and copy file over
        file_name = settings['agent_name']
        k = file_name.rfind(".")
        file_name = file_name[:k]
        file_name_read = file_name.replace(".", "/")
        file_name_read = file_name_read + ".py"
        print("model file name:", file_name)
        print("os.path.basename(file_name): ", os.path.basename(file_name))
        file = open(file_name_read, 'r')
        out_file = open(directory + file_name + ".py", 'w')
        out_file.write(file.read())
        file.close()
        out_file.close()

    if (settings['train_forward_dynamics']):
        if "." in settings['forward_dynamics_model_type']:
            ### convert . to / and copy file over
            file_name = settings['forward_dynamics_model_type']
            k = file_name.rfind(".")
            file_name = file_name[:k]
            file_name_read = file_name.replace(".", "/")
            file_name_read = file_name_read + ".py"
            print("model file name:", file_name)
            print("os.path.basename(file_name): ", os.path.basename(file_name))
            file = open(file_name_read, 'r')
            out_file = open(directory + file_name + ".py", 'w')
            out_file.write(file.read())
            file.close()
            out_file.close()

    rounds = settings["rounds"]
    epochs = settings["epochs"]
    epsilon = settings["epsilon"]
    discount_factor = settings["discount_factor"]
    reward_bounds = np.array(settings["reward_bounds"])
    batch_size = settings["batch_size"]
    train_on_validation_set = settings["train_on_validation_set"]
    state_bounds = np.array(settings['state_bounds'])
    discrete_actions = np.array(settings['discrete_actions'])  #9*6
    num_actions = discrete_actions.shape[0]  # number of rows
    print("Sim config file name: " + str(settings["sim_config_file"]))
    action_space_continuous = settings['action_space_continuous']

    if (settings['num_available_threads'] == 1):
        input_anchor_queue = multiprocessing.Queue(
            settings['queue_size_limit'])
        input_anchor_queue_eval = multiprocessing.Queue(
            settings['queue_size_limit'])
        output_experience_queue = multiprocessing.Queue(
            settings['queue_size_limit'])
        eval_episode_data_queue = multiprocessing.Queue(
            settings['queue_size_limit'])
    else:
        input_anchor_queue = multiprocessing.Queue(settings['epochs'])
        input_anchor_queue_eval = multiprocessing.Queue(settings['epochs'])
        output_experience_queue = multiprocessing.Queue(
            settings['queue_size_limit'])
        eval_episode_data_queue = multiprocessing.Queue(
            settings['eval_epochs'])

    if (settings['on_policy']):  ## On-policy training: drop the queue so the off-policy learner does not train
        output_experience_queue = None

    sim_work_queues = []

    action_space_continuous = settings['action_space_continuous']
    if action_space_continuous:
        action_bounds = np.array(settings["action_bounds"], dtype=float)

    ### Using a wrapper for the type of actor now
    actor = createActor(settings['environment_type'], settings, None)
    exp_val = None
    if (not validBounds(action_bounds)):
        # Check that the action bounds are specified correctly
        print("Action bounds invalid: ", action_bounds)
        sys.exit()
    if (not validBounds(state_bounds)):
        # Probably did not collect enough bootstrapping samples to get good state bounds.
        print("State bounds invalid: ", state_bounds)
        state_bounds = fixBounds(np.array(state_bounds))
        bound_fixed = validBounds(state_bounds)
        print("State bounds fixed: ", bound_fixed)
        sys.exit()
    if (not validBounds(reward_bounds)):
        print("Reward bounds invalid: ", reward_bounds)
        sys.exit()

    if settings['action_space_continuous']:
        experience = ExperienceMemory(len(state_bounds[0]),
                                      len(action_bounds[0]),
                                      settings['expereince_length'],
                                      continuous_actions=True,
                                      settings=settings)
    else:
        experience = ExperienceMemory(len(state_bounds[0]), 1,
                                      settings['expereince_length'])

    experience.setSettings(settings)

    if settings['visualize_learning']:
        title = settings['agent_name']
        k = title.rfind(".") + 1
        if (k > len(title)):  ## name does not contain a .
            k = 0
        title = title[k:]
        rlv = RLVisualize(title=title + " agent on " +
                          str(settings['environment_type']),
                          settings=settings)
        rlv.setInteractive()
        rlv.init()
    if (settings['train_forward_dynamics']):
        if settings['visualize_learning']:
            title = settings['forward_dynamics_model_type']
            k = title.rfind(".") + 1
            if (k > len(title)):  ## name does not contain a .
                k = 0
            title = title[k:]
            nlv = NNVisualize(title=str("Dynamics Model") + " with " + title,
                              settings=settings)
            nlv.setInteractive()
            nlv.init()
    if (settings['train_reward_predictor']):
        if settings['visualize_learning']:
            title = settings['forward_dynamics_model_type']
            k = title.rfind(".") + 1
            if (k > len(title)):  ## name does not contain a .
                k = 0

            title = title[k:]
            rewardlv = NNVisualize(title=str("Reward Model") + " with " +
                                   title,
                                   settings=settings)
            rewardlv.setInteractive()
            rewardlv.init()

    if (settings['debug_critic']):  #True
        criticLosses = []
        criticRegularizationCosts = []
        if (settings['visualize_learning']):
            title = settings['agent_name']
            k = title.rfind(".") + 1
            if (k > len(title)):  ## name does not contain a .
                k = 0
            title = title[k:]
            critic_loss_viz = NNVisualize(title=str("Critic Loss") + " with " +
                                          title)
            critic_loss_viz.setInteractive()
            critic_loss_viz.init()
            critic_regularization_viz = NNVisualize(
                title=str("Critic Reg Cost") + " with " + title)
            critic_regularization_viz.setInteractive()
            critic_regularization_viz.init()

    if (settings['debug_actor']):  # True
        actorLosses = []
        actorRegularizationCosts = []
        if (settings['visualize_learning']):  #False
            title = settings['agent_name']
            k = title.rfind(".") + 1
            if (k > len(title)):  ## name does not contain a .
                k = 0
            title = title[k:]
            actor_loss_viz = NNVisualize(title=str("Actor Loss") + " with " +
                                         title)
            actor_loss_viz.setInteractive()
            actor_loss_viz.init()
            actor_regularization_viz = NNVisualize(
                title=str("Actor Reg Cost") + " with " + title)
            actor_regularization_viz.setInteractive()
            actor_regularization_viz.init()

    model = createRLAgent(settings['agent_name'], state_bounds,
                          discrete_actions, reward_bounds,
                          settings)  #return a model class
    forwardDynamicsModel = None
    if (settings['train_forward_dynamics']):  #False
        if (settings['forward_dynamics_model_type'] == "SingleNet"):
            print(
                "Creating forward dynamics network: Using single network model"
            )
            forwardDynamicsModel = createForwardDynamicsModel(settings,
                                                              state_bounds,
                                                              action_bounds,
                                                              None,
                                                              None,
                                                              agentModel=model)
        else:
            print("Creating forward dynamics network")
            forwardDynamicsModel = createForwardDynamicsModel(settings,
                                                              state_bounds,
                                                              action_bounds,
                                                              None,
                                                              None,
                                                              agentModel=None)
        forwardDynamicsModel.setActor(actor)
        forwardDynamicsModel.init(len(state_bounds[0]), len(action_bounds[0]),
                                  state_bounds, action_bounds, actor, None,
                                  settings)

    (agent,
     learning_workers) = createLearningAgent(settings, output_experience_queue,
                                             state_bounds, action_bounds,
                                             reward_bounds)
    masterAgent = agent

    ### These are the workers for training
    (sim_workers, sim_work_queues) = createSimWorkers(
        settings, input_anchor_queue, output_experience_queue,
        eval_episode_data_queue, model, forwardDynamicsModel, exp_val,
        state_bounds, action_bounds, reward_bounds)

    eval_sim_workers = sim_workers
    eval_sim_work_queues = sim_work_queues
    if ('override_sim_env_id' in settings
            and (settings['override_sim_env_id'] != False)):  #True
        (eval_sim_workers, eval_sim_work_queues) = createSimWorkers(
            settings,
            input_anchor_queue_eval,
            output_experience_queue,
            eval_episode_data_queue,
            model,
            forwardDynamicsModel,
            exp_val,
            state_bounds,
            action_bounds,
            reward_bounds,
            default_sim_id=settings['override_sim_env_id'])  # id=1
    else:
        input_anchor_queue_eval = input_anchor_queue

    best_eval = -100000000.0
    best_dynamicsLosses = best_eval * -1.0

    values = []
    discounted_values = []
    bellman_error = []
    reward_over_epoc = []
    dynamicsLosses = []
    dynamicsRewardLosses = []

    for lw in learning_workers:
        print("Learning worker")
        print(lw)

    if (int(settings["num_available_threads"]) > 1):
        for sw in sim_workers:
            print("Sim worker")
            print(sw)
            sw.start()
        if ('override_sim_env_id' in settings
                and (settings['override_sim_env_id'] != False)):
            for sw in eval_sim_workers:
                print("Sim worker")
                print(sw)
                sw.start()

    ## This needs to be done after the simulation worker processes are created
    exp_val = createEnvironment(settings["forwardDynamics_config_file"],
                                settings['environment_type'],
                                settings,
                                render=settings['shouldRender'],
                                index=0)
    exp_val.setActor(actor)
    exp_val.getActor().init()
    exp_val.init()

    ### This is for a single-threaded Synchronous sim only.
    if (int(settings["num_available_threads"]) == 1
        ):  # This is okay if there is one thread only...
        sim_workers[0].setEnvironment(exp_val)
        sim_workers[0].start()
        if ('override_sim_env_id' in settings
                and (settings['override_sim_env_id'] != False)):
            eval_sim_workers[0].setEnvironment(exp_val)
            eval_sim_workers[0].start()

    masterAgent.setPolicy(model)
    if (settings['train_forward_dynamics']):
        masterAgent.setForwardDynamics(forwardDynamicsModel)

    tmp_p = 1.0
    message = {}
    if (settings['load_saved_model']):
        tmp_p = settings['min_epsilon']
    data = ('Update_Policy', tmp_p, model.getStateBounds(),
            model.getActionBounds(), model.getRewardBounds(),
            masterAgent.getPolicy().getNetworkParameters())
    if (settings['train_forward_dynamics']):
        data = ('Update_Policy', tmp_p, model.getStateBounds(),
                model.getActionBounds(), model.getRewardBounds(),
                masterAgent.getPolicy().getNetworkParameters(),
                masterAgent.getForwardDynamics().getNetworkParameters())
    message['type'] = 'Update_Policy'
    message['data'] = data
    for m_q in sim_work_queues:
        print("trainModel: Sending current network parameters: ", m_q)
        m_q.put(message)

    if (int(settings["num_available_threads"]) == 1):
        experience, state_bounds, reward_bounds, action_bounds = collectExperience(
            actor,
            exp_val,
            model,
            settings,
            sim_work_queues=None,
            eval_episode_data_queue=None
        )  # the returned experience buffer holds (state, action, next_state, reward) tuples

    else:
        if (settings['on_policy']):
            experience, state_bounds, reward_bounds, action_bounds = collectExperience(
                actor,
                None,
                model,
                settings,
                sim_work_queues=sim_work_queues,
                eval_episode_data_queue=eval_episode_data_queue)
        else:
            experience, state_bounds, reward_bounds, action_bounds = collectExperience(
                actor,
                None,
                model,
                settings,
                sim_work_queues=input_anchor_queue,
                eval_episode_data_queue=eval_episode_data_queue)
    masterAgent.setExperience(experience)
    if ('keep_seperate_fd_exp_buffer' in settings
            and (settings['keep_seperate_fd_exp_buffer'])):
        masterAgent.setFDExperience(copy.deepcopy(experience))

    if (not validBounds(action_bounds)):
        # Check that the action bounds are specified correctly
        print("Action bounds invalid: ", action_bounds)
        sys.exit()
    if (not validBounds(state_bounds)):
        # Probably did not collect enough bootstrapping samples to get good state bounds.
        print("State bounds invalid: ", state_bounds)
        state_bounds = fixBounds(np.array(state_bounds))
        bound_fixed = validBounds(state_bounds)
        print("State bounds fixed: ", bound_fixed)
    if (not validBounds(reward_bounds)):
        print("Reward bounds invalid: ", reward_bounds)
        sys.exit()

    print("Reward History: ", experience._reward_history)
    print("Action History: ", experience._action_history)
    print("Action Mean: ", np.mean(experience._action_history))
    print("Experience Samples: ", (experience.samples()))

    if (settings["save_experience_memory"]):
        print("Saving initial experience memory")
        file_name = directory + getAgentName() + "_expBufferInit.hdf5"
        experience.saveToFile(file_name)

    if (settings['load_saved_model']
            or (settings['load_saved_model']
                == 'network_and_scales')):  ## Transfer learning
        experience.setStateBounds(copy.deepcopy(model.getStateBounds()))
        experience.setRewardBounds(copy.deepcopy(model.getRewardBounds()))
        experience.setActionBounds(copy.deepcopy(model.getActionBounds()))
        model.setSettings(settings)
    else:  ## Normal
        model.setStateBounds(state_bounds)
        model.setActionBounds(action_bounds)
        model.setRewardBounds(reward_bounds)
        experience.setStateBounds(copy.deepcopy(model.getStateBounds()))
        experience.setRewardBounds(copy.deepcopy(model.getRewardBounds()))
        experience.setActionBounds(copy.deepcopy(model.getActionBounds()))

    masterAgent_message_queue = multiprocessing.Queue(settings['epochs'])

    if (settings['train_forward_dynamics']):
        if (not settings['load_saved_model']):
            forwardDynamicsModel.setStateBounds(state_bounds)
            forwardDynamicsModel.setActionBounds(action_bounds)
            forwardDynamicsModel.setRewardBounds(reward_bounds)
        masterAgent.setForwardDynamics(forwardDynamicsModel)

    ## Now everything related to the exp memory needs to be updated
    bellman_errors = []
    masterAgent.setPolicy(model)
    print("Master agent state bounds: ",
          repr(masterAgent.getPolicy().getStateBounds()))
    for sw in sim_workers:  # Need to update parameter bounds for models
        print("exp: ", sw._exp)
        print("sw modle: ", sw._model.getPolicy())

    ## If not on policy
    if (not settings['on_policy']):
        for lw in learning_workers:
            lw._agent.setPolicy(model)
            lw.setMasterAgentMessageQueue(masterAgent_message_queue)
            lw.updateExperience(experience)
            print("ls policy: ", lw._agent.getPolicy())

            lw.start()

    tmp_p = 1.0
    if (settings['load_saved_model']):
        tmp_p = settings['min_epsilon']
    data = ('Update_Policy', tmp_p, model.getStateBounds(),
            model.getActionBounds(), model.getRewardBounds(),
            masterAgent.getPolicy().getNetworkParameters())
    if (settings['train_forward_dynamics']):
        data = ('Update_Policy', tmp_p, model.getStateBounds(),
                model.getActionBounds(), model.getRewardBounds(),
                masterAgent.getPolicy().getNetworkParameters(),
                masterAgent.getForwardDynamics().getNetworkParameters())
    message['type'] = 'Update_Policy'
    message['data'] = data
    for m_q in sim_work_queues:
        print("trainModel: Sending current network parameters: ", m_q)
        m_q.put(message)

    del model
    ## Give global access to the processes so they can be terminated when ctrl+c is pressed
    global sim_processes
    sim_processes = sim_workers
    global learning_processes
    learning_processes = learning_workers
    global _input_anchor_queue
    _input_anchor_queue = input_anchor_queue
    global _output_experience_queue
    _output_experience_queue = output_experience_queue
    global _eval_episode_data_queue
    _eval_episode_data_queue = eval_episode_data_queue
    global _sim_work_queues
    _sim_work_queues = sim_work_queues

    trainData = {}
    trainData["mean_reward"] = []
    trainData["std_reward"] = []
    trainData["mean_bellman_error"] = []
    trainData["std_bellman_error"] = []
    trainData["mean_discount_error"] = []
    trainData["std_discount_error"] = []
    trainData["mean_forward_dynamics_loss"] = []
    trainData["std_forward_dynamics_loss"] = []
    trainData["mean_forward_dynamics_reward_loss"] = []
    trainData["std_forward_dynamics_reward_loss"] = []
    trainData["mean_eval"] = []
    trainData["std_eval"] = []
    trainData["mean_critic_loss"] = []
    trainData["std_critic_loss"] = []
    trainData["mean_critic_regularization_cost"] = []
    trainData["std_critic_regularization_cost"] = []
    trainData["mean_actor_loss"] = []
    trainData["std_actor_loss"] = []
    trainData["mean_actor_regularization_cost"] = []
    trainData["std_actor_regularization_cost"] = []
    trainData["anneal_p"] = []

    if (False):
        print("State Bounds:", masterAgent.getStateBounds())
        print("Action Bounds:", masterAgent.getActionBounds())

        print("Exp State Bounds: ", experience.getStateBounds())
        print("Exp Action Bounds: ", experience.getActionBounds())

    print("Starting first round")
    if (settings['on_policy']):
        sim_epochs_ = epochs
    for round_ in range(
            0, rounds):  # anneal p, the greedy-exploration parameter
        if ('annealing_schedule' in settings
                and (settings['annealing_schedule'] != False)):
            p = anneal_value(float(round_ / rounds), settings_=settings)
        else:
            p = ((settings['initial_temperature'] / math.log(round_ + 2)))
        p = max(settings['min_epsilon'],
                min(settings['epsilon'], p))  # Keeps it between 1.0 and 0.2
        if (settings['load_saved_model']):
            p = settings['min_epsilon']
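        # Worked illustration of the schedule above (assuming, say, 'initial_temperature' == 1.0,
        # 'epsilon' == 1.0 and 'min_epsilon' == 0.2; none of these values are fixed by this code):
        #   round_ == 0   -> 1.0 / log(2)   ~= 1.44, clamped to epsilon     -> p = 1.0
        #   round_ == 100 -> 1.0 / log(102) ~= 0.22                         -> p = 0.22
        #   round_ == 500 -> 1.0 / log(502) ~= 0.16, clamped to min_epsilon -> p = 0.2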

        for epoch in range(epochs):
            if (settings['on_policy']):

                out = simModelParrallel(
                    sw_message_queues=sim_work_queues,
                    model=masterAgent,
                    settings=settings,
                    eval_episode_data_queue=eval_episode_data_queue,
                    anchors=settings['num_on_policy_rollouts'])
                (
                    tuples, discounted_sum, q_value, evalData
                ) = out  # tuples = states, actions, result_states, rewards, falls, G_ts, advantage, exp_actions
                (__states, __actions, __result_states, __rewards, __falls,
                 __G_ts, advantage__, exp_actions__) = tuples
                for i in range(1):
                    masterAgent.train(_states=__states,
                                      _actions=__actions,
                                      _rewards=__rewards,
                                      _result_states=__result_states,
                                      _falls=__falls,
                                      _advantage=advantage__,
                                      _exp_actions=exp_actions__)

                if (('anneal_on_policy' in settings)
                        and settings['anneal_on_policy']):
                    p_tmp_ = p
                else:
                    p_tmp_ = 1.0
                data = ('Update_Policy', p_tmp_, masterAgent.getStateBounds(),
                        masterAgent.getActionBounds(),
                        masterAgent.getRewardBounds(),
                        masterAgent.getPolicy().getNetworkParameters())
                message = {}
                message['type'] = 'Update_Policy'
                message['data'] = data
                if (settings['train_forward_dynamics']):
                    data = ('Update_Policy', p_tmp_,
                            masterAgent.getStateBounds(),
                            masterAgent.getActionBounds(),
                            masterAgent.getRewardBounds(),
                            masterAgent.getPolicy().getNetworkParameters(),
                            masterAgent.getForwardDynamics().
                            getNetworkParameters())
                    message['data'] = data
                for m_q in sim_work_queues:
                    ## block on full queue
                    m_q.put(message)

                if ('override_sim_env_id' in settings
                        and (settings['override_sim_env_id'] != False)):
                    for m_q in eval_sim_work_queues:
                        ## block on full queue
                        m_q.put(message)

            else:
                episodeData = {}
                episodeData['data'] = epoch
                episodeData['type'] = 'sim'
                input_anchor_queue.put(episodeData)

            if masterAgent.getExperience().samples(
            ) >= batch_size:  # update the policy network
                states, actions, result_states, rewards, falls, G_ts, exp_actions = masterAgent.getExperience(
                ).get_batch(batch_size)
                error = masterAgent.bellman_error(states, actions, rewards,
                                                  result_states, falls)
                bellman_errors.append(error)
                if (settings['debug_critic']):
                    loss__ = masterAgent.getPolicy()._get_critic_loss(
                    )  # uses previous call batch data
                    criticLosses.append(loss__)
                    regularizationCost__ = masterAgent.getPolicy(
                    )._get_critic_regularization()
                    criticRegularizationCosts.append(regularizationCost__)

                if (settings['debug_actor']):  #True
                    loss__ = masterAgent.getPolicy()._get_actor_loss(
                    )  # uses previous call batch data
                    actorLosses.append(loss__)
                    regularizationCost__ = masterAgent.getPolicy(
                    )._get_actor_regularization()
                    actorRegularizationCosts.append(regularizationCost__)

                if not all(np.isfinite(error)):
                    print(
                        "States: " + str(states) + " ResultsStates: " +
                        str(result_states) + " Rewards: " + str(rewards) +
                        " Actions: " + str(actions) + " Falls: ", str(falls))
                    print("Bellman Error is Nan: " + str(error) +
                          str(np.isfinite(error)))
                    sys.exit()

                error = np.mean(np.fabs(error))
                if error > 10000:
                    print("Error to big: ")
                    print(states, actions, rewards, result_states)

                if (settings['train_forward_dynamics']):  #False
                    dynamicsLoss = masterAgent.getForwardDynamics(
                    ).bellman_error(states, actions, result_states, rewards)
                    dynamicsLoss = np.mean(np.fabs(dynamicsLoss))  # fabs: element-wise absolute value
                    dynamicsLosses.append(dynamicsLoss)
                    if (settings['train_reward_predictor']):
                        dynamicsRewardLoss = masterAgent.getForwardDynamics(
                        ).reward_error(states, actions, result_states, rewards)
                        dynamicsRewardLoss = np.mean(
                            np.fabs(dynamicsRewardLoss))
                        dynamicsRewardLosses.append(dynamicsRewardLoss)
                if (settings['train_forward_dynamics']):
                    print("Round: " + str(round_) + " Epoch: " + str(epoch) +
                          " p: " + str(p) + " With mean reward: " +
                          str(np.mean(rewards)) + " bellman error: " +
                          str(error) + " ForwardPredictionLoss: " +
                          str(dynamicsLoss))
                else:
                    print("Round: " + str(round_) + " Epoch: " + str(epoch) +
                          " p: " + str(p) + " With mean reward: " +
                          str(np.mean(rewards)) + " bellman error: " +
                          str(error))

            if (settings["print_levels"][settings["print_level"]] >=
                    settings["print_levels"]['train']):
                print("Master agent experience size: " +
                      str(masterAgent.getExperience().samples()))

            if (not settings['on_policy']):
                ## There could be stale policy parameters in here, use the last set put in the queue
                data = None
                while (not masterAgent_message_queue.empty()):
                    ## Don't block
                    try:
                        data = masterAgent_message_queue.get(False)
                    except Exception as inst:
                        pass
                if (not (data == None)):
                    masterAgent.setExperience(data[0])
                    masterAgent.getPolicy().setNetworkParameters(data[1])
                    masterAgent.setStateBounds(
                        masterAgent.getExperience().getStateBounds())
                    masterAgent.setActionBounds(
                        masterAgent.getExperience().getActionBounds())
                    masterAgent.setRewardBounds(
                        masterAgent.getExperience().getRewardBounds())
                    if (settings['train_forward_dynamics']):
                        masterAgent.getForwardDynamics().setNetworkParameters(
                            data[2])
                        if ('keep_seperate_fd_exp_buffer' in settings
                                and (settings['keep_seperate_fd_exp_buffer'])):
                            masterAgent.setFDExperience(data[3])

            # this->_actor->iterate();
        ## This shows which part of learning is slower: training updates or simulation
        if (settings["print_levels"][settings["print_level"]] >=
                settings["print_levels"]['train']):
            print("sim queue size: ", input_anchor_queue.qsize())  #返回队列的大小
        if (output_experience_queue != None):
            print("exp tuple queue size: ", output_experience_queue.qsize())

        if (not settings['on_policy']):
            data = ('Update_Policy', p, masterAgent.getStateBounds(),
                    masterAgent.getActionBounds(),
                    masterAgent.getRewardBounds(),
                    masterAgent.getPolicy().getNetworkParameters())
            if (settings['train_forward_dynamics']):
                data = (
                    'Update_Policy', p, masterAgent.getStateBounds(),
                    masterAgent.getActionBounds(),
                    masterAgent.getRewardBounds(),
                    masterAgent.getPolicy().getNetworkParameters(),
                    masterAgent.getForwardDynamics().getNetworkParameters())
            message['type'] = 'Update_Policy'
            message['data'] = data
            for m_q in sim_work_queues:
                ## Don't block on full queue
                try:
                    m_q.put(message, False)
                except:
                    print("SimWorker model parameter message queue full: ",
                          m_q.qsize())
            if ('override_sim_env_id' in settings
                    and (settings['override_sim_env_id'] != False)):
                for m_q in eval_sim_work_queues:
                    ## Don't block on full queue
                    try:
                        m_q.put(message, False)
                    except:
                        print("SimWorker model parameter message queue full: ",
                              m_q.qsize())

        if (round_ % settings['plotting_update_freq_num_rounds']) == 0:
            # Running less often helps speed learning up.
            # Sync up sim actors
            if (settings['on_policy']):
                mean_reward, std_reward, mean_bellman_error, std_bellman_error, mean_discount_error, std_discount_error, mean_eval, std_eval = evalModelParrallel(
                    input_anchor_queue=eval_sim_work_queues,
                    model=masterAgent,
                    settings=settings,
                    eval_episode_data_queue=eval_episode_data_queue,
                    anchors=settings['eval_epochs'])
            else:
                mean_reward, std_reward, mean_bellman_error, std_bellman_error, mean_discount_error, std_discount_error, mean_eval, std_eval = evalModelParrallel(
                    input_anchor_queue=input_anchor_queue_eval,
                    model=masterAgent,
                    settings=settings,
                    eval_episode_data_queue=eval_episode_data_queue,
                    anchors=settings['eval_epochs'])

            print(mean_reward, std_reward, mean_bellman_error,
                  std_bellman_error, mean_discount_error, std_discount_error)
            if mean_bellman_error > 10000:
                print("Error to big: ")
            else:
                if (settings['train_forward_dynamics']):  #false
                    mean_dynamicsLosses = np.mean(dynamicsLosses)
                    std_dynamicsLosses = np.std(dynamicsLosses)
                    dynamicsLosses = []
                if (settings['train_reward_predictor']):  #false
                    mean_dynamicsRewardLosses = np.mean(dynamicsRewardLosses)
                    std_dynamicsRewardLosses = np.std(dynamicsRewardLosses)
                    dynamicsRewardLosses = []

                trainData["mean_reward"].append(mean_reward)
                trainData["std_reward"].append(std_reward)
                trainData["anneal_p"].append(p)
                trainData["mean_bellman_error"].append(
                    np.mean(np.fabs(bellman_errors)))
                trainData["std_bellman_error"].append(np.std(bellman_errors))
                bellman_errors = []
                trainData["mean_discount_error"].append(mean_discount_error)
                trainData["std_discount_error"].append(std_discount_error)
                trainData["mean_eval"].append(mean_eval)
                trainData["std_eval"].append(std_eval)
                if (settings['train_forward_dynamics']):
                    trainData["mean_forward_dynamics_loss"].append(
                        mean_dynamicsLosses)
                    trainData["std_forward_dynamics_loss"].append(
                        std_dynamicsLosses)
                if (settings['train_reward_predictor']):
                    trainData["mean_forward_dynamics_reward_loss"].append(
                        mean_dynamicsRewardLosses)
                    trainData["std_forward_dynamics_reward_loss"].append(
                        std_dynamicsRewardLosses)

        if (round_ % settings['saving_update_freq_num_rounds']) == 0:
            if (settings['train_forward_dynamics']):
                file_name_dynamics = directory + "forward_dynamics_" + ".pkl"
                f = open(file_name_dynamics, 'wb')
                dill.dump(masterAgent.getForwardDynamics(), f)
                f.close()
                if mean_dynamicsLosses < best_dynamicsLosses:
                    best_dynamicsLosses = mean_dynamicsLosses
                    print("Saving BEST current forward dynamics agent: " +
                          str(best_dynamicsLosses))
                    file_name_dynamics = directory + "forward_dynamics_" + "_Best.pkl"
                    f = open(file_name_dynamics, 'wb')
                    dill.dump(masterAgent.getForwardDynamics(), f)  #save model
                    f.close()

            if (mean_eval > best_eval):
                best_eval = mean_eval
                print("Saving BEST current agent: " + str(best_eval))
                file_name = directory + getAgentName() + "_Best.pkl"
                f = open(file_name, 'wb')
                dill.dump(masterAgent.getPolicy(), f)
                f.close()

            if settings['save_trainData']:
                fp = open(
                    directory + "trainingData_" + str(settings['agent_name']) +
                    ".json", 'w')
                ## because json does not serialize np.float32
                for key in trainData:
                    trainData[key] = [float(i) for i in trainData[key]]
                json.dump(trainData, fp)
                fp.close()

            print("Saving current masterAgent")

            file_name = directory + getAgentName() + ".pkl"
            f = open(file_name, 'wb')
            dill.dump(masterAgent.getPolicy(), f)
            f.close()

        gc.collect()

    print("Terminating Workers")
    if (settings['on_policy']):
        for m_q in sim_work_queues:
            ## block on full queue
            m_q.put(None)
        if ('override_sim_env_id' in settings
                and (settings['override_sim_env_id'] != False)):
            for m_q in eval_sim_work_queues:
                ## block on full queue
                m_q.put(None)
        for sw in sim_workers:  # Should update these more often
            sw.join()
        if ('override_sim_env_id' in settings
                and (settings['override_sim_env_id'] != False)):
            for sw in eval_sim_workers:  # Should update these more often
                sw.join()

    for i in range(len(sim_work_queues)):
        print("sim_work_queues size: ", sim_work_queues[i].qsize())
        while (not sim_work_queues[i].empty()):  ### Empty the queue
            ## Don't block
            try:
                data_ = sim_work_queues[i].get(False)
            except Exception as inst:
                pass
        print("sim_work_queues size: ", sim_work_queues[i].qsize())

    for i in range(len(eval_sim_work_queues)):
        print("eval_sim_work_queues size: ", eval_sim_work_queues[i].qsize())
        while (not eval_sim_work_queues[i].empty()):  ### Empty the queue
            ## Don't block
            try:
                data_ = eval_sim_work_queues[i].get(False)
            except Exception as inst:
                pass
        print("eval_sim_work_queues size: ", eval_sim_work_queues[i].qsize())

    print("Finish sim")
    exp_val.finish()

    print("Save last versions of files.")
    file_name = directory + getAgentName() + ".pkl"
    f = open(file_name, 'wb')
    dill.dump(masterAgent.getPolicy(), f)
    f.close()

    f = open(
        directory + "trainingData_" + str(settings['agent_name']) + ".json",
        "w")
    for key in trainData:
        trainData[key] = [float(i) for i in trainData[key]]
    json.dump(trainData, f, sort_keys=True, indent=4)
    f.close()

    if (settings['train_forward_dynamics']):
        file_name_dynamics = directory + "forward_dynamics_" + ".pkl"
        f = open(file_name_dynamics, 'wb')
        dill.dump(masterAgent.getForwardDynamics(), f)
        f.close()

    print("Delete any plots being used")

    gc.collect()  # collect garbage to free memory right away
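
The 'Update_Policy' messages assembled above are dicts of the form {'type': 'Update_Policy', 'data': (tag, p, state_bounds, action_bounds, reward_bounds, policy_params[, fd_params])} pushed onto each sim worker's queue, with None used as the shutdown signal. A minimal, self-contained sketch of the consuming side (worker_loop and PolicyStub are hypothetical names, not part of this project):

import multiprocessing


class PolicyStub(object):
    # Hypothetical stand-in for the worker's policy object.
    def setNetworkParameters(self, params):
        self._params = params


def worker_loop(message_queue, policy):
    while True:
        message = message_queue.get()      # block until a message arrives
        if message is None:                # None is used above as the shutdown signal
            break
        if message['type'] == 'Update_Policy':
            data = message['data']
            # data = (tag, exploration_p, state_bounds, action_bounds,
            #         reward_bounds, policy_params[, forward_dynamics_params])
            policy.setNetworkParameters(data[5])


if __name__ == '__main__':
    q = multiprocessing.Queue()
    q.put({'type': 'Update_Policy',
           'data': ('Update_Policy', 1.0, None, None, None, {'w': 0.0})})
    q.put(None)                            # ask the worker loop to exit
    worker_loop(q, PolicyStub())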
Example #5
0
def fitModelToData(settingsFileName):
    """
    State is the input state and Action is the desired output (y).
    """
    # from model.ModelUtil import *

    file = open(settingsFileName)
    settings = json.load(file)
    print("Settings: " + str(json.dumps(settings)))
    file.close()
    import os
    os.environ['THEANO_FLAGS'] = "mode=FAST_RUN,device=" + settings[
        'training_processor_type'] + ",floatX=" + settings['float_type']

    ## Theano needs to be imported after the flags are set.
    # from ModelEvaluation import *
    # from model.ModelUtil import *
    # print ( "theano.config.mode: ", theano.config.mode)
    from ModelEvaluation import SimWorker, evalModelParrallel, collectExperience, simEpoch, evalModel
    from model.ModelUtil import validBounds
    from model.LearningAgent import LearningAgent, LearningWorker
    from util.SimulationUtil import validateSettings, createEnvironment, createRLAgent, createActor
    from util.SimulationUtil import getDataDirectory, createForwardDynamicsModel, createSampler, getAgentName

    from util.ExperienceMemory import ExperienceMemory
    from RLVisualize import RLVisualize
    from NNVisualize import NNVisualize

    from sim.PendulumEnvState import PendulumEnvState
    from sim.PendulumEnv import PendulumEnv
    from sim.BallGame2DEnv import BallGame2DEnv
    import time

    settings = validateSettings(settings)

    # anchor_data_file = open(settings["anchor_file"])
    # _anchors = getAnchors(anchor_data_file)
    # print ("Length of anchors epochs: ", str(len(_anchors)))
    # anchor_data_file.close()
    train_forward_dynamics = True
    model_type = settings["model_type"]
    directory = getDataDirectory(settings)
    discrete_actions = np.array(settings['discrete_actions'])
    num_actions = discrete_actions.shape[0]  # number of rows
    rounds = settings["rounds"]
    epochs = settings["epochs"]
    # num_states=settings["num_states"]
    epsilon = settings["epsilon"]
    discount_factor = settings["discount_factor"]
    # max_reward=settings["max_reward"]
    reward_bounds = np.array(settings["reward_bounds"])
    batch_size = settings["batch_size"]
    train_on_validation_set = settings["train_on_validation_set"]
    state_bounds = np.array(settings['state_bounds'])
    discrete_actions = np.array(settings['discrete_actions'])
    print("Sim config file name: ", str(settings["sim_config_file"]))
    # c = characterSim.Configuration(str(settings["sim_config_file"]))
    # c = characterSim.Configuration("../data/epsilon0Config.ini")
    action_space_continuous = settings['action_space_continuous']
    # states2 = np.transpose(np.repeat([states], 2, axis=0))
    # print states2
    if action_space_continuous:
        action_bounds = np.array(settings["action_bounds"], dtype=float)

    if action_space_continuous:
        experience = ExperienceMemory(len(state_bounds[0]),
                                      len(action_bounds[0]),
                                      settings['expereince_length'],
                                      continuous_actions=True,
                                      settings=settings)
    else:
        experience = ExperienceMemory(len(state_bounds[0]), 1,
                                      settings['expereince_length'])
    file_name = directory + getAgentName() + "expBufferInit.hdf5"
    # experience.saveToFile(file_name)
    experience.loadFromFile(file_name)
    state_bounds = experience._state_bounds
    action_bounds = experience._action_bounds
    reward_bounds = experience._reward_bounds

    output_experience_queue = multiprocessing.Queue(
        settings['queue_size_limit'])
    mgr = multiprocessing.Manager()
    namespace = mgr.Namespace()
    learning_workers = []
    # for process in range(settings['num_available_threads']):
    for process in range(1):
        # this is the process that selects which game to play
        agent = LearningAgent(n_in=len(state_bounds[0]),
                              n_out=len(action_bounds[0]),
                              state_bounds=state_bounds,
                              action_bounds=action_bounds,
                              reward_bound=reward_bounds,
                              settings_=settings)

        agent.setSettings(settings)
        """
        if action_space_continuous:
            model = createRLAgent(settings['agent_name'], state_bounds, action_bounds, reward_bounds, settings)
        else:
            model = createRLAgent(settings['agent_name'], state_bounds, discrete_actions, reward_bounds, settings)
        model.setStateBounds(state_bounds)
        model.setActionBounds(action_bounds)
        model.setRewardBounds(reward_bounds)
        """
        # agent.setPolicy(model)
        # actor.setPolicy(model)
        # agent.setExperience(experience)
        # namespace.agentPoly = agent.getPolicy().getNetworkParameters()
        # namespace.experience = experience

        lw = LearningWorker(output_experience_queue, agent, namespace)
        # lw.start()
        learning_workers.append(lw)
    masterAgent = agent
    masterAgent.setExperience(experience)

    if action_space_continuous:
        model = createRLAgent(settings['agent_name'], state_bounds,
                              action_bounds, reward_bounds, settings)
    else:
        model = createRLAgent(settings['agent_name'], state_bounds,
                              discrete_actions, reward_bounds, settings)
    if (not settings['load_saved_model']):
        model.setStateBounds(state_bounds)
        model.setActionBounds(action_bounds)
        model.setRewardBounds(reward_bounds)
    else:  # continuation learning
        experience.setStateBounds(model.getStateBounds())
        experience.setRewardBounds(model.getRewardBounds())
        experience.setActionBounds(model.getActionBounds())

    if (settings['train_forward_dynamics']):
        print("Created forward dynamics network")
        # forwardDynamicsModel = ForwardDynamicsNetwork(state_length=len(state_bounds[0]),action_length=len(action_bounds[0]), state_bounds=state_bounds, action_bounds=action_bounds, settings_=settings)
        forwardDynamicsModel = createForwardDynamicsModel(
            settings, state_bounds, action_bounds, None, None)
        masterAgent.setForwardDynamics(forwardDynamicsModel)
        ## 'actor' is not created earlier in this function; build one as trainModelParallel does.
        actor = createActor(settings['environment_type'], settings, None)
        forwardDynamicsModel.setActor(actor)
        # forwardDynamicsModel.setEnvironment(exp)
        forwardDynamicsModel.init(len(state_bounds[0]), len(action_bounds[0]),
                                  state_bounds, action_bounds, actor, None,
                                  settings)
        namespace.forwardNN = masterAgent.getForwardDynamics(
        ).getNetworkParameters()
        # actor.setForwardDynamicsModel(forwardDynamicsModel)
        namespace.forwardDynamicsModel = forwardDynamicsModel

    ## Now everything related to the exp memory needs to be updated
    bellman_errors = []
    masterAgent.setPolicy(model)
    # masterAgent.setForwardDynamics(forwardDynamicsModel)
    namespace.agentPoly = masterAgent.getPolicy().getNetworkParameters()
    namespace.model = model
    # experience = ExperienceMemory(len(state_bounds[0]), len(action_bounds[0]), experience_length, continuous_actions=True)
    """
    for i in range(experience_length):
        action_ = np.array([actions[i]])
        state_ = np.array([states[i]])
        # print "Action: " + str([actions[i]])
        experience.insert(norm_state(state_, state_bounds), norm_action(action_, action_bounds),
                           norm_state(state_, state_bounds), norm_reward(np.array([0]), reward_bounds))
    """

    if (settings['visualize_learning']):
        rlv = NNVisualize(title=str(directory), settings=settings)
        rlv.setInteractive()
        rlv.init()

    if (settings['debug_critic']):
        criticLosses = []
        criticRegularizationCosts = []
        if (settings['visualize_learning']):
            critic_loss_viz = NNVisualize(title=str("Critic Loss") + " with " +
                                          str(settings["model_type"]))
            critic_loss_viz.setInteractive()
            critic_loss_viz.init()
            critic_regularization_viz = NNVisualize(
                title=str("Critic Regularization Cost") + " with " +
                str(settings["model_type"]))
            critic_regularization_viz.setInteractive()
            critic_regularization_viz.init()

    if (settings['debug_actor']):
        actorLosses = []
        actorRegularizationCosts = []
        if (settings['visualize_learning']):
            actor_loss_viz = NNVisualize(title=str("Actor Loss") + " with " +
                                         str(settings["model_type"]))
            actor_loss_viz.setInteractive()
            actor_loss_viz.init()
            actor_regularization_viz = NNVisualize(
                title=str("Actor Regularization Cost") + " with " +
                str(settings["model_type"]))
            actor_regularization_viz.setInteractive()
            actor_regularization_viz.init()

    trainData = {}
    trainData["mean_reward"] = []
    trainData["std_reward"] = []
    trainData["mean_bellman_error"] = []
    trainData["std_bellman_error"] = []
    trainData["mean_discount_error"] = []
    trainData["std_discount_error"] = []
    trainData["mean_forward_dynamics_loss"] = []
    trainData["std_forward_dynamics_loss"] = []
    trainData["mean_eval"] = []
    trainData["std_eval"] = []
    trainData["mean_critic_loss"] = []
    trainData["std_critic_loss"] = []
    trainData["mean_critic_regularization_cost"] = []
    trainData["std_critic_regularization_cost"] = []
    trainData["mean_actor_loss"] = []
    trainData["std_actor_loss"] = []
    trainData["mean_actor_regularization_cost"] = []
    trainData["std_actor_regularization_cost"] = []

    dynamicsLosses = []  # used below when 'train_forward_dynamics' is enabled
    best_dynamicsLosses = 1000000
    _states, _actions, _result_states, _rewards, _falls, _G_ts = experience.get_batch(
        batch_size)
    """
    _states = theano.shared(np.array(_states, dtype=theano.config.floatX))
    _actions = theano.shared(np.array(_actions, dtype=theano.config.floatX))
    _result_states = theano.shared(np.array(_result_states, dtype=theano.config.floatX))
    _rewards = theano.shared(np.array(_rewards, dtype=theano.config.floatX))
    """
    for round_ in range(rounds):
        t0 = time.time()
        # out = simEpoch(actor, exp_val, masterAgent, discount_factor, anchors=epoch, action_space_continuous=action_space_continuous, settings=settings,
        #                print_data=False, p=1.0, validation=False, epoch=epoch, evaluation=False, _output_queue=None )
        # (tuples, discounted_sum, q_value, evalData) = out
        # (__states, __actions, __result_states, __rewards, __falls, __G_ts) = tuples
        __states, __actions, __result_states, __rewards, __falls, __G_ts = experience.get_batch(
            100)
        # print("**** training states: ", np.array(__states).shape)
        # print("**** training __result_states: ", np.array(__result_states).shape)
        # print ("Actions before: ", __actions)
        for i in range(1):
            masterAgent.train(_states=__states,
                              _actions=__actions,
                              _rewards=__rewards,
                              _result_states=__result_states,
                              _falls=__falls)
        t1 = time.time()
        time_taken = t1 - t0
        if masterAgent.getExperience().samples() > batch_size:
            states, actions, result_states, rewards, falls, G_ts = masterAgent.getExperience(
            ).get_batch(batch_size)
            print("Batch size: " + str(batch_size))
            error = masterAgent.bellman_error(states, actions, rewards,
                                              result_states, falls)
            bellman_errors.append(error)
            if (settings['debug_critic']):
                loss__ = masterAgent.getPolicy()._get_critic_loss(
                )  # uses previous call batch data
                criticLosses.append(loss__)
                regularizationCost__ = masterAgent.getPolicy(
                )._get_critic_regularization()
                criticRegularizationCosts.append(regularizationCost__)

            if (settings['debug_actor']):
                """
                print( "Advantage: ", masterAgent.getPolicy()._get_advantage())
                print("Policy prob: ", masterAgent.getPolicy()._q_action())
                print("Policy log prob: ", masterAgent.getPolicy()._get_log_prob())
                print( "Actor loss: ", masterAgent.getPolicy()._get_action_diff())
                """
                loss__ = masterAgent.getPolicy()._get_actor_loss(
                )  # uses previous call batch data
                actorLosses.append(loss__)
                regularizationCost__ = masterAgent.getPolicy(
                )._get_actor_regularization()
                actorRegularizationCosts.append(regularizationCost__)

            if not all(np.isfinite(error)):
                print(
                    "States: " + str(states) + " ResultsStates: " +
                    str(result_states) + " Rewards: " + str(rewards) +
                    " Actions: " + str(actions) + " Falls: ", str(falls))
                print("Bellman Error is Nan: " + str(error) +
                      str(np.isfinite(error)))
                sys.exit()

            error = np.mean(np.fabs(error))
            if error > 10000:
                print("Error to big: ")
                print(states, actions, rewards, result_states)

            if (settings['train_forward_dynamics']):
                dynamicsLoss = masterAgent.getForwardDynamics().bellman_error(
                    states, actions, result_states, rewards)
                dynamicsLoss = np.mean(np.fabs(dynamicsLoss))
                dynamicsLosses.append(dynamicsLoss)
            if (settings['train_forward_dynamics']):
                print("Round: " + str(round_) + " bellman error: " +
                      str(error) + " ForwardPredictionLoss: " +
                      str(dynamicsLoss) + " in " + str(time_taken) +
                      " seconds")
            else:
                print("Round: " + str(round_) + " bellman error: " +
                      str(error) + " in " + str(time_taken) + " seconds")
            # discounted_values.append(discounted_sum)

        print("Master agent experience size: " +
              str(masterAgent.getExperience().samples()))
        # print ("**** Master agent experience size: " + str(learning_workers[0]._agent._expBuff.samples()))
        # masterAgent.getPolicy().setNetworkParameters(namespace.agentPoly)
        # masterAgent.setExperience(learningNamespace.experience)
        # if (settings['train_forward_dynamics']):
        #     masterAgent.getForwardDynamics().setNetworkParameters(namespace.forwardNN)
        """
        for sw in sim_workers: # Should update these more often?
            sw._model.getPolicy().setNetworkParameters(namespace.agentPoly)
            if (settings['train_forward_dynamics']):
                sw._model.getForwardDynamics().setNetworkParameters(namespace.forwardNN)
                """
        # experience = learningNamespace.experience
        # actor.setExperience(experience)
        """
        pr.disable()
        f = open('x.prof', 'a')
        pstats.Stats(pr, stream=f).sort_stats('time').print_stats()
        f.close()
        """
        trainData["mean_bellman_error"].append(np.mean(
            np.fabs(bellman_errors)))
        trainData["std_bellman_error"].append(np.std(bellman_errors))
        if (settings['visualize_learning']):
            rlv.updateLoss(np.array(trainData["mean_bellman_error"]),
                           np.array(trainData["std_bellman_error"]))
            rlv.redraw()
            rlv.setInteractiveOff()
            rlv.saveVisual(directory + "trainingGraphNN")
            rlv.setInteractive()
        # print "Error: " + str(error)
        if (settings['debug_critic']):
            mean_criticLosses = np.mean(criticLosses)
            std_criticLosses = np.std(criticLosses)
            trainData["mean_critic_loss"].append(mean_criticLosses)
            trainData["std_critic_loss"].append(std_criticLosses)
            criticLosses = []
            if (settings['visualize_learning']):
                critic_loss_viz.updateLoss(
                    np.array(trainData["mean_critic_loss"]),
                    np.array(trainData["std_critic_loss"]))
                critic_loss_viz.redraw()
                critic_loss_viz.setInteractiveOff()
                critic_loss_viz.saveVisual(directory + "criticLossGraph")
                critic_loss_viz.setInteractive()

            mean_criticRegularizationCosts = np.mean(criticRegularizationCosts)
            std_criticRegularizationCosts = np.std(criticRegularizationCosts)
            trainData["mean_critic_regularization_cost"].append(
                mean_criticRegularizationCosts)
            trainData["std_critic_regularization_cost"].append(
                std_criticRegularizationCosts)
            criticRegularizationCosts = []
            if (settings['visualize_learning']):
                critic_regularization_viz.updateLoss(
                    np.array(trainData["mean_critic_regularization_cost"]),
                    np.array(trainData["std_critic_regularization_cost"]))
                critic_regularization_viz.redraw()
                critic_regularization_viz.setInteractiveOff()
                critic_regularization_viz.saveVisual(
                    directory + "criticRegularizationGraph")
                critic_regularization_viz.setInteractive()

        if (settings['debug_actor']):

            mean_actorLosses = np.mean(actorLosses)
            std_actorLosses = np.std(actorLosses)
            trainData["mean_actor_loss"].append(mean_actorLosses)
            trainData["std_actor_loss"].append(std_actorLosses)
            actorLosses = []
            if (settings['visualize_learning']):
                actor_loss_viz.updateLoss(
                    np.array(trainData["mean_actor_loss"]),
                    np.array(trainData["std_actor_loss"]))
                actor_loss_viz.redraw()
                actor_loss_viz.setInteractiveOff()
                actor_loss_viz.saveVisual(directory + "actorLossGraph")
                actor_loss_viz.setInteractive()

            mean_actorRegularizationCosts = np.mean(actorRegularizationCosts)
            std_actorRegularizationCosts = np.std(actorRegularizationCosts)
            trainData["mean_actor_regularization_cost"].append(
                mean_actorRegularizationCosts)
            trainData["std_actor_regularization_cost"].append(
                std_actorRegularizationCosts)
            actorRegularizationCosts = []
            if (settings['visualize_learning']):
                actor_regularization_viz.updateLoss(
                    np.array(trainData["mean_actor_regularization_cost"]),
                    np.array(trainData["std_actor_regularization_cost"]))
                actor_regularization_viz.redraw()
                actor_regularization_viz.setInteractiveOff()
                actor_regularization_viz.saveVisual(directory +
                                                    "actorRegularizationGraph")
                actor_regularization_viz.setInteractive()
Example #6
0
class QProp(AlgorithmInterface):
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):
        """
            In order to get this to work we need to be careful not to update the actor parameters
            when updating the critic. This can be an issue when the Concatenating networks together.
            The first first network becomes a part of the second. However you can still access the first
            network by itself but an updates on the second network will effect the first network.
            Care needs to be taken to make sure only the parameters of the second network are updated.
        """

        super(QProp, self).__init__(model, n_in, n_out, state_bounds,
                                    action_bounds, reward_bound, settings_)

        # if ('train_extra_value_function' in self.getSettings() and (self.getSettings()['train_extra_value_function'] == True)):
        self._experience = ExperienceMemory(
            n_in,
            n_out,
            self.getSettings()['expereince_length'],
            continuous_actions=True,
            settings=self.getSettings())

        self._experience.setStateBounds(copy.deepcopy(self.getStateBounds()))
        self._experience.setRewardBounds(copy.deepcopy(self.getRewardBounds()))
        self._experience.setActionBounds(copy.deepcopy(self.getActionBounds()))

        self._use_basic_polcy_grad = False

        self._Fallen = T.bcol("Fallen")
        ## float32 * int32 is promoted to float64, so use int16 or int8 here to stay in float32
        self._Fallen.tag.test_value = np.zeros((self._batch_size, 1),
                                               dtype=np.dtype('int8'))

        self._fallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                     dtype='int8'),
                                            broadcastable=(False, True))

        self._Action = T.matrix("Action2")
        self._Action.tag.test_value = np.random.rand(self._batch_size,
                                                     self._action_length)

        self._Tmp_Target = T.col("Tmp_Target")
        self._Tmp_Target.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._tmp_target_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                                broadcastable=(False, True))

        self._Advantage = T.col("Advantage")
        self._Advantage.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))
        self._advantage_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                               broadcastable=(False, True))
        self._QProp_N = T.col("QProp_N")
        self._QProp_N.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))
        self._QProp_N_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                             broadcastable=(False, True))

        self._modelTarget = copy.deepcopy(model)
        self._modelTarget2 = copy.deepcopy(model)

        self._learning_rate = self.getSettings()['learning_rate']
        self._discount_factor = self.getSettings()['discount_factor']
        self._rho = self.getSettings()['rho']
        self._rms_epsilon = self.getSettings()['rms_epsilon']

        self._weight_update_steps = self.getSettings(
        )['steps_until_target_network_update']
        self._updates = 0
        self._decay_weight = self.getSettings()['regularization_weight']
        self._critic_regularization_weight = self.getSettings(
        )["critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]

        self._q_valsActA = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsActTarget = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._q_valsActTarget_State = lasagne.layers.get_output(
            self._modelTarget2.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)

        self._q_valsActASTD = (T.ones_like(
            self._q_valsActA)) * self.getSettings()['exploration_rate']
        self._q_valsActTargetSTD = (T.ones_like(self._q_valsActTarget_State)
                                    ) * self.getSettings()['exploration_rate']

        inputs_1 = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions()
        }
        self._q_valsA = lasagne.layers.get_output(
            self._model.getCriticNetwork(), inputs_1)
        inputs_2 = {
            self._modelTarget.getStateSymbolicVariable():
            self._model.getResultStates(),
            self._modelTarget.getActionSymbolicVariable():
            self._model.getActions()
        }
        self._q_valsB_ = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(), inputs_2, deterministic=True)

        self._q_func = self._q_valsA
        self._q_funcB = self._q_valsB_
        self._q_funcAct = self._q_valsActA

        self._diff = self._Tmp_Target - self._q_func
        loss = T.pow(self._diff, 2)
        self._loss = T.mean(loss)

        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        print("******Number of Layers is: " + str(
            len(
                lasagne.layers.helper.get_all_params(
                    self._model.getCriticNetwork()))))
        print("******Number of Action Layers is: " + str(
            len(
                lasagne.layers.helper.get_all_params(
                    self._model.getActorNetwork()))))
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getActorNetwork())
        self._givens_ = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions(),
            self._Tmp_Target: self._tmp_target_shared
        }
        self._actGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
        }

        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))

        ## MSE update
        self._value_grad = T.grad(self._loss + self._critic_regularization,
                                  self._params)
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_ = lasagne.updates.adam(self._value_grad,
                                              self._params,
                                              self._critic_learning_rate,
                                              beta1=0.9,
                                              beta2=0.9,
                                              epsilon=self._rms_epsilon)

        self._givens_grad = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions(),
        }

        ## Some cool stuff to backprop action gradients
        self._action_grad = T.matrix("Action_Grad")
        self._action_grad.tag.test_value = np.zeros(
            (self._batch_size, self._action_length),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._action_grad_shared = theano.shared(
            np.zeros((self._batch_size, self._action_length),
                     dtype=self.getSettings()['float_type']))

        ### Maximize wrt q function
        self._action_mean_grads = T.grad(
            cost=None,
            wrt=self._actionParams,
            known_grads={self._q_valsActA: self._action_grad_shared}),
        print("Action grads: ", self._action_mean_grads[0])
        ## When passing in gradients it needs to be a proper list of gradient expressions
        self._action_mean_grads = list(self._action_mean_grads[0])
        self._actionGRADUpdates = lasagne.updates.adam(
            self._action_mean_grads,
            self._actionParams,
            self._learning_rate,
            beta1=0.9,
            beta2=0.9,
            epsilon=self._rms_epsilon)
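        ## The adam step above applies externally supplied dQ/da gradients (passed in via
        ## known_grads) to the actor parameters, i.e. a deterministic-policy-gradient style update.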

        self._actGradGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
        }
        self._actor_regularization = (
            self._regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getActorNetwork(), lasagne.regularization.l2))

        ### update Actor wrt to Q function
        """
        inputs_1_ = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._q_valsActA: self._model.getActions()
        }
        q = self._model.getCriticNetwork()(self._model.getStateSymbolicVariable(), self._q_valsActA)
        self._q_valsA_ = lasagne.layers.get_output(self._model.getCriticNetwork(), inputs_1_)
        # self._q_valsA_ = lasagne.layers.get_output(self._model.getCriticNetwork(), self._q_valsActA)
        self._q_val2 = theano.function([self._model.getStateSymbolicVariable()], self._q_valsA_)
        self._actionUpdates = lasagne.updates.adam(-T.mean(self._q_valsA_), self._actionParams, 
                    self._learning_rate,  beta1=0.9, beta2=0.9, epsilon=self._rms_epsilon)
        """
        ## Compute on-policy policy gradient
        self._prob = likelihood(self._model.getActionSymbolicVariable(),
                                self._q_valsActA, self._q_valsActASTD,
                                self._action_length)
        ### How should this work if the target network is not just a slightly outdated copy of the current one?
        self._prob_target = likelihood(self._model.getActionSymbolicVariable(),
                                       self._q_valsActTarget_State,
                                       self._q_valsActTargetSTD,
                                       self._action_length)
        ## This does the sum already
        self._r = (self._prob / self._prob_target)
        self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            (self._r), self._Advantage)
        ppo_epsilon = self.getSettings()['kl_divergence_threshold']
        self._actLoss_2 = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            theano.tensor.clip(self._r, 1.0 - ppo_epsilon, 1.0 + ppo_epsilon),
            self._Advantage)
        self._actLoss_ = theano.tensor.minimum((self._actLoss_),
                                               (self._actLoss_2))
        self._actLoss = T.mean(self._actLoss_) - self._actor_regularization
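        ## For reference, the clipped surrogate above follows PPO:
        ##   L(theta) = E[ min( r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t ) ] - actor regularization,
        ## where r_t = pi_theta(a_t|s_t) / pi_old(a_t|s_t) (pi_old being the target-network policy here)
        ## and eps is taken from settings['kl_divergence_threshold'] above.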

        self._policy_grad = T.grad(-1.0 * self._actLoss, self._actionParams)
        self._policy_grad = lasagne.updates.total_norm_constraint(
            self._policy_grad, 5)
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._actionUpdates = lasagne.updates.rmsprop(
                self._policy_grad, self._actionParams, self._learning_rate,
                self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                           self._actionParams,
                                                           self._learning_rate,
                                                           momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._actionUpdates = lasagne.updates.adam(self._policy_grad,
                                                       self._actionParams,
                                                       self._learning_rate,
                                                       beta1=0.9,
                                                       beta2=0.999,
                                                       epsilon=1e-08)

        self._qprop_loss = self._actLoss + T.mean(
            (self._QProp_N * self._q_func))
        self._policy_grad_loss = self._actLoss
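        ## The Q-Prop surrogate above adds a control-variate term to the clipped surrogate:
        ##   L_qprop = L_ppo + E[ eta(s) * Q_w(s, a) ],
        ## where eta corresponds to self._QProp_N and Q_w to the off-policy critic self._q_func
        ## (evaluated at the policy mean when the actor gradient is computed in trainActor).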
        # if ('train_extra_value_function' in self.getSettings() and (self.getSettings()['train_extra_value_function'] == True)):
        self._valsA = lasagne.layers.get_output(
            self._model._value_function,
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._valsA_drop = lasagne.layers.get_output(
            self._model._value_function,
            self._model.getStateSymbolicVariable(),
            deterministic=False)
        self._valsNextState = lasagne.layers.get_output(
            self._model._value_function,
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._valsTargetNextState = lasagne.layers.get_output(
            self._modelTarget._value_function,
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._valsTarget = lasagne.layers.get_output(
            self._modelTarget._value_function,
            self._model.getStateSymbolicVariable(),
            deterministic=True)

        self._v_target = self._model.getRewardSymbolicVariable() + (
            self._discount_factor * self._valsTargetNextState)
        self._v_diff = self._v_target - self._valsA
        loss_v = T.pow(self._v_diff, 2)
        self._v_loss = T.mean(loss_v)
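        ## TD(0) regression target for the on-policy value function used above:
        ##   V(s) is fit to r + gamma * V_target(s').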

        self._params_value = lasagne.layers.helper.get_all_params(
            self._model._value_function)
        self._givens_value = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards(),
        }
        self._value_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model._value_function, lasagne.regularization.l2))

        self._value_grad = T.grad(self._v_loss + self._value_regularization,
                                  self._params_value)
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_value = lasagne.updates.adam(self._value_grad,
                                                   self._params_value,
                                                   self._critic_learning_rate,
                                                   beta1=0.9,
                                                   beta2=0.9,
                                                   epsilon=self._rms_epsilon)

        self._actGivens_PPO = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            # self._NotFallen: self._NotFallen_shared,
            self._Advantage:
            self._advantage_shared,
            # self._KL_Weight: self._kl_weight_shared
        }
        QProp.compile(self)

    def compile(self):

        ## For training the Q function
        self._train = theano.function([], [self._loss, self._q_func],
                                      updates=self._updates_,
                                      givens=self._givens_)
        ## For training the on-policy value function.
        self._train_value = theano.function([], [self._v_loss, self._valsA],
                                            updates=self._updates_value,
                                            givens=self._givens_value)
        ## Train the policy by backpropagating action gradients
        self._trainActionGRAD = theano.function(
            [], [],
            updates=self._actionGRADUpdates,
            givens=self._actGradGivens)
        ## Train the Policy directly using q function.
        # self._trainAction  = theano.function([self._model.getStateSymbolicVariable()], [self._q_valsA_], updates=self._actionUpdates)
        ## Train using PPO like method
        self._trainActor = theano.function([
            self._model.getStateSymbolicVariable(),
            self._model.getActionSymbolicVariable(), self._Advantage
        ], [self._actLoss],
                                           updates=self._actionUpdates)
        ## Get the Q value for the current batch of states and actions
        # self._q_val2 = theano.function([self._model.getStateSymbolicVariable(),
        #                                 self._model.getActionSymbolicVariable()], self._q_valsA_)

        self._q_val = theano.function(
            [],
            self._q_func,
            givens={
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getActionSymbolicVariable():
                self._model.getActions()
            })

        self._q_val_Target = theano.function(
            [  #self._model.getStateSymbolicVariable(), 
                #self._model.getActionSymbolicVariable()
            ],
            self._q_valsB_,
            givens={
                self._modelTarget.getStateSymbolicVariable():
                self._model.getResultStates(),
                self._modelTarget.getActionSymbolicVariable():
                self._model.getActions()
            })
        self._q_action = theano.function(
            [],
            self._q_valsActA,
            givens={
                self._model.getStateSymbolicVariable():
                self._model.getStates()
            })
        self._action_Target = theano.function(
            [],
            self._q_valsActTarget,
            givens={
                self._model.getResultStateSymbolicVariable():
                self._model.getResultStates()
            })
        self._bellman_error2 = theano.function(inputs=[],
                                               outputs=self._diff,
                                               allow_input_downcast=True,
                                               givens=self._givens_)

        self._get_action_grad = theano.function(
            [],
            outputs=lasagne.updates.get_or_compute_grads(
                T.mean(self._q_func),
                [self._model._actionInputVar] + self._params),
            allow_input_downcast=True,
            givens=self._givens_grad)

        self._givens_QProp = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions(),
            self._Advantage: self._advantage_shared,
            self._QProp_N: self._QProp_N_shared
        }
        self._get_Qprop_action_grad = theano.function(
            [],
            outputs=lasagne.updates.get_or_compute_grads(
                T.mean(self._qprop_loss),
                [self._model._actionInputVar] + self._params),
            allow_input_downcast=True,
            givens=self._givens_QProp)

        # self._get_Qprop_action_grad = theano.function([], outputs=lasagne.updates.get_or_compute_grads(self._policy_grad_loss, [self._model._actionInputVar] + self._params), allow_input_downcast=True, givens=self._actGivens_PPO)

        self._vals_extra = theano.function(
            [],
            outputs=self._valsA,
            allow_input_downcast=True,
            givens={
                self._model.getStateSymbolicVariable():
                self._model.getStates()
            })
        if ('train_extra_value_function' in self.getSettings()
                and (self.getSettings()['train_extra_value_function'])):
            self._givens_grad = {
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
            }
            self._get_state_grad = theano.function(
                [],
                outputs=lasagne.updates.get_or_compute_grads(
                    T.mean(self._valsA),
                    [self._model._stateInputVar] + self._params_value),
                allow_input_downcast=True,
                givens=self._givens_grad)
        else:
            self._get_state_grad = theano.function(
                [],
                outputs=lasagne.updates.get_or_compute_grads(
                    T.mean(self._q_func),
                    [self._model._stateInputVar] + self._params),
                allow_input_downcast=True,
                givens=self._givens_grad)

    def getGrads(self, states, actions=None, alreadyNormed=False):
        """
            The states should be normalized
        """
        # self.setData(states, actions, rewards, result_states)
        if (alreadyNormed == False):
            states = norm_state(states, self._state_bounds)
        states = np.array(states, dtype=theano.config.floatX)
        self._model.setStates(states)
        if (actions is None):
            actions = self.predict_batch(states)
        self._model.setActions(actions)
        return self._get_state_grad()

    def getActionGrads(self, states, actions=None, alreadyNormed=False):
        """
            The states should be normalized
        """
        # self.setData(states, actions, rewards, result_states)
        if (alreadyNormed == False):
            states = norm_state(states, self._state_bounds)
        states = np.array(states, dtype=theano.config.floatX)
        self._model.setStates(states)
        if (actions is None):
            actions = self.predict_batch(states)
        self._model.setActions(actions)

        if (self._use_basic_polcy_grad):
            return self._get_action_grad()
        else:
            return self._get_Qprop_action_grad()

    def updateTargetModel(self):
        if (self.getSettings()["print_levels"][self.getSettings(
        )["print_level"]] >= self.getSettings()["print_levels"]['train']):
            print("Updating target Model")
        """
            Target model updates
        """
        # return
        ## It should be okay to lerp the entire network even though we only really want to
        ## lerp the value-function part; the target policy is not used for anything.
        all_paramsA = lasagne.layers.helper.get_all_param_values(
            self._model.getCriticNetwork())
        all_paramsB = lasagne.layers.helper.get_all_param_values(
            self._modelTarget.getCriticNetwork())
        all_paramsActA = lasagne.layers.helper.get_all_param_values(
            self._model.getActorNetwork())
        all_paramsActB = lasagne.layers.helper.get_all_param_values(
            self._modelTarget.getActorNetwork())
        lerp_weight = self.getSettings()['target_net_interp_weight']
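        ## Soft (Polyak) update: params_target <- lerp_weight * params + (1 - lerp_weight) * params_target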

        all_params = []
        for paramsA, paramsB in zip(all_paramsA, all_paramsB):
            params = (lerp_weight * paramsA) + ((1.0 - lerp_weight) * paramsB)
            all_params.append(params)

        all_paramsAct = []
        for paramsA, paramsB in zip(all_paramsActA, all_paramsActB):
            params = (lerp_weight * paramsA) + ((1.0 - lerp_weight) * paramsB)
            all_paramsAct.append(params)

        lasagne.layers.helper.set_all_param_values(
            self._modelTarget.getCriticNetwork(), all_params)
        lasagne.layers.helper.set_all_param_values(
            self._modelTarget.getActorNetwork(), all_paramsAct)

    def updateTargetModelValue(self):
        if (self.getSettings()["print_levels"][self.getSettings(
        )["print_level"]] >= self.getSettings()["print_levels"]['train']):
            print("Updating MBAE target Model")
        """
            Target model updates
        """
        all_paramsA = lasagne.layers.helper.get_all_param_values(
            self._model._value_function)
        lasagne.layers.helper.set_all_param_values(
            self._modelTarget._value_function, all_paramsA)

        all_paramsActA = lasagne.layers.helper.get_all_param_values(
            self._model.getActorNetwork())
        lasagne.layers.helper.set_all_param_values(
            self._modelTarget2.getActorNetwork(), all_paramsActA)

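    ## Note: this second definition of updateTargetModel rebinds the name and shadows the
    ## lasagne-based version above; this one expects Keras-style get_weights()/set_weights()
    ## on the underlying networks.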
    def updateTargetModel(self):
        if (self.getSettings()["print_levels"][self.getSettings(
        )["print_level"]] >= self.getSettings()["print_levels"]['train']):
            print("Updating target Model")
        """
            Target model updates
        """
        # return
        ## It should be okay to lerp the entire network even though we only really want to
        ## lerp the value-function part; the target policy is not used for anything.
        all_paramsA = self._model.getCriticNetwork().get_weights()
        all_paramsB = self._modelTarget.getCriticNetwork().get_weights()
        if ('target_net_interp_weight' in self.getSettings()):
            lerp_weight = self.getSettings()['target_net_interp_weight']
        else:
            lerp_weight = 0.001
        # vals = lasagne.layers.helper.get_all_param_values(self._l_outActA)

        all_params = []
        for paramsA, paramsB in zip(all_paramsA, all_paramsB):
            # print ("paramsA: " + str(paramsA))
            # print ("paramsB: " + str(paramsB))
            params = (lerp_weight * paramsA) + ((1.0 - lerp_weight) * paramsB)
            all_params.append(params)
        self._modelTarget.getCriticNetwork().set_weights(all_params)

        all_paramsA_Act = self._model.getActorNetwork().get_weights()
        all_paramsB_Act = self._modelTarget.getActorNetwork().get_weights()

        all_params = []
        for paramsA, paramsB in zip(all_paramsA_Act, all_paramsB_Act):
            # print ("paramsA: " + str(paramsA))
            # print ("paramsB: " + str(paramsB))
            params = (lerp_weight * paramsA) + ((1.0 - lerp_weight) * paramsB)
            all_params.append(params)
        self._modelTarget.getActorNetwork().set_weights(all_params)

    def getNetworkParameters(self):
        params = []
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._model.getCriticNetwork()))
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._model.getActorNetwork()))
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._modelTarget.getCriticNetwork()))
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._modelTarget.getActorNetwork()))
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._model._value_function))
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._modelTarget._value_function))
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._modelTarget2.getActorNetwork()))
        return params

    def setNetworkParameters(self, params):
        lasagne.layers.helper.set_all_param_values(
            self._model.getCriticNetwork(), params[0])
        lasagne.layers.helper.set_all_param_values(
            self._model.getActorNetwork(), params[1])
        lasagne.layers.helper.set_all_param_values(
            self._modelTarget.getCriticNetwork(), params[2])
        lasagne.layers.helper.set_all_param_values(
            self._modelTarget.getActorNetwork(), params[3])
        lasagne.layers.helper.set_all_param_values(self._model._value_function,
                                                   params[4])
        lasagne.layers.helper.set_all_param_values(
            self._modelTarget._value_function, params[5])
        lasagne.layers.helper.set_all_param_values(
            self._modelTarget2.getActorNetwork(), params[6])

    def setData(self, states, actions, rewards, result_states, fallen):
        self._model.setStates(states)
        self._model.setResultStates(result_states)
        self._model.setActions(actions)
        self._model.setRewards(rewards)
        self._modelTarget.setStates(states)
        self._modelTarget.setResultStates(result_states)
        self._modelTarget.setActions(actions)
        self._modelTarget.setRewards(rewards)
        self._fallen_shared.set_value(fallen)

    def trainOnPolicyCritic(self, states, actions, rewards, result_states,
                            falls):
        """
            Train an on policy value function for the policy
            
            The incoming data should not be normalized.
        """
        self.setData(states, actions, rewards, result_states, falls)
        loss_v, _ = self._train_value()
        print("MBAE Value function loss: ", loss_v)

    def trainCritic(self, states, actions, rewards, result_states, falls):

        self.setData(states, actions, rewards, result_states, falls)
        self._updates += 1
        ## Compute actions for TargetNet
        target_actions = self._action_Target()
        self.setData(states, target_actions, rewards, result_states, falls)
        ## Get next q value
        q_vals_b = self._q_val_Target()
        ## Compute target values
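        ## i.e. y = r + gamma * Q_target(s', a_target), with a_target taken from the target actor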
        target_tmp_ = rewards + ((self._discount_factor * q_vals_b))
        self.setData(states, actions, rewards, result_states, falls)
        self._tmp_target_shared.set_value(target_tmp_)

        loss, _ = self._train()
        # self.updateTargetModel()

        return loss

    def trainActor(self,
                   states,
                   actions,
                   rewards,
                   result_states,
                   falls,
                   advantage,
                   exp_actions,
                   forwardDynamicsModel=None):

        if ((self._updates % self._weight_update_steps) == 0):
            self.updateTargetModelValue()
        self._updates += 1
        self.setData(states, actions, rewards, result_states, falls)
        if (self.getSettings()["print_levels"][self.getSettings(
        )["print_level"]] >= self.getSettings()["print_levels"]['debug']):
            print(
                "values: ",
                np.mean(self._q_val() *
                        (1.0 / (1.0 - self.getSettings()['discount_factor']))),
                " std: ",
                np.std(self._q_val() *
                       (1.0 / (1.0 - self.getSettings()['discount_factor']))))
            print("Rewards: ", np.mean(rewards), " std: ", np.std(rewards),
                  " shape: ",
                  np.array(rewards).shape)

        advantage = advantage * (1.0 - self._discount_factor)
        ### Get Q value of sampled actions
        sampled_q = self._q_val()
        ### Get Q value of policy
        policy_mean = self.predict_batch(states)
        self.setData(states, policy_mean, rewards, result_states, falls)
        true_q = self._q_val()

        ### From Q-prop paper, compute adaptive control variate.
        cov = advantage * true_q
        # var = true_q * true_q
        # n = cov / var
        ### practical implementation n = 1 when cov > 0, otherwise 0
        n = (np.sign(cov) + 1.0) / 2.0
        n = np.zeros_like(n)
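        ### Note: overriding n with zeros (above) turns the control variate off, so the
        ### advantage below is just the scaled and standardized input advantage and the
        ### update reduces to the plain clipped policy gradient.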
        advantage = (advantage - (n * (sampled_q - true_q)))
        std = np.std(advantage)
        mean = np.mean(advantage)
        if ('advantage_scaling' in self.getSettings()
                and (self.getSettings()['advantage_scaling'] != False)):
            std = std / self.getSettings()['advantage_scaling']
            mean = 0.0
        advantage = (advantage - mean) / std

        if (self._use_basic_polcy_grad):
            loss = 0
            action_grads = self.getActionGrads(states,
                                               policy_mean,
                                               alreadyNormed=True)[0]

            ### Get Advantage Action Gradients
            action_diff = (actions - policy_mean)
            # print ("advantage ", advantage)
            if (self.getSettings()["print_levels"][self.getSettings(
            )["print_level"]] >= self.getSettings()["print_levels"]['train']):
                print("Q-prop n mean: ", np.mean(n), " std: ", np.std(n))
                print("Advantage mean: ", np.mean(advantage), " std: ",
                      np.std(advantage))
                print("sampled_q mean: ", np.mean(sampled_q), " std: ",
                      np.std(sampled_q))
                print("true_q mean: ", np.mean(true_q), " std: ",
                      np.std(true_q))
                print("Policy mean: ", np.mean(self._q_action(), axis=0))
            # print ("Mean learned advantage: ", np.mean(sampled_q - true_q))
            # print ("Mean advantage: " , np.mean(advantage))
            action_gra = action_diff * (advantage)

            # action_grads = action_gra + ( n * action_grads )
        else:
            loss = 0
            self._advantage_shared.set_value(advantage)
            self._QProp_N_shared.set_value(n)
            action_grads = self.getActionGrads(states,
                                               policy_mean,
                                               alreadyNormed=True)[0]
            if (self.getSettings()["print_levels"][self.getSettings(
            )["print_level"]] >= self.getSettings()["print_levels"]['train']):
                print("Q-prop n mean: ", np.mean(n), " std: ", np.std(n))
                print("Advantage mean: ", np.mean(advantage), " std: ",
                      np.std(advantage))
                print("sampled_q mean: ", np.mean(sampled_q), " std: ",
                      np.std(sampled_q))
                print("true_q mean: ", np.mean(true_q), " std: ",
                      np.std(true_q))
                print("Policy mean: ", np.mean(self._q_action(), axis=0))
        """
            From DEEP REINFORCEMENT LEARNING IN PARAMETERIZED ACTION SPACE
            Hausknecht, Matthew and Stone, Peter
            
            actions.shape == action_grads.shape
        """
        use_parameter_grad_inversion = False
        if (use_parameter_grad_inversion):
            for i in range(action_grads.shape[0]):
                for j in range(action_grads.shape[1]):
                    if (action_grads[i, j] > 0):
                        inversion = (1.0 - actions[i, j]) / 2.0
                    else:
                        inversion = (actions[i, j] - (-1.0)) / 2.0
                    action_grads[i, j] = action_grads[i, j] * inversion
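            ## A vectorized equivalent of the inversion loop above (a sketch, assuming actions
            ## are bounded in [-1, 1]):
            ##   action_grads = np.where(action_grads > 0,
            ##                           action_grads * (1.0 - actions) / 2.0,
            ##                           action_grads * (actions + 1.0) / 2.0)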

        if (self.getSettings()["print_levels"][self.getSettings(
        )["print_level"]] >= self.getSettings()["print_levels"]['debug']):
            # print("Actions mean:     ", np.mean(actions, axis=0))
            print("Policy mean: ", np.mean(self._q_action(), axis=0))
            # print("Actions std:  ", np.mean(np.sqrt( (np.square(np.abs(actions - np.mean(actions, axis=0))))/1.0), axis=0) )
            # print("Actions std:  ", np.std((actions - self._q_action()), axis=0) )
            # print("Actions std:  ", np.std((actions), axis=0) )
            # print("Policy std: ", np.mean(self._q_action_std(), axis=0))
            # print("Mean Next State Grad grad: ", np.mean(next_state_grads, axis=0), " std ", np.std(next_state_grads, axis=0))
            print("Mean action grad: ", np.mean(action_grads, axis=0), " std ",
                  np.std(action_grads, axis=0))

        ## Set data for gradient
        self._model.setStates(states)
        self._modelTarget.setStates(states)
        ## Why the -1.0??
        ## Because the SGD method is always performing MINIMIZATION!!
        # self._action_grad_shared.set_value(-1.0*action_grads)
        # self._trainActionGRAD()

        self.setData(states, actions, rewards, result_states, falls)
        self._advantage_shared.set_value(advantage)
        loss = self._trainActor(states, actions, advantage)
        # loss = self._trainActor()

        return loss

    def train(self, states, actions, rewards, result_states):
        ### Note: trainCritic and trainActor above expect additional arguments
        ### (falls, advantage, exp_actions), so these calls would need to be
        ### extended to match those signatures.
        loss = self.trainCritic(states, actions, rewards, result_states)
        lossActor = self.trainActor(states, actions, rewards, result_states)
        return loss

    def q_value(self, state):
        """
            For returning a vector of q values, state should NOT be normalized
        """
        # states = np.zeros((self._batch_size, self._state_length), dtype=theano.config.floatX)
        # states[0, ...] = state
        state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=theano.config.floatX)
        self._model.setStates(state)
        self._modelTarget.setStates(state)
        action = self._q_action()
        self._model.setActions(action)
        self._modelTarget.setActions(action)

        if ('train_extra_value_function' in self.getSettings() and
            (self.getSettings()['train_extra_value_function'] == True)):
            q_vals = self._vals_extra()
        else:
            q_vals = self._q_val()

        if (('disable_parameter_scaling' in self._settings)
                and (self._settings['disable_parameter_scaling'])):
            return scale_reward(q_vals, self.getRewardBounds()) * (
                1.0 / (1.0 - self.getSettings()['discount_factor']))
            # return (self._q_val())[0]
        else:
            return scale_reward(q_vals, self.getRewardBounds()) * (
                1.0 / (1.0 - self.getSettings()['discount_factor']))
        # return self._q_valTarget()[0]
        # return self._q_val()[0]

    def q_values(self, state):
        """
            For returning a vector of q values, state should already be normalized
        """
        state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=theano.config.floatX)
        self._model.setStates(state)
        self._modelTarget.setStates(state)
        action = self._q_action()
        self._model.setActions(action)
        self._modelTarget.setActions(action)

        if ('train_extra_value_function' in self.getSettings() and
            (self.getSettings()['train_extra_value_function'] == True)):
            q_vals = self._vals_extra()
        else:
            q_vals = self._q_val()
        if (('disable_parameter_scaling' in self._settings)
                and (self._settings['disable_parameter_scaling'])):
            return scale_reward(q_vals, self.getRewardBounds()) * (
                1.0 / (1.0 - self.getSettings()['discount_factor']))
            # return (self._q_val())[0]
        else:
            return scale_reward(q_vals, self.getRewardBounds()) * (
                1.0 / (1.0 - self.getSettings()['discount_factor']))
        # return self._q_valTarget()
        # return self._q_val()

    def setStateBounds(self, state_bounds):
        super(QProp, self).setStateBounds(state_bounds)
        self._experience.setStateBounds(copy.deepcopy(self.getStateBounds()))

    def setActionBounds(self, action_bounds):
        super(QProp, self).setActionBounds(action_bounds)
        self._experience.setActionBounds(copy.deepcopy(self.getActionBounds()))

    def setRewardBounds(self, reward_bounds):
        super(QProp, self).setRewardBounds(reward_bounds)
        self._experience.setRewardBounds(copy.deepcopy(self.getRewardBounds()))

    def _q_action_std(self):
        ones = np.ones((self._model.getStateValues().shape[0],
                        len(self.getActionBounds()[0])))
        return np.array(self.getSettings()["exploration_rate"] * ones)