Example 1
File: GAN.py Project: skylbc/SMBAE
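The module-level imports are omitted from this excerpt. A minimal set that the class below appears to rely on is sketched here; the project-local module paths are assumptions inferred from the other examples on this page:

import copy

import numpy as np
import theano
import theano.tensor as T
import lasagne

from util.ExperienceMemory import ExperienceMemory
# Project-local helpers used below; their exact module paths are assumed:
# from model.ModelUtil import norm_state, norm_action, scale_state, scale_reward
# from algorithm.AlgorithmInterface import AlgorithmInterface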
class GAN(AlgorithmInterface):
    """
        0 is a generated sample
        1 is a true sample
        maximize D while minimizing G
    """
    def __init__(self, model, state_length, action_length, state_bounds,
                 action_bounds, settings_):

        print("Building GAN Model")
        super(GAN, self).__init__(model, state_length, action_length,
                                  state_bounds, action_bounds, 0, settings_)
        self._noise_mean = 0.0
        self._noise_std = 1.0
        self._noise_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                           broadcastable=(False, True))

        # if settings['action_space_continuous']:
        if ('size_of_result_state' in self.getSettings()):
            self._experience = ExperienceMemory(
                state_length,
                action_length,
                self.getSettings()['expereince_length'],
                continuous_actions=True,
                settings=self.getSettings(),
                result_state_length=self.getSettings()['size_of_result_state'])
        else:
            self._experience = ExperienceMemory(
                state_length,
                action_length,
                self.getSettings()['expereince_length'],
                continuous_actions=True,
                settings=self.getSettings())

        self._experience.setStateBounds(copy.deepcopy(self.getStateBounds()))
        self._experience.setRewardBounds(copy.deepcopy(self.getRewardBounds()))
        self._experience.setActionBounds(copy.deepcopy(self.getActionBounds()))

        self._modelTarget = copy.deepcopy(model)

        # print ("Initial W " + str(self._w_o.get_value()) )

        self._learning_rate = self.getSettings()["fd_learning_rate"]
        self._regularization_weight = 1e-5
        self._discount_factor = self.getSettings()['discount_factor']
        self._rho = self.getSettings()['rho']
        self._rms_epsilon = self.getSettings()['rms_epsilon']

        self._weight_update_steps = self.getSettings(
        )['steps_until_target_network_update']
        self._updates = 0
        self._decay_weight = self.getSettings()['regularization_weight']
        self._critic_regularization_weight = self.getSettings(
        )["critic_regularization_weight"]

        # self._q_valsA = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)
        # self._q_valsA_drop = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)
        # self._q_valsNextState = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True)
        # self._q_valsTargetNextState = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True)
        # self._q_valsTarget = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)
        # self._q_valsTarget_drop = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)

        if ("train_gan_with_gaussian_noise" in self.getSettings()
                and (self.getSettings()["train_gan_with_gaussian_noise"])):
            inputs_1 = {
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getActionSymbolicVariable():
                self._model.getActions(),
                self._model.getResultStateSymbolicVariable():
                self._model.getResultStates(),
                self._model._Noise:
                self._noise_shared
            }
            self._generator_drop = lasagne.layers.get_output(
                self._model.getForwardDynamicsNetwork(),
                inputs_1,
                deterministic=True)
            self._generator = lasagne.layers.get_output(
                self._model.getForwardDynamicsNetwork(),
                inputs_1,
                deterministic=True)
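            # With an explicit Gaussian noise input, both outputs are computed
            # deterministically here; the stochasticity comes from
            # self._noise_shared rather than from dropout.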
        else:
            inputs_1 = {
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getActionSymbolicVariable():
                self._model.getActions(),
                self._model.getResultStateSymbolicVariable():
                self._model.getResultStates(),
                # self._model._Noise: self._noise_shared
            }
            self._generator = lasagne.layers.get_output(
                self._model.getForwardDynamicsNetwork(),
                inputs_1,
                deterministic=True)
            self._generator_drop = lasagne.layers.get_output(
                self._model.getForwardDynamicsNetwork(),
                inputs_1,
                deterministic=False)
        # self._q_valsActTarget = lasagne.layers.get_output(self._modelTarget.getForwardDynamicsNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True)
        # self._q_valsActA_drop = lasagne.layers.get_output(self._model.getForwardDynamicsNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)

        inputs_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model._Noise: self._noise_shared
        }
        self._discriminator = lasagne.layers.get_output(
            self._model.getCriticNetwork(), inputs_, deterministic=True)
        self._discriminator_drop = lasagne.layers.get_output(
            self._model.getCriticNetwork(), inputs_, deterministic=False)
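        # The discriminator scores full (state, action, next_state) transitions;
        # its 0/1 training target is fed in through the reward symbolic variable
        # (see self._diff below and trainCritic()).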
        """
        inputs_2 = {
            self._modelTarget.getStateSymbolicVariable(): self._model.getResultStates(),
            self._modelTarget.getActionSymbolicVariable(): self._model.getActions()
        }
        """

        self._diff = self._model.getRewardSymbolicVariable(
        ) - self._discriminator_drop
        loss = T.pow(self._diff, 2)
        self._loss = T.mean(loss)

        self._diff_g = self._model.getResultStateSymbolicVariable(
        ) - self._generator_drop
        loss_g = T.pow(self._diff_g, 2)
        self._loss_g = T.mean(loss_g)
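        # In addition to the adversarial signal, the generator gets a plain
        # supervised MSE loss toward the true next state; trainActor() applies
        # it via _trainGenerator_MSE() unless 'train_gan_mse' is set to False.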

        # assert len(lasagne.layers.helper.get_all_params(self._l_outA)) == 16
        # Need to remove the action layers from these params
        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        print("******Number of Layers is: " + str(
            len(
                lasagne.layers.helper.get_all_params(
                    self._model.getCriticNetwork()))))
        print("******Number of Action Layers is: " + str(
            len(
                lasagne.layers.helper.get_all_params(
                    self._model.getForwardDynamicsNetwork()))))
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getForwardDynamicsNetwork())
        self._givens_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards(),
            # self._model._Noise: self._noise_shared
        }

        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))

        ## MSE update
        self._value_grad = T.grad(self._loss + self._critic_regularization,
                                  self._params)
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_ = lasagne.updates.adam(self._value_grad,
                                              self._params,
                                              self._learning_rate,
                                              beta1=0.9,
                                              beta2=0.9,
                                              epsilon=self._rms_epsilon)

        if ("train_gan_with_gaussian_noise" in settings_
                and (settings_["train_gan_with_gaussian_noise"])):
            self._actGivens = {
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getActionSymbolicVariable():
                self._model.getActions(),
                # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
                self._model._Noise:
                self._noise_shared
            }
            self._actGivens_MSE = {
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getActionSymbolicVariable():
                self._model.getActions(),
                self._model.getResultStateSymbolicVariable():
                self._model.getResultStates(),
                self._model._Noise:
                self._noise_shared
            }
        else:
            self._actGivens = {
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getActionSymbolicVariable():
                self._model.getActions(),
                # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
                # self._model._Noise: self._noise_shared
            }
            self._actGivens_MSE = {
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getActionSymbolicVariable():
                self._model.getActions(),
                # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
                # self._model._Noise: self._noise_shared
            }

        self._actor_regularization = (
            self._regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getForwardDynamicsNetwork(),
                lasagne.regularization.l2))
        ## MSE update
        self._gen_grad = T.grad(self._loss_g + self._actor_regularization,
                                self._actionParams)
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_generator = lasagne.updates.adam(
            self._gen_grad,
            self._actionParams,
            self._learning_rate,
            beta1=0.9,
            beta2=0.9,
            epsilon=self._rms_epsilon)

        ## Machinery to backpropagate externally supplied gradients w.r.t. the generated next state

        self._result_state_grad = T.matrix("Action_Grad")
        self._result_state_grad.tag.test_value = np.zeros(
            (self._batch_size, self._state_length),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._result_state_grad_shared = theano.shared(
            np.zeros((self._batch_size, self._state_length),
                     dtype=self.getSettings()['float_type']))

        ### Maximize wrt q function

        self._result_state_mean_grads = T.grad(
            cost=None,
            wrt=self._actionParams,
            known_grads={self._generator: self._result_state_grad_shared})
        print("Generator grads: ", self._result_state_mean_grads)
        ## T.grad over a list of parameters already returns a proper list of
        ## gradient expressions, one per parameter.
        self._result_state_mean_grads = list(self._result_state_mean_grads)
        # print ("isinstance(self._action_mean_grads, list): ", isinstance(self._action_mean_grads, list))
        # print ("Action grads: ", self._action_mean_grads)
        self._generatorGRADUpdates = lasagne.updates.adam(
            self._result_state_mean_grads,
            self._actionParams,
            self._learning_rate * 0.1,
            beta1=0.9,
            beta2=0.9,
            epsilon=self._rms_epsilon)
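        # T.grad(cost=None, known_grads=...) back-propagates an externally
        # supplied gradient: _result_state_grad_shared holds dD/d(next_state),
        # computed later with _get_result_state_grad() and stored through
        # setGradTarget(), and the expression above chains it through the
        # generator parameters.  The resulting gradients feed the Adam update
        # _generatorGRADUpdates used by _trainGenerator().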

        self._givens_grad = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
        }

        ### Some other stuff to learn a reward function
        self._inputs_reward_ = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions(),
        }
        self._reward = lasagne.layers.get_output(
            self._model.getRewardNetwork(),
            self._inputs_reward_,
            deterministic=True)
        self._reward_drop = lasagne.layers.get_output(
            self._model.getRewardNetwork(),
            self._inputs_reward_,
            deterministic=False)
        ## Rewards are normalized and then scaled by 1/(1 - discount_factor) so the values stay between -1 and 1.
        self._reward_diff = (self._model.getRewardSymbolicVariable() *
                             (1.0 /
                              (1.0 - self.getSettings()['discount_factor']))
                             ) - self._reward_drop
        self.__Reward = self._model.getRewardSymbolicVariable()
        print("self.__Reward", self.__Reward)
        # self._reward_diff = (self._model.getRewardSymbolicVariable()) - self._reward_drop
        self._reward_loss_ = T.mean(T.pow(self._reward_diff, 2), axis=1)
        self._reward_loss = T.mean(self._reward_loss_)

        self._reward_diff_NoDrop = (
            self._model.getRewardSymbolicVariable() *
            (1.0 /
             (1.0 - self.getSettings()['discount_factor']))) - self._reward
        # self._reward_diff_NoDrop = (self._model.getRewardSymbolicVariable()) - self._reward
        self._reward_loss_NoDrop_ = T.mean(T.pow(self._reward_diff_NoDrop, 2),
                                           axis=1)
        self._reward_loss_NoDrop = T.mean(self._reward_loss_NoDrop_)
        self._reward_params = lasagne.layers.helper.get_all_params(
            self._model.getRewardNetwork())
        self._reward_givens_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            # self._model.getResultStateSymbolicVariable() : self._model.getResultStates(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            self._model.getRewardSymbolicVariable():
            self._model.getRewards(),
        }
        self._reward_updates_ = lasagne.updates.adam(
            self._reward_loss +
            (self._regularization_weight *
             lasagne.regularization.regularize_network_params(
                 self._model.getRewardNetwork(), lasagne.regularization.l2)),
            self._reward_params,
            self._learning_rate,
            beta1=0.9,
            beta2=0.999,
            epsilon=self._rms_epsilon)

        GAN.compile(self)

    def compile(self):

        self._train = theano.function([], [self._loss, self._discriminator],
                                      updates=self._updates_,
                                      givens=self._givens_)

        # self._trainActor = theano.function([], [actLoss, self._q_valsActA], updates=actionUpdates, givens=actGivens)
        # self._trainActor = theano.function([], [self._q_func], updates=self._actionUpdates, givens=self._actGivens)
        self._trainGenerator = theano.function(
            [], [], updates=self._generatorGRADUpdates, givens=self._actGivens)
        self._trainGenerator_MSE = theano.function(
            [], [],
            updates=self._updates_generator,
            givens=self._actGivens_MSE)
        self._discriminate = theano.function(
            [],
            self._discriminator,
            givens={
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getActionSymbolicVariable():
                self._model.getActions(),
                self._model.getResultStateSymbolicVariable():
                self._model.getResultStates(),
            })

        #self._q_val_Target = theano.function([], self._q_valsB_, givens=self._givens_grad)
        if ("train_gan_with_gaussian_noise" in self.getSettings()
                and (self.getSettings()["train_gan_with_gaussian_noise"])):
            self._generate = theano.function(
                [],
                self._generator,
                givens={
                    self._model.getStateSymbolicVariable():
                    self._model.getStates(),
                    self._model.getActionSymbolicVariable():
                    self._model.getActions(),
                    # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
                    self._model._Noise:
                    self._noise_shared
                })
        else:
            self._generate = theano.function(
                [],
                self._generator,
                givens={
                    self._model.getStateSymbolicVariable():
                    self._model.getStates(),
                    self._model.getActionSymbolicVariable():
                    self._model.getActions(),
                    # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
                    self._model._Noise:
                    self._noise_shared
                })
        """
        inputs_ = [
                   self._model.getStateSymbolicVariable(), 
                   self._model.getRewardSymbolicVariable(), 
                   # ResultState
                   ]
        self._bellman_error = theano.function(inputs=inputs_, outputs=self._diff, allow_input_downcast=True)
        """
        # self._diffs = theano.function(input=[State])
        self._bellman_error = theano.function(
            inputs=[],
            outputs=self._loss_g,
            allow_input_downcast=True,
            givens={
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getActionSymbolicVariable():
                self._model.getActions(),
                self._model.getResultStateSymbolicVariable():
                self._model.getResultStates(),
                self._model._Noise:
                self._noise_shared
            })
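        # Note: despite its name, self._bellman_error returns the generator's
        # MSE loss (_loss_g) between true and generated next states.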

        # self._get_action_grad = theano.function([], outputs=lasagne.updates.get_or_compute_grads(T.mean(self._discriminator), [self._model._actionInputVar] + self._params), allow_input_downcast=True, givens=self._givens_grad)
        self._get_state_grad = theano.function(
            [],
            outputs=lasagne.updates.get_or_compute_grads(
                T.mean(self._discriminator),
                [self._model._stateInputVar] + self._params),
            allow_input_downcast=True,
            givens=self._givens_grad)
        self._get_result_state_grad = theano.function(
            [],
            outputs=lasagne.updates.get_or_compute_grads(
                T.mean(self._discriminator),
                [self._model._resultStateInputVar] + self._params),
            allow_input_downcast=True,
            givens=self._givens_grad)
        self._get_action_grad = theano.function(
            [],
            outputs=T.grad(
                cost=None,
                wrt=[self._model._actionInputVar] + self._actionParams,
                known_grads={self._generator: self._result_state_grad_shared}),
            allow_input_downcast=True,
            givens=self._actGivens)

        # self._get_grad_reward = theano.function([], outputs=lasagne.updates.get_or_compute_grads((self._reward_loss_NoDrop), [lasagne.layers.get_all_layers(self._model.getRewardNetwork())[0].input_var] + self._reward_params), allow_input_downcast=True,
        self._get_grad_reward = theano.function(
            [],
            outputs=lasagne.updates.get_or_compute_grads(
                T.mean(self._reward),
                [self._model._actionInputVar] + self._reward_params),
            allow_input_downcast=True,
            givens=self._inputs_reward_)

        self._train_reward = theano.function([], [self._reward_loss],
                                             updates=self._reward_updates_,
                                             givens=self._reward_givens_)
        self._predict_reward = theano.function([],
                                               self._reward,
                                               givens=self._inputs_reward_)
        self._reward_error = theano.function(inputs=[],
                                             outputs=self._reward_diff,
                                             allow_input_downcast=True,
                                             givens=self._reward_givens_)
        self._reward_values = theano.function(
            inputs=[],
            outputs=self.__Reward,
            allow_input_downcast=True,
            givens={
                # self._model.getStateSymbolicVariable() : self._model.getStates(),
                # self._model.getResultStateSymbolicVariable() : self._model.getResultStates(),
                # self._model.getActionSymbolicVariable(): self._model.getActions(),
                self._model.getRewardSymbolicVariable():
                self._model.getRewards(),
            })

    def getStateGrads(self, states, actions=None, alreadyNormed=False):
        """
            The states should be normalized
        """
        # self.setData(states, actions, rewards, result_states)
        if (alreadyNormed == False):
            states = norm_state(states, self._state_bounds)
        states = np.array(states, dtype=theano.config.floatX)
        self._model.setStates(states)

        return self._get_state_grad()

    def getResultStateGrads(self,
                            result_states,
                            actions=None,
                            alreadyNormed=False):
        """
            The states should be normalized
        """
        # self.setData(states, actions, rewards, result_states)
        if (alreadyNormed == False):
            result_states = norm_state(result_states, self._state_bounds)
        result_states = np.array(result_states, dtype=theano.config.floatX)
        self._model.setResultStates(result_states)

        return self._get_result_state_grad()

    def setGradTarget(self, grad):
        self._result_state_grad_shared.set_value(grad)

    def getGrads(self,
                 states,
                 actions,
                 result_states,
                 v_grad=None,
                 alreadyNormed=False):
        if (alreadyNormed == False):
            states = np.array(norm_state(states, self._state_bounds),
                              dtype=self.getSettings()['float_type'])
            actions = np.array(norm_action(actions, self._action_bounds),
                               dtype=self.getSettings()['float_type'])
            result_states = np.array(norm_state(result_states,
                                                self._state_bounds),
                                     dtype=self.getSettings()['float_type'])
        # result_states = np.array(result_states, dtype=self.getSettings()['float_type'])
        self.setData(states, actions, result_states)
        # if (v_grad != None):
        self.setGradTarget(v_grad)
        return self._get_action_grad()

    def getRewardGrads(self, states, actions, alreadyNormed=False):
        # states = np.array(states, dtype=self.getSettings()['float_type'])
        # actions = np.array(actions, dtype=self.getSettings()['float_type'])
        if (alreadyNormed is False):
            states = np.array(norm_state(states, self._state_bounds),
                              dtype=self.getSettings()['float_type'])
            actions = np.array(norm_action(actions, self._action_bounds),
                               dtype=self.getSettings()['float_type'])
            # rewards = np.array(norm_state(rewards, self._reward_bounds), dtype=self.getSettings()['float_type'])
        self.setData(states, actions)
        return self._get_grad_reward()

    def getNetworkParameters(self):
        params = []
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._model.getCriticNetwork()))
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._model.getForwardDynamicsNetwork()))
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._model.getRewardNetwork()))
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._modelTarget.getCriticNetwork()))
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._modelTarget.getForwardDynamicsNetwork()))
        params.append(
            lasagne.layers.helper.get_all_param_values(
                self._modelTarget.getRewardNetwork()))
        return params

    def setNetworkParameters(self, params):
        lasagne.layers.helper.set_all_param_values(
            self._model.getCriticNetwork(), params[0])
        lasagne.layers.helper.set_all_param_values(
            self._model.getForwardDynamicsNetwork(), params[1])
        lasagne.layers.helper.set_all_param_values(
            self._model.getRewardNetwork(), params[2])
        lasagne.layers.helper.set_all_param_values(
            self._modelTarget.getCriticNetwork(), params[3])
        lasagne.layers.helper.set_all_param_values(
            self._modelTarget.getForwardDynamicsNetwork(), params[4])
        lasagne.layers.helper.set_all_param_values(
            self._modelTarget.getRewardNetwork(), params[5])

    def setData(self, states, actions, result_states=None, rewards=None):
        self._model.setStates(states)
        self._model.setActions(actions)
        if not (result_states is None):
            self._model.setResultStates(result_states)
        if not (rewards is None):
            self._model.setRewards(rewards)
        noise = np.random.normal(self._noise_mean,
                                 self._noise_std,
                                 size=(states.shape[0], 1))
        self._noise_shared.set_value(noise)
        # noise = np.zeros((states.shape[0],1))
        # self._noise_shared.set_value(noise)

    def trainCritic(self, states, actions, result_states, rewards):

        self.setData(states, actions, result_states, rewards)
        noise = np.random.normal(self._noise_mean,
                                 self._noise_std,
                                 size=(states.shape[0], 1))
        # print ("Shapes: ", states.shape, actions.shape, rewards.shape, result_states.shape, falls.shape, noise.shape)
        self._noise_shared.set_value(noise)
        self._updates += 1
        ## Generate candidate next states with the generator
        generated_samples = self._generate()
        ### Put generated samples in memory
        for i in range(generated_samples.shape[0]):
            next_state__ = scale_state(generated_samples[i],
                                       self._state_bounds)
            tup = ([states[i]], [actions[i]], [next_state__], [rewards[i]],
                   [0], [0], [0])
            self._experience.insertTuple(tup)
        tmp_result_states = copy.deepcopy(result_states)
        tmp_rewards = copy.deepcopy(rewards)

        ## Pull out a batch of generated samples
        states__, actions__, generated_samples, rewards__, falls__, G_ts__, exp_actions__ = self._experience.get_batch(
            min(states.shape[0], self._experience.samples()))
        """
        print("generated_samples: ", generated_samples.shape)
        print("tmp_result_states: ", tmp_result_states.shape)
        print("tmp_rewards: ", tmp_rewards.shape)
        print("states: ", states.shape)
        print("actions: ", actions.shape)
        """

        ## replace half of the samples with generated ones...
        for i in range(int(states.shape[0] / 2)):

            tmp_result_states[i] = generated_samples[i]
            tmp_rewards[i] = [0]
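        # After this loop the first half of the batch holds generated next
        # states labelled 0 (fake), while the remaining entries keep the true
        # next states and their original targets (1 for true samples, per the
        # class docstring).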

        # print("Discriminator targets: ", tmp_rewards)

        self.setData(states, actions, tmp_result_states, tmp_rewards)

        loss, _ = self._train()
        # print("Discriminator loss: ", loss)
        return loss

    def trainActor(self, states, actions, result_states, rewards):
        self.setData(states, actions, result_states, rewards)

        # self._noise_shared.set_value(np.random.normal(self._noise_mean,self._noise_std, size=(states.shape[0],1)))
        ## Add MSE term
        if ('train_gan_mse' in self.getSettings()
                and (self.getSettings()['train_gan_mse'] == False)):
            pass
        else:
            self._trainGenerator_MSE()
        # print("Policy mean: ", np.mean(self._q_action(), axis=0))
        loss = 0
        # print("******** Not learning actor right now *****")
        # return loss
        generated_samples = self.predict_batch(states, actions)
        result_state_grads = self.getResultStateGrads(generated_samples,
                                                      actions,
                                                      alreadyNormed=True)[0]
        discriminator_value = self._discriminate()
        """
            From DEEP REINFORCEMENT LEARNING IN PARAMETERIZED ACTION SPACE
            Hausknecht, Matthew and Stone, Peter
            
            actions.shape == result_state_grads.shape
        """
        use_parameter_grad_inversion = True
        if (use_parameter_grad_inversion):
            for i in range(result_state_grads.shape[0]):
                for j in range(result_state_grads.shape[1]):
                    if (result_state_grads[i, j] > 0):
                        inversion = (1.0 - generated_samples[i, j]) / 2.0
                    else:
                        inversion = (generated_samples[i, j] - (-1.0)) / 2.0
                    result_state_grads[i,
                                       j] = result_state_grads[i,
                                                               j] * inversion
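            # A vectorized equivalent of the loop above would be, roughly:
            #     upper = (1.0 - generated_samples) / 2.0
            #     lower = (generated_samples + 1.0) / 2.0
            #     result_state_grads *= np.where(result_state_grads > 0, upper, lower)
            # i.e. positive gradients are scaled by the remaining headroom to the
            # upper bound (+1) and negative gradients by the headroom to the
            # lower bound (-1), following the inverting-gradients trick cited above.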

        if (self.getSettings()["print_levels"][self.getSettings(
        )["print_level"]] >= self.getSettings()["print_levels"]['debug']):
            print("Policy mean: ", np.mean(self._generate(), axis=0))
            print("Mean action grad: ", np.mean(result_state_grads, axis=0),
                  " std ", np.std(result_state_grads, axis=0))

        ## Set data for gradient
        self._model.setResultStates(result_states)
        self._modelTarget.setResultStates(result_states)

        # self._noise_shared.set_value(np.random.normal(self._noise_mean,self._noise_std, size=(states.shape[0],1)))
        error_MSE = self._bellman_error()
        ## The -1.0 flips the sign of the gradient: the optimizer always
        ## performs minimization, while we want to ascend the discriminator's score.
        self._result_state_grad_shared.set_value(-1.0 * result_state_grads)
        self._trainGenerator()
        # self._noise_shared.set_value(np.random.normal(self._noise_mean,self._noise_std, size=(states.shape[0],1)))
        error_MSE = self._bellman_error()
        return (np.mean(discriminator_value), error_MSE)

    def train(self, states, actions, result_states, rewards):
        loss = self.trainCritic(states, actions, result_states, rewards)
        # loss = 0
        lossActor = self.trainActor(states, actions, result_states, rewards)
        if (self.getSettings()['train_reward_predictor']):
            # print ("self._reward_bounds: ", self._reward_bounds)
            # print( "Rewards, predicted_reward, difference, model diff, model rewards: ", np.concatenate((rewards, self._predict_reward(), self._predict_reward() - rewards, self._reward_error(), self._reward_values()), axis=1))
            self.setData(states, actions, result_states, rewards)
            lossReward = self._train_reward()
            if (self.getSettings()["print_levels"][self.getSettings(
            )["print_level"]] >= self.getSettings()["print_levels"]['train']):
                print("Loss Reward: ", lossReward)
        return (loss, lossActor)
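    # NOTE: the two stubs below are shadowed by the predict(state, action) and
    # predict_batch(states, actions) definitions further down in the class.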

    def predict(self, state, deterministic_=True):
        pass

    def predict_batch(self, states, deterministic_=True):
        pass

    def predict(self, state, action):
        # states = np.zeros((self._batch_size, self._self._state_length), dtype=theano.config.floatX)
        # states[0, ...] = state
        state = np.array(norm_state(state, self._state_bounds),
                         dtype=self.getSettings()['float_type'])
        # print ("fd state: ", state)
        action = np.array(norm_action(action, self._action_bounds),
                          dtype=self.getSettings()['float_type'])
        # self._model.setStates(state)
        # self._model.setActions(action)
        self.setData(state, action)
        # self._noise_shared.set_value(np.random.normal(self._noise_mean,self._noise_std, size=(1,1)))
        # print ("State bounds: ", self._state_bounds)
        # print ("gen output: ", self._generate()[0])
        state_ = scale_state(self._generate(), self._state_bounds)
        # print( "self._state_bounds: ", self._state_bounds)
        # print ("scaled output: ", state_)
        return state_

    def predict_batch(self, states, actions):
        ## These inputs should already be normalized.
        # self._model.setStates(states)
        # self._model.setActions(actions)
        self.setData(states, actions)
        # self._noise_shared.set_value(np.random.normal(self._noise_mean,self._noise_std, size=(states.shape[0],1)))
        # print ("State bounds: ", self._state_bounds)
        # print ("fd output: ", self._forwardDynamics()[0])
        # state_ = scale_state(self._generate(), self._state_bounds)
        state_ = self._generate()
        return state_
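    # NOTE: q_value(self, state) and q_values(self, state) below call
    # self._q_action() / self._q_val(), which are not compiled in this class,
    # and this q_value() is shadowed by the (state, action, next_state)
    # overload that follows it.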

    def q_value(self, state):
        """
            For returning a vector of q values, state should NOT be normalized
        """
        # states = np.zeros((self._batch_size, self._state_length), dtype=theano.config.floatX)
        # states[0, ...] = state
        state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=theano.config.floatX)
        self._model.setStates(state)
        self._modelTarget.setStates(state)
        action = self._q_action()
        self._model.setActions(action)
        self._modelTarget.setActions(action)
        return scale_reward(self._discriminate(), self.getRewardBounds()) * (
            1.0 / (1.0 - self.getSettings()['discount_factor']))
        # return self._q_valTarget()[0]
        # return self._q_val()[0]

    def q_value(self, state, action, next_state):
        """
            For returning a vector of q values, state should NOT be normalized
        """
        # states = np.zeros((self._batch_size, self._state_length), dtype=theano.config.floatX)
        # states[0, ...] = state
        state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=theano.config.floatX)
        self._model.setStates(state)
        self._modelTarget.setStates(state)
        # action = self._q_action()
        action = norm_state(action, self.getActionBounds())
        self._model.setActions(action)
        self._modelTarget.setActions(action)
        nextState = norm_state(next_state, self.getStateBounds())
        # nextState = np.reshape(nextState, (1,20))
        self._model.setResultStates(nextState)
        self._modelTarget.setResultStates(nextState)

        # return scale_reward(self._discriminate(), self.getRewardBounds())[0] * (1.0 / (1.0- self.getSettings()['discount_factor']))
        return self._discriminate()
        # return self._q_valTarget()[0]
        # return self._q_val()[0]

    def q_values(self, state):
        """
            For returning a vector of q values, state should already be normalized
        """
        state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=theano.config.floatX)
        self._model.setStates(state)
        self._modelTarget.setStates(state)
        action = self._q_action()
        self._model.setActions(action)
        self._modelTarget.setActions(action)
        return scale_reward(self._q_val(), self.getRewardBounds()) * (
            1.0 / (1.0 - self.getSettings()['discount_factor']))
        # return self._q_valTarget()
        # return self._q_val()

    def predict_std(self, state, deterministic_=True):
        """
            This does nothing for a GAN...
        """
        # states = np.zeros((self._batch_size, self._state_length), dtype=theano.config.floatX)
        # states[0, ...] = state
        action_std = np.array([0] * len(self._action_bounds))
        # np.zeros((state.shape[0], len(self._action_bounds)))
        # else:
        # action_ = scale_action(self._q_action()[0], self._action_bounds)
        # action_ = q_valsActA[0]
        return action_std

    def predict_reward(self, state, action):
        # states = np.zeros((self._batch_size, self._self._state_length), dtype=theano.config.floatX)
        # states[0, ...] = state
        state = np.array(norm_state(state, self._state_bounds),
                         dtype=self.getSettings()['float_type'])
        action = np.array(norm_action(action, self._action_bounds),
                          dtype=self.getSettings()['float_type'])
        self._model.setStates(state)
        self._model.setActions(action)
        predicted_reward = self._predict_reward()
        reward_ = scale_reward(predicted_reward, self.getRewardBounds(
        ))  # * (1.0 / (1.0- self.getSettings()['discount_factor']))
        # reward_ = scale_reward(predicted_reward, self.getRewardBounds())[0] * (1.0 / (1.0- self.getSettings()['discount_factor']))
        # reward_ = scale_state(predicted_reward, self._reward_bounds)
        # print ("reward, predicted reward: ", reward_, predicted_reward)
        return reward_

    def predict_reward_batch(self, states, actions):

        # states = np.zeros((self._batch_size, self._self._state_length), dtype=theano.config.floatX)
        # states[0, ...] = state
        # state = np.array(norm_state(state, self._state_bounds), dtype=self.getSettings()['float_type'])
        # action = np.array(norm_action(action, self._action_bounds), dtype=self.getSettings()['float_type'])
        self._model.setStates(states)
        self._model.setActions(actions)
        predicted_reward = self._predict_reward()
        # reward_ = scale_reward(predicted_reward, self.getRewardBounds())[0] # * (1.0 / (1.0- self.getSettings()['discount_factor']))
        # reward_ = scale_reward(predicted_reward, self.getRewardBounds())[0] * (1.0 / (1.0- self.getSettings()['discount_factor']))
        # reward_ = scale_state(predicted_reward, self._reward_bounds)
        # print ("reward, predicted reward: ", reward_, predicted_reward)
        return predicted_reward

    def bellman_error(self, states, actions, result_states, rewards):
        self.setData(states, actions, result_states, rewards)
        return self._bellman_error()

    def reward_error(self, states, actions, result_states, rewards):
        # rewards = rewards * (1.0/(1.0-self.getSettings()['discount_factor'])) # scale rewards
        self.setData(states, actions, result_states, rewards)
        return self._reward_error()

    def setStateBounds(self, state_bounds):
        super(GAN, self).setStateBounds(state_bounds)
        """
        print ("")
        print("Setting GAN state bounds: ", state_bounds)
        print("self.getStateBounds(): ", self.getStateBounds())
        print ("")
        """
        self._experience.setStateBounds(copy.deepcopy(self.getStateBounds()))

    def setActionBounds(self, action_bounds):
        super(GAN, self).setActionBounds(action_bounds)
        self._experience.setActionBounds(copy.deepcopy(self.getActionBounds()))

    def setRewardBounds(self, reward_bounds):
        super(GAN, self).setRewardBounds(reward_bounds)
        self._experience.setRewardBounds(copy.deepcopy(self.getRewardBounds()))
Example 2
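The module-level imports are again omitted; a minimal sketch of what the function body below appears to rely on (not the project's actual header):

import json
import sys
import multiprocessing

import numpy as np

# getAgentName(), used below, is assumed to come from util.SimulationUtil.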
def fitModelToData(settingsFileName):
    """
    State is the input state and Action is the desired output (y).
    """
    # from model.ModelUtil import *
    
    file = open(settingsFileName)
    settings = json.load(file)
    print ("Settings: " + str(json.dumps(settings)))
    file.close()
    import os    
    os.environ['THEANO_FLAGS'] = "mode=FAST_RUN,device="+settings['training_processor_type']+",floatX="+settings['float_type']
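    ## Theano-dependent modules must be imported only after THEANO_FLAGS is set,
    ## which is why the imports below appear inside the function.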
    
    from ModelEvaluation import SimWorker, evalModelParrallel, collectExperience, simEpoch, evalModel
    from model.ModelUtil import validBounds
    from model.LearningAgent import LearningAgent, LearningWorker
    from util.SimulationUtil import validateSettings, createEnvironment, createRLAgent, createActor
    from util.SimulationUtil import getDataDirectory, createForwardDynamicsModel, createSampler
    
    
    from util.ExperienceMemory import ExperienceMemory
    from RLVisualize import RLVisualize
    from NNVisualize import NNVisualize
    
    from sim.PendulumEnvState import PendulumEnvState
    from sim.PendulumEnv import PendulumEnv
    from sim.BallGame2DEnv import BallGame2DEnv 
    import time  
    
    settings = validateSettings(settings)

    train_forward_dynamics=True
    model_type= settings["model_type"]
    directory= getDataDirectory(settings)
    discrete_actions = np.array(settings['discrete_actions'])
    num_actions= discrete_actions.shape[0] # number of rows
    rounds = settings["rounds"]
    epochs = settings["epochs"]
    epsilon = settings["epsilon"]
    discount_factor=settings["discount_factor"]
    reward_bounds=np.array(settings["reward_bounds"])
    batch_size=settings["batch_size"]
    train_on_validation_set=settings["train_on_validation_set"]
    state_bounds = np.array(settings['state_bounds'])
    discrete_actions = np.array(settings['discrete_actions'])
    print ("Sim config file name: ", str(settings["sim_config_file"]))
    action_space_continuous=settings['action_space_continuous']
    if action_space_continuous:
        action_bounds = np.array(settings["action_bounds"], dtype=float)
    
    if action_space_continuous:
        experience = ExperienceMemory(len(state_bounds[0]), len(action_bounds[0]), settings['expereince_length'], continuous_actions=True, settings=settings)
    else:
        experience = ExperienceMemory(len(state_bounds[0]), 1, settings['expereince_length'])
    file_name=directory+getAgentName()+"expBufferInit.hdf5"
    experience.loadFromFile(file_name)
    state_bounds = experience._state_bounds
    action_bounds = experience._action_bounds
    reward_bounds = experience._reward_bounds
    
    output_experience_queue = multiprocessing.Queue(settings['queue_size_limit'])
    mgr = multiprocessing.Manager()
    namespace = mgr.Namespace()
    learning_workers = []
    for process in range(1):
        # this is the process that selects which game to play
        agent = LearningAgent(n_in=len(state_bounds[0]), n_out=len(action_bounds[0]), state_bounds=state_bounds, 
                          action_bounds=action_bounds, reward_bound=reward_bounds, settings_=settings)
        
        agent.setSettings(settings)
        
        lw = LearningWorker(output_experience_queue, agent, namespace)
        learning_workers.append(lw)  
    masterAgent = agent
    masterAgent.setExperience(experience)
    
    if action_space_continuous:
        model = createRLAgent(settings['agent_name'], state_bounds, action_bounds, reward_bounds, settings)
    else:
        model = createRLAgent(settings['agent_name'], state_bounds, discrete_actions, reward_bounds, settings)
    if ( not settings['load_saved_model'] ):
        model.setStateBounds(state_bounds)
        model.setActionBounds(action_bounds)
        model.setRewardBounds(reward_bounds)
    else: # continuation learning
        experience.setStateBounds(model.getStateBounds())
        experience.setRewardBounds(model.getRewardBounds())
        experience.setActionBounds(model.getActionBounds())
        
    
    if (settings['train_forward_dynamics']):
        print ("Created forward dynamics network")
        forwardDynamicsModel = createForwardDynamicsModel(settings, state_bounds, action_bounds, None, None)
        masterAgent.setForwardDynamics(forwardDynamicsModel)
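        # NOTE: 'actor' used below is not defined in this excerpt; it is
        # presumably created earlier via createActor() from util.SimulationUtil.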
        forwardDynamicsModel.setActor(actor)        
        forwardDynamicsModel.init(len(state_bounds[0]), len(action_bounds[0]), state_bounds, action_bounds, actor, None, settings)
        namespace.forwardNN = masterAgent.getForwardDynamics().getNetworkParameters()
        namespace.forwardDynamicsModel = forwardDynamicsModel
    
    ## Now everything related to the exp memory needs to be updated
    bellman_errors=[]
    dynamicsLosses=[]
    masterAgent.setPolicy(model)
    namespace.agentPoly = masterAgent.getPolicy().getNetworkParameters()
    namespace.model = model
    
    
    if (settings['visualize_learning']):
        rlv = NNVisualize(title=str(directory), settings=settings)
        rlv.setInteractive()
        rlv.init()
            
    if (settings['debug_critic']):
        criticLosses = []
        criticRegularizationCosts = [] 
        if (settings['visualize_learning']):
            critic_loss_viz = NNVisualize(title=str("Critic Loss") + " with " + str(settings["model_type"]))
            critic_loss_viz.setInteractive()
            critic_loss_viz.init()
            critic_regularization_viz = NNVisualize(title=str("Critic Regularization Cost") + " with " + str(settings["model_type"]))
            critic_regularization_viz.setInteractive()
            critic_regularization_viz.init()
        
    if (settings['debug_actor']):
        actorLosses = []
        actorRegularizationCosts = []            
        if (settings['visualize_learning']):
            actor_loss_viz = NNVisualize(title=str("Actor Loss") + " with " + str(settings["model_type"]))
            actor_loss_viz.setInteractive()
            actor_loss_viz.init()
            actor_regularization_viz = NNVisualize(title=str("Actor Regularization Cost") + " with " + str(settings["model_type"]))
            actor_regularization_viz.setInteractive()
            actor_regularization_viz.init()
                
    trainData = {}
    trainData["mean_reward"]=[]
    trainData["std_reward"]=[]
    trainData["mean_bellman_error"]=[]
    trainData["std_bellman_error"]=[]
    trainData["mean_discount_error"]=[]
    trainData["std_discount_error"]=[]
    trainData["mean_forward_dynamics_loss"]=[]
    trainData["std_forward_dynamics_loss"]=[]
    trainData["mean_eval"]=[]
    trainData["std_eval"]=[]
    trainData["mean_critic_loss"]=[]
    trainData["std_critic_loss"]=[]
    trainData["mean_critic_regularization_cost"]=[]
    trainData["std_critic_regularization_cost"]=[]
    trainData["mean_actor_loss"]=[]
    trainData["std_actor_loss"]=[]
    trainData["mean_actor_regularization_cost"]=[]
    trainData["std_actor_regularization_cost"]=[]
        
    best_dynamicsLosses=1000000
    _states, _actions, _result_states, _rewards, _falls, _G_ts = experience.get_batch(batch_size)
    for round_ in range(rounds):
        t0 = time.time()
        __states, __actions, __result_states, __rewards, __falls, __G_ts = experience.get_batch(100)
        for i in range(1):
            masterAgent.train(_states=__states, _actions=__actions, _rewards=__rewards, _result_states=__result_states, _falls=__falls)
        t1 = time.time()
        time_taken = t1 - t0
        if masterAgent.getExperience().samples() > batch_size:
            states, actions, result_states, rewards, falls, G_ts = masterAgent.getExperience().get_batch(batch_size)
            print ("Batch size: " + str(batch_size))
            error = masterAgent.bellman_error(states, actions, rewards, result_states, falls)
            bellman_errors.append(error)
            if (settings['debug_critic']):
                loss__ = masterAgent.getPolicy()._get_critic_loss() # uses previous call batch data
                criticLosses.append(loss__)
                regularizationCost__ = masterAgent.getPolicy()._get_critic_regularization()
                criticRegularizationCosts.append(regularizationCost__)
                
            if (settings['debug_actor']):
                loss__ = masterAgent.getPolicy()._get_actor_loss() # uses previous call batch data
                actorLosses.append(loss__)
                regularizationCost__ = masterAgent.getPolicy()._get_actor_regularization()
                actorRegularizationCosts.append(regularizationCost__)
            
            if not np.all(np.isfinite(error)):
                print ("States: " + str(states) + " ResultsStates: " + str(result_states) + " Rewards: " + str(rewards) + " Actions: " + str(actions) + " Falls: " + str(falls))
                print ("Bellman Error is NaN: " + str(error) + str(np.isfinite(error)))
                sys.exit()
            
            error = np.mean(np.fabs(error))
            if error > 10000:
                print ("Error to big: ")
                print (states, actions, rewards, result_states)
                
            if (settings['train_forward_dynamics']):
                dynamicsLoss = masterAgent.getForwardDynamics().bellman_error(states, actions, result_states, rewards)
                dynamicsLoss = np.mean(np.fabs(dynamicsLoss))
                dynamicsLosses.append(dynamicsLoss)
            if (settings['train_forward_dynamics']):
                print ("Round: " + str(round_) + " bellman error: " + str(error) + " ForwardPredictionLoss: " + str(dynamicsLoss) + " in " + str(time_taken) + " seconds")
            else:
                print ("Round: " + str(round_) + " bellman error: " + str(error) + " in " + str(time_taken) + " seconds")
           

        print ("Master agent experience size: " + str(masterAgent.getExperience().samples()))
        trainData["mean_bellman_error"].append(np.mean(np.fabs(bellman_errors)))
        trainData["std_bellman_error"].append(np.std(bellman_errors))
        if (settings['visualize_learning']):
            rlv.updateLoss(np.array(trainData["mean_bellman_error"]), np.array(trainData["std_bellman_error"]))
            rlv.redraw()
            rlv.setInteractiveOff()
            rlv.saveVisual(directory+"trainingGraphNN")
            rlv.setInteractive()
        # print "Error: " + str(error)
        if (settings['debug_critic']):
            mean_criticLosses = np.mean(criticLosses)
            std_criticLosses = np.std(criticLosses)
            trainData["mean_critic_loss"].append(mean_criticLosses)
            trainData["std_critic_loss"].append(std_criticLosses)
            criticLosses = []
            if (settings['visualize_learning']):
                critic_loss_viz.updateLoss(np.array(trainData["mean_critic_loss"]), np.array(trainData["std_critic_loss"]))
                critic_loss_viz.redraw()
                critic_loss_viz.setInteractiveOff()
                critic_loss_viz.saveVisual(directory+"criticLossGraph")
                critic_loss_viz.setInteractive()
            
            mean_criticRegularizationCosts = np.mean(criticRegularizationCosts)
            std_criticRegularizationCosts = np.std(criticRegularizationCosts)
            trainData["mean_critic_regularization_cost"].append(mean_criticRegularizationCosts)
            trainData["std_critic_regularization_cost"].append(std_criticRegularizationCosts)
            criticRegularizationCosts = []
            if (settings['visualize_learning']):
                critic_regularization_viz.updateLoss(np.array(trainData["mean_critic_regularization_cost"]), np.array(trainData["std_critic_regularization_cost"]))
                critic_regularization_viz.redraw()
                critic_regularization_viz.setInteractiveOff()
                critic_regularization_viz.saveVisual(directory+"criticRegularizationGraph")
                critic_regularization_viz.setInteractive()
            
        if (settings['debug_actor']):
            
            mean_actorLosses = np.mean(actorLosses)
            std_actorLosses = np.std(actorLosses)
            trainData["mean_actor_loss"].append(mean_actorLosses)
            trainData["std_actor_loss"].append(std_actorLosses)
            actorLosses = []
            if (settings['visualize_learning']):
                actor_loss_viz.updateLoss(np.array(trainData["mean_actor_loss"]), np.array(trainData["std_actor_loss"]))
                actor_loss_viz.redraw()
                actor_loss_viz.setInteractiveOff()
                actor_loss_viz.saveVisual(directory+"actorLossGraph")
                actor_loss_viz.setInteractive()
            
            mean_actorRegularizationCosts = np.mean(actorRegularizationCosts)
            std_actorRegularizationCosts = np.std(actorRegularizationCosts)
            trainData["mean_actor_regularization_cost"].append(mean_actorRegularizationCosts)
            trainData["std_actor_regularization_cost"].append(std_actorRegularizationCosts)
            actorRegularizationCosts = []
            if (settings['visualize_learning']):
                actor_regularization_viz.updateLoss(np.array(trainData["mean_actor_regularization_cost"]), np.array(trainData["std_actor_regularization_cost"]))
                actor_regularization_viz.redraw()
                actor_regularization_viz.setInteractiveOff()
                actor_regularization_viz.saveVisual(directory+"actorRegularizationGraph")
                actor_regularization_viz.setInteractive()
Example 3
    random.shuffle(arr)
    num_samples_to_keep = 300
    given_actions = []
    given_states = []
    for i in range(num_samples_to_keep):
        a = actions[arr[i]]
        action_ = np.array([a])
        given_actions.append(action_)
        state_ = np.array([states[arr[i]]])
        given_states.append(state_)
        # print "Action: " + str([actions[i]])
        experience.insert(state_, state_, action_, np.array([0]))

    errors = []
    for i in range(1000):
        _states, _actions, _result_states, _rewards, fals_, _G_ts, advantage = experience.get_batch(
            batch_size)
        # print ("Actions: ", _actions)
        # print ("States: ", _states)
        error = model.train(_states, _states, _result_states, _actions)
        errors.append(error)
        # print "Error: " + str(error)

    states = np.linspace(-5.0, 5.0, experience_length)
    actionsNoNoise = np.array(list(map(f, states)))
    # print ("Eval States: ", np.transpose(np.array([states])))

    # predicted_actions = np.array(map(model.predict , states, states))
    # predicted_actions = model.predict(np.transpose(np.array(states)), np.transpose(np.array(states)))

    predicted_actions = []
    for i in range(len(states)):
Example 4
    np.random.randn(nb_sample, 1),  # X
    np.ones(nb_sample),  # sample weights
    np.random.randint(1, size=[nb_sample, 1]),  # y
    0  # learning phase in TEST mode
]
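# `get_gradients` above is presumably a Keras backend function built earlier
# (not shown in this excerpt) that maps these four inputs -- X, sample weights,
# y, and the learning-phase flag -- to the gradients of the loss w.r.t. `weights`.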

print(list(zip(weights, get_gradients(inputs))))

# sys.exit()

from keras.callbacks import EarlyStopping
# early_stopping = EarlyStopping(monitor='val_loss', patience=2)

errors = []
for i in range(5000):
    _states, _actions, _result_states, _rewards, fals_, _G_ts, ext_act = experience.get_batch(
        batch_size)
    # scale_states = np.array(map(scale_state, _states, itertools.repeat(state_bounds, len(_states))))
    # tmp_actions = np.transpose(np.array([map(f, scale_states)]))
    # norm_actions = np.array(map(norm_action, tmp_actions, itertools.repeat(action_bounds, len(tmp_actions))))
    # print ("mini batch actions: " , tmp_actions, _actions)
    # print ("y diff: " ,  _actions - norm_actions)
    # error = model.train(_states, _actions)
    # errors.append(error)
    # print "Error: " + str(error)
    score = model.fit(_states,
                      _actions,
                      nb_epoch=1,
                      batch_size=32,
                      validation_data=(_states, _actions)
                      # callbacks=[early_stopping],
                      )
Example no. 5
        action_ = np.array([a])
        given_actions.append(action_)
        state_ = np.array([states_[arr[i]]])
        next_state_ = np.array([next_states_[arr[i]]])
        given_states.append(state_)
        # print "Action: " + str([actions[i]])
        experience.insert(state_, action_, next_state_, np.array([1]))
        # print ("Added tuple: ", i)

    errors = []
    for i in range(settings['rounds']):
        # print ("Actions: ", _actions)
        # print ("States: ", _states)
        # (error, lossActor) = model.train(_states, _actions, _result_states, _rewards)
        for j in range(1):
            _states, _actions, _result_states, _rewards, falls_, advantage, exp_actions__ = experience.get_batch(
                batch_size)
            error = model.trainCritic(_states, _actions, _result_states,
                                      _rewards)
        for j in range(5):
            _states, _actions, _result_states, _rewards, falls_, advantage, exp_actions__ = experience.get_batch(
                batch_size)
            lossActor = model.trainActor(_states, _actions, _result_states,
                                         _rewards)
        errors.append(error)
        if (i % 100 == 0):
            print("Iteration: ", i)
            print("discriminator loss: ", error, " generator loss: ",
                  lossActor)
        # print "Error: " + str(error)

    # states = np.linspace(-5.0, 5.0, experience_length)
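
Example no. 5 runs one discriminator (critic) update for every five generator (actor) updates per round. A stripped-down, runnable sketch of that schedule with stand-in buffer and model classes (the `trainCritic`/`trainActor` names follow the snippet; everything else here is hypothetical):

import numpy as np

class ToyBuffer:
    """Stand-in for ExperienceMemory: returns random (state, action) minibatches."""
    def __init__(self, n=256, state_dim=2, action_dim=1):
        self._s = np.random.randn(n, state_dim)
        self._a = np.random.randn(n, action_dim)
    def get_batch(self, batch_size):
        idx = np.random.randint(0, len(self._s), size=batch_size)
        return self._s[idx], self._a[idx]

class ToyGAN:
    """Stand-in exposing the trainCritic/trainActor interface used above."""
    def trainCritic(self, states, actions):
        return float(np.mean(states ** 2))   # placeholder "discriminator loss"
    def trainActor(self, states, actions):
        return float(np.mean(actions ** 2))  # placeholder "generator loss"

experience, model, batch_size = ToyBuffer(), ToyGAN(), 32
errors = []
for i in range(500):
    # one critic (discriminator) step ...
    s, a = experience.get_batch(batch_size)
    error = model.trainCritic(s, a)
    # ... followed by five actor (generator) steps, as in the example above
    for _ in range(5):
        s, a = experience.get_batch(batch_size)
        lossActor = model.trainActor(s, a)
    errors.append(error)
    if i % 100 == 0:
        print("Iteration:", i, "discriminator loss:", error, "generator loss:", lossActor)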
Example no. 6
model.add(Dense(64, init='uniform')) 
model.add(Activation('relu'))
# 1 output, linear activation
model.add(Dense(1, init='uniform'))
model.add(Activation('linear'))

sgd = SGD(lr=0.01, momentum=0.9)
print ("Clipping: ", sgd.decay)
model.compile(loss='mse', optimizer=sgd)

from keras.callbacks import EarlyStopping
# early_stopping = EarlyStopping(monitor='val_loss', patience=2)

errors=[]
for i in range(5000):
    _states, _actions, _result_states, _rewards, fals_ = experience.get_batch(batch_size)
    # scale_states = np.array(map(scale_state, _states, itertools.repeat(state_bounds, len(_states))))
    # tmp_actions = np.transpose(np.array([map(f, scale_states)]))
    # norm_actions = np.array(map(norm_action, tmp_actions, itertools.repeat(action_bounds, len(tmp_actions))))
    # print ("mini batch actions: " , tmp_actions, _actions)
    # print ("y diff: " ,  _actions - norm_actions) 
    # error = model.train(_states, _actions)
    # errors.append(error)
    # print "Error: " + str(error)
    score = model.fit(_states, _actions,
              nb_epoch=1, batch_size=32,
              validation_data=(_states, _actions)
              # callbacks=[early_stopping],
              )

    errors.extend(score.history['loss'])
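
Example no. 6 uses the Keras 1 argument names (`init=`, `lr=`, `nb_epoch=`). The same layer stack under the current `tf.keras` API is shown below for comparison; the `input_shape` is only an assumption to make the sketch self-contained, since the original input layer sits above the truncation point:

from tensorflow import keras
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import SGD

model = keras.Sequential()
model.add(Dense(64, kernel_initializer='random_uniform', input_shape=(1,)))  # Keras 1 'init' -> 'kernel_initializer'
model.add(Activation('relu'))
# 1 output, linear activation
model.add(Dense(1, kernel_initializer='random_uniform'))
model.add(Activation('linear'))

sgd = SGD(learning_rate=0.01, momentum=0.9)  # Keras 1 'lr' -> 'learning_rate'
model.compile(loss='mse', optimizer=sgd)
# model.fit(..., epochs=1, ...)              # Keras 1 'nb_epoch' -> 'epochs'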
Example no. 7
def fitModelToData(settingsFileName):
    """
    State is the input state and Action is the desired output (y).
    """
    # from model.ModelUtil import *

    file = open(settingsFileName)
    settings = json.load(file)
    print("Settings: " + str(json.dumps(settings)))
    file.close()
    import os
    os.environ['THEANO_FLAGS'] = "mode=FAST_RUN,device=" + settings[
        'training_processor_type'] + ",floatX=" + settings['float_type']

    ## Theano needs to be imported after the flags are set.
    # from ModelEvaluation import *
    # from model.ModelUtil import *
    # print ( "theano.config.mode: ", theano.config.mode)
    from ModelEvaluation import SimWorker, evalModelParrallel, collectExperience, simEpoch, evalModel
    from model.ModelUtil import validBounds
    from model.LearningAgent import LearningAgent, LearningWorker
    from util.SimulationUtil import validateSettings, createEnvironment, createRLAgent, createActor
    from util.SimulationUtil import getDataDirectory, createForwardDynamicsModel, createSampler

    from util.ExperienceMemory import ExperienceMemory
    from RLVisualize import RLVisualize
    from NNVisualize import NNVisualize

    from sim.PendulumEnvState import PendulumEnvState
    from sim.PendulumEnv import PendulumEnv
    from sim.BallGame2DEnv import BallGame2DEnv
    import time

    settings = validateSettings(settings)

    # anchor_data_file = open(settings["anchor_file"])
    # _anchors = getAnchors(anchor_data_file)
    # print ("Length of anchors epochs: ", str(len(_anchors)))
    # anchor_data_file.close()
    train_forward_dynamics = True
    model_type = settings["model_type"]
    directory = getDataDirectory(settings)
    discrete_actions = np.array(settings['discrete_actions'])
    num_actions = discrete_actions.shape[0]  # number of rows
    rounds = settings["rounds"]
    epochs = settings["epochs"]
    # num_states=settings["num_states"]
    epsilon = settings["epsilon"]
    discount_factor = settings["discount_factor"]
    # max_reward=settings["max_reward"]
    reward_bounds = np.array(settings["reward_bounds"])
    batch_size = settings["batch_size"]
    train_on_validation_set = settings["train_on_validation_set"]
    state_bounds = np.array(settings['state_bounds'])
    discrete_actions = np.array(settings['discrete_actions'])
    print("Sim config file name: ", str(settings["sim_config_file"]))
    # c = characterSim.Configuration(str(settings["sim_config_file"]))
    # c = characterSim.Configuration("../data/epsilon0Config.ini")
    action_space_continuous = settings['action_space_continuous']
    # states2 = np.transpose(np.repeat([states], 2, axis=0))
    # print states2
    if action_space_continuous:
        action_bounds = np.array(settings["action_bounds"], dtype=float)

    if action_space_continuous:
        experience = ExperienceMemory(len(state_bounds[0]),
                                      len(action_bounds[0]),
                                      settings['expereince_length'],
                                      continuous_actions=True,
                                      settings=settings)
    else:
        experience = ExperienceMemory(len(state_bounds[0]), 1,
                                      settings['expereince_length'])
    file_name = directory + getAgentName() + "expBufferInit.hdf5"
    # experience.saveToFile(file_name)
    experience.loadFromFile(file_name)
    state_bounds = experience._state_bounds
    action_bounds = experience._action_bounds
    reward_bounds = experience._reward_bounds

    output_experience_queue = multiprocessing.Queue(
        settings['queue_size_limit'])
    mgr = multiprocessing.Manager()
    namespace = mgr.Namespace()
    learning_workers = []
    # for process in range(settings['num_available_threads']):
    for process in range(1):
        # this is the process that selects which game to play
        agent = LearningAgent(n_in=len(state_bounds[0]),
                              n_out=len(action_bounds[0]),
                              state_bounds=state_bounds,
                              action_bounds=action_bounds,
                              reward_bound=reward_bounds,
                              settings_=settings)

        agent.setSettings(settings)
        """
        if action_space_continuous:
            model = createRLAgent(settings['agent_name'], state_bounds, action_bounds, reward_bounds, settings)
        else:
            model = createRLAgent(settings['agent_name'], state_bounds, discrete_actions, reward_bounds, settings)
        model.setStateBounds(state_bounds)
        model.setActionBounds(action_bounds)
        model.setRewardBounds(reward_bounds)
        """
        # agent.setPolicy(model)
        # actor.setPolicy(model)
        # agent.setExperience(experience)
        # namespace.agentPoly = agent.getPolicy().getNetworkParameters()
        # namespace.experience = experience

        lw = LearningWorker(output_experience_queue, agent, namespace)
        # lw.start()
        learning_workers.append(lw)
    masterAgent = agent
    masterAgent.setExperience(experience)

    if action_space_continuous:
        model = createRLAgent(settings['agent_name'], state_bounds,
                              action_bounds, reward_bounds, settings)
    else:
        model = createRLAgent(settings['agent_name'], state_bounds,
                              discrete_actions, reward_bounds, settings)
    if (not settings['load_saved_model']):
        model.setStateBounds(state_bounds)
        model.setActionBounds(action_bounds)
        model.setRewardBounds(reward_bounds)
    else:  # continuation learning
        experience.setStateBounds(model.getStateBounds())
        experience.setRewardBounds(model.getRewardBounds())
        experience.setActionBounds(model.getActionBounds())

    if (settings['train_forward_dynamics']):
        print("Created forward dynamics network")
        # forwardDynamicsModel = ForwardDynamicsNetwork(state_length=len(state_bounds[0]),action_length=len(action_bounds[0]), state_bounds=state_bounds, action_bounds=action_bounds, settings_=settings)
        forwardDynamicsModel = createForwardDynamicsModel(
            settings, state_bounds, action_bounds, None, None)
        masterAgent.setForwardDynamics(forwardDynamicsModel)
        forwardDynamicsModel.setActor(actor)  # NOTE: 'actor' is assumed to be created earlier (e.g. via the imported createActor); it is not defined in this truncated snippet
        # forwardDynamicsModel.setEnvironment(exp)
        forwardDynamicsModel.init(len(state_bounds[0]), len(action_bounds[0]),
                                  state_bounds, action_bounds, actor, None,
                                  settings)
        namespace.forwardNN = masterAgent.getForwardDynamics(
        ).getNetworkParameters()
        # actor.setForwardDynamicsModel(forwardDynamicsModel)
        namespace.forwardDynamicsModel = forwardDynamicsModel

    ## Now everything related to the exp memory needs to be updated
    bellman_errors = []
    masterAgent.setPolicy(model)
    # masterAgent.setForwardDynamics(forwardDynamicsModel)
    namespace.agentPoly = masterAgent.getPolicy().getNetworkParameters()
    namespace.model = model
    # experience = ExperienceMemory(len(state_bounds[0]), len(action_bounds[0]), experience_length, continuous_actions=True)
    """
    for i in range(experience_length):
        action_ = np.array([actions[i]])
        state_ = np.array([states[i]])
        # print "Action: " + str([actions[i]])
        experience.insert(norm_state(state_, state_bounds), norm_action(action_, action_bounds),
                           norm_state(state_, state_bounds), norm_reward(np.array([0]), reward_bounds))
    """

    if (settings['visualize_learning']):
        rlv = NNVisualize(title=str(directory), settings=settings)
        rlv.setInteractive()
        rlv.init()

    if (settings['debug_critic']):
        criticLosses = []
        criticRegularizationCosts = []
        if (settings['visualize_learning']):
            critic_loss_viz = NNVisualize(title=str("Critic Loss") + " with " +
                                          str(settings["model_type"]))
            critic_loss_viz.setInteractive()
            critic_loss_viz.init()
            critic_regularization_viz = NNVisualize(
                title=str("Critic Regularization Cost") + " with " +
                str(settings["model_type"]))
            critic_regularization_viz.setInteractive()
            critic_regularization_viz.init()

    if (settings['debug_actor']):
        actorLosses = []
        actorRegularizationCosts = []
        if (settings['visualize_learning']):
            actor_loss_viz = NNVisualize(title=str("Actor Loss") + " with " +
                                         str(settings["model_type"]))
            actor_loss_viz.setInteractive()
            actor_loss_viz.init()
            actor_regularization_viz = NNVisualize(
                title=str("Actor Regularization Cost") + " with " +
                str(settings["model_type"]))
            actor_regularization_viz.setInteractive()
            actor_regularization_viz.init()

    trainData = {}
    trainData["mean_reward"] = []
    trainData["std_reward"] = []
    trainData["mean_bellman_error"] = []
    trainData["std_bellman_error"] = []
    trainData["mean_discount_error"] = []
    trainData["std_discount_error"] = []
    trainData["mean_forward_dynamics_loss"] = []
    trainData["std_forward_dynamics_loss"] = []
    trainData["mean_eval"] = []
    trainData["std_eval"] = []
    trainData["mean_critic_loss"] = []
    trainData["std_critic_loss"] = []
    trainData["mean_critic_regularization_cost"] = []
    trainData["std_critic_regularization_cost"] = []
    trainData["mean_actor_loss"] = []
    trainData["std_actor_loss"] = []
    trainData["mean_actor_regularization_cost"] = []
    trainData["std_actor_regularization_cost"] = []

    dynamicsLosses = []  # appended to below when 'train_forward_dynamics' is enabled
    best_dynamicsLosses = 1000000
    _states, _actions, _result_states, _rewards, _falls, _G_ts = experience.get_batch(
        batch_size)
    """
    _states = theano.shared(np.array(_states, dtype=theano.config.floatX))
    _actions = theano.shared(np.array(_actions, dtype=theano.config.floatX))
    _result_states = theano.shared(np.array(_result_states, dtype=theano.config.floatX))
    _rewards = theano.shared(np.array(_rewards, dtype=theano.config.floatX))
    """
    for round_ in range(rounds):
        t0 = time.time()
        # out = simEpoch(actor, exp_val, masterAgent, discount_factor, anchors=epoch, action_space_continuous=action_space_continuous, settings=settings,
        #                print_data=False, p=1.0, validation=False, epoch=epoch, evaluation=False, _output_queue=None )
        # (tuples, discounted_sum, q_value, evalData) = out
        # (__states, __actions, __result_states, __rewards, __falls, __G_ts) = tuples
        __states, __actions, __result_states, __rewards, __falls, __G_ts = experience.get_batch(
            100)
        # print("**** training states: ", np.array(__states).shape)
        # print("**** training __result_states: ", np.array(__result_states).shape)
        # print ("Actions before: ", __actions)
        for i in range(1):
            masterAgent.train(_states=__states,
                              _actions=__actions,
                              _rewards=__rewards,
                              _result_states=__result_states,
                              _falls=__falls)
        t1 = time.time()
        time_taken = t1 - t0
        if masterAgent.getExperience().samples() > batch_size:
            states, actions, result_states, rewards, falls, G_ts = masterAgent.getExperience(
            ).get_batch(batch_size)
            print("Batch size: " + str(batch_size))
            error = masterAgent.bellman_error(states, actions, rewards,
                                              result_states, falls)
            bellman_errors.append(error)
            if (settings['debug_critic']):
                loss__ = masterAgent.getPolicy()._get_critic_loss(
                )  # uses previous call batch data
                criticLosses.append(loss__)
                regularizationCost__ = masterAgent.getPolicy(
                )._get_critic_regularization()
                criticRegularizationCosts.append(regularizationCost__)

            if (settings['debug_actor']):
                """
                print( "Advantage: ", masterAgent.getPolicy()._get_advantage())
                print("Policy prob: ", masterAgent.getPolicy()._q_action())
                print("Policy log prob: ", masterAgent.getPolicy()._get_log_prob())
                print( "Actor loss: ", masterAgent.getPolicy()._get_action_diff())
                """
                loss__ = masterAgent.getPolicy()._get_actor_loss(
                )  # uses previous call batch data
                actorLosses.append(loss__)
                regularizationCost__ = masterAgent.getPolicy(
                )._get_actor_regularization()
                actorRegularizationCosts.append(regularizationCost__)

            if not np.all(np.isfinite(error)):
                print(
                    "States: " + str(states) + " ResultsStates: " +
                    str(result_states) + " Rewards: " + str(rewards) +
                    " Actions: " + str(actions) + " Falls: ", str(falls))
                print("Bellman Error is Nan: " + str(error) +
                      str(np.isfinite(error)))
                sys.exit()

            error = np.mean(np.fabs(error))
            if error > 10000:
                print("Error to big: ")
                print(states, actions, rewards, result_states)

            if (settings['train_forward_dynamics']):
                dynamicsLoss = masterAgent.getForwardDynamics().bellman_error(
                    states, actions, result_states, rewards)
                dynamicsLoss = np.mean(np.fabs(dynamicsLoss))
                dynamicsLosses.append(dynamicsLoss)
            if (settings['train_forward_dynamics']):
                print("Round: " + str(round_) + " bellman error: " +
                      str(error) + " ForwardPredictionLoss: " +
                      str(dynamicsLoss) + " in " + str(time_taken) +
                      " seconds")
            else:
                print("Round: " + str(round_) + " bellman error: " +
                      str(error) + " in " + str(time_taken) + " seconds")
            # discounted_values.append(discounted_sum)

        print("Master agent experience size: " +
              str(masterAgent.getExperience().samples()))
        # print ("**** Master agent experience size: " + str(learning_workers[0]._agent._expBuff.samples()))
        # masterAgent.getPolicy().setNetworkParameters(namespace.agentPoly)
        # masterAgent.setExperience(learningNamespace.experience)
        # if (settings['train_forward_dynamics']):
        #     masterAgent.getForwardDynamics().setNetworkParameters(namespace.forwardNN)
        """
        for sw in sim_workers: # Should update these more often?
            sw._model.getPolicy().setNetworkParameters(namespace.agentPoly)
            if (settings['train_forward_dynamics']):
                sw._model.getForwardDynamics().setNetworkParameters(namespace.forwardNN)
                """
        # experience = learningNamespace.experience
        # actor.setExperience(experience)
        """
        pr.disable()
        f = open('x.prof', 'a')
        pstats.Stats(pr, stream=f).sort_stats('time').print_stats()
        f.close()
        """
        trainData["mean_bellman_error"].append(np.mean(
            np.fabs(bellman_errors)))
        trainData["std_bellman_error"].append(np.std(bellman_errors))
        if (settings['visualize_learning']):
            rlv.updateLoss(np.array(trainData["mean_bellman_error"]),
                           np.array(trainData["std_bellman_error"]))
            rlv.redraw()
            rlv.setInteractiveOff()
            rlv.saveVisual(directory + "trainingGraphNN")
            rlv.setInteractive()
        # print "Error: " + str(error)
        if (settings['debug_critic']):
            mean_criticLosses = np.mean(criticLosses)
            std_criticLosses = np.std(criticLosses)
            trainData["mean_critic_loss"].append(mean_criticLosses)
            trainData["std_critic_loss"].append(std_criticLosses)
            criticLosses = []
            if (settings['visualize_learning']):
                critic_loss_viz.updateLoss(
                    np.array(trainData["mean_critic_loss"]),
                    np.array(trainData["std_critic_loss"]))
                critic_loss_viz.redraw()
                critic_loss_viz.setInteractiveOff()
                critic_loss_viz.saveVisual(directory + "criticLossGraph")
                critic_loss_viz.setInteractive()

            mean_criticRegularizationCosts = np.mean(criticRegularizationCosts)
            std_criticRegularizationCosts = np.std(criticRegularizationCosts)
            trainData["mean_critic_regularization_cost"].append(
                mean_criticRegularizationCosts)
            trainData["std_critic_regularization_cost"].append(
                std_criticRegularizationCosts)
            criticRegularizationCosts = []
            if (settings['visualize_learning']):
                critic_regularization_viz.updateLoss(
                    np.array(trainData["mean_critic_regularization_cost"]),
                    np.array(trainData["std_critic_regularization_cost"]))
                critic_regularization_viz.redraw()
                critic_regularization_viz.setInteractiveOff()
                critic_regularization_viz.saveVisual(
                    directory + "criticRegularizationGraph")
                critic_regularization_viz.setInteractive()

        if (settings['debug_actor']):

            mean_actorLosses = np.mean(actorLosses)
            std_actorLosses = np.std(actorLosses)
            trainData["mean_actor_loss"].append(mean_actorLosses)
            trainData["std_actor_loss"].append(std_actorLosses)
            actorLosses = []
            if (settings['visualize_learning']):
                actor_loss_viz.updateLoss(
                    np.array(trainData["mean_actor_loss"]),
                    np.array(trainData["std_actor_loss"]))
                actor_loss_viz.redraw()
                actor_loss_viz.setInteractiveOff()
                actor_loss_viz.saveVisual(directory + "actorLossGraph")
                actor_loss_viz.setInteractive()

            mean_actorRegularizationCosts = np.mean(actorRegularizationCosts)
            std_actorRegularizationCosts = np.std(actorRegularizationCosts)
            trainData["mean_actor_regularization_cost"].append(
                mean_actorRegularizationCosts)
            trainData["std_actor_regularization_cost"].append(
                std_actorRegularizationCosts)
            actorRegularizationCosts = []
            if (settings['visualize_learning']):
                actor_regularization_viz.updateLoss(
                    np.array(trainData["mean_actor_regularization_cost"]),
                    np.array(trainData["std_actor_regularization_cost"]))
                actor_regularization_viz.redraw()
                actor_regularization_viz.setInteractiveOff()
                actor_regularization_viz.saveVisual(directory +
                                                    "actorRegularizationGraph")
                actor_regularization_viz.setInteractive()
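
The debug bookkeeping above repeats the same steps for the critic and the actor: average the collected losses, append the mean and standard deviation to `trainData`, and clear the list. A small, hypothetical helper, not part of the original code, that captures that pattern:

import numpy as np

def append_loss_stats(trainData, key, losses):
    """Append mean/std of `losses` under 'mean_<key>'/'std_<key>' and return a fresh list."""
    trainData["mean_" + key].append(np.mean(losses))
    trainData["std_" + key].append(np.std(losses))
    return []

# usage, mirroring the debug_critic / debug_actor blocks above:
# criticLosses = append_loss_stats(trainData, "critic_loss", criticLosses)
# actorLosses  = append_loss_stats(trainData, "actor_loss", actorLosses)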
Example no. 8
def trainForwardDynamics(settingsFileName):
    """
    State is the input state and Action is the desired output (y).
    """
    # from model.ModelUtil import *

    np.random.seed(23)
    file = open(settingsFileName)
    settings = json.load(file)
    print("Settings: ", str(json.dumps(settings)))
    file.close()
    import os
    os.environ['THEANO_FLAGS'] = "mode=FAST_RUN,device=" + settings[
        'training_processor_type'] + ",floatX=" + settings['float_type']

    # import theano
    # from theano import tensor as T
    # import lasagne
    from util.SimulationUtil import validateSettings
    from util.SimulationUtil import getDataDirectory
    from util.SimulationUtil import createForwardDynamicsModel, createRLAgent
    from model.NeuralNetwork import NeuralNetwork
    from util.ExperienceMemory import ExperienceMemory
    import matplotlib.pyplot as plt
    import math
    # from ModelEvaluation import *
    # from util.SimulationUtil import *
    import time
    import datetime  # used below when printing the per-round training time
    import dill      # used below to pickle the best forward-dynamics model

    settings = validateSettings(settings)

    # anchor_data_file = open(settings["anchor_file"])
    # _anchors = getAnchors(anchor_data_file)
    # print ("Length of anchors epochs: ", str(len(_anchors)))
    # anchor_data_file.close()
    train_forward_dynamics = True
    model_type = settings["model_type"]
    directory = getDataDirectory(settings)

    if not os.path.exists(directory):
        os.makedirs(directory)

    if (settings['train_forward_dynamics']):
        if "." in settings['forward_dynamics_model_type']:
            ### convert . to / and copy file over
            file_name = settings['forward_dynamics_model_type']
            k = file_name.rfind(".")
            file_name = file_name[:k]
            file_name_read = file_name.replace(".", "/")
            file_name_read = file_name_read + ".py"
            print("model file name:", file_name)
            print("os.path.basename(file_name): ", os.path.basename(file_name))
            file = open(file_name_read, 'r')
            out_file = open(directory + file_name + ".py", 'w')
            out_file.write(file.read())
            file.close()
            out_file.close()

    discrete_actions = np.array(settings['discrete_actions'])
    num_actions = discrete_actions.shape[0]  # number of rows
    rounds = settings["rounds"]
    epochs = settings["epochs"]
    # num_states=settings["num_states"]
    epsilon = settings["epsilon"]
    discount_factor = settings["discount_factor"]
    # max_reward=settings["max_reward"]
    reward_bounds = np.array([[-10.1], [0.0]])
    batch_size = settings["batch_size"]
    train_on_validation_set = settings["train_on_validation_set"]
    state_bounds = np.array(settings['state_bounds'])
    discrete_actions = np.array(settings['discrete_actions'])
    print("Sim config file name: ", str(settings["sim_config_file"]))
    # c = characterSim.Configuration(str(settings["sim_config_file"]))
    # c = characterSim.Configuration("../data/epsilon0Config.ini")
    action_space_continuous = settings['action_space_continuous']
    # states2 = np.transpose(np.repeat([states], 2, axis=0))
    # print states2
    if action_space_continuous:
        action_bounds = np.array(settings["action_bounds"], dtype=float)

    if action_space_continuous:
        experience = ExperienceMemory(len(state_bounds[0]),
                                      len(action_bounds[0]),
                                      settings['expereince_length'],
                                      continuous_actions=True,
                                      settings=settings)
    else:
        experience = ExperienceMemory(len(state_bounds[0]), 1,
                                      settings['expereince_length'])
    experience.setSettings(settings)
    file_name = directory + getAgentName() + "expBufferInit.hdf5"
    # experience.saveToFile(file_name)
    experience.loadFromFile(file_name)
    state_bounds = experience._state_bounds
    print("Samples in experience: ", experience.samples())

    if (settings['train_forward_dynamics']):
        if (settings['forward_dynamics_model_type'] == "SingleNet"):
            print(
                "Creating forward dynamics network: Using single network model"
            )
            model = createRLAgent(settings['agent_name'], state_bounds,
                                  discrete_actions, reward_bounds, settings)
            forwardDynamicsModel = createForwardDynamicsModel(settings,
                                                              state_bounds,
                                                              action_bounds,
                                                              None,
                                                              None,
                                                              agentModel=model)
            # forwardDynamicsModel = model
        else:
            print("Creating forward dynamics network")
            # forwardDynamicsModel = ForwardDynamicsNetwork(state_length=len(state_bounds[0]),action_length=len(action_bounds[0]), state_bounds=state_bounds, action_bounds=action_bounds, settings_=settings)
            forwardDynamicsModel = createForwardDynamicsModel(settings,
                                                              state_bounds,
                                                              action_bounds,
                                                              None,
                                                              None,
                                                              agentModel=None)
        if settings['visualize_learning']:
            from NNVisualize import NNVisualize
            title = file_name = settings['forward_dynamics_model_type']
            k = title.rfind(".") + 1
            if (k > len(title)):  ## name does not contain a .
                k = 0
            file_name = file_name[k:]
            nlv = NNVisualize(title=str("Forward Dynamics Model") + " with " +
                              str(file_name))
            nlv.setInteractive()
            nlv.init()
    if (settings['train_reward_predictor']):
        if settings['visualize_learning']:
            rewardlv = NNVisualize(title=str("Reward Model") + " with " +
                                   str(settings["model_type"]),
                                   settings=settings)
            rewardlv.setInteractive()
            rewardlv.init()

    # experience = ExperienceMemory(len(state_bounds[0]), len(action_bounds[0]), experience_length, continuous_actions=True)
    """
    for i in range(experience_length):
        action_ = np.array([actions[i]])
        state_ = np.array([states[i]])
        # print "Action: " + str([actions[i]])
        experience.insert(norm_state(state_, state_bounds), norm_action(action_, action_bounds),
                           norm_state(state_, state_bounds), norm_reward(np.array([0]), reward_bounds))
    """
    trainData = {}
    trainData["mean_reward"] = []
    trainData["std_reward"] = []
    trainData["mean_bellman_error"] = []
    trainData["std_bellman_error"] = []
    trainData["mean_discount_error"] = []
    trainData["std_discount_error"] = []
    trainData["mean_forward_dynamics_loss"] = []
    trainData["std_forward_dynamics_loss"] = []
    trainData["mean_forward_dynamics_reward_loss"] = []
    trainData["std_forward_dynamics_reward_loss"] = []
    trainData["mean_eval"] = []
    trainData["std_eval"] = []
    # dynamicsLosses=[]
    best_dynamicsLosses = 1000000
    _states, _actions, _result_states, _rewards, _falls, _G_ts, exp_actions__ = experience.get_batch(
        batch_size)
    """
    _states = theano.shared(np.array(_states, dtype=theano.config.floatX))
    _actions = theano.shared(np.array(_actions, dtype=theano.config.floatX))
    _result_states = theano.shared(np.array(_result_states, dtype=theano.config.floatX))
    _rewards = theano.shared(np.array(_rewards, dtype=theano.config.floatX))
    """
    forwardDynamicsModel.setData(_states, _actions, _result_states)
    for round_ in range(rounds):
        t0 = time.time()
        for epoch in range(epochs):
            _states, _actions, _result_states, _rewards, _falls, _G_ts, exp_actions__ = experience.get_batch(
                batch_size)
            # print _actions
            # dynamicsLoss = forwardDynamicsModel.train(states=_states, actions=_actions, result_states=_result_states)
            # forwardDynamicsModel.setData(_states, _actions, _result_states)
            dynamicsLoss = forwardDynamicsModel.train(_states, _actions,
                                                      _result_states, _rewards)
            # dynamicsLoss = forwardDynamicsModel._train()
        t1 = time.time()
        if (round_ % settings['plotting_update_freq_num_rounds']) == 0:
            dynamicsLoss_ = forwardDynamicsModel.bellman_error(
                _states, _actions, _result_states, _rewards)
            # dynamicsLoss_ = forwardDynamicsModel.bellman_error((_states), (_actions), (_result_states))
            if (settings['use_stochastic_forward_dynamics']):
                dynamicsLoss = np.mean(dynamicsLoss_)
            else:
                dynamicsLoss = np.mean(np.fabs(dynamicsLoss_))
            if (settings['train_reward_predictor']):
                dynamicsRewardLoss_ = forwardDynamicsModel.reward_error(
                    _states, _actions, _result_states, _rewards)
                dynamicsRewardLoss = np.mean(np.fabs(dynamicsRewardLoss_))
                # dynamicsRewardLosses.append(dynamicsRewardLoss)
                dynamicsRewardLosses = dynamicsRewardLoss
            if (settings['train_forward_dynamics'] and
                ((round_ % settings['plotting_update_freq_num_rounds']) == 0)):
                # dynamicsLosses.append(dynamicsLoss)
                mean_dynamicsLosses = dynamicsLoss
                std_dynamicsLosses = np.std((dynamicsLoss_))
                if (settings['train_forward_dynamics']):
                    trainData["mean_forward_dynamics_loss"].append(
                        mean_dynamicsLosses)
                    trainData["std_forward_dynamics_loss"].append(
                        std_dynamicsLosses)
                print("Round: " + str(round_) + " Epoch: " + str(epoch) +
                      " ForwardPredictionLoss: " + str(dynamicsLoss) + " in " +
                      str(datetime.timedelta(seconds=(t1 - t0))) + " seconds")
                # print ("State Bounds: ", forwardDynamicsModel.getStateBounds(), " exp: ", experience.getStateBounds())
                # print ("Action Bounds: ", forwardDynamicsModel.getActionBounds(), " exp: ", experience.getActionBounds())
                # print (str(datetime.timedelta(seconds=(t1-t0))))
                if (settings['visualize_learning']):
                    nlv.updateLoss(
                        np.array(trainData["mean_forward_dynamics_loss"]),
                        np.array(trainData["std_forward_dynamics_loss"]))
                    nlv.redraw()
                    nlv.setInteractiveOff()
                    nlv.saveVisual(directory + "trainingGraphNN")
                    nlv.setInteractive()
            if (settings['train_reward_predictor']):
                mean_dynamicsRewardLosses = np.mean(dynamicsRewardLoss)
                std_dynamicsRewardLosses = np.std(dynamicsRewardLoss_)
                dynamicsRewardLosses = []
                trainData["mean_forward_dynamics_reward_loss"].append(
                    mean_dynamicsRewardLosses)
                trainData["std_forward_dynamics_reward_loss"].append(
                    std_dynamicsRewardLosses)
            if (settings['train_reward_predictor']
                    and settings['visualize_learning']):
                rewardlv.updateLoss(
                    np.array(trainData["mean_forward_dynamics_reward_loss"]),
                    np.array(trainData["std_forward_dynamics_reward_loss"]))
                rewardlv.redraw()
                rewardlv.setInteractiveOff()
                rewardlv.saveVisual(directory + "rewardTrainingGraph")
                rewardlv.setInteractive()

        if (round_ % settings['saving_update_freq_num_rounds']) == 0:
            if mean_dynamicsLosses < best_dynamicsLosses:
                best_dynamicsLosses = mean_dynamicsLosses
                print("Saving BEST current forward dynamics model: " +
                      str(best_dynamicsLosses))
                file_name_dynamics = directory + "forward_dynamics_" + "_Best_pretrain.pkl"
                f = open(file_name_dynamics, 'wb')
                dill.dump(forwardDynamicsModel, f)
                f.close()

            if settings['save_trainData']:
                fp = open(
                    directory + "FD_trainingData_" +
                    str(settings['agent_name']) + ".json", 'w')
                # print ("Train data: ", trainData)
                ## because json does not serialize np.float32
                for key in trainData:
                    trainData[key] = [float(i) for i in trainData[key]]
                json.dump(trainData, fp)
                fp.close()
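
The save block above converts every value to a plain Python `float` because `json.dump` cannot serialize NumPy scalars. An equivalent approach, sketched here rather than taken from the original code, is to pass a `default` hook to `json.dump`:

import json
import numpy as np

def _to_plain(o):
    """Fallback encoder for NumPy scalars/arrays when dumping training stats."""
    if isinstance(o, np.ndarray):
        return o.tolist()
    if isinstance(o, (np.floating, np.integer)):
        return o.item()
    raise TypeError("Object of type %s is not JSON serializable" % type(o).__name__)

# json.dump(trainData, fp, default=_to_plain)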