Code example #1
File: AlgorithmInterface.py  Project: skylbc/SMBAE
    def q_value(self, state):
        """
            For returning a vector of q values, state should NOT be normalized
        """
        # states = np.zeros((self._batch_size, self._state_length), dtype=self._settings['float_type'])
        # states[0, ...] = state
        """
        if ( ('disable_parameter_scaling' in self._settings) and (self._settings['disable_parameter_scaling'])):
            pass
        else:
        """
        # print ("Agent state bounds: ", self._state_bounds)
        state = norm_state(state, self._state_bounds)
        # print ("Agent normalized state: ", state)
        state = np.array(state, dtype=self._settings['float_type'])
        self._model.setStates(state)
        self._modelTarget.setStates(state)
        if (('disable_parameter_scaling' in self._settings)
                and (self._settings['disable_parameter_scaling'])):
            value = scale_reward(self._q_val(), self.getRewardBounds()) * (
                1.0 / (1.0 - self.getSettings()['discount_factor']))
            # return (self._q_val())[0]
        else:
            value = scale_reward(self._q_val(), self.getRewardBounds()) * (
                1.0 / (1.0 - self.getSettings()['discount_factor']))
#         print ("Agent scaled value: ", value)
        return value
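All of the q_value variants in this listing lean on two helpers that are not shown, norm_state and scale_reward, and they multiply the rescaled output by 1.0 / (1.0 - discount_factor): a per-step reward normalized into [-1, 1] accumulates to at most 1 / (1 - gamma) under discounting, so that factor maps the network's normalized value back onto the scale of a discounted return. The snippet below is a minimal sketch of the assumed behaviour (linear rescaling against per-dimension bounds stored as [[min...], [max...]]); the actual SMBAE implementations may differ.

import numpy as np

def norm_state(state, bounds):
    """Assumed behaviour: map each state dimension into roughly [-1, 1],
    given bounds laid out as [[min_0, min_1, ...], [max_0, max_1, ...]]."""
    bounds = np.asarray(bounds, dtype=np.float64)
    mid = (bounds[0] + bounds[1]) / 2.0
    half_range = (bounds[1] - bounds[0]) / 2.0
    return (np.asarray(state, dtype=np.float64) - mid) / half_range

def scale_reward(reward, reward_bounds):
    """Assumed inverse mapping: take a normalized network output back to the
    original reward range."""
    reward_bounds = np.asarray(reward_bounds, dtype=np.float64)
    mid = (reward_bounds[0] + reward_bounds[1]) / 2.0
    half_range = (reward_bounds[1] - reward_bounds[0]) / 2.0
    return (np.asarray(reward, dtype=np.float64) * half_range) + mid

# Example: a 2-D state with per-dimension bounds.
state_bounds = np.array([[-1.0, 0.0], [1.0, 10.0]])
print(norm_state([0.5, 5.0], state_bounds))          # -> [0.5  0. ]
print(scale_reward(0.2, np.array([[-2.0], [2.0]])))  # -> [0.4]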
Code example #2
File: QProp.py  Project: skylbc/SMBAE
    def q_values(self, state):
        """
            For returning a vector of q values, state should already be normalized
        """
        state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=theano.config.floatX)
        self._model.setStates(state)
        self._modelTarget.setStates(state)
        action = self._q_action()
        self._model.setActions(action)
        self._modelTarget.setActions(action)

        if ('train_extra_value_function' in self.getSettings() and
            (self.getSettings()['train_extra_value_function'] == True)):
            q_vals = self._vals_extra()
        else:
            q_vals = self._q_val()
        # Rescale the normalized critic output back to the original reward
        # scale and undo the 1/(1 - discount_factor) value normalization.
        return scale_reward(q_vals, self.getRewardBounds()) * (
            1.0 / (1.0 - self.getSettings()['discount_factor']))
Code example #3
    def q_values(self, state):
        """
            Return a vector of Q-values. `state` should NOT already be
            normalized; it is normalized against self._state_bounds here.
        """
        state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=self._settings['float_type'])
        self._model.setStates(state)
        self._modelTarget.setStates(state)
        # Rescale the normalized critic output back to the original reward
        # scale and undo the 1/(1 - discount_factor) value normalization.
        return scale_reward(self._q_val(), self.getRewardBounds()) * (
            1.0 / (1.0 - self.getSettings()['discount_factor']))
Code example #4
File: QPropKeras.py  Project: skylbc/SMBAE
    def q_valueWithDropout(self, state):
        ## Q-value from the dropout variant of the critic (self._q_val_drop),
        ## rescaled back to the original reward range.
        state = np.array(state, dtype=self._settings['float_type'])
        state = norm_state(state, self._state_bounds)
        self._model.setStates(state)
        return scale_reward(self._q_val_drop(), self.getRewardBounds())
Code example #5
File: QPropKeras.py  Project: skylbc/SMBAE
    def q_value(self, state):
        state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=self._settings['float_type'])
        # The trailing 0 is presumably the Keras learning-phase flag
        # (inference mode); the result is rescaled to the original reward scale.
        value = scale_reward(
            self._value([state, 0])[0], self.getRewardBounds()) * (
                1.0 / (1.0 - self.getSettings()['discount_factor']))
        return value
Code example #6
    def q_valueWithDropout(self, state):
        ## Normalize the state unless parameter scaling is explicitly disabled.
        if (not (('disable_parameter_scaling' in self._settings)
                 and (self._settings['disable_parameter_scaling']))):
            state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=self._settings['float_type'])
        self._model.setStates(state)
        ## Rescale the dropout Q-value back to the original reward scale and
        ## undo the 1/(1 - discount_factor) value normalization.
        return scale_reward(
            self._q_val_drop(), self.getRewardBounds())[0] * (
                1.0 / (1.0 - self.getSettings()['discount_factor']))
Code example #7
File: AlgorithmInterface.py  Project: skylbc/SMBAE
    def q_valueWithDropout(self, state):
        ## Normalize the state unless parameter scaling is explicitly disabled.
        if (not (('disable_parameter_scaling' in self._settings)
                 and (self._settings['disable_parameter_scaling']))):
            state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=self._settings['float_type'])
        self._model.setStates(state)
        ## Rescale the dropout Q-value back to the original reward scale and
        ## undo the 1/(1 - discount_factor) value normalization.
        return scale_reward(self._q_val_drop(), self.getRewardBounds()) * (
            1.0 / (1.0 - self.getSettings()['discount_factor']))
Code example #8
    def q_value(self, state):
        state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=self._settings['float_type'])
        self._model.setStates(state)
        self._modelTarget.setStates(state)
        # Query the critic network directly and rescale the prediction back
        # into the original reward scale.
        value = scale_reward(
            self._model.getCriticNetwork().predict(state, batch_size=1),
            self.getRewardBounds()) * (
                1.0 / (1.0 - self.getSettings()['discount_factor']))
        return value
Code example #9
File: QPropKeras.py  Project: skylbc/SMBAE
    def trainActor(self,
                   states,
                   actions,
                   rewards,
                   result_states,
                   falls,
                   advantage,
                   exp_actions=None,
                   forwardDynamicsModel=None):
        lossActor = 0

        if ((self._updates % self._weight_update_steps) == 0):
            self.updateTargetModelValue()
        self._updates += 1
        """
        score = self._model.getActorNetwork().fit([states, actions, advantage], np.zeros_like(rewards),
              nb_epoch=1, batch_size=32,
              verbose=0
              # callbacks=[early_stopping],
              )
        """
        train_DPG = False

        if (self.getSettings()["print_levels"][self.getSettings(
        )["print_level"]] >= self.getSettings()["print_levels"]['train']):
            mbae_actions = []
            mbae_advantage = []
            other_actions = []
            other_advantage = []
            policy_mean = self._model.getActorNetwork().predict(
                states, batch_size=states.shape[0])[:, :self._action_length]
            # print ("exp_actions: ", exp_actions)
            for k in range(actions.shape[0]):
                if (exp_actions[k] == 2):
                    mbae_actions.append(actions[k] - policy_mean[k])
                    mbae_advantage.append(advantage[k])
                else:
                    other_actions.append(actions[k] - policy_mean[k])
                    other_advantage.append(advantage[k])

            print("MBAE Actions: ", len(mbae_actions), ", ",
                  100.0 * len(mbae_actions) / actions.shape[0], "%")
            print("MBAE Actions std: ", np.std(mbae_actions, axis=0), " mean ",
                  np.mean(np.std(mbae_actions, axis=0)))
            print("MBAE Actions advantage: ", np.mean(mbae_advantage, axis=0))
            print("Normal Actions std: ", np.std(other_actions, axis=0),
                  " mean ", np.mean(np.std(other_actions, axis=0)))
            print("Normal Actions advantage: ", np.mean(other_advantage,
                                                        axis=0))

        if (train_DPG):
            q_ = np.mean(self._trainPolicy_DPG(states))
            if (self.getSettings()["print_levels"][self.getSettings(
            )["print_level"]] >= self.getSettings()["print_levels"]['debug']):
                print("Policy loss: ", q_)
            return

        r_ = np.mean(self._r(states, actions))

        ### From Q-prop paper, compute adaptive control variate.
        sampled_q = self._model.getCriticNetwork().predict(
            [states, actions], batch_size=states.shape[0])
        # sampled_q = self._q_func_Target([states, actions])[0]
        sampled_q = scale_reward(sampled_q, self.getRewardBounds()) * (
            1.0 / (1.0 - self.getSettings()['discount_factor']))
        true_q = self._q_func([states])[0]
        ## Scale q func to be in same space as advantage
        true_q = scale_reward(true_q, self.getRewardBounds()) * (
            1.0 / (1.0 - self.getSettings()['discount_factor']))
        cov = advantage * (sampled_q - true_q)
        # var = true_q * true_q
        # n = cov / var
        ### Practical implementation: n = 1 when cov > 0, otherwise 0.
        n = (np.sign(cov) + 1.0) / 2.0
        # n = np.zeros_like(n)
        advantage = (advantage - (n * (sampled_q - true_q)))

        std = np.std(advantage)
        mean = np.mean(advantage)
        if ('advantage_scaling' in self.getSettings()
                and (self.getSettings()['advantage_scaling'] != False)):
            std = std / self.getSettings()['advantage_scaling']
            mean = 0.0
        advantage = (advantage - mean) / std

        if (r_ < 2.0) and (r_ > 0.5):  ### keep the update from being too large
            (lossActor, r_, q_) = self._trainPolicy(states, actions, advantage,
                                                    n)
            # lossActor = score.history['loss'][0]
            if (self.getSettings()["print_levels"][self.getSettings(
            )["print_level"]] >= self.getSettings()["print_levels"]['debug']):
                print(
                    "Policy loss: ",
                    lossActor,
                    " r: ",
                    np.mean(r_),
                    " q: ",
                    np.mean(q_),
                )
                print(
                    "Policy mean: ",
                    np.mean(self._model.getActorNetwork().predict(
                        states,
                        batch_size=states.shape[0])[:, :self._action_length],
                            axis=0))
                print("Policy std: ",
                      np.mean(self._q_action_std([states])[0], axis=0))
                print("Gradient Info: n, mean:", np.mean(n), " std: ",
                      np.std(n))
                print("Gradient Info: cov, mean:", np.mean(cov), " std: ",
                      np.std(cov))
        else:
            if (self.getSettings()["print_levels"][self.getSettings(
            )["print_level"]] >= self.getSettings()["print_levels"]['train']):
                print("Policy Gradient too large: ", np.mean(r_))

        self.updateTargetModel()

        return lossActor
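The heart of this trainActor is the adaptive (conservative) control variate from the Q-Prop paper: sampled_q is the critic evaluated at the actions actually taken, true_q is the critic term returned by self._q_func (the analytic part of the estimator), and the weight n is switched to 1 only where their difference is positively correlated with the advantage, so the control variate is used only when it is expected to reduce variance. Below is a standalone NumPy sketch of just that step, with hypothetical input arrays standing in for the critic calls.

import numpy as np

def qprop_control_variate(advantage, sampled_q, true_q):
    """Standalone version of the control-variate step above.

    advantage : empirical advantage estimates, shape (batch, 1)
    sampled_q : critic evaluated at the actions actually taken
    true_q    : critic term used as the analytic baseline
    """
    # Per-sample proxy for Cov(advantage, sampled_q - true_q).
    cov = advantage * (sampled_q - true_q)
    # Conservative Q-Prop: use the control variate only where it helps
    # (covariance positive); (sign(x) + 1) / 2 maps {-1, +1} -> {0, 1}.
    n = (np.sign(cov) + 1.0) / 2.0
    # Residual ("learning signal") advantage used by the policy gradient.
    adjusted_advantage = advantage - n * (sampled_q - true_q)
    return adjusted_advantage, n

# Toy example.
adv = np.array([[0.5], [-0.2], [0.1]])
q_a = np.array([[1.0], [0.3], [0.6]])
q_mu = np.array([[0.8], [0.5], [0.7]])
adj, n = qprop_control_variate(adv, q_a, q_mu)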
Code example #10
    def trainActor(self,
                   states,
                   actions,
                   rewards,
                   result_states,
                   falls,
                   advantage,
                   exp_actions=None,
                   forwardDynamicsModel=None):

        self.setData(states, actions, rewards, result_states, falls)
        ## Only update the target model here when PPO is not using separate
        ## actor/critic networks.
        if (not (('ppo_use_seperate_nets' in self.getSettings())
                 and (self.getSettings()['ppo_use_seperate_nets']))):
            if ((self._updates % self._weight_update_steps) == 0):
                self.updateTargetModel()
            self._updates += 1

        if ('use_GAE' in self.getSettings()
                and (self.getSettings()['use_GAE'])):
            # self._advantage_shared.set_value(advantage)
            ## Need to scale the advantage by the discount to help keep things normalized
            if (('normalize_advantage' in self.getSettings())
                    and self.getSettings()['normalize_advantage']):
                advantage = advantage * (1.0 - self._discount_factor)
            # pass # use given advantage parameter
        else:
            advantage = self._get_advantage()
        self._advantage_shared.set_value(advantage)

        if (self.getSettings()["print_levels"][self.getSettings(
        )["print_level"]] >= self.getSettings()["print_levels"]['debug']):
            print("Rewards: ", np.mean(rewards), " std: ", np.std(rewards),
                  " shape: ",
                  np.array(rewards).shape)
            print("Targets: ", np.mean(self._get_target()), " std: ",
                  np.std(self._get_target()))
            print("Falls: ", np.mean(falls), " std: ", np.std(falls))
            # print("values, falls: ", np.concatenate((scale_reward(self._q_val(), self.getRewardBounds()) * (1.0 / (1.0- self.getSettings()['discount_factor'])), falls), axis=1))
            print(
                "values: ",
                np.mean(
                    scale_reward(self._q_val(), self.getRewardBounds()) *
                    (1.0 / (1.0 - self.getSettings()['discount_factor']))),
                " std: ",
                np.std(
                    scale_reward(self._q_val(), self.getRewardBounds()) *
                    (1.0 / (1.0 - self.getSettings()['discount_factor']))))

        if (self.getSettings()["print_levels"][self.getSettings(
        )["print_level"]] >= self.getSettings()["print_levels"]['train']):
            print("Advantage: ", np.mean(advantage), " std: ",
                  np.std(advantage))
            print("Actions mean:     ", np.mean(actions, axis=0))
            print("Policy mean: ", np.mean(self._q_action(), axis=0))
            # print("Actions std:  ", np.mean(np.sqrt( (np.square(np.abs(actions - np.mean(actions, axis=0))))/1.0), axis=0) )
            print("Actions std:  ", np.std((actions - self._q_action()),
                                           axis=0))
            # print("Actions std:  ", np.std((actions), axis=0) )
            print("Policy   std: ", np.mean(self._q_action_std(), axis=0))
            # print("Policy log prob target: ", np.mean(self._get_log_prob_target(), axis=0))
            print("Actor loss: ", np.mean(self._get_action_diff()))
            print("Actor entropy: ", np.mean(self._get_actor_entropy()))
            # self._get_actor_entropy
            # print("States mean:     ", np.mean(states, axis=0))
            # print("States std:     ", np.std(states, axis=0))
            # print ( "R: ", np.mean(self._get_log_prob()/self._get_log_prob_target()))
            # print ("Actor diff: ", np.mean(np.array(self._get_diff()) / (1.0/(1.0-self._discount_factor))))
            ## Sometimes really HUGE losses appear, occasionally
        if (not self.getSettings()['use_fixed_std']
            ):  # whether or not to update the std of policy as well.
            lossActor = np.abs(np.mean(self._get_action_diff()))
            if (lossActor < 1000):
                if ('ppo_use_seperate_nets' in self.getSettings()
                        and (self.getSettings()['ppo_use_seperate_nets'])):
                    lossActor, _ = self._trainActor()
                else:
                    lossActor, _ = self._trainCollective()
            else:
                print(
                    "**********************Did not train actor this time: expected loss too high, ",
                    lossActor)
            if (self.getSettings()["print_levels"][self.getSettings(
            )["print_level"]] >= self.getSettings()["print_levels"]['train']):
                print("Policy log prob after: ",
                      np.mean(self._get_log_prob(), axis=0))
                # print("KL Divergence: ", np.sum(self.kl_divergence()))
                print("KL Divergence: ", self.kl_divergence())

        actions = self.predict_batch(states)
        # print ("actions shape:", actions.shape)
        next_states = forwardDynamicsModel.predict_batch(states, actions)
        # print ("next_states shape: ", next_states.shape)
        next_state_grads = self.getGrads(next_states, alreadyNormed=True)[0]
        # print ("next_state_grads shape: ", next_state_grads.shape)
        action_grads = forwardDynamicsModel.getGrads(states,
                                                     actions,
                                                     next_states,
                                                     v_grad=next_state_grads,
                                                     alreadyNormed=True)[0]
        # print ( "action_grads shape: ", action_grads.shape)

        use_parameter_grad_inversion = True
        self.setData(states, actions, rewards, result_states, falls)

        if (self.getSettings()['train_reward_predictor']):
            reward_grad = forwardDynamicsModel.getRewardGrads(states,
                                                              actions)[0]
            ## Need to shrink this reward grad down to the same scale as the value function
            reward_grad = np.array(reward_grad,
                                   dtype=self.getSettings()['float_type'])
            action_grads = np.array(action_grads,
                                    dtype=self.getSettings()['float_type'])
            action_grads = (reward_grad *
                            (1.0 - self.getSettings()['discount_factor'])) + (
                                action_grads *
                                self.getSettings()['discount_factor'])
            if (self.getSettings()["print_levels"][self.getSettings(
            )["print_level"]] >= self.getSettings()["print_levels"]['train']):
                print("Reward_Grad Raw: ", reward_grad)
        """
        
            From DEEP REINFORCEMENT LEARNING IN PARAMETERIZED ACTION SPACE
            Hausknecht, Matthew and Stone, Peter
            
            actions.shape == action_grads.shape
            
        """
        if (use_parameter_grad_inversion):
            # print ("Performing param inversion")
            for i in range(action_grads.shape[0]):
                for j in range(action_grads.shape[1]):
                    if (action_grads[i, j] > 0):
                        inversion = (1.0 - actions[i, j]) / 2.0
                    else:
                        inversion = (actions[i, j] - (-1.0)) / 2.0
                    action_grads[i, j] = action_grads[i, j] * inversion

        if (self.getSettings()["print_levels"][self.getSettings(
        )["print_level"]] >= self.getSettings()["print_levels"]['train']):
            # print("Actions mean:     ", np.mean(actions, axis=0))
            print("Policy mean: ", np.mean(self._q_action(), axis=0))
            # print("Actions std:  ", np.mean(np.sqrt( (np.square(np.abs(actions - np.mean(actions, axis=0))))/1.0), axis=0) )
            # print("Actions std:  ", np.std((actions - self._q_action()), axis=0) )
            # print("Actions std:  ", np.std((actions), axis=0) )
            print("Policy std: ", np.mean(self._q_action_std(), axis=0))
            print("Mean Next State Grad grad: ",
                  np.mean(np.fabs(next_state_grads), axis=0), " std ",
                  np.std(next_state_grads, axis=0))
            print("Mean action grad size: ",
                  np.mean(np.fabs(action_grads), axis=0), " std ",
                  np.std(action_grads, axis=0))

        ## Set data for gradient
        # self._model.setStates(states)
        # self._modelTarget.setStates(states)
        ## The -1.0 flips the sign because the SGD update performs minimization,
        ## while the action gradient here is an ascent direction.
        if (np.all(np.isfinite(action_grads))):  ## Guard against NaN/inf gradients
            self._action_grad_shared.set_value(-1.0 * action_grads)
            self._trainActionGRAD()
        return 0
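The nested loop near the end of this example applies the bounded-parameter gradient inversion described in the Hausknecht and Stone reference: a gradient pushing an action toward its upper bound is scaled by the remaining headroom to that bound, and a gradient pushing toward the lower bound by the headroom in the other direction. A vectorized sketch of the same rule (a hypothetical helper, not part of SMBAE), assuming actions bounded in [-1, 1]:

import numpy as np

def invert_gradients(action_grads, actions, p_min=-1.0, p_max=1.0):
    """Vectorized equivalent of the double loop above: gradients pushing a
    parameter toward its upper bound are scaled by (p_max - a) / (p_max - p_min),
    and gradients pushing toward the lower bound by (a - p_min) / (p_max - p_min)."""
    action_grads = np.asarray(action_grads, dtype=np.float64)
    actions = np.asarray(actions, dtype=np.float64)
    width = p_max - p_min
    up_scale = (p_max - actions) / width    # used where the gradient is positive
    down_scale = (actions - p_min) / width  # used where it is zero or negative
    return action_grads * np.where(action_grads > 0, up_scale, down_scale)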
Code example #11
File: TRPO.py  Project: skylbc/SMBAE
    def trainActor(self,
                   states,
                   actions,
                   rewards,
                   result_states,
                   falls,
                   advantage,
                   forwardDynamicsModel=None):

        ## If 'normalize_advantage' is explicitly set to False, only rescale the
        ## advantage by (1 - discount_factor) to keep it in a reasonable range;
        ## otherwise (the default) standardize it.
        if (('normalize_advantage' in self.getSettings())
                and (not self.getSettings()['normalize_advantage'])):
            advantage = advantage * (1.0 - self._discount_factor)
        else:
            std = np.std(advantage)
            mean = np.mean(advantage)
            if ('advantage_scaling' in self.getSettings()
                    and (self.getSettings()['advantage_scaling'] != False)):
                ## A fixed scaling can be given instead of the empirical std.
                std = std / self.getSettings()['advantage_scaling']
                mean = 0.0
            advantage = (advantage - mean) / std
        self.setData(states, actions, rewards, result_states, falls)
        self._advantage_shared.set_value(advantage)

        all_paramsActA = lasagne.layers.helper.get_all_param_values(
            self._model.getActorNetwork())
        lasagne.layers.helper.set_all_param_values(
            self._modelTarget.getActorNetwork(), all_paramsActA)
        # print ("Performing Critic trainning update")
        # if (( self._updates % self._weight_update_steps) == 0):
        #     self.updateTargetModel()
        # self._updates += 1
        # loss, _ = self._train()
        # print( "Actor loss: ", self._get_action_diff())
        lossActor = 0

        # diff_ = self.bellman_error(states, actions, rewards, result_states, falls)
        # print("Advantage: ", np.mean(self._get_advantage()))
        if (self.getSettings()["print_levels"][self.getSettings(
        )["print_level"]] >= self.getSettings()["print_levels"]['debug']):
            print(
                "Rewards: ",
                np.mean(
                    scale_reward(rewards, self.getRewardBounds()) *
                    (1.0 / (1.0 - self.getSettings()['discount_factor']))),
                " std: ",
                np.std(
                    scale_reward(rewards, self.getRewardBounds()) *
                    (1.0 / (1.0 - self.getSettings()['discount_factor']))),
                " shape: ",
                np.array(rewards).shape)
            # print("Targets: ", np.mean(self._get_target()), " std: ", np.std(self._get_target()))
            print("Falls: ", np.mean(falls), " std: ", np.std(falls))
            # print("values, falls: ", np.concatenate((scale_reward(self._q_val(), self.getRewardBounds()) * (1.0 / (1.0- self.getSettings()['discount_factor'])), falls), axis=1))
            print(
                "values: ",
                np.mean(
                    scale_reward(self._q_val(), self.getRewardBounds()) *
                    (1.0 / (1.0 - self.getSettings()['discount_factor']))),
                " std: ",
                np.std(
                    scale_reward(self._q_val(), self.getRewardBounds()) *
                    (1.0 / (1.0 - self.getSettings()['discount_factor']))))
            print("Model Advantage: ", np.mean(self._get_diff()), " std: ",
                  np.std(self._get_diff()))

        if (self.getSettings()["print_levels"][self.getSettings(
        )["print_level"]] >= self.getSettings()["print_levels"]['train']):
            print("Advantage: ", np.mean(advantage), " std: ",
                  np.std(advantage))

            # print("Advantage, reward: ", np.concatenate((advantage, rewards), axis=1))
            print("Actions:     ", np.mean(actions, axis=0), " shape: ",
                  actions.shape)
            print("Policy mean: ", np.mean(self._q_action(), axis=0))
            # print("Actions std:  ", np.mean(np.sqrt( (np.square(np.abs(actions - np.mean(actions, axis=0))))/1.0), axis=0) )
            # print("Actions std:  ", np.std(actions - self._q_action(), axis=0) )
            print("Actions std:  ", np.std(actions - self._q_action(), axis=0))
            print("Policy   std: ", np.mean(self._q_action_std(), axis=0))
            print("Policy log prob before: ",
                  np.mean(self._get_log_prob(), axis=0))
            # print( "Actor loss: ", np.mean(self._get_action_diff()))
            # print ("Actor diff: ", np.mean(np.array(self._get_diff()) / (1.0/(1.0-self._discount_factor))))
            ## Sometimes really HUGE losses appear, occasionally
            # if (np.abs(np.mean(self._get_action_diff())) < 10):
            #     lossActor, _ = self._trainActor()

        if (self.getSettings()["print_levels"][self.getSettings(
        )["print_level"]] >= self.getSettings()["print_levels"]['debug']):
            print(
                "Policy   std2: ",
                np.mean(self._q_action_std(), axis=0) +
                np.std(self._q_action(), axis=0))
            new_actions = self._q_action()
            new_action_stds = self._q_action_std()
            new_actions_ = []
            for i in range(new_actions.shape[0]):
                action__ = randomExporationSTD(0.0, new_actions[i],
                                               new_action_stds[i])
                new_actions_.append(action__)
            print("New action mean: ", np.mean(new_actions_, axis=0))
            print("New action std: ", np.std(new_actions_, axis=0))

        ## Damping added to the Fisher-vector products for numerical stability.
        self.getSettings()['cg_damping'] = 1e-3
        args = (states, actions, advantage)

        thprev = get_params_flat(
            lasagne.layers.helper.get_all_param_values(
                self._model.getActorNetwork()))

        def fisher_vector_product(p):
            # print ("fvp p: ", p)
            # print ("states: ", p)
            # print ('cg_damping', self.getSettings()['cg_damping'] )
            fvp_ = self.compute_fisher_vector_product(
                p, states) + np.float32(self.getSettings()['cg_damping']) * p  #pylint: disable=E1101,W0640
            # print ("fvp_ : ", fvp_)
            return fvp_

        g = self.compute_policy_gradient(*args)
        print("g: ", g)
        losses_before = self.compute_losses(*args)
        if np.allclose(g, 0):
            print("got zero gradient. not updating")
        else:
            stepdir = cg(fisher_vector_product, -g)
            # print ("stepdir: ", stepdir)
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            # print ("shs: ", shs )
            lm = np.sqrt(
                shs /
                np.float32(self.getSettings()['kl_divergence_threshold']))
            if (self.getSettings()["print_levels"][self.getSettings(
            )["print_level"]] >= self.getSettings()["print_levels"]['train']):
                print("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            neggdotstepdir = -g.dot(stepdir)

            def loss(th):
                # self.set_params_flat(th)
                params_tmp = setFromFlat(all_paramsActA, th)
                lasagne.layers.helper.set_all_param_values(
                    self._model.getActorNetwork(), params_tmp)
                return self.compute_losses(*args)[0]  #pylint: disable=W0640

            success, theta = linesearch(loss, thprev, fullstep,
                                        neggdotstepdir / lm)
            if (self.getSettings()["print_levels"][self.getSettings(
            )["print_level"]] >= self.getSettings()["print_levels"]['train']):
                print("success", success)
            params_tmp = setFromFlat(all_paramsActA, theta)
            lasagne.layers.helper.set_all_param_values(
                self._model.getActorNetwork(), params_tmp)
            # self.set_params_flat(theta)
        losses_after = self.compute_losses(*args)
        if (self.getSettings()["print_levels"][self.getSettings(
        )["print_level"]] >= self.getSettings()["print_levels"]['train']):
            print("Policy log prob after: ",
                  np.mean(self._get_log_prob(), axis=0))

        out = OrderedDict()
        for (lname, lbefore, lafter) in zipsame(self.loss_names, losses_before,
                                                losses_after):
            out[lname + "_before"] = lbefore
            out[lname + "_after"] = lafter

        if (self.getSettings()["print_levels"][self.getSettings(
        )["print_level"]] >= self.getSettings()["print_levels"]['train']):
            print("Losses before: ", self.loss_names, ", ", losses_before)
            print("Losses after: ", self.loss_names, ", ", losses_after)

        return out
        # print("Policy log prob after: ", np.mean(self._get_log_prob(), axis=0))
        # print( "Length of positive actions: " , str(len(tmp_actions)), " Actor loss: ", lossActor)
        # print( " Actor loss: ", lossActor)
        # self._advantage_shared.set_value(diff_)
        # lossActor, _ = self._trainActor()
        # kl_after = self.kl_divergence()
        """