def q_value(self, state):
    """ Return the value estimate for a single state.
        The state should NOT already be normalized; it is normalized here. """
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    self._model.setStates(state)
    self._modelTarget.setStates(state)
    ## Undo the reward normalization and the discount-sum scaling so the value
    ## is in the same units as the environment reward.
    value = scale_reward(self._q_val(), self.getRewardBounds()) * (
        1.0 / (1.0 - self.getSettings()['discount_factor']))
    return value
def q_values(self, state):
    """ Return a vector of q values.
        The state should NOT already be normalized; it is normalized here. """
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=theano.config.floatX)
    self._model.setStates(state)
    self._modelTarget.setStates(state)
    action = self._q_action()
    self._model.setActions(action)
    self._modelTarget.setActions(action)
    if ('train_extra_value_function' in self.getSettings()
            and (self.getSettings()['train_extra_value_function'] == True)):
        q_vals = self._vals_extra()
    else:
        q_vals = self._q_val()
    ## Undo the reward normalization and the discount-sum scaling.
    return scale_reward(q_vals, self.getRewardBounds()) * (
        1.0 / (1.0 - self.getSettings()['discount_factor']))
def q_values(self, state):
    """ Return a vector of q values.
        The state should NOT already be normalized; it is normalized here. """
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    self._model.setStates(state)
    self._modelTarget.setStates(state)
    ## Undo the reward normalization and the discount-sum scaling.
    return scale_reward(self._q_val(), self.getRewardBounds()) * (
        1.0 / (1.0 - self.getSettings()['discount_factor']))
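# --- Illustrative helper, not part of the original file ----------------------
# The value methods above all undo two normalizations before returning a
# value: scale_reward() maps the network output from normalized reward units
# back to the environment's reward range, and the 1 / (1 - discount_factor)
# factor rescales a per-step reward magnitude into a discounted-return
# magnitude. A minimal sketch of that pattern, under the *assumption* that
# scale_reward rescales by the half-width of the reward bounds (its exact
# definition lives elsewhere in this codebase):
import numpy as np


def unnormalize_value(q_norm, reward_bounds, discount_factor):
    """Map a normalized value prediction back to approximate return units."""
    low = np.array(reward_bounds[0])
    high = np.array(reward_bounds[1])
    reward_scale = (high - low) / 2.0  # assumed behaviour of scale_reward()
    return q_norm * reward_scale * (1.0 / (1.0 - discount_factor))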
def q_valueWithDropout(self, state):
    state = np.array(state, dtype=self._settings['float_type'])
    state = norm_state(state, self._state_bounds)
    self._model.setStates(state)
    return scale_reward(self._q_val_drop(), self.getRewardBounds())
def q_value(self, state):
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    value = scale_reward(
        self._value([state, 0])[0], self.getRewardBounds()) * (
            1.0 / (1.0 - self.getSettings()['discount_factor']))
    return value
def q_valueWithDropout(self, state):
    if not (('disable_parameter_scaling' in self._settings)
            and (self._settings['disable_parameter_scaling'])):
        state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    self._model.setStates(state)
    ## Undo the reward normalization and the discount-sum scaling.
    return scale_reward(self._q_val_drop(), self.getRewardBounds())[0] * (
        1.0 / (1.0 - self.getSettings()['discount_factor']))
def q_valueWithDropout(self, state):
    if not (('disable_parameter_scaling' in self._settings)
            and (self._settings['disable_parameter_scaling'])):
        state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    self._model.setStates(state)
    ## Undo the reward normalization and the discount-sum scaling.
    return scale_reward(self._q_val_drop(), self.getRewardBounds()) * (
        1.0 / (1.0 - self.getSettings()['discount_factor']))
def q_value(self, state):
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    self._model.setStates(state)
    self._modelTarget.setStates(state)
    value = scale_reward(
        self._model.getCriticNetwork().predict(state, batch_size=1),
        self.getRewardBounds()) * (
            1.0 / (1.0 - self.getSettings()['discount_factor']))
    return value
def trainActor(self,
               states,
               actions,
               rewards,
               result_states,
               falls,
               advantage,
               exp_actions=None,
               forwardDynamicsModel=None):
    lossActor = 0
    ## Periodically copy the current value-function weights into the target network.
    if ((self._updates % self._weight_update_steps) == 0):
        self.updateTargetModelValue()
    self._updates += 1

    train_DPG = False
    if (self.getSettings()["print_levels"][self.getSettings()["print_level"]]
            >= self.getSettings()["print_levels"]['train']):
        ## Split exploration statistics into MBAE-generated actions
        ## (exp_actions == 2) and regular exploration actions.
        mbae_actions = []
        mbae_advantage = []
        other_actions = []
        other_advantage = []
        policy_mean = self._model.getActorNetwork().predict(
            states, batch_size=states.shape[0])[:, :self._action_length]
        for k in range(actions.shape[0]):
            if (exp_actions[k] == 2):
                mbae_actions.append(actions[k] - policy_mean[k])
                mbae_advantage.append(advantage[k])
            else:
                other_actions.append(actions[k] - policy_mean[k])
                other_advantage.append(advantage[k])
        print("MBAE Actions: ", len(mbae_actions), ", ",
              100.0 * len(mbae_actions) / float(actions.shape[0]), "%")
        print("MBAE Actions std: ", np.std(mbae_actions, axis=0), " mean ",
              np.mean(np.std(mbae_actions, axis=0)))
        print("MBAE Actions advantage: ", np.mean(mbae_advantage, axis=0))
        print("Normal Actions std: ", np.std(other_actions, axis=0), " mean ",
              np.mean(np.std(other_actions, axis=0)))
        print("Normal Actions advantage: ", np.mean(other_advantage, axis=0))

    if (train_DPG):
        q_ = np.mean(self._trainPolicy_DPG(states))
        if (self.getSettings()["print_levels"][self.getSettings()["print_level"]]
                >= self.getSettings()["print_levels"]['debug']):
            print("Policy loss: ", q_)
        return

    r_ = np.mean(self._r(states, actions))

    ### From the Q-Prop paper: compute the adaptive control variate.
    ## Critic estimate for the sampled (state, action) pairs, rescaled into the
    ## same space as the advantage.
    sampled_q = self._model.getCriticNetwork().predict(
        [states, actions], batch_size=states.shape[0])
    sampled_q = scale_reward(sampled_q, self.getRewardBounds()) * (
        1.0 / (1.0 - self.getSettings()['discount_factor']))
    ## Critic estimate at the policy mean, rescaled the same way.
    true_q = self._q_func([states])[0]
    true_q = scale_reward(true_q, self.getRewardBounds()) * (
        1.0 / (1.0 - self.getSettings()['discount_factor']))
    cov = advantage * (sampled_q - true_q)
    ## The full control variate would be n = cov / var with var = true_q * true_q;
    ### the practical implementation uses n = 1 when cov > 0, otherwise n = 0.
    n = (np.sign(cov) + 1.0) / 2.0
    advantage = (advantage - (n * (sampled_q - true_q)))

    ## Standardize the corrected advantage.
    std = np.std(advantage)
    mean = np.mean(advantage)
    if ('advantage_scaling' in self.getSettings()
            and (self.getSettings()['advantage_scaling'] != False)):
        std = std / self.getSettings()['advantage_scaling']
        mean = 0.0
    advantage = (advantage - mean) / std

    if (r_ < 2.0) and (r_ > 0.5):  ### keep the policy update from being too large
        (lossActor, r_, q_) = self._trainPolicy(states, actions, advantage, n)
        if (self.getSettings()["print_levels"][self.getSettings()["print_level"]]
                >= self.getSettings()["print_levels"]['debug']):
            print("Policy loss: ", lossActor, " r: ", np.mean(r_), " q: ",
                  np.mean(q_))
            print("Policy mean: ",
                  np.mean(self._model.getActorNetwork().predict(
                      states,
                      batch_size=states.shape[0])[:, :self._action_length],
                          axis=0))
            print("Policy std: ",
                  np.mean(self._q_action_std([states])[0], axis=0))
            print("Gradient Info: n, mean:", np.mean(n), " std: ", np.std(n))
            print("Gradient Info: cov, mean:", np.mean(cov), " std: ",
                  np.std(cov))
    else:
        if (self.getSettings()["print_levels"][self.getSettings()["print_level"]]
                >= self.getSettings()["print_levels"]['train']):
            print("Policy Gradient too large: ", np.mean(r_))
    self.updateTargetModel()
    return lossActor
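# --- Illustrative helper, not part of the original file ----------------------
# A minimal, self-contained sketch of the adaptive control variate computed in
# trainActor() above, written with plain numpy so it can be checked in
# isolation. `advantage`, `sampled_q` and `true_q` are assumed to be arrays of
# the same shape, already rescaled into reward units as done above.
import numpy as np


def qprop_adaptive_control_variate(advantage, sampled_q, true_q):
    """Return (corrected_advantage, eta) using the conservative Q-Prop rule:
    eta = 1 where advantage * (sampled_q - true_q) > 0 and eta = 0 where it is
    negative, mirroring the sign-based rule in the code above."""
    cov = advantage * (sampled_q - true_q)
    eta = (np.sign(cov) + 1.0) / 2.0
    corrected = advantage - eta * (sampled_q - true_q)
    return corrected, eta


# Example (shapes only; the values are random):
# adv, eta = qprop_adaptive_control_variate(np.random.randn(8, 1),
#                                           np.random.randn(8, 1),
#                                           np.random.randn(8, 1))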
def trainActor(self,
               states,
               actions,
               rewards,
               result_states,
               falls,
               advantage,
               exp_actions=None,
               forwardDynamicsModel=None):
    self.setData(states, actions, rewards, result_states, falls)
    if not (('ppo_use_seperate_nets' in self.getSettings())
            and (self.getSettings()['ppo_use_seperate_nets'])):
        ## With a shared network the target network is refreshed periodically here.
        if ((self._updates % self._weight_update_steps) == 0):
            self.updateTargetModel()
        self._updates += 1

    if ('use_GAE' in self.getSettings() and (self.getSettings()['use_GAE'])):
        ## Use the advantage passed in; optionally scale it by the discount to
        ## help keep things normalized.
        if (('normalize_advantage' in self.getSettings())
                and self.getSettings()['normalize_advantage']):
            advantage = advantage * (1.0 - self._discount_factor)
    else:
        advantage = self._get_advantage()
    self._advantage_shared.set_value(advantage)

    if (self.getSettings()["print_levels"][self.getSettings()["print_level"]]
            >= self.getSettings()["print_levels"]['debug']):
        print("Rewards: ", np.mean(rewards), " std: ", np.std(rewards),
              " shape: ", np.array(rewards).shape)
        print("Targets: ", np.mean(self._get_target()), " std: ",
              np.std(self._get_target()))
        print("Falls: ", np.mean(falls), " std: ", np.std(falls))
        values = scale_reward(self._q_val(), self.getRewardBounds()) * (
            1.0 / (1.0 - self.getSettings()['discount_factor']))
        print("values: ", np.mean(values), " std: ", np.std(values))
    if (self.getSettings()["print_levels"][self.getSettings()["print_level"]]
            >= self.getSettings()["print_levels"]['train']):
        print("Advantage: ", np.mean(advantage), " std: ", np.std(advantage))
        print("Actions mean: ", np.mean(actions, axis=0))
        print("Policy mean: ", np.mean(self._q_action(), axis=0))
        print("Actions std: ", np.std((actions - self._q_action()), axis=0))
        print("Policy std: ", np.mean(self._q_action_std(), axis=0))
        print("Actor loss: ", np.mean(self._get_action_diff()))
        print("Actor entropy: ", np.mean(self._get_actor_entropy()))

    ## Sometimes really HUGE losses appear; skip the actor update in that case.
    if (not self.getSettings()['use_fixed_std']):
        ## When the policy std is not fixed, the policy mean and std are updated here.
        lossActor = np.abs(np.mean(self._get_action_diff()))
        if (lossActor < 1000):
            if ('ppo_use_seperate_nets' in self.getSettings()
                    and (self.getSettings()['ppo_use_seperate_nets'])):
                lossActor, _ = self._trainActor()
            else:
                lossActor, _ = self._trainCollective()
        else:
            print("********** Did not train actor this time: expected loss too high, ",
                  lossActor)
    if (self.getSettings()["print_levels"][self.getSettings()["print_level"]]
            >= self.getSettings()["print_levels"]['train']):
        print("Policy log prob after: ",
              np.mean(self._get_log_prob(), axis=0))
        print("KL Divergence: ", self.kl_divergence())

    ## MBAE-style update: push the policy along the action gradient obtained by
    ## back-propagating the value gradient through the forward dynamics model.
    actions = self.predict_batch(states)
    next_states = forwardDynamicsModel.predict_batch(states, actions)
    next_state_grads = self.getGrads(next_states, alreadyNormed=True)[0]
    action_grads = forwardDynamicsModel.getGrads(states,
                                                 actions,
                                                 next_states,
                                                 v_grad=next_state_grads,
                                                 alreadyNormed=True)[0]
    use_parameter_grad_inversion = True
    self.setData(states, actions, rewards, result_states, falls)
    if (self.getSettings()['train_reward_predictor']):
        reward_grad = forwardDynamicsModel.getRewardGrads(states, actions)[0]
        ## Shrink the reward gradient down to the same scale as the value function.
        reward_grad = np.array(reward_grad,
                               dtype=self.getSettings()['float_type'])
        action_grads = np.array(action_grads,
                                dtype=self.getSettings()['float_type'])
        action_grads = (reward_grad *
                        (1.0 - self.getSettings()['discount_factor'])) + (
                            action_grads *
                            self.getSettings()['discount_factor'])
        if (self.getSettings()["print_levels"][self.getSettings()["print_level"]]
                >= self.getSettings()["print_levels"]['train']):
            print("Reward_Grad Raw: ", reward_grad)
    """
    Gradient inversion from
    "Deep Reinforcement Learning in Parameterized Action Space",
    Hausknecht, Matthew and Stone, Peter.
    Requires actions.shape == action_grads.shape.
    """
    if (use_parameter_grad_inversion):
        for i in range(action_grads.shape[0]):
            for j in range(action_grads.shape[1]):
                if (action_grads[i, j] > 0):
                    inversion = (1.0 - actions[i, j]) / 2.0
                else:
                    inversion = (actions[i, j] - (-1.0)) / 2.0
                action_grads[i, j] = action_grads[i, j] * inversion
    if (self.getSettings()["print_levels"][self.getSettings()["print_level"]]
            >= self.getSettings()["print_levels"]['train']):
        print("Policy mean: ", np.mean(self._q_action(), axis=0))
        print("Policy std: ", np.mean(self._q_action_std(), axis=0))
        print("Mean Next State Grad grad: ",
              np.mean(np.fabs(next_state_grads), axis=0), " std ",
              np.std(next_state_grads, axis=0))
        print("Mean action grad size: ",
              np.mean(np.fabs(action_grads), axis=0), " std ",
              np.std(action_grads, axis=0))
    ## The -1.0 is needed because the SGD update always performs minimization.
    if (np.all(np.isfinite(action_grads))):  ## guard against bad gradients
        self._action_grad_shared.set_value(-1.0 * action_grads)
        self._trainActionGRAD()
    return 0
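# --- Illustrative helper, not part of the original file ----------------------
# A vectorized sketch of the gradient-inversion rule from
# "Deep Reinforcement Learning in Parameterized Action Space"
# (Hausknecht and Stone) that the nested loops in trainActor() above apply
# element-wise. It assumes actions are bounded in [-1, 1], as above.
import numpy as np


def invert_gradients(action_grads, actions, a_min=-1.0, a_max=1.0):
    """Scale positive gradients by the remaining headroom up to a_max and
    non-positive gradients by the headroom down to a_min, so updates slow down
    near the action bounds instead of pushing past them."""
    width = a_max - a_min
    room_up = (a_max - actions) / width    # room left to increase the action
    room_down = (actions - a_min) / width  # room left to decrease the action
    return np.where(action_grads > 0,
                    action_grads * room_up,
                    action_grads * room_down)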
def trainActor(self,
               states,
               actions,
               rewards,
               result_states,
               falls,
               advantage,
               forwardDynamicsModel=None):
    ## Need to scale the advantage by the discount to help keep things normalized.
    if (('normalize_advantage' in self.getSettings())
            and (not self.getSettings()['normalize_advantage'])):
        advantage = advantage * (1.0 - self._discount_factor)
    else:  ## if not specified, the default is to standardize the advantage
        std = np.std(advantage)
        mean = np.mean(advantage)
        if ('advantage_scaling' in self.getSettings()
                and (self.getSettings()['advantage_scaling'] != False)):
            std = std / self.getSettings()['advantage_scaling']
            mean = 0.0
        advantage = (advantage - mean) / std
    self.setData(states, actions, rewards, result_states, falls)
    self._advantage_shared.set_value(advantage)

    ## Keep a copy of the current policy parameters in the target actor network.
    all_paramsActA = lasagne.layers.helper.get_all_param_values(
        self._model.getActorNetwork())
    lasagne.layers.helper.set_all_param_values(
        self._modelTarget.getActorNetwork(), all_paramsActA)

    lossActor = 0
    if (self.getSettings()["print_levels"][self.getSettings()["print_level"]]
            >= self.getSettings()["print_levels"]['debug']):
        rewards_scaled = scale_reward(rewards, self.getRewardBounds()) * (
            1.0 / (1.0 - self.getSettings()['discount_factor']))
        print("Rewards: ", np.mean(rewards_scaled), " std: ",
              np.std(rewards_scaled), " shape: ", np.array(rewards).shape)
        print("Falls: ", np.mean(falls), " std: ", np.std(falls))
        values = scale_reward(self._q_val(), self.getRewardBounds()) * (
            1.0 / (1.0 - self.getSettings()['discount_factor']))
        print("values: ", np.mean(values), " std: ", np.std(values))
        print("Model Advantage: ", np.mean(self._get_diff()), " std: ",
              np.std(self._get_diff()))
    if (self.getSettings()["print_levels"][self.getSettings()["print_level"]]
            >= self.getSettings()["print_levels"]['train']):
        print("Advantage: ", np.mean(advantage), " std: ", np.std(advantage))
        print("Actions: ", np.mean(actions, axis=0), " shape: ", actions.shape)
        print("Policy mean: ", np.mean(self._q_action(), axis=0))
        print("Actions std: ", np.std(actions - self._q_action(), axis=0))
        print("Policy std: ", np.mean(self._q_action_std(), axis=0))
        print("Policy log prob before: ",
              np.mean(self._get_log_prob(), axis=0))
    if (self.getSettings()["print_levels"][self.getSettings()["print_level"]]
            >= self.getSettings()["print_levels"]['debug']):
        print("Policy std2: ",
              np.mean(self._q_action_std(), axis=0) +
              np.std(self._q_action(), axis=0))
        ## Sample actions from the current policy to inspect its spread.
        new_actions = self._q_action()
        new_action_stds = self._q_action_std()
        new_actions_ = []
        for i in range(new_actions.shape[0]):
            action__ = randomExporationSTD(0.0, new_actions[i],
                                           new_action_stds[i])
            new_actions_.append(action__)
        print("New action mean: ", np.mean(new_actions_, axis=0))
        print("New action std: ", np.std(new_actions_, axis=0))

    ## Hard-coded conjugate-gradient damping.
    self.getSettings()['cg_damping'] = 1e-3

    args = (states, actions, advantage)
    thprev = get_params_flat(
        lasagne.layers.helper.get_all_param_values(
            self._model.getActorNetwork()))

    def fisher_vector_product(p):
        ## Damped Fisher-vector product for the conjugate gradient solver.
        return self.compute_fisher_vector_product(p, states) + np.float32(
            self.getSettings()['cg_damping']) * p  # pylint: disable=E1101,W0640

    g = self.compute_policy_gradient(*args)
    print("g: ", g)
    losses_before = self.compute_losses(*args)
    if np.allclose(g, 0):
        print("got zero gradient. not updating")
    else:
        stepdir = cg(fisher_vector_product, -g)
        shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
        lm = np.sqrt(
            shs / np.float32(self.getSettings()['kl_divergence_threshold']))
        if (self.getSettings()["print_levels"][self.getSettings()["print_level"]]
                >= self.getSettings()["print_levels"]['train']):
            print("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
        fullstep = stepdir / lm
        neggdotstepdir = -g.dot(stepdir)

        def loss(th):
            ## Evaluate the surrogate loss at the flat parameter vector th.
            params_tmp = setFromFlat(all_paramsActA, th)
            lasagne.layers.helper.set_all_param_values(
                self._model.getActorNetwork(), params_tmp)
            return self.compute_losses(*args)[0]  # pylint: disable=W0640

        success, theta = linesearch(loss, thprev, fullstep,
                                    neggdotstepdir / lm)
        if (self.getSettings()["print_levels"][self.getSettings()["print_level"]]
                >= self.getSettings()["print_levels"]['train']):
            print("success", success)
        params_tmp = setFromFlat(all_paramsActA, theta)
        lasagne.layers.helper.set_all_param_values(
            self._model.getActorNetwork(), params_tmp)
    losses_after = self.compute_losses(*args)
    if (self.getSettings()["print_levels"][self.getSettings()["print_level"]]
            >= self.getSettings()["print_levels"]['train']):
        print("Policy log prob after: ",
              np.mean(self._get_log_prob(), axis=0))
    out = OrderedDict()
    for (lname, lbefore, lafter) in zipsame(self.loss_names, losses_before,
                                            losses_after):
        out[lname + "_before"] = lbefore
        out[lname + "_after"] = lafter
    if (self.getSettings()["print_levels"][self.getSettings()["print_level"]]
            >= self.getSettings()["print_levels"]['train']):
        print("Losses before: ", self.loss_names, ", ", losses_before)
        print("Losses after: ", self.loss_names, ", ", losses_after)
    return out
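# --- Illustrative helpers, not part of the original file ---------------------
# Minimal sketches of the conjugate-gradient and backtracking line-search
# routines that the TRPO-style trainActor() above relies on (`cg` and
# `linesearch`), written to match how they are called there: cg(f_Ax, b)
# returns an approximate solution of F x = b using only Fisher-vector
# products, and linesearch(f, x, fullstep, expected_improve_rate) returns
# (success, new_params). Iteration counts and tolerances are assumptions, and
# both sketches assume 1-D (flattened) parameter vectors.
import numpy as np


def cg(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """Solve F x = b with conjugate gradients, given a function f_Ax(p) = F p."""
    x = np.zeros_like(b)
    r = b.copy()   # residual b - F x (x starts at zero)
    p = b.copy()   # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        z = f_Ax(p)
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        new_rdotr = r.dot(r)
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
        if rdotr < residual_tol:
            break
    return x


def linesearch(f, x, fullstep, expected_improve_rate,
               max_backtracks=10, accept_ratio=0.1):
    """Backtrack along fullstep until the actual improvement of f is a large
    enough fraction of the predicted improvement; return (success, params)."""
    fval = f(x)
    for stepfrac in 0.5 ** np.arange(max_backtracks):
        xnew = x + stepfrac * fullstep
        actual_improve = fval - f(xnew)
        expected_improve = expected_improve_rate * stepfrac
        if (expected_improve > 0
                and actual_improve / expected_improve > accept_ratio):
            return True, xnew
    return False, x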