class RewardLogging(gym.core.Wrapper):
    def __init__(self, env, loggingPeriod=100, **kwargs):
        super().__init__(env)
        if self.multiprocessing == 1:
            self.GLOBAL_RUNNING_R = MovingAverage(loggingPeriod)
        else:
            if 'GLOBAL_RUNNING_R' not in globals():
                global GLOBAL_RUNNING_R
                GLOBAL_RUNNING_R = MovingAverage(loggingPeriod)
            self.GLOBAL_RUNNING_R = GLOBAL_RUNNING_R

    def reset(self, **kwargs):
        self.tracking_r = []
        return self.env.reset(**kwargs)

    def step(self, action):
        observation, reward, done, info = self.env.step(action=action)
        self.tracking_r.append(reward)
        return observation, reward, done, info

    def getLogging(self):
        """
        Processes the tracked data of the environment.
        In this case it sums the reward over the entire episode.
        """
        self.GLOBAL_RUNNING_R.append(sum(self.tracking_r))
        finalDict = {"TotalReward": self.GLOBAL_RUNNING_R()}
        return finalDict
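# The wrappers and methods in this listing rely on a MovingAverage helper that
# is not shown here. Below is a minimal sketch of the interface they appear to
# assume (append(), callable for the windowed mean, std()); the repository's
# actual implementation may differ.
from collections import deque

import numpy as np


class MovingAverage:
    """Fixed-length window that reports its mean when called."""

    def __init__(self, size):
        self.window = deque(maxlen=size)

    def append(self, value):
        self.window.append(value)

    def __call__(self):
        return float(np.mean(self.window)) if self.window else 0.0

    def std(self):
        return float(np.std(self.window)) if self.window else 0.0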
class PriorityBuffer():
    def __init__(self, maxSamples=10000):
        self.maxSamples = maxSamples
        self.buffer = []
        self.priorities = []
        self.trajLengths = []
        self.flag = True
        self.slice = 0
        self.sampleSize = 0
        self.errorMA = MovingAverage(1000)

    def GetMuSigma(self):
        return self.errorMA(), self.errorMA.std()

    def AddError(self, val):
        self.errorMA.append(val)

    def AddTrajectory(self, sample, priority):
        if len(sample[0]) == 0:
            return
        self.buffer.append(sample)
        self.priorities.append(priority)
        self.trajLengths.append(len(sample[0]))

    def Sample(self):
        return self.buffer[0:self.slice], self.sampleSize

    def PrioritizeandPruneSamples(self, sampleSize):
        if len(self.trajLengths) == 0:
            return
        if self.flag:
            self.flag = False
            self.priorities, self.buffer, self.trajLengths = (list(t) for t in zip(
                *sorted(zip(self.priorities, self.buffer, self.trajLengths),
                        key=operator.itemgetter(0), reverse=True)))

            #Pruning the least favorable samples
            while sum(self.trajLengths) >= self.maxSamples:
                self.priorities.pop(-1)
                self.buffer.pop(-1)
                self.trajLengths.pop(-1)

        self.sampleSize = 0
        self.slice = 0
        for length in self.trajLengths:
            self.sampleSize += length
            self.slice += 1
            if self.sampleSize > sampleSize:
                break

    def UpdatePriorities(self, priorities):
        self.priorities[0:self.slice] = priorities
        self.flag = True
        return self.buffer

    def GetReprioritySamples(self):
        return self.buffer[0:self.slice]
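# Illustrative (hedged) usage of PriorityBuffer above: a learner adds
# trajectories with an initial priority, sorts and prunes before sampling, and
# feeds recomputed priorities back in. The trajectory layout below is made up
# purely for demonstration; the real layout is whatever the methods later in
# this listing push.
def _priority_buffer_demo():
    buf = PriorityBuffer(maxSamples=1000)
    fake_traj = [[0.0] * 8, [1] * 8, [0.5] * 8]        # e.g. states, actions, rewards (illustrative)
    buf.AddTrajectory(fake_traj, priority=1.0)
    buf.PrioritizeandPruneSamples(sampleSize=256)      # sort by priority and prune overflow
    samples, n = buf.Sample()                          # highest-priority slice
    buf.UpdatePriorities([0.3] * len(samples))         # learner's recomputed priorities
    return samples, n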
class RewardLogging(gym.core.Wrapper):
    def __init__(self, env, **kwargs):
        super().__init__(env)
        if self.multiprocessing == 1:
            self.GLOBAL_RUNNING_R = MovingAverage(400)
            self.win_rate = MovingAverage(400)
            self.red_killed = MovingAverage(400)
        else:
            if 'GLOBAL_RUNNING_R' not in globals():
                global GLOBAL_RUNNING_R
                GLOBAL_RUNNING_R = MovingAverage(400)
            self.GLOBAL_RUNNING_R = GLOBAL_RUNNING_R
            self.win_rate = MovingAverage(400)
            self.red_killed = MovingAverage(400)

    def reset(self, **kwargs):
        self.tracking_r = []
        return self.env.reset(**kwargs)

    def step(self, action):
        observation, reward, done, info = self.env.step(action=action)
        self.tracking_r.append(reward)
        return observation, reward, done, info

    def getLogging(self):
        """
        Processes the tracked data of the environment.
        In this case it sums the reward over the entire episode.
        """
        self.win_rate.append(int(self.blue_win))
        self.GLOBAL_RUNNING_R.append(sum(self.tracking_r))
        self.red_killed.append(int(self.red_eliminated))
        finalDict = {"Env Results/TotalReward": self.GLOBAL_RUNNING_R(),
                     "Env Results/WinRate": self.win_rate(),
                     "Env Results/RedKilled": self.red_killed()}
        return finalDict
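# Hedged usage sketch for the logging wrapper above. The wrapped environment is
# assumed to expose `multiprocessing`, `blue_win`, and `red_eliminated`
# attributes, since the wrapper reads them; the environment id below is
# illustrative only.
import gym


def run_logged_episode(env_id="gym_cap:cap-v0"):
    env = RewardLogging(gym.make(env_id))
    obs = env.reset()
    done = False
    while not done:
        obs, reward, done, info = env.step(env.action_space.sample())
    return env.getLogging()   # e.g. {"Env Results/TotalReward": ..., "Env Results/WinRate": ..., ...}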
class NGU(Method):
    def __init__(self, sharedModel, sess, stateShape, actionSize, scope, HPs,
                 sharedBuffer, globalNet=None, nTrajs=1, LSTM=False):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and Critic Network to be used for the purpose of RL.
        """
        #Placeholders
        self.LSTM = LSTM
        self.sess = sess
        self.scope = scope
        self.Model = sharedModel
        self.sharedBuffer = sharedBuffer

        #Common Stuff Between the networks:
        self.HPs = HPs

        #Creating the different values of beta
        def sigmoid(x):
            return 1 / (1 + np.exp(-x))

        self.betas = []
        for i in range(self.HPs["N"]):
            if i == 0:
                self.betas.append(0.0)
            elif i == self.HPs["N"] - 1:
                self.betas.append(self.HPs["betaMax"])
            else:
                self.betas.append(self.HPs["betaMax"] * sigmoid(
                    (2.0 * float(i) + 2.0 - self.HPs["N"]) / (self.HPs["N"] - 2.0)))

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):
                #Specifying placeholders for Tensorflow Networks
                if len(stateShape) == 4:
                    self.states_ = tf.placeholder(shape=[None] + stateShape[1:4], dtype=tf.float32, name='states')
                    self.next_states_ = tf.placeholder(shape=[None] + stateShape[1:4], dtype=tf.float32, name='next_states')
                else:
                    self.states_ = tf.placeholder(shape=[None] + stateShape, dtype=tf.float32, name='states')
                    self.next_states_ = tf.placeholder(shape=[None] + stateShape, dtype=tf.float32, name='next_states')
                self.actions_ = tf.placeholder(shape=[None], dtype=tf.int32, name='actions_hold')
                self.done_ = tf.placeholder(shape=[None], dtype=tf.float32, name='done_hold')
                self.rewards_ = tf.placeholder(shape=[None], dtype=tf.float32, name='total_reward')
                self.bandit_one_hot = tf.placeholder(shape=[None, self.HPs["N"]], dtype=tf.int32, name='beta_bandit')
                self.action_past = tf.placeholder(shape=[None], dtype=tf.int32, name='action_past')
                self.reward_i_past = tf.placeholder(shape=[None], dtype=tf.float32, name='reward_i_past')
                self.reward_e_past = tf.placeholder(shape=[None], dtype=tf.float32, name='reward_e_past')
                self.reward_i_current = tf.placeholder(shape=[None], dtype=tf.float32, name='reward_i_current')
                self.reward_e_current = tf.placeholder(shape=[None], dtype=tf.float32, name='reward_e_current')

                # Creating the IO for the entire network
                input = {
                    "state": self.states_,
                    "state_next": self.next_states_,
                    "bandit_one_hot": self.bandit_one_hot,
                    "action_past": self.action_past,
                    "reward_i_past": self.reward_i_past,
                    "reward_e_past": self.reward_e_past,
                }
                out = self.Model(input)
                self.q = out["Q"]
                self.a_pred = out["action_prediction"]
                self.latent = out["latent_space"]
                self.rnd_random = out["RND_random"]
                self.rnd_predictor = out["RND_predictor"]

                input2 = {
                    "state": self.next_states_,
                    "state_next": self.next_states_,  #Used as a placeholder in network
                    "bandit_one_hot": self.bandit_one_hot,
                    "action_past": self.actions_,
                    "reward_i_past": self.reward_i_current,
                    "reward_e_past": self.reward_e_current,
                }
                out2 = self.Model(input2)
                q_next = out2["Q"]  #Q-values of the next states

                with tf.name_scope('q_learning'):
                    #Current Q
                    oh_action = tf.one_hot(self.actions_, actionSize, dtype=tf.float32)  # [?, num_agent, action_size]
                    curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action), axis=-1)  # [?, num_agent]
                    #Next Q
                    max_next_q = tf.reduce_max(q_next, axis=-1)
                    #TD Error
                    td_target = self.rewards_ + HPs["Gamma"] * max_next_q * (1. - self.done_)
                    self.td_error = loss = tf.keras.losses.MSE(td_target, curr_q)
                    softmax_q = tf.nn.softmax(curr_q)
                    self.entropy = -tf.reduce_mean(softmax_q * tf.log(softmax_q))
                    self.loss = loss + HPs["EntropyBeta"] * self.entropy

                self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)

                if globalNet is None:
                    #Creating the Training instance of the network.
                    with tf.name_scope('embedding_network'):
                        oh_action = tf.one_hot(self.actions_, actionSize, dtype=tf.float32)  # [?, num_agent, action_size]
                        self.embedding_loss = tf.keras.losses.MSE(oh_action, self.a_pred)

                    with tf.name_scope('life_long_curiosity'):
                        self.llc_loss = tf.keras.losses.MSE(self.rnd_random, self.rnd_predictor)

                    loss = self.loss + self.llc_loss + self.embedding_loss

                    optimizer = tf.keras.optimizers.Adam(HPs["LearningRate"])
                    self.gradients = optimizer.get_gradients(loss, self.params)
                    self.update_op = optimizer.apply_gradients(zip(self.gradients, self.params))

                    self.grads = [self.gradients]
                    self.losses = [loss]
                    self.update_ops = [self.update_op]

                    self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))]
                    self.loss_MA = [MovingAverage(400) for i in range(len(self.losses))]
                    self.entropy_MA = MovingAverage(400)
                    self.labels = ["Total"]
                    self.HPs = HPs

                else:
                    #Creating an Actor Instance for the Network.
                    #Creating the Episodic Memory, which compares samples
                    self.episodicMemory = EpisodicMemory()
                    #Creating Local Buffer to store data until it is ready to push to sample buffer
                    self.buffer = [Trajectory(depth=10) for _ in range(nTrajs)]
                    #Creating a pull operation to synch network parameters
                    with tf.name_scope('sync'):
                        self.pull_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.params, globalNet.params)]
                        self.pull_ops = [self.pull_params_op]

    def GetAction(self, state, a_past, r_i_past, r_e_past, episode=None, step=0):
        """
        Contains the code to run the network based on an input.
        """
        #Fixing the state shape if there is something wrong
        if len(state.shape) == 3:
            state = state[np.newaxis, :]
        if len(state.shape) == 1:
            state = state[np.newaxis, :]

        #Selecting a new beta at the beginning of the episode
        #Also bootstrapping rewards/actions for the
        if step == 0:
            currBeta = random.randint(0, self.HPs["N"] - 1)
            oh = np.zeros(self.HPs["N"])
            oh[currBeta] = 1
            self.betaSelect = oh
            self.currBeta = self.betas[currBeta]

        feedDict = {
            self.states_: state,
            self.bandit_one_hot: self.betaSelect[np.newaxis, :],
            self.action_past: np.asarray(a_past),
            self.reward_i_past: np.asarray(r_i_past),
            self.reward_e_past: np.asarray(r_e_past)
        }
        q = self.sess.run(self.q, feedDict)
        actions = np.argmax(q, axis=-1)
        return actions, [self.currBeta, self.betaSelect]  # return an int and extra data that needs to be fed to buffer.
    def Encode(self, state):
        return self.sess.run(self.latent, {self.states_: state})

    def RNDPredictionError(self, state):
        random, predictor = self.sess.run([self.rnd_random, self.rnd_predictor], {self.states_: state})
        return np.linalg.norm(random - predictor)

    def GetIntrinsicReward(self, state_prev, state, episode=None, step=0):
        #Clearing the episodic buffer
        if step == 0:
            self.episodicMemory.Clear()
            self.episodicMemory.Add(self.Encode(state_prev))

        #Adding Sample to the buffer
        encodedState = self.Encode(state)
        stateError = self.RNDPredictionError(state)
        self.sharedBuffer.AddError(stateError)

        #####Calculating the episodic reward factor
        #-finding k nearest neighbors in buffer and distance to them
        K = self.episodicMemory.NearestNeighborsDist(encodedState, num=5)
        r_episodic = 1.0 / np.sqrt(K + 0.001)

        #Calculating alpha
        stateError_Average, stateError_std = self.sharedBuffer.GetMuSigma()
        alpha = 1.0 + (stateError - stateError_Average) / stateError_std

        #Calculating the intrinsic reward
        r_i = r_episodic * min(max(1.0, alpha), 5.0)

        #adding the sample to the buffer after nearest neighbors has been calculated.
        self.episodicMemory.Add(encodedState)

        return r_i

    def Update(self, HPs, episode=0, statistics=True):
        """
        The main update function for NGU. Samples trajectories from the shared
        buffer, pushes gradient updates to the network, and recomputes priorities.
        """
        #Process the data from the buffer
        samples, num = self.sharedBuffer.Sample()

        if num < self.HPs["BatchSize"]:
            return

        priorities = []
        for traj in samples:
            if len(traj[0]) <= 5:
                continue
            batches = len(traj[0]) // self.HPs["MinibatchSize"] + 1
            s = np.array_split(traj[0], batches)
            a_his = np.array_split(np.asarray(traj[1]).reshape(-1), batches)
            r = np.array_split(np.asarray(traj[2]).reshape(-1), batches)
            s_next = np.array_split(traj[3], batches)
            done = np.array_split(traj[4], batches)
            bandit_one_hot = np.array_split(traj[8], batches)
            action_past = np.array_split(traj[5], batches)
            reward_i_past = np.array_split(traj[6], batches)
            reward_e_past = np.array_split(traj[7], batches)

            for epoch in range(self.HPs["Epochs"]):
                for i in range(batches):
                    #Create a feedDict from the buffer
                    if len(np.squeeze(np.asarray(s[i])).shape) == 3:
                        continue
                    feedDict = {
                        self.states_: np.squeeze(np.asarray(s[i])),
                        self.next_states_: np.squeeze(np.asarray(s_next[i])),
                        self.actions_: np.squeeze(np.asarray(a_his[i])),
                        self.rewards_: np.squeeze(np.asarray(r[i])),
                        self.done_: np.squeeze(np.asarray(done[i], dtype=float)),
                        self.bandit_one_hot: np.squeeze(np.asarray(bandit_one_hot[i])),
                        self.action_past: np.squeeze(np.asarray(action_past[i])),
                        self.reward_i_past: np.squeeze(np.asarray(reward_i_past[i])),
                        self.reward_e_past: np.squeeze(np.asarray(reward_e_past[i])),
                    }
                    out = self.sess.run(self.update_ops + self.losses + self.grads, feedDict)  # local grads applied to global net.
                    out = np.array_split(out, 3)
                    losses = out[1]
                    grads = out[2]

                    for i, loss in enumerate(losses):
                        self.loss_MA[i].append(loss)

                    for i, grads_i in enumerate(grads):
                        total_counter = 1
                        vanish_counter = 0
                        for grad in grads_i:
                            total_counter += np.prod(grad.shape)
                            vanish_counter += (np.absolute(grad) < 1e-8).sum()
                        self.grad_MA[i].append(vanish_counter / total_counter)

                    ent = self.sess.run(self.entropy, feedDict)  # local grads applied to global net.
                    entropy = np.average(np.asarray(ent))
                    self.entropy_MA.append(entropy)

            feedDict = {
                self.states_: np.squeeze(np.asarray(traj[0])),
                self.next_states_: np.squeeze(np.asarray(traj[3])),
                self.actions_: traj[1],
                self.rewards_: traj[2],
                self.done_: np.squeeze(np.asarray(traj[4], dtype=float)),
                self.bandit_one_hot: np.asarray(traj[8]),
                self.action_past: np.squeeze(np.asarray(traj[5])),
                self.reward_i_past: np.squeeze(np.asarray(traj[6])),
                self.reward_e_past: np.squeeze(np.asarray(traj[7])),
            }
            priorities.append(self.sess.run(self.td_error, feedDict))

        self.sharedBuffer.UpdatePriorities(priorities)

    def GetStatistics(self):
        dict = {}
        for i, label in enumerate(self.labels):
            dict["Training Results/Vanishing Gradient " + label] = self.grad_MA[i]()
            dict["Training Results/Loss " + label] = self.loss_MA[i]()
        dict["Training Results/Entropy"] = self.entropy_MA()
        return dict

    def PushToBuffer(self):
        #Packaging samples in manner that requires modification on the learner end.
        #Estimating TD Difference to give priority to the data.
        for traj in range(len(self.buffer)):
            s = self.buffer[traj][0]
            a_his = np.asarray(self.buffer[traj][1]).reshape(-1)
            r = np.asarray(self.buffer[traj][2]).reshape(-1)
            s_next = self.buffer[traj][3]
            done = self.buffer[traj][4]
            action_past = self.buffer[traj][5]
            reward_i_past = self.buffer[traj][6]
            reward_e_past = self.buffer[traj][7]
            bandit_one_hot = self.buffer[traj][9]

            #Create a feedDict from the buffer
            feedDict = {
                self.states_: np.squeeze(np.asarray(s)),
                self.next_states_: np.squeeze(np.asarray(s_next)),
                self.actions_: np.squeeze(np.asarray(a_his)),
                self.rewards_: np.squeeze(np.asarray(r)),
                self.done_: np.squeeze(np.asarray(done, dtype=float)),
                self.bandit_one_hot: np.squeeze(np.asarray(bandit_one_hot)),
                self.action_past: np.squeeze(np.asarray(action_past)),
                self.reward_i_past: np.squeeze(np.asarray(reward_i_past)),
                self.reward_e_past: np.squeeze(np.asarray(reward_e_past)),
            }
            priority = self.sess.run(self.td_error, feedDict)

            self.sharedBuffer.AddTrajectory([
                s, a_his, r, s_next, done, action_past, reward_i_past,
                reward_e_past, bandit_one_hot
            ], priority)

        self.sharedBuffer.PrioritizeandPruneSamples(2048)
        self.ClearTrajectory()
        self.sess.run(self.pull_ops)

    @property
    def getVars(self):
        return self.Model.getVars(self.scope)
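# Standalone restatement of the intrinsic-reward combination computed in
# NGU.GetIntrinsicReward above: an episodic novelty term from the k-nearest-
# neighbour distance in embedding space, scaled by a life-long (RND) modulator
# that is clipped to [1, L]. Provided as a hedged reference, not a drop-in.
import numpy as np


def intrinsic_reward(knn_distance, rnd_error, rnd_error_mean, rnd_error_std, L=5.0, c=0.001):
    r_episodic = 1.0 / np.sqrt(knn_distance + c)                 # episodic novelty
    alpha = 1.0 + (rnd_error - rnd_error_mean) / rnd_error_std   # life-long modulator
    return r_episodic * min(max(1.0, alpha), L)                  # clip modulator to [1, L]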
class MAML(Method): def __init__(self,sess,settings,netConfigOverride,stateShape,actionSize,nTrajs=1,**kwargs): """ Initializes a training method for a neural network. Parameters ---------- Model : Keras Model Object A Keras model object with fully defined layers and a call function. See examples in networks module. sess : Tensorflow Session Initialized Tensorflow session stateShape : list List of integers of the inputs shape size. Ex [39,39,6] actionSize : int Output size of the network. nTrajs : int (Optional) Number that specifies the number of trajectories to be created for collecting training data. scope : str (Optional) Name of the PPO method. Used to group and differentiate variables between other networks. Returns ------- N/A """ #Processing inputs self.actionSize = actionSize self.sess=sess self.HPs = settings["NetworkHPs"] self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize,scope="local") self.Model2 = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize,scope="global") self.scope =scope ="MAML" #Creating appropriate buffer for the method. self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)] with self.sess.as_default(), self.sess.graph.as_default(): with tf.name_scope("MAML"): #Placeholders if len(stateShape) == 4: self.s = tf.placeholder(tf.float32, [None]+stateShape[1:4], 'S') else: self.s = tf.placeholder(tf.float32, [None]+stateShape, 'S') self.a_his = tf.placeholder(tf.int32, [None, ], 'A') self.td_target_ = tf.placeholder(tf.float32, [None], 'Vtarget') self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold') self.old_log_logits_ = tf.placeholder(shape=[None, actionSize], dtype=tf.float32, name='old_logit_hold') #Initializing Netowrk I/O inputs = {"state":self.s} out = self.Model(inputs) _ = self.Model2(inputs) self.a_prob = out["actor"] self.v = out["critic"] self.log_logits = out["log_logits"] # Entropy def _log(val): return tf.log(tf.clip_by_value(val, 1e-10, 10.0)) entropy = self.entropy = -tf.reduce_mean(self.a_prob * _log(self.a_prob), name='entropy') # Critic Loss td_error = self.td_target_ - self.v critic_loss = self.critic_loss = tf.reduce_mean(tf.square(td_error), name='critic_loss') # Actor Loss action_OH = tf.one_hot(self.a_his, actionSize, dtype=tf.float32) log_prob = tf.reduce_sum(self.log_logits * action_OH, 1) old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH, 1) # Clipped surrogate function ratio = tf.exp(log_prob - old_log_prob) surrogate = ratio * self.advantage_ clipped_surrogate = tf.clip_by_value(ratio, 1-self.HPs["eps"], 1+self.HPs["eps"]) * self.advantage_ surrogate_loss = tf.minimum(surrogate, clipped_surrogate, name='surrogate_loss') actor_loss = self.actor_loss = -tf.reduce_mean(surrogate_loss, name='actor_loss') actor_loss = actor_loss - entropy * self.HPs["EntropyBeta"] loss = actor_loss + critic_loss * self.HPs["CriticBeta"] # Build Trainer if self.HPs["Optimizer"] == "Adam": self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"]) self.metaOptimizer = tf.keras.optimizers.Adam(self.HPs["Meta LR"]) elif self.HPs["Optimizer"] == "RMS": self.optimizer = tf.keras.optimizers.RMSProp(self.HPs["LR"]) self.metaOptimizer = tf.keras.optimizers.RMSProp(self.HPs["Meta LR"]) elif self.HPs["Optimizer"] == "Adagrad": self.optimizer = tf.keras.optimizers.Adagrad(self.HPs["LR"]) self.metaOptimizer = tf.keras.optimizers.Adagrad(self.HPs["Meta LR"]) elif self.HPs["Optimizer"] == 
"Adadelta": self.optimizer = tf.keras.optimizers.Adadelta(self.HPs["LR"]) self.metaOptimizer = tf.keras.optimizers.Adadelta(self.HPs["Meta LR"]) elif self.HPs["Optimizer"] == "Adamax": self.optimizer = tf.keras.optimizers.Adamax(self.HPs["LR"]) self.metaOptimizer = tf.keras.optimizers.Adamax(self.HPs["Meta LR"]) elif self.HPs["Optimizer"] == "Nadam": self.optimizer = tf.keras.optimizers.Nadam(self.HPs["LR"]) self.metaOptimizer = tf.keras.optimizers.Nadam(self.HPs["Meta LR"]) elif self.HPs["Optimizer"] == "SGD": self.optimizer = tf.keras.optimizers.SGD(self.HPs["LR"]) self.metaOptimizer = tf.keras.optimizers.SGD(self.HPs["Meta LR"]) elif self.HPs["Optimizer"] == "Amsgrad": self.optimizer = tf.keras.optimizers.Nadam(self.HPs["LR"],amsgrad=True) self.metaOptimizer = tf.keras.optimizers.Nadam(self.HPs["Meta LR"],amsgrad=True) else: print("Not selected a proper Optimizer") exit() vars1 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope+'/local') self.gradients = self.optimizer.get_gradients(loss, vars1) self.update_ops = self.optimizer.apply_gradients(zip(self.gradients, vars1)) with tf.name_scope("MetaUpdater"): vars2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope+'/global') self.meta_update_ops = self.metaOptimizer.apply_gradients(zip(self.gradients, vars2)) with tf.name_scope('sync'): self.pull_params_op = [l_p.assign(g_p) for l_p, g_p in zip(vars1,vars2)] #Creating variables for logging. self.EntropyMA = MovingAverage(400) self.CriticLossMA = MovingAverage(400) self.ActorLossMA = MovingAverage(400) self.GradMA = MovingAverage(400) self.counter = 0 def next_task(self): if self.counter > 3: self.counter = 0 # self.sess.run(self.update_op) self.sess.run(self.pull_params_op) return True else: return False def GetAction(self, state, episode=1,step=0): """ Method to run data through the neural network. Parameters ---------- state : np.array Data with the shape of [N, self.stateShape] where N is number of smaples Returns ------- actions : list[int] List of actions based on NN output. extraData : list List of data that is passed to the execution code to be bundled with state data. """ try: probs,log_logits,v = self.sess.run([self.a_prob,self.log_logits,self.v], {self.s: state}) except ValueError: probs,log_logits,v = self.sess.run([self.a_prob,self.log_logits,self.v], {self.s: np.expand_dims(state,axis=0)}) actions = np.array([np.random.choice(probs.shape[1], p=prob / sum(prob)) for prob in probs]) return actions, [v,log_logits] def Update(self,episode=0): """ Process the buffer and backpropagates the loses through the NN. Parameters ---------- HPs : dict Hyperparameters for training. Returns ------- N/A """ samples=0 for i in range(len(self.buffer)): samples +=len(self.buffer[i]) if samples < self.HPs["BatchSize"]: return for traj in range(len(self.buffer)): #Finding if there are more than 1 done in the sequence. Clipping values if required. td_target, advantage = self.ProcessBuffer(traj) batches = len(self.buffer[traj][0])//self.HPs["MinibatchSize"]+1 s = np.array_split( self.buffer[traj][0], batches) a_his = np.array_split( np.asarray(self.buffer[traj][1]).reshape(-1), batches) td_target_ = np.array_split( td_target, batches) advantage_ = np.array_split( np.reshape(advantage, [-1]), batches) old_log_logits_ = np.array_split( np.reshape(self.buffer[traj][6], [-1,self.actionSize]), batches) #Create a dictionary with all of the samples? #Use a sampler to feed the update operation? #Staging Buffer inputs into the entries to run through the network. 
# print(td_target) for epoch in range(self.HPs["Epochs"]): for i in range(batches): feed_dict = {self.s: np.squeeze(np.asarray(s[i])), self.a_his: np.asarray(a_his[i]), self.td_target_:np.asarray(td_target_[i]), self.advantage_: np.asarray(advantage_[i]), self.old_log_logits_: np.asarray(old_log_logits_[i])} # aLoss= self.sess.run([self.actor_loss], feed_dict) if self.counter == 3: aLoss, cLoss, entropy,grads, _ = self.sess.run([self.actor_loss,self.critic_loss,self.entropy,self.gradients,self.meta_update_ops], feed_dict) else: aLoss, cLoss, entropy,grads, _ = self.sess.run([self.actor_loss,self.critic_loss,self.entropy,self.gradients,self.update_ops], feed_dict) self.EntropyMA.append(entropy) self.CriticLossMA.append(cLoss) self.ActorLossMA.append(aLoss) total_counter = 0 vanish_counter = 0 for grad in grads: total_counter += np.prod(grad.shape) vanish_counter += (np.absolute(grad)<1e-8).sum() self.GradMA.append(vanish_counter/total_counter) self.counter += 1 self.ClearTrajectory() def GetStatistics(self): dict = {"Training Results/Entropy":self.EntropyMA(), "Training Results/Loss Critic":self.CriticLossMA(), "Training Results/Loss Actor":self.ActorLossMA(), "Training Results/Vanishing Gradient":self.GradMA(),} return dict def ProcessBuffer(self,traj): """ Process the buffer and backpropagates the loses through the NN. Parameters ---------- Model : HPs Hyperparameters for training. traj : Trajectory Data stored by the neural network. clip : list[bool] List where the trajectory has finished. Returns ------- td_target : list List Temporal Difference Target for particular states. advantage : list List of advantages for particular actions. """ split_loc = [i+1 for i, x in enumerate(self.buffer[traj][4]) if x] reward_lists = np.split(self.buffer[traj][2],split_loc) value_lists = np.split(self.buffer[traj][5],split_loc) td_target=[]; advantage=[] for rew,value in zip(reward_lists,value_lists): td_target_i, advantage_i = gae(rew.reshape(-1),value.reshape(-1).tolist(),0,self.HPs["Gamma"],self.HPs["lambda"]) td_target.extend(td_target_i); advantage.extend( advantage_i) return td_target, advantage @property def getVars(self): return self.Model.getVars("PPO_Training")
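# Hedged sketch of the update schedule implied by MAML.Update and MAML.next_task
# above: three inner updates on the local weights, a fourth applied through the
# meta optimizer to the global weights, then the local copy is re-synchronised.
# `sample_tasks`, `make_env`, and `collect_rollout` are hypothetical helpers
# used only for illustration.
def maml_outer_loop(method, sample_tasks, make_env, collect_rollout, episodes_per_task=4):
    for task in sample_tasks():
        env = make_env(task)
        for _ in range(episodes_per_task):
            collect_rollout(method, env)   # fill method.buffer with trajectories
            method.Update()                # inner update; counter == 3 triggers the meta update
        method.next_task()                 # reset counter and pull global weights into the local copy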
class PPO(Method): def __init__(self, sess, settings, netConfigOverride, stateShape, actionSize, nTrajs=1, **kwargs): """ Initializes a training method for a neural network. Parameters ---------- Model : Keras Model Object A Keras model object with fully defined layers and a call function. See examples in networks module. sess : Tensorflow Session Initialized Tensorflow session stateShape : list List of integers of the inputs shape size. Ex [39,39,6] actionSize : int Output size of the network. HPs : dict Dictionary that contains all hyperparameters to be used in the methods training nTrajs : int (Optional) Number that specifies the number of trajectories to be created for collecting training data. scope : str (Optional) Name of the PPO method. Used to group and differentiate variables between other networks. Returns ------- N/A """ #Processing inputs self.actionSize = actionSize self.sess = sess self.HPs = settings["NetworkHPs"] self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"], netConfigOverride=netConfigOverride, actionSize=actionSize) scope = "PPO" #Creating appropriate buffer for the method. self.buffer = [Trajectory(depth=8) for _ in range(nTrajs)] with self.sess.as_default(), self.sess.graph.as_default(): with tf.name_scope(scope): #Placeholders self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S') self.a_his = tf.placeholder(tf.int32, [ None, ], 'A') self.td_target_ = tf.placeholder(tf.float32, [None], 'TD_target') self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold') self.old_log_logits_ = tf.placeholder(shape=[None, actionSize], dtype=tf.float32, name='old_logit_hold') #Initializing Netowrk I/O inputs = {"state": self.s} out = self.Model(inputs) self.a_prob = out["actor"] self.v = out["critic"] self.log_logits = out["log_logits"] # Entropy def _log(val): return tf.log(tf.clip_by_value(val, 1e-10, 10.0)) entropy = self.entropy = -tf.reduce_mean( self.a_prob * _log(self.a_prob), name='entropy') # Critic Loss td_error = self.td_target_ - self.v critic_loss = self.critic_loss = tf.reduce_mean( tf.square(td_error), name='critic_loss') # Actor Loss action_OH = tf.one_hot(self.a_his, actionSize, dtype=tf.float32) log_prob = tf.reduce_sum(self.log_logits * action_OH, 1) old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH, 1) # Clipped surrogate function ratio = tf.exp(log_prob - old_log_prob) surrogate = ratio * self.advantage_ clipped_surrogate = tf.clip_by_value( ratio, 1 - self.HPs["eps"], 1 + self.HPs["eps"]) * self.advantage_ surrogate_loss = tf.minimum(surrogate, clipped_surrogate, name='surrogate_loss') actor_loss = self.actor_loss = -tf.reduce_mean( surrogate_loss, name='actor_loss') loss = self.actor_loss + self.critic_loss * self.HPs[ "CriticBeta"] # Build Trainer if self.HPs["Optimizer"] == "Adam": self.optimizerA = tf.keras.optimizers.Adam( self.HPs["LR Actor"]) self.optimizerE = tf.keras.optimizers.Adam( self.HPs["LR Entropy"]) elif self.HPs["Optimizer"] == "RMS": self.optimizerA = tf.keras.optimizers.RMSProp( self.HPs["LR Actor"]) self.optimizerE = tf.keras.optimizers.RMSProp( self.HPs["LR Entropy"]) elif self.HPs["Optimizer"] == "Adagrad": self.optimizerA = tf.keras.optimizers.Adagrad( self.HPs["LR Actor"]) self.optimizerE = tf.keras.optimizers.Adagrad( self.HPs["LR Entropy"]) elif self.HPs["Optimizer"] == "Adadelta": self.optimizerA = tf.keras.optimizers.Adadelta( self.HPs["LR Actor"]) self.optimizerE = tf.keras.optimizers.Adadelta( self.HPs["LR Entropy"]) elif self.HPs["Optimizer"] == "Adamax": 
self.optimizerA = tf.keras.optimizers.Adamax( self.HPs["LR Actor"]) self.optimizerE = tf.keras.optimizers.Adamax( self.HPs["LR Entropy"]) elif self.HPs["Optimizer"] == "Nadam": self.optimizerA = tf.keras.optimizers.Nadam( self.HPs["LR Actor"]) self.optimizerE = tf.keras.optimizers.Nadam( self.HPs["LR Entropy"]) elif self.HPs["Optimizer"] == "SGD": self.optimizerA = tf.keras.optimizers.SGD( self.HPs["LR Actor"]) self.optimizerE = tf.keras.optimizers.SGD( self.HPs["LR Entropy"]) elif self.HPs["Optimizer"] == "Amsgrad": self.optimizerA = tf.keras.optimizers.Nadam( self.HPs["LR Actor"], amsgrad=True) self.optimizerE = tf.keras.optimizers.Nadam( self.HPs["LR Entropy"], amsgrad=True) else: print("Not selected a proper Optimizer") exit() a_params = self.Model.GetVariables("Actor") c_params = self.Model.GetVariables("Critic") self.gradients_a = self.optimizerA.get_gradients( loss, self.Model.trainable_variables) # capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in self.gradients_a] self.update_op_a = self.optimizerA.apply_gradients( zip(self.gradients_a, self.Model.trainable_variables)) entropy_loss = -self.entropy * self.HPs["EntropyBeta"] self.gradients_e = self.optimizerE.get_gradients( entropy_loss, a_params) self.update_op_e = self.optimizerE.apply_gradients( zip(self.gradients_e, a_params)) total_counter = 1 vanish_counter = 0 for gradient in self.gradients_a: total_counter += np.prod(gradient.shape) stuff = tf.reduce_sum( tf.cast( tf.math.less_equal(tf.math.abs(gradient), tf.constant(1e-8)), tf.int32)) vanish_counter += stuff self.vanishing_gradient = vanish_counter / total_counter self.update_ops = [self.update_op_a, self.update_op_e] self.logging_ops = [ self.actor_loss, self.critic_loss, self.entropy, tf.reduce_mean(self.advantage_), tf.reduce_mean(ratio), loss, self.vanishing_gradient ] self.labels = [ "Loss Actor", "Loss Critic", "Entropy", "Advantage", "PPO Ratio", "Loss Total", "Vanishing Gradient" ] self.logging_MA = [ MovingAverage(400) for i in range(len(self.logging_ops)) ] self.count_MA = MovingAverage(400) def GetAction(self, state, episode=1, step=0): """ Method to run data through the neural network. Parameters ---------- state : np.array Data with the shape of [N, self.stateShape] where N is number of smaples Returns ------- actions : list[int] List of actions based on NN output. extraData : list List of data that is passed to the execution code to be bundled with state data. """ try: probs, log_logits, v = self.sess.run( [self.a_prob, self.log_logits, self.v], {self.s: state}) except ValueError: probs, log_logits, v = self.sess.run( [self.a_prob, self.log_logits, self.v], {self.s: np.expand_dims(state, axis=0)}) actions = np.array([ np.random.choice(probs.shape[1], p=prob / sum(prob)) for prob in probs ]) confid = -np.mean(probs * np.log(probs), axis=1) if step == 0: self.store_actions = actions self.old_confid = confid self.count = 0 return actions, [v, log_logits, True] else: if confid < self.old_confid: # compare inverse entropy self.old_confid = confid self.store_actions = actions self.count_MA.append(self.count) self.count = 0 return actions, [v, log_logits, True] else: if self.count >= 4: self.old_confid = np.maximum( self.old_confid + self.HPs["ConfidenceAnnealing"], self.HPs["MinConfidence"]) self.count += 1 return self.store_actions, [v, log_logits, False] def Update(self, episode=0): """ Process the buffer and backpropagates the loses through the NN. Parameters ---------- HPs : dict Hyperparameters for training. 
Returns ------- N/A """ #Counting number of samples. samples = 0 for i in range(len(self.buffer)): samples += len(self.buffer[i]) if samples < self.HPs["BatchSize"]: return for traj in range(len(self.buffer)): td_target_hier, advantage_hier, actions_hier, ll_hier, s_hier = self.ProcessBuffer( traj) for epoch in range(self.HPs["Epochs"]): for batch in MultiBatchDivider([ s_hier, actions_hier, td_target_hier, advantage_hier, ll_hier ], self.HPs["MinibatchSize"]): #Staging Buffer inputs into the entries to run through the network. feedDict = { self.s: np.asarray(batch[0]).squeeze(), self.a_his: np.asarray(batch[1]).squeeze(), self.td_target_: np.asarray(batch[2]).squeeze(), self.advantage_: np.reshape(batch[3], [-1]), self.old_log_logits_: np.asarray(batch[4]).squeeze() } out = self.sess.run( self.update_ops + self.logging_ops, feedDict) # local grads applied to global net. logging = out[len(self.update_ops):] for i, log in enumerate(logging): self.logging_MA[i].append(log) self.ClearTrajectory() def GetStatistics(self): dict = {} for i, label in enumerate(self.labels): dict["Training Results/" + label] = self.logging_MA[i]() dict["Training Results/Average Traj Length"] = self.count_MA() return dict def ProcessBuffer(self, traj): """ Process the buffer and backpropagates the loses through the NN. Parameters ---------- Model : HPs Hyperparameters for training. traj : Trajectory Data stored by the neural network. Returns ------- td_target : list List Temporal Difference Target for particular states. advantage : list List of advantages for particular actions. """ # Split into different episodes based on the "done" signal. Assumes that episode terminates at done. # Cannot account for instances where there are multiple done signals in a row. split_loc = [i + 1 for i, x in enumerate(self.buffer[traj][4]) if x] # reward_lists = np.split(self.buffer[traj][2],split_loc) # value_lists = np.split(self.buffer[traj][5],split_loc) # # td_target=[]; advantage=[] # for rew,value in zip(reward_lists,value_lists): # td_target_i, advantage_i = gae(rew.reshape(-1).tolist(),value.reshape(-1).tolist(),0,self.HPs["Gamma"],self.HPs["lambda"]) # td_target.extend(td_target_i); advantage.extend( advantage_i) # return td_target, advantage reward_lists = np.split(self.buffer[traj][2], split_loc[:-1]) #Stuff needed for the HL_S_lists = np.split(self.buffer[traj][0], split_loc[:-1]) HL_Critic_lists = np.split(self.buffer[traj][5], split_loc[:-1]) HL_Logits_lists = np.split(self.buffer[traj][6], split_loc[:-1]) HL_action_lists = np.split(self.buffer[traj][1], split_loc[:-1]) HL_flag_lists = np.split(self.buffer[traj][7], split_loc[:-1]) td_target_hier = [] advantage_hier = [] ll = [] actions = [] s = [] for rew, HL_critic, HL_ll, HL_a, HL_flag, HL_s in zip( reward_lists, HL_Critic_lists, HL_Logits_lists, HL_action_lists, HL_flag_lists, HL_S_lists): #Colapsing different trajectory lengths for the hierarchical controller split_loc_ = [i for i, x in enumerate(HL_flag[:-1]) if x][1:] rew_hier = [np.sum(l) for l in np.split(rew, split_loc_)] value_hier = [l[0] for l in np.split(HL_critic, split_loc_)] actions.extend([l[0] for l in np.split(HL_a, split_loc_)]) ll.extend([l[0] for l in np.split(HL_ll, split_loc_)]) s.extend([l[0] for l in np.split(HL_s, split_loc_)]) #Calculating the td_target and advantage for the hierarchical controller. 
td_target_i_, advantage_i_ = gae( np.asarray(rew_hier).reshape(-1).tolist(), np.asarray(value_hier).reshape(-1).tolist(), 0, self.HPs["Gamma"], self.HPs["lambda"]) td_target_hier.extend(td_target_i_) advantage_hier.extend(advantage_i_) return td_target_hier, advantage_hier, actions, ll, s @property def getVars(self): return self.Model.getVars("PPO_Training")
class PPO(Method): def __init__(self,Model,sess,stateShape,actionSize,HPs,nTrajs=1,scope="PPO_Training"): """ Initializes a training method for a neural network. Parameters ---------- Model : Keras Model Object A Keras model object with fully defined layers and a call function. See examples in networks module. sess : Tensorflow Session Initialized Tensorflow session stateShape : list List of integers of the inputs shape size. Ex [39,39,6] actionSize : int Output size of the network. HPs : dict Dictionary that contains all hyperparameters to be used in the methods training nTrajs : int (Optional) Number that specifies the number of trajectories to be created for collecting training data. scope : str (Optional) Name of the PPO method. Used to group and differentiate variables between other networks. Returns ------- N/A """ #Processing inputs self.actionSize = actionSize self.sess=sess self.Model = Model #Creating appropriate buffer for the method. self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)] with self.sess.as_default(), self.sess.graph.as_default(): with tf.name_scope(scope): #Placeholders self.s = tf.placeholder(tf.float32, [None]+stateShape, 'S') self.a_his = tf.placeholder(tf.int32, [None, ], 'A') self.td_target_ = tf.placeholder(tf.float32, [None], 'TD_target') self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold') self.old_log_logits_ = tf.placeholder(shape=[None, actionSize], dtype=tf.float32, name='old_logit_hold') #Initializing Netowrk I/O inputs = {"state":self.s} out = self.Model(inputs) self.a_prob = out["actor"] self.v = out["critic"] self.log_logits = out["log_logits"] # Entropy def _log(val): return tf.log(tf.clip_by_value(val, 1e-10, 10.0)) entropy = self.entropy = -tf.reduce_mean(self.a_prob * _log(self.a_prob), name='entropy') # Critic Loss td_error = self.td_target_ - self.v critic_loss = self.critic_loss = tf.reduce_mean(tf.square(td_error), name='critic_loss') # Actor Loss action_OH = tf.one_hot(self.a_his, actionSize, dtype=tf.float32) log_prob = tf.reduce_sum(self.log_logits * action_OH, 1) old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH, 1) # Clipped surrogate function ratio = tf.exp(log_prob - old_log_prob) surrogate = ratio * self.advantage_ clipped_surrogate = tf.clip_by_value(ratio, 1-HPs["eps"], 1+HPs["eps"]) * self.advantage_ surrogate_loss = tf.minimum(surrogate, clipped_surrogate, name='surrogate_loss') actor_loss = self.actor_loss = -tf.reduce_mean(surrogate_loss, name='actor_loss') actor_loss = actor_loss - entropy * HPs["EntropyBeta"] loss = actor_loss + critic_loss * HPs["CriticBeta"] # Build Trainer self.optimizer = tf.keras.optimizers.Adam(HPs["Critic LR"]) self.gradients = self.optimizer.get_gradients(loss, tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)) self.update_ops = self.optimizer.apply_gradients(zip(self.gradients, tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope))) #Creating variables for logging. self.EntropyMA = MovingAverage(400) self.CriticLossMA = MovingAverage(400) self.ActorLossMA = MovingAverage(400) self.GradMA = MovingAverage(400) self.HPs = HPs def GetAction(self, state, episode=1,step=0): """ Method to run data through the neural network. Parameters ---------- state : np.array Data with the shape of [N, self.stateShape] where N is number of smaples Returns ------- actions : list[int] List of actions based on NN output. extraData : list List of data that is passed to the execution code to be bundled with state data. 
""" try: probs,log_logits,v = self.sess.run([self.a_prob,self.log_logits,self.v], {self.s: state}) except ValueError: probs,log_logits,v = self.sess.run([self.a_prob,self.log_logits,self.v], {self.s: np.expand_dims(state,axis=0)}) actions = np.array([np.random.choice(probs.shape[1], p=prob / sum(prob)) for prob in probs]) if step % self.HPs["FS"] == 0: self.store_actions = actions return actions, [v,log_logits] else: return self.store_actions, [v,log_logits] def Update(self,HPs,episode=0): """ Process the buffer and backpropagates the loses through the NN. Parameters ---------- HPs : dict Hyperparameters for training. Returns ------- N/A """ #Counting number of samples. samples=0 for i in range(len(self.buffer)): samples +=len(self.buffer[i]) if samples < self.HPs["BatchSize"]: return for traj in range(len(self.buffer)): td_target, advantage = self.ProcessBuffer(traj) batches = len(self.buffer[traj][0])//self.HPs["MinibatchSize"]+1 s = np.array_split( self.buffer[traj][0], batches) a_his = np.array_split( np.asarray(self.buffer[traj][1]).reshape(-1), batches) td_target_ = np.array_split( td_target, batches) advantage_ = np.array_split( np.reshape(advantage, [-1]), batches) old_log_logits_ = np.array_split( np.reshape(self.buffer[traj][6], [-1,self.actionSize]), batches) for epoch in range(self.HPs["Epochs"]): for i in range(batches): #Staging Buffer inputs into the entries to run through the network. feed_dict = {self.s: np.squeeze(s[i]), self.a_his: a_his[i], self.td_target_:td_target_[i], self.advantage_: advantage_[i], self.old_log_logits_: old_log_logits_[i]} aLoss, cLoss, entropy,grads, _ = self.sess.run([self.actor_loss,self.critic_loss,self.entropy,self.gradients,self.update_ops], feed_dict) self.EntropyMA.append(entropy) self.CriticLossMA.append(cLoss) self.ActorLossMA.append(aLoss) total_counter = 0 vanish_counter = 0 for grad in grads: total_counter += np.prod(grad.shape) vanish_counter += (np.absolute(grad)<1e-8).sum() self.GradMA.append(vanish_counter/total_counter) self.ClearTrajectory() def GetStatistics(self): dict = {"Training Results/Entropy":self.EntropyMA(), "Training Results/Loss Critic":self.CriticLossMA(), "Training Results/Loss Actor":self.ActorLossMA(), "Training Results/Vanishing Gradient":self.GradMA(),} return dict def ProcessBuffer(self,traj): """ Process the buffer and backpropagates the loses through the NN. Parameters ---------- Model : HPs Hyperparameters for training. traj : Trajectory Data stored by the neural network. Returns ------- td_target : list List Temporal Difference Target for particular states. advantage : list List of advantages for particular actions. """ # Split into different episodes based on the "done" signal. Assumes that episode terminates at done. # Cannot account for instances where there are multiple done signals in a row. split_loc = [i+1 for i, x in enumerate(self.buffer[traj][4]) if x] reward_lists = np.split(self.buffer[traj][2],split_loc) value_lists = np.split(self.buffer[traj][5],split_loc) td_target=[]; advantage=[] for rew,value in zip(reward_lists,value_lists): td_target_i, advantage_i = gae(rew.reshape(-1).tolist(),value.reshape(-1).tolist(),0,self.HPs["Gamma"],self.HPs["lambda"]) td_target.extend(td_target_i); advantage.extend( advantage_i) return td_target, advantage @property def getVars(self): return self.Model.getVars("PPO_Training")
class DQN(Method):
    def __init__(self, sharedModel, sess, stateShape, actionSize, scope, HPs, globalAC=None, nTrajs=1):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and Critic Network to be used for the purpose of RL.
        """
        #Placeholders
        self.sess = sess
        self.scope = scope
        self.Model = sharedModel
        self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):
                if len(stateShape) == 4:
                    self.states_ = tf.placeholder(shape=[None] + stateShape[1:4], dtype=tf.float32, name='states')
                    self.next_states_ = tf.placeholder(shape=[None] + stateShape[1:4], dtype=tf.float32, name='next_states')
                else:
                    self.states_ = tf.placeholder(shape=[None] + stateShape, dtype=tf.float32, name='states')
                    self.next_states_ = tf.placeholder(shape=[None] + stateShape, dtype=tf.float32, name='next_states')
                self.actions_ = tf.placeholder(shape=[None], dtype=tf.int32, name='actions_hold')
                self.rewards_ = tf.placeholder(shape=[None], dtype=tf.float32, name='rewards_hold')
                self.done_ = tf.placeholder(shape=[None], dtype=tf.float32, name='done_hold')

                input = {"state": self.states_}
                out = self.Model(input)
                self.q = out["Q"]

                out2 = self.Model({"state": self.next_states_})
                q_next = out2["Q"]

                with tf.name_scope('current_Q'):
                    oh_action = tf.one_hot(self.actions_, actionSize, dtype=tf.float32)  # [?, num_agent, action_size]
                    curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action), axis=-1)  # [?, num_agent]

                with tf.name_scope('target_Q'):
                    max_next_q = tf.reduce_max(q_next, axis=-1)
                    td_target = self.rewards_ + HPs["Gamma"] * max_next_q * (1. - self.done_)

                with tf.name_scope('td_error'):
                    loss = tf.keras.losses.MSE(td_target, curr_q)
                    softmax_q = tf.nn.softmax(curr_q)
                    self.entropy = -tf.reduce_mean(softmax_q * tf.log(softmax_q))
                    self.loss = total_loss = loss + HPs["EntropyBeta"] * self.entropy

                optimizer = tf.keras.optimizers.Adam(HPs["LearningRate"])
                self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)
                self.gradients = optimizer.get_gradients(total_loss, self.params)
                self.update_op = optimizer.apply_gradients(zip(self.gradients, self.params))

                self.grads = [self.gradients]
                self.losses = [self.loss]
                self.update_ops = [self.update_op]

                self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))]
                self.loss_MA = [MovingAverage(400) for i in range(len(self.losses))]
                self.entropy_MA = MovingAverage(400)
                self.labels = ["Critic"]
                self.HPs = HPs

    def GetAction(self, state, episode, step):
        """
        Contains the code to run the network based on an input.
        """
        if len(state.shape) == 3:
            state = state[np.newaxis, :]
        if len(state.shape) == 1:
            state = state[np.newaxis, :]

        q = self.sess.run(self.q, {self.states_: state})

        if "Exploration" in self.HPs:
            if self.HPs["Exploration"] == "EGreedy":
                prob = 0.1 + 0.9 * (np.exp(-episode / self.HPs["ExplorationDecay"]))
                if random.uniform(0, 1) < prob:
                    actions = random.randint(0, 4)
                else:
                    actions = np.argmax(q, axis=-1)
        else:
            actions = np.argmax(q, axis=-1)
        return actions, []  # return an int and extra data that needs to be fed to buffer.

    def Update(self, HPs, episode=0, statistics=True):
        """
        The main update function for DQN. Processes the data in the buffer and
        pushes gradient updates to the network.
        """
        #Process the data from the buffer
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):
            batches = len(self.buffer[traj][0]) // self.HPs["MinibatchSize"] + 1
            s = np.array_split(self.buffer[traj][0], batches)
            a_his = np.array_split(np.asarray(self.buffer[traj][1]).reshape(-1), batches)
            r = np.array_split(np.asarray(self.buffer[traj][2]).reshape(-1), batches)
            s_next = np.array_split(self.buffer[traj][3], batches)
            done = np.array_split(self.buffer[traj][4], batches)

            for epoch in range(self.HPs["Epochs"]):
                for i in range(batches):
                    #Create a feedDict from the buffer
                    feedDict = {
                        self.states_: np.squeeze(np.asarray(s[i])),
                        self.next_states_: np.squeeze(np.asarray(s_next[i])),
                        self.actions_: np.squeeze(np.asarray(a_his[i])),
                        self.rewards_: np.squeeze(np.asarray(r[i])),
                        self.done_: np.squeeze(np.asarray(done[i], dtype=float))
                    }
                    out = self.sess.run(self.update_ops + self.losses + self.grads, feedDict)  # local grads applied to global net.
                    out = np.array_split(out, 3)
                    losses = out[1]
                    grads = out[2]

                    for i, loss in enumerate(losses):
                        self.loss_MA[i].append(loss)

                    for i, grads_i in enumerate(grads):
                        total_counter = 1
                        vanish_counter = 0
                        for grad in grads_i:
                            total_counter += np.prod(grad.shape)
                            vanish_counter += (np.absolute(grad) < 1e-8).sum()
                        self.grad_MA[i].append(vanish_counter / total_counter)

                    ent = self.sess.run(self.entropy, feedDict)  # local grads applied to global net.
                    entropy = np.average(np.asarray(ent))
                    self.entropy_MA.append(entropy)

        self.ClearTrajectory()

    def GetStatistics(self):
        dict = {}
        for i, label in enumerate(self.labels):
            dict["Training Results/Vanishing Gradient " + label] = self.grad_MA[i]()
            dict["Training Results/Loss " + label] = self.loss_MA[i]()
        dict["Training Results/Entropy"] = self.entropy_MA()
        return dict

    def ProcessBuffer(self, HPs, traj):
        """
        Process the buffer to calculate td_target.

        Parameters
        ----------
        HPs : dict
            Hyperparameters for training.
        traj : Trajectory
            Data stored by the neural network.

        Returns
        -------
        td_target : list
            List of Temporal Difference Targets for particular states.
        advantage : list
            List of advantages for particular actions.
        """
        pass

    @property
    def getVars(self):
        return self.Model.getVars(self.scope)
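# Hedged restatement of the epsilon-greedy schedule used in DQN.GetAction above:
# the exploration probability decays exponentially from 1.0 toward a floor of 0.1
# as episodes progress.
import numpy as np


def egreedy_epsilon(episode, decay):
    return 0.1 + 0.9 * np.exp(-episode / decay)

# For example, with decay=100: episode 0 -> 1.0, episode 100 -> ~0.43, episode 500 -> ~0.11.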
class NGU(Method): def __init__(self, sharedModel, sess, stateShape, actionSize, scope, HPs, sharedBuffer, globalNet=None, nTrajs=1, LSTM=False): """ Initializes I/O placeholders used for Tensorflow session runs. Initializes and Actor and Critic Network to be used for the purpose of RL. """ #Placeholders self.LSTM = LSTM self.sess = sess self.scope = scope self.Model = sharedModel self.sharedBuffer = sharedBuffer self.HPs = HPs self.actionSize = actionSize #Creating the different values of beta and gamma def sigmoid(x): return 1 / (1 + np.exp(-x)) self.betas = [] for i in range(self.HPs["N"]): if i == 0: self.betas.append(0.0) elif i == self.HPs["N"] - 1: self.betas.append(self.HPs["betaMax"]) else: self.betas.append(self.HPs["betaMax"] * sigmoid( (2.0 * float(i) + 2.0 - self.HPs["N"]) / (self.HPs["N"] - 2.0))) self.gammas = [] for i in range(self.HPs["N"]): if i == 0: self.gammas.append(self.HPs["Gamma0"]) elif i < 7: self.gammas.append(self.HPs["Gamma1"] + (self.HPs["Gamma0"] - self.HPs["Gamma1"]) * sigmoid(10.0 * (2.0 * float(i) - 6.0) / 6.0)) elif i == 7: self.gammas.append(self.HPs["Gamma1"]) else: self.gammas.append(1.0 - np.exp(( (self.HPs["N"] - 9.0) * np.log(1.0 - self.HPs["Gamma1"]) + (float(i) - 8.0) * np.log(1 - self.HPs["Gamma2"])) / (self.HPs["N"] - 9.0))) self.gammas = np.asarray(self.gammas) with self.sess.as_default(), self.sess.graph.as_default(): with tf.name_scope(scope): #Specifying placeholders for Tensorflow Networks if len(stateShape) == 4: self.states_ = tf.placeholder(shape=[None] + stateShape[1:4], dtype=tf.float32, name='states') self.next_states_ = tf.placeholder(shape=[None] + stateShape[1:4], dtype=tf.float32, name='next_states') else: self.states_ = tf.placeholder(shape=[None] + stateShape, dtype=tf.float32, name='states') self.next_states_ = tf.placeholder(shape=[None] + stateShape, dtype=tf.float32, name='next_states') self.actions_ = tf.placeholder(shape=[None], dtype=tf.int32, name='actions_hold') self.done_ = tf.placeholder(shape=[None], dtype=tf.float32, name='done_hold') self.rewards_ = tf.placeholder(shape=[None], dtype=tf.float32, name='total_reward') self.bandit_one_hot = tf.placeholder( shape=[None, self.HPs["N"]], dtype=tf.int32, name='beta_bandit') self.action_past = tf.placeholder(shape=[None], dtype=tf.int32, name='action_past') self.reward_i_past = tf.placeholder(shape=[None], dtype=tf.float32, name='reward_i_past') self.reward_e_past = tf.placeholder(shape=[None], dtype=tf.float32, name='reward_e_past') self.reward_i_current = tf.placeholder(shape=[None], dtype=tf.float32, name='reward_i_current') self.reward_e_current = tf.placeholder(shape=[None], dtype=tf.float32, name='reward_e_current') # Creating the IO for the entire network input = { "state": self.states_, "state_next": self.next_states_, "bandit_one_hot": self.bandit_one_hot, "action_past": self.action_past, "reward_i_past": self.reward_i_past, "reward_e_past": self.reward_e_past, } out = self.Model(input) self.q = out["Q"] self.a_pred = out["action_prediction"] self.latent = out["latent_space"] self.rnd_random = out["RND_random"] self.rnd_predictor = out["RND_predictor"] input2 = { "state": self.next_states_, "state_next": self.next_states_, #Used as a placeholder in network "bandit_one_hot": self.bandit_one_hot, "action_past": self.actions_, "reward_i_past": self.reward_i_current, "reward_e_past": self.reward_e_current, } out2 = self.Model(input2) q_next = out["Q"] with tf.name_scope('q_learning'): #Current Q oh_action = tf.one_hot( self.actions_, actionSize, dtype=tf.float32) # 
[?, num_agent, action_size] curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action), axis=-1) # [?, num_agent] #Next Q max_next_q = tf.reduce_max(q_next, axis=-1) #TD Error td_target = self.rewards_ + tf.reduce_sum( tf.cast(self.bandit_one_hot, tf.float32) * self.gammas) * max_next_q * (1. - self.done_) # td_target = self.rewards_ + HPs["Gamma"] * max_next_q * (1. - self.done_) self.td_error = loss = tf.keras.losses.MSE( td_target, curr_q) softmax_q = tf.nn.softmax(curr_q) self.entropy = -tf.reduce_mean( softmax_q * tf.log(softmax_q + 1e-5)) self.loss = loss + HPs["EntropyBeta"] * self.entropy self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope) self.int_params = self.Model.GetVariables("Intrinsic") self.critic_params = self.Model.GetVariables("Critic") if globalNet is None: #Creating the Training instance of the network. with tf.name_scope('embedding_network'): oh_action = tf.one_hot( self.actions_, actionSize, dtype=tf.float32) # [?, num_agent, action_size] self.embedding_loss = tf.reduce_mean( tf.keras.losses.MSE(oh_action, self.a_pred)) with tf.name_scope('life_long_curiosity'): self.llc_loss = tf.reduce_mean( tf.keras.losses.MSE(self.rnd_random, self.rnd_predictor)) loss_critic = tf.reduce_mean(self.loss) optimizer = tf.keras.optimizers.Adam(HPs["LearningRate"]) self.gradients = optimizer.get_gradients( loss_critic, self.critic_params) self.update_op = optimizer.apply_gradients( zip(self.gradients, self.critic_params)) # loss_intrinsic = tf.reduce_mean(self.llc_loss + self.embedding_loss) optimizer2 = tf.keras.optimizers.Adam( HPs["LearningRateEmbedding"]) self.embedding_gradients = optimizer2.get_gradients( loss_intrinsic, self.int_params) self.embedding_update = optimizer2.apply_gradients( zip(self.embedding_gradients, self.int_params)) total_counter = 1 vanish_counter = 0 for gradient in self.gradients: total_counter += np.prod(gradient.shape) stuff = tf.reduce_sum( tf.cast( tf.math.less_equal(tf.math.abs(gradient), tf.constant(1e-8)), tf.int32)) vanish_counter += stuff self.vanishing_gradient = vanish_counter / total_counter # self.vanishing_gradient = 0 self.update_ops = [self.update_op, self.embedding_update] # self.update_ops=[self.update_op] self.logging_ops = [ loss, self.embedding_loss, self.llc_loss, self.entropy, self.vanishing_gradient ] self.logging_MA = [ MovingAverage(400) for i in range(len(self.logging_ops)) ] self.labels = [ "Total Loss", "Embedding Loss", "Life Long Curiosity Loss", "Entropy", "Vanishing Gradient" ] else: #Creating a Actor Instance for the Network. #Creating the Episodic Memory, which compares samples self.episodicMemory = EpisodicMemory() #Creating Local Buffer to store data until it is ready to push to sample buffer self.buffer = [Trajectory(depth=10) for _ in range(nTrajs)] #Creating a pull operation to synch network parameters with tf.name_scope('sync'): self.pull_params_op = [ l_p.assign(g_p) for l_p, g_p in zip(self.params, globalNet.params) ] self.pull_ops = [self.pull_params_op] self.alpha = MovingAverage(2000) self.K = MovingAverage(2000) def GetAction(self, state, a_past, r_i_past, r_e_past, episode=None, step=0): """ Contains the code to run the network based on an input. 
""" #Fixing the state shape if there is somethinf wrong if len(state.shape) == 3: state = state[np.newaxis, :] if len(state.shape) == 1: state = state[np.newaxis, :] #Selecting new beta if the begining of the episode #Also bootstrapping rewards/actions for the if step == 0: currBeta = random.randint(0, self.HPs["N"] - 1) oh = np.zeros(self.HPs["N"]) oh[currBeta] = 1 self.betaSelect = oh self.currBeta = self.betas[currBeta] feedDict = { self.states_: state, self.bandit_one_hot: self.betaSelect[np.newaxis, :], self.action_past: np.asarray(a_past), self.reward_i_past: np.asarray(r_i_past), self.reward_e_past: np.asarray(r_e_past) } q = self.sess.run(self.q, feedDict) if "Exploration" in self.HPs: if self.HPs["Exploration"] == "EGreedy": prob = self.HPs["ExploreSS"] + (1 - self.HPs["ExploreSS"]) * ( np.exp(-episode / self.HPs["ExplorationDecay"])) if random.uniform(0, 1) < prob: actions = np.array( [random.randint(0, self.actionSize - 1)]) else: actions = np.argmax(q, axis=-1) else: actions = np.argmax(q, axis=-1) else: actions = np.argmax(q, axis=-1) return actions, [ self.currBeta, self.betaSelect ] # return a int and extra data that needs to be fed to buffer. def Encode(self, state): if len(state.shape) == 3: state = state[np.newaxis, :] if len(state.shape) == 1: state = state[np.newaxis, :] return self.sess.run(self.latent, {self.states_: state}) def RNDPredictionError(self, state): if len(state.shape) == 3: state = state[np.newaxis, :] if len(state.shape) == 1: state = state[np.newaxis, :] random, predictor = self.sess.run( [self.rnd_random, self.rnd_predictor], {self.states_: state}) return np.linalg.norm(random - predictor) def GetIntrinsicReward(self, state_prev, state, episode=None, step=0): #Clearing the episodic buffer if step == 0: self.episodicMemory.Clear() self.episodicMemory.Add(self.Encode(state_prev)) #Adding Sample to the buffer encodedState = self.Encode(state) stateError = self.RNDPredictionError(state) self.sharedBuffer.AddError(stateError) #####Calculating the episodic reward factor #-finding k nearest neighbors in buffer and distance to them K = self.episodicMemory.NearestNeighborsDist( encodedState, num=self.HPs["NearestNeighbors"]) r_episodic = 1.0 / np.sqrt(K + 0.001) #Calculating alpha stateError_Average, stateError_std = self.sharedBuffer.GetMuSigma() alpha = 1.0 + (stateError - stateError_Average) / stateError_std self.alpha.append(alpha) self.K.append(K) #Calculating the intrinsic reward r_i = r_episodic * min(max(1.0, alpha), self.HPs["L"]) #adding the sample to the buffer after nearest neighbors has been calculated. self.episodicMemory.Add(encodedState) return r_i def Update(self, HPs, episode=0, statistics=True): """ """ #Process the data from the buffer samples, num = self.sharedBuffer.Sample() if num < self.HPs["BatchSize"]: return priorities = [] for traj in samples: if len(traj[0]) <= 5: continue for epoch in range(self.HPs["Epochs"]): #Create a feedDict from the buffer feedDict = { self.states_: np.squeeze(np.asarray(traj[0])), self.actions_: np.squeeze(np.asarray(traj[1])), self.rewards_: np.squeeze(np.asarray(traj[2])), self.next_states_: np.squeeze(np.asarray(traj[3])), self.done_: np.squeeze(np.asarray(traj[4], dtype=float)), self.action_past: np.squeeze(np.asarray(traj[5])), self.reward_i_past: np.squeeze(np.asarray(traj[6])), self.reward_e_past: np.squeeze(np.asarray(traj[7])), self.bandit_one_hot: np.squeeze(np.asarray(traj[8])), } out = self.sess.run( self.update_ops + self.logging_ops, feedDict) # local grads applied to global net. 
logging = out[len(self.update_ops):] for i, log in enumerate(logging): self.logging_MA[i].append(log) def GetStatistics(self): dict = {} for i, label in enumerate(self.labels): dict["Training Results/" + label] = self.logging_MA[i]() return dict def GetWorkerStatistics(self): dict = {} dict["Training Results/Alpha"] = self.alpha() dict["Training Results/K"] = self.K() return dict def PushToBuffer(self): self.sess.run(self.pull_ops) #Packaging samples in manner that requires modification on the learner end. #Estimating TD Difference to give priority to the data. for traj in range(len(self.buffer)): # g,s_n=MultiStepDiscountProcessing(np.asarray(self.buffer[traj][2]),self.buffer[traj][3],np.sum(self.buffer[traj][9][0]*self.gammas),self.HPs["MultiStep"]) g, s_n = MultiStepDiscountProcessing( np.asarray(self.buffer[traj][2]), self.buffer[traj][3], np.sum(self.buffer[traj][9][0] * self.gammas), self.HPs["MultiStep"]) batches = len( self.buffer[traj][0]) // self.HPs["MinibatchSize"] + 1 s = np.array_split(self.buffer[traj][0], batches) a_his = np.array_split(self.buffer[traj][1], batches) r = np.array_split(np.asarray(g), batches) s_next = np.array_split(s_n, batches) done = np.array_split(self.buffer[traj][4], batches) action_past = np.array_split(self.buffer[traj][5], batches) reward_i_past = np.array_split(self.buffer[traj][6], batches) reward_e_past = np.array_split(self.buffer[traj][7], batches) bandit_one_hot = np.array_split(self.buffer[traj][9], batches) for i in range(batches): feedDict = { self.states_: np.squeeze(np.asarray(s[i])), self.next_states_: np.squeeze(np.asarray(s_next[i])), self.actions_: np.squeeze(np.asarray(a_his[i])), self.rewards_: np.squeeze(np.asarray(r[i])), self.done_: np.squeeze(np.asarray(done[i], dtype=float)), self.bandit_one_hot: np.squeeze(np.asarray(bandit_one_hot[i])), self.action_past: np.squeeze(np.asarray(action_past[i])), self.reward_i_past: np.squeeze(np.asarray(reward_i_past[i])), self.reward_e_past: np.squeeze(np.asarray(reward_e_past[i])), } priority = self.sess.run(self.td_error, feedDict) self.sharedBuffer.AddTrajectory([ s[i], a_his[i], r[i], s_next[i], done[i], action_past[i], reward_i_past[i], reward_e_past[i], bandit_one_hot[i] ], priority) self.ClearTrajectory() def PrioritizeBuffer(self): #Updating the network weights before calculating new priorities self.sess.run(self.pull_ops) #Getting the data that needs to be assigned a new priority. trajs = self.sharedBuffer.GetReprioritySamples() priority = [] for traj in trajs: feedDict = { self.states_: np.squeeze(np.asarray(traj[0])), self.actions_: np.squeeze(np.asarray(traj[1])), self.rewards_: np.squeeze(np.asarray(traj[2])), self.next_states_: np.squeeze(np.asarray(traj[3])), self.done_: np.squeeze(np.asarray(traj[4], dtype=float)), self.action_past: np.squeeze(np.asarray(traj[5])), self.reward_i_past: np.squeeze(np.asarray(traj[6])), self.reward_e_past: np.squeeze(np.asarray(traj[7])), self.bandit_one_hot: np.squeeze(np.asarray(traj[8])), } priority.append(self.sess.run(self.td_error, feedDict)) #Calculating the priority. self.sharedBuffer.UpdatePriorities(priority) #Pushing the priorities back to the buffer self.sharedBuffer.PrioritizeandPruneSamples(2048) @property def getVars(self): return self.Model.getVars(self.scope)
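#GetIntrinsicReward above composes two signals: an episodic novelty reward based on the
#k-nearest-neighbor distance in the embedding memory, and a life-long modulator alpha
#derived from the running RND prediction-error statistics. The function below is a
#minimal, self-contained NumPy sketch of that composition only; the name
#intrinsic_reward_sketch and its arguments (knn_distance, rnd_error, rnd_mu, rnd_sigma)
#are illustrative placeholders, not part of the NGU class itself.
import numpy as np

def intrinsic_reward_sketch(knn_distance, rnd_error, rnd_mu, rnd_sigma, L=5.0, eps=0.001):
    """knn_distance: mean distance to the k nearest embeddings in episodic memory.
    rnd_error: RND predictor error for the current state.
    rnd_mu, rnd_sigma: running mean/std of the RND error (as tracked by the shared buffer).
    L: clipping limit on the life-long modulator (HPs["L"] in the class above)."""
    #Episodic reward: larger when the state is far from anything seen this episode.
    r_episodic = 1.0 / np.sqrt(knn_distance + eps)
    #Life-long modulator: how surprising the state is relative to the running statistics.
    alpha = 1.0 + (rnd_error - rnd_mu) / rnd_sigma
    #Clip alpha to [1, L] so the life-long term can only amplify the episodic reward.
    return r_episodic * min(max(1.0, alpha), L)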
class A3C(Method): def __init__(self, sharedModel, sess, stateShape, actionSize, scope, HPs, globalAC=None, nTrajs=1): """ Initializes I/O placeholders used for Tensorflow session runs. Initializes and Actor and Critic Network to be used for the purpose of RL. """ #Placeholders self.sess = sess self.scope = scope self.Model = sharedModel if len(stateShape) == 4: self.s = tf.placeholder(tf.float32, [None] + stateShape[1:4], 'S') else: self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S') self.a_his = tf.placeholder(tf.int32, [ None, ], 'A') self.v_target = tf.placeholder(tf.float32, [None], 'Vtarget') input = {"state": self.s} out = self.Model(input) self.a_prob = out["actor"] self.v = out["critic"] if globalAC is None: # get global network with tf.variable_scope(scope): self.a_params = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.Model.scope + '/Shared') + tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=self.Model.scope + '/Actor') self.c_params = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.Model.scope + '/Shared') + tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=self.Model.scope + '/Critic') else: # local net, calculate losses self.buffer = [Trajectory(depth=6) for _ in range(nTrajs)] with tf.variable_scope(scope + "_update"): self.a_params = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.Model.scope + '/Shared') + tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=self.Model.scope + '/Actor') self.c_params = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.Model.scope + '/Shared') + tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=self.Model.scope + '/Critic') print(self.c_params) td = tf.subtract(self.v_target, self.v, name='TD_error') with tf.name_scope('c_loss'): self.c_loss = tf.reduce_mean(tf.square(td)) with tf.name_scope('a_loss'): log_prob = tf.reduce_sum( tf.log(self.a_prob + 1e-5) * tf.one_hot(self.a_his, actionSize, dtype=tf.float32), axis=1, keep_dims=True) exp_v = log_prob * tf.stop_gradient(td) entropy = -tf.reduce_sum( self.a_prob * tf.log(self.a_prob + 1e-5), axis=1, keep_dims=True) # encourage exploration self.entropy = entropy self.exp_v = HPs["EntropyBeta"] * entropy + exp_v self.a_loss = tf.reduce_mean(-self.exp_v) with tf.name_scope('local_grad'): self.a_grads = tf.gradients(self.a_loss, self.a_params) self.c_grads = tf.gradients(self.c_loss, self.c_params) with tf.name_scope('sync'): with tf.name_scope('pull'): self.pull_a_params_op = [ l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params) ] self.pull_c_params_op = [ l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params) ] with tf.name_scope('push'): self.update_a_op = tf.train.AdamOptimizer( HPs["Actor LR"]).apply_gradients( zip(self.a_grads, globalAC.a_params)) self.update_c_op = tf.train.AdamOptimizer( HPs["Critic LR"]).apply_gradients( zip(self.c_grads, globalAC.c_params)) self.update_ops = [ self.update_a_op, self.update_c_op, ] self.pull_ops = [ self.pull_a_params_op, self.pull_c_params_op, ] self.grads = [ self.a_grads, self.c_grads, ] self.losses = [ self.a_loss, self.c_loss, ] self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))] self.loss_MA = [MovingAverage(400) for i in range(len(self.grads))] self.entropy_MA = MovingAverage(400) self.labels = [ "Actor", "Critic", ] self.HPs = HPs def GetAction(self, state): """ Contains the code to run the network based on an input. 
""" if len(state.shape) == 3: state = state[np.newaxis, :] if len(state.shape) == 1: state = state[np.newaxis, :] probs, v = self.sess.run( [self.a_prob, self.v], {self.s: state}) # get probabilities for all actions actions = np.array([ np.random.choice(probs.shape[1], p=prob / sum(prob)) for prob in probs ]) return actions, [ v ] # return a int and extra data that needs to be fed to buffer. def Update(self, HPs, episode=0, statistics=True): """ The main update function for A3C. The function pushes gradients to the global AC Network. The second function is to Pull """ #Process the data from the buffer samples = 0 for i in range(len(self.buffer)): samples += len(self.buffer[i]) if samples < self.HPs["BatchSize"]: return for traj in range(len(self.buffer)): td_target, _ = self.ProcessBuffer(HPs, traj) batches = len( self.buffer[traj][0]) // self.HPs["MinibatchSize"] + 1 s = np.array_split(np.squeeze(self.buffer[traj][0]), batches) a_his = np.array_split( np.asarray(self.buffer[traj][1]).reshape(-1), batches) v_target = np.array_split(td_target, batches) for epoch in range(self.HPs["Epochs"]): for i in range(batches): #Create a feedDict from the buffer feedDict = { self.s: s[i], self.a_his: a_his[i], self.v_target: v_target[i], } if not statistics: self.sess.run(self.update_ops, feedDict) else: #Perform update operations out = self.sess.run( self.update_ops + self.losses + self.grads, feedDict) # local grads applied to global net. out = np.array_split(out, 3) losses = out[1] grads = out[2] for i, loss in enumerate(losses): self.loss_MA[i].append(loss) for i, grads_i in enumerate(grads): total_counter = 0 vanish_counter = 0 for grad in grads_i: total_counter += np.prod(grad.shape) vanish_counter += (np.absolute(grad) < 1e-8).sum() self.grad_MA[i].append(vanish_counter / total_counter) ent = self.sess.run( self.entropy, feedDict) # local grads applied to global net. entropy = np.average(np.asarray(ent)) self.entropy_MA.append(entropy) self.ClearTrajectory() self.sess.run( self.pull_ops) # global variables synched to the local net. def GetStatistics(self): dict = {} for i, label in enumerate(self.labels): dict["Training Results/Vanishing Gradient " + label] = self.grad_MA[i]() dict["Training Results/Loss " + label] = self.loss_MA[i]() dict["Training Results/Entropy"] = self.entropy_MA() return dict def ProcessBuffer(self, HPs, traj): """ Process the buffer to calculate td_target. Parameters ---------- Model : HPs Hyperparameters for training. traj : Trajectory Data stored by the neural network. Returns ------- td_target : list List Temporal Difference Target for particular states. advantage : list List of advantages for particular actions. 
""" split_loc = [i + 1 for i, x in enumerate(self.buffer[traj][4]) if x] reward_lists = np.split(self.buffer[traj][2], split_loc) value_lists = np.split(self.buffer[traj][5], split_loc) td_target = [] advantage = [] for rew, value in zip(reward_lists, value_lists): td_target_i, advantage_i = gae(rew.reshape(-1), value.reshape(-1).tolist(), 0, self.HPs["Gamma"], self.HPs["lambda"]) td_target.extend(td_target_i) advantage.extend(advantage_i) return td_target, advantage @property def getVars(self): return self.Model.getVars(self.scope) @property def getAParams(self): return tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.Model.scope + '/Shared') + tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=self.Model.scope + 'Actor') @property def getCParams(self): return tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.Model.scope + '/Shared') + tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=self.Model.scope + '/Critic')
class ApeX(Method): def __init__(self,sharedModel,sess,stateShape,actionSize,scope,HPs,sharedBuffer,globalAC=None,nTrajs=1): """ Initializes I/O placeholders used for Tensorflow session runs. Initializes and Actor and Critic Network to be used for the purpose of RL. """ self.sess=sess self.scope=scope self.sharedBuffer=sharedBuffer #Creating the General I/O of the network self.Model = sharedModel with self.sess.as_default(), self.sess.graph.as_default(): with tf.name_scope(scope): if len(stateShape) == 4: self.states_ = tf.placeholder(shape=[None]+stateShape[1:4], dtype=tf.float32, name='states') self.next_states_ = tf.placeholder(shape=[None]+stateShape[1:4], dtype=tf.float32, name='next_states') else: self.states_ = tf.placeholder(shape=[None]+stateShape, dtype=tf.float32, name='states') self.next_states_ = tf.placeholder(shape=[None]+stateShape, dtype=tf.float32, name='next_states') self.actions_ = tf.placeholder(shape=[None], dtype=tf.int32, name='actions_hold') self.rewards_ = tf.placeholder(shape=[None], dtype=tf.float32, name='rewards_hold') self.done_ = tf.placeholder(shape=[None], dtype=tf.float32, name='done_hold') input = {"state":self.states_} out = self.Model(input) self.q = out["Q"] out2 = self.Model({"state":self.next_states_}) q_next = out2["Q"] # GettingVariables for the specified network. with tf.variable_scope(scope): self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope) # Creating the Global Actor that does all of the learning if globalAC is None: with tf.variable_scope(scope+"_update"): with tf.name_scope('current_Q'): oh_action = tf.one_hot(self.actions_, actionSize, dtype=tf.float32) # [?, num_agent, action_size] curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action), axis=-1) # [?, num_agent] with tf.name_scope('target_Q'): max_next_q = tf.reduce_max(q_next, axis=-1) td_target = self.rewards_ + HPs["Gamma"] * max_next_q * (1. - self.done_) with tf.name_scope('td_error'): self.td_error=loss = tf.keras.losses.MSE(td_target, curr_q) softmax_q = tf.nn.softmax(curr_q) self.entropy = -tf.reduce_mean(softmax_q * tf.log(softmax_q)) self.loss=total_loss = loss + HPs["EntropyBeta"] * self.entropy optimizer = tf.keras.optimizers.Adam(HPs["LearningRate"]) self.gradients = optimizer.get_gradients(total_loss, self.params) self.update_op = optimizer.apply_gradients(zip(self.gradients, self.params)) self.grads=[self.gradients] self.losses=[self.loss] self.update_ops=[self.update_op] self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))] self.loss_MA = [MovingAverage(400) for i in range(len(self.grads))] self.entropy_MA = MovingAverage(400) self.labels = ["Critic",] # Creating the local networks that only pull parameters and run experiments. else: #Creating local Buffer that self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)] with tf.variable_scope(scope+"_priority"): with tf.name_scope('current_Q'): oh_action = tf.one_hot(self.actions_, actionSize, dtype=tf.float32) # [?, num_agent, action_size] curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action), axis=-1) # [?, num_agent] with tf.name_scope('target_Q'): max_next_q = tf.reduce_max(q_next, axis=-1) td_target = self.rewards_ + HPs["Gamma"] * max_next_q * (1. 
- self.done_) with tf.name_scope('td_error'): self.td_error = tf.keras.losses.MSE(td_target, curr_q) with tf.name_scope('sync'): self.pull_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.params, globalAC.params)] self.pull_ops = [self.pull_params_op] self.HPs = HPs def GetAction(self, state,episode,step): """ Contains the code to run the network based on an input. """ if len(state.shape) == 3: state = state[np.newaxis, :] if len(state.shape) == 1: state = state[np.newaxis, :] q = self.sess.run(self.q, {self.states_: state}) if "Exploration" in self.HPs: if self.HPs["Exploration"]=="EGreedy": prob = 0.1 + 0.9*(np.exp(-episode/self.HPs["ExplorationDecay"])) if random.uniform(0, 1) < prob: actions = random.randint(0,4) else: actions = np.argmax(q, axis=-1) else: actions = np.argmax(q, axis=-1) return actions ,[] # return a int and extra data that needs to be fed to buffer. def Update(self,HPs,episode=0,statistics=True): """ The main update function for A3C. The function pushes gradients to the global AC Network. The second function is to Pull """ #Process the data from the buffer samples,num = self.sharedBuffer.Sample() if num < self.HPs["BatchSize"]: return priorities = [] for traj in samples: if len(traj[0]) <= 5: continue batches = num//self.HPs["MinibatchSize"]+1 s = np.array_split(traj[0], batches) a_his = np.array_split( traj[1], batches) r = np.array_split( traj[2], batches) s_next = np.array_split( traj[3], batches) done = np.array_split( traj[4], batches) for epoch in range(self.HPs["Epochs"]): for i in range(batches): #Create a feedDict from the buffer if len(np.squeeze(np.asarray(s[i])).shape)==3: continue feedDict = { self.states_ : np.squeeze(np.asarray(s[i])), self.next_states_ : np.squeeze(np.asarray(s_next[i])), self.actions_ : a_his[i], self.rewards_ : r[i], self.done_ : np.squeeze(np.asarray(done[i],dtype=float)) } out = self.sess.run(self.update_ops+self.losses+self.grads, feedDict) # local grads applied to global net. out = np.array_split(out,3) losses = out[1] grads = out[2] for i,loss in enumerate(losses): self.loss_MA[i].append(loss) for i,grads_i in enumerate(grads): total_counter = 1 vanish_counter = 0 for grad in grads_i: total_counter += np.prod(grad.shape) vanish_counter += (np.absolute(grad)<1e-8).sum() self.grad_MA[i].append(vanish_counter/total_counter) ent = self.sess.run(self.entropy, feedDict) # local grads applied to global net. entropy = np.average(np.asarray(ent)) self.entropy_MA.append(entropy) #Updating the Priorities of the samples. feedDict = { self.states_ : np.squeeze(np.asarray(traj[0])), self.next_states_ : np.squeeze(np.asarray(traj[3])), self.actions_ : traj[1], self.rewards_ : traj[2], self.done_ : np.squeeze(np.asarray(traj[4],dtype=float)) } priorities.append(self.sess.run(self.td_error, feedDict)) self.sharedBuffer.UpdatePriorities(priorities) def PushToBuffer(self): #Packaging samples in manner that requires modification on the learner end. #Estimating TD Difference to give priority to the data. 
        for traj in range(len(self.buffer)):
            s = self.buffer[traj][0]
            a_his = np.asarray(self.buffer[traj][1]).reshape(-1)
            r = np.asarray(self.buffer[traj][2]).reshape(-1)
            s_next = self.buffer[traj][3]
            done = self.buffer[traj][4]

            #Create a feedDict from the buffer
            feedDict = {
                self.states_: np.squeeze(np.asarray(s)),
                self.next_states_: np.squeeze(np.asarray(s_next)),
                self.actions_: np.squeeze(np.asarray(a_his)),
                self.rewards_: np.squeeze(np.asarray(r)),
                self.done_: np.squeeze(np.asarray(done, dtype=float))
            }
            #Estimating the TD error of the trajectory to use as its initial priority.
            priority = self.sess.run(self.td_error, feedDict)
            self.sharedBuffer.AddTrajectory([s, a_his, r, s_next, done], priority)

        self.sharedBuffer.PrioritizeandPruneSamples(2048)
        self.ClearTrajectory()
        self.sess.run(self.pull_ops)

    def GetStatistics(self):
        dict = {}
        for i, label in enumerate(self.labels):
            dict["Training Results/Vanishing Gradient " + label] = self.grad_MA[i]()
            dict["Training Results/Loss " + label] = self.loss_MA[i]()
        dict["Training Results/Entropy"] = self.entropy_MA()
        return dict

    def ProcessBuffer(self, HPs, traj):
        """
        Process the buffer to calculate td_target.

        Parameters
        ----------
        HPs : dict
            Hyperparameters for training.
        traj : Trajectory
            Data stored by the neural network.

        Returns
        -------
        td_target : list
            Temporal Difference Target for particular states.
        advantage : list
            List of advantages for particular actions.
        """
        split_loc = [i + 1 for i, x in enumerate(self.buffer[traj][4]) if x]
        reward_lists = np.split(self.buffer[traj][2], split_loc)
        value_lists = np.split(self.buffer[traj][5], split_loc)

        td_target = []
        advantage = []
        for rew, value in zip(reward_lists, value_lists):
            td_target_i, advantage_i = gae(rew.reshape(-1),
                                           value.reshape(-1).tolist(), 0,
                                           self.HPs["Gamma"],
                                           self.HPs["lambda"])
            td_target.extend(td_target_i)
            advantage.extend(advantage_i)
        return td_target, advantage

    @property
    def getVars(self):
        return self.Model.getVars(self.scope)

    @property
    def getAParams(self):
        return tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES,
            scope=self.Model.scope + '/Shared') + tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES,
                scope=self.Model.scope + '/Actor')

    @property
    def getCParams(self):
        return tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES,
            scope=self.Model.scope + '/Shared') + tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES,
                scope=self.Model.scope + '/Critic')
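#Both ApeX.PushToBuffer and ApeX.Update above rate samples for the shared priority buffer
#using the TD error of the Q-network (the class itself runs an MSE-based td_error op).
#The helpers below are illustrative NumPy sketches: td_error_priority_sketch computes the
#common per-transition absolute-TD-error variant of that priority, and egreedy_prob_sketch
#reproduces the exploration schedule used in GetAction. The function names and arguments
#are placeholders, not part of the class.
import numpy as np

def td_error_priority_sketch(q, q_next, actions, rewards, done, gamma):
    """Per-transition TD error usable as a replay priority.

    q:       [batch, actionSize] Q-values for the visited states.
    q_next:  [batch, actionSize] Q-values for the successor states.
    actions: [batch] integer actions that were taken.
    rewards: [batch] extrinsic rewards.
    done:    [batch] 1.0 where the episode terminated, else 0.0."""
    curr_q = q[np.arange(len(actions)), actions]
    td_target = rewards + gamma * np.max(q_next, axis=-1) * (1.0 - done)
    #Larger absolute TD error -> higher priority when the shared buffer sorts and prunes.
    return np.abs(td_target - curr_q)

def egreedy_prob_sketch(episode, decay, floor=0.1):
    """Exploration probability used in GetAction: decays from 1.0 toward `floor`."""
    return floor + (1.0 - floor) * np.exp(-episode / decay)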