def p_train(observPlaceHolderList, actionSpaceList, agentIndex, p_func, q_func, optimizer,
            grad_norm_clipping, ddpg, num_units=64, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        actionPlaceHolderList = [
            tf.placeholder(dtype=tf.float32, shape=[None] + [actionSpaceList[i].n], name="action" + str(i))
            for i in range(len(actionSpaceList))
        ]

        policyNetInput = observPlaceHolderList[agentIndex]  # personal observation
        policyOutputShape = int(actionSpaceList[agentIndex].n)
        policyTrainOutput = p_func(policyNetInput, policyOutputShape, scope="p_func", num_units=num_units)
        policyNetVariables = U.scope_vars(U.absolute_scope_name("p_func"))

        # Gumbel-softmax sampling of the policy logits: -log(-log(u)) is standard Gumbel noise
        sampleNoise = tf.random_uniform(tf.shape(policyTrainOutput), seed=0)
        actionSample = U.softmax(policyTrainOutput - tf.log(-tf.log(sampleNoise)), axis=-1)  # output of function act
        p_reg = tf.reduce_mean(tf.square(policyTrainOutput))

        # copy the action placeholder list and substitute this agent's sampled action
        actionInputPlaceHolderList = actionPlaceHolderList + []
        actionInputPlaceHolderList[agentIndex] = actionSample
        qNetInput = tf.concat(observPlaceHolderList + actionInputPlaceHolderList, 1)
        if ddpg:
            # DDPG variant: the critic only sees this agent's own observation and action
            qNetInput = tf.concat([observPlaceHolderList[agentIndex], actionSample], 1)

        q = q_func(qNetInput, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        # NOTE: the optimization step below is unchanged in the DDPG variant
        optimize_expr = U.minimize_and_clip(optimizer, loss, policyNetVariables, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=observPlaceHolderList + actionPlaceHolderList,
                           outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[observPlaceHolderList[agentIndex]], outputs=actionSample)
        p_values = U.function([observPlaceHolderList[agentIndex]], policyTrainOutput)

        # target network
        target_p = p_func(policyNetInput, int(actionSpaceList[agentIndex].n),
                          scope="target_p_func", num_units=num_units)
        targetNetVariables = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(policyNetVariables, targetNetVariables)

        uTarget = tf.random_uniform(tf.shape(target_p))
        target_act_sample = U.softmax(target_p - tf.log(-tf.log(uTarget)), axis=-1)
        target_act = U.function(inputs=[observPlaceHolderList[agentIndex]], outputs=target_act_sample)

        return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
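# ----------------------------------------------------------------------------
# Hedged usage sketch (illustrative only). `_mlpModel`, `obsShapeList`, and
# `actionSpaceList` (gym-style Discrete spaces exposing `.n`) are assumed names
# for this example, not part of the original code, and the sketch assumes the
# same `tf`, `layers`, and `U` modules imported elsewhere in this codebase.
# p_train reuses an already-built "q_func" variable scope (normally created by
# the companion q_train), which the sketch mimics by building a throwaway
# critic under the same agent scope first.
def _mlpModel(inputTensor, numOutputs, scope, reuse=False, num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        out = layers.fully_connected(inputTensor, num_outputs=num_units, activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu)
        return layers.fully_connected(out, num_outputs=numOutputs, activation_fn=None)


def _exampleBuildPTrain(obsShapeList, actionSpaceList, agentIndex=0):
    observPlaceHolderList = [tf.placeholder(tf.float32, [None, obsDim], name="observation" + str(i))
                             for i, obsDim in enumerate(obsShapeList)]
    agentScope = "agent" + str(agentIndex)
    with tf.variable_scope(agentScope):
        # create the "q_func" variables once so that p_train can reuse them
        actionPlaceHolders = [tf.placeholder(tf.float32, [None, space.n]) for space in actionSpaceList]
        qNetInput = tf.concat(observPlaceHolderList + actionPlaceHolders, 1)
        _mlpModel(qNetInput, 1, scope="q_func")
    act, train, update_target_p, debug = p_train(
        observPlaceHolderList, actionSpaceList, agentIndex,
        p_func=_mlpModel, q_func=_mlpModel,
        optimizer=tf.train.AdamOptimizer(1e-2),
        grad_norm_clipping=0.5, ddpg=False, scope=agentScope)
    return act, train, update_target_p, debug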
def __call__(self, layersWidths, agentID=None):
    agentStr = 'Agent' + str(agentID) if agentID is not None else ''
    print("Generating Actor NN Model with layers: {}".format(layersWidths))
    graph = tf.Graph()
    with graph.as_default():
        with tf.name_scope("inputs" + agentStr):
            states_ = tf.placeholder(tf.float32, [None, self.numStateSpace], name='states_')
            qVal_ = tf.placeholder(tf.float32, [None, 1], name='qVal_')
            tf.add_to_collection("states_", states_)
            tf.add_to_collection("qVal_", qVal_)

        with tf.name_scope("trainingParams" + agentStr):
            learningRate_ = tf.constant(0, dtype=tf.float32)
            tau_ = tf.constant(0, dtype=tf.float32)
            tf.add_to_collection("learningRate_", learningRate_)
            tf.add_to_collection("tau_", tau_)

        with tf.variable_scope("trainHidden" + agentStr):
            activation_ = states_
            for i in range(len(layersWidths)):
                # activation_ = layers.fully_connected(activation_, num_outputs=layersWidths[i], activation_fn=tf.nn.relu,
                #                                      scope="fc{}".format(i + 1), weights_initializer=tf.initializers.glorot_uniform(seed=0))
                activation_ = layers.fully_connected(activation_, num_outputs=layersWidths[i],
                                                     activation_fn=tf.nn.relu, scope="fc{}".format(i + 1))
            trainActivationOutput_ = layers.fully_connected(activation_, num_outputs=self.actionDim,
                                                            activation_fn=None,
                                                            scope="fc{}".format(len(layersWidths) + 1))

        with tf.variable_scope("targetHidden" + agentStr):
            activation_ = states_
            for i in range(len(layersWidths)):
                activation_ = layers.fully_connected(activation_, num_outputs=layersWidths[i],
                                                     activation_fn=tf.nn.relu, scope="fc{}".format(i + 1))
            targetActivationOutput_ = layers.fully_connected(activation_, num_outputs=self.actionDim,
                                                             activation_fn=None,
                                                             scope="fc{}".format(len(layersWidths) + 1))

        with tf.name_scope("updateParameters" + agentStr):
            trainParams_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='trainHidden')
            targetParams_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='targetHidden')
            updateParam_ = [targetParams_[i].assign((1 - tau_) * targetParams_[i] + tau_ * trainParams_[i])
                            for i in range(len(targetParams_))]
            tf.add_to_collection("trainParams_", trainParams_)
            tf.add_to_collection("targetParams_", targetParams_)
            tf.add_to_collection("updateParam_", updateParam_)

            hardReplaceTargetParam_ = [tf.assign(trainParam, targetParam)
                                       for trainParam, targetParam in zip(trainParams_, targetParams_)]
            tf.add_to_collection("hardReplaceTargetParam_", hardReplaceTargetParam_)

        with tf.name_scope("output" + agentStr):
            trainAction_ = tf.multiply(trainActivationOutput_, self.actionRange, name='trainAction_')
            targetAction_ = tf.multiply(targetActivationOutput_, self.actionRange, name='targetAction_')

            sampleNoiseTrain_ = tf.random_uniform(tf.shape(trainActivationOutput_))
            noisyTrainAction_ = U.softmax(trainActivationOutput_ - tf.log(-tf.log(sampleNoiseTrain_)), axis=-1)  # give this to q input
            tf.add_to_collection("sampleNoiseTrain_", sampleNoiseTrain_)

            sampleNoiseTarget_ = tf.random_uniform(tf.shape(targetActivationOutput_))
            noisyTargetAction_ = U.softmax(targetActivationOutput_ - tf.log(-tf.log(sampleNoiseTarget_)), axis=-1)

            tf.add_to_collection("trainAction_", trainAction_)
            tf.add_to_collection("targetAction_", targetAction_)
            tf.add_to_collection("noisyTrainAction_", noisyTrainAction_)
            tf.add_to_collection("noisyTargetAction_", noisyTargetAction_)

        with tf.name_scope("train" + agentStr):
            p_reg = tf.reduce_mean(tf.square(trainActivationOutput_))
            pg_loss = -tf.reduce_mean(qVal_)
            actorLoss_ = pg_loss + p_reg * 1e-3
            tf.summary.scalar("pg_loss", pg_loss)
            tf.add_to_collection("actorLoss_", actorLoss_)

            # optimizer = tf.train.AdamOptimizer(learningRate_, name='adamOptimizer')
            # grad_norm_clipping = 0.5
            # trainOpt_ = U.minimize_and_clip(optimizer, actorLoss_, trainParams_, grad_norm_clipping)

            optimizer = tf.train.AdamOptimizer(learningRate_, name='adamOptimizer')
            grad_norm_clipping = 0.5
            gradients = optimizer.compute_gradients(actorLoss_, var_list=trainParams_)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)

            with tf.name_scope("inspectGrad"):
                for grad_, var_ in gradients:
                    keyPrefix = "weightGradient" if "weights" in var_.name else "biasGradient"
                    tf.add_to_collection(f"{keyPrefix}/{var_.name}", grad_)
                gradients_ = [tf.reshape(grad_, [1, -1]) for grad_, var_ in gradients]
                allGradTensor_ = tf.concat(gradients_, 1)
                allGradNorm_ = tf.norm(allGradTensor_)
                tf.add_to_collection("allGradNorm", allGradNorm_)
                tf.summary.histogram("allGradients", allGradTensor_)
                tf.summary.scalar("allGradNorm", allGradNorm_)

            trainOpt_ = optimizer.apply_gradients(gradients)
            tf.add_to_collection("trainOpt_", trainOpt_)

        with tf.name_scope("summary" + agentStr):
            actorLossSummary_ = tf.identity(actorLoss_)
            tf.add_to_collection("actorLossSummary_", actorLossSummary_)
            tf.summary.scalar("actorLossSummary", actorLossSummary_)

        fullSummary = tf.summary.merge_all()
        tf.add_to_collection("summaryOps", fullSummary)

        actorSaver = tf.train.Saver(max_to_keep=None)
        tf.add_to_collection("saver", actorSaver)

        model = tf.Session(graph=graph)
        model.run(tf.global_variables_initializer())

        actorWriter = tf.summary.FileWriter('tensorBoard/actorOnlineDDPG' + agentStr, graph=graph)
        tf.add_to_collection("actorWriter", actorWriter)

        return actorWriter, model
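# ----------------------------------------------------------------------------
# Hedged usage sketch (illustrative only): builds the actor graph above and runs
# one forward pass through the noisy (Gumbel-softmax) action output. `buildActor`
# and `stateDim` are assumed names for this example; `buildActor` stands in for
# an instance of the surrounding builder class.
def _exampleSampleActorAction(buildActor, layersWidths, stateDim):
    actorWriter, actorModel = buildActor(list(layersWidths))
    graph = actorModel.graph
    states_ = graph.get_collection("states_")[0]
    noisyTrainAction_ = graph.get_collection("noisyTrainAction_")[0]
    dummyStates = [[0.0] * stateDim]  # a single all-zero state, just to exercise the graph
    return actorModel.run(noisyTrainAction_, feed_dict={states_: dummyStates})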
def __call__(self, layersWidths, agentID=None):
    agentStr = 'Agent' + str(agentID) if agentID is not None else ''
    graph = tf.Graph()
    with graph.as_default():
        with tf.variable_scope("inputs/" + agentStr):
            states_ = tf.placeholder(tf.float32, [None, self.numStateSpace], name='states_')
            nextStates_ = tf.placeholder(tf.float32, [None, self.numStateSpace], name='nextStates_')
            action_ = tf.stop_gradient(tf.placeholder(tf.float32, [None, self.actionDim]), name='action_')
            reward_ = tf.placeholder(tf.float32, [None, 1], name='reward_')
            tf.add_to_collection("states_", states_)
            tf.add_to_collection("nextStates_", nextStates_)
            tf.add_to_collection("action_", action_)
            tf.add_to_collection("reward_", reward_)

        with tf.variable_scope("trainingParams" + agentStr):
            learningRate_ = tf.constant(0, dtype=tf.float32)
            tau_ = tf.constant(0, dtype=tf.float32)
            gamma_ = tf.constant(0, dtype=tf.float32)
            tf.add_to_collection("learningRate_", learningRate_)
            tf.add_to_collection("tau_", tau_)
            tf.add_to_collection("gamma_", gamma_)

        with tf.variable_scope("actor/trainHidden/" + agentStr):
            actorTrainActivation_ = states_
            for i in range(len(layersWidths)):
                actorTrainActivation_ = layers.fully_connected(actorTrainActivation_, num_outputs=layersWidths[i],
                                                               activation_fn=tf.nn.relu)
            actorTrainActivation_ = layers.fully_connected(actorTrainActivation_, num_outputs=self.actionDim,
                                                           activation_fn=None)

        with tf.variable_scope("actor/targetHidden/" + agentStr):
            actorTargetActivation_ = nextStates_
            for i in range(len(layersWidths)):
                actorTargetActivation_ = layers.fully_connected(actorTargetActivation_, num_outputs=layersWidths[i],
                                                                activation_fn=tf.nn.relu)
            actorTargetActivation_ = layers.fully_connected(actorTargetActivation_, num_outputs=self.actionDim,
                                                            activation_fn=None)

        with tf.variable_scope("actorNetOutput/" + agentStr):
            trainAction_ = tf.multiply(actorTrainActivation_, self.actionRange, name='trainAction_')
            targetAction_ = tf.multiply(actorTargetActivation_, self.actionRange, name='targetAction_')

            # reshape the joint action to [batch, agent, action] so each controlled agent's
            # block stays aligned with its batch row, then Gumbel-softmax sample per agent
            trainActionSpread = []
            batchSize = tf.shape(trainAction_)[0]
            trainActionReshaped_ = tf.reshape(trainAction_, [batchSize, self.numAgentsToControl, self.singleActionDim])
            for i in range(self.numAgentsToControl):
                agentAction_ = trainActionReshaped_[:, i, :]
                sampleNoiseTrainAgent_ = tf.random_uniform(tf.shape(agentAction_))
                agentNoisyTrainAction_ = U.softmax(agentAction_ - tf.log(-tf.log(sampleNoiseTrainAgent_)), axis=-1)  # give this to q input
                trainActionSpread.append(agentNoisyTrainAction_)
            noisyTrainAction_ = tf.concat(trainActionSpread, axis=1)

            targetActionSpread = []
            batchSize = tf.shape(targetAction_)[0]
            targetActionReshaped_ = tf.reshape(targetAction_, [batchSize, self.numAgentsToControl, self.singleActionDim])
            for i in range(self.numAgentsToControl):
                agentAction_ = targetActionReshaped_[:, i, :]
                sampleNoiseTargetAgent_ = tf.random_uniform(tf.shape(agentAction_))
                agentNoisyTargetAction_ = U.softmax(agentAction_ - tf.log(-tf.log(sampleNoiseTargetAgent_)), axis=-1)  # give this to q input
                targetActionSpread.append(agentNoisyTargetAction_)
            noisyTargetAction_ = tf.concat(targetActionSpread, axis=1)

            tf.add_to_collection("trainAction_", trainAction_)
            tf.add_to_collection("targetAction_", targetAction_)
            tf.add_to_collection("noisyTrainAction_", noisyTrainAction_)
            tf.add_to_collection("noisyTargetAction_", noisyTargetAction_)

        with tf.variable_scope("critic/trainHidden/" + agentStr):
            criticTrainActivationOfGivenAction_ = tf.concat([states_, action_], axis=1)
            for i in range(len(layersWidths)):
                criticTrainActivationOfGivenAction_ = layers.fully_connected(criticTrainActivationOfGivenAction_,
                                                                             num_outputs=layersWidths[i],
                                                                             activation_fn=tf.nn.relu)
            criticTrainActivationOfGivenAction_ = layers.fully_connected(criticTrainActivationOfGivenAction_,
                                                                         num_outputs=1, activation_fn=None)

        with tf.variable_scope("critic/trainHidden/" + agentStr, reuse=True):
            # same critic weights evaluated on the actor's current (noisy) action
            criticTrainActivation_ = tf.concat([states_, noisyTrainAction_], axis=1)
            for i in range(len(layersWidths)):
                criticTrainActivation_ = layers.fully_connected(criticTrainActivation_, num_outputs=layersWidths[i],
                                                                activation_fn=tf.nn.relu)
            criticTrainActivation_ = layers.fully_connected(criticTrainActivation_, num_outputs=1, activation_fn=None)

        with tf.variable_scope("critic/targetHidden/" + agentStr):
            criticTargetActivation_ = tf.concat([nextStates_, noisyTargetAction_], axis=1)
            for i in range(len(layersWidths)):
                criticTargetActivation_ = layers.fully_connected(criticTargetActivation_, num_outputs=layersWidths[i],
                                                                 activation_fn=tf.nn.relu)
            criticTargetActivation_ = layers.fully_connected(criticTargetActivation_, num_outputs=1, activation_fn=None)

        with tf.variable_scope("updateParameters/" + agentStr):
            actorTrainParams_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='actor/trainHidden/' + agentStr)
            actorTargetParams_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='actor/targetHidden/' + agentStr)
            # soft update: target <- (1 - tau) * target + tau * train
            actorUpdateParam_ = [actorTargetParams_[i].assign((1 - tau_) * actorTargetParams_[i] + tau_ * actorTrainParams_[i])
                                 for i in range(len(actorTargetParams_))]
            tf.add_to_collection("actorTrainParams_", actorTrainParams_)
            tf.add_to_collection("actorTargetParams_", actorTargetParams_)
            tf.add_to_collection("actorUpdateParam_", actorUpdateParam_)

            hardReplaceActorTargetParam_ = [tf.assign(trainParam, targetParam)
                                            for trainParam, targetParam in zip(actorTrainParams_, actorTargetParams_)]
            tf.add_to_collection("hardReplaceActorTargetParam_", hardReplaceActorTargetParam_)

            criticTrainParams_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic/trainHidden/' + agentStr)
            criticTargetParams_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic/targetHidden/' + agentStr)
            criticUpdateParam_ = [criticTargetParams_[i].assign((1 - tau_) * criticTargetParams_[i] + tau_ * criticTrainParams_[i])
                                  for i in range(len(criticTargetParams_))]
            tf.add_to_collection("criticTrainParams_", criticTrainParams_)
            tf.add_to_collection("criticTargetParams_", criticTargetParams_)
            tf.add_to_collection("criticUpdateParam_", criticUpdateParam_)

            hardReplaceCriticTargetParam_ = [tf.assign(trainParam, targetParam)
                                             for trainParam, targetParam in zip(criticTrainParams_, criticTargetParams_)]
            tf.add_to_collection("hardReplaceCriticTargetParam_", hardReplaceCriticTargetParam_)

            updateParam_ = actorUpdateParam_ + criticUpdateParam_
            hardReplaceTargetParam_ = hardReplaceActorTargetParam_ + hardReplaceCriticTargetParam_
            tf.add_to_collection("updateParam_", updateParam_)
            tf.add_to_collection("hardReplaceTargetParam_", hardReplaceTargetParam_)

        with tf.variable_scope("trainActorNet/" + agentStr):
            trainQ = criticTrainActivation_[:, 0]
            pg_loss = -tf.reduce_mean(trainQ)
            p_reg = tf.reduce_mean(tf.square(actorTrainActivation_))
            actorLoss_ = pg_loss + p_reg * 1e-3
            actorOptimizer = tf.train.AdamOptimizer(learningRate_, name='actorOptimizer')
            grad_norm_clipping = 0.5
            actorTrainOpt_ = U.minimize_and_clip(actorOptimizer, actorLoss_, actorTrainParams_, grad_norm_clipping)
            tf.add_to_collection("actorLoss_", actorLoss_)
            tf.add_to_collection("actorTrainOpt_", actorTrainOpt_)

        with tf.variable_scope("trainCriticNet/" + agentStr):
            yi_ = reward_ + gamma_ * criticTargetActivation_
            criticLoss_ = tf.reduce_mean(tf.squared_difference(tf.squeeze(yi_),
                                                               tf.squeeze(criticTrainActivationOfGivenAction_)))
            tf.add_to_collection("yi_", yi_)
            tf.add_to_collection("valueLoss_", criticLoss_)
            criticOptimizer = tf.train.AdamOptimizer(learningRate_, name='criticOptimizer')
            grad_norm_clipping = 0.5
            crticTrainOpt_ = U.minimize_and_clip(criticOptimizer, criticLoss_, criticTrainParams_, grad_norm_clipping)
            tf.add_to_collection("crticTrainOpt_", crticTrainOpt_)

        with tf.variable_scope("summary" + agentStr):
            criticLossSummary = tf.identity(criticLoss_)
            tf.add_to_collection("criticLossSummary", criticLossSummary)
            tf.summary.scalar("criticLossSummary", criticLossSummary)

        fullSummary = tf.summary.merge_all()
        tf.add_to_collection("summaryOps", fullSummary)

        saver = tf.train.Saver(max_to_keep=None)
        tf.add_to_collection("saver", saver)

        model = tf.Session(graph=graph)
        model.run(tf.global_variables_initializer())

        writer = tf.summary.FileWriter('tensorBoard/onlineDDPG/' + agentStr, graph=graph)
        tf.add_to_collection("writer", writer)

        return model
def __call__(self, layersWidths, agentID):
    agentStr = 'Agent' + str(agentID)
    graph = tf.Graph()
    with graph.as_default():
        with tf.variable_scope("inputs/" + agentStr):
            allAgentsStates_ = [tf.placeholder(dtype=tf.float32, shape=[None, agentObsDim], name="state" + str(i))
                                for i, agentObsDim in enumerate(self.obsShapeList)]
            allAgentsNextStates_ = [tf.placeholder(dtype=tf.float32, shape=[None, agentObsDim], name="nextState" + str(i))
                                    for i, agentObsDim in enumerate(self.obsShapeList)]
            allAgentsActions_ = [tf.placeholder(dtype=tf.float32, shape=[None, self.actionDim], name="action" + str(i))
                                 for i in range(self.numAgents)]
            allAgentsNextActionsByTargetNet_ = [tf.placeholder(dtype=tf.float32, shape=[None, self.actionDim], name="actionTarget" + str(i))
                                                for i in range(self.numAgents)]
            agentReward_ = tf.placeholder(tf.float32, [None, 1], name='reward_')

            tf.add_to_collection("allAgentsStates_", allAgentsStates_)
            tf.add_to_collection("allAgentsNextStates_", allAgentsNextStates_)
            tf.add_to_collection("allAgentsActions_", allAgentsActions_)
            tf.add_to_collection("allAgentsNextActionsByTargetNet_", allAgentsNextActionsByTargetNet_)
            tf.add_to_collection("agentReward_", agentReward_)

        with tf.variable_scope("trainingParams" + agentStr):
            learningRate_ = tf.constant(0, dtype=tf.float32)
            tau_ = tf.constant(0, dtype=tf.float32)
            gamma_ = tf.constant(0, dtype=tf.float32)
            tf.add_to_collection("learningRate_", learningRate_)
            tf.add_to_collection("tau_", tau_)
            tf.add_to_collection("gamma_", gamma_)

        with tf.variable_scope("actor/trainHidden/" + agentStr):
            # act by personal observation only
            currentAgentState_ = allAgentsStates_[agentID]
            actorTrainActivation_ = currentAgentState_
            for i in range(len(layersWidths)):
                actorTrainActivation_ = layers.fully_connected(actorTrainActivation_, num_outputs=layersWidths[i],
                                                               activation_fn=tf.nn.relu)
            actorTrainActivation_ = layers.fully_connected(actorTrainActivation_, num_outputs=self.actionDim,
                                                           activation_fn=None)

        with tf.variable_scope("actor/targetHidden/" + agentStr):
            currentAgentNextState_ = allAgentsNextStates_[agentID]
            actorTargetActivation_ = currentAgentNextState_
            for i in range(len(layersWidths)):
                actorTargetActivation_ = layers.fully_connected(actorTargetActivation_, num_outputs=layersWidths[i],
                                                                activation_fn=tf.nn.relu)
            actorTargetActivation_ = layers.fully_connected(actorTargetActivation_, num_outputs=self.actionDim,
                                                            activation_fn=None)

        with tf.variable_scope("actorNetOutput/" + agentStr):
            trainAction_ = tf.multiply(actorTrainActivation_, self.actionRange, name='trainAction_')
            targetAction_ = tf.multiply(actorTargetActivation_, self.actionRange, name='targetAction_')

            # Gumbel-softmax sampling of the actor outputs
            sampleNoiseTrain_ = tf.random_uniform(tf.shape(trainAction_))
            noisyTrainAction_ = U.softmax(trainAction_ - tf.log(-tf.log(sampleNoiseTrain_)), axis=-1)  # give this to q input
            sampleNoiseTarget_ = tf.random_uniform(tf.shape(targetAction_))
            noisyTargetAction_ = U.softmax(targetAction_ - tf.log(-tf.log(sampleNoiseTarget_)), axis=-1)

            tf.add_to_collection("trainAction_", trainAction_)
            tf.add_to_collection("targetAction_", targetAction_)
            tf.add_to_collection("noisyTrainAction_", noisyTrainAction_)
            tf.add_to_collection("noisyTargetAction_", noisyTargetAction_)

        with tf.variable_scope("critic/trainHidden/" + agentStr):
            # centralized critic: sees all agents' observations and actions
            criticTrainActivationOfGivenAction_ = tf.concat(allAgentsStates_ + allAgentsActions_, axis=1)
            for i in range(len(layersWidths)):
                criticTrainActivationOfGivenAction_ = layers.fully_connected(criticTrainActivationOfGivenAction_,
                                                                             num_outputs=layersWidths[i],
                                                                             activation_fn=tf.nn.relu)
            criticTrainActivationOfGivenAction_ = layers.fully_connected(criticTrainActivationOfGivenAction_,
                                                                         num_outputs=1, activation_fn=None)

        with tf.variable_scope("critic/trainHidden/" + agentStr, reuse=True):
            # same critic weights, with this agent's action replaced by its current policy sample
            criticInputActionList = allAgentsActions_ + []
            criticInputActionList[agentID] = noisyTrainAction_
            criticTrainActivation_ = tf.concat(allAgentsStates_ + criticInputActionList, axis=1)
            for i in range(len(layersWidths)):
                criticTrainActivation_ = layers.fully_connected(criticTrainActivation_, num_outputs=layersWidths[i],
                                                                activation_fn=tf.nn.relu)
            criticTrainActivation_ = layers.fully_connected(criticTrainActivation_, num_outputs=1, activation_fn=None)

        with tf.variable_scope("critic/targetHidden/" + agentStr):
            criticTargetActivation_ = tf.concat(allAgentsNextStates_ + allAgentsNextActionsByTargetNet_, axis=1)
            for i in range(len(layersWidths)):
                criticTargetActivation_ = layers.fully_connected(criticTargetActivation_, num_outputs=layersWidths[i],
                                                                 activation_fn=tf.nn.relu)
            criticTargetActivation_ = layers.fully_connected(criticTargetActivation_, num_outputs=1, activation_fn=None)

        with tf.variable_scope("updateParameters/" + agentStr):
            actorTrainParams_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='actor/trainHidden/' + agentStr)
            actorTargetParams_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='actor/targetHidden/' + agentStr)
            # soft update: target <- (1 - tau) * target + tau * train
            actorUpdateParam_ = [actorTargetParams_[i].assign((1 - tau_) * actorTargetParams_[i] + tau_ * actorTrainParams_[i])
                                 for i in range(len(actorTargetParams_))]
            tf.add_to_collection("actorTrainParams_", actorTrainParams_)
            tf.add_to_collection("actorTargetParams_", actorTargetParams_)
            tf.add_to_collection("actorUpdateParam_", actorUpdateParam_)

            hardReplaceActorTargetParam_ = [tf.assign(trainParam, targetParam)
                                            for trainParam, targetParam in zip(actorTrainParams_, actorTargetParams_)]
            tf.add_to_collection("hardReplaceActorTargetParam_", hardReplaceActorTargetParam_)

            criticTrainParams_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic/trainHidden/' + agentStr)
            criticTargetParams_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic/targetHidden/' + agentStr)
            criticUpdateParam_ = [criticTargetParams_[i].assign((1 - tau_) * criticTargetParams_[i] + tau_ * criticTrainParams_[i])
                                  for i in range(len(criticTargetParams_))]
            tf.add_to_collection("criticTrainParams_", criticTrainParams_)
            tf.add_to_collection("criticTargetParams_", criticTargetParams_)
            tf.add_to_collection("criticUpdateParam_", criticUpdateParam_)

            hardReplaceCriticTargetParam_ = [tf.assign(trainParam, targetParam)
                                             for trainParam, targetParam in zip(criticTrainParams_, criticTargetParams_)]
            tf.add_to_collection("hardReplaceCriticTargetParam_", hardReplaceCriticTargetParam_)

            updateParam_ = actorUpdateParam_ + criticUpdateParam_
            hardReplaceTargetParam_ = hardReplaceActorTargetParam_ + hardReplaceCriticTargetParam_
            tf.add_to_collection("updateParam_", updateParam_)
            tf.add_to_collection("hardReplaceTargetParam_", hardReplaceTargetParam_)

        with tf.variable_scope("trainActorNet/" + agentStr):
            trainQ = criticTrainActivation_[:, 0]
            pg_loss = -tf.reduce_mean(trainQ)
            p_reg = tf.reduce_mean(tf.square(actorTrainActivation_))
            actorLoss_ = pg_loss + p_reg * 1e-3
            actorOptimizer = tf.train.AdamOptimizer(learningRate_, name='actorOptimizer')
            actorTrainOpt_ = U.minimize_and_clip(actorOptimizer, actorLoss_, actorTrainParams_, self.gradNormClipping)
            tf.add_to_collection("actorLoss_", actorLoss_)
            tf.add_to_collection("actorTrainOpt_", actorTrainOpt_)

        with tf.variable_scope("trainCriticNet/" + agentStr):
            yi_ = agentReward_ + gamma_ * criticTargetActivation_
            criticLoss_ = tf.reduce_mean(tf.squared_difference(tf.squeeze(yi_),
                                                               tf.squeeze(criticTrainActivationOfGivenAction_)))
            tf.add_to_collection("yi_", yi_)
            tf.add_to_collection("valueLoss_", criticLoss_)
            criticOptimizer = tf.train.AdamOptimizer(learningRate_, name='criticOptimizer')
            crticTrainOpt_ = U.minimize_and_clip(criticOptimizer, criticLoss_, criticTrainParams_, self.gradNormClipping)
            tf.add_to_collection("crticTrainOpt_", crticTrainOpt_)

        with tf.variable_scope("summary" + agentStr):
            criticLossSummary = tf.identity(criticLoss_)
            tf.add_to_collection("criticLossSummary", criticLossSummary)
            tf.summary.scalar("criticLossSummary", criticLossSummary)

        fullSummary = tf.summary.merge_all()
        tf.add_to_collection("summaryOps", fullSummary)

        saver = tf.train.Saver(max_to_keep=None)
        tf.add_to_collection("saver", saver)

        model = tf.Session(graph=graph)
        model.run(tf.global_variables_initializer())

        writer = tf.summary.FileWriter('tensorBoard/onlineDDPG/' + agentStr, graph=graph)
        tf.add_to_collection("writer", writer)

        return model
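# ----------------------------------------------------------------------------
# Hedged usage sketch (illustrative only): shows how the per-agent graphs built
# above are typically combined, with each agent's target-actor output filling the
# allAgentsNextActionsByTargetNet_ placeholders of the other agents' critics.
# `models` (one tf.Session per agent, as returned by __call__) and
# `nextObsBatchList` (one [batchSize, obsDim_i] batch per agent) are assumed names.
def _exampleTargetActions(models, nextObsBatchList):
    targetActions = []
    for agentID, model in enumerate(models):
        g = model.graph
        allAgentsNextStates_ = g.get_collection("allAgentsNextStates_")[0]
        noisyTargetAction_ = g.get_collection("noisyTargetAction_")[0]
        feed = dict(zip(allAgentsNextStates_, nextObsBatchList))
        targetActions.append(model.run(noisyTargetAction_, feed_dict=feed))
    return targetActions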
def sample(self):
    u = tf.random_uniform(tf.shape(self.logits))
    return U.softmax(self.logits - tf.log(-tf.log(u)), axis=-1)

def mode(self):
    return U.softmax(self.logits, axis=-1)
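# ----------------------------------------------------------------------------
# Hedged note on sample() above: if u ~ Uniform(0, 1), then -log(-log(u)) is a
# standard Gumbel draw, so argmax(logits + gumbel noise) is an exact sample from
# the categorical distribution softmax(logits); taking softmax instead of argmax,
# as sample() does, gives the differentiable Gumbel-softmax relaxation used to
# backpropagate through sampled actions. A minimal numpy illustration (numpy is
# an assumption of this sketch, not a dependency of the class itself):
def _gumbelSoftmaxSampleNumpy(logits):
    import numpy as np
    u = np.random.uniform(size=np.shape(logits))
    perturbed = np.asarray(logits) - np.log(-np.log(u))
    hardSample = np.argmax(perturbed)                   # exact categorical draw
    softSample = np.exp(perturbed - np.max(perturbed))
    softSample /= softSample.sum()                      # relaxed one-hot, as in sample()
    return hardSample, softSample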