Exemple #1
0
def q_train_ddpg(observPlaceHolderList, actionSpaceList, q_index, q_func, optimizer, grad_norm_clipping=None, scope="trainer", reuse=None, num_units=64):
    """Assemble the critic (Q-network) training graph for a single agent.

    The critic only consumes the observation and action of agent ``q_index``,
    although an action placeholder is created for every agent and all of them
    are listed as inputs of the returned callables.

    Args:
        observPlaceHolderList: list of observation placeholders, one per agent.
        actionSpaceList: action spaces, one per agent, passed to ``make_pdtype``.
        q_index: index of the agent whose critic is built.
        q_func: network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value forwarded to ``U.minimize_and_clip``.
        scope: name of the variable scope wrapping everything built here.
        reuse: ``reuse`` flag for that variable scope.
        num_units: forwarded to ``q_func``.

    Returns:
        ``(train, update_target_q, debug)`` where ``debug`` maps
        ``'q_values'`` and ``'target_q_values'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        pd_types = [make_pdtype(space) for space in actionSpaceList]

        # One action placeholder per agent, then the regression target.
        action_phs = [
            pd_type.sample_placeholder([None], name="action" + str(idx))
            for idx, pd_type in enumerate(pd_types)
        ]
        td_target_ph = tf.placeholder(tf.float32, [None], name="target")

        # DDPG-specific: the critic sees only this agent's observation/action.
        own_obs = observPlaceHolderList[q_index]
        own_action = action_phs[q_index]
        critic_input = tf.concat([own_obs, own_action], 1)

        q_out = q_func(critic_input, 1, scope="q_func", num_units=num_units)
        q_value = q_out[:, 0]
        online_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Mean squared error between the predicted Q and the fed target.
        td_loss = tf.reduce_mean(tf.square(q_value - td_target_ph))

        train_op = U.minimize_and_clip(
            optimizer, td_loss, online_vars, grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(
            inputs=observPlaceHolderList + action_phs + [td_target_ph],
            outputs=td_loss,
            updates=[train_op])
        q_values = U.function(observPlaceHolderList + action_phs, q_value)

        # Target network: same input, separate "target_q_func" variables.
        target_out = q_func(
            critic_input, 1, scope="target_q_func", num_units=num_units)
        target_q_value = target_out[:, 0]
        target_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(online_vars, target_vars)

        target_q_values = U.function(
            observPlaceHolderList + action_phs, target_q_value)

        debug = {'q_values': q_values, 'target_q_values': target_q_values}
        return train, update_target_q, debug
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
    """Assemble the policy (actor) training graph for agent ``p_index``.

    The policy maps the agent's own observation to distribution parameters.
    Its loss is the negated mean of the critic's value for an action sampled
    from that distribution, plus a small penalty on the squared parameters.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: action spaces, one per agent, passed to ``make_pdtype``.
        p_index: index of the agent whose policy is built.
        p_func: policy network builder.
        q_func: critic network builder; called with ``reuse=True`` under
            scope ``"q_func"``, so those variables must already exist.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value forwarded to ``U.minimize_and_clip``.
        local_q_func: when true the critic input is only this agent's
            observation and action instead of everyone's.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: name of the variable scope wrapping everything built here.
        reuse: ``reuse`` flag for that variable scope.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``debug`` maps
        ``'p_values'`` and ``'target_act'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Distribution types, one per agent.
        pd_types = [make_pdtype(space) for space in act_space_n]

        obs_phs = make_obs_ph_n
        action_phs = [
            pd_type.sample_placeholder([None], name="action" + str(idx))
            for idx, pd_type in enumerate(pd_types)
        ]

        own_pd_type = pd_types[p_index]
        own_obs = obs_phs[p_index]
        n_params = int(own_pd_type.param_shape()[0])

        policy_out = p_func(
            own_obs, n_params, scope="p_func", num_units=num_units)
        policy_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap the flat network output in a distribution object.
        action_dist = own_pd_type.pdfromflat(policy_out)

        act_sample = action_dist.sample()
        param_penalty = tf.reduce_mean(tf.square(action_dist.flatparam()))

        # Copy the placeholders and swap in a (second) sample for this agent
        # so the critic value depends on the policy variables.
        critic_actions = list(action_phs)
        critic_actions[p_index] = action_dist.sample()
        critic_input = tf.concat(obs_phs + critic_actions, 1)
        if local_q_func:
            # DDPG variant: only this agent's own observation and action.
            critic_input = tf.concat(
                [obs_phs[p_index], critic_actions[p_index]], 1)

        q_value = q_func(
            critic_input, 1, scope="q_func", reuse=True,
            num_units=num_units)[:, 0]
        policy_gradient_loss = -tf.reduce_mean(q_value)

        total_loss = policy_gradient_loss + param_penalty * 1e-3

        train_op = U.minimize_and_clip(
            optimizer, total_loss, policy_vars, grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(
            inputs=obs_phs + action_phs,
            outputs=total_loss,
            updates=[train_op])
        act = U.function(inputs=[obs_phs[p_index]], outputs=act_sample)
        p_values = U.function([obs_phs[p_index]], policy_out)

        # Target policy: same input, separate "target_p_func" variables.
        target_out = p_func(
            own_obs, n_params, scope="target_p_func", num_units=num_units)
        target_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(policy_vars, target_vars)

        target_sample = own_pd_type.pdfromflat(target_out).sample()
        target_act = U.function(
            inputs=[obs_phs[p_index]], outputs=target_sample)

        debug = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, debug
Exemple #3
0
def p_train(observPlaceHolderList, actionSpaceList, agentIndex, getMLPModel, q_func, optimizer,
            grad_norm_clipping=None, ddpg=False, num_units=64, scope="trainer", reuse=None):
    """Assemble the policy (actor) training graph for agent ``agentIndex``.

    The loss is the negated mean critic value of an action sampled from the
    policy's distribution, plus ``1e-3`` times the mean squared distribution
    parameters.

    Args:
        observPlaceHolderList: list of observation placeholders, one per agent.
        actionSpaceList: action spaces, one per agent, passed to ``make_pdtype``.
        agentIndex: index of the agent whose policy is built.
        getMLPModel: policy network builder.
        q_func: critic network builder; called with ``reuse=True`` under
            scope ``"q_func"``, so those variables must already exist.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value forwarded to ``U.minimize_and_clip``.
        ddpg: when true the critic input is only this agent's own observation
            and action instead of everyone's.
        num_units: forwarded to both network builders.
        scope: name of the variable scope wrapping everything built here.
        reuse: ``reuse`` flag for that variable scope.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``debug`` maps
        ``'p_values'`` and ``'target_act'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Distribution types, one per agent.
        pd_types = [make_pdtype(space) for space in actionSpaceList]

        action_phs = [
            pd_type.sample_placeholder([None], name="action" + str(idx))
            for idx, pd_type in enumerate(pd_types)
        ]

        own_pd_type = pd_types[agentIndex]
        own_obs = observPlaceHolderList[agentIndex]
        n_params = int(own_pd_type.param_shape()[0])

        # The online policy's variables live under the "getMLPModel" scope.
        policy_out = getMLPModel(
            own_obs, n_params, scope="getMLPModel", num_units=num_units)
        policy_vars = U.scope_vars(U.absolute_scope_name("getMLPModel"))

        # Wrap the flat network output in a distribution object.
        action_dist = own_pd_type.pdfromflat(policy_out)

        act_sample = action_dist.sample()
        param_penalty = tf.reduce_mean(tf.square(action_dist.flatparam()))

        # Copy the placeholders and swap in a (second) sample for this agent.
        critic_actions = list(action_phs)
        critic_actions[agentIndex] = action_dist.sample()
        critic_input = tf.concat(observPlaceHolderList + critic_actions, 1)
        if ddpg:
            critic_input = tf.concat(
                [observPlaceHolderList[agentIndex],
                 critic_actions[agentIndex]], 1)
        q_value = q_func(
            critic_input, 1, scope="q_func", reuse=True,
            num_units=num_units)[:, 0]
        policy_gradient_loss = -tf.reduce_mean(q_value)

        total_loss = policy_gradient_loss + param_penalty * 1e-3

        train_op = U.minimize_and_clip(
            optimizer, total_loss, policy_vars, grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(
            inputs=observPlaceHolderList + action_phs,
            outputs=total_loss,
            updates=[train_op])
        act = U.function(
            inputs=[observPlaceHolderList[agentIndex]], outputs=act_sample)
        p_values = U.function(
            [observPlaceHolderList[agentIndex]], policy_out)

        # Target policy: same input, separate "target_p_func" variables.
        target_out = getMLPModel(
            own_obs, n_params, scope="target_p_func", num_units=num_units)
        target_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(policy_vars, target_vars)

        target_sample = own_pd_type.pdfromflat(target_out).sample()
        target_act = U.function(
            inputs=[observPlaceHolderList[agentIndex]], outputs=target_sample)

        debug = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, debug
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64):
    """Assemble the critic (Q-network) training graph for agent ``q_index``.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: action spaces, one per agent, passed to ``make_pdtype``.
        q_index: index of the agent whose critic is built.
        q_func: network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value forwarded to ``U.minimize_and_clip``.
        local_q_func: when true the critic input is only this agent's own
            observation and action; otherwise it is every agent's
            observations and actions concatenated.
        scope: name of the variable scope wrapping everything built here.
        reuse: ``reuse`` flag for that variable scope.
        num_units: forwarded to ``q_func``.

    Returns:
        ``(train, update_target_q, debug)`` where ``debug`` maps
        ``'q_values'`` and ``'target_q_values'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Distribution types, one per agent.
        pd_types = [make_pdtype(space) for space in act_space_n]

        obs_phs = make_obs_ph_n
        action_phs = [
            pd_type.sample_placeholder([None], name="action" + str(idx))
            for idx, pd_type in enumerate(pd_types)
        ]
        td_target_ph = tf.placeholder(tf.float32, [None], name="target")

        # Centralised input by default; overridden for the local variant.
        critic_input = tf.concat(obs_phs + action_phs, 1)
        if local_q_func:
            critic_input = tf.concat(
                [obs_phs[q_index], action_phs[q_index]], 1)
        q_value = q_func(
            critic_input, 1, scope="q_func", num_units=num_units)[:, 0]
        online_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Mean squared error between the predicted Q and the fed target.
        td_loss = tf.reduce_mean(tf.square(q_value - td_target_ph))

        # Regulariser on the Q magnitude; built but not added to the loss.
        q_reg = tf.reduce_mean(tf.square(q_value))
        total_loss = td_loss

        train_op = U.minimize_and_clip(
            optimizer, total_loss, online_vars, grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(
            inputs=obs_phs + action_phs + [td_target_ph],
            outputs=total_loss,
            updates=[train_op])
        q_values = U.function(obs_phs + action_phs, q_value)

        # Target network: same input, separate "target_q_func" variables.
        target_q_value = q_func(
            critic_input, 1, scope="target_q_func",
            num_units=num_units)[:, 0]
        target_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(online_vars, target_vars)

        target_q_values = U.function(obs_phs + action_phs, target_q_value)

        debug = {'q_values': q_values, 'target_q_values': target_q_values}
        return train, update_target_q, debug
def p_train(observPlaceHolderList,
            actionSpaceList,
            agentIndex,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping,
            ddpg,
            num_units=64,
            scope="trainer",
            reuse=None):
    """Assemble the policy (actor) training graph for agent ``agentIndex``.

    Actions are produced by perturbing the policy logits with
    ``-log(-log(u))`` noise (``u`` uniform) and applying a softmax. The loss
    is the negated mean critic value of that action plus ``1e-3`` times the
    mean squared logits.

    Args:
        observPlaceHolderList: list of observation placeholders, one per agent.
        actionSpaceList: action spaces, one per agent; each must expose ``.n``.
        agentIndex: index of the agent whose policy is built.
        p_func: policy network builder.
        q_func: critic network builder; called with ``reuse=True`` under
            scope ``"q_func"``, so those variables must already exist.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value forwarded to ``U.minimize_and_clip``.
        ddpg: when true the critic input is only this agent's own observation
            and sampled action instead of everyone's.
        num_units: forwarded to both network builders.
        scope: name of the variable scope wrapping everything built here.
        reuse: ``reuse`` flag for that variable scope.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``debug`` maps
        ``'p_values'`` and ``'target_act'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One float action placeholder of width ``space.n`` per agent.
        action_phs = [
            tf.placeholder(dtype=tf.float32,
                           shape=[None, space.n],
                           name="action" + str(idx))
            for idx, space in enumerate(actionSpaceList)
        ]

        own_obs = observPlaceHolderList[agentIndex]  # personal observation
        n_actions = int(actionSpaceList[agentIndex].n)
        logits = p_func(own_obs,
                        n_actions,
                        scope="p_func",
                        num_units=num_units)
        policy_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Noisy softmax sample; this tensor is what ``act`` returns.
        uniform_noise = tf.random_uniform(tf.shape(logits), seed=0)
        noise_term = tf.log(-tf.log(uniform_noise))
        sampled_action = U.softmax(logits - noise_term, axis=-1)
        logit_penalty = tf.reduce_mean(tf.square(logits))

        # Copy the placeholders and swap in this agent's sampled action.
        critic_actions = list(action_phs)
        critic_actions[agentIndex] = sampled_action

        critic_input = tf.concat(observPlaceHolderList + critic_actions, 1)
        if ddpg:
            critic_input = tf.concat(
                [observPlaceHolderList[agentIndex], sampled_action], 1)

        q_out = q_func(critic_input,
                       1,
                       scope="q_func",
                       reuse=True,
                       num_units=num_units)
        q_value = q_out[:, 0]
        policy_gradient_loss = -tf.reduce_mean(q_value)

        total_loss = policy_gradient_loss + logit_penalty * 1e-3

        train_op = U.minimize_and_clip(optimizer, total_loss, policy_vars,
                                       grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(inputs=observPlaceHolderList + action_phs,
                           outputs=total_loss,
                           updates=[train_op])
        act = U.function(inputs=[observPlaceHolderList[agentIndex]],
                         outputs=sampled_action)
        p_values = U.function([observPlaceHolderList[agentIndex]], logits)

        # Target policy: same input, separate "target_p_func" variables.
        target_logits = p_func(own_obs,
                               int(actionSpaceList[agentIndex].n),
                               scope="target_p_func",
                               num_units=num_units)
        target_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(policy_vars, target_vars)

        target_noise = tf.random_uniform(tf.shape(target_logits))
        target_noise_term = tf.log(-tf.log(target_noise))
        target_sample = U.softmax(target_logits - target_noise_term, axis=-1)
        target_act = U.function(inputs=[observPlaceHolderList[agentIndex]],
                                outputs=target_sample)

        debug = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, debug
def q_train(observPlaceHolderList,
            actionSpaceList,
            agentIndex,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            ddpg=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    """Assemble the critic (Q-network) training graph for agent ``agentIndex``.

    Args:
        observPlaceHolderList: list of observation placeholders, one per agent.
        actionSpaceList: action spaces, one per agent; each must expose ``.n``.
        agentIndex: index of the agent whose critic is built.
        q_func: network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value forwarded to ``U.minimize_and_clip``.
        ddpg: when true the critic input is only this agent's own observation
            and action; otherwise every agent's observations and actions.
        scope: name of the variable scope wrapping everything built here.
        reuse: ``reuse`` flag for that variable scope.
        num_units: forwarded to ``q_func``.

    Returns:
        ``(train, update_target_q, debug)`` where ``debug`` maps
        ``'q_values'`` and ``'target_q_values'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One float action placeholder of width ``space.n`` per agent.
        action_phs = [
            tf.placeholder(dtype=tf.float32,
                           shape=[None, space.n],
                           name="action" + str(idx))
            for idx, space in enumerate(actionSpaceList)
        ]
        td_target_ph = tf.placeholder(tf.float32, [None], name="target")

        # Centralised input by default; overridden for the DDPG variant.
        critic_input = tf.concat(observPlaceHolderList + action_phs, 1)
        if ddpg:
            own_obs = observPlaceHolderList[agentIndex]
            own_action = action_phs[agentIndex]
            critic_input = tf.concat([own_obs, own_action], 1)

        # Take column 0 so the value has one entry per batch row.
        q_out = q_func(critic_input, 1, scope="q_func", num_units=num_units)
        q_value = q_out[:, 0]
        online_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        td_loss = tf.reduce_mean(tf.square(q_value - td_target_ph))
        train_op = U.minimize_and_clip(optimizer, td_loss, online_vars,
                                       grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(
            inputs=observPlaceHolderList + action_phs + [td_target_ph],
            outputs=td_loss,
            updates=[train_op])
        q_values = U.function(observPlaceHolderList + action_phs, q_value)

        # Target network: same input, separate "target_q_func" variables.
        target_out = q_func(critic_input,
                            1,
                            scope="target_q_func",
                            num_units=num_units)
        target_q_value = target_out[:, 0]
        target_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(online_vars, target_vars)

        target_q_values = U.function(observPlaceHolderList + action_phs,
                                     target_q_value)

        debug = {'q_values': q_values, 'target_q_values': target_q_values}
        return train, update_target_q, debug
Exemple #7
0
    def __call__(self, layersWidths, agentID=None):
        """Build a critic model in a fresh ``tf.Graph`` and open a session on it.

        Two fully connected networks with identical layouts are created: a
        train network fed ``[states_, action_]`` and a target network fed
        ``[states_, actionTarget_]``. Both end in a single tanh unit. Every
        tensor/op a caller may need is registered in a graph collection under
        the name used in the corresponding ``tf.add_to_collection`` call.

        ``learningRate_``, ``tau_`` and ``gamma_`` are created as constants
        equal to 0; presumably the caller overrides them through ``feed_dict``
        at run time -- verify against the training code.

        Args:
            layersWidths: sequence with the number of units of each hidden
                layer.
            agentID: optional identifier; when given, ``'Agent<agentID>'`` is
                appended to the scope names and to the TensorBoard directory.

        Returns:
            ``(criticWriter, model)``: the ``tf.summary.FileWriter`` and the
            ``tf.Session`` with all global variables initialised.
        """
        agentStr = 'Agent' + str(agentID) if agentID is not None else ''
        print(
            "Generating Critic NN Model with layers: {}".format(layersWidths))
        graph = tf.Graph()
        with graph.as_default():
            with tf.name_scope("inputs" + agentStr):
                states_ = tf.placeholder(tf.float32,
                                         [None, self.numStateSpace],
                                         name='states_')
                # stop_gradient keeps the critic loss from propagating
                # gradients into whatever produced the action.
                action_ = tf.stop_gradient(tf.placeholder(
                    tf.float32, [None, self.actionDim]),
                                           name='action_')

                actionTarget_ = tf.placeholder(tf.float32,
                                               [None, self.actionDim],
                                               name='actionTarget_')
                reward_ = tf.placeholder(tf.float32, [None, 1], name='reward_')
                valueTarget_ = tf.placeholder(tf.float32, [None, 1],
                                              name='valueTarget_')

                tf.add_to_collection("states_", states_)
                tf.add_to_collection("action_", action_)
                tf.add_to_collection("actionTarget_", actionTarget_)
                tf.add_to_collection("reward_", reward_)
                tf.add_to_collection("valueTarget_", valueTarget_)

            with tf.name_scope("trainingParams" + agentStr):
                learningRate_ = tf.constant(0, dtype=tf.float32)
                tau_ = tf.constant(0, dtype=tf.float32)
                gamma_ = tf.constant(0, dtype=tf.float32)

                tf.add_to_collection("learningRate_", learningRate_)
                tf.add_to_collection("tau_", tau_)
                tf.add_to_collection("gamma_", gamma_)

            with tf.variable_scope("trainHidden" + agentStr):
                activation_ = tf.concat([states_, action_], axis=1)
                for i, width in enumerate(layersWidths):
                    activation_ = layers.fully_connected(
                        activation_,
                        num_outputs=width,
                        activation_fn=tf.nn.relu,
                        scope="fc{}".format(i + 1))

                trainValues_ = layers.fully_connected(
                    activation_,
                    num_outputs=1,
                    activation_fn=tf.nn.tanh,
                    scope="fc{}".format(len(layersWidths) + 1))

            with tf.variable_scope("targetHidden" + agentStr):
                activation_ = tf.concat([states_, actionTarget_], axis=1)
                for i, width in enumerate(layersWidths):
                    activation_ = layers.fully_connected(
                        activation_,
                        num_outputs=width,
                        activation_fn=tf.nn.relu,
                        scope="fc{}".format(i + 1))

                targetValues_ = layers.fully_connected(
                    activation_,
                    num_outputs=1,
                    activation_fn=tf.nn.tanh,
                    scope="fc{}".format(len(layersWidths) + 1))

            with tf.name_scope("parameters" + agentStr):
                # The scope argument is a prefix filter, so 'trainHidden'
                # also selects 'trainHidden<agentStr>/...' variables.
                trainParams_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                 scope='trainHidden')
                targetParams_ = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES, scope='targetHidden')

                # Soft update: target <- (1 - tau) * target + tau * train.
                updateParam_ = [
                    targetParam.assign((1 - tau_) * targetParam +
                                       tau_ * trainParam)
                    for trainParam, targetParam in zip(trainParams_,
                                                       targetParams_)
                ]

                tf.add_to_collection("trainParams_", trainParams_)
                tf.add_to_collection("targetParams_", targetParams_)
                tf.add_to_collection("updateParam_", updateParam_)

                # Hard update: target <- train. tf.assign(ref, value) writes
                # ``value`` into ``ref``, so the target variable must be the
                # first argument; the reverse order would overwrite the
                # trained weights with the target's.
                hardReplaceTargetParam_ = [
                    tf.assign(targetParam, trainParam)
                    for trainParam, targetParam in zip(trainParams_,
                                                       targetParams_)
                ]
                tf.add_to_collection("hardReplaceTargetParam_",
                                     hardReplaceTargetParam_)

            with tf.name_scope("output" + agentStr):
                # Multiplying by 1 only gives the outputs stable op names.
                trainQ_ = tf.multiply(trainValues_, 1, name='trainQ_')
                targetQ_ = tf.multiply(targetValues_, 1, name='targetQ_')
                tf.add_to_collection("trainQ_", trainQ_)
                tf.add_to_collection("targetQ_", targetQ_)

            with tf.name_scope("evaluate" + agentStr):
                # Regression target: reward plus discounted fed target value.
                yi_ = reward_ + gamma_ * valueTarget_

                criticLoss_ = tf.reduce_mean(
                    tf.squared_difference(tf.squeeze(yi_),
                                          tf.squeeze(trainQ_)))

                tf.add_to_collection("yi_", yi_)
                tf.add_to_collection("valueLoss_", criticLoss_)

            with tf.name_scope("train" + agentStr):
                optimizer = tf.train.AdamOptimizer(learningRate_,
                                                   name='adamOptimizer')
                grad_norm_clipping = 0.5
                trainOpt_ = U.minimize_and_clip(optimizer, criticLoss_,
                                                trainParams_,
                                                grad_norm_clipping)

                tf.add_to_collection("trainOpt_", trainOpt_)

            with tf.name_scope("summary" + agentStr):
                criticLossSummary = tf.identity(criticLoss_)
                tf.add_to_collection("criticLossSummary", criticLossSummary)
                tf.summary.scalar("criticLossSummary", criticLossSummary)

            fullSummary = tf.summary.merge_all()
            tf.add_to_collection("summaryOps", fullSummary)

            criticSaver = tf.train.Saver(max_to_keep=None)
            tf.add_to_collection("saver", criticSaver)

            # The session is handed to the caller, who owns its lifetime.
            model = tf.Session(graph=graph)
            model.run(tf.global_variables_initializer())

            criticWriter = tf.summary.FileWriter(
                'tensorBoard/criticOnlineDDPG' + agentStr, graph=graph)
            tf.add_to_collection("criticWriter", criticWriter)

        return criticWriter, model
    def __call__(self, layersWidths, agentID=None):
        agentStr = 'Agent' + str(agentID) if agentID is not None else ''
        graph = tf.Graph()
        with graph.as_default():
            with tf.variable_scope("inputs/" + agentStr):
                states_ = tf.placeholder(tf.float32,
                                         [None, self.numStateSpace],
                                         name='states_')
                nextStates_ = tf.placeholder(tf.float32,
                                             [None, self.numStateSpace],
                                             name='nextStates_')
                action_ = tf.stop_gradient(tf.placeholder(
                    tf.float32, [None, self.actionDim]),
                                           name='action_')
                reward_ = tf.placeholder(tf.float32, [None, 1], name='reward_')

                tf.add_to_collection("states_", states_)
                tf.add_to_collection("nextStates_", nextStates_)
                tf.add_to_collection("action_", action_)
                tf.add_to_collection("reward_", reward_)

            with tf.variable_scope("trainingParams" + agentStr):
                learningRate_ = tf.constant(0, dtype=tf.float32)
                tau_ = tf.constant(0, dtype=tf.float32)
                gamma_ = tf.constant(0, dtype=tf.float32)

                tf.add_to_collection("learningRate_", learningRate_)
                tf.add_to_collection("tau_", tau_)
                tf.add_to_collection("gamma_", gamma_)

            with tf.variable_scope("actor/trainHidden/" + agentStr):
                actorTrainActivation_ = states_
                for i in range(len(layersWidths)):
                    actorTrainActivation_ = layers.fully_connected(
                        actorTrainActivation_,
                        num_outputs=layersWidths[i],
                        activation_fn=tf.nn.relu)

                actorTrainActivation_ = layers.fully_connected(
                    actorTrainActivation_,
                    num_outputs=self.actionDim,
                    activation_fn=None)

            with tf.variable_scope("actor/targetHidden/" + agentStr):
                actorTargetActivation_ = nextStates_
                for i in range(len(layersWidths)):
                    actorTargetActivation_ = layers.fully_connected(
                        actorTargetActivation_,
                        num_outputs=layersWidths[i],
                        activation_fn=tf.nn.relu)

                actorTargetActivation_ = layers.fully_connected(
                    actorTargetActivation_,
                    num_outputs=self.actionDim,
                    activation_fn=None)

            with tf.variable_scope("actorNetOutput/" + agentStr):
                trainAction_ = tf.multiply(actorTrainActivation_,
                                           self.actionRange,
                                           name='trainAction_')
                targetAction_ = tf.multiply(actorTargetActivation_,
                                            self.actionRange,
                                            name='targetAction_')

                trainActionSpread = []
                batchSize = tf.shape(trainAction_)[0]
                trainActionReshaped_ = tf.reshape(
                    trainAction_,
                    [self.numAgentsToControl, batchSize, self.singleActionDim])
                for i in range(self.numAgentsToControl):
                    agentAction_ = trainActionReshaped_[i]
                    sampleNoiseTrainAgent_ = tf.random_uniform(
                        tf.shape(agentAction_))
                    agentNoisyTrainAction_ = U.softmax(
                        agentAction_ - tf.log(-tf.log(sampleNoiseTrainAgent_)),
                        axis=-1)  # give this to q input
                    trainActionSpread.append(agentNoisyTrainAction_)
                noisyTrainAction_ = tf.concat(trainActionSpread, axis=1)

                targetActionSpread = []
                batchSize = tf.shape(targetAction_)[0]
                targetActionReshaped_ = tf.reshape(
                    targetAction_,
                    [self.numAgentsToControl, batchSize, self.singleActionDim])
                for i in range(self.numAgentsToControl):
                    agentAction_ = targetActionReshaped_[i]
                    sampleNoiseTargetAgent_ = tf.random_uniform(
                        tf.shape(agentAction_))
                    agentNoisyTargetAction_ = U.softmax(
                        agentAction_ -
                        tf.log(-tf.log(sampleNoiseTargetAgent_)),
                        axis=-1)  # give this to q input
                    targetActionSpread.append(agentNoisyTargetAction_)
                noisyTargetAction_ = tf.concat(targetActionSpread, axis=1)

                tf.add_to_collection("trainAction_", trainAction_)
                tf.add_to_collection("targetAction_", targetAction_)

                tf.add_to_collection("noisyTrainAction_", noisyTrainAction_)
                tf.add_to_collection("noisyTargetAction_", noisyTargetAction_)

            with tf.variable_scope("critic/trainHidden/" + agentStr):
                criticTrainActivationOfGivenAction_ = tf.concat(
                    [states_, action_], axis=1)
                for i in range(len(layersWidths)):
                    criticTrainActivationOfGivenAction_ = layers.fully_connected(
                        criticTrainActivationOfGivenAction_,
                        num_outputs=layersWidths[i],
                        activation_fn=tf.nn.relu)

                criticTrainActivationOfGivenAction_ = layers.fully_connected(
                    criticTrainActivationOfGivenAction_,
                    num_outputs=1,
                    activation_fn=None)

            with tf.variable_scope("critic/trainHidden/" + agentStr,
                                   reuse=True):
                criticTrainActivation_ = tf.concat(
                    [states_, noisyTrainAction_], axis=1)
                for i in range(len(layersWidths)):
                    criticTrainActivation_ = layers.fully_connected(
                        criticTrainActivation_,
                        num_outputs=layersWidths[i],
                        activation_fn=tf.nn.relu)

                criticTrainActivation_ = layers.fully_connected(
                    criticTrainActivation_, num_outputs=1, activation_fn=None)

            with tf.variable_scope("critic/targetHidden/" + agentStr):
                criticTargetActivation_ = tf.concat(
                    [nextStates_, noisyTargetAction_], axis=1)
                for i in range(len(layersWidths)):
                    criticTargetActivation_ = layers.fully_connected(
                        criticTargetActivation_,
                        num_outputs=layersWidths[i],
                        activation_fn=tf.nn.relu)

                criticTargetActivation_ = layers.fully_connected(
                    criticTargetActivation_, num_outputs=1, activation_fn=None)

            with tf.variable_scope("updateParameters/" + agentStr):
                actorTrainParams_ = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    scope='actor/trainHidden/' + agentStr)
                actorTargetParams_ = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    scope='actor/targetHidden/' + agentStr)
                actorUpdateParam_ = [
                    actorTargetParams_[i].assign((1 - tau_) *
                                                 actorTargetParams_[i] +
                                                 tau_ * actorTrainParams_[i])
                    for i in range(len(actorTargetParams_))
                ]

                tf.add_to_collection("actorTrainParams_", actorTrainParams_)
                tf.add_to_collection("actorTargetParams_", actorTargetParams_)
                tf.add_to_collection("actorUpdateParam_", actorUpdateParam_)

                hardReplaceActorTargetParam_ = [
                    tf.assign(trainParam,
                              targetParam) for trainParam, targetParam in zip(
                                  actorTrainParams_, actorTargetParams_)
                ]
                tf.add_to_collection("hardReplaceActorTargetParam_",
                                     hardReplaceActorTargetParam_)

                criticTrainParams_ = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    scope='critic/trainHidden/' + agentStr)
                criticTargetParams_ = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    scope='critic/targetHidden/' + agentStr)

                criticUpdateParam_ = [
                    criticTargetParams_[i].assign((1 - tau_) *
                                                  criticTargetParams_[i] +
                                                  tau_ * criticTrainParams_[i])
                    for i in range(len(criticTargetParams_))
                ]

                tf.add_to_collection("criticTrainParams_", criticTrainParams_)
                tf.add_to_collection("criticTargetParams_",
                                     criticTargetParams_)
                tf.add_to_collection("criticUpdateParam_", criticUpdateParam_)

                hardReplaceCriticTargetParam_ = [
                    tf.assign(trainParam,
                              targetParam) for trainParam, targetParam in zip(
                                  criticTrainParams_, criticTargetParams_)
                ]
                tf.add_to_collection("hardReplaceCriticTargetParam_",
                                     hardReplaceCriticTargetParam_)

                updateParam_ = actorUpdateParam_ + criticUpdateParam_
                hardReplaceTargetParam_ = hardReplaceActorTargetParam_ + hardReplaceCriticTargetParam_
                tf.add_to_collection("updateParam_", updateParam_)
                tf.add_to_collection("hardReplaceTargetParam_",
                                     hardReplaceTargetParam_)

            with tf.variable_scope("trainActorNet/" + agentStr):
                trainQ = criticTrainActivation_[:, 0]
                pg_loss = -tf.reduce_mean(trainQ)
                p_reg = tf.reduce_mean(tf.square(actorTrainActivation_))
                actorLoss_ = pg_loss + p_reg * 1e-3

                actorOptimizer = tf.train.AdamOptimizer(learningRate_,
                                                        name='actorOptimizer')
                grad_norm_clipping = 0.5
                actorTrainOpt_ = U.minimize_and_clip(actorOptimizer,
                                                     actorLoss_,
                                                     actorTrainParams_,
                                                     grad_norm_clipping)

                tf.add_to_collection("actorLoss_", actorLoss_)
                tf.add_to_collection("actorTrainOpt_", actorTrainOpt_)

            with tf.variable_scope("trainCriticNet/" + agentStr):
                yi_ = reward_ + gamma_ * criticTargetActivation_
                criticLoss_ = tf.reduce_mean(
                    tf.squared_difference(
                        tf.squeeze(yi_),
                        tf.squeeze(criticTrainActivationOfGivenAction_)))

                tf.add_to_collection("yi_", yi_)
                tf.add_to_collection("valueLoss_", criticLoss_)

                criticOptimizer = tf.train.AdamOptimizer(
                    learningRate_, name='criticOptimizer')
                grad_norm_clipping = 0.5
                crticTrainOpt_ = U.minimize_and_clip(criticOptimizer,
                                                     criticLoss_,
                                                     criticTrainParams_,
                                                     grad_norm_clipping)

                tf.add_to_collection("crticTrainOpt_", crticTrainOpt_)

            with tf.variable_scope("summary" + agentStr):
                criticLossSummary = tf.identity(criticLoss_)
                tf.add_to_collection("criticLossSummary", criticLossSummary)
                tf.summary.scalar("criticLossSummary", criticLossSummary)

            fullSummary = tf.summary.merge_all()
            tf.add_to_collection("summaryOps", fullSummary)

            saver = tf.train.Saver(max_to_keep=None)
            tf.add_to_collection("saver", saver)

            model = tf.Session(graph=graph)
            model.run(tf.global_variables_initializer())

            writer = tf.summary.FileWriter('tensorBoard/onlineDDPG/' +
                                           agentStr,
                                           graph=graph)
            tf.add_to_collection("writer", writer)

        return model
Exemple #9
0
    def __call__(self, layersWidths, agentID):
        """Build the actor-critic training graph for one agent and return its session.

        A fresh ``tf.Graph`` is created on every call. It contains a train and
        a target actor that read only this agent's own observation, and a train
        and a target critic that read the concatenation of every agent's
        observation and action. All tensors and ops the training code needs are
        published through graph collections (``tf.add_to_collection``) under the
        same names as the local variables.

        Reads ``self.obsShapeList``, ``self.actionDim``, ``self.numAgents``,
        ``self.actionRange`` and ``self.gradNormClipping``.

        Args:
            layersWidths: sequence with the width of each hidden layer; the same
                widths are used for all actor and critic networks.
            agentID: index of the agent this graph belongs to; selects the
                agent's entry in the per-agent state/action lists and is
                appended to every scope name.

        Returns:
            A ``tf.Session`` bound to the new graph, with all global variables
            already initialised.
        """

        def buildMlp(activation_, numOutputs):
            # Hidden ReLU layers of the requested widths, then a linear output layer.
            for layerWidth in layersWidths:
                activation_ = layers.fully_connected(activation_, num_outputs=layerWidth,
                                                     activation_fn=tf.nn.relu)
            return layers.fully_connected(activation_, num_outputs=numOutputs, activation_fn=None)

        agentStr = 'Agent'+ str(agentID)
        graph = tf.Graph()
        with graph.as_default():
            with tf.variable_scope("inputs/"+ agentStr):
                # One placeholder per agent for current/next observation and for actions.
                allAgentsStates_ = [tf.placeholder(dtype=tf.float32, shape=[None, agentObsDim], name="state"+str(i)) for i, agentObsDim in enumerate(self.obsShapeList)]
                allAgentsNextStates_ =  [tf.placeholder(dtype=tf.float32, shape=[None, agentObsDim], name="nextState"+str(i)) for i, agentObsDim in enumerate(self.obsShapeList)]

                allAgentsActions_ = [tf.placeholder(dtype=tf.float32, shape=[None, self.actionDim], name="action"+str(i)) for i in range(self.numAgents)]
                allAgentsNextActionsByTargetNet_ = [tf.placeholder(dtype=tf.float32, shape=[None, self.actionDim], name= "actionTarget"+str(i)) for i in range(self.numAgents)]

                agentReward_ = tf.placeholder(tf.float32, [None, 1], name='reward_')

                tf.add_to_collection("allAgentsStates_", allAgentsStates_)
                tf.add_to_collection("allAgentsNextStates_", allAgentsNextStates_)
                tf.add_to_collection("allAgentsActions_", allAgentsActions_)
                tf.add_to_collection("allAgentsNextActionsByTargetNet_", allAgentsNextActionsByTargetNet_)
                tf.add_to_collection("agentReward_", agentReward_)

            with tf.variable_scope("trainingParams" + agentStr):
                # NOTE(review): these are constants fixed at 0; presumably the training
                # code overrides them through feed_dict -- confirm against the caller.
                learningRate_ = tf.constant(0, dtype=tf.float32)
                tau_ = tf.constant(0, dtype=tf.float32)
                gamma_ = tf.constant(0, dtype=tf.float32)

                tf.add_to_collection("learningRate_", learningRate_)
                tf.add_to_collection("tau_", tau_)
                tf.add_to_collection("gamma_", gamma_)

            with tf.variable_scope("actor/trainHidden/"+ agentStr): # act by personal observation
                currentAgentState_ = allAgentsStates_[agentID]
                actorTrainActivation_ = buildMlp(currentAgentState_, self.actionDim)

            with tf.variable_scope("actor/targetHidden/"+ agentStr):
                currentAgentNextState_ = allAgentsNextStates_[agentID]
                actorTargetActivation_ = buildMlp(currentAgentNextState_, self.actionDim)

            with tf.variable_scope("actorNetOutput/"+ agentStr):
                trainAction_ = tf.multiply(actorTrainActivation_, self.actionRange, name='trainAction_')
                targetAction_ = tf.multiply(actorTargetActivation_, self.actionRange, name='targetAction_')

                # Perturb the scaled outputs with -log(-log(u)), u ~ Uniform(0, 1),
                # and pass the result through a softmax over the action axis.
                sampleNoiseTrain_ = tf.random_uniform(tf.shape(trainAction_))
                noisyTrainAction_ = U.softmax(trainAction_ - tf.log(-tf.log(sampleNoiseTrain_)), axis=-1) # give this to q input

                sampleNoiseTarget_ = tf.random_uniform(tf.shape(targetAction_))
                noisyTargetAction_ = U.softmax(targetAction_ - tf.log(-tf.log(sampleNoiseTarget_)), axis=-1)

                tf.add_to_collection("trainAction_", trainAction_)
                tf.add_to_collection("targetAction_", targetAction_)

                tf.add_to_collection("noisyTrainAction_", noisyTrainAction_)
                tf.add_to_collection("noisyTargetAction_", noisyTargetAction_)


            with tf.variable_scope("critic/trainHidden/"+ agentStr):
                # Critic evaluated on the actions fed through the placeholders
                # (used for the critic's own loss).
                criticTrainActivationOfGivenAction_ = buildMlp(
                    tf.concat(allAgentsStates_ + allAgentsActions_, axis=1), 1)

            with tf.variable_scope("critic/trainHidden/" + agentStr, reuse= True):
                # Same critic weights (reuse=True), but this agent's action is replaced
                # by its actor's noisy output so the actor loss can differentiate
                # through the critic into the actor.
                criticInputActionList = list(allAgentsActions_)
                criticInputActionList[agentID] = noisyTrainAction_
                criticTrainActivation_ = buildMlp(
                    tf.concat(allAgentsStates_ + criticInputActionList, axis=1), 1)

            with tf.variable_scope("critic/targetHidden/"+ agentStr):
                criticTargetActivation_ = buildMlp(
                    tf.concat(allAgentsNextStates_ + allAgentsNextActionsByTargetNet_, axis=1), 1)

            with tf.variable_scope("updateParameters/"+ agentStr):
                # The variable lists are collected here, before the optimizers below are
                # created, so they contain only what exists in these scopes at this point.
                actorTrainParams_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='actor/trainHidden/'+ agentStr)
                actorTargetParams_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='actor/targetHidden/'+ agentStr)
                # Soft update: target <- (1 - tau) * target + tau * train.
                actorUpdateParam_ = [targetParam.assign((1 - tau_) * targetParam + tau_ * trainParam)
                                     for trainParam, targetParam in zip(actorTrainParams_, actorTargetParams_)]

                tf.add_to_collection("actorTrainParams_", actorTrainParams_)
                tf.add_to_collection("actorTargetParams_", actorTargetParams_)
                tf.add_to_collection("actorUpdateParam_", actorUpdateParam_)

                # Hard replace: copy the train weights into the target network.
                # tf.assign(ref, value) overwrites `ref`, so the target variable
                # must be the first argument.
                hardReplaceActorTargetParam_ = [tf.assign(targetParam, trainParam) for trainParam, targetParam in zip(actorTrainParams_, actorTargetParams_)]
                tf.add_to_collection("hardReplaceActorTargetParam_", hardReplaceActorTargetParam_)

                criticTrainParams_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic/trainHidden/'+ agentStr)
                criticTargetParams_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic/targetHidden/'+ agentStr)

                criticUpdateParam_ = [targetParam.assign((1 - tau_) * targetParam + tau_ * trainParam)
                                      for trainParam, targetParam in zip(criticTrainParams_, criticTargetParams_)]

                tf.add_to_collection("criticTrainParams_", criticTrainParams_)
                tf.add_to_collection("criticTargetParams_", criticTargetParams_)
                tf.add_to_collection("criticUpdateParam_", criticUpdateParam_)

                # Same direction as for the actor: target <- train.
                hardReplaceCriticTargetParam_ = [tf.assign(targetParam, trainParam) for trainParam, targetParam in zip(criticTrainParams_, criticTargetParams_)]
                tf.add_to_collection("hardReplaceCriticTargetParam_", hardReplaceCriticTargetParam_)

                updateParam_ = actorUpdateParam_ + criticUpdateParam_
                hardReplaceTargetParam_ = hardReplaceActorTargetParam_ + hardReplaceCriticTargetParam_
                tf.add_to_collection("updateParam_", updateParam_)
                tf.add_to_collection("hardReplaceTargetParam_", hardReplaceTargetParam_)


            with tf.variable_scope("trainActorNet/"+ agentStr):
                # Actor loss: maximise the critic's value of the actor's own action,
                # plus a small L2 penalty on the raw actor outputs.
                trainQ = criticTrainActivation_[:, 0]
                pg_loss = -tf.reduce_mean(trainQ)
                p_reg = tf.reduce_mean(tf.square(actorTrainActivation_))
                actorLoss_ = pg_loss + p_reg * 1e-3

                actorOptimizer = tf.train.AdamOptimizer(learningRate_, name='actorOptimizer')
                # Only the actor's train variables are updated by this op.
                actorTrainOpt_ = U.minimize_and_clip(actorOptimizer, actorLoss_, actorTrainParams_, self.gradNormClipping)

                tf.add_to_collection("actorLoss_", actorLoss_)
                tf.add_to_collection("actorTrainOpt_", actorTrainOpt_)

            with tf.variable_scope("trainCriticNet/"+ agentStr):
                # TD target from the target critic; critic loss is the mean squared
                # difference between the target and the train critic's estimate.
                yi_ = agentReward_ + gamma_ * criticTargetActivation_
                criticLoss_ = tf.reduce_mean(tf.squared_difference(tf.squeeze(yi_), tf.squeeze(criticTrainActivationOfGivenAction_)))

                tf.add_to_collection("yi_", yi_)
                tf.add_to_collection("valueLoss_", criticLoss_)

                criticOptimizer = tf.train.AdamOptimizer(learningRate_, name='criticOptimizer')
                crticTrainOpt_ = U.minimize_and_clip(criticOptimizer, criticLoss_, criticTrainParams_, self.gradNormClipping)

                # Collection key keeps the original spelling; external code looks it up by name.
                tf.add_to_collection("crticTrainOpt_", crticTrainOpt_)

            with tf.variable_scope("summary"+ agentStr):
                criticLossSummary = tf.identity(criticLoss_)
                tf.add_to_collection("criticLossSummary", criticLossSummary)
                tf.summary.scalar("criticLossSummary", criticLossSummary)

            fullSummary = tf.summary.merge_all()
            tf.add_to_collection("summaryOps", fullSummary)

            saver = tf.train.Saver(max_to_keep=None)
            tf.add_to_collection("saver", saver)

            model = tf.Session(graph=graph)
            model.run(tf.global_variables_initializer())

            writer = tf.summary.FileWriter('tensorBoard/onlineDDPG/'+ agentStr, graph= graph)
            tf.add_to_collection("writer", writer)

        return model