Code example #1
File: Common.py  Project: vanstrn/RL_public
class RewardLogging(gym.core.Wrapper):
    def __init__(self, env, loggingPeriod=100, **kwargs):
        super().__init__(env)
        if self.multiprocessing == 1:
            self.GLOBAL_RUNNING_R = MovingAverage(loggingPeriod)
        else:
            if 'GLOBAL_RUNNING_R' not in globals():
                global GLOBAL_RUNNING_R
                GLOBAL_RUNNING_R = MovingAverage(loggingPeriod)
            self.GLOBAL_RUNNING_R = GLOBAL_RUNNING_R

    def reset(self, **kwargs):
        self.tracking_r = []
        return self.env.reset(**kwargs)

    def step(self, action):
        observation, reward, done, info = self.env.step(action=action)
        self.tracking_r.append(reward)
        return observation, reward, done, info

    def getLogging(self):
        """
        Processes the tracked data of the environment.
        In this case it sums the reward over the entire episode.
        """
        self.GLOBAL_RUNNING_R.append(sum(self.tracking_r))
        finalDict = {"TotalReward": self.GLOBAL_RUNNING_R()}
        return finalDict
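
Note: every example on this page relies on a small MovingAverage helper whose implementation is not shown. Judging from how it is used (constructed with a window size, fed with append(), called to read the current mean, and queried with std()), a minimal sketch could look like the following; the actual class in vanstrn/RL_public may differ.

from collections import deque
import numpy as np

class MovingAverage:
    # Fixed-length running average; a sketch of the helper assumed by these examples.
    def __init__(self, size):
        self.window = deque(maxlen=size)   # old samples fall out automatically

    def append(self, value):
        self.window.append(value)

    def __call__(self):
        # Calling the instance returns the current mean (0.0 when empty).
        return float(np.mean(self.window)) if self.window else 0.0

    def std(self):
        return float(np.std(self.window)) if self.window else 0.0

With this sketch, RewardLogging.getLogging would report the mean episodic reward over the last loggingPeriod episodes.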
Code example #2
    def __init__(self,sharedModel,sess,stateShape,actionSize,scope,HPs,globalAC=None,nTrajs=1):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and Critic Network to be used for RL.
        """
        #Placeholders
        self.actionSize =actionSize
        self.sess=sess
        self.scope=scope
        self.Model = sharedModel
        self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
        self.s_next = tf.placeholder(tf.float32, [None] + stateShape, 'S_next')

        input = {"state":self.s}
        out = self.Model(input)
        self.state_pred = out["prediction"]
        self.mu = out["mu"]
        self.log_var = out["log_var"]

        if globalAC is None:   # get global network
            with tf.variable_scope(scope):
                self.s_params = self.Model.GetVariables("Reconstruction")
        else:   # local net, calculate losses
            self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)]
            with tf.variable_scope(scope+"_update"):

                self.s_params = self.Model.GetVariables("Reconstruction")

                with tf.name_scope('s_loss'):
                    if HPs["loss"] == "MSE":
                        self.s_loss = tf.losses.mean_squared_error(self.state_pred,self.s_next)
                    elif HPs["loss"] == "KL":
                        self.s_loss = tf.losses.KLD(self.state_pred,self.s_next)
                    elif HPs["loss"] == "M4E":
                        self.s_loss = tf.reduce_mean((self.state_pred-self.s_next)**4)
                    self.s_loss = self.s_loss - .5 * tf.reduce_sum(1 + self.log_var -
                                            K.square(self.mu) -
                                            K.exp(self.log_var))

                with tf.name_scope('local_grad'):
                    self.s_grads = tf.gradients(self.s_loss, self.s_params)

            with tf.name_scope('sync'):
                with tf.name_scope('pull'):
                    self.pull_s_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.s_params, globalAC.s_params)]

                with tf.name_scope('push'):
                    self.update_s_op = tf.train.AdamOptimizer(HPs["State LR"]).apply_gradients(zip(self.s_grads, globalAC.s_params))

            self.update_ops = [self.update_s_op]
            self.pull_ops = [self.pull_s_params_op]
            self.grads = [self.s_grads]
            self.losses = [self.s_loss]

            self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))]
            self.loss_MA = [MovingAverage(400) for i in range(len(self.grads))]
            self.labels = ["State"]

            self.sess.run(self.pull_ops) #Pulling the variables from the global network to initialize.
Code example #3
File: Common.py  Project: vanstrn/RL_public
    def __init__(self, env, loggingPeriod=100, **kwargs):
        super().__init__(env)
        if self.multiprocessing == 1:
            self.GLOBAL_RUNNING_R = MovingAverage(loggingPeriod)
        else:
            if 'GLOBAL_RUNNING_R' not in globals():
                global GLOBAL_RUNNING_R
                GLOBAL_RUNNING_R = MovingAverage(loggingPeriod)
            self.GLOBAL_RUNNING_R = GLOBAL_RUNNING_R
Code example #4
File: PPO_v4.py  Project: vanstrn/RL_public
class PriorityBuffer():
    def __init__(self, maxSamples=10000):
        self.maxSamples = maxSamples
        self.buffer = []
        self.priorities = []
        self.trajLengths = []
        self.flag = True
        self.slice = 0
        self.sampleSize = 0
        self.errorMA = MovingAverage(1000)

    def GetMuSigma(self):
        return self.errorMA(), self.errorMA.std()

    def AddError(self, val):
        self.errorMA.append(val)

    def AddTrajectory(self, sample, priority):
        if len(sample[0]) == 0:
            return
        self.buffer.append(sample)
        self.priorities.append(priority)
        self.trajLengths.append(len(sample[0]))

    def Sample(self):
        return self.buffer[0:self.slice], self.sampleSize

    def PrioritizeandPruneSamples(self, sampleSize):
        if len(self.trajLengths) == 0:
            return
        if self.flag:
            self.flag = False
        self.priorities, self.buffer, self.trajLengths = (list(t) for t in zip(
            *sorted(zip(self.priorities, self.buffer, self.trajLengths),
                    key=operator.itemgetter(0),
                    reverse=True)))

        #Pruning the least favorable samples
        while sum(self.trajLengths) >= self.maxSamples:
            self.priorities.pop(-1)
            self.buffer.pop(-1)
            self.trajLengths.pop(-1)
        self.sampleSize = 0
        self.slice = 0
        for length in self.trajLengths:
            self.sampleSize += length
            self.slice += 1
            if self.sampleSize > sampleSize:
                break

    def UpdatePriorities(self, priorities):
        self.priorities[0:self.slice] = priorities
        self.flag = True
        return self.buffer

    def GetReprioritySamples(self):
        return self.buffer[0:self.slice]
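
A hedged sketch of how this buffer might be driven during training. Only the method names come from the class above; the trajectory layout (a list of per-field lists, with sample[0] giving the per-step entries) is inferred from AddTrajectory, and the data here is synthetic.

import numpy as np

buf = PriorityBuffer(maxSamples=1000)
for _ in range(5):
    states = np.random.rand(32, 4).tolist()      # 32 steps of a 4-dim state
    rewards = np.random.rand(32).tolist()
    buf.AddTrajectory([states, rewards], priority=float(np.random.rand()))

buf.PrioritizeandPruneSamples(sampleSize=64)      # sort by priority, prune to maxSamples, pick a slice
batch, nSamples = buf.Sample()                    # highest-priority trajectories covering >= 64 steps
buf.UpdatePriorities(list(np.random.rand(len(batch))))  # stand-in for recomputed TD errors on the slice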
Code example #5
    def __init__(self, maxSamples=10000):
        self.maxSamples = maxSamples
        self.buffer = []
        self.priorities = []
        self.trajLengths = []
        self.flag = True
        self.slice = 0
        self.sampleSize = 0
        self.errorMA = MovingAverage(1000)
Code example #6
    def __init__(self,sess,settings,netConfigOverride,stateShape,actionSize,nTrajs=1,**kwargs):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and Critic Network to be used for RL.
        """
        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)]

        #Placeholders
        self.sess=sess
        self.HPs = settings["NetworkHPs"]

        self.s = tf.placeholder(dtype=tf.float32, shape=[None]+stateShape, name="state")
        self.a = tf.placeholder(tf.int32, [None,1], "act")
        # self.td_error = tf.placeholder(tf.float32, None, "td_error")  # TD_error
        self.v_ = tf.placeholder(tf.float32, [None, 1], "v_next")
        self.r = tf.placeholder(tf.float32, [None,1], 'r')


        #These need to be returned in the call function of a tf.keras.Model class.
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize)

        inputs = {"state":self.s}
        out = self.Model(inputs)
        self.acts_prob = out["actor"]
        self.critic = out["critic"]

        #Defining Training Operations which will be called in the Update Function.
        with tf.variable_scope('Update_Operation'):
            with tf.name_scope('squared_TD_error'):
                self.td_error = self.r + self.HPs["Gamma"] * self.v_ - self.critic
                self.c_loss = tf.reduce_mean(tf.square(self.td_error))    # TD_error = (r+gamma*V_next) - V_eval

            with tf.name_scope('train_critic'):
                self.c_params = self.Model.GetVariables("Critic")
                self.c_grads = tf.gradients(self.c_loss, self.c_params)
                self.update_c_op = tf.train.AdamOptimizer(self.HPs["Critic LR"]).apply_gradients(zip(self.c_grads, self.c_params))

            with tf.name_scope('exp_v'):
                log_prob = tf.log(self.acts_prob + 1e-5) * tf.one_hot(self.a, actionSize, dtype=tf.float32)
                self.a_loss = -tf.reduce_mean(log_prob * self.td_error)  # advantage (TD_error) guided loss

            with tf.name_scope('train_actor'):
                self.a_params = self.Model.GetVariables("Actor")
                print(self.a_params)
                self.a_grads = tf.gradients(self.a_loss, self.a_params)
                self.update_a_op = tf.train.AdamOptimizer(self.HPs["Actor LR"]).apply_gradients(zip(self.a_grads, self.a_params))

            self.update_ops=[self.update_c_op,self.update_a_op]

            self.entropy = -tf.reduce_mean(self.acts_prob * _log(self.acts_prob), name='entropy')

            self.logging_ops = [self.a_loss,self.c_loss,self.entropy]
            self.labels = ["Loss Actor","Loss Critic","Entropy"]
            self.logging_MA = [MovingAverage(400) for i in range(len(self.logging_ops))]
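
Note that _log is not defined in this excerpt. In the other examples from the same repository it is the clipped-log helper shown below, so presumably the same definition is assumed here (TensorFlow 1.x, imported as tf).

def _log(val):
    # Clipped log used to avoid log(0), as defined in the MAML and PPO examples on this page.
    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))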
Code example #7
File: CTF_v2.py  Project: zd6/RL
    def __init__(self, env, **kwargs):
        super().__init__(env)
        if self.multiprocessing == 1:
            self.GLOBAL_RUNNING_R = MovingAverage(400)
            self.win_rate = MovingAverage(400)
        else:
            if 'GLOBAL_RUNNING_R' not in globals():
                global GLOBAL_RUNNING_R
                GLOBAL_RUNNING_R = MovingAverage(400)
            self.GLOBAL_RUNNING_R = GLOBAL_RUNNING_R
            self.win_rate = MovingAverage(400)
Code example #8
File: CTF_v2.py  Project: vanstrn/RL_public
class RewardLogging(gym.core.Wrapper):
    def __init__(self,env, **kwargs):
        super().__init__(env)
        if self.multiprocessing == 1:
            self.GLOBAL_RUNNING_R = MovingAverage(400)
            self.win_rate = MovingAverage(400)
            self.red_killed = MovingAverage(400)
        else:
            if 'GLOBAL_RUNNING_R' not in globals():
                global GLOBAL_RUNNING_R
                GLOBAL_RUNNING_R = MovingAverage(400)
            self.GLOBAL_RUNNING_R = GLOBAL_RUNNING_R
            self.win_rate = MovingAverage(400)
            self.red_killed = MovingAverage(400)

    def reset(self, **kwargs):
        self.tracking_r = []
        return self.env.reset(**kwargs)

    def step(self, action):
        observation, reward, done, info = self.env.step(action=action)
        self.tracking_r.append(reward)
        return observation, reward, done, info

    def getLogging(self):
        """
        Processes the tracked data of the environment.
        In this case it sums the reward over the entire episode.
        """
        self.win_rate.append(int(self.blue_win))
        self.GLOBAL_RUNNING_R.append(sum(self.tracking_r))
        self.red_killed.append(int(self.red_eliminated))
        finalDict = {"Env Results/TotalReward":self.GLOBAL_RUNNING_R(),
                     "Env Results/WinRate":self.win_rate(),
                     "Env Results/RedKilled":self.red_killed()}
        return finalDict
Code example #9
File: MAML.py  Project: vanstrn/RL_public
    def __init__(self,sess,settings,netConfigOverride,stateShape,actionSize,nTrajs=1,**kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers describing the input shape. Ex: [39,39,6]
        actionSize : int
            Output size of the network.
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess=sess
        self.HPs = settings["NetworkHPs"]
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize,scope="local")
        self.Model2 = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize,scope="global")
        self.scope =scope ="MAML"
        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope("MAML"):
                #Placeholders
                if len(stateShape) == 4:
                    self.s = tf.placeholder(tf.float32, [None]+stateShape[1:4], 'S')
                else:
                    self.s = tf.placeholder(tf.float32, [None]+stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'Vtarget')
                self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold')
                self.old_log_logits_ = tf.placeholder(shape=[None, actionSize], dtype=tf.float32, name='old_logit_hold')

                #Initializing Network I/O
                inputs = {"state":self.s}
                out = self.Model(inputs)
                _ = self.Model2(inputs)
                self.a_prob = out["actor"]
                self.v = out["critic"]
                self.log_logits = out["log_logits"]
                # Entropy
                def _log(val):
                    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))
                entropy = self.entropy = -tf.reduce_mean(self.a_prob * _log(self.a_prob), name='entropy')

                # Critic Loss
                td_error = self.td_target_ - self.v
                critic_loss = self.critic_loss = tf.reduce_mean(tf.square(td_error), name='critic_loss')

                # Actor Loss
                action_OH = tf.one_hot(self.a_his, actionSize, dtype=tf.float32)
                log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
                old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH, 1)

                # Clipped surrogate function
                ratio = tf.exp(log_prob - old_log_prob)
                surrogate = ratio * self.advantage_
                clipped_surrogate = tf.clip_by_value(ratio, 1-self.HPs["eps"], 1+self.HPs["eps"]) * self.advantage_
                surrogate_loss = tf.minimum(surrogate, clipped_surrogate, name='surrogate_loss')
                actor_loss = self.actor_loss = -tf.reduce_mean(surrogate_loss, name='actor_loss')

                actor_loss = actor_loss - entropy * self.HPs["EntropyBeta"]
                loss = actor_loss + critic_loss * self.HPs["CriticBeta"]

                # Build Trainer
                if self.HPs["Optimizer"] == "Adam":
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Adam(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "RMS":
                    self.optimizer = tf.keras.optimizers.RMSProp(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.RMSProp(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Adagrad":
                    self.optimizer = tf.keras.optimizers.Adagrad(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Adagrad(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Adadelta":
                    self.optimizer = tf.keras.optimizers.Adadelta(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Adadelta(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Adamax":
                    self.optimizer = tf.keras.optimizers.Adamax(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Adamax(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Nadam":
                    self.optimizer = tf.keras.optimizers.Nadam(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Nadam(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "SGD":
                    self.optimizer = tf.keras.optimizers.SGD(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.SGD(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Amsgrad":
                    self.optimizer = tf.keras.optimizers.Nadam(self.HPs["LR"],amsgrad=True)
                    self.metaOptimizer = tf.keras.optimizers.Nadam(self.HPs["Meta LR"],amsgrad=True)
                else:
                    print("Not selected a proper Optimizer")
                    exit()

                vars1 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope+'/local')
                self.gradients = self.optimizer.get_gradients(loss, vars1)
                self.update_ops = self.optimizer.apply_gradients(zip(self.gradients, vars1))

                with tf.name_scope("MetaUpdater"):
                    vars2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope+'/global')
                    self.meta_update_ops = self.metaOptimizer.apply_gradients(zip(self.gradients, vars2))

                with tf.name_scope('sync'):
                    self.pull_params_op = [l_p.assign(g_p) for l_p, g_p in zip(vars1,vars2)]

        #Creating variables for logging.
        self.EntropyMA = MovingAverage(400)
        self.CriticLossMA = MovingAverage(400)
        self.ActorLossMA = MovingAverage(400)
        self.GradMA = MovingAverage(400)
        self.counter = 0
Code example #10
File: MAML.py  Project: vanstrn/RL_public
class MAML(Method):

    def __init__(self,sess,settings,netConfigOverride,stateShape,actionSize,nTrajs=1,**kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers describing the input shape. Ex: [39,39,6]
        actionSize : int
            Output size of the network.
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess=sess
        self.HPs = settings["NetworkHPs"]
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize,scope="local")
        self.Model2 = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize,scope="global")
        self.scope =scope ="MAML"
        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope("MAML"):
                #Placeholders
                if len(stateShape) == 4:
                    self.s = tf.placeholder(tf.float32, [None]+stateShape[1:4], 'S')
                else:
                    self.s = tf.placeholder(tf.float32, [None]+stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'Vtarget')
                self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold')
                self.old_log_logits_ = tf.placeholder(shape=[None, actionSize], dtype=tf.float32, name='old_logit_hold')

                #Initializing Network I/O
                inputs = {"state":self.s}
                out = self.Model(inputs)
                _ = self.Model2(inputs)
                self.a_prob = out["actor"]
                self.v = out["critic"]
                self.log_logits = out["log_logits"]
                # Entropy
                def _log(val):
                    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))
                entropy = self.entropy = -tf.reduce_mean(self.a_prob * _log(self.a_prob), name='entropy')

                # Critic Loss
                td_error = self.td_target_ - self.v
                critic_loss = self.critic_loss = tf.reduce_mean(tf.square(td_error), name='critic_loss')

                # Actor Loss
                action_OH = tf.one_hot(self.a_his, actionSize, dtype=tf.float32)
                log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
                old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH, 1)

                # Clipped surrogate function
                ratio = tf.exp(log_prob - old_log_prob)
                surrogate = ratio * self.advantage_
                clipped_surrogate = tf.clip_by_value(ratio, 1-self.HPs["eps"], 1+self.HPs["eps"]) * self.advantage_
                surrogate_loss = tf.minimum(surrogate, clipped_surrogate, name='surrogate_loss')
                actor_loss = self.actor_loss = -tf.reduce_mean(surrogate_loss, name='actor_loss')

                actor_loss = actor_loss - entropy * self.HPs["EntropyBeta"]
                loss = actor_loss + critic_loss * self.HPs["CriticBeta"]

                # Build Trainer
                if self.HPs["Optimizer"] == "Adam":
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Adam(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "RMS":
                    self.optimizer = tf.keras.optimizers.RMSProp(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.RMSProp(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Adagrad":
                    self.optimizer = tf.keras.optimizers.Adagrad(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Adagrad(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Adadelta":
                    self.optimizer = tf.keras.optimizers.Adadelta(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Adadelta(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Adamax":
                    self.optimizer = tf.keras.optimizers.Adamax(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Adamax(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Nadam":
                    self.optimizer = tf.keras.optimizers.Nadam(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Nadam(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "SGD":
                    self.optimizer = tf.keras.optimizers.SGD(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.SGD(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Amsgrad":
                    self.optimizer = tf.keras.optimizers.Nadam(self.HPs["LR"],amsgrad=True)
                    self.metaOptimizer = tf.keras.optimizers.Nadam(self.HPs["Meta LR"],amsgrad=True)
                else:
                    print("Not selected a proper Optimizer")
                    exit()

                vars1 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope+'/local')
                self.gradients = self.optimizer.get_gradients(loss, vars1)
                self.update_ops = self.optimizer.apply_gradients(zip(self.gradients, vars1))

                with tf.name_scope("MetaUpdater"):
                    vars2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope+'/global')
                    self.meta_update_ops = self.metaOptimizer.apply_gradients(zip(self.gradients, vars2))

                with tf.name_scope('sync'):
                    self.pull_params_op = [l_p.assign(g_p) for l_p, g_p in zip(vars1,vars2)]

        #Creating variables for logging.
        self.EntropyMA = MovingAverage(400)
        self.CriticLossMA = MovingAverage(400)
        self.ActorLossMA = MovingAverage(400)
        self.GradMA = MovingAverage(400)
        self.counter = 0

    def next_task(self):
        if self.counter > 3:
            self.counter = 0
            # self.sess.run(self.update_op)
            self.sess.run(self.pull_params_op)
            return True
        else:
            return False


    def GetAction(self, state, episode=1,step=0):
        """
        Method to run data through the neural network.

        Parameters
        ----------
        state : np.array
            Data with the shape of [N, self.stateShape] where N is the number of samples

        Returns
        -------
        actions : list[int]
            List of actions based on NN output.
        extraData : list
            List of data that is passed to the execution code to be bundled with state data.
        """
        try:
            probs,log_logits,v = self.sess.run([self.a_prob,self.log_logits,self.v], {self.s: state})
        except ValueError:
            probs,log_logits,v = self.sess.run([self.a_prob,self.log_logits,self.v], {self.s: np.expand_dims(state,axis=0)})
        actions = np.array([np.random.choice(probs.shape[1], p=prob / sum(prob)) for prob in probs])
        return actions, [v,log_logits]

    def Update(self,episode=0):
        """
        Processes the buffer and backpropagates the losses through the NN.

        Parameters
        ----------
        HPs : dict
            Hyperparameters for training.

        Returns
        -------
        N/A
        """
        samples=0
        for i in range(len(self.buffer)):
            samples +=len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):

            #Finding if there are more than 1 done in the sequence. Clipping values if required.

            td_target, advantage = self.ProcessBuffer(traj)

            batches = len(self.buffer[traj][0])//self.HPs["MinibatchSize"]+1
            s = np.array_split( self.buffer[traj][0], batches)
            a_his = np.array_split( np.asarray(self.buffer[traj][1]).reshape(-1), batches)
            td_target_ = np.array_split( td_target, batches)
            advantage_ = np.array_split( np.reshape(advantage, [-1]), batches)
            old_log_logits_ = np.array_split( np.reshape(self.buffer[traj][6], [-1,self.actionSize]), batches)

            #Create a dictionary with all of the samples?
            #Use a sampler to feed the update operation?

            #Staging Buffer inputs into the entries to run through the network.
            # print(td_target)
            for epoch in range(self.HPs["Epochs"]):
                for i in range(batches):

                    feed_dict = {self.s: np.squeeze(np.asarray(s[i])),
                                 self.a_his: np.asarray(a_his[i]),
                                 self.td_target_:np.asarray(td_target_[i]),
                                 self.advantage_: np.asarray(advantage_[i]),
                                 self.old_log_logits_: np.asarray(old_log_logits_[i])}
                    # aLoss= self.sess.run([self.actor_loss], feed_dict)
                    if self.counter == 3:
                        aLoss, cLoss, entropy,grads, _ = self.sess.run([self.actor_loss,self.critic_loss,self.entropy,self.gradients,self.meta_update_ops], feed_dict)
                    else:
                        aLoss, cLoss, entropy,grads, _ = self.sess.run([self.actor_loss,self.critic_loss,self.entropy,self.gradients,self.update_ops], feed_dict)

                    self.EntropyMA.append(entropy)
                    self.CriticLossMA.append(cLoss)
                    self.ActorLossMA.append(aLoss)
                    total_counter = 0
                    vanish_counter = 0
                    for grad in grads:
                        total_counter += np.prod(grad.shape)
                        vanish_counter += (np.absolute(grad)<1e-8).sum()
                    self.GradMA.append(vanish_counter/total_counter)
        self.counter += 1
        self.ClearTrajectory()


    def GetStatistics(self):
        dict = {"Training Results/Entropy":self.EntropyMA(),
        "Training Results/Loss Critic":self.CriticLossMA(),
        "Training Results/Loss Actor":self.ActorLossMA(),
        "Training Results/Vanishing Gradient":self.GradMA(),}
        return dict


    def ProcessBuffer(self,traj):
        """
        Processes the buffer into TD targets and advantages for the network update.

        Parameters
        ----------
        Model : HPs
            Hyperparameters for training.
        traj : Trajectory
            Data stored by the neural network.
        clip : list[bool]
            List where the trajectory has finished.

        Returns
        -------
        td_target : list
            List Temporal Difference Target for particular states.
        advantage : list
            List of advantages for particular actions.
        """

        split_loc = [i+1 for i, x in enumerate(self.buffer[traj][4]) if x]

        reward_lists = np.split(self.buffer[traj][2],split_loc)
        value_lists = np.split(self.buffer[traj][5],split_loc)

        td_target=[]; advantage=[]
        for rew,value in zip(reward_lists,value_lists):
            td_target_i, advantage_i = gae(rew.reshape(-1),value.reshape(-1).tolist(),0,self.HPs["Gamma"],self.HPs["lambda"])
            td_target.extend(td_target_i); advantage.extend( advantage_i)
        return td_target, advantage

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
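
ProcessBuffer delegates to a gae helper that is not included in this excerpt. Based on the call gae(rewards, values, 0, Gamma, lambda), a minimal sketch of standard Generalized Advantage Estimation with that signature might look as follows; the repository's actual helper may differ.

import numpy as np

def gae(reward_list, value_list, bootstrap_value, gamma, lambd):
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t);  A_t = sum_k (gamma * lambd)^k * delta_{t+k}
    values = np.asarray(list(value_list) + [bootstrap_value], dtype=np.float32)
    rewards = np.asarray(reward_list, dtype=np.float32)
    deltas = rewards + gamma * values[1:] - values[:-1]
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lambd * running
        advantages[t] = running
    td_target = advantages + values[:-1]   # used as the critic's regression target
    return td_target.tolist(), advantages.tolist()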
Code example #11
File: SF_Off_v2.py  Project: zd6/RL
    def __init__(self,sharedModel,sess,stateShape,actionSize,scope,HPs,nTrajs=1):
        """
        Off policy Successor Representation using neural networks
        Does not create an action for the

        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and Critic Network to be used for RL.
        """
        #Placeholders
        self.actionSize =actionSize
        self.HPs = HPs
        self.sess=sess
        self.scope=scope
        self.Model = sharedModel
        self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
        self.s_next = tf.placeholder(tf.float32, [None] + stateShape, 'S_next')
        self.reward = tf.placeholder(tf.float32, [None, ], 'R')
        self.td_target = tf.placeholder(tf.float32, [None,self.Model.data["DefaultParams"]["SFSize"]], 'TDtarget')

        input = {"state":self.s}
        out = self.Model(input)
        self.value_pred = out["critic"]
        self.state_pred = out["prediction"]
        self.reward_pred = out["reward_pred"]
        self.phi = out["phi"]
        self.psi = out["psi"]

        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]

        self.params = self.Model.getVars()

        with tf.name_scope('loss'):
            sf_error = tf.subtract(self.td_target, self.psi, name='TD_error')
            sf_error = tf.square(sf_error)
            self.c_loss = tf.reduce_mean(sf_error,name="sf_loss")

            if HPs["Loss"] == "MSE":
                self.s_loss = tf.losses.mean_squared_error(self.state_pred,self.s_next)
            elif HPs["Loss"] == "KL":
                self.s_loss = tf.losses.KLD(self.state_pred,self.s_next)
            elif HPs["Loss"] == "M4E":
                self.s_loss = tf.reduce_mean((self.state_pred-self.s_next)**4)

            self.r_loss = tf.losses.mean_squared_error(self.reward,tf.squeeze(self.reward_pred))

            self.loss = self.s_loss + HPs["CriticBeta"]*self.c_loss + HPs["RewardBeta"]*self.r_loss

        if HPs["Optimizer"] == "Adam":
            self.optimizer = tf.keras.optimizers.Adam(HPs["LR"])
        elif HPs["Optimizer"] == "RMS":
            self.optimizer = tf.keras.optimizers.RMSProp(HPs["LR"])
        elif HPs["Optimizer"] == "Adagrad":
            self.optimizer = tf.keras.optimizers.Adagrad(HPs["LR"])
        elif HPs["Optimizer"] == "Adadelta":
            self.optimizer = tf.keras.optimizers.Adadelta(HPs["LR"])
        elif HPs["Optimizer"] == "Adamax":
            self.optimizer = tf.keras.optimizers.Adamax(HPs["LR"])
        elif HPs["Optimizer"] == "Nadam":
            self.optimizer = tf.keras.optimizers.Nadam(HPs["LR"])
        elif HPs["Optimizer"] == "SGD":
            self.optimizer = tf.keras.optimizers.SGD(HPs["LR"])
        elif HPs["Optimizer"] == "Amsgrad":
            self.optimizer = tf.keras.optimizers.Nadam(HPs["LR"],amsgrad=True)
        else:
            print("Not selected a proper Optimizer")
            exit()

        with tf.name_scope('local_grad'):
            self.grads = self.optimizer.get_gradients(self.loss, self.params)

        with tf.name_scope('update'):
            self.update_op = self.optimizer.apply_gradients(zip(self.grads, self.params))


        self.update_ops = [self.update_op]
        self.grads = [self.grads]
        self.losses = [self.c_loss,self.s_loss,self.r_loss]

        self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))]
        self.loss_MA = [MovingAverage(400) for i in range(len(self.losses))]
        self.Gradlabels = ["Total"]
        self.Losslabels = ["Critic","State","Reward"]

        self.clearBuffer = False
Code example #12
    def __init__(self,Model,sess,stateShape,actionSize,HPs,nTrajs=1,scope="PPO_Training",subReward=False):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers describing the input shape. Ex: [39,39,6]
        actionSize : int
            Output size of the network.
        HPs : dict
            Dictionary that contains all hyperparameters to be used in the methods training
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess=sess
        self.Model = Model
        self.method = "Confidence" #Create input for this.
        self.HPs=HPs
        self.subReward = subReward
        self.UpdateSubpolicies = True
        self.nTrajs = nTrajs

        #Creating two buffers to separate information between the different levels of the network.
        if self.subReward:
            self.buffer = [Trajectory(depth=12) for _ in range(nTrajs)]
            #[s0,a,r,r_sub,s1,done]+[HL_actions, HL_log_logits, HL_v, flag, critics, logits]
        else:
            self.buffer = [Trajectory(depth=11) for _ in range(nTrajs)]
            #[s0,a,r,s1,done]+[HL_action, HL_log_logits, HL_v, flag, critics, logits]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):
                #Generic placeholders
                self.s = tf.placeholder(tf.float32, [None]+stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'Vtarget')
                self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold')
                self.options_ = tf.placeholder(shape=[None], dtype=tf.int32, name="options")

                #Initializing Network I/O
                inputs = {"state":self.s}
                out = self.Model(inputs)
                self.term = out["mertaTermination"]
                self.q = out["metaCritic"]

                self.sub_a_prob = out["subActor"]
                self.sub_log_logits = out["subLogLogits"]

                self.nPolicies = len(self.sub_a_prob)

                #Placeholder for the Sub-Policies
                self.old_log_logits_sub_ = tf.placeholder(shape=[None, actionSize], dtype=tf.float32, name='old_logit_sub_hold')

                # Creating the Loss and update calls for the Hierarchical policy
                self.disconnected_q_vals = tf.stop_gradient(self.q)
                self.deliberation_costs = 0
                self.term_op = tf.gather_nd(params=self.term, indices=self.options_)
                self.disconnected_q_vals_option = tf.gather_nd(params=self.disconnected_q_vals, indices=self.options_)
                self.q_vals_option = tf.gather_nd(params=self.q, indices=self.options_)


                loss_termination = tf.reduce_mean(self.term_op * ((self.disconnected_q_vals_option - disconnected_value) + self.deliberation_costs) )
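                # NOTE: disconnected_value (used above) is not defined in this excerpt; it presumably
                # refers to a stop-gradient state-value term computed elsewhere in the source file.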

                loss_value = tf.reduce_mean(tf.square(self.td_target_ - self.q_vals_option), name='critic_loss')

                self.hierarchicalLoss = loss_value + loss_termination
                variables = self.Model.getHierarchyVariables()
                self.hierarchyUpdater = self.CreateUpdater(self.hierarchicalLoss,variables)

                # Creating the Losses updaters for the Sub-policies.
                self.subpolicyLoss = []
                self.subpolicyUpdater = []
                #Stop_gradient for the value function

                for i in range(self.nPolicies):
                    loss = self.CreateLossSubpolicy(self.sub_a_prob[i],self.td_target_,self.disconnected_q_vals,self.a_his,self.sub_log_logits[i],self.old_log_logits_sub_,self.advantage_,self.actionSize)
                    self.subpolicyLoss.append(loss)
                    variables = self.Model.getSubpolicyVariables(i)
                    self.subpolicyUpdater.append(self.CreateUpdater(loss,variables))

            #Creating variables for the purpose of logging.
            self.SubpolicyDistribution = MovingAverage(1000)
Code example #13
    def __init__(self,
                 sharedModel,
                 sess,
                 stateShape,
                 actionSize,
                 scope,
                 HPs,
                 sharedBuffer,
                 globalNet=None,
                 nTrajs=1,
                 LSTM=False):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and Critic Network to be used for RL.
        """
        #Placeholders
        self.LSTM = LSTM
        self.sess = sess
        self.scope = scope
        self.Model = sharedModel
        self.sharedBuffer = sharedBuffer
        #Common Stuff Between the networks:
        self.HPs = HPs

        #Creating the different values of beta
        def sigmoid(x):
            return 1 / (1 + np.exp(-x))

        self.betas = []
        for i in range(self.HPs["N"]):
            if i == 0:
                self.betas.append(0.0)
            elif i == self.HPs["N"] - 1:
                self.betas.append(self.HPs["betaMax"])
            else:
                self.betas.append(self.HPs["betaMax"] * sigmoid(
                    (2.0 * float(i) + 2.0 - self.HPs["N"]) /
                    (self.HPs["N"] - 2.0)))

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):

                #Specifying placeholders for Tensorflow Networks
                if len(stateShape) == 4:
                    self.states_ = tf.placeholder(shape=[None] +
                                                  stateShape[1:4],
                                                  dtype=tf.float32,
                                                  name='states')
                    self.next_states_ = tf.placeholder(shape=[None] +
                                                       stateShape[1:4],
                                                       dtype=tf.float32,
                                                       name='next_states')
                else:
                    self.states_ = tf.placeholder(shape=[None] + stateShape,
                                                  dtype=tf.float32,
                                                  name='states')
                    self.next_states_ = tf.placeholder(shape=[None] +
                                                       stateShape,
                                                       dtype=tf.float32,
                                                       name='next_states')
                self.actions_ = tf.placeholder(shape=[None],
                                               dtype=tf.int32,
                                               name='actions_hold')
                self.done_ = tf.placeholder(shape=[None],
                                            dtype=tf.float32,
                                            name='done_hold')
                self.rewards_ = tf.placeholder(shape=[None],
                                               dtype=tf.float32,
                                               name='total_reward')
                self.bandit_one_hot = tf.placeholder(
                    shape=[None, self.HPs["N"]],
                    dtype=tf.int32,
                    name='beta_bandit')
                self.action_past = tf.placeholder(shape=[None],
                                                  dtype=tf.int32,
                                                  name='action_past')
                self.reward_i_past = tf.placeholder(shape=[None],
                                                    dtype=tf.float32,
                                                    name='reward_i_past')
                self.reward_e_past = tf.placeholder(shape=[None],
                                                    dtype=tf.float32,
                                                    name='reward_e_past')
                self.reward_i_current = tf.placeholder(shape=[None],
                                                       dtype=tf.float32,
                                                       name='reward_i_current')
                self.reward_e_current = tf.placeholder(shape=[None],
                                                       dtype=tf.float32,
                                                       name='reward_e_current')

                # Creating the IO for the entire network
                input = {
                    "state": self.states_,
                    "state_next": self.next_states_,
                    "bandit_one_hot": self.bandit_one_hot,
                    "action_past": self.action_past,
                    "reward_i_past": self.reward_i_past,
                    "reward_e_past": self.reward_e_past,
                }
                out = self.Model(input)
                self.q = out["Q"]
                self.a_pred = out["action_prediction"]
                self.latent = out["latent_space"]
                self.rnd_random = out["RND_random"]
                self.rnd_predictor = out["RND_predictor"]

                input2 = {
                    "state": self.next_states_,
                    "state_next":
                    self.next_states_,  #Used as a placeholder in network
                    "bandit_one_hot": self.bandit_one_hot,
                    "action_past": self.actions_,
                    "reward_i_past": self.reward_i_current,
                    "reward_e_past": self.reward_e_current,
                }
                out2 = self.Model(input2)
                q_next = out2["Q"]
                with tf.name_scope('q_learning'):
                    #Current Q
                    oh_action = tf.one_hot(
                        self.actions_, actionSize,
                        dtype=tf.float32)  # [?, num_agent, action_size]
                    curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action),
                                           axis=-1)  # [?, num_agent]
                    #Next Q
                    max_next_q = tf.reduce_max(q_next, axis=-1)
                    #TD Error
                    td_target = self.rewards_ + HPs["Gamma"] * max_next_q * (
                        1. - self.done_)
                    self.td_error = loss = tf.keras.losses.MSE(
                        td_target, curr_q)
                    softmax_q = tf.nn.softmax(curr_q)
                    self.entropy = -tf.reduce_mean(
                        softmax_q * tf.log(softmax_q))
                    self.loss = loss + HPs["EntropyBeta"] * self.entropy

                self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope)

                if globalNet is None:  #Creating the Training instance of the network.
                    with tf.name_scope('embedding_network'):
                        oh_action = tf.one_hot(
                            self.actions_, actionSize,
                            dtype=tf.float32)  # [?, num_agent, action_size]
                        self.embedding_loss = tf.keras.losses.MSE(
                            oh_action, self.a_pred)

                    with tf.name_scope('life_long_curiosity'):
                        self.llc_loss = tf.keras.losses.MSE(
                            self.rnd_random, self.rnd_predictor)
                    loss = self.loss + self.llc_loss + self.embedding_loss

                    optimizer = tf.keras.optimizers.Adam(HPs["LearningRate"])

                    self.gradients = optimizer.get_gradients(loss, self.params)
                    self.update_op = optimizer.apply_gradients(
                        zip(self.gradients, self.params))

                    self.grads = [self.gradients]
                    self.losses = [loss]
                    self.update_ops = [self.update_op]

                    self.grad_MA = [
                        MovingAverage(400) for i in range(len(self.grads))
                    ]
                    self.loss_MA = [
                        MovingAverage(400) for i in range(len(self.losses))
                    ]
                    self.entropy_MA = MovingAverage(400)
                    self.labels = ["Total"]
                    self.HPs = HPs

                else:  #Creating a Actor Instance for the Network.
                    #Creating the Episodic Memory, which compares samples
                    self.episodicMemory = EpisodicMemory()
                    #Creating Local Buffer to store data until it is ready to push to sample buffer
                    self.buffer = [Trajectory(depth=10) for _ in range(nTrajs)]
                    #Creating a pull operation to synch network parameters
                    with tf.name_scope('sync'):
                        self.pull_params_op = [
                            l_p.assign(g_p)
                            for l_p, g_p in zip(self.params, globalNet.params)
                        ]
                        self.pull_ops = [self.pull_params_op]
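
The betas list built near the top of this example is a discrete schedule of intrinsic-reward weights for the HPs["N"] parallel policies: beta_0 = 0, beta_{N-1} = betaMax, and the intermediate values follow a sigmoid. A standalone version of the same computation, convenient for inspecting the schedule (the function name is hypothetical):

import numpy as np

def beta_schedule(N, beta_max):
    # Reproduces the beta computation from the example above.
    sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
    betas = []
    for i in range(N):
        if i == 0:
            betas.append(0.0)
        elif i == N - 1:
            betas.append(beta_max)
        else:
            betas.append(beta_max * sigmoid((2.0 * i + 2.0 - N) / (N - 2.0)))
    return betas

print(beta_schedule(8, 0.3))   # e.g. an 8-policy schedule with a maximum weight of 0.3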
Code example #14
    def __init__(self,
                 sess,
                 settings,
                 netConfigOverride,
                 stateShape,
                 actionSize,
                 nTrajs=1,
                 **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers describing the input shape. Ex: [39,39,6]
        actionSize : int
            Output size of the network.
        HPs : dict
            Dictionary that contains all hyperparameters to be used in the methods training
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess = sess
        self.HPs = settings["NetworkHPs"]

        #Building the network.
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],
                                    netConfigOverride=netConfigOverride,
                                    actionSize=actionSize)

        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope("PPO"):
                #Placeholders
                if len(stateShape) == 4:
                    self.s = tf.placeholder(tf.float32,
                                            [None] + stateShape[0:4], 'S')
                else:
                    self.s = tf.placeholder(tf.float32, [None] + stateShape,
                                            'S')
                self.a_his = tf.placeholder(tf.int32, [
                    None,
                ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'Vtarget')
                self.advantage_ = tf.placeholder(shape=[None],
                                                 dtype=tf.float32,
                                                 name='adv_hold')
                self.old_log_logits_ = tf.placeholder(shape=[None, actionSize],
                                                      dtype=tf.float32,
                                                      name='old_logit_hold')

                #Initializing Network I/O
                inputs = {"state": self.s}
                out = self.Model(inputs)
                self.a_prob = out["actor"]
                self.v = out["critic"]
                self.log_logits = out["log_logits"]

                # Entropy
                def _log(val):
                    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))

                self.entropy = -tf.reduce_mean(self.a_prob * _log(self.a_prob),
                                               name='entropy')

                # Critic Loss
                td_error = self.td_target_ - self.v
                self.critic_loss = tf.reduce_mean(tf.square(td_error),
                                                  name='critic_loss')

                # Actor Loss
                action_OH = tf.one_hot(self.a_his,
                                       actionSize,
                                       dtype=tf.float32)
                log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
                old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH,
                                             1)

                # Clipped surrogate function
                ratio = tf.exp(log_prob - old_log_prob)
                surrogate = ratio * self.advantage_
                clipped_surrogate = tf.clip_by_value(
                    ratio, 1 - self.HPs["eps"],
                    1 + self.HPs["eps"]) * self.advantage_
                surrogate_loss = tf.minimum(surrogate,
                                            clipped_surrogate,
                                            name='surrogate_loss')
                self.actor_loss = -tf.reduce_mean(surrogate_loss,
                                                  name='actor_loss')

                loss = self.actor_loss + self.critic_loss * self.HPs[
                    "CriticBeta"]

                # Build Trainer
                if self.HPs["Optimizer"] == "Adam":
                    self.optimizerA = tf.keras.optimizers.Adam(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Adam(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "RMS":
                    self.optimizerA = tf.keras.optimizers.RMSProp(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.RMSProp(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Adagrad":
                    self.optimizerA = tf.keras.optimizers.Adagrad(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Adagrad(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Adadelta":
                    self.optimizerA = tf.keras.optimizers.Adadelta(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Adadelta(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Adamax":
                    self.optimizerA = tf.keras.optimizers.Adamax(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Adamax(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Nadam":
                    self.optimizerA = tf.keras.optimizers.Nadam(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Nadam(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "SGD":
                    self.optimizerA = tf.keras.optimizers.SGD(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.SGD(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Amsgrad":
                    self.optimizerA = tf.keras.optimizers.Nadam(
                        self.HPs["LR Actor"], amsgrad=True)
                    self.optimizerE = tf.keras.optimizers.Nadam(
                        self.HPs["LR Entropy"], amsgrad=True)
                else:
                    print("Not selected a proper Optimizer")
                    exit()
                a_params = self.Model.GetVariables("Actor")
                c_params = self.Model.GetVariables("Critic")
                self.gradients_a = self.optimizerA.get_gradients(
                    loss, self.Model.trainable_variables)
                self.update_op_a = self.optimizerA.apply_gradients(
                    zip(self.gradients_a, self.Model.trainable_variables))

                entropy_loss = -self.entropy * self.HPs["EntropyBeta"]
                self.gradients_e = self.optimizerE.get_gradients(
                    entropy_loss, a_params)
                self.update_op_e = self.optimizerE.apply_gradients(
                    zip(self.gradients_e, a_params))

                total_counter = 1
                vanish_counter = 0
                for gradient in self.gradients_a:
                    total_counter += np.prod(gradient.shape)
                    stuff = tf.reduce_sum(
                        tf.cast(
                            tf.math.less_equal(tf.math.abs(gradient),
                                               tf.constant(1e-8)), tf.int32))
                    vanish_counter += stuff
                self.vanishing_gradient = vanish_counter / total_counter

        self.update_ops = [self.update_op_a, self.update_op_e]
        self.logging_ops = [
            self.actor_loss, self.critic_loss, self.entropy,
            tf.reduce_mean(self.advantage_),
            tf.reduce_mean(ratio), loss, self.vanishing_gradient
        ]
        self.labels = [
            "Loss Actor", "Loss Critic", "Entropy", "Advantage", "PPO Ratio",
            "Loss Total", "Vanishing Gradient"
        ]
        self.logging_MA = [
            MovingAverage(400) for i in range(len(self.logging_ops))
        ]
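
This example builds the same vanishing-gradient statistic symbolically that the MAML example above computes in NumPy after each update: the fraction of gradient entries whose magnitude falls below 1e-8. A standalone NumPy version of that metric (the helper name is hypothetical):

import numpy as np

def vanishing_gradient_fraction(grads, threshold=1e-8):
    # Fraction of gradient entries with magnitude below the threshold,
    # mirroring the statistic fed into GradMA / the "Vanishing Gradient" log entry.
    total, vanished = 0, 0
    for g in grads:
        g = np.asarray(g)
        total += g.size
        vanished += int((np.abs(g) < threshold).sum())
    return vanished / max(total, 1)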
Code example #15
File: PPO_Hierarchy.py  Project: vanstrn/RL_public
    def __init__(self,
                 sess,
                 settings,
                 netConfigOverride,
                 stateShape,
                 actionSize,
                 nTrajs=1,
                 **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        sess : Tensorflow Session
            Initialized Tensorflow session.
        settings : dict
            Run settings; must contain the "NetworkConfig" and "HPs" entries used below.
        netConfigOverride : dict
            Dictionary of overrides applied when the network is built from its config.
        stateShape : list
            List of integers of the input shape size. Ex [39,39,6]
        actionSize : int
            Output size of the network.
        nTrajs : int (Optional)
            Number of trajectories to be created for collecting training data.
        scope : str (Optional, passed via kwargs)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess = sess
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],
                                    netConfigOverride=netConfigOverride,
                                    actionSize=actionSize)
        self.HPs = settings["HPs"]
        self.subReward = False
        self.UpdateSubpolicies = True
        self.nTrajs = nTrajs
        self.method = self.HPs["Method"]
        # 'scope' only arrives through kwargs in this signature; fall back to a default
        # name (assumed here) so the tf.name_scope below is always defined.
        scope = kwargs.get("scope", "PPO_Hierarchy")

        #Creating two buffers to separate information between the different levels of the network.
        if self.subReward:
            self.buffer = [Trajectory(depth=12) for _ in range(nTrajs)]
            #[s0,a,r,r_sub,s1,done]+[HL_actions, HL_log_logits, HL_v, flag, critics, logits]
        else:
            self.buffer = [Trajectory(depth=11) for _ in range(nTrajs)]
            #[s0,a,r,s1,done]+[HL_action, HL_log_logits, HL_v, flag, critics, logits]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):
                #Generic placeholders
                self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [
                    None,
                ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'Vtarget')
                self.advantage_ = tf.placeholder(shape=[None],
                                                 dtype=tf.float32,
                                                 name='adv_hold')

                #Initializing Network I/O
                inputs = {"state": self.s}
                out = self.Model(inputs)
                self.a_prob = out["metaActor"]
                self.v = out["metaCritic"]
                self.log_logits = out["metaLogLogits"]

                self.sub_a_prob = out["subActor"]
                self.sub_log_logits = out["subLogLogits"]
                self.sub_v = out["subCritic"]

                self.nPolicies = len(self.sub_a_prob)

                #Placeholder for the Hierarchical Policy
                self.old_log_logits_ = tf.placeholder(
                    shape=[None, self.nPolicies],
                    dtype=tf.float32,
                    name='old_logit_hold')
                #Placeholder for the Sub-Policies
                self.old_log_logits_sub_ = tf.placeholder(
                    shape=[None, actionSize],
                    dtype=tf.float32,
                    name='old_logit_sub_hold')

                # Creating the Loss and update calls for the Hierarchical policy
                self.hierarchicalLoss = self.CreateLossPPO(
                    self.a_prob, self.td_target_, self.v, self.a_his,
                    self.log_logits, self.old_log_logits_, self.advantage_,
                    self.nPolicies)
                variables = self.Model.getHierarchyVariables()
                self.hierarchyUpdater = self.CreateUpdater(
                    self.hierarchicalLoss, variables)

                # Creating the Losses updaters for the Sub-policies.
                self.subpolicyLoss = []
                self.subpolicyUpdater = []
                for i in range(self.nPolicies):
                    loss = self.CreateLossPPO(self.sub_a_prob[i],
                                              self.td_target_, self.sub_v[i],
                                              self.a_his,
                                              self.sub_log_logits[i],
                                              self.old_log_logits_sub_,
                                              self.advantage_, self.actionSize)
                    self.subpolicyLoss.append(loss)
                    variables = self.Model.getSubpolicyVariables(i)
                    self.subpolicyUpdater.append(
                        self.CreateUpdater(loss, variables))

            #Creating variables for the purpose of logging.
            self.SubpolicyDistribution = MovingAverage(1000)
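
At run time a hierarchical actor like the one above first samples a sub-policy index from the meta actor, then samples an environment action from that sub-policy. A schematic NumPy sketch of this two-level sampling, assuming the softmax outputs have already been evaluated (function and variable names are illustrative):

import numpy as np

def sample_hierarchical_action(meta_probs, sub_probs_list, rng=np.random):
    """Pick a sub-policy with the meta policy, then an action with that sub-policy."""
    option = rng.choice(len(meta_probs), p=meta_probs)                 # high-level decision
    action = rng.choice(len(sub_probs_list[option]), p=sub_probs_list[option])
    return option, action

meta = np.array([0.7, 0.3])                          # metaActor output (nPolicies = 2)
subs = [np.array([0.5, 0.5]), np.array([0.1, 0.9])]  # subActor outputs (actionSize = 2)
print(sample_hierarchical_action(meta, subs))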
コード例 #16
0
    def __init__(self,sess,settings,netConfigOverride,stateShape,actionSize,nTrajs=1,**kwargs):
        """
        Initializes I/O placeholders and the training process of a Multi-step DQN.
        Main principle is that, instead of a one-step TD difference, the loss is evaluated on a
        temporally extended target:
        G = R_t + γ R_{t+1} + ... + γ^{n-1} R_{t+n-1} + γ^n q(S_{t+n}, a*, θ-)
        loss = MSE(G, q(S_t, A_t, θ))

        """
        #Placeholders
        self.actionSize = actionSize
        self.sess=sess
        self.scope="worker"
        self.HPs = settings["NetworkHPs"]
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize,scope="worker")
        self.Model_ = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize,scope="target")

        self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)]
        with self.sess.as_default(), self.sess.graph.as_default():
            self.states_ = tf.placeholder(shape=[None]+stateShape, dtype=tf.float32, name='states')
            self.next_states_ = tf.placeholder(shape=[None]+stateShape, dtype=tf.float32, name='next_states')
            self.actions_ = tf.placeholder(shape=[None], dtype=tf.int32, name='actions_hold')
            self.rewards_ = tf.placeholder(shape=[None], dtype=tf.float32, name='rewards_hold')
            self.done_ = tf.placeholder(shape=[None], dtype=tf.float32, name='done_hold')
            with tf.name_scope("target"):
                out2 = self.Model_({"state":self.next_states_})
                q_next = out2["Q"]
                self.targetParams = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "target")
            with tf.name_scope(self.scope):
                input = {"state":self.states_}
                out = self.Model(input)
                self.q = out["Q"]

                with tf.name_scope('current_Q'):
                    oh_action = tf.one_hot(self.actions_, actionSize, dtype=tf.float32) # [?, num_agent, action_size]
                    curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action), axis=-1) # [?, num_agent]

                with tf.name_scope('target_Q'):
                    max_next_q = tf.reduce_max(q_next, axis=-1)
                    td_target = self.rewards_  + self.HPs["Gamma"] * max_next_q
                    # td_target = self.rewards_  + self.HPs["Gamma"] * max_next_q * (1. - self.done_)

                with tf.name_scope('td_error'):
                    loss = tf.keras.losses.MSE(td_target, curr_q)
                    softmax_q = tf.nn.softmax(curr_q)
                    self.entropy = -tf.reduce_mean(softmax_q * tf.log(softmax_q+ 1e-5))
                    self.loss=total_loss = loss + self.HPs["EntropyBeta"] * self.entropy

                if self.HPs["Optimizer"] == "Adam":
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "RMS":
                    self.optimizer = tf.keras.optimizers.RMSprop(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adagrad":
                    self.optimizer = tf.keras.optimizers.Adagrad(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adadelta":
                    self.optimizer = tf.keras.optimizers.Adadelta(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adamax":
                    self.optimizer = tf.keras.optimizers.Adamax(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Nadam":
                    self.optimizer = tf.keras.optimizers.Nadam(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "SGD":
                    self.optimizer = tf.keras.optimizers.SGD(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Amsgrad":
                    self.optimizer = tf.keras.optimizers.Nadam(self.HPs["LR"],amsgrad=True)
                else:
                    print("Not selected a proper Optimizer")
                    exit()
                self.workerParams = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

                self.gradients = self.optimizer.get_gradients(total_loss, self.workerParams)
                self.update_op = self.optimizer.apply_gradients(zip(self.gradients, self.workerParams))

                with tf.name_scope('push'):
                    self.push_ops = [l_p.assign(g_p) for l_p, g_p in zip(self.targetParams, self.workerParams)]

                self.grads=[self.gradients]
                self.losses=[self.loss]
                self.update_ops=[self.update_op]

        self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))]
        self.loss_MA = [MovingAverage(400) for i in range(len(self.losses))]
        self.labels = ["Critic"]
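
The docstring above defines the multi-step target G. A small reference implementation of that target for a single state, assuming the next n rewards and a bootstrap Q-value are available (names are illustrative):

def n_step_return(rewards, bootstrap_q, gamma, n):
    """G = r_t + gamma*r_{t+1} + ... + gamma^{n-1}*r_{t+n-1} + gamma^n * q(s_{t+n}, a*)."""
    G = 0.0
    for k in range(min(n, len(rewards))):
        G += (gamma ** k) * rewards[k]
    return G + (gamma ** n) * bootstrap_q

# 1 + 0.9*0 + 0.81*1 + 0.729*0.5 = 2.1745
print(n_step_return([1.0, 0.0, 1.0], bootstrap_q=0.5, gamma=0.9, n=3))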
コード例 #17
0
ファイル: A3C.py プロジェクト: zd6/RL
class A3C(Method):
    def __init__(self,
                 sharedModel,
                 sess,
                 stateShape,
                 actionSize,
                 scope,
                 HPs,
                 globalAC=None,
                 nTrajs=1):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and Critic network to be used for RL.
        """
        #Placeholders

        self.sess = sess
        self.scope = scope
        self.Model = sharedModel
        if len(stateShape) == 4:
            self.s = tf.placeholder(tf.float32, [None] + stateShape[1:4], 'S')
        else:
            self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
        self.a_his = tf.placeholder(tf.int32, [
            None,
        ], 'A')
        self.v_target = tf.placeholder(tf.float32, [None], 'Vtarget')

        input = {"state": self.s}
        out = self.Model(input)
        self.a_prob = out["actor"]
        self.v = out["critic"]

        if globalAC is None:  # get global network
            with tf.variable_scope(scope):
                self.a_params = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES,
                    scope=self.Model.scope + '/Shared') + tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        scope=self.Model.scope + '/Actor')
                self.c_params = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES,
                    scope=self.Model.scope + '/Shared') + tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        scope=self.Model.scope + '/Critic')
        else:  # local net, calculate losses
            self.buffer = [Trajectory(depth=6) for _ in range(nTrajs)]
            with tf.variable_scope(scope + "_update"):

                self.a_params = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES,
                    scope=self.Model.scope + '/Shared') + tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        scope=self.Model.scope + '/Actor')
                self.c_params = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES,
                    scope=self.Model.scope + '/Shared') + tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        scope=self.Model.scope + '/Critic')
                print(self.c_params)
                td = tf.subtract(self.v_target, self.v, name='TD_error')
                with tf.name_scope('c_loss'):
                    self.c_loss = tf.reduce_mean(tf.square(td))

                with tf.name_scope('a_loss'):
                    log_prob = tf.reduce_sum(
                        tf.log(self.a_prob + 1e-5) *
                        tf.one_hot(self.a_his, actionSize, dtype=tf.float32),
                        axis=1,
                        keep_dims=True)
                    exp_v = log_prob * tf.stop_gradient(td)
                    entropy = -tf.reduce_sum(
                        self.a_prob * tf.log(self.a_prob + 1e-5),
                        axis=1,
                        keep_dims=True)  # encourage exploration
                    self.entropy = entropy
                    self.exp_v = HPs["EntropyBeta"] * entropy + exp_v
                    self.a_loss = tf.reduce_mean(-self.exp_v)

                with tf.name_scope('local_grad'):
                    self.a_grads = tf.gradients(self.a_loss, self.a_params)
                    self.c_grads = tf.gradients(self.c_loss, self.c_params)

            with tf.name_scope('sync'):
                with tf.name_scope('pull'):
                    self.pull_a_params_op = [
                        l_p.assign(g_p)
                        for l_p, g_p in zip(self.a_params, globalAC.a_params)
                    ]
                    self.pull_c_params_op = [
                        l_p.assign(g_p)
                        for l_p, g_p in zip(self.c_params, globalAC.c_params)
                    ]
                with tf.name_scope('push'):
                    self.update_a_op = tf.train.AdamOptimizer(
                        HPs["Actor LR"]).apply_gradients(
                            zip(self.a_grads, globalAC.a_params))
                    self.update_c_op = tf.train.AdamOptimizer(
                        HPs["Critic LR"]).apply_gradients(
                            zip(self.c_grads, globalAC.c_params))

            self.update_ops = [
                self.update_a_op,
                self.update_c_op,
            ]
            self.pull_ops = [
                self.pull_a_params_op,
                self.pull_c_params_op,
            ]
            self.grads = [
                self.a_grads,
                self.c_grads,
            ]
            self.losses = [
                self.a_loss,
                self.c_loss,
            ]

            self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))]
            self.loss_MA = [MovingAverage(400) for i in range(len(self.grads))]
            self.entropy_MA = MovingAverage(400)
            self.labels = [
                "Actor",
                "Critic",
            ]
            self.HPs = HPs

    def GetAction(self, state):
        """
        Contains the code to run the network based on an input.
        """
        if len(state.shape) == 3:
            state = state[np.newaxis, :]
        if len(state.shape) == 1:
            state = state[np.newaxis, :]
        probs, v = self.sess.run(
            [self.a_prob, self.v],
            {self.s: state})  # get probabilities for all actions

        actions = np.array([
            np.random.choice(probs.shape[1], p=prob / sum(prob))
            for prob in probs
        ])
        return actions, [
            v
        ]  # return an action and extra data that needs to be fed to the buffer.

    def Update(self, HPs, episode=0, statistics=True):
        """
        The main update function for A3C. The function pushes gradients to the global AC network
        and then pulls the updated global parameters back into the local network.
        """
        #Process the data from the buffer
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):

            td_target, _ = self.ProcessBuffer(HPs, traj)
            batches = len(
                self.buffer[traj][0]) // self.HPs["MinibatchSize"] + 1

            s = np.array_split(np.squeeze(self.buffer[traj][0]), batches)
            a_his = np.array_split(
                np.asarray(self.buffer[traj][1]).reshape(-1), batches)
            v_target = np.array_split(td_target, batches)

            for epoch in range(self.HPs["Epochs"]):
                for i in range(batches):
                    #Create a feedDict from the buffer
                    feedDict = {
                        self.s: s[i],
                        self.a_his: a_his[i],
                        self.v_target: v_target[i],
                    }
                    if not statistics:
                        self.sess.run(self.update_ops, feedDict)
                    else:
                        #Perform update operations
                        out = self.sess.run(
                            self.update_ops + self.losses + self.grads,
                            feedDict)  # local grads applied to global net.
                        out = np.array_split(out, 3)
                        losses = out[1]
                        grads = out[2]

                        for i, loss in enumerate(losses):
                            self.loss_MA[i].append(loss)

                        for i, grads_i in enumerate(grads):
                            total_counter = 0
                            vanish_counter = 0
                            for grad in grads_i:
                                total_counter += np.prod(grad.shape)
                                vanish_counter += (np.absolute(grad) <
                                                   1e-8).sum()
                            self.grad_MA[i].append(vanish_counter /
                                                   total_counter)

                        ent = self.sess.run(
                            self.entropy,
                            feedDict)  # evaluate entropy for logging.
                        entropy = np.average(np.asarray(ent))
                        self.entropy_MA.append(entropy)

        self.ClearTrajectory()
        self.sess.run(
            self.pull_ops)  # global variables synched to the local net.

    def GetStatistics(self):
        dict = {}
        for i, label in enumerate(self.labels):
            dict["Training Results/Vanishing Gradient " +
                 label] = self.grad_MA[i]()
            dict["Training Results/Loss " + label] = self.loss_MA[i]()
            dict["Training Results/Entropy"] = self.entropy_MA()
        return dict

    def ProcessBuffer(self, HPs, traj):
        """
        Process the buffer to calculate td_target.

        Parameters
        ----------
        HPs : dict
            Hyperparameters for training.
        traj : int
            Index of the trajectory buffer to process.

        Returns
        -------
        td_target : list
            List Temporal Difference Target for particular states.
        advantage : list
            List of advantages for particular actions.
        """
        split_loc = [i + 1 for i, x in enumerate(self.buffer[traj][4]) if x]

        reward_lists = np.split(self.buffer[traj][2], split_loc)
        value_lists = np.split(self.buffer[traj][5], split_loc)

        td_target = []
        advantage = []
        for rew, value in zip(reward_lists, value_lists):
            td_target_i, advantage_i = gae(rew.reshape(-1),
                                           value.reshape(-1).tolist(), 0,
                                           self.HPs["Gamma"],
                                           self.HPs["lambda"])
            td_target.extend(td_target_i)
            advantage.extend(advantage_i)
        return td_target, advantage

    @property
    def getVars(self):
        return self.Model.getVars(self.scope)

    @property
    def getAParams(self):
        return tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES,
            scope=self.Model.scope + '/Shared') + tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES,
                scope=self.Model.scope + '/Actor')

    @property
    def getCParams(self):
        return tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES,
            scope=self.Model.scope + '/Shared') + tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES,
                scope=self.Model.scope + '/Critic')
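
ProcessBuffer above delegates the target computation to a gae(...) helper that is not shown in this listing. A minimal sketch of generalized advantage estimation with the same call shape (rewards, values, bootstrap value, gamma, lambda), given only as an assumption about what that helper computes:

import numpy as np

def gae_sketch(rewards, values, bootstrap_value, gamma, lam):
    """Return (td_targets, advantages) for one episode segment."""
    values = list(values) + [bootstrap_value]
    advantages = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]   # one-step TD error
        running = delta + gamma * lam * running                  # discounted sum of deltas
        advantages[t] = running
    td_targets = advantages + np.asarray(values[:-1])
    return td_targets.tolist(), advantages.tolist()

print(gae_sketch([1.0, 1.0], [0.5, 0.5], 0.0, 0.99, 0.95))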
コード例 #18
0
ファイル: DQN.py プロジェクト: zd6/RL
    def __init__(self,
                 sharedModel,
                 sess,
                 stateShape,
                 actionSize,
                 scope,
                 HPs,
                 globalAC=None,
                 nTrajs=1):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and Critic network to be used for RL.
        """
        #Placeholders

        self.sess = sess
        self.scope = scope
        self.Model = sharedModel

        self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)]
        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):
                if len(stateShape) == 4:
                    self.states_ = tf.placeholder(shape=[None] +
                                                  stateShape[1:4],
                                                  dtype=tf.float32,
                                                  name='states')
                    self.next_states_ = tf.placeholder(shape=[None] +
                                                       stateShape[1:4],
                                                       dtype=tf.float32,
                                                       name='next_states')
                else:
                    self.states_ = tf.placeholder(shape=[None] + stateShape,
                                                  dtype=tf.float32,
                                                  name='states')
                    self.next_states_ = tf.placeholder(shape=[None] +
                                                       stateShape,
                                                       dtype=tf.float32,
                                                       name='next_states')
                self.actions_ = tf.placeholder(shape=[None],
                                               dtype=tf.int32,
                                               name='actions_hold')
                self.rewards_ = tf.placeholder(shape=[None],
                                               dtype=tf.float32,
                                               name='rewards_hold')
                self.done_ = tf.placeholder(shape=[None],
                                            dtype=tf.float32,
                                            name='done_hold')

                input = {"state": self.states_}
                out = self.Model(input)
                self.q = out["Q"]

                out2 = self.Model({"state": self.next_states_})
                q_next = out2["Q"]

                with tf.name_scope('current_Q'):
                    oh_action = tf.one_hot(
                        self.actions_, actionSize,
                        dtype=tf.float32)  # [?, num_agent, action_size]
                    curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action),
                                           axis=-1)  # [?, num_agent]

                with tf.name_scope('target_Q'):
                    max_next_q = tf.reduce_max(q_next, axis=-1)
                    td_target = self.rewards_ + HPs["Gamma"] * max_next_q * (
                        1. - self.done_)

                with tf.name_scope('td_error'):
                    loss = tf.keras.losses.MSE(td_target, curr_q)
                    softmax_q = tf.nn.softmax(curr_q)
                    self.entropy = -tf.reduce_mean(
                        softmax_q * tf.log(softmax_q + 1e-5))
                    self.loss = total_loss = loss + HPs[
                        "EntropyBeta"] * self.entropy

                optimizer = tf.keras.optimizers.Adam(HPs["LearningRate"])
                self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope)

                self.gradients = optimizer.get_gradients(
                    total_loss, self.params)
                self.update_op = optimizer.apply_gradients(
                    zip(self.gradients, self.params))

                self.grads = [self.gradients]
                self.losses = [self.loss]
                self.update_ops = [self.update_op]

        self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))]
        self.loss_MA = [MovingAverage(400) for i in range(len(self.losses))]
        self.entropy_MA = MovingAverage(400)
        self.labels = ["Critic"]
        self.HPs = HPs
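
The current_Q block above picks q[i, a_i] out of the Q-value matrix by masking with a one-hot of the chosen action. An equivalent NumPy gather, shown only to clarify the indexing:

import numpy as np

q = np.array([[1.0, 2.0, 3.0],
              [4.0, 5.0, 6.0]])          # [batch, actionSize]
actions = np.array([2, 0])

one_hot = np.eye(q.shape[1])[actions]             # same mask the graph builds
curr_q_mask = (q * one_hot).sum(axis=-1)          # -> [3.0, 4.0]
curr_q_gather = np.take_along_axis(q, actions[:, None], axis=-1).squeeze(-1)
assert np.allclose(curr_q_mask, curr_q_gather)
print(curr_q_mask)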
コード例 #19
0
ファイル: DQN.py プロジェクト: zd6/RL
class DQN(Method):
    def __init__(self,
                 sharedModel,
                 sess,
                 stateShape,
                 actionSize,
                 scope,
                 HPs,
                 globalAC=None,
                 nTrajs=1):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and Critic network to be used for RL.
        """
        #Placeholders

        self.sess = sess
        self.scope = scope
        self.Model = sharedModel

        self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)]
        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):
                if len(stateShape) == 4:
                    self.states_ = tf.placeholder(shape=[None] +
                                                  stateShape[1:4],
                                                  dtype=tf.float32,
                                                  name='states')
                    self.next_states_ = tf.placeholder(shape=[None] +
                                                       stateShape[1:4],
                                                       dtype=tf.float32,
                                                       name='next_states')
                else:
                    self.states_ = tf.placeholder(shape=[None] + stateShape,
                                                  dtype=tf.float32,
                                                  name='states')
                    self.next_states_ = tf.placeholder(shape=[None] +
                                                       stateShape,
                                                       dtype=tf.float32,
                                                       name='next_states')
                self.actions_ = tf.placeholder(shape=[None],
                                               dtype=tf.int32,
                                               name='actions_hold')
                self.rewards_ = tf.placeholder(shape=[None],
                                               dtype=tf.float32,
                                               name='rewards_hold')
                self.done_ = tf.placeholder(shape=[None],
                                            dtype=tf.float32,
                                            name='done_hold')

                input = {"state": self.states_}
                out = self.Model(input)
                self.q = out["Q"]

                out2 = self.Model({"state": self.next_states_})
                q_next = out2["Q"]

                with tf.name_scope('current_Q'):
                    oh_action = tf.one_hot(
                        self.actions_, actionSize,
                        dtype=tf.float32)  # [?, num_agent, action_size]
                    curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action),
                                           axis=-1)  # [?, num_agent]

                with tf.name_scope('target_Q'):
                    max_next_q = tf.reduce_max(q_next, axis=-1)
                    td_target = self.rewards_ + HPs["Gamma"] * max_next_q * (
                        1. - self.done_)

                with tf.name_scope('td_error'):
                    loss = tf.keras.losses.MSE(td_target, curr_q)
                    softmax_q = tf.nn.softmax(curr_q)
                    self.entropy = -tf.reduce_mean(
                        softmax_q * tf.log(softmax_q + 1e-5))
                    self.loss = total_loss = loss + HPs[
                        "EntropyBeta"] * self.entropy

                optimizer = tf.keras.optimizers.Adam(HPs["LearningRate"])
                self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope)

                self.gradients = optimizer.get_gradients(
                    total_loss, self.params)
                self.update_op = optimizer.apply_gradients(
                    zip(self.gradients, self.params))

                self.grads = [self.gradients]
                self.losses = [self.loss]
                self.update_ops = [self.update_op]

        self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))]
        self.loss_MA = [MovingAverage(400) for i in range(len(self.losses))]
        self.entropy_MA = MovingAverage(400)
        self.labels = ["Critic"]
        self.HPs = HPs

    def GetAction(self, state, episode, step):
        """
        Contains the code to run the network based on an input.
        """
        if len(state.shape) == 3:
            state = state[np.newaxis, :]
        if len(state.shape) == 1:
            state = state[np.newaxis, :]
        q = self.sess.run(self.q, {self.states_: state})
        if "Exploration" in self.HPs:
            if self.HPs["Exploration"] == "EGreedy":
                prob = 0.1 + 0.9 * (np.exp(
                    -episode / self.HPs["ExplorationDecay"]))
                if random.uniform(0, 1) < prob:
                    actions = random.randint(0, q.shape[-1] - 1)  # uniform random action over the full action space
                else:
                    actions = np.argmax(q, axis=-1)
        else:
            actions = np.argmax(q, axis=-1)
        return actions, [
        ]  # return an action and extra data that needs to be fed to the buffer.

    def Update(self, HPs, episode=0, statistics=True):
        """
        The main update function for DQN. The function samples minibatches from the trajectory
        buffers and performs gradient updates on the Q-network.
        """
        #Process the data from the buffer
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):
            batches = len(
                self.buffer[traj][0]) // self.HPs["MinibatchSize"] + 1
            s = np.array_split(self.buffer[traj][0], batches)
            a_his = np.array_split(
                np.asarray(self.buffer[traj][1]).reshape(-1), batches)
            r = np.array_split(
                np.asarray(self.buffer[traj][2]).reshape(-1), batches)
            s_next = np.array_split(self.buffer[traj][3], batches)
            done = np.array_split(self.buffer[traj][4], batches)

            for epoch in range(self.HPs["Epochs"]):
                for i in range(batches):
                    #Create a feedDict from the buffer
                    feedDict = {
                        self.states_: np.squeeze(np.asarray(s[i])),
                        self.next_states_: np.squeeze(np.asarray(s_next[i])),
                        self.actions_: np.squeeze(np.asarray(a_his[i])),
                        self.rewards_: np.squeeze(np.asarray(r[i])),
                        self.done_: np.squeeze(np.asarray(done[i],
                                                          dtype=float))
                    }
                    out = self.sess.run(
                        self.update_ops + self.losses + self.grads,
                        feedDict)  # run update ops and fetch losses/grads for logging.
                    out = np.array_split(out, 3)
                    losses = out[1]
                    grads = out[2]

                    for i, loss in enumerate(losses):
                        self.loss_MA[i].append(loss)

                    for i, grads_i in enumerate(grads):
                        total_counter = 1
                        vanish_counter = 0
                        for grad in grads_i:
                            total_counter += np.prod(grad.shape)
                            vanish_counter += (np.absolute(grad) < 1e-8).sum()
                        self.grad_MA[i].append(vanish_counter / total_counter)

                    ent = self.sess.run(
                        self.entropy,
                        feedDict)  # evaluate entropy for logging.
                    entropy = np.average(np.asarray(ent))
                    self.entropy_MA.append(entropy)

        self.ClearTrajectory()

    def GetStatistics(self):
        dict = {}
        for i, label in enumerate(self.labels):
            dict["Training Results/Vanishing Gradient " +
                 label] = self.grad_MA[i]()
            dict["Training Results/Loss " + label] = self.loss_MA[i]()
            dict["Training Results/Entropy"] = self.entropy_MA()
        return dict

    def ProcessBuffer(self, HPs, traj):
        """
        Processes the buffer. Not required for this DQN implementation, since the TD target
        is computed directly inside the TensorFlow graph; kept for interface compatibility.
        """
        pass

    @property
    def getVars(self):
        return self.Model.getVars(self.scope)
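
GetAction above uses an episode-dependent epsilon-greedy schedule: the exploration probability decays from 1.0 toward 0.1 with the configured ExplorationDecay. A small sketch of that schedule, with illustrative names:

import numpy as np

def egreedy_epsilon(episode, decay, eps_min=0.1, eps_max=1.0):
    """Exploration probability: eps_min + (eps_max - eps_min) * exp(-episode / decay)."""
    return eps_min + (eps_max - eps_min) * np.exp(-episode / decay)

for ep in (0, 100, 1000):
    print(ep, round(egreedy_epsilon(ep, decay=200), 3))   # 1.0, 0.646, 0.106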
コード例 #20
0
ファイル: SF_On_Action_v2.py プロジェクト: zd6/RL
    def __init__(self,
                 sharedModel,
                 sess,
                 stateShape,
                 actionSize,
                 scope,
                 HPs,
                 nTrajs=1):
        """
        Off-policy Successor Representation using neural networks.
        Does not create an action for the environment; the action is supplied as a network input.

        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and Critic network to be used for RL.
        """
        #Placeholders
        self.actionSize = actionSize
        self.HPs = HPs
        self.sess = sess
        self.scope = scope
        self.Model = sharedModel
        self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
        self.a = tf.placeholder(tf.float32, [None, self.actionSize], 'A')
        self.s_next = tf.placeholder(tf.float32, [None] + stateShape, 'S_next')
        self.reward = tf.placeholder(tf.float32, [
            None,
        ], 'R')
        self.td_target = tf.placeholder(
            tf.float32, [None, self.Model.data["DefaultParams"]["SFSize"]],
            'TDtarget')
        self.advantage_ = tf.placeholder(shape=[None],
                                         dtype=tf.float32,
                                         name='adv_hold')
        self.old_log_logits_ = tf.placeholder(shape=[None, actionSize],
                                              dtype=tf.float32,
                                              name='old_logit_hold')

        input = {"state": self.s, "action": self.a}
        out = self.Model(input)
        self.value_pred = out["critic"]
        self.state_pred = out["prediction"]
        self.reward_pred = out["reward_pred"]
        self.phi = out["phi"]
        self.psi = out["psi"]
        self.a_prob = out["actor"]
        self.log_logits = out["log_logits"]

        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]

        self.params = self.Model.getVars()

        with tf.name_scope('loss'):
            sf_error = tf.subtract(self.td_target, self.psi, name='TD_error')
            sf_error = tf.square(sf_error)
            self.c_loss = tf.reduce_mean(sf_error, name="sf_loss")

            if HPs["Loss"] == "MSE":
                self.s_loss = tf.losses.mean_squared_error(
                    self.state_pred, self.s_next)
            elif HPs["Loss"] == "KL":
                self.s_loss = tf.keras.losses.KLD(self.state_pred, self.s_next)
            elif HPs["Loss"] == "M4E":
                self.s_loss = tf.reduce_mean(
                    (self.state_pred - self.s_next)**4)

            self.r_loss = tf.losses.mean_squared_error(
                self.reward, tf.squeeze(self.reward_pred))

            # Entropy
            def _log(val):
                return tf.log(tf.clip_by_value(val, 1e-10, 10.0))

            entropy = self.entropy = -tf.reduce_mean(
                self.a_prob * _log(self.a_prob), name='entropy')
            # Actor Loss
            # self.a is already a one-hot (float) action placeholder, so it is used directly
            # rather than passed through tf.one_hot (which expects integer indices).
            action_OH = self.a
            log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
            old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH, 1)

            # Clipped surrogate function
            ratio = tf.exp(log_prob - old_log_prob)
            surrogate = ratio * self.advantage_
            clipped_surrogate = tf.clip_by_value(
                ratio, 1 - HPs["eps"], 1 + HPs["eps"]) * self.advantage_
            surrogate_loss = tf.minimum(surrogate,
                                        clipped_surrogate,
                                        name='surrogate_loss')
            self.actor_loss = -tf.reduce_mean(surrogate_loss,
                                              name='actor_loss')

            self.loss = (self.actor_loss - entropy * HPs["EntropyBeta"] +
                         self.s_loss + HPs["CriticBeta"] * self.c_loss +
                         HPs["RewardBeta"] * self.r_loss)

        if HPs["Optimizer"] == "Adam":
            self.optimizer = tf.keras.optimizers.Adam(HPs["LR"])
        elif HPs["Optimizer"] == "RMS":
            self.optimizer = tf.keras.optimizers.RMSprop(HPs["LR"])
        elif HPs["Optimizer"] == "Adagrad":
            self.optimizer = tf.keras.optimizers.Adagrad(HPs["LR"])
        elif HPs["Optimizer"] == "Adadelta":
            self.optimizer = tf.keras.optimizers.Adadelta(HPs["LR"])
        elif HPs["Optimizer"] == "Adamax":
            self.optimizer = tf.keras.optimizers.Adamax(HPs["LR"])
        elif HPs["Optimizer"] == "Nadam":
            self.optimizer = tf.keras.optimizers.Nadam(HPs["LR"])
        elif HPs["Optimizer"] == "SGD":
            self.optimizer = tf.keras.optimizers.SGD(HPs["LR"])
        elif HPs["Optimizer"] == "SGD-Nesterov":
            self.optimizer = tf.keras.optimizers.SGD(HPs["LR"], nesterov=True)
        elif HPs["Optimizer"] == "Amsgrad":
            self.optimizer = tf.keras.optimizers.Nadam(HPs["LR"], amsgrad=True)
        else:
            print("Not selected a proper Optimizer")
            exit()

        with tf.name_scope('local_grad'):
            self.grads = self.optimizer.get_gradients(self.loss, self.params)

        with tf.name_scope('update'):
            self.update_op = self.optimizer.apply_gradients(
                zip(self.grads, self.params))

        self.update_ops = [self.update_op]
        self.grads = [self.grads]
        self.losses = [self.c_loss, self.s_loss, self.r_loss, self.actor_loss]

        self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))]
        self.loss_MA = [MovingAverage(400) for i in range(len(self.losses))]
        self.Gradlabels = ["Total"]
        self.Losslabels = ["Critic", "State", "Reward", "Actor"]

        self.clearBuffer = False
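
The actor term above is PPO's clipped surrogate: the probability ratio between the new and old policy is clipped to [1-eps, 1+eps] before being weighted by the advantage. A NumPy sketch of that objective on a small batch (names are illustrative):

import numpy as np

def clipped_surrogate_loss(log_prob, old_log_prob, advantage, eps):
    """-mean(min(r*A, clip(r, 1-eps, 1+eps)*A)) with r = exp(log_prob - old_log_prob)."""
    ratio = np.exp(log_prob - old_log_prob)
    unclipped = ratio * advantage
    clipped = np.clip(ratio, 1.0 - eps, 1.0 + eps) * advantage
    return -np.mean(np.minimum(unclipped, clipped))

log_prob = np.array([-0.5, -1.0])
old_log_prob = np.array([-0.7, -0.9])
advantage = np.array([1.0, -2.0])
print(clipped_surrogate_loss(log_prob, old_log_prob, advantage, eps=0.2))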
コード例 #21
0
    def __init__(self,
                 sharedModel,
                 sess,
                 stateShape,
                 actionSize,
                 scope,
                 HPs,
                 sharedBuffer,
                 globalNet=None,
                 nTrajs=1,
                 LSTM=False):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and Critic network to be used for RL.
        """
        #Placeholders
        self.LSTM = LSTM
        self.sess = sess
        self.scope = scope
        self.Model = sharedModel
        self.sharedBuffer = sharedBuffer
        self.HPs = HPs
        self.actionSize = actionSize

        #Creating the different values of beta and gamma
        def sigmoid(x):
            return 1 / (1 + np.exp(-x))

        self.betas = []
        for i in range(self.HPs["N"]):
            if i == 0:
                self.betas.append(0.0)
            elif i == self.HPs["N"] - 1:
                self.betas.append(self.HPs["betaMax"])
            else:
                self.betas.append(self.HPs["betaMax"] * sigmoid(
                    (2.0 * float(i) + 2.0 - self.HPs["N"]) /
                    (self.HPs["N"] - 2.0)))
        self.gammas = []
        for i in range(self.HPs["N"]):
            if i == 0:
                self.gammas.append(self.HPs["Gamma0"])
            elif i < 7:
                self.gammas.append(self.HPs["Gamma1"] +
                                   (self.HPs["Gamma0"] - self.HPs["Gamma1"]) *
                                   sigmoid(10.0 *
                                           (2.0 * float(i) - 6.0) / 6.0))
            elif i == 7:
                self.gammas.append(self.HPs["Gamma1"])
            else:
                self.gammas.append(1.0 - np.exp((
                    (self.HPs["N"] - 9.0) * np.log(1.0 - self.HPs["Gamma1"]) +
                    (float(i) - 8.0) * np.log(1 - self.HPs["Gamma2"])) /
                                                (self.HPs["N"] - 9.0)))
        self.gammas = np.asarray(self.gammas)

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):

                #Specifying placeholders for Tensorflow Networks
                if len(stateShape) == 4:
                    self.states_ = tf.placeholder(shape=[None] +
                                                  stateShape[1:4],
                                                  dtype=tf.float32,
                                                  name='states')
                    self.next_states_ = tf.placeholder(shape=[None] +
                                                       stateShape[1:4],
                                                       dtype=tf.float32,
                                                       name='next_states')
                else:
                    self.states_ = tf.placeholder(shape=[None] + stateShape,
                                                  dtype=tf.float32,
                                                  name='states')
                    self.next_states_ = tf.placeholder(shape=[None] +
                                                       stateShape,
                                                       dtype=tf.float32,
                                                       name='next_states')
                self.actions_ = tf.placeholder(shape=[None],
                                               dtype=tf.int32,
                                               name='actions_hold')
                self.done_ = tf.placeholder(shape=[None],
                                            dtype=tf.float32,
                                            name='done_hold')
                self.rewards_ = tf.placeholder(shape=[None],
                                               dtype=tf.float32,
                                               name='total_reward')
                self.bandit_one_hot = tf.placeholder(
                    shape=[None, self.HPs["N"]],
                    dtype=tf.int32,
                    name='beta_bandit')
                self.action_past = tf.placeholder(shape=[None],
                                                  dtype=tf.int32,
                                                  name='action_past')
                self.reward_i_past = tf.placeholder(shape=[None],
                                                    dtype=tf.float32,
                                                    name='reward_i_past')
                self.reward_e_past = tf.placeholder(shape=[None],
                                                    dtype=tf.float32,
                                                    name='reward_e_past')
                self.reward_i_current = tf.placeholder(shape=[None],
                                                       dtype=tf.float32,
                                                       name='reward_i_current')
                self.reward_e_current = tf.placeholder(shape=[None],
                                                       dtype=tf.float32,
                                                       name='reward_e_current')

                # Creating the IO for the entire network
                input = {
                    "state": self.states_,
                    "state_next": self.next_states_,
                    "bandit_one_hot": self.bandit_one_hot,
                    "action_past": self.action_past,
                    "reward_i_past": self.reward_i_past,
                    "reward_e_past": self.reward_e_past,
                }
                out = self.Model(input)
                self.q = out["Q"]
                self.a_pred = out["action_prediction"]
                self.latent = out["latent_space"]
                self.rnd_random = out["RND_random"]
                self.rnd_predictor = out["RND_predictor"]

                input2 = {
                    "state": self.next_states_,
                    "state_next":
                    self.next_states_,  #Used as a placeholder in network
                    "bandit_one_hot": self.bandit_one_hot,
                    "action_past": self.actions_,
                    "reward_i_past": self.reward_i_current,
                    "reward_e_past": self.reward_e_current,
                }
                out2 = self.Model(input2)
                q_next = out["Q"]
                with tf.name_scope('q_learning'):
                    #Current Q
                    oh_action = tf.one_hot(
                        self.actions_, actionSize,
                        dtype=tf.float32)  # [?, num_agent, action_size]
                    curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action),
                                           axis=-1)  # [?, num_agent]
                    #Next Q
                    max_next_q = tf.reduce_max(q_next, axis=-1)
                    #TD Error
                    td_target = self.rewards_ + tf.reduce_sum(
                        tf.cast(self.bandit_one_hot, tf.float32) *
                        self.gammas, axis=-1) * max_next_q * (1. - self.done_)
                    # td_target = self.rewards_  + HPs["Gamma"] * max_next_q * (1. - self.done_)
                    self.td_error = loss = tf.keras.losses.MSE(
                        td_target, curr_q)
                    softmax_q = tf.nn.softmax(curr_q)
                    self.entropy = -tf.reduce_mean(
                        softmax_q * tf.log(softmax_q + 1e-5))
                    self.loss = loss + HPs["EntropyBeta"] * self.entropy

                self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope)
                self.int_params = self.Model.GetVariables("Intrinsic")
                self.critic_params = self.Model.GetVariables("Critic")

                if globalNet is None:  #Creating the Training instance of the network.
                    with tf.name_scope('embedding_network'):
                        oh_action = tf.one_hot(
                            self.actions_, actionSize,
                            dtype=tf.float32)  # [?, num_agent, action_size]
                        self.embedding_loss = tf.reduce_mean(
                            tf.keras.losses.MSE(oh_action, self.a_pred))

                    with tf.name_scope('life_long_curiosity'):
                        self.llc_loss = tf.reduce_mean(
                            tf.keras.losses.MSE(self.rnd_random,
                                                self.rnd_predictor))

                    loss_critic = tf.reduce_mean(self.loss)
                    optimizer = tf.keras.optimizers.Adam(HPs["LearningRate"])
                    self.gradients = optimizer.get_gradients(
                        loss_critic, self.critic_params)
                    self.update_op = optimizer.apply_gradients(
                        zip(self.gradients, self.critic_params))
                    #
                    loss_intrinsic = tf.reduce_mean(self.llc_loss +
                                                    self.embedding_loss)
                    optimizer2 = tf.keras.optimizers.Adam(
                        HPs["LearningRateEmbedding"])
                    self.embedding_gradients = optimizer2.get_gradients(
                        loss_intrinsic, self.int_params)
                    self.embedding_update = optimizer2.apply_gradients(
                        zip(self.embedding_gradients, self.int_params))

                    total_counter = 1
                    vanish_counter = 0
                    for gradient in self.gradients:
                        total_counter += np.prod(gradient.shape)
                        stuff = tf.reduce_sum(
                            tf.cast(
                                tf.math.less_equal(tf.math.abs(gradient),
                                                   tf.constant(1e-8)),
                                tf.int32))
                        vanish_counter += stuff
                    self.vanishing_gradient = vanish_counter / total_counter

                    # self.vanishing_gradient = 0

                    self.update_ops = [self.update_op, self.embedding_update]
                    # self.update_ops=[self.update_op]
                    self.logging_ops = [
                        loss, self.embedding_loss, self.llc_loss, self.entropy,
                        self.vanishing_gradient
                    ]
                    self.logging_MA = [
                        MovingAverage(400)
                        for i in range(len(self.logging_ops))
                    ]
                    self.labels = [
                        "Total Loss", "Embedding Loss",
                        "Life Long Curiosity Loss", "Entropy",
                        "Vanishing Gradient"
                    ]

                else:  #Creating a Actor Instance for the Network.
                    #Creating the Episodic Memory, which compares samples
                    self.episodicMemory = EpisodicMemory()
                    #Creating Local Buffer to store data until it is ready to push to sample buffer
                    self.buffer = [Trajectory(depth=10) for _ in range(nTrajs)]
                    #Creating a pull operation to synch network parameters
                    with tf.name_scope('sync'):
                        self.pull_params_op = [
                            l_p.assign(g_p)
                            for l_p, g_p in zip(self.params, globalNet.params)
                        ]
                        self.pull_ops = [self.pull_params_op]

                    self.alpha = MovingAverage(2000)
                    self.K = MovingAverage(2000)
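
The beta schedule above assigns each of the N parallel policies a different exploration weight: 0 for the first, betaMax for the last, and a sigmoid ramp in between, in the style of the Never Give Up agent. A standalone sketch of just that schedule, mirroring the loop above for inspection:

import numpy as np

def beta_schedule(N, beta_max):
    """Per-policy exploration weights: 0, sigmoid ramp, ..., beta_max."""
    sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
    betas = []
    for i in range(N):
        if i == 0:
            betas.append(0.0)
        elif i == N - 1:
            betas.append(beta_max)
        else:
            betas.append(beta_max * sigmoid((2.0 * i + 2.0 - N) / (N - 2.0)))
    return betas

print([round(b, 3) for b in beta_schedule(N=8, beta_max=0.3)])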
コード例 #22
0
class NGU(Method):
    def __init__(self,
                 sharedModel,
                 sess,
                 stateShape,
                 actionSize,
                 scope,
                 HPs,
                 sharedBuffer,
                 globalNet=None,
                 nTrajs=1,
                 LSTM=False):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and Critic network to be used for RL.
        """
        #Placeholders
        self.LSTM = LSTM
        self.sess = sess
        self.scope = scope
        self.Model = sharedModel
        self.sharedBuffer = sharedBuffer
        self.HPs = HPs
        self.actionSize = actionSize

        #Creating the different values of beta and gamma
        def sigmoid(x):
            return 1 / (1 + np.exp(-x))

        self.betas = []
        for i in range(self.HPs["N"]):
            if i == 0:
                self.betas.append(0.0)
            elif i == self.HPs["N"] - 1:
                self.betas.append(self.HPs["betaMax"])
            else:
                self.betas.append(self.HPs["betaMax"] * sigmoid(
                    (2.0 * float(i) + 2.0 - self.HPs["N"]) /
                    (self.HPs["N"] - 2.0)))
        self.gammas = []
        for i in range(self.HPs["N"]):
            if i == 0:
                self.gammas.append(self.HPs["Gamma0"])
            elif i < 7:
                self.gammas.append(self.HPs["Gamma1"] +
                                   (self.HPs["Gamma0"] - self.HPs["Gamma1"]) *
                                   sigmoid(10.0 *
                                           (2.0 * float(i) - 6.0) / 6.0))
            elif i == 7:
                self.gammas.append(self.HPs["Gamma1"])
            else:
                self.gammas.append(1.0 - np.exp((
                    (self.HPs["N"] - 9.0) * np.log(1.0 - self.HPs["Gamma1"]) +
                    (float(i) - 8.0) * np.log(1 - self.HPs["Gamma2"])) /
                                                (self.HPs["N"] - 9.0)))
        self.gammas = np.asarray(self.gammas)

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):

                #Specifying placeholders for Tensorflow Networks
                if len(stateShape) == 4:
                    self.states_ = tf.placeholder(shape=[None] +
                                                  stateShape[1:4],
                                                  dtype=tf.float32,
                                                  name='states')
                    self.next_states_ = tf.placeholder(shape=[None] +
                                                       stateShape[1:4],
                                                       dtype=tf.float32,
                                                       name='next_states')
                else:
                    self.states_ = tf.placeholder(shape=[None] + stateShape,
                                                  dtype=tf.float32,
                                                  name='states')
                    self.next_states_ = tf.placeholder(shape=[None] +
                                                       stateShape,
                                                       dtype=tf.float32,
                                                       name='next_states')
                self.actions_ = tf.placeholder(shape=[None],
                                               dtype=tf.int32,
                                               name='actions_hold')
                self.done_ = tf.placeholder(shape=[None],
                                            dtype=tf.float32,
                                            name='done_hold')
                self.rewards_ = tf.placeholder(shape=[None],
                                               dtype=tf.float32,
                                               name='total_reward')
                self.bandit_one_hot = tf.placeholder(
                    shape=[None, self.HPs["N"]],
                    dtype=tf.int32,
                    name='beta_bandit')
                self.action_past = tf.placeholder(shape=[None],
                                                  dtype=tf.int32,
                                                  name='action_past')
                self.reward_i_past = tf.placeholder(shape=[None],
                                                    dtype=tf.float32,
                                                    name='reward_i_past')
                self.reward_e_past = tf.placeholder(shape=[None],
                                                    dtype=tf.float32,
                                                    name='reward_e_past')
                self.reward_i_current = tf.placeholder(shape=[None],
                                                       dtype=tf.float32,
                                                       name='reward_i_current')
                self.reward_e_current = tf.placeholder(shape=[None],
                                                       dtype=tf.float32,
                                                       name='reward_e_current')

                # Creating the IO for the entire network
                input = {
                    "state": self.states_,
                    "state_next": self.next_states_,
                    "bandit_one_hot": self.bandit_one_hot,
                    "action_past": self.action_past,
                    "reward_i_past": self.reward_i_past,
                    "reward_e_past": self.reward_e_past,
                }
                out = self.Model(input)
                self.q = out["Q"]
                self.a_pred = out["action_prediction"]
                self.latent = out["latent_space"]
                self.rnd_random = out["RND_random"]
                self.rnd_predictor = out["RND_predictor"]

                input2 = {
                    "state": self.next_states_,
                    "state_next":
                    self.next_states_,  #Used as a placeholder in network
                    "bandit_one_hot": self.bandit_one_hot,
                    "action_past": self.actions_,
                    "reward_i_past": self.reward_i_current,
                    "reward_e_past": self.reward_e_current,
                }
                out2 = self.Model(input2)
                q_next = out2["Q"]
                with tf.name_scope('q_learning'):
                    #Current Q
                    oh_action = tf.one_hot(
                        self.actions_, actionSize,
                        dtype=tf.float32)  # [?, num_agent, action_size]
                    curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action),
                                           axis=-1)  # [?, num_agent]
                    #Next Q
                    max_next_q = tf.reduce_max(q_next, axis=-1)
                    #TD Error
                    td_target = self.rewards_ + tf.reduce_sum(
                        tf.cast(self.bandit_one_hot, tf.float32) *
                        self.gammas, axis=-1) * max_next_q * (1. - self.done_)
                    # td_target = self.rewards_  + HPs["Gamma"] * max_next_q * (1. - self.done_)
                    self.td_error = loss = tf.keras.losses.MSE(
                        td_target, curr_q)
                    softmax_q = tf.nn.softmax(curr_q)
                    self.entropy = -tf.reduce_mean(
                        softmax_q * tf.log(softmax_q + 1e-5))
                    self.loss = loss + HPs["EntropyBeta"] * self.entropy

                self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope)
                self.int_params = self.Model.GetVariables("Intrinsic")
                self.critic_params = self.Model.GetVariables("Critic")

                if globalNet is None:  #Creating the Training instance of the network.
                    with tf.name_scope('embedding_network'):
                        oh_action = tf.one_hot(
                            self.actions_, actionSize,
                            dtype=tf.float32)  # [?, num_agent, action_size]
                        self.embedding_loss = tf.reduce_mean(
                            tf.keras.losses.MSE(oh_action, self.a_pred))

                    with tf.name_scope('life_long_curiosity'):
                        self.llc_loss = tf.reduce_mean(
                            tf.keras.losses.MSE(self.rnd_random,
                                                self.rnd_predictor))

                    loss_critic = tf.reduce_mean(self.loss)
                    optimizer = tf.keras.optimizers.Adam(HPs["LearningRate"])
                    self.gradients = optimizer.get_gradients(
                        loss_critic, self.critic_params)
                    self.update_op = optimizer.apply_gradients(
                        zip(self.gradients, self.critic_params))
                    #
                    loss_intrinsic = tf.reduce_mean(self.llc_loss +
                                                    self.embedding_loss)
                    optimizer2 = tf.keras.optimizers.Adam(
                        HPs["LearningRateEmbedding"])
                    self.embedding_gradients = optimizer2.get_gradients(
                        loss_intrinsic, self.int_params)
                    self.embedding_update = optimizer2.apply_gradients(
                        zip(self.embedding_gradients, self.int_params))

                    total_counter = 1
                    vanish_counter = 0
                    for gradient in self.gradients:
                        total_counter += np.prod(gradient.shape)
                        near_zero_count = tf.reduce_sum(
                            tf.cast(
                                tf.math.less_equal(tf.math.abs(gradient),
                                                   tf.constant(1e-8)),
                                tf.int32))
                        vanish_counter += near_zero_count
                    self.vanishing_gradient = vanish_counter / total_counter

                    # self.vanishing_gradient = 0

                    self.update_ops = [self.update_op, self.embedding_update]
                    # self.update_ops=[self.update_op]
                    self.logging_ops = [
                        loss, self.embedding_loss, self.llc_loss, self.entropy,
                        self.vanishing_gradient
                    ]
                    self.logging_MA = [
                        MovingAverage(400)
                        for i in range(len(self.logging_ops))
                    ]
                    self.labels = [
                        "Total Loss", "Embedding Loss",
                        "Life Long Curiosity Loss", "Entropy",
                        "Vanishing Gradient"
                    ]

                else:  #Creating a Actor Instance for the Network.
                    #Creating the Episodic Memory, which compares new samples against states seen earlier in the episode
                    self.episodicMemory = EpisodicMemory()
                    #Creating Local Buffer to store data until it is ready to push to sample buffer
                    self.buffer = [Trajectory(depth=10) for _ in range(nTrajs)]
                    #Creating a pull operation to sync network parameters
                    with tf.name_scope('sync'):
                        self.pull_params_op = [
                            l_p.assign(g_p)
                            for l_p, g_p in zip(self.params, globalNet.params)
                        ]
                        self.pull_ops = [self.pull_params_op]

                    self.alpha = MovingAverage(2000)
                    self.K = MovingAverage(2000)

    def GetAction(self,
                  state,
                  a_past,
                  r_i_past,
                  r_e_past,
                  episode=None,
                  step=0):
        """
        Contains the code to run the network based on an input.
        """
        #Fixing the state shape if something is wrong
        if len(state.shape) == 3:
            state = state[np.newaxis, :]
        if len(state.shape) == 1:
            state = state[np.newaxis, :]

        #Selecting a new beta at the beginning of the episode
        #Also bootstrapping rewards/actions for the first step
        if step == 0:
            currBeta = random.randint(0, self.HPs["N"] - 1)
            oh = np.zeros(self.HPs["N"])
            oh[currBeta] = 1
            self.betaSelect = oh
            self.currBeta = self.betas[currBeta]

        feedDict = {
            self.states_: state,
            self.bandit_one_hot: self.betaSelect[np.newaxis, :],
            self.action_past: np.asarray(a_past),
            self.reward_i_past: np.asarray(r_i_past),
            self.reward_e_past: np.asarray(r_e_past)
        }
        q = self.sess.run(self.q, feedDict)

        if "Exploration" in self.HPs:
            if self.HPs["Exploration"] == "EGreedy":
                prob = self.HPs["ExploreSS"] + (1 - self.HPs["ExploreSS"]) * (
                    np.exp(-episode / self.HPs["ExplorationDecay"]))
                if random.uniform(0, 1) < prob:
                    actions = np.array(
                        [random.randint(0, self.actionSize - 1)])
                else:
                    actions = np.argmax(q, axis=-1)
            else:
                actions = np.argmax(q, axis=-1)
        else:
            actions = np.argmax(q, axis=-1)

        return actions, [
            self.currBeta, self.betaSelect
        ]  # return an int and the extra data that needs to be fed to the buffer.

    def Encode(self, state):
        if len(state.shape) == 3:
            state = state[np.newaxis, :]
        if len(state.shape) == 1:
            state = state[np.newaxis, :]
        return self.sess.run(self.latent, {self.states_: state})

    def RNDPredictionError(self, state):
        if len(state.shape) == 3:
            state = state[np.newaxis, :]
        if len(state.shape) == 1:
            state = state[np.newaxis, :]
        random, predictor = self.sess.run(
            [self.rnd_random, self.rnd_predictor], {self.states_: state})
        return np.linalg.norm(random - predictor)

    def GetIntrinsicReward(self, state_prev, state, episode=None, step=0):
        #Clearing the episodic buffer
        if step == 0:
            self.episodicMemory.Clear()
            self.episodicMemory.Add(self.Encode(state_prev))

        #Adding Sample to the buffer
        encodedState = self.Encode(state)
        stateError = self.RNDPredictionError(state)
        self.sharedBuffer.AddError(stateError)

        #####Calculating the episodic reward factor
        #-finding k nearest neighbors in buffer and distance to them
        K = self.episodicMemory.NearestNeighborsDist(
            encodedState, num=self.HPs["NearestNeighbors"])
        r_episodic = 1.0 / np.sqrt(K + 0.001)

        #Calculating alpha
        stateError_Average, stateError_std = self.sharedBuffer.GetMuSigma()
        alpha = 1.0 + (stateError - stateError_Average) / stateError_std
        self.alpha.append(alpha)
        self.K.append(K)

        #Calculating the intrinsic reward
        r_i = r_episodic * min(max(1.0, alpha), self.HPs["L"])

        #Adding the sample to the buffer after the nearest neighbors have been calculated.
        self.episodicMemory.Add(encodedState)
        return r_i

    def Update(self, HPs, episode=0, statistics=True):
        """
        """
        #Process the data from the buffer
        samples, num = self.sharedBuffer.Sample()
        if num < self.HPs["BatchSize"]:
            return

        priorities = []
        for traj in samples:
            if len(traj[0]) <= 5:
                continue

            for epoch in range(self.HPs["Epochs"]):
                #Create a feedDict from the buffer
                feedDict = {
                    self.states_: np.squeeze(np.asarray(traj[0])),
                    self.actions_: np.squeeze(np.asarray(traj[1])),
                    self.rewards_: np.squeeze(np.asarray(traj[2])),
                    self.next_states_: np.squeeze(np.asarray(traj[3])),
                    self.done_: np.squeeze(np.asarray(traj[4], dtype=float)),
                    self.action_past: np.squeeze(np.asarray(traj[5])),
                    self.reward_i_past: np.squeeze(np.asarray(traj[6])),
                    self.reward_e_past: np.squeeze(np.asarray(traj[7])),
                    self.bandit_one_hot: np.squeeze(np.asarray(traj[8])),
                }
                out = self.sess.run(
                    self.update_ops + self.logging_ops,
                    feedDict)  # Runs the update ops and collects the logging values.
                logging = out[len(self.update_ops):]

                for i, log in enumerate(logging):
                    self.logging_MA[i].append(log)

    def GetStatistics(self):
        stats = {}
        for i, label in enumerate(self.labels):
            stats["Training Results/" + label] = self.logging_MA[i]()
        return stats

    def GetWorkerStatistics(self):
        stats = {}
        stats["Training Results/Alpha"] = self.alpha()
        stats["Training Results/K"] = self.K()
        return stats

    def PushToBuffer(self):
        self.sess.run(self.pull_ops)
        #Packaging samples in a manner that requires modification on the learner end.

        #Estimating the TD error to assign a priority to the data.

        for traj in range(len(self.buffer)):
            # g,s_n=MultiStepDiscountProcessing(np.asarray(self.buffer[traj][2]),self.buffer[traj][3],np.sum(self.buffer[traj][9][0]*self.gammas),self.HPs["MultiStep"])
            g, s_n = MultiStepDiscountProcessing(
                np.asarray(self.buffer[traj][2]), self.buffer[traj][3],
                np.sum(self.buffer[traj][9][0] * self.gammas),
                self.HPs["MultiStep"])

            batches = len(
                self.buffer[traj][0]) // self.HPs["MinibatchSize"] + 1
            s = np.array_split(self.buffer[traj][0], batches)
            a_his = np.array_split(self.buffer[traj][1], batches)
            r = np.array_split(np.asarray(g), batches)
            s_next = np.array_split(s_n, batches)
            done = np.array_split(self.buffer[traj][4], batches)

            action_past = np.array_split(self.buffer[traj][5], batches)
            reward_i_past = np.array_split(self.buffer[traj][6], batches)
            reward_e_past = np.array_split(self.buffer[traj][7], batches)
            bandit_one_hot = np.array_split(self.buffer[traj][9], batches)
            for i in range(batches):
                feedDict = {
                    self.states_: np.squeeze(np.asarray(s[i])),
                    self.next_states_: np.squeeze(np.asarray(s_next[i])),
                    self.actions_: np.squeeze(np.asarray(a_his[i])),
                    self.rewards_: np.squeeze(np.asarray(r[i])),
                    self.done_: np.squeeze(np.asarray(done[i], dtype=float)),
                    self.bandit_one_hot:
                    np.squeeze(np.asarray(bandit_one_hot[i])),
                    self.action_past: np.squeeze(np.asarray(action_past[i])),
                    self.reward_i_past:
                    np.squeeze(np.asarray(reward_i_past[i])),
                    self.reward_e_past:
                    np.squeeze(np.asarray(reward_e_past[i])),
                }
                priority = self.sess.run(self.td_error, feedDict)

                self.sharedBuffer.AddTrajectory([
                    s[i], a_his[i], r[i], s_next[i], done[i], action_past[i],
                    reward_i_past[i], reward_e_past[i], bandit_one_hot[i]
                ], priority)
        self.ClearTrajectory()

    def PrioritizeBuffer(self):
        #Updating the network weights before calculating new priorities
        self.sess.run(self.pull_ops)
        #Getting the data that needs to be assigned a new priority.
        trajs = self.sharedBuffer.GetReprioritySamples()
        priority = []
        for traj in trajs:

            feedDict = {
                self.states_: np.squeeze(np.asarray(traj[0])),
                self.actions_: np.squeeze(np.asarray(traj[1])),
                self.rewards_: np.squeeze(np.asarray(traj[2])),
                self.next_states_: np.squeeze(np.asarray(traj[3])),
                self.done_: np.squeeze(np.asarray(traj[4], dtype=float)),
                self.action_past: np.squeeze(np.asarray(traj[5])),
                self.reward_i_past: np.squeeze(np.asarray(traj[6])),
                self.reward_e_past: np.squeeze(np.asarray(traj[7])),
                self.bandit_one_hot: np.squeeze(np.asarray(traj[8])),
            }
            priority.append(self.sess.run(self.td_error, feedDict))
        #Pushing the updated priorities back to the buffer
        self.sharedBuffer.UpdatePriorities(priority)

        #Pruning the buffer down to the highest-priority samples
        self.sharedBuffer.PrioritizeandPruneSamples(2048)

    @property
    def getVars(self):
        return self.Model.getVars(self.scope)
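
The intrinsic reward above combines an episodic novelty bonus (inverse nearest-neighbor distance in the learned embedding space) with a life-long curiosity factor derived from the RND prediction error. The following is a minimal NumPy sketch of that combination; the distance metric, the normalization constants, and the helper names are assumptions for illustration, and the EpisodicMemory/sharedBuffer internals are not reproduced.

import numpy as np

def intrinsic_reward(encoded_state, episodic_memory, rnd_error, rnd_mu, rnd_sigma,
                     num_neighbors=10, L=5.0, eps=1e-3):
    """Illustrative combination of an episodic novelty term with a life-long (RND) factor."""
    # Episodic term: mean squared distance to the k nearest encoded states in memory.
    dists = np.linalg.norm(np.asarray(episodic_memory) - encoded_state, axis=1)
    K = np.mean(np.sort(dists)[:num_neighbors] ** 2)
    r_episodic = 1.0 / np.sqrt(K + eps)
    # Life-long term: how surprising the state is to the RND predictor, normalized.
    alpha = 1.0 + (rnd_error - rnd_mu) / (rnd_sigma + 1e-8)
    # Clip the modulation factor to [1, L], mirroring GetIntrinsicReward above.
    return r_episodic * min(max(1.0, alpha), L)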
コード例 #23
0
ファイル: PPO_Hierarchy.py プロジェクト: vanstrn/RL_public
class PPO_Hierarchy(Method):
    def __init__(self,
                 sess,
                 settings,
                 netConfigOverride,
                 stateShape,
                 actionSize,
                 nTrajs=1,
                 **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        sess : Tensorflow Session
            Initialized Tensorflow session
        settings : dict
            Run settings, including the network configuration and the hyperparameters ("HPs") used for training.
        netConfigOverride : dict
            Dictionary of values that override the default network configuration.
        stateShape : list
            List of integers of the inputs shape size. Ex [39,39,6]
        actionSize : int
            Output size of the network.
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional, via kwargs)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess = sess
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],
                                    netConfigOverride=netConfigOverride,
                                    actionSize=actionSize)
        self.HPs = settings["HPs"]
        self.subReward = False
        self.UpdateSubpolicies = True
        self.nTrajs = nTrajs
        self.method = self.HPs["Method"]

        #Creating two buffers to separate information between the different levels of the network.
        if self.subReward:
            self.buffer = [Trajectory(depth=12) for _ in range(nTrajs)]
            #[s0,a,r,r_sub,s1,done]+[HL_actions, HL_log_logits, HL_v, flag, critics, logits]
        else:
            self.buffer = [Trajectory(depth=11) for _ in range(nTrajs)]
            #[s0,a,r,s1,done]+[HL_action, HL_log_logits, HL_v, flag, critics, logits]

        #scope is not part of the signature; assumed to be supplied via kwargs (hypothetical default).
        scope = kwargs.get("scope", "PPO_Hierarchy")
        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):
                #Generic placeholders
                self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [
                    None,
                ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'Vtarget')
                self.advantage_ = tf.placeholder(shape=[None],
                                                 dtype=tf.float32,
                                                 name='adv_hold')

                #Initializing Network I/O
                inputs = {"state": self.s}
                out = self.Model(inputs)
                self.a_prob = out["metaActor"]
                self.v = out["metaCritic"]
                self.log_logits = out["metaLogLogits"]

                self.sub_a_prob = out["subActor"]
                self.sub_log_logits = out["subLogLogits"]
                self.sub_v = out["subCritic"]

                self.nPolicies = len(self.sub_a_prob)

                #Placeholder for the Hierarchical Policy
                self.old_log_logits_ = tf.placeholder(
                    shape=[None, self.nPolicies],
                    dtype=tf.float32,
                    name='old_logit_hold')
                #Placeholder for the Sub-Policies
                self.old_log_logits_sub_ = tf.placeholder(
                    shape=[None, actionSize],
                    dtype=tf.float32,
                    name='old_logit_sub_hold')

                # Creating the Loss and update calls for the Hierarchical policy
                self.hierarchicalLoss = self.CreateLossPPO(
                    self.a_prob, self.td_target_, self.v, self.a_his,
                    self.log_logits, self.old_log_logits_, self.advantage_,
                    self.nPolicies)
                variables = self.Model.getHierarchyVariables()
                self.hierarchyUpdater = self.CreateUpdater(
                    self.hierarchicalLoss, variables)

                # Creating the Losses updaters for the Sub-policies.
                self.subpolicyLoss = []
                self.subpolicyUpdater = []
                for i in range(self.nPolicies):
                    loss = self.CreateLossPPO(self.sub_a_prob[i],
                                              self.td_target_, self.sub_v[i],
                                              self.a_his,
                                              self.sub_log_logits[i],
                                              self.old_log_logits_sub_,
                                              self.advantage_, self.actionSize)
                    self.subpolicyLoss.append(loss)
                    variables = self.Model.getSubpolicyVariables(i)
                    self.subpolicyUpdater.append(
                        self.CreateUpdater(loss, variables))

            #Creating Variables for the purpose of logging.
            self.SubpolicyDistribution = MovingAverage(1000)

    def CreateUpdater(self, loss, variables):
        optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
        gradients = optimizer.get_gradients(loss, variables)
        return optimizer.apply_gradients(zip(gradients, variables))

    def CreateLossPPO(self, a_prob, td_target_, v, a_his, log_logits,
                      old_log_logits_, advantage_, actionSize):
        # Entropy
        entropy = -tf.reduce_mean(a_prob * _log(a_prob), name='entropy')

        # Critic Loss
        td_error = td_target_ - v
        critic_loss = tf.reduce_mean(tf.square(td_error), name='critic_loss')

        # Actor Loss
        action_OH = tf.one_hot(a_his, actionSize, dtype=tf.float32)
        log_prob = tf.reduce_sum(log_logits * action_OH, 1)
        old_log_prob = tf.reduce_sum(old_log_logits_ * action_OH, 1)

        # Clipped surrogate function
        ratio = tf.exp(log_prob - old_log_prob)
        surrogate = ratio * advantage_
        clipped_surrogate = tf.clip_by_value(ratio, 1 - self.HPs["eps"],
                                             1 + self.HPs["eps"]) * advantage_
        surrogate_loss = tf.minimum(surrogate,
                                    clipped_surrogate,
                                    name='surrogate_loss')
        actor_loss = -tf.reduce_mean(surrogate_loss, name='actor_loss')

        actor_loss = actor_loss - entropy * self.HPs["EntropyBeta"]
        loss = actor_loss + critic_loss * self.HPs["CriticBeta"]
        return loss

    def InitiateEpisode(self):
        if self.method == "Greedy":
            pass
        elif self.method == "Fixed Step":
            self.counter = 1
            self.nStep = 4

        elif self.method == "Constant":
            pass

        elif self.method == "Confidence":
            self.pastActions = [None] * self.nTrajs

        elif self.method == "Probabilistic Confidence":
            pass

        else:
            pass

    def GetAction(self, state, step, episode=0):
        """
        Method to run data through hierarchical network

        First run the state through the meta network to select subpolicy to use.
        Second run the state through the proper Subpolicy

        ToDo: Check whether it is faster to run the entire network and select the appropriate subpolicy afterwards, or to run only the required part.

        Parameters
        ----------
        state : np.array
            Data with the shape of [N, self.stateShape] where N is the number of samples

        Returns
        -------
        actions : list[int]
            List of actions based on NN output.
        extraData : list
            List of data that is passed to the execution code to be bundled with state data.
        """
        #Initializing the option-selection method at the start of an episode.
        if step == 0:
            self.InitiateEpisode()

        # Run the Meta and Sub-policy Networks
        targets = [self.a_prob, self.log_logits, self.v
                   ] + self.sub_a_prob + self.sub_log_logits + self.sub_v
        res = self.sess.run(targets, {self.s: state})
        LL_probs = res[0]
        HL_log_logits = res[1]
        HL_v = res[2]
        sub_probs = res[3:3 + self.nPolicies]
        sub_log_logits = res[3 + self.nPolicies:3 + 2 * self.nPolicies]
        sub_v = res[3 + 2 * self.nPolicies:]

        if self.method == "Greedy":
            HL_actions = np.array([
                np.random.choice(LL_probs.shape[1], p=prob / sum(prob))
                for prob in LL_probs
            ])
            flag = [True] * state.shape[0]
        elif self.method == "Fixed Step":
            if self.counter == self.nStep:
                #Resetting the step counter and selecting a new option
                self.counter = 1
            if self.counter == 1:
                HL_actions = np.array([
                    np.random.choice(LL_probs.shape[1], p=prob / sum(prob))
                    for prob in LL_probs
                ])
                self.traj_action = HL_actions
                flag = [True] * state.shape[0]
            else:
                HL_actions = self.traj_action
                flag = [False] * state.shape[0]
            self.counter += 1

        elif self.method == "Confidence":
            flag = []
            HL_actions = []
            confids = -np.mean(LL_probs * np.log(LL_probs), axis=1)
            for i, confid in enumerate(confids):
                if confid < 0.1 or step == 0:
                    action = np.random.choice(LL_probs.shape[1],
                                              p=LL_probs[i] / sum(LL_probs[i]))
                    HL_actions.append(action)
                    self.pastActions[i] = action
                    flag.append(True)
                else:
                    #Reusing the previous high-level action, so no new decision is flagged.
                    HL_actions.append(self.pastActions[i])
                    flag.append(False)
            self.traj_action = HL_actions

        elif self.method == "Probabilistic Confidence":
            pass
        else:
            pass

        # Run the Subpolicy Network
        actions = np.array([
            np.random.choice(self.actionSize,
                             p=sub_probs[mod][idx] / sum(sub_probs[mod][idx]))
            for idx, mod in enumerate(HL_actions)
        ])
        critics = [sub_v[mod][idx] for idx, mod in enumerate(HL_actions)]
        logits = [
            sub_log_logits[mod][idx] for idx, mod in enumerate(HL_actions)
        ]

        return actions, [
            HL_actions, HL_log_logits, HL_v, flag, critics, logits
        ]

    def Update(self, HPs):
        """
        Process the buffer and backpropagates the loses through the NN.

        Parameters
        ----------
        HPs : dict
            Hyperparameters for training.

        Returns
        -------
        N/A
        """
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):

            td_target, advantage, td_target_hier, advantage_hier, actions_hier, ll_hier, s_hier = self.ProcessBuffer(
                HPs, traj)
            # Updating the Hierarchical Controller
            for epoch in range(self.HPs["Epochs"]):
                for batch in MultiBatchDivider([
                        s_hier, actions_hier, td_target_hier, advantage_hier,
                        ll_hier
                ], self.HPs["MinibatchSize"]):

                    feed_dict = {
                        self.s: np.asarray(batch[0]).squeeze(),
                        self.a_his: np.asarray(batch[1]).squeeze(),
                        self.td_target_: np.asarray(batch[2]).squeeze(),
                        self.advantage_: np.reshape(batch[3], [-1]),
                        self.old_log_logits_: np.asarray(batch[4]).squeeze()
                    }
                    self.sess.run(self.hierarchyUpdater, feed_dict)

            if self.UpdateSubpolicies:
                #Collecting the data into different sub-Policies
                if self.subReward:
                    tmp, l1, l2, l3, l4, l5 = (
                        list(t) for t in zip(
                            *sorted(zip(self.buffer[traj][6], self.buffer[traj]
                                        [0], self.buffer[traj][1], td_target,
                                        advantage, self.buffer[traj][10]),
                                    key=lambda x: x[0]))
                    )  #Sorting by the value in the actions_hier
                    #dividing at the splits
                    for subpolicyNum, data in SubpolicyIterator(
                            tmp, [l1, l2, l3, l4, l5]):
                        #Updating each of the sub-policies.
                        for epoch in range(self.HPs["Epochs"]):
                            for batch in MultiBatchDivider(
                                    data, self.HPs["MinibatchSize"]):

                                feed_dict = {
                                    self.s:
                                    np.asarray(batch[0]).squeeze(),
                                    self.a_his:
                                    np.asarray(batch[1]).squeeze(),
                                    self.td_target_:
                                    np.asarray(batch[2]).squeeze(),
                                    self.advantage_:
                                    np.reshape(batch[3], [-1]),
                                    self.old_log_logits_sub_:
                                    np.asarray(batch[4]).squeeze()
                                }
                                self.sess.run(
                                    self.subpolicyUpdater[subpolicyNum],
                                    feed_dict)
                    self.SubpolicyDistribution.extend(
                        np.asarray(self.buffer[traj][6]))
                else:
                    tmp, l1, l2, l3, l4, l5 = (
                        list(t) for t in zip(
                            *sorted(zip(self.buffer[traj][5], self.buffer[traj]
                                        [0], self.buffer[traj][1], td_target,
                                        advantage, self.buffer[traj][10]),
                                    key=lambda x: x[0]))
                    )  #Sorting by the value in the actions_hier
                    #dividing at the splits
                    for subpolicyNum, data in SubpolicyIterator(
                            tmp, [l1, l2, l3, l4, l5]):
                        #Updating each of the sub-policies.
                        for epoch in range(self.HPs["Epochs"]):
                            for batch in MultiBatchDivider(
                                    data, self.HPs["MinibatchSize"]):

                                feed_dict = {
                                    self.s:
                                    np.asarray(batch[0]).squeeze(),
                                    self.a_his:
                                    np.asarray(batch[1]).squeeze(),
                                    self.td_target_:
                                    np.asarray(batch[2]).squeeze(),
                                    self.advantage_:
                                    np.reshape(batch[3], [-1]),
                                    self.old_log_logits_sub_:
                                    np.asarray(batch[4]).squeeze()
                                }
                                self.sess.run(
                                    self.subpolicyUpdater[subpolicyNum],
                                    feed_dict)
                    self.SubpolicyDistribution.extend(
                        np.asarray(self.buffer[traj][5]))

            self.ClearTrajectory()

    def GetStatistics(self):
        stats = {}
        for i in range(self.nPolicies):
            length = len(self.SubpolicyDistribution.tolist())
            if length == 0:
                length = 1
            stats[
                "Subpolicy Use/" +
                str(i)] = self.SubpolicyDistribution.tolist().count(i) / length
        return stats

    def ProcessBuffer(self, HPs, traj):
        """
        Process the buffer and backpropagates the loses through the NN.

        Parameters
        ----------
        Model : HPs
            Hyperparameters for training.
        traj : Trajectory
            Data stored by the neural network.
        clip : list[bool]
            List where the trajectory has finished.

        Returns
        -------
        td_target : list
            List Temporal Difference Target for particular states.
        advantage : list
            List of advantages for particular actions.
        """
        #Splitting the buffer into different episodes based on the done tag.
        split_loc = [i + 1 for i, x in enumerate(self.buffer[traj][4]) if x]
        if self.subReward:
            #Data that needs to be processed for the Low-Level Controllers
            reward_lists = np.split(self.buffer[traj][2], split_loc[:-1])
            sub_reward_lists = np.split(self.buffer[traj][3], split_loc[:-1])
            value_lists = np.split(self.buffer[traj][10], split_loc[:-1])

            #Data needed for the Hierarchical Controller
            HL_S_lists = np.split(self.buffer[traj][0], split_loc[:-1])
            HL_Critic_lists = np.split(self.buffer[traj][8], split_loc[:-1])
            HL_Logits_lists = np.split(self.buffer[traj][7], split_loc[:-1])
            HL_action_lists = np.split(self.buffer[traj][6], split_loc[:-1])
            HL_flag_lists = np.split(self.buffer[traj][9], split_loc[:-1])

            td_target = []
            advantage = []
            td_target_hier = []
            advantage_hier = []
            ll = []
            actions = []
            s = []

            for rew, s_rew, value, HL_critic, HL_ll, HL_a, HL_flag, HL_s in zip(
                    reward_lists, sub_reward_lists, value_lists,
                    HL_Critic_lists, HL_Logits_lists, HL_action_lists,
                    HL_flag_lists, HL_S_lists):
                # Calculating the per step advantage of each of the different sections
                td_target_i, advantage_i = gae(
                    s_rew.reshape(-1).tolist(),
                    value.reshape(-1).tolist(), 0, self.HPs["Gamma"],
                    self.HPs["lambda"])
                td_target.extend(td_target_i)
                advantage.extend(advantage_i)
                #Collapsing different trajectory lengths for the hierarchical controller
                split_loc_ = [i + 1 for i, x in enumerate(HL_flag[:-1]) if x]
                rew_hier = [np.sum(l) for l in np.split(rew, split_loc_)]
                value_hier = [l[0] for l in np.split(HL_critic, split_loc_)]
                actions.extend([l[0] for l in np.split(HL_a, split_loc_)])
                ll.extend([l[0] for l in np.split(HL_ll, split_loc_)])
                s.extend([l[0] for l in np.split(HL_s, split_loc_)])
                #Calculating the td_target and advantage for the hierarchical controller.
                td_target_i_, advantage_i_ = gae(
                    np.asarray(rew_hier).reshape(-1).tolist(),
                    np.asarray(value_hier).reshape(-1).tolist(), 0,
                    self.HPs["Gamma"], self.HPs["lambda"])
                td_target_hier.extend(td_target_i_)
                advantage_hier.extend(advantage_i_)

            return td_target, advantage, td_target_hier, advantage_hier, actions, ll, s
        else:

            #Data that needs to be processed for the Low-Level Controllers
            reward_lists = np.split(self.buffer[traj][2], split_loc[:-1])
            value_lists = np.split(self.buffer[traj][9], split_loc[:-1])

            #Data needed for the Hierarchical Controller
            HL_S_lists = np.split(self.buffer[traj][0], split_loc[:-1])
            HL_Critic_lists = np.split(self.buffer[traj][7], split_loc[:-1])
            HL_Logits_lists = np.split(self.buffer[traj][6], split_loc[:-1])
            HL_action_lists = np.split(self.buffer[traj][5], split_loc[:-1])
            HL_flag_lists = np.split(self.buffer[traj][8], split_loc[:-1])

            td_target = []
            advantage = []
            td_target_hier = []
            advantage_hier = []
            ll = []
            actions = []
            s = []

            for rew, value, HL_critic, HL_ll, HL_a, HL_flag, HL_s in zip(
                    reward_lists, value_lists, HL_Critic_lists,
                    HL_Logits_lists, HL_action_lists, HL_flag_lists,
                    HL_S_lists):
                # Calculating the per step advantage of each of the different sections
                td_target_i, advantage_i = gae(
                    rew.reshape(-1).tolist(),
                    value.reshape(-1).tolist(), 0, self.HPs["Gamma"],
                    self.HPs["lambda"])
                td_target.extend(td_target_i)
                advantage.extend(advantage_i)
                #Collapsing different trajectory lengths for the hierarchical controller
                split_loc_ = [i + 1 for i, x in enumerate(HL_flag[:-1]) if x]
                rew_hier = [np.sum(l) for l in np.split(rew, split_loc_)]
                value_hier = [l[0] for l in np.split(HL_critic, split_loc_)]
                actions.extend([l[0] for l in np.split(HL_a, split_loc_)])
                ll.extend([l[0] for l in np.split(HL_ll, split_loc_)])
                s.extend([l[0] for l in np.split(HL_s, split_loc_)])
                #Calculating the td_target and advantage for the hierarchical controller.
                td_target_i_, advantage_i_ = gae(
                    np.asarray(rew_hier).reshape(-1).tolist(),
                    np.asarray(value_hier).reshape(-1).tolist(), 0,
                    self.HPs["Gamma"], self.HPs["lambda"])
                td_target_hier.extend(td_target_i_)
                advantage_hier.extend(advantage_i_)

            return td_target, advantage, td_target_hier, advantage_hier, actions, ll, s

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
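
ProcessBuffer above relies on a gae() helper that is not shown in this excerpt. Below is a minimal NumPy sketch of a GAE(λ) helper consistent with how it is called (gae(rewards, values, bootstrap_value, gamma, lambda)); it is an assumed implementation of the standard formulation, and the repository's version may differ in detail.

import numpy as np

def gae(reward_list, value_list, bootstrap_value, gamma, lam):
    rewards = np.asarray(reward_list, dtype=np.float32)
    values = np.append(np.asarray(value_list, dtype=np.float32), bootstrap_value)
    deltas = rewards + gamma * values[1:] - values[:-1]   # one-step TD errors
    advantages = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):               # discounted backward accumulation
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    td_target = advantages + values[:-1]                  # regression targets for the critic
    return td_target.tolist(), advantages.tolist()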
コード例 #24
0
    def __init__(self,Model,sess,stateShape,actionSize,HPs,nTrajs=1,scope="PPO_Training"):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers of the inputs shape size. Ex [39,39,6]
        actionSize : int
            Output size of the network.
        HPs : dict
            Dictionary that contains all hyperparameters to be used in the methods training
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess=sess
        self.Model = Model

        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):
                #Placeholders
                self.s = tf.placeholder(tf.float32, [None]+stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'TD_target')
                self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold')
                self.old_log_logits_ = tf.placeholder(shape=[None, actionSize], dtype=tf.float32, name='old_logit_hold')

                #Initializing Network I/O
                inputs = {"state":self.s}
                out = self.Model(inputs)
                self.a_prob = out["actor"]
                self.v = out["critic"]
                self.log_logits = out["log_logits"]

                # Entropy
                def _log(val):
                    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))
                entropy = self.entropy = -tf.reduce_mean(self.a_prob * _log(self.a_prob), name='entropy')

                # Critic Loss
                td_error = self.td_target_ - self.v
                critic_loss = self.critic_loss = tf.reduce_mean(tf.square(td_error), name='critic_loss')

                # Actor Loss
                action_OH = tf.one_hot(self.a_his, actionSize, dtype=tf.float32)
                log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
                old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH, 1)

                # Clipped surrogate function
                ratio = tf.exp(log_prob - old_log_prob)
                surrogate = ratio * self.advantage_
                clipped_surrogate = tf.clip_by_value(ratio, 1-HPs["eps"], 1+HPs["eps"]) * self.advantage_
                surrogate_loss = tf.minimum(surrogate, clipped_surrogate, name='surrogate_loss')
                actor_loss = self.actor_loss = -tf.reduce_mean(surrogate_loss, name='actor_loss')

                actor_loss = actor_loss - entropy * HPs["EntropyBeta"]
                loss = actor_loss + critic_loss * HPs["CriticBeta"]

                # Build Trainer
                self.optimizer = tf.keras.optimizers.Adam(HPs["Critic LR"])
                self.gradients = self.optimizer.get_gradients(loss, tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope))
                self.update_ops = self.optimizer.apply_gradients(zip(self.gradients, tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)))

        #Creating variables for logging.
        self.EntropyMA = MovingAverage(400)
        self.CriticLossMA = MovingAverage(400)
        self.ActorLossMA = MovingAverage(400)
        self.GradMA = MovingAverage(400)
        self.HPs = HPs
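
Both PPO variants above build the same clipped surrogate objective in TensorFlow. The small NumPy illustration below shows the effect of the clipping with made-up probability ratios and advantages; it is a standalone sketch, not part of the repository.

import numpy as np

def clipped_surrogate_loss(log_prob, old_log_prob, advantage, eps=0.2):
    ratio = np.exp(log_prob - old_log_prob)                   # pi_new(a|s) / pi_old(a|s)
    surrogate = ratio * advantage
    clipped = np.clip(ratio, 1.0 - eps, 1.0 + eps) * advantage
    return -np.mean(np.minimum(surrogate, clipped))           # maximizing -> minimizing the negative

# Example: a large ratio with a positive advantage is clipped at 1 + eps,
# limiting how far a single update can move the policy.
print(clipped_surrogate_loss(np.log([0.9]), np.log([0.3]), np.array([1.0])))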
コード例 #25
0
ファイル: ApeX.py プロジェクト: vanstrn/RL_public
    def __init__(self,
                 sharedModel,
                 sess,
                 stateShape,
                 actionSize,
                 scope,
                 HPs,
                 sharedBuffer,
                 globalAC=None,
                 nTrajs=1,
                 targetNetwork=None):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and Critic Network to be used for the purpose of RL.
        """
        self.sess = sess
        self.scope = scope
        self.sharedBuffer = sharedBuffer
        self.actionSize = actionSize
        self.trajLength = 40

        #Creating the General I/O of the network
        self.Model = sharedModel
        with self.sess.as_default(), self.sess.graph.as_default():
            self.states_ = tf.placeholder(shape=[None] + stateShape,
                                          dtype=tf.float32,
                                          name='states')
            self.next_states_ = tf.placeholder(shape=[None] + stateShape,
                                               dtype=tf.float32,
                                               name='next_states')
            self.actions_ = tf.placeholder(shape=[None],
                                           dtype=tf.int32,
                                           name='actions_hold')
            self.rewards_ = tf.placeholder(shape=[None],
                                           dtype=tf.float32,
                                           name='rewards_hold')
            self.done_ = tf.placeholder(shape=[None],
                                        dtype=tf.float32,
                                        name='done_hold')

            if targetNetwork is not None:
                with tf.name_scope("target"):
                    out2 = targetNetwork({"state": self.next_states_})
                    q_next = out2["Q"]
                    self.targetParams = tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES, "target")

            with tf.name_scope(scope):

                input = {"state": self.states_}
                out = self.Model(input)
                self.q = out["Q"]

                # Getting variables for the specified network.
                with tf.variable_scope(scope):
                    self.params = tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES, scope)

                    # Creating the Global Actor that does all of the learning
                    if globalAC is None:
                        out2 = self.Model({"state": self.next_states_})
                        q_next = out2["Q"]
                        with tf.variable_scope(scope + "_update"):

                            with tf.name_scope('current_Q'):
                                oh_action = tf.one_hot(
                                    self.actions_,
                                    actionSize,
                                    dtype=tf.float32
                                )  # [?, num_agent, action_size]
                                curr_q = tf.reduce_sum(
                                    tf.multiply(self.q, oh_action),
                                    axis=-1)  # [?, num_agent]

                            with tf.name_scope('target_Q'):
                                max_next_q = tf.reduce_max(q_next, axis=-1)
                                td_target = self.rewards_ + HPs[
                                    "Gamma"] * max_next_q * (1. - self.done_)

                            with tf.name_scope('td_error'):
                                self.td_error = loss = tf.keras.losses.MSE(
                                    td_target, curr_q)
                                softmax_q = tf.nn.softmax(curr_q)
                                self.entropy = -tf.reduce_mean(
                                    softmax_q * tf.log(softmax_q + 1e-5))
                                self.loss = total_loss = loss + HPs[
                                    "EntropyBeta"] * self.entropy
                            optimizer = tf.keras.optimizers.Adam(
                                HPs["LearningRate"])
                            self.gradients = optimizer.get_gradients(
                                total_loss, self.params)
                            self.update_op = optimizer.apply_gradients(
                                zip(self.gradients, self.params))

                            self.grads = [self.gradients]
                            self.losses = [self.loss]
                            self.update_ops = [self.update_op]

                            self.push_ops = [
                                l_p.assign(g_p) for l_p, g_p in zip(
                                    self.targetParams, self.params)
                            ]

                        self.grad_MA = [
                            MovingAverage(400) for i in range(len(self.grads))
                        ]
                        self.loss_MA = [
                            MovingAverage(400) for i in range(len(self.losses))
                        ]
                        self.entropy_MA = MovingAverage(400)
                        self.labels = [
                            "Critic",
                        ]

                    # Creating the local networks that only pull parameters and run experiments.
                    else:
                        out2 = self.Model({"state": self.next_states_})
                        q_next = out2["Q"]
                        #Creating a local buffer that stores trajectories before they are pushed to the shared buffer
                        self.buffer = [
                            Trajectory(depth=5) for _ in range(nTrajs)
                        ]
                        with tf.variable_scope(scope + "_priority"):

                            with tf.name_scope('current_Q'):
                                oh_action = tf.one_hot(
                                    self.actions_,
                                    actionSize,
                                    dtype=tf.float32
                                )  # [?, num_agent, action_size]
                                curr_q = tf.reduce_sum(
                                    tf.multiply(self.q, oh_action),
                                    axis=-1)  # [?, num_agent]

                            with tf.name_scope('target_Q'):
                                max_next_q = tf.reduce_max(q_next, axis=-1)
                                td_target = self.rewards_ + HPs[
                                    "Gamma"] * max_next_q * (1. - self.done_)

                            with tf.name_scope('td_error'):
                                self.td_error = tf.keras.losses.MSE(
                                    td_target, curr_q)

                        with tf.name_scope('sync'):
                            self.pull_params_op = [
                                l_p.assign(g_p) for l_p, g_p in zip(
                                    self.params, globalAC.params)
                            ]

                            self.pull_ops = [self.pull_params_op]

                self.HPs = HPs
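
In the Ape-X style worker above, the TD error computed under the "_priority" scope is used as the sample priority for the shared replay buffer. The sharedBuffer implementation is not shown in this excerpt, so the sketch below assumes the standard proportional prioritization scheme; the exponent and epsilon are illustrative.

import numpy as np

def sampling_probabilities(td_errors, alpha=0.6, eps=1e-6):
    priorities = (np.abs(td_errors) + eps) ** alpha   # larger TD error -> higher priority
    return priorities / priorities.sum()

td_errors = np.array([0.05, 1.2, 0.4, 0.0])
print(sampling_probabilities(td_errors))              # transitions 1 and 2 dominate the sampling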
コード例 #26
0
ファイル: A3C_SF_v0.py プロジェクト: zd6/RL
    def __init__(self,
                 sharedModel,
                 sess,
                 stateShape,
                 actionSize,
                 scope,
                 HPs,
                 globalAC=None,
                 nTrajs=1):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and Critic Network to be used for the purpose of RL.
        """
        #Placeholders

        self.sess = sess
        self.scope = scope
        self.Model = sharedModel
        self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
        self.s_next = tf.placeholder(tf.float32, [None] + stateShape, 'S_next')
        self.a_his = tf.placeholder(tf.int32, [
            None,
        ], 'A')
        self.reward = tf.placeholder(tf.float32, [
            None,
        ], 'R')
        self.v_target = tf.placeholder(tf.float32, [None], 'Vtarget')
        self.td_target = tf.placeholder(tf.float32, [None, 128], 'TDtarget')

        input = {"state": self.s}
        out = self.Model(input)
        self.a_prob = out["actor"]
        self.v = out["critic"]
        self.state_pred = out["prediction"]
        self.phi = out["phi"]
        self.psi = out["psi"]

        if globalAC is None:  # get global network
            with tf.variable_scope(scope):
                self.a_params = self.Model.GetVariables("Actor")
                self.c_params = self.Model.GetVariables("Critic")
                self.s_params = self.Model.GetVariables("Reconstruction")
        else:  # local net, calculate losses
            self.buffer = [Trajectory(depth=8) for _ in range(nTrajs)]
            with tf.variable_scope(scope + "_update"):

                self.a_params = self.Model.GetVariables("Actor")
                self.c_params = self.Model.GetVariables("Critic")
                self.s_params = self.Model.GetVariables("Reconstruction")

                with tf.name_scope('c_loss'):
                    td = tf.subtract(self.td_target, self.psi, name='TD_error')
                    self.c_loss = tf.reduce_mean(tf.square(td))

                with tf.name_scope('a_loss'):
                    td = tf.subtract(self.v_target, self.v, name='TD_error')
                    log_prob = tf.reduce_sum(
                        tf.log(self.a_prob + 1e-5) *
                        tf.one_hot(self.a_his, actionSize, dtype=tf.float32),
                        axis=1,
                        keep_dims=True)
                    exp_v = log_prob * tf.stop_gradient(td)
                    entropy = -tf.reduce_sum(
                        self.a_prob * tf.log(self.a_prob + 1e-5),
                        axis=1,
                        keep_dims=True)  # encourage exploration
                    self.exp_v = HPs["EntropyBeta"] * entropy + exp_v
                    self.a_loss = tf.reduce_mean(-self.exp_v)

                with tf.name_scope('s_loss'):
                    self.s_loss = tf.losses.mean_squared_error(
                        self.state_pred, self.s_next)

                with tf.name_scope('local_grad'):
                    self.a_grads = tf.gradients(self.a_loss, self.a_params)
                    self.c_grads = tf.gradients(self.c_loss, self.c_params)
                    self.s_grads = tf.gradients(self.s_loss, self.s_params)

            with tf.name_scope('sync'):
                with tf.name_scope('pull'):
                    self.pull_a_params_op = [
                        l_p.assign(g_p)
                        for l_p, g_p in zip(self.a_params, globalAC.a_params)
                    ]
                    self.pull_c_params_op = [
                        l_p.assign(g_p)
                        for l_p, g_p in zip(self.c_params, globalAC.c_params)
                    ]
                    self.pull_s_params_op = [
                        l_p.assign(g_p)
                        for l_p, g_p in zip(self.s_params, globalAC.s_params)
                    ]

                with tf.name_scope('push'):
                    self.update_a_op = tf.train.AdamOptimizer(
                        HPs["Actor LR"]).apply_gradients(
                            zip(self.a_grads, globalAC.a_params))
                    self.update_c_op = tf.train.AdamOptimizer(
                        HPs["Critic LR"]).apply_gradients(
                            zip(self.c_grads, globalAC.c_params))
                    self.update_s_op = tf.train.AdamOptimizer(
                        HPs["Actor LR"]).apply_gradients(
                            zip(self.s_grads, globalAC.s_params))

            self.update_ops = [
                self.update_a_op,
                self.update_c_op,
                self.update_s_op,
            ]
            self.pull_ops = [
                self.pull_a_params_op,
                self.pull_c_params_op,
                self.pull_s_params_op,
            ]
            self.grads = [
                self.a_grads,
                self.c_grads,
                self.s_grads,
            ]
            self.losses = [
                self.a_loss,
                self.c_loss,
                self.s_loss,
            ]

            self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))]
            self.loss_MA = [MovingAverage(400) for i in range(len(self.grads))]
            self.labels = [
                "Actor",
                "Critic",
                "State",
            ]
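
The critic above regresses the successor features psi toward a 128-dimensional TD target fed through self.td_target. The exact target construction is not part of this excerpt; the sketch below shows one common way such a target could be assembled, with the feature size matching the placeholder and the rest stated as assumptions.

import numpy as np

def successor_feature_targets(phi, psi_next, done, gamma=0.99):
    # phi, psi_next: [batch, 128] feature and next-step successor-feature estimates
    # done: [batch] of 0./1. episode-termination flags
    return phi + gamma * (1.0 - done)[:, None] * psi_next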
コード例 #27
0
class NGU(Method):
    def __init__(self,
                 sharedModel,
                 sess,
                 stateShape,
                 actionSize,
                 scope,
                 HPs,
                 sharedBuffer,
                 globalNet=None,
                 nTrajs=1,
                 LSTM=False):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and Critic Network to be used for the purpose of RL.
        """
        #Placeholders
        self.LSTM = LSTM
        self.sess = sess
        self.scope = scope
        self.Model = sharedModel
        self.sharedBuffer = sharedBuffer
        #Common attributes shared between the networks:
        self.HPs = HPs

        #Creating the different values of beta
        def sigmoid(x):
            return 1 / (1 + np.exp(-x))

        self.betas = []
        for i in range(self.HPs["N"]):
            if i == 0:
                self.betas.append(0.0)
            elif i == self.HPs["N"] - 1:
                self.betas.append(self.HPs["betaMax"])
            else:
                self.betas.append(self.HPs["betaMax"] * sigmoid(
                    (2.0 * float(i) + 2.0 - self.HPs["N"]) /
                    (self.HPs["N"] - 2.0)))

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):

                #Specifying placeholders for Tensorflow Networks
                if len(stateShape) == 4:
                    self.states_ = tf.placeholder(shape=[None] +
                                                  stateShape[1:4],
                                                  dtype=tf.float32,
                                                  name='states')
                    self.next_states_ = tf.placeholder(shape=[None] +
                                                       stateShape[1:4],
                                                       dtype=tf.float32,
                                                       name='next_states')
                else:
                    self.states_ = tf.placeholder(shape=[None] + stateShape,
                                                  dtype=tf.float32,
                                                  name='states')
                    self.next_states_ = tf.placeholder(shape=[None] +
                                                       stateShape,
                                                       dtype=tf.float32,
                                                       name='next_states')
                self.actions_ = tf.placeholder(shape=[None],
                                               dtype=tf.int32,
                                               name='actions_hold')
                self.done_ = tf.placeholder(shape=[None],
                                            dtype=tf.float32,
                                            name='done_hold')
                self.rewards_ = tf.placeholder(shape=[None],
                                               dtype=tf.float32,
                                               name='total_reward')
                self.bandit_one_hot = tf.placeholder(
                    shape=[None, self.HPs["N"]],
                    dtype=tf.int32,
                    name='beta_bandit')
                self.action_past = tf.placeholder(shape=[None],
                                                  dtype=tf.int32,
                                                  name='action_past')
                self.reward_i_past = tf.placeholder(shape=[None],
                                                    dtype=tf.float32,
                                                    name='reward_i_past')
                self.reward_e_past = tf.placeholder(shape=[None],
                                                    dtype=tf.float32,
                                                    name='reward_e_past')
                self.reward_i_current = tf.placeholder(shape=[None],
                                                       dtype=tf.float32,
                                                       name='reward_i_current')
                self.reward_e_current = tf.placeholder(shape=[None],
                                                       dtype=tf.float32,
                                                       name='reward_e_current')

                # Creating the IO for the entire network
                input = {
                    "state": self.states_,
                    "state_next": self.next_states_,
                    "bandit_one_hot": self.bandit_one_hot,
                    "action_past": self.action_past,
                    "reward_i_past": self.reward_i_past,
                    "reward_e_past": self.reward_e_past,
                }
                out = self.Model(input)
                self.q = out["Q"]
                self.a_pred = out["action_prediction"]
                self.latent = out["latent_space"]
                self.rnd_random = out["RND_random"]
                self.rnd_predictor = out["RND_predictor"]

                input2 = {
                    "state": self.next_states_,
                    "state_next":
                    self.next_states_,  #Used as a placeholder in network
                    "bandit_one_hot": self.bandit_one_hot,
                    "action_past": self.actions_,
                    "reward_i_past": self.reward_i_current,
                    "reward_e_past": self.reward_e_current,
                }
                out2 = self.Model(input2)
                q_next = out2["Q"]
                with tf.name_scope('q_learning'):
                    #Current Q
                    oh_action = tf.one_hot(
                        self.actions_, actionSize,
                        dtype=tf.float32)  # [?, num_agent, action_size]
                    curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action),
                                           axis=-1)  # [?, num_agent]
                    #Next Q
                    max_next_q = tf.reduce_max(q_next, axis=-1)
                    #TD Error
                    td_target = self.rewards_ + HPs["Gamma"] * max_next_q * (
                        1. - self.done_)
                    self.td_error = loss = tf.keras.losses.MSE(
                        td_target, curr_q)
                    softmax_q = tf.nn.softmax(curr_q)
                    self.entropy = -tf.reduce_mean(
                        softmax_q * tf.log(softmax_q))
                    self.loss = loss + HPs["EntropyBeta"] * self.entropy
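                    # The block above forms the one-step Q-learning target
                    # y = r + Gamma * max_a' Q(s', a') * (1 - done) and adds a
                    # softmax-entropy bonus on the current Q values to the MSE loss.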

                self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope)

                if globalNet is None:  #Creating the Training instance of the network.
                    with tf.name_scope('embedding_network'):
                        oh_action = tf.one_hot(
                            self.actions_, actionSize,
                            dtype=tf.float32)  # [?, num_agent, action_size]
                        self.embedding_loss = tf.keras.losses.MSE(
                            oh_action, self.a_pred)

                    with tf.name_scope('life_long_curiosity'):
                        self.llc_loss = tf.keras.losses.MSE(
                            self.rnd_random, self.rnd_predictor)
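                    # Combined learner loss: the Q-learning loss plus the life-long
                    # curiosity (RND) loss and the inverse-dynamics embedding loss.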
                    loss = self.loss + self.llc_loss + self.embedding_loss

                    optimizer = tf.keras.optimizers.Adam(HPs["LearningRate"])

                    self.gradients = optimizer.get_gradients(loss, self.params)
                    self.update_op = optimizer.apply_gradients(
                        zip(self.gradients, self.params))

                    self.grads = [self.gradients]
                    self.losses = [loss]
                    self.update_ops = [self.update_op]

                    self.grad_MA = [
                        MovingAverage(400) for i in range(len(self.grads))
                    ]
                    self.loss_MA = [
                        MovingAverage(400) for i in range(len(self.losses))
                    ]
                    self.entropy_MA = MovingAverage(400)
                    self.labels = ["Total"]
                    self.HPs = HPs

                else:  #Creating a Actor Instance for the Network.
                    #Creating the Episodic Memory, which compares samples
                    self.episodicMemory = EpisodicMemory()
                    #Creating Local Buffer to store data until it is ready to push to sample buffer
                    self.buffer = [Trajectory(depth=10) for _ in range(nTrajs)]
                    #Creating a pull operation to synch network parameters
                    with tf.name_scope('sync'):
                        self.pull_params_op = [
                            l_p.assign(g_p)
                            for l_p, g_p in zip(self.params, globalNet.params)
                        ]
                        self.pull_ops = [self.pull_params_op]

    def GetAction(self,
                  state,
                  a_past,
                  r_i_past,
                  r_e_past,
                  episode=None,
                  step=0):
        """
        Contains the code to run the network based on an input.
        """
        #Fixing the state shape if there is something wrong
        if len(state.shape) == 3:
            state = state[np.newaxis, :]
        if len(state.shape) == 1:
            state = state[np.newaxis, :]

        #Selecting a new beta at the beginning of the episode
        if step == 0:
            currBeta = random.randint(0, self.HPs["N"] - 1)
            oh = np.zeros(self.HPs["N"])
            oh[currBeta] = 1
            self.betaSelect = oh
            self.currBeta = self.betas[currBeta]

        feedDict = {
            self.states_: state,
            self.bandit_one_hot: self.betaSelect[np.newaxis, :],
            self.action_past: np.asarray(a_past),
            self.reward_i_past: np.asarray(r_i_past),
            self.reward_e_past: np.asarray(r_e_past)
        }
        q = self.sess.run(self.q, feedDict)

        actions = np.argmax(q, axis=-1)
        return actions, [
            self.currBeta, self.betaSelect
        ]  # return an int and extra data that needs to be fed to the buffer.

    def Encode(self, state):
        return self.sess.run(self.latent, {self.states_: state})

    def RNDPredictionError(self, state):
        random, predictor = self.sess.run(
            [self.rnd_random, self.rnd_predictor], {self.states_: state})
        return np.linalg.norm(random - predictor)

    def GetIntrinsicReward(self, state_prev, state, episode=None, step=0):
        #Clearing the episodic buffer
        if step == 0:
            self.episodicMemory.Clear()
            self.episodicMemory.Add(self.Encode(state_prev))

        #Adding Sample to the buffer
        encodedState = self.Encode(state)
        stateError = self.RNDPredictionError(state)
        self.sharedBuffer.AddError(stateError)

        #####Calculating the episodic reward factor
        #-finding k nearest neighbors in buffer and distance to them
        K = self.episodicMemory.NearestNeighborsDist(encodedState, num=5)
        r_episodic = 1.0 / np.sqrt(K + 0.001)

        #Calculating alpha
        stateError_Average, stateError_std = self.sharedBuffer.GetMuSigma()
        alpha = 1.0 + (stateError - stateError_Average) / stateError_std

        #Calculating the intrinsic reward
        r_i = r_episodic * min(max(1.0, alpha), 5.0)
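        # The line above is the NGU-style intrinsic reward: the episodic novelty term
        # 1/sqrt(k-NN distance) modulated by the life-long RND factor alpha, which is
        # clipped to the range [1, 5].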

        #adding the sample to the buffer after nearest neighbors has been calculated.
        self.episodicMemory.Add(encodedState)
        return r_i

    def Update(self, HPs, episode=0, statistics=True):
        """
        The main update function for NGU. Samples trajectories from the shared
        replay buffer, performs the network updates, and recomputes the sample priorities.
        """
        #Process the data from the buffer
        samples, num = self.sharedBuffer.Sample()

        if num < self.HPs["BatchSize"]:
            return

        priorities = []
        for traj in samples:
            if len(traj[0]) <= 5:
                continue
            batches = len(traj[0]) // self.HPs["MinibatchSize"] + 1
            s = np.array_split(traj[0], batches)
            a_his = np.array_split(np.asarray(traj[1]).reshape(-1), batches)
            r = np.array_split(np.asarray(traj[2]).reshape(-1), batches)
            s_next = np.array_split(traj[3], batches)
            done = np.array_split(traj[4], batches)
            bandit_one_hot = np.array_split(traj[8], batches)
            action_past = np.array_split(traj[5], batches)
            reward_i_past = np.array_split(traj[6], batches)
            reward_e_past = np.array_split(traj[7], batches)

            for epoch in range(self.HPs["Epochs"]):
                for i in range(batches):
                    #Create a feedDict from the buffer
                    if len(np.squeeze(np.asarray(s[i])).shape) == 3:
                        continue
                    feedDict = {
                        self.states_:
                        np.squeeze(np.asarray(s[i])),
                        self.next_states_:
                        np.squeeze(np.asarray(s_next[i])),
                        self.actions_:
                        np.squeeze(np.asarray(a_his[i])),
                        self.rewards_:
                        np.squeeze(np.asarray(r[i])),
                        self.done_:
                        np.squeeze(np.asarray(done[i], dtype=float)),
                        self.bandit_one_hot:
                        np.squeeze(np.asarray(bandit_one_hot[i])),
                        self.action_past:
                        np.squeeze(np.asarray(action_past[i])),
                        self.reward_i_past:
                        np.squeeze(np.asarray(reward_i_past[i])),
                        self.reward_e_past:
                        np.squeeze(np.asarray(reward_e_past[i])),
                    }
                    out = self.sess.run(
                        self.update_ops + self.losses + self.grads,
                        feedDict)  # local grads applied to global net.
                    out = np.array_split(out, 3)
                    losses = out[1]
                    grads = out[2]

                    for i, loss in enumerate(losses):
                        self.loss_MA[i].append(loss)

                    for i, grads_i in enumerate(grads):
                        total_counter = 1
                        vanish_counter = 0
                        for grad in grads_i:
                            total_counter += np.prod(grad.shape)
                            vanish_counter += (np.absolute(grad) < 1e-8).sum()
                        self.grad_MA[i].append(vanish_counter / total_counter)

                    ent = self.sess.run(
                        self.entropy,
                        feedDict)  # local grads applied to global net.
                    entropy = np.average(np.asarray(ent))
                    self.entropy_MA.append(entropy)
                feedDict = {
                    self.states_: np.squeeze(np.asarray(traj[0])),
                    self.next_states_: np.squeeze(np.asarray(traj[3])),
                    self.actions_: traj[1],
                    self.rewards_: traj[2],
                    self.done_: np.squeeze(np.asarray(traj[4], dtype=float)),
                    self.bandit_one_hot: np.asarray(traj[8]),
                    self.action_past: np.squeeze(np.asarray(traj[5], )),
                    self.reward_i_past: np.squeeze(np.asarray(traj[6], )),
                    self.reward_e_past: np.squeeze(np.asarray(traj[7], )),
                }
                priorities.append(self.sess.run(self.td_error, feedDict))

        self.sharedBuffer.UpdatePriorities(priorities)

    def GetStatistics(self):
        dict = {}
        for i, label in enumerate(self.labels):
            dict["Training Results/Vanishing Gradient " +
                 label] = self.grad_MA[i]()
            dict["Training Results/Loss " + label] = self.loss_MA[i]()
            dict["Training Results/Entropy"] = self.entropy_MA()
        return dict

    def PushToBuffer(self):
        #Packaging samples in a manner that requires modification on the learner end.

        #Estimating the TD error to assign a priority to the data.

        for traj in range(len(self.buffer)):
            s = self.buffer[traj][0]
            a_his = np.asarray(self.buffer[traj][1]).reshape(-1)
            r = np.asarray(self.buffer[traj][2]).reshape(-1)
            s_next = self.buffer[traj][3]
            done = self.buffer[traj][4]
            action_past = self.buffer[traj][5]
            reward_i_past = self.buffer[traj][6]
            reward_e_past = self.buffer[traj][7]
            bandit_one_hot = self.buffer[traj][9]

            #Create a feedDict from the buffer
            feedDict = {
                self.states_: np.squeeze(np.asarray(s)),
                self.next_states_: np.squeeze(np.asarray(s_next)),
                self.actions_: np.squeeze(np.asarray(a_his)),
                self.rewards_: np.squeeze(np.asarray(r)),
                self.done_: np.squeeze(np.asarray(done, dtype=float)),
                self.bandit_one_hot: np.squeeze(np.asarray(bandit_one_hot)),
                self.action_past: np.squeeze(np.asarray(action_past)),
                self.reward_i_past: np.squeeze(np.asarray(reward_i_past)),
                self.reward_e_past: np.squeeze(np.asarray(reward_e_past)),
            }
            priority = self.sess.run(self.td_error, feedDict)

            self.sharedBuffer.AddTrajectory([
                s, a_his, r, s_next, done, action_past, reward_i_past,
                reward_e_past, bandit_one_hot
            ], priority)
        self.sharedBuffer.PrioritizeandPruneSamples(2048)
        self.ClearTrajectory()
        self.sess.run(self.pull_ops)

    @property
    def getVars(self):
        return self.Model.getVars(self.scope)
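The NGU class above returns a sampled beta alongside each action and computes the intrinsic reward separately. A minimal worker-loop sketch of how these pieces could be combined is shown below; the environment handle env, the agent and maxSteps names, and the reward mixing r = r_e + beta * r_i follow the NGU paper and are assumptions rather than code from this project.

# Hypothetical worker loop (a sketch, not code from the source project) showing how
# the sampled beta would mix extrinsic and intrinsic rewards before buffering.
s_prev = env.reset()
a_past, r_i_past, r_e_past = [0], [0.0], [0.0]
for step in range(maxSteps):
    actions, (beta, beta_oh) = agent.GetAction(s_prev, a_past, r_i_past, r_e_past, step=step)
    s_next, r_e, done, _ = env.step(actions[0])
    r_i = agent.GetIntrinsicReward(s_prev, s_next, step=step)
    r_total = r_e + beta * r_i  # assumed NGU-style reward mixing
    # ...append (s_prev, actions, r_total, s_next, done, ...) to agent.buffer here...
    a_past, r_i_past, r_e_past = [actions[0]], [r_i], [r_e]
    s_prev = s_next
    if done:
        break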
Code example #28
class PPO(Method):
    def __init__(self,
                 sess,
                 settings,
                 netConfigOverride,
                 stateShape,
                 actionSize,
                 nTrajs=1,
                 **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        sess : Tensorflow Session
            Initialized Tensorflow session
        settings : dict
            Dictionary containing the network configuration ("NetworkConfig") and the
            hyperparameters ("NetworkHPs") used to build and train the method.
        netConfigOverride : dict
            Dictionary of overrides applied to the network configuration.
        stateShape : list
            List of integers of the inputs shape size. Ex [39,39,6]
        actionSize : int
            Output size of the network.
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess = sess
        self.HPs = settings["NetworkHPs"]
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],
                                    netConfigOverride=netConfigOverride,
                                    actionSize=actionSize)
        scope = "PPO"

        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=8) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):
                #Placeholders
                self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [
                    None,
                ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None],
                                                 'TD_target')
                self.advantage_ = tf.placeholder(shape=[None],
                                                 dtype=tf.float32,
                                                 name='adv_hold')
                self.old_log_logits_ = tf.placeholder(shape=[None, actionSize],
                                                      dtype=tf.float32,
                                                      name='old_logit_hold')

                #Initializing Network I/O
                inputs = {"state": self.s}
                out = self.Model(inputs)
                self.a_prob = out["actor"]
                self.v = out["critic"]
                self.log_logits = out["log_logits"]

                # Entropy
                def _log(val):
                    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))

                entropy = self.entropy = -tf.reduce_mean(
                    self.a_prob * _log(self.a_prob), name='entropy')

                # Critic Loss
                td_error = self.td_target_ - self.v
                critic_loss = self.critic_loss = tf.reduce_mean(
                    tf.square(td_error), name='critic_loss')

                # Actor Loss
                action_OH = tf.one_hot(self.a_his,
                                       actionSize,
                                       dtype=tf.float32)
                log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
                old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH,
                                             1)

                # Clipped surrogate function
                ratio = tf.exp(log_prob - old_log_prob)
                surrogate = ratio * self.advantage_
                clipped_surrogate = tf.clip_by_value(
                    ratio, 1 - self.HPs["eps"],
                    1 + self.HPs["eps"]) * self.advantage_
                surrogate_loss = tf.minimum(surrogate,
                                            clipped_surrogate,
                                            name='surrogate_loss')
                actor_loss = self.actor_loss = -tf.reduce_mean(
                    surrogate_loss, name='actor_loss')
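                # Standard PPO clipped surrogate: the policy ratio
                # r = exp(log pi(a|s) - log pi_old(a|s)) is clipped to
                # [1 - eps, 1 + eps], and the objective takes the elementwise
                # minimum of the clipped and unclipped terms times the advantage.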

                loss = self.actor_loss + self.critic_loss * self.HPs[
                    "CriticBeta"]

                # Build Trainer: select the optimizer class from the Optimizer hyperparameter.
                optimizers = {
                    "Adam": tf.keras.optimizers.Adam,
                    "RMS": tf.keras.optimizers.RMSprop,
                    "Adagrad": tf.keras.optimizers.Adagrad,
                    "Adadelta": tf.keras.optimizers.Adadelta,
                    "Adamax": tf.keras.optimizers.Adamax,
                    "Nadam": tf.keras.optimizers.Nadam,
                    "SGD": tf.keras.optimizers.SGD,
                }
                if self.HPs["Optimizer"] in optimizers:
                    optimizerClass = optimizers[self.HPs["Optimizer"]]
                    self.optimizerA = optimizerClass(self.HPs["LR Actor"])
                    self.optimizerE = optimizerClass(self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Amsgrad":
                    # AMSGrad is the amsgrad variant of Adam.
                    self.optimizerA = tf.keras.optimizers.Adam(
                        self.HPs["LR Actor"], amsgrad=True)
                    self.optimizerE = tf.keras.optimizers.Adam(
                        self.HPs["LR Entropy"], amsgrad=True)
                else:
                    raise ValueError("Invalid optimizer selection: " +
                                     str(self.HPs["Optimizer"]))
                a_params = self.Model.GetVariables("Actor")
                c_params = self.Model.GetVariables("Critic")
                self.gradients_a = self.optimizerA.get_gradients(
                    loss, self.Model.trainable_variables)
                # capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in self.gradients_a]
                self.update_op_a = self.optimizerA.apply_gradients(
                    zip(self.gradients_a, self.Model.trainable_variables))

                entropy_loss = -self.entropy * self.HPs["EntropyBeta"]
                self.gradients_e = self.optimizerE.get_gradients(
                    entropy_loss, a_params)
                self.update_op_e = self.optimizerE.apply_gradients(
                    zip(self.gradients_e, a_params))

                total_counter = 1
                vanish_counter = 0
                for gradient in self.gradients_a:
                    total_counter += np.prod(gradient.shape)
                    vanished = tf.reduce_sum(
                        tf.cast(
                            tf.math.less_equal(tf.math.abs(gradient),
                                               tf.constant(1e-8)), tf.int32))
                    vanish_counter += vanished
                self.vanishing_gradient = vanish_counter / total_counter
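                # The quantity above is a vanishing-gradient diagnostic: the fraction
                # of gradient entries with magnitude <= 1e-8, logged as a moving average.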

        self.update_ops = [self.update_op_a, self.update_op_e]
        self.logging_ops = [
            self.actor_loss, self.critic_loss, self.entropy,
            tf.reduce_mean(self.advantage_),
            tf.reduce_mean(ratio), loss, self.vanishing_gradient
        ]
        self.labels = [
            "Loss Actor", "Loss Critic", "Entropy", "Advantage", "PPO Ratio",
            "Loss Total", "Vanishing Gradient"
        ]
        self.logging_MA = [
            MovingAverage(400) for i in range(len(self.logging_ops))
        ]
        self.count_MA = MovingAverage(400)

    def GetAction(self, state, episode=1, step=0):
        """
        Method to run data through the neural network.

        Parameters
        ----------
        state : np.array
            Data with the shape of [N, self.stateShape] where N is the number of samples

        Returns
        -------
        actions : list[int]
            List of actions based on NN output.
        extraData : list
            List of data that is passed to the execution code to be bundled with state data.
        """
        try:
            probs, log_logits, v = self.sess.run(
                [self.a_prob, self.log_logits, self.v], {self.s: state})
        except ValueError:
            probs, log_logits, v = self.sess.run(
                [self.a_prob, self.log_logits, self.v],
                {self.s: np.expand_dims(state, axis=0)})
        actions = np.array([
            np.random.choice(probs.shape[1], p=prob / sum(prob))
            for prob in probs
        ])

        confid = -np.mean(probs * np.log(probs), axis=1)
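        # Hierarchical gating: a new action is committed at the start of an episode or
        # whenever the policy entropy drops below the stored value; otherwise the
        # previously chosen action is repeated and the threshold is slowly annealed.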
        if step == 0:
            self.store_actions = actions
            self.old_confid = confid
            self.count = 0
            return actions, [v, log_logits, True]
        else:
            if confid < self.old_confid:  # commit when the policy entropy has decreased
                self.old_confid = confid
                self.store_actions = actions
                self.count_MA.append(self.count)
                self.count = 0
                return actions, [v, log_logits, True]
            else:
                if self.count >= 4:
                    self.old_confid = np.maximum(
                        self.old_confid + self.HPs["ConfidenceAnnealing"],
                        self.HPs["MinConfidence"])
                self.count += 1
                return self.store_actions, [v, log_logits, False]

    def Update(self, episode=0):
        """
        Processes the buffer and backpropagates the losses through the NN.

        Parameters
        ----------
        HPs : dict
            Hyperparameters for training.

        Returns
        -------
        N/A
        """
        #Counting number of samples.
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):

            td_target_hier, advantage_hier, actions_hier, ll_hier, s_hier = self.ProcessBuffer(
                traj)

            for epoch in range(self.HPs["Epochs"]):
                for batch in MultiBatchDivider([
                        s_hier, actions_hier, td_target_hier, advantage_hier,
                        ll_hier
                ], self.HPs["MinibatchSize"]):
                    #Staging Buffer inputs into the entries to run through the network.
                    feedDict = {
                        self.s: np.asarray(batch[0]).squeeze(),
                        self.a_his: np.asarray(batch[1]).squeeze(),
                        self.td_target_: np.asarray(batch[2]).squeeze(),
                        self.advantage_: np.reshape(batch[3], [-1]),
                        self.old_log_logits_: np.asarray(batch[4]).squeeze()
                    }
                    out = self.sess.run(
                        self.update_ops + self.logging_ops,
                        feedDict)  # local grads applied to global net.
                    logging = out[len(self.update_ops):]

                    for i, log in enumerate(logging):
                        self.logging_MA[i].append(log)

        self.ClearTrajectory()

    def GetStatistics(self):
        dict = {}
        for i, label in enumerate(self.labels):
            dict["Training Results/" + label] = self.logging_MA[i]()

        dict["Training Results/Average Traj Length"] = self.count_MA()
        return dict

    def ProcessBuffer(self, traj):
        """
        Processes the trajectory buffer to compute TD targets and advantages for the hierarchical controller.

        Parameters
        ----------
        Model : HPs
            Hyperparameters for training.
        traj : Trajectory
            Data stored by the neural network.

        Returns
        -------
        td_target : list
            List Temporal Difference Target for particular states.
        advantage : list
            List of advantages for particular actions.
        """
        # Split into different episodes based on the "done" signal. Assumes that episode terminates at done.
        # Cannot account for instances where there are multiple done signals in a row.

        split_loc = [i + 1 for i, x in enumerate(self.buffer[traj][4]) if x]

        # reward_lists = np.split(self.buffer[traj][2],split_loc)
        # value_lists = np.split(self.buffer[traj][5],split_loc)
        #
        # td_target=[]; advantage=[]
        # for rew,value in zip(reward_lists,value_lists):
        #     td_target_i, advantage_i = gae(rew.reshape(-1).tolist(),value.reshape(-1).tolist(),0,self.HPs["Gamma"],self.HPs["lambda"])
        #     td_target.extend(td_target_i); advantage.extend( advantage_i)
        # return td_target, advantage

        reward_lists = np.split(self.buffer[traj][2], split_loc[:-1])

        #Data needed for the hierarchical controller update
        HL_S_lists = np.split(self.buffer[traj][0], split_loc[:-1])
        HL_Critic_lists = np.split(self.buffer[traj][5], split_loc[:-1])
        HL_Logits_lists = np.split(self.buffer[traj][6], split_loc[:-1])
        HL_action_lists = np.split(self.buffer[traj][1], split_loc[:-1])
        HL_flag_lists = np.split(self.buffer[traj][7], split_loc[:-1])

        td_target_hier = []
        advantage_hier = []
        ll = []
        actions = []
        s = []

        for rew, HL_critic, HL_ll, HL_a, HL_flag, HL_s in zip(
                reward_lists, HL_Critic_lists, HL_Logits_lists,
                HL_action_lists, HL_flag_lists, HL_S_lists):
            #Collapsing different trajectory lengths for the hierarchical controller
            split_loc_ = [i for i, x in enumerate(HL_flag[:-1]) if x][1:]
            rew_hier = [np.sum(l) for l in np.split(rew, split_loc_)]
            value_hier = [l[0] for l in np.split(HL_critic, split_loc_)]
            actions.extend([l[0] for l in np.split(HL_a, split_loc_)])
            ll.extend([l[0] for l in np.split(HL_ll, split_loc_)])
            s.extend([l[0] for l in np.split(HL_s, split_loc_)])
            #Calculating the td_target and advantage for the hierarchical controller.
            td_target_i_, advantage_i_ = gae(
                np.asarray(rew_hier).reshape(-1).tolist(),
                np.asarray(value_hier).reshape(-1).tolist(), 0,
                self.HPs["Gamma"], self.HPs["lambda"])
            td_target_hier.extend(td_target_i_)
            advantage_hier.extend(advantage_i_)

        return td_target_hier, advantage_hier, actions, ll, s

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
Code example #29
class PPO(Method):

    def __init__(self,Model,sess,stateShape,actionSize,HPs,nTrajs=1,scope="PPO_Training"):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers of the inputs shape size. Ex [39,39,6]
        actionSize : int
            Output size of the network.
        HPs : dict
            Dictionary that contains all hyperparameters to be used in the methods training
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess=sess
        self.Model = Model

        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):
                #Placeholders
                self.s = tf.placeholder(tf.float32, [None]+stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'TD_target')
                self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold')
                self.old_log_logits_ = tf.placeholder(shape=[None, actionSize], dtype=tf.float32, name='old_logit_hold')

                #Initializing Network I/O
                inputs = {"state":self.s}
                out = self.Model(inputs)
                self.a_prob = out["actor"]
                self.v = out["critic"]
                self.log_logits = out["log_logits"]

                # Entropy
                def _log(val):
                    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))
                entropy = self.entropy = -tf.reduce_mean(self.a_prob * _log(self.a_prob), name='entropy')

                # Critic Loss
                td_error = self.td_target_ - self.v
                critic_loss = self.critic_loss = tf.reduce_mean(tf.square(td_error), name='critic_loss')

                # Actor Loss
                action_OH = tf.one_hot(self.a_his, actionSize, dtype=tf.float32)
                log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
                old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH, 1)

                # Clipped surrogate function
                ratio = tf.exp(log_prob - old_log_prob)
                surrogate = ratio * self.advantage_
                clipped_surrogate = tf.clip_by_value(ratio, 1-HPs["eps"], 1+HPs["eps"]) * self.advantage_
                surrogate_loss = tf.minimum(surrogate, clipped_surrogate, name='surrogate_loss')
                actor_loss = self.actor_loss = -tf.reduce_mean(surrogate_loss, name='actor_loss')

                actor_loss = actor_loss - entropy * HPs["EntropyBeta"]
                loss = actor_loss + critic_loss * HPs["CriticBeta"]

                # Build Trainer
                self.optimizer = tf.keras.optimizers.Adam(HPs["Critic LR"])
                self.gradients = self.optimizer.get_gradients(loss, tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope))
                self.update_ops = self.optimizer.apply_gradients(zip(self.gradients, tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)))

        #Creating variables for logging.
        self.EntropyMA = MovingAverage(400)
        self.CriticLossMA = MovingAverage(400)
        self.ActorLossMA = MovingAverage(400)
        self.GradMA = MovingAverage(400)
        self.HPs = HPs

    def GetAction(self, state, episode=1,step=0):
        """
        Method to run data through the neural network.

        Parameters
        ----------
        state : np.array
            Data with the shape of [N, self.stateShape] where N is the number of samples

        Returns
        -------
        actions : list[int]
            List of actions based on NN output.
        extraData : list
            List of data that is passed to the execution code to be bundled with state data.
        """
        try:
            probs,log_logits,v = self.sess.run([self.a_prob,self.log_logits,self.v], {self.s: state})
        except ValueError:
            probs,log_logits,v = self.sess.run([self.a_prob,self.log_logits,self.v], {self.s: np.expand_dims(state,axis=0)})
        actions = np.array([np.random.choice(probs.shape[1], p=prob / sum(prob)) for prob in probs])
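        # Frame skipping: a fresh action is sampled every FS steps and repeated in between.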

        if step % self.HPs["FS"] == 0:
            self.store_actions = actions
            return actions, [v,log_logits]
        else:
            return self.store_actions, [v,log_logits]

    def Update(self,HPs,episode=0):
        """
        Processes the buffer and backpropagates the losses through the NN.

        Parameters
        ----------
        HPs : dict
            Hyperparameters for training.

        Returns
        -------
        N/A
        """
        #Counting number of samples.
        samples=0
        for i in range(len(self.buffer)):
            samples +=len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):

            td_target, advantage = self.ProcessBuffer(traj)

            batches = len(self.buffer[traj][0])//self.HPs["MinibatchSize"]+1
            s = np.array_split( self.buffer[traj][0], batches)
            a_his = np.array_split( np.asarray(self.buffer[traj][1]).reshape(-1), batches)
            td_target_ = np.array_split( td_target, batches)
            advantage_ = np.array_split( np.reshape(advantage, [-1]), batches)
            old_log_logits_ = np.array_split( np.reshape(self.buffer[traj][6], [-1,self.actionSize]), batches)

            for epoch in range(self.HPs["Epochs"]):
                for i in range(batches):
                    #Staging Buffer inputs into the entries to run through the network.
                    feed_dict = {self.s: np.squeeze(s[i]),
                                 self.a_his: a_his[i],
                                 self.td_target_:td_target_[i],
                                 self.advantage_: advantage_[i],
                                 self.old_log_logits_: old_log_logits_[i]}
                    aLoss, cLoss, entropy,grads, _ = self.sess.run([self.actor_loss,self.critic_loss,self.entropy,self.gradients,self.update_ops], feed_dict)

                    self.EntropyMA.append(entropy)
                    self.CriticLossMA.append(cLoss)
                    self.ActorLossMA.append(aLoss)
                    total_counter = 0
                    vanish_counter = 0
                    for grad in grads:
                        total_counter += np.prod(grad.shape)
                        vanish_counter += (np.absolute(grad)<1e-8).sum()
                    self.GradMA.append(vanish_counter/total_counter)

        self.ClearTrajectory()


    def GetStatistics(self):
        dict = {"Training Results/Entropy": self.EntropyMA(),
                "Training Results/Loss Critic": self.CriticLossMA(),
                "Training Results/Loss Actor": self.ActorLossMA(),
                "Training Results/Vanishing Gradient": self.GradMA()}
        return dict


    def ProcessBuffer(self,traj):
        """
        Processes the trajectory buffer to compute TD targets and advantages.

        Parameters
        ----------
        Model : HPs
            Hyperparameters for training.
        traj : Trajectory
            Data stored by the neural network.

        Returns
        -------
        td_target : list
            List Temporal Difference Target for particular states.
        advantage : list
            List of advantages for particular actions.
        """
        # Split into different episodes based on the "done" signal. Assumes that episode terminates at done.
        # Cannot account for instances where there are multiple done signals in a row.

        split_loc = [i+1 for i, x in enumerate(self.buffer[traj][4]) if x]

        reward_lists = np.split(self.buffer[traj][2],split_loc)
        value_lists = np.split(self.buffer[traj][5],split_loc)

        td_target=[]; advantage=[]
        for rew,value in zip(reward_lists,value_lists):
            td_target_i, advantage_i = gae(rew.reshape(-1).tolist(),value.reshape(-1).tolist(),0,self.HPs["Gamma"],self.HPs["lambda"])
            td_target.extend(td_target_i); advantage.extend( advantage_i)
        return td_target, advantage

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
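Both PPO variants above delegate the return and advantage computation to a gae(...) helper that is not shown in this excerpt. The sketch below assumes it implements standard Generalized Advantage Estimation (Schulman et al.) with the call signature used in ProcessBuffer; the project's actual helper may differ in details such as bootstrapping or normalization.

# Hypothetical sketch of the gae(...) helper used in ProcessBuffer above, assuming
# standard Generalized Advantage Estimation.
def gae(rewards, values, bootstrap_value, gamma, lambd):
    """Returns per-step TD(lambda) targets and GAE advantages for one episode."""
    values_ext = list(values) + [bootstrap_value]
    advantages = []
    running = 0.0
    for t in reversed(range(len(rewards))):
        # One-step TD error, then the exponentially weighted GAE recursion.
        delta = rewards[t] + gamma * values_ext[t + 1] - values_ext[t]
        running = delta + gamma * lambd * running
        advantages.insert(0, running)
    td_targets = [adv + v for adv, v in zip(advantages, values)]
    return td_targets, advantages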
Code example #30
File: A3C.py Project: zd6/RL
    def __init__(self,
                 sharedModel,
                 sess,
                 stateShape,
                 actionSize,
                 scope,
                 HPs,
                 globalAC=None,
                 nTrajs=1):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and Critic Network to be used for the purpose of RL.
        """
        #Placeholders

        self.sess = sess
        self.scope = scope
        self.Model = sharedModel
        if len(stateShape) == 4:
            self.s = tf.placeholder(tf.float32, [None] + stateShape[1:4], 'S')
        else:
            self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
        self.a_his = tf.placeholder(tf.int32, [
            None,
        ], 'A')
        self.v_target = tf.placeholder(tf.float32, [None], 'Vtarget')

        input = {"state": self.s}
        out = self.Model(input)
        self.a_prob = out["actor"]
        self.v = out["critic"]

        if globalAC is None:  # get global network
            with tf.variable_scope(scope):
                self.a_params = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES,
                    scope=self.Model.scope + '/Shared') + tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        scope=self.Model.scope + '/Actor')
                self.c_params = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES,
                    scope=self.Model.scope + '/Shared') + tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        scope=self.Model.scope + '/Critic')
        else:  # local net, calculate losses
            self.buffer = [Trajectory(depth=6) for _ in range(nTrajs)]
            with tf.variable_scope(scope + "_update"):

                self.a_params = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES,
                    scope=self.Model.scope + '/Shared') + tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        scope=self.Model.scope + '/Actor')
                self.c_params = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES,
                    scope=self.Model.scope + '/Shared') + tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        scope=self.Model.scope + '/Critic')
                td = tf.subtract(self.v_target, self.v, name='TD_error')
                with tf.name_scope('c_loss'):
                    self.c_loss = tf.reduce_mean(tf.square(td))

                with tf.name_scope('a_loss'):
                    log_prob = tf.reduce_sum(
                        tf.log(self.a_prob + 1e-5) *
                        tf.one_hot(self.a_his, actionSize, dtype=tf.float32),
                        axis=1,
                        keep_dims=True)
                    exp_v = log_prob * tf.stop_gradient(td)
                    entropy = -tf.reduce_sum(
                        self.a_prob * tf.log(self.a_prob + 1e-5),
                        axis=1,
                        keep_dims=True)  # encourage exploration
                    self.entropy = entropy
                    self.exp_v = HPs["EntropyBeta"] * entropy + exp_v
                    self.a_loss = tf.reduce_mean(-self.exp_v)

                with tf.name_scope('local_grad'):
                    self.a_grads = tf.gradients(self.a_loss, self.a_params)
                    self.c_grads = tf.gradients(self.c_loss, self.c_params)

            with tf.name_scope('sync'):
                with tf.name_scope('pull'):
                    self.pull_a_params_op = [
                        l_p.assign(g_p)
                        for l_p, g_p in zip(self.a_params, globalAC.a_params)
                    ]
                    self.pull_c_params_op = [
                        l_p.assign(g_p)
                        for l_p, g_p in zip(self.c_params, globalAC.c_params)
                    ]
                with tf.name_scope('push'):
                    self.update_a_op = tf.train.AdamOptimizer(
                        HPs["Actor LR"]).apply_gradients(
                            zip(self.a_grads, globalAC.a_params))
                    self.update_c_op = tf.train.AdamOptimizer(
                        HPs["Critic LR"]).apply_gradients(
                            zip(self.c_grads, globalAC.c_params))

            self.update_ops = [
                self.update_a_op,
                self.update_c_op,
            ]
            self.pull_ops = [
                self.pull_a_params_op,
                self.pull_c_params_op,
            ]
            self.grads = [
                self.a_grads,
                self.c_grads,
            ]
            self.losses = [
                self.a_loss,
                self.c_loss,
            ]

            self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))]
            self.loss_MA = [MovingAverage(400) for i in range(len(self.losses))]
            self.entropy_MA = MovingAverage(400)
            self.labels = [
                "Actor",
                "Critic",
            ]
            self.HPs = HPs
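
The local A3C instance above exposes update_ops (apply local gradients to the global network) and pull_ops (copy the global parameters back), but its Update method is not shown in this excerpt. A hedged sketch of what such a worker update could look like is given below; the Monte-Carlo return target, the HPs["Gamma"] key, and the ClearTrajectory helper are assumptions based on the other methods in these examples, not code from the source project.

    # Hypothetical worker update for the local A3C instance above (a sketch, not
    # code from the source project). Assumes the module-level numpy import (np).
    def Update(self, HPs, episode=0, statistics=True):
        for traj in range(len(self.buffer)):
            rewards = np.asarray(self.buffer[traj][2]).reshape(-1)
            # Assumed Monte-Carlo discounted return as the critic target.
            v_target, running = [], 0.0
            for r in reversed(rewards):
                running = r + HPs["Gamma"] * running  # HPs["Gamma"] is an assumed key
                v_target.insert(0, running)
            feedDict = {
                self.s: np.squeeze(np.asarray(self.buffer[traj][0])),
                self.a_his: np.asarray(self.buffer[traj][1]).reshape(-1),
                self.v_target: v_target,
            }
            self.sess.run(self.update_ops, feedDict)  # push local gradients to the global net
            self.sess.run(self.pull_ops)              # pull the refreshed global parameters
        self.ClearTrajectory()  # assumed helper from the Method base class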