Example #1
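This example assumes numpy as np, the TensorFlow 1.x API as tf, and the project's own Actor, Critic, Noise, and Memory classes defined elsewhere in the repository.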
class DDPG:

    def __init__(self, sess, params):
        self.sess = sess
        self.__dict__.update(params)
        # create placeholders
        self.create_input_placeholders()
        # create actor/critic models
        self.actor = Actor(self.sess, self.inputs, **self.actor_params)
        self.critic = Critic(self.sess, self.inputs, **self.critic_params)
        self.noise_params = {k: np.array(list(map(float, v.split(","))))
                             for k, v in self.noise_params.items()}
        self.noise = Noise(**self.noise_params)
        self.ou_level = np.zeros(self.dimensions["u"])
        self.memory = Memory(self.n_mem_objects,
                             self.memory_size)

    def create_input_placeholders(self):
        self.inputs = {}
        with tf.name_scope("inputs"):
            for ip_name, dim in self.dimensions.items():
                self.inputs[ip_name] = tf.placeholder(tf.float32,
                                                      shape=(None, dim),
                                                      name=ip_name)
            self.inputs["g"] = tf.placeholder(tf.float32,
                                              shape=self.inputs["u"].shape,
                                              name="a_grad")
            self.inputs["p"] = tf.placeholder(tf.float32,
                                              shape=(None, 1),
                                              name="pred_q")

    def step(self, x, is_u_discrete, explore=True):
        x = x.reshape(-1, self.dimensions["x"])
        u = self.actor.predict(x)
        if explore:
            self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
            u = u + self.ou_level
        q = self.critic.predict(x, u)
        if is_u_discrete:
            return [np.argmax(u), u[0], q[0]]
        return [u[0], u, q[0]]

    def remember(self, experience):
        self.memory.add(experience)

    def train(self):
        # check if the memory contains enough experiences
        if self.memory.size < 3*self.b_size:
            return
        x, g, ag, u, r, nx, ng, t = self.get_batch()
        # HER: relabel a random ~80% of the batch with the achieved goal and mark those transitions as successful
        her_idxs = np.where(np.random.random(self.b_size) < 0.80)[0]
        # print("{} of {} selected for HER transitions".
        # format(len(her_idxs), self.b_size))
        g[her_idxs] = ag[her_idxs]
        r[her_idxs] = 1
        t[her_idxs] = 1
        x = np.hstack([x, g])
        nx = np.hstack([nx, ng])
        nu = self.actor.predict_target(nx)
        tq = r + self.gamma*self.critic.predict_target(nx, nu)*(1-t)
        self.critic.train(x, u, tq)
        grad = self.critic.get_action_grads(x, u)
        # print("Grads:\n", g)
        self.actor.train(x, grad)
        self.update_targets()

    def get_batch(self):
        return self.memory.sample(self.b_size)

    def update_targets(self):
        self.critic.update_target()
        self.actor.update_target()
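The Noise class used for exploration in step() is not shown here. As a point of reference, a minimal standalone sketch of an Ornstein-Uhlenbeck process, the kind of temporally correlated noise DDPG typically adds to actions, could look like the following; the class name, parameters, and defaults are illustrative and not taken from the original Noise implementation.

import numpy as np

class OUNoise:
    # Ornstein-Uhlenbeck process: temporally correlated exploration noise for continuous actions.
    def __init__(self, dim, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu = mu * np.ones(dim)
        self.theta = theta  # pull strength towards the mean
        self.sigma = sigma  # scale of the random kicks
        self.dt = dt
        self.level = np.zeros(dim)

    def step(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        dx = (self.theta * (self.mu - self.level) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(self.level.shape[0]))
        self.level = self.level + dx
        return self.level

# Usage: add the correlated noise to the actor's action while exploring.
noise = OUNoise(dim=2)
action = np.zeros(2)  # stand-in for actor.predict(x)
noisy_action = action + noise.step()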
Example #2
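This example assumes numpy as np, the TensorFlow 1.x API as tf, and the project's own Actor, Critic, ReplayBuffer, and OUActionNoise classes defined elsewhere in the repository.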
class Agent(object):
    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 tau,
                 env,
                 gamma=0.99,
                 max_size=10000,
                 layer1_size=400,
                 layer2_size=300,
                 batch_size=64):
        n_actions = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.sess = tf.Session()

        self.actor = Actor(alpha,
                           n_actions,
                           'Actor',
                           input_dims,
                           self.sess,
                           layer1_size,
                           layer2_size,
                           env.action_space.high,
                           self.batch_size,
                           ckpt_dir='tmp/ddpg/actor')

        self.critic = Critic(beta,
                             n_actions,
                             'Critic',
                             input_dims,
                             self.sess,
                             layer1_size,
                             layer2_size,
                             self.batch_size,
                             ckpt_dir='tmp/ddpg/critic')

        self.target_actor = Actor(alpha,
                                  n_actions,
                                  'TargetActor',
                                  input_dims,
                                  self.sess,
                                  layer1_size,
                                  layer2_size,
                                  env.action_space.high,
                                  self.batch_size,
                                  ckpt_dir='tmp/ddpg/target_actor')

        self.target_critic = Critic(beta,
                                    n_actions,
                                    'TargetCritic',
                                    input_dims,
                                    self.sess,
                                    layer1_size,
                                    layer2_size,
                                    self.batch_size,
                                    ckpt_dir='tmp/ddpg/target_critic')

        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        self.update_actor = [
            self.target_actor.params[i].assign(
                tf.multiply(self.actor.params[i], self.tau) +
                tf.multiply(self.target_actor.params[i], 1. - self.tau))
            for i in range(len(self.target_actor.params))
        ]

        self.update_critic = [
            self.target_critic.params[i].assign(
                tf.multiply(self.critic.params[i], self.tau) +
                tf.multiply(self.target_critic.params[i], 1. - self.tau))
            for i in range(len(self.target_critic.params))
        ]

        self.sess.run(tf.global_variables_initializer())

        self.update_target_network_parameters(first=True)

    def update_target_network_parameters(self, first=False):
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                if first:
                    old_tau = self.tau
                    self.tau = 1.0
                    self.target_actor.sess.run(self.update_actor)
                    self.target_critic.sess.run(self.update_critic)
                    self.tau = old_tau
                else:
                    self.target_critic.sess.run(self.update_critic)
                    self.target_actor.sess.run(self.update_actor)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        # print("State[0]: ",state[0].shape)
        # print("State[1]: ",state[1].shape)
        state1 = state[0][np.newaxis, :]
        state2 = state[1][np.newaxis, :]
        state = [state1, state2]
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                mu = self.actor.predict(state)
        noise = self.noise()
        mu_prime = mu + noise

        return mu_prime[0]

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                state, action, reward, new_state, done = \
                                            self.memory.sample_buffer(self.batch_size)
                # target Q-value for new_state, using the target actor's bounded action (forward pass)
                critic_value_ = self.target_critic.predict(
                    new_state, self.target_actor.predict(new_state))

                target = []
                for j in range(self.batch_size):
                    target.append(reward[j] +
                                  self.gamma * critic_value_[j] * done[j])

                target = np.reshape(target, (self.batch_size, 1))

                _ = self.critic.train(state, action, target)  #s_i, a_i and y_i

                # a = mu(s_i)
                a_outs = self.actor.predict(state)
                # gradients of Q w.r.t actions
                grads = self.critic.get_action_gradients(state, a_outs)

                self.actor.train(state, grads[0])

                self.update_target_network_parameters()  # soft Polyak update of the target networks

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()
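The update_actor and update_critic assign ops above implement Polyak averaging of the target-network weights. Detached from TensorFlow, the same soft-update rule can be sketched in plain NumPy as follows; the parameter lists are illustrative stand-ins for the actor or critic weights.

import numpy as np

def soft_update(target_params, online_params, tau):
    # theta_target <- tau * theta_online + (1 - tau) * theta_target
    return [tau * w + (1.0 - tau) * w_t
            for w, w_t in zip(online_params, target_params)]

online = [np.ones((4, 4)), np.ones(4)]    # stand-in for the online network's weights
target = [np.zeros((4, 4)), np.zeros(4)]  # stand-in for the target network's weights

target = soft_update(target, online, tau=1.0)    # hard copy, as in the first=True branch
target = soft_update(target, online, tau=0.001)  # slow tracking of the online network afterwards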
Example #3
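This example assumes numpy as np, TensorFlow 1.x (tf.Session), and the project's own Policy, Critic, Scaler, and Experience classes defined elsewhere in the repository.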
class Agent:
    #Warning! policy.py and critic.py are still work in progress and contain many global variables that should be converted to
    #class member variables. Before that is done, all instances of Agent must use the same values for the following:
    #PPOepsilon,nHidden,nUnitsPerLayer,activation,H,entropyLossWeight,sdLowLimit
    def __init__(self,
                 stateDim: int,
                 actionDim: int,
                 actionMin: np.array,
                 actionMax: np.array,
                 learningRate=0.0005,
                 gamma=0.99,
                 GAElambda=0.95,
                 PPOepsilon=0.2,
                 PPOentropyLossWeight=0,
                 nHidden: int = 2,
                 nUnitsPerLayer: int = 128,
                 mode="PPO-CMA-m",
                 activation="lrelu",
                 H: int = 9,
                 entropyLossWeight: float = 0,
                 sdLowLimit=0.01,
                 useScaler: bool = True,
                 criticTimestepScale=0.001):
        #Create policy network
        print("Creating policy")
        self.actionMin = actionMin.copy()
        self.actionMax = actionMax.copy()
        self.actionDim = actionDim
        self.stateDim = stateDim
        self.useScaler = useScaler
        if useScaler:
            self.scaler = Scaler(stateDim)
        self.scalerInitialized = False
        self.normalizeAdvantages = True
        self.gamma = gamma
        self.GAElambda = GAElambda
        self.criticTimestepScale = 0 if gamma == 0 else criticTimestepScale  #with gamma==0, no need for this
        piEpsilon = None
        nHistory = 1
        negativeAdvantageAvoidanceSigma = 0
        if mode == "PPO-CMA" or mode == "PPO-CMA-m":
            usePPOLoss = False  #if True, we use PPO's clipped surrogate loss function instead of the standard -A_i * log(pi(a_i | s_i))
            separateVarAdapt = True
            self.reluAdvantages = True if mode == "PPO-CMA" else False
            nHistory = H  #policy mean adapts immediately, policy covariance as an aggregate of this many past iterations
            useSigmaSoftClip = True
            negativeAdvantageAvoidanceSigma = 1 if mode == "PPO-CMA-m" else 0
        elif mode == "PPO":
            usePPOLoss = True  #if True, we use PPO's clipped surrogate loss function instead of the standard -A_i * log(pi(a_i | s_i))
            separateVarAdapt = False
            # separateSigmaAdapt=False
            self.reluAdvantages = False
            useSigmaSoftClip = True
            piEpsilon = 0
        else:
            raise ValueError("Unknown mode {}".format(mode))
        self.policy = Policy(
            stateDim,
            actionDim,
            actionMin,
            actionMax,
            entropyLossWeight=PPOentropyLossWeight,
            networkActivation=activation,
            networkDepth=nHidden,
            networkUnits=nUnitsPerLayer,
            networkSkips=False,
            learningRate=learningRate,
            minSigma=sdLowLimit,
            PPOepsilon=PPOepsilon,
            usePPOLoss=usePPOLoss,
            separateVarAdapt=separateVarAdapt,
            nHistory=nHistory,
            useSigmaSoftClip=useSigmaSoftClip,
            piEpsilon=piEpsilon,
            negativeAdvantageAvoidanceSigma=negativeAdvantageAvoidanceSigma)

        #Create critic network, +1 stateDim because at least in OpenAI gym, episodes are time-limited and the value estimates thus depend on simulation time.
        #Thus, we use time step as an additional feature for the critic.
        #Note that this does not mess up generalization, as the feature is not used for the policy during training or at runtime
        print("Creating critic network")
        self.critic = Critic(stateDim=stateDim + 1,
                             learningRate=learningRate,
                             nHidden=nHidden,
                             networkUnits=nUnitsPerLayer,
                             networkActivation=activation,
                             useSkips=False,
                             lossType="L1")

        #Experience trajectory buffers for the memorize() and updateWithMemorized() methods
        self.experienceTrajectories = []
        self.currentTrajectory = []

    #call this after tensorflow's global variables initializer
    def init(self, sess: tf.Session, verbose=False):
        #Pretrain the policy to output the initial Gaussian for all states
        self.policy.init(
            sess, 0, 1,
            0.5 * (self.actionMin + self.actionMax) * np.ones(self.actionDim),
            0.5 * (self.actionMax - self.actionMin) * np.ones(self.actionDim),
            256, 2000, verbose)

    #stateObs is an n-by-m tensor, where n = number of observations, m = number of observation variables
    def act(self,
            sess: tf.Session,
            stateObs: np.array,
            deterministic=False,
            clipActionToLimits=True):
        #Expand a single 1d-observation into a batch of 1 vectors
        if len(stateObs.shape) == 1:
            stateObs = np.reshape(stateObs, [1, stateObs.shape[0]])
        #Query the policy for the action, except for the first iteration where we sample directly from the initial exploration Gaussian
        #that covers the whole action space.
        #This is done because we don't know the scale of state observations a priori; thus, we can only init the state scaler in update(),
        #after we have collected some experience.
        if self.useScaler and (not self.scalerInitialized):
            actions = np.random.normal(
                0.5 * (self.actionMin + self.actionMax) *
                np.ones(self.actionDim),
                0.5 * (self.actionMax - self.actionMin) *
                np.ones(self.actionDim),
                size=[stateObs.shape[0], self.actionDim])
            if clipActionToLimits:
                actions = np.clip(
                    actions, np.reshape(self.actionMin, [1, self.actionDim]),
                    np.reshape(self.actionMax, [1, self.actionDim]))
            return actions
        else:
            if self.useScaler:
                scaledObs = self.scaler.process(stateObs)
            else:
                scaledObs = stateObs
            if deterministic:
                actions = self.policy.getExpectation(sess, scaledObs)
            else:
                actions = self.policy.sample(sess, scaledObs)
            if clipActionToLimits:
                actions = np.clip(actions, self.actionMin, self.actionMax)
            return actions

    def memorize(self, observation: np.array, action: np.array, reward: float,
                 nextObservation: np.array, done: bool):
        e = Experience(observation, action, reward, nextObservation, done)
        self.currentTrajectory.append(e)
        if done:
            self.experienceTrajectories.append(self.currentTrajectory)
            self.currentTrajectory = []

    def getAverageActionStdev(self):
        if self.useScaler and (not self.scalerInitialized):
            return np.mean(0.5 * (self.actionMax - self.actionMin))
        else:
            return self.policy.usedSigmaSum / (1e-20 +
                                               self.policy.usedSigmaSumCounter)

    #If you call memorize() after each action, you can update the agent with this method.
    #If you handle the experience buffers yourself, e.g., due to a multithreaded implementation, use the update() method instead.
    def updateWithMemorized(self,
                            sess: tf.Session,
                            batchSize: int = 512,
                            nBatches: int = 100,
                            verbose=True,
                            valuesValid=False,
                            timestepsValid=False):
        self.update(sess,
                    experienceTrajectories=self.experienceTrajectories,
                    batchSize=batchSize,
                    nBatches=nBatches,
                    verbose=verbose,
                    valuesValid=valuesValid,
                    timestepsValid=timestepsValid)
        averageEpisodeReturn = 0
        for t in self.experienceTrajectories:
            episodeReturn = 0
            for e in t:
                episodeReturn += e.r
            averageEpisodeReturn += episodeReturn
        averageEpisodeReturn /= len(self.experienceTrajectories)
        self.experienceTrajectories = []
        self.currentTrajectory = []
        return averageEpisodeReturn

    #experienceTrajectories is a list of lists of Experience instances such that each of the contained lists corresponds to an episode simulation trajectory
    def update(self,
               sess: tf.Session,
               experienceTrajectories,
               batchSize: int = 512,
               nBatches: int = 100,
               verbose=True,
               valuesValid=False,
               timestepsValid=False):
        trajectories = experienceTrajectories  #shorthand

        #Collect all data into linear arrays for training.
        nTrajectories = len(trajectories)
        nData = 0
        for trajectory in trajectories:
            nData += len(trajectory)
            #propagate values backwards along trajectory if not already done
            if not valuesValid:
                for i in reversed(range(len(trajectory) - 1)):
                    #value estimates, used for training the critic and estimating advantages
                    trajectory[i].V = trajectory[
                        i].r + self.gamma * trajectory[i + 1].V
            #update time steps if not updated
            if not timestepsValid:
                for i in range(len(trajectory)):
                    trajectory[i].timeStep = i
        allStates = np.zeros([nData, self.stateDim])
        allActions = np.zeros([nData, self.actionDim])
        allValues = np.zeros([nData])
        allTimes = np.zeros([nData, 1])
        k = 0
        for trajectory in trajectories:
            for e in trajectory:
                allStates[k, :] = e.s
                allValues[k] = e.V
                allActions[k, :] = e.a
                allTimes[k, 0] = e.timeStep * self.criticTimestepScale
                k += 1

        #Update scalers
        if self.useScaler:
            self.scaler.update(allStates)
            scale, offset = self.scaler.get()
            self.scalerInitialized = True
        else:
            offset = 0
            scale = 1

        #Scale the observations for training the critic
        scaledStates = self.scaler.process(allStates) if self.useScaler else allStates

        #Train critic
        def augmentCriticObs(obs: np.array, timeSteps: np.array):
            return np.concatenate([obs, timeSteps], axis=1)

        self.critic.train(sess,
                          augmentCriticObs(scaledStates, allTimes),
                          allValues,
                          batchSize,
                          nEpochs=0,
                          nBatches=nBatches,
                          verbose=verbose)

        #Policy training needs advantages, which depend on the critic we just trained.
        #We use Generalized Advantage Estimation by Schulman et al.
        if verbose:
            print("Estimating advantages for {} trajectories...".format(len(trajectories)))
        for t in trajectories:
            #query the critic values of all states of this trajectory in one big batch
            nSteps = len(t)
            states = np.zeros([nSteps + 1, self.stateDim])
            timeSteps = np.zeros([nSteps + 1, 1])
            for i in range(nSteps):
                states[i, :] = t[i].s
                timeSteps[i, 0] = t[i].timeStep * self.criticTimestepScale
            states[nSteps, :] = t[nSteps - 1].s_next
            states = (states - offset) * scale
            values = self.critic.predict(sess,
                                         augmentCriticObs(states, timeSteps))

            #GAE loop, i.e., take the instantaneous advantage (how much value a single action brings, assuming that the
            #values given by the critic are unbiased), and smooth those along the trajectory using 1st-order IIR filter.
            for step in reversed(range(nSteps - 1)):
                delta_t = t[step].r + self.gamma * values[step +
                                                          1] - values[step]
                t[step].advantage = delta_t + self.GAElambda * self.gamma * t[
                    step + 1].advantage

        #Gather the advantages to linear array and apply ReLU and normalization if needed
        allAdvantages = np.zeros([nData])
        k = 0
        for trajectory in trajectories:
            for e in trajectory:
                allAdvantages[k] = e.advantage
                k += 1

        if self.reluAdvantages:
            allAdvantages = np.clip(allAdvantages, 0, np.inf)
        if self.normalizeAdvantages:
            aMean = np.mean(allAdvantages)
            aSd = np.std(allAdvantages)
            if verbose:
                print("Advantage mean {}, sd {}".format(aMean, aSd))
            allAdvantages /= 1e-10 + aSd

        #Train policy. Note that this uses original unscaled states, because the PPO-CMA variance training needs a history of
        #states in the same scale
        self.policy.train(sess,
                          allStates,
                          allActions,
                          allAdvantages,
                          batchSize,
                          nEpochs=0,
                          nBatches=nBatches,
                          stateOffset=offset,
                          stateScale=scale,
                          verbose=verbose)
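For reference, the GAE recursion that the loop above smooths along each trajectory can also be written as a standalone NumPy function. This is the textbook formulation (Schulman et al.), so details such as the treatment of the final step may differ slightly from the code above, and the sample rewards and values are made up.

import numpy as np

def gae_advantages(rewards, values, gamma=0.99, lam=0.95):
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), smoothed backwards with weight gamma * lam
    n = len(rewards)
    advantages = np.zeros(n)
    gae = 0.0
    for step in reversed(range(n)):
        delta = rewards[step] + gamma * values[step + 1] - values[step]
        gae = delta + gamma * lam * gae
        advantages[step] = gae
    return advantages

rewards = np.array([1.0, 0.0, 0.5, 1.0])      # r_0 .. r_{T-1}
values = np.array([0.9, 0.8, 0.7, 0.6, 0.0])  # V(s_0) .. V(s_T); last entry is the bootstrap value
adv = gae_advantages(rewards, values)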