Example #1
class DQNAgent(object):
    
    def __init__(self,sess,env,params):
        self.params=params
        self.xpsize=params['replaymemory']
        self.cnt=0
        self.env=env
        self.sess=sess
        self.current_loss=0
        
#        self.run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
#        self.run_metadata = tf.RunMetadata()
        
        self.last_reward=tf.Variable(0,name="cum_reward",dtype=tf.float32,trainable=False)
        self.last_q=tf.Variable(0,name="cum_q",dtype=tf.float32,trainable=False)
        self.last_rate=tf.Variable(0,name="rate",dtype=tf.float32,trainable=False)
        self.last_steps=tf.Variable(0,name="episode_steps",dtype=tf.float32,trainable=False)
        self.epoche_reward=tf.Variable(0,name="epoche_reward",dtype=tf.float32,trainable=False)
        self.epoche_value=tf.Variable(0,name="epoche_value",dtype=tf.float32,trainable=False)
        self.epoche_maxreward=tf.Variable(0,name="epoche_max_reward",dtype=tf.float32,trainable=False)
        self.eps=params['initexploration']
        self.q_predict=QNet(sess,"prediction",params)
        self.q_target=QNet(sess,"target",params,train=False)
        
        self.initBuffers()
        self.initTraining()
        self.initSummaries()
        
        self.rpm=RPM(params['replaymemory'])
        
#        os.mkdir(self.params['traindir'])
        subdir=datetime.datetime.now().strftime('%d%m%y_%H%M%S')
        self.traindir=os.path.join(params['traindir'], "run_%s"%subdir)
        os.mkdir(self.traindir)
        self.picdir=os.path.join(self.traindir,"pics")
        os.mkdir(self.picdir)
        checkpoint_dir=os.path.join(self.traindir,self.params['checkpoint_dir'])
        os.mkdir(checkpoint_dir)
        
        self.saver = tf.train.Saver()
        
        if params["latest_run"]:
            self.latest_traindir=os.path.join(params['traindir'], "run_%s"%params["latest_run"])
            latest_checkpoint = tf.train.latest_checkpoint(os.path.join(self.latest_traindir,self.params['checkpoint_dir']))
            if latest_checkpoint:
                print("Loading model checkpoint {}...\n".format(latest_checkpoint))
                self.saver.restore(sess, latest_checkpoint)
        
        self.merged = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(self.traindir,sess.graph)
                
        init = tf.global_variables_initializer()
        
        sess.run(init)
        self.q_target.updateWeights(self.q_predict.getWeights())
#        sess.graph.finalize()
    
    def __del__(self):
        self.train_writer.close()
        
    def initTraining(self):
#        self.optimizer = tf.train.RMSPropOptimizer(self.params['learningrate'],self.params['gradientmomentum'],
#                                                   self.params['mingradientmomentum'],1e-6)
        self.optimizer = tf.train.RMSPropOptimizer(self.params['learningrate'],momentum=0.95, epsilon=0.01)
        
        
        self.global_step = tf.Variable(0, trainable=False)
        self.eps_op=tf.train.polynomial_decay(self.params['initexploration'], self.global_step,
                                          self.params['finalexpframe'], self.params['finalexploration'],
                                          power=1)
        
        qpred=self.q_predict.estimateAction()
        qtarget=self.q_target.estimateQGreedy()
        diff=qtarget-qpred
        
#        self.losses = tf.squared_difference(qtarget, qpred) # (r + g*max a' Q_target(s',a')-Q_predict(s,a))
#        self.loss = tf.reduce_mean(self.losses)
        self.loss = tf.reduce_mean(self.td_error(diff))
        
        self.train = self.optimizer.minimize(self.loss,global_step=self.global_step)

    def initBuffers(self):
        self.reward_buffer=deque([])
        self.frame_buffer=deque([])
        self.frame2_buffer=deque([])
        self.action_buffer=deque([])
        self.done_buffer=deque([])
        
    def variable_summaries(self,var):
        with tf.name_scope('summaries'):
            mean = tf.reduce_mean(var)
            tf.summary.scalar('mean', mean)
            with tf.name_scope('stddev'):
                stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
            tf.summary.scalar('stddev', stddev)
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))
            tf.summary.histogram('histogram', var)
            
            
    def initSummaries(self):
        with tf.name_scope("episode_stats"):
            tf.summary.scalar('cum_reward', self.last_reward)
            tf.summary.scalar('steps', self.last_steps)
            tf.summary.scalar('rate', self.last_rate)
        with tf.name_scope("epoche_stats"):
            tf.summary.scalar('epoche_reward', self.epoche_reward)
            tf.summary.scalar('epoche_maxreward', self.epoche_maxreward)
            tf.summary.scalar('epoche_value', self.epoche_value)
        with tf.name_scope("prediction_action"):
            self.variable_summaries(self.q_predict.action_logits)
            tf.summary.histogram('histogram',tf.to_float(self.q_predict.greedy_actions))
        
        with tf.name_scope("target_action"):
            self.variable_summaries(self.q_target.action_logits)
            
        with tf.name_scope("loss"):
            tf.summary.scalar('loss_val',self.loss)
            
        with tf.name_scope("epsilon"):
            tf.summary.scalar('eps_val',self.eps_op)
        
    def addTransition(self,t):
        self.rpm.addTransition(t)
            
    def _sampleTransitionBatch(self,batchsize=32):
        sample=self.rpm.sampleTransition(batchsize=batchsize)
        
        return {self.q_predict.images_placeholder: np.array(sample[0],dtype=np.float32)/255.,
                self.q_predict.action_placeholder: sample[1],
                self.q_target.reward_placeholder: np.clip(sample[2],-1,1),
                self.q_target.images_placeholder: np.array(sample[3],dtype=np.float32)/255.,
                self.q_target.done_placeholder: np.array(sample[4],dtype=np.float32)}
        
    def saveStats(self,reward,steps=0,rate=0):
        ops=[self.last_reward.assign(reward),
             self.last_steps.assign(steps),
             self.last_rate.assign(rate)]
        
        self.sess.run(ops)
#        reward_file=os.path.join(self.traindir, 'rewards.dat')
#        np.savetxt(reward_file,np.array(data))

    
    def epocheStats(self,reward,q,rmax):
        ops=[self.epoche_value.assign(q),
             self.epoche_reward.assign(reward),
             self.epoche_maxreward.assign(rmax)]
        
        self.sess.run(ops)
        
    def td_error(self,x):
        if self.params["huberloss"]:
            # Huber loss: quadratic for |x| < 1, linear beyond
            try:
                # tf.select only exists in TF < 1.0
                return tf.select(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)
            except AttributeError:
                # tf.select was renamed to tf.where in TF >= 1.0
                return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)
        else:
            return tf.square(x)
            
        
    def takeAction(self,state=None,eps_ext=None):
        self.eps=self.eps_op.eval()
        g=0
        
        actions=range(self.params['actionsize'])
        
        if eps_ext is not None:
            if np.random.random()<eps_ext:
                a=np.random.choice(actions)
            else:
                action_index=self.q_predict.estimateActionGreedy(state)
                a=action_index
                g=1
                return a,g
        else:
            if state is None:
                a=np.random.choice(actions)
            else:
                if np.random.random()<self.eps:
                    a=np.random.choice(actions)
                        
                else:
                    action_index=self.q_predict.estimateActionGreedy(state)
                    a=action_index
                    g=1
            
        return a,g
    
    def getLoss(self):
        xp_feed_dict=self._sampleTransitionBatch(batchsize=self.params['batchsize'])
        l=self.sess.run(self.loss,feed_dict=xp_feed_dict)
        return l
        
    def trainNet(self):
        # Note: the target network must be reset every C steps (call resetTarget from the main loop)

        xp_feed_dict=self._sampleTransitionBatch(batchsize=self.params['batchsize'])
        
        self.sess.run([self.train],feed_dict=xp_feed_dict)
        
        
        
        # Create the Timeline object, and write it to a json
#        tl = timeline.Timeline(self.run_metadata.step_stats)
#        ctf = tl.generate_chrome_trace_format()
#        with open('timeline.json', 'w') as f:
#            f.write(ctf)
        
        if self.global_step.eval()%self.params['summary_steps']==0:
            l,summary=self.sess.run([self.loss,self.merged],feed_dict=xp_feed_dict)
            self.current_loss=l
            self.train_writer.add_summary(summary, self.global_step.eval())
        
        if self.global_step.eval()%self.params['checkpoint_steps']==0:
            checkpoint_file = os.path.join(self.traindir,self.params['checkpoint_dir'], 'checkpoint')
            name=self.saver.save(self.sess, checkpoint_file, global_step=self.global_step.eval())
            print("Saving checkpoint: %s"%name)
            
        
        return self.current_loss
        
    def resetTarget(self):
        #reset target weights every C steps; put in main loop
        self.q_target.updateWeights(self.q_predict.getWeights())
        

    def _writeFrame(self,frame,episode,timestep,picdir):
        ep_dir=os.path.join(picdir,"episode_%.5d"%episode)
        if not os.path.exists(ep_dir):
            os.mkdir(ep_dir)
        name = os.path.join(ep_dir,"step_%.4d.png"%timestep)
        cv2.imwrite(name,frame)
        
    def writeFrame(self,frame,episode,timestep):
        self._writeFrame(frame,episode,timestep,self.picdir)
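
A minimal driver sketch for the class above (not part of the original module), assuming a Gym-style env, the module's own QNet/RPM helpers, and a params dict with the keys read in __init__; preprocess, max_steps and target_update_steps are hypothetical names introduced here for illustration.

# Sketch: eps-greedy interaction loop with periodic target resets,
# as suggested by the comments in trainNet() and resetTarget().
with tf.Session() as sess:
    agent = DQNAgent(sess, env, params)
    state = preprocess(env.reset())                # hypothetical frame preprocessing
    for step in range(params['max_steps']):        # hypothetical step budget
        a, greedy = agent.takeAction(state)        # eps-greedy action from the prediction net
        frame, r, done, _ = env.step(a)
        next_state = preprocess(frame)
        agent.addTransition((state, a, r, next_state, done))
        if step >= params['batchsize']:            # wait until the replay memory can fill a batch
            loss = agent.trainNet()                # one optimizer step on a replay batch
            if step % params['target_update_steps'] == 0:   # "every C steps" (hypothetical key)
                agent.resetTarget()                # copy prediction weights into the target net
        state = preprocess(env.reset()) if done else next_state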
Example #2
class DDPGAgent(object):
    def __init__(self, sess, env, params):
        self.params = params
        self.xpsize = params['replaymemory']
        self.cnt = 0
        self.env = env
        self.sess = sess
        self.current_loss = 0

        #        self.run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        #        self.run_metadata = tf.RunMetadata()

        self.last_reward = tf.Variable(0,
                                       name="cum_reward",
                                       dtype=tf.float32,
                                       trainable=False)
        self.last_q = tf.Variable(0,
                                  name="cum_q",
                                  dtype=tf.float32,
                                  trainable=False)
        self.last_rate = tf.Variable(0,
                                     name="rate",
                                     dtype=tf.float32,
                                     trainable=False)
        self.last_steps = tf.Variable(0,
                                      name="episode_steps",
                                      dtype=tf.float32,
                                      trainable=False)
        self.epoche_reward = tf.Variable(0,
                                         name="epoche_reward",
                                         dtype=tf.float32,
                                         trainable=False)
        self.epoche_value = tf.Variable(0,
                                        name="epoche_value",
                                        dtype=tf.float32,
                                        trainable=False)
        self.epoche_maxreward = tf.Variable(0,
                                            name="epoche_max_reward",
                                            dtype=tf.float32,
                                            trainable=False)

        self.eps = params['initexploration']

        self.noise = ou_noise.OUNoise(params["actionsize"])

        self.ac_predict = actor.ActorNet(sess, env, "actor_predict", params)
        self.ac_target = actor.ActorNet(sess,
                                        env,
                                        "actor_target",
                                        params,
                                        train=False)
        self.ac_target.setWeightUpdate(self.ac_predict.params_list)
        #        self.last_action=tf.Variable(params["actionsize"]*[0],name="last_action",dtype=tf.float32,trainable=False)
        #        self.last_action.assign(self.ac_predict.scaled_out)

        self.cr_predict = critic.CriticNet(sess,
                                           "critic_predict",
                                           params,
                                           wd=self.params["weight_decay"])
        self.cr_target = critic.CriticNet(sess,
                                          "critic_target",
                                          params,
                                          train=False)
        self.cr_target.setWeightUpdate(self.cr_predict.params_list)

        self.cr_target.updateInitWeights(self.cr_predict.params_list)
        self.ac_target.updateInitWeights(self.ac_predict.params_list)

        self.initTraining()
        self.initSummaries()

        self.rpm = RPM(params['replaymemory'],
                       frame_shape=params["obssize"],
                       dtype=params["frame_dtype"])

        #        os.mkdir(self.params['traindir'])
        subdir = datetime.datetime.now().strftime('%d%m%y_%H%M%S')
        self.traindir = os.path.join(params['traindir'], "run_%s" % subdir)
        os.mkdir(self.traindir)
        self.picdir = os.path.join(self.traindir, "pics")
        os.mkdir(self.picdir)
        checkpoint_dir = os.path.join(self.traindir,
                                      self.params['checkpoint_dir'])
        os.mkdir(checkpoint_dir)

        self.saver = tf.train.Saver()

        if params["latest_run"]:
            self.latest_traindir = os.path.join(
                params['traindir'], "run_%s" % params["latest_run"])
            latest_checkpoint = tf.train.latest_checkpoint(
                os.path.join(self.latest_traindir,
                             self.params['checkpoint_dir']))
            if latest_checkpoint:
                print("Loading model checkpoint {}...\n".format(
                    latest_checkpoint))
                self.saver.restore(sess, latest_checkpoint)

        self.merged = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(self.traindir, sess.graph)

        init = tf.global_variables_initializer()

        sess.run(init)
        #        self.cr_target.updateInitWeights(self.cr_predict.params_list)
        #        self.ac_target.updateInitWeights(self.ac_predict.params_list)
        self.sess.run(self.cr_target.initholder)
        self.sess.run(self.ac_target.initholder)
#        sess.graph.finalize()

    def __del__(self):
        self.train_writer.close()

    def initTraining(self):
        self.global_step = tf.Variable(0, trainable=False)

        self.optimizer_critic = tf.train.AdamOptimizer(
            self.params['learningrate_critic'])

        cr_pred = self.cr_predict.estimateQ()
        cr_target = self.cr_target.estimateTarget()

        diff = cr_target - cr_pred

        self.loss_critic = tf.reduce_mean(self.td_error(diff))
        tf.add_to_collection('losses', self.loss_critic)
        self.loss_critic = tf.add_n(tf.get_collection('losses'))

        self.train_critic = self.optimizer_critic.minimize(
            self.loss_critic, global_step=self.global_step)

        self.optimizer_actor = tf.train.AdamOptimizer(
            self.params['learningrate_actor'])

        self.train_actor = self.optimizer_actor.apply_gradients(
            zip(self.ac_predict.actor_gradients, self.ac_predict.params_list))

    def variable_summaries(self, var, name='summaries'):
        with tf.name_scope(name.split("/")[-1].split(":")[0]):
            mean = tf.reduce_mean(var)
            tf.summary.scalar('mean', mean)
            with tf.name_scope('stddev'):
                stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
            tf.summary.scalar('stddev', stddev)
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))
            tf.summary.histogram('histogram', var)

    def initSummaries(self):
        with tf.name_scope("episode_stats"):
            tf.summary.scalar('cum_reward', self.last_reward)
            tf.summary.scalar('steps', self.last_steps)
            tf.summary.scalar('rate', self.last_rate)
        with tf.name_scope("epoche_stats"):
            tf.summary.scalar('epoche_reward', self.epoche_reward)
            tf.summary.scalar('epoche_maxreward', self.epoche_maxreward)
            tf.summary.scalar('epoche_value', self.epoche_value)
        with tf.name_scope("critic"):
            self.variable_summaries(self.cr_predict.out)
            tf.summary.histogram('histogram', tf.to_float(self.cr_predict.out))
            self.variable_summaries(self.cr_target.out)
            tf.summary.histogram('histogram', tf.to_float(self.cr_target.out))
#        for w in self.cr_predict.params_list:
#            with tf.name_scope("critic_"+w.name.split("/")[-1].split(":")[0]):
#                self.variable_summaries(w)
#        for w in self.cr_target.params_list:
#            with tf.name_scope("critic_target_"+w.name.split("/")[-1].split(":")[0]):
#                self.variable_summaries(w)
#        for w in self.ac_predict.params_list:
#            with tf.name_scope("actor_"+w.name.split("/")[-1].split(":")[0]):
#                self.variable_summaries(w)
        with tf.name_scope("actor"):
            self.variable_summaries(self.ac_predict.scaled_out)
            tf.summary.histogram('histogram',
                                 tf.to_float(self.ac_predict.scaled_out))
#            self.variable_summaries(self.ac_predict.actor_gradients)
#            tf.summary.histogram('histogram',tf.to_float(self.ac_predict.actor_gradients))
        with tf.name_scope("loss"):
            tf.summary.scalar('loss_critic', self.loss_critic)

    def saveStats(self, reward, steps=0, rate=0):
        ops = [
            self.last_reward.assign(reward),
            self.last_steps.assign(steps),
            self.last_rate.assign(rate)
        ]

        self.sess.run(ops)
#        reward_file=os.path.join(self.traindir, 'rewards.dat')
#        np.savetxt(reward_file,np.array(data))

    def epocheStats(self, reward, q, rmax):
        ops = [
            self.epoche_value.assign(q),
            self.epoche_reward.assign(reward),
            self.epoche_maxreward.assign(rmax)
        ]

        self.sess.run(ops)

    def td_error(self, x):
        if self.params["huberloss"]:
            # Huber loss: quadratic for |x| < 1, linear beyond
            try:
                # tf.select only exists in TF < 1.0
                return tf.select(
                    tf.abs(x) < 1.0, 0.5 * tf.square(x),
                    tf.abs(x) - 0.5)
            except AttributeError:
                # tf.select was renamed to tf.where in TF >= 1.0
                return tf.where(
                    tf.abs(x) < 1.0, 0.5 * tf.square(x),
                    tf.abs(x) - 0.5)
        else:
            return tf.square(x)

    def takeAction(self, state=None, eps_ext=None):
        g = 0

        if state is None:
            a = self.env.action_space.sample()
        else:
            feed = np.expand_dims(state, axis=0)
            a = self.ac_predict.policy(feed) + self.noise.noise()
            g = 1


#            self.last_action.assign(np.mean(a)).op.run()

        return a, g

    def getLoss(self):
        # evaluate the critic loss on a replay batch
        xp_feed_dict = self._sampleCriticBatch()
        l = self.sess.run(self.loss_critic, feed_dict=xp_feed_dict)
        return l

    def addTransition(self, t):
        self.rpm.addTransition(t)

    def _sampleCriticBatch(self):
        sample = self.rpm.sampleTransition(batchsize=self.params["batchsize"])

        actor_actions = self.ac_target.policy(sample[3])

        cr_batch = {
            self.cr_predict.input_placeholder:
            np.array(sample[0], dtype=np.float32),
            self.cr_predict.action_placeholder:
            np.array(sample[1], dtype=np.float32),
            self.cr_target.reward_placeholder:
            np.array(sample[2], dtype=np.float32),
            self.cr_target.input_placeholder:
            np.array(sample[3], dtype=np.float32),
            self.cr_target.action_placeholder:
            actor_actions,
            self.cr_target.done_placeholder:
            sample[4]
        }

        return cr_batch

    def _sampleActorBatch(self, critic_batch):
        actor_actions = self.ac_predict.policy(
            critic_batch[self.cr_predict.input_placeholder])
        critic_grads = self.cr_predict.getGradients({
            self.cr_predict.input_placeholder:
            critic_batch[self.cr_predict.input_placeholder],
            self.cr_predict.action_placeholder:
            actor_actions
        })

        ac_batch = {
            self.ac_predict.input_placeholder:
            critic_batch[self.cr_predict.input_placeholder],
            self.ac_predict.gradients_placeholder:
            critic_grads[0]
        }

        return ac_batch

    def trainNet(self):
        critic_batch = self._sampleCriticBatch()

        self.sess.run([self.train_critic], feed_dict=critic_batch)

        actor_batch = self._sampleActorBatch(critic_batch)

        self.sess.run([self.train_actor], feed_dict=actor_batch)

        # Create the Timeline object, and write it to a json
        #        tl = timeline.Timeline(self.run_metadata.step_stats)
        #        ctf = tl.generate_chrome_trace_format()
        #        with open('timeline.json', 'w') as f:
        #            f.write(ctf)

        if self.global_step.eval() % self.params['summary_steps'] == 0:
            z = critic_batch.copy()
            z.update(actor_batch)
            l, summary = self.sess.run([self.loss_critic, self.merged],
                                       feed_dict=z)
            self.current_loss = l
            self.train_writer.add_summary(summary, self.global_step.eval())

        if self.global_step.eval() % self.params['checkpoint_steps'] == 0:
            checkpoint_file = os.path.join(self.traindir,
                                           self.params['checkpoint_dir'],
                                           'checkpoint')
            name = self.saver.save(self.sess,
                                   checkpoint_file,
                                   global_step=self.global_step.eval())
            print("Saving checkpoint: %s" % name)

        return self.current_loss

    def updateTarget(self):
        self.cr_target.updateWeights()
        self.ac_target.updateWeights()

    def _writeFrame(self, frame, episode, timestep, picdir):
        ep_dir = os.path.join(picdir, "episode_%.5d" % episode)
        if not os.path.exists(ep_dir):
            os.mkdir(ep_dir)
        name = os.path.join(ep_dir, "step_%.4d.png" % timestep)
        cv2.imwrite(name, frame)

    def writeFrame(self, frame, episode, timestep):
        self._writeFrame(frame, episode, timestep, self.picdir)
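
A corresponding driver sketch for DDPGAgent (not part of the original module), assuming a Gym-style continuous-control env and a params dict with the keys read in __init__; max_steps is a hypothetical key introduced here for illustration.

# Sketch: actor-critic interaction loop with OU exploration noise and
# per-step soft target updates via updateTarget().
with tf.Session() as sess:
    agent = DDPGAgent(sess, env, params)
    state = env.reset()
    for step in range(params['max_steps']):        # hypothetical step budget
        a, greedy = agent.takeAction(state)        # actor output plus OU noise, shape (1, actionsize)
        next_state, r, done, _ = env.step(a[0])
        agent.addTransition((state, a[0], r, next_state, done))
        if step >= params['batchsize']:            # wait until the replay memory can fill a batch
            loss = agent.trainNet()                # critic update, then actor update
            agent.updateTarget()                   # soft-update both target networks
        state = env.reset() if done else next_state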