# Standard and third-party imports needed by both agents below. The local
# module paths for QNet, RPM, the actor/critic networks and the OU noise
# are assumptions -- adjust them to this repository's actual layout.
import datetime
import os
from collections import deque

import cv2
import numpy as np
import tensorflow as tf

from qnet import QNet      # assumed location of the Q-network class
from rpm import RPM        # assumed location of the replay memory class
import actor               # assumed locations of the DDPG actor/critic nets
import critic
import ou_noise            # assumed location of the Ornstein-Uhlenbeck noise


class DQNAgent(object):
    def __init__(self, sess, env, params):
        self.params = params
        self.xpsize = params['replaymemory']
        self.cnt = 0
        self.env = env
        self.sess = sess
        self.current_loss = 0
        # self.run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        # self.run_metadata = tf.RunMetadata()

        # Non-trainable variables used purely for TensorBoard bookkeeping.
        self.last_reward = tf.Variable(0, name="cum_reward", dtype=tf.float32, trainable=False)
        self.last_q = tf.Variable(0, name="cum_q", dtype=tf.float32, trainable=False)
        self.last_rate = tf.Variable(0, name="rate", dtype=tf.float32, trainable=False)
        self.last_steps = tf.Variable(0, name="episode_steps", dtype=tf.float32, trainable=False)
        self.epoche_reward = tf.Variable(0, name="epoche_reward", dtype=tf.float32, trainable=False)
        self.epoche_value = tf.Variable(0, name="epoche_value", dtype=tf.float32, trainable=False)
        self.epoche_maxreward = tf.Variable(0, name="epoche_max_reward", dtype=tf.float32, trainable=False)
        self.eps = params['initexploration']

        # Online (prediction) and frozen (target) Q-networks.
        self.q_predict = QNet(sess, "prediction", params)
        self.q_target = QNet(sess, "target", params, train=False)

        self.initBuffers()
        self.initTraining()
        self.initSummaries()
        self.rpm = RPM(params['replaymemory'])

        # os.mkdir(self.params['traindir'])
        subdir = datetime.datetime.now().strftime('%d%m%y_%H%M%S')
        self.traindir = os.path.join(params['traindir'], "run_%s" % subdir)
        os.mkdir(self.traindir)
        self.picdir = os.path.join(self.traindir, "pics")
        os.mkdir(self.picdir)
        checkpoint_dir = os.path.join(self.traindir, self.params['checkpoint_dir'])
        os.mkdir(checkpoint_dir)

        self.saver = tf.train.Saver()
        if params["latest_run"]:
            self.latest_traindir = os.path.join(params['traindir'], "run_%s" % params["latest_run"])
            latest_checkpoint = tf.train.latest_checkpoint(
                os.path.join(self.latest_traindir, self.params['checkpoint_dir']))
            if latest_checkpoint:
                print("Loading model checkpoint {}...\n".format(latest_checkpoint))
                self.saver.restore(sess, latest_checkpoint)

        self.merged = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(self.traindir, sess.graph)
        init = tf.global_variables_initializer()
        sess.run(init)
        # Start with identical online and target weights.
        self.q_target.updateWeights(self.q_predict.getWeights())
        # sess.graph.finalize()

    def __del__(self):
        self.train_writer.close()

    def initTraining(self):
        # self.optimizer = tf.train.RMSPropOptimizer(self.params['learningrate'], self.params['gradientmomentum'],
        #                                            self.params['mingradientmomentum'], 1e-6)
        self.optimizer = tf.train.RMSPropOptimizer(self.params['learningrate'], momentum=0.95, epsilon=0.01)
        self.global_step = tf.Variable(0, trainable=False)
        # Linear epsilon annealing from initexploration down to
        # finalexploration over finalexpframe steps (power=1).
        self.eps_op = tf.train.polynomial_decay(self.params['initexploration'],
                                                self.global_step,
                                                self.params['finalexpframe'],
                                                self.params['finalexploration'],
                                                power=1)
        qpred = self.q_predict.estimateAction()
        qtarget = self.q_target.estimateQGreedy()
        # TD error: (r + gamma * max_a' Q_target(s', a')) - Q_predict(s, a)
        diff = qtarget - qpred
        # self.losses = tf.squared_difference(qtarget, qpred)
        # self.loss = tf.reduce_mean(self.losses)
        self.loss = tf.reduce_mean(self.td_error(diff))
        self.train = self.optimizer.minimize(self.loss, global_step=self.global_step)

    def initBuffers(self):
        self.reward_buffer = deque([])
        self.frame_buffer = deque([])
        self.frame2_buffer = deque([])
        self.action_buffer = deque([])
        self.done_buffer = deque([])

    def variable_summaries(self, var):
        with tf.name_scope('summaries'):
            mean = tf.reduce_mean(var)
            tf.summary.scalar('mean', mean)
            with tf.name_scope('stddev'):
                stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
            tf.summary.scalar('stddev', stddev)
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))
            tf.summary.histogram('histogram', var)

    def initSummaries(self):
        with tf.name_scope("episode_stats"):
            tf.summary.scalar('cum_reward', self.last_reward)
            tf.summary.scalar('steps', self.last_steps)
            tf.summary.scalar('rate', self.last_rate)
        with tf.name_scope("epoche_stats"):
            tf.summary.scalar('epoche_reward', self.epoche_reward)
            tf.summary.scalar('epoche_maxreward', self.epoche_maxreward)
            tf.summary.scalar('epoche_value', self.epoche_value)
        with tf.name_scope("prediction_action"):
            self.variable_summaries(self.q_predict.action_logits)
            tf.summary.histogram('histogram', tf.to_float(self.q_predict.greedy_actions))
        with tf.name_scope("target_action"):
            self.variable_summaries(self.q_target.action_logits)
        with tf.name_scope("loss"):
            tf.summary.scalar('loss_val', self.loss)
        with tf.name_scope("epsilon"):
            tf.summary.scalar('eps_val', self.eps_op)

    def addTransition(self, t):
        self.rpm.addTransition(t)

    def _sampleTransitionBatch(self, batchsize=32):
        sample = self.rpm.sampleTransition(batchsize=batchsize)
        # Frames are stored as uint8; scale to [0, 1]. Rewards are clipped
        # to [-1, 1] as in the DQN paper.
        return {self.q_predict.images_placeholder: np.array(sample[0], dtype=np.float32) / 255.,
                self.q_predict.action_placeholder: sample[1],
                self.q_target.reward_placeholder: np.clip(sample[2], -1, 1),
                self.q_target.images_placeholder: np.array(sample[3], dtype=np.float32) / 255.,
                self.q_target.done_placeholder: np.array(sample[4], dtype=np.float32)}

    def saveStats(self, reward, steps=0, rate=0):
        ops = [self.last_reward.assign(reward),
               self.last_steps.assign(steps),
               self.last_rate.assign(rate)]
        self.sess.run(ops)
        # reward_file = os.path.join(self.traindir, 'rewards.dat')
        # np.savetxt(reward_file, np.array(data))

    def epocheStats(self, reward, q, rmax):
        ops = [self.epoche_value.assign(q),
               self.epoche_reward.assign(reward),
               self.epoche_maxreward.assign(rmax)]
        self.sess.run(ops)

    def td_error(self, x):
        if self.params["huberloss"]:
            # Huber loss: quadratic for |x| < 1, linear beyond. tf.select was
            # renamed to tf.where in later TF releases, hence the fallback.
            try:
                return tf.select(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)
            except AttributeError:
                return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)
        else:
            return tf.square(x)

    def takeAction(self, state=None, eps_ext=None):
        self.eps = self.eps_op.eval()
        g = 0
        actions = list(range(self.params['actionsize']))
        if eps_ext is not None:
            # Externally supplied exploration rate (e.g. for evaluation runs).
            if np.random.random() < eps_ext:
                a = np.random.choice(actions)
            else:
                a = self.q_predict.estimateActionGreedy(state)
                g = 1
            return a, g
        else:
            if state is None:
                a = np.random.choice(actions)
            else:
                # Epsilon-greedy with the annealed exploration rate.
                if np.random.random() < self.eps:
                    a = np.random.choice(actions)
                else:
                    a = self.q_predict.estimateActionGreedy(state)
                    g = 1
            return a, g

    def getLoss(self):
        xp_feed_dict = self._sampleTransitionBatch(batchsize=self.params['batchsize'])
        return self.sess.run(self.loss, feed_dict=xp_feed_dict)

    def trainNet(self):
        # Note: the target network is frozen here; call resetTarget() every C
        # steps from the main loop to copy over the online weights.
        xp_feed_dict = self._sampleTransitionBatch(batchsize=self.params['batchsize'])
        self.sess.run([self.train], feed_dict=xp_feed_dict)
        # Create the Timeline object, and write it to a json
        # tl = timeline.Timeline(self.run_metadata.step_stats)
        # ctf = tl.generate_chrome_trace_format()
        # with open('timeline.json', 'w') as f:
        #     f.write(ctf)
        if self.global_step.eval() % self.params['summary_steps'] == 0:
            l, summary = self.sess.run([self.loss, self.merged], feed_dict=xp_feed_dict)
            self.current_loss = l
            self.train_writer.add_summary(summary, self.global_step.eval())
        if self.global_step.eval() % self.params['checkpoint_steps'] == 0:
            checkpoint_file = os.path.join(self.traindir, self.params['checkpoint_dir'], 'checkpoint')
            name = self.saver.save(self.sess, checkpoint_file, global_step=self.global_step.eval())
            print("Saving checkpoint: %s" % name)
        return self.current_loss

    def resetTarget(self):
        # Reset the target weights every C steps; call from the main loop.
        self.q_target.updateWeights(self.q_predict.getWeights())

    def _writeFrame(self, frame, episode, timestep, picdir):
        ep_dir = os.path.join(picdir, "episode_%.5d" % episode)
        if not os.path.exists(ep_dir):
            os.mkdir(ep_dir)
        name = os.path.join(ep_dir, "step_%.4d.png" % timestep)
        cv2.imwrite(name, frame)

    def writeFrame(self, frame, episode, timestep):
        self._writeFrame(frame, episode, timestep, self.picdir)
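
# Hedged usage sketch (not part of the original file): how DQNAgent is meant
# to be driven from a main loop, assuming a Gym-style `env` and a `params`
# dict with the keys referenced above. 'max_steps' and 'target_update_steps'
# (the C-step target reset interval) are hypothetical key names.
#
#   sess = tf.Session()
#   agent = DQNAgent(sess, env, params)
#   state = env.reset()
#   for step in range(params['max_steps']):
#       action, greedy = agent.takeAction(state)
#       next_state, reward, done, _ = env.step(action)
#       agent.addTransition((state, action, reward, next_state, done))
#       agent.trainNet()
#       if step % params['target_update_steps'] == 0:
#           agent.resetTarget()  # hard-copy online weights into the target net
#       state = env.reset() if done else next_state
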
class DDPGAgent(object):
    def __init__(self, sess, env, params):
        self.params = params
        self.xpsize = params['replaymemory']
        self.cnt = 0
        self.env = env
        self.sess = sess
        self.current_loss = 0
        # self.run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        # self.run_metadata = tf.RunMetadata()

        # Non-trainable variables used purely for TensorBoard bookkeeping.
        self.last_reward = tf.Variable(0, name="cum_reward", dtype=tf.float32, trainable=False)
        self.last_q = tf.Variable(0, name="cum_q", dtype=tf.float32, trainable=False)
        self.last_rate = tf.Variable(0, name="rate", dtype=tf.float32, trainable=False)
        self.last_steps = tf.Variable(0, name="episode_steps", dtype=tf.float32, trainable=False)
        self.epoche_reward = tf.Variable(0, name="epoche_reward", dtype=tf.float32, trainable=False)
        self.epoche_value = tf.Variable(0, name="epoche_value", dtype=tf.float32, trainable=False)
        self.epoche_maxreward = tf.Variable(0, name="epoche_max_reward", dtype=tf.float32, trainable=False)
        self.eps = params['initexploration']

        # Ornstein-Uhlenbeck process for exploration in continuous action space.
        self.noise = ou_noise.OUNoise(params["actionsize"])

        # Online and target actor networks; the target tracks the online
        # network via soft weight updates.
        self.ac_predict = actor.ActorNet(sess, env, "actor_predict", params)
        self.ac_target = actor.ActorNet(sess, env, "actor_target", params, train=False)
        self.ac_target.setWeightUpdate(self.ac_predict.params_list)
        # self.last_action = tf.Variable(params["actionsize"] * [0], name="last_action", dtype=tf.float32, trainable=False)
        # self.last_action.assign(self.ac_predict.scaled_out)

        # Online and target critic networks (critic gets weight decay).
        self.cr_predict = critic.CriticNet(sess, "critic_predict", params, wd=self.params["weight_decay"])
        self.cr_target = critic.CriticNet(sess, "critic_target", params, train=False)
        self.cr_target.setWeightUpdate(self.cr_predict.params_list)

        self.cr_target.updateInitWeights(self.cr_predict.params_list)
        self.ac_target.updateInitWeights(self.ac_predict.params_list)

        self.initTraining()
        self.initSummaries()
        self.rpm = RPM(params['replaymemory'], frame_shape=params["obssize"], dtype=params["frame_dtype"])

        # os.mkdir(self.params['traindir'])
        subdir = datetime.datetime.now().strftime('%d%m%y_%H%M%S')
        self.traindir = os.path.join(params['traindir'], "run_%s" % subdir)
        os.mkdir(self.traindir)
        self.picdir = os.path.join(self.traindir, "pics")
        os.mkdir(self.picdir)
        checkpoint_dir = os.path.join(self.traindir, self.params['checkpoint_dir'])
        os.mkdir(checkpoint_dir)

        self.saver = tf.train.Saver()
        if params["latest_run"]:
            self.latest_traindir = os.path.join(params['traindir'], "run_%s" % params["latest_run"])
            latest_checkpoint = tf.train.latest_checkpoint(
                os.path.join(self.latest_traindir, self.params['checkpoint_dir']))
            if latest_checkpoint:
                print("Loading model checkpoint {}...\n".format(latest_checkpoint))
                self.saver.restore(sess, latest_checkpoint)

        self.merged = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(self.traindir, sess.graph)
        init = tf.global_variables_initializer()
        sess.run(init)
        # self.cr_target.updateInitWeights(self.cr_predict.params_list)
        # self.ac_target.updateInitWeights(self.ac_predict.params_list)
        # Hard-copy the online weights into both targets once after init.
        self.sess.run(self.cr_target.initholder)
        self.sess.run(self.ac_target.initholder)
        # sess.graph.finalize()

    def __del__(self):
        self.train_writer.close()

    def initTraining(self):
        self.global_step = tf.Variable(0, trainable=False)

        # Critic: minimize the TD error plus any weight-decay terms the
        # critic network added to the 'losses' collection.
        self.optimizer_critic = tf.train.AdamOptimizer(self.params['learningrate_critic'])
        cr_pred = self.cr_predict.estimateQ()
        cr_target = self.cr_target.estimateTarget()
        diff = cr_target - cr_pred
        self.loss_critic = tf.reduce_mean(self.td_error(diff))
        tf.add_to_collection('losses', self.loss_critic)
        self.loss_critic = tf.add_n(tf.get_collection('losses'))
        self.train_critic = self.optimizer_critic.minimize(self.loss_critic, global_step=self.global_step)

        # Actor: apply the deterministic policy gradient dQ/da * da/dtheta.
        self.optimizer_actor = tf.train.AdamOptimizer(self.params['learningrate_actor'])
        self.train_actor = self.optimizer_actor.apply_gradients(
            zip(self.ac_predict.actor_gradients, self.ac_predict.params_list))

    def variable_summaries(self, var, name='summaries'):
        # Derive the scope from the (tensor) name so callers can pass weights.
        with tf.name_scope(name.split("/")[-1].split(":")[0]):
            mean = tf.reduce_mean(var)
            tf.summary.scalar('mean', mean)
            with tf.name_scope('stddev'):
                stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
            tf.summary.scalar('stddev', stddev)
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))
            tf.summary.histogram('histogram', var)

    def initSummaries(self):
        with tf.name_scope("episode_stats"):
            tf.summary.scalar('cum_reward', self.last_reward)
            tf.summary.scalar('steps', self.last_steps)
            tf.summary.scalar('rate', self.last_rate)
        with tf.name_scope("epoche_stats"):
            tf.summary.scalar('epoche_reward', self.epoche_reward)
            tf.summary.scalar('epoche_maxreward', self.epoche_maxreward)
            tf.summary.scalar('epoche_value', self.epoche_value)
        with tf.name_scope("critic"):
            self.variable_summaries(self.cr_predict.out)
            tf.summary.histogram('histogram', tf.to_float(self.cr_predict.out))
            self.variable_summaries(self.cr_target.out)
            tf.summary.histogram('histogram', tf.to_float(self.cr_target.out))
        # for w in self.cr_predict.params_list:
        #     with tf.name_scope("critic_" + w.name.split("/")[-1].split(":")[0]):
        #         self.variable_summaries(w)
        # for w in self.cr_target.params_list:
        #     with tf.name_scope("critic_target_" + w.name.split("/")[-1].split(":")[0]):
        #         self.variable_summaries(w)
        # for w in self.ac_predict.params_list:
        #     with tf.name_scope("actor_" + w.name.split("/")[-1].split(":")[0]):
        #         self.variable_summaries(w)
        with tf.name_scope("actor"):
            self.variable_summaries(self.ac_predict.scaled_out)
            tf.summary.histogram('histogram', tf.to_float(self.ac_predict.scaled_out))
            # self.variable_summaries(self.ac_predict.actor_gradients)
            # tf.summary.histogram('histogram', tf.to_float(self.ac_predict.actor_gradients))
        with tf.name_scope("loss"):
            tf.summary.scalar('loss_critic', self.loss_critic)

    def saveStats(self, reward, steps=0, rate=0):
        ops = [self.last_reward.assign(reward),
               self.last_steps.assign(steps),
               self.last_rate.assign(rate)]
        self.sess.run(ops)
        # reward_file = os.path.join(self.traindir, 'rewards.dat')
        # np.savetxt(reward_file, np.array(data))

    def epocheStats(self, reward, q, rmax):
        ops = [self.epoche_value.assign(q),
               self.epoche_reward.assign(reward),
               self.epoche_maxreward.assign(rmax)]
        self.sess.run(ops)

    def td_error(self, x):
        if self.params["huberloss"]:
            # Huber loss; tf.select was renamed to tf.where in later TF releases.
            try:
                return tf.select(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)
            except AttributeError:
                return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)
        else:
            return tf.square(x)

    def takeAction(self, state=None, eps_ext=None):
        g = 0
        if state is None:
            a = self.env.action_space.sample()
        else:
            feed = np.expand_dims(state, axis=0)
            # Deterministic policy plus Ornstein-Uhlenbeck exploration noise.
            a = self.ac_predict.policy(feed) + self.noise.noise()
            g = 1
        # self.last_action.assign(np.mean(a)).op.run()
        return a, g

    def getLoss(self):
        xp_feed_dict = self._sampleCriticBatch()
        return self.sess.run(self.loss_critic, feed_dict=xp_feed_dict)

    def addTransition(self, t):
        self.rpm.addTransition(t)

    def _sampleCriticBatch(self):
        sample = self.rpm.sampleTransition(batchsize=self.params["batchsize"])
        # Target actions come from the target actor: mu'(s').
        actor_actions = self.ac_target.policy(sample[3])
        cr_batch = {
            self.cr_predict.input_placeholder: np.array(sample[0], dtype=np.float32),
            self.cr_predict.action_placeholder: np.array(sample[1], dtype=np.float32),
            self.cr_target.reward_placeholder: np.array(sample[2], dtype=np.float32),
            self.cr_target.input_placeholder: np.array(sample[3], dtype=np.float32),
            self.cr_target.action_placeholder: actor_actions,
            self.cr_target.done_placeholder: sample[4]
        }
        return cr_batch

    def _sampleActorBatch(self, critic_batch):
        # Online-actor actions for the sampled states, then the critic's
        # action gradients dQ/da evaluated at those actions.
        actor_actions = self.ac_predict.policy(critic_batch[self.cr_predict.input_placeholder])
        critic_grads = self.cr_predict.getGradients({
            self.cr_predict.input_placeholder: critic_batch[self.cr_predict.input_placeholder],
            self.cr_predict.action_placeholder: actor_actions
        })
        ac_batch = {
            self.ac_predict.input_placeholder: critic_batch[self.cr_predict.input_placeholder],
            self.ac_predict.gradients_placeholder: critic_grads[0]
        }
        return ac_batch

    def trainNet(self):
        critic_batch = self._sampleCriticBatch()
        self.sess.run([self.train_critic], feed_dict=critic_batch)
        actor_batch = self._sampleActorBatch(critic_batch)
        self.sess.run([self.train_actor], feed_dict=actor_batch)
        # Create the Timeline object, and write it to a json
        # tl = timeline.Timeline(self.run_metadata.step_stats)
        # ctf = tl.generate_chrome_trace_format()
        # with open('timeline.json', 'w') as f:
        #     f.write(ctf)
        if self.global_step.eval() % self.params['summary_steps'] == 0:
            # The merged summaries need both the critic and the actor feeds.
            z = critic_batch.copy()
            z.update(actor_batch)
            l, summary = self.sess.run([self.loss_critic, self.merged], feed_dict=z)
            self.current_loss = l
            self.train_writer.add_summary(summary, self.global_step.eval())
        if self.global_step.eval() % self.params['checkpoint_steps'] == 0:
            checkpoint_file = os.path.join(self.traindir, self.params['checkpoint_dir'], 'checkpoint')
            name = self.saver.save(self.sess, checkpoint_file, global_step=self.global_step.eval())
            print("Saving checkpoint: %s" % name)
        return self.current_loss

    def updateTarget(self):
        # Soft-update both target networks toward their online counterparts.
        self.cr_target.updateWeights()
        self.ac_target.updateWeights()

    def _writeFrame(self, frame, episode, timestep, picdir):
        ep_dir = os.path.join(picdir, "episode_%.5d" % episode)
        if not os.path.exists(ep_dir):
            os.mkdir(ep_dir)
        name = os.path.join(ep_dir, "step_%.4d.png" % timestep)
        cv2.imwrite(name, frame)

    def writeFrame(self, frame, episode, timestep):
        self._writeFrame(frame, episode, timestep, self.picdir)
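
# Hedged usage sketch (not part of the original file): the DDPG loop differs
# from DQN in that the target networks are soft-updated after every training
# step rather than reset every C steps. The `action[0]` indexing assumes the
# actor returns a batch of size 1; 'max_steps' is a hypothetical key name.
#
#   sess = tf.Session()
#   agent = DDPGAgent(sess, env, params)
#   state = env.reset()
#   for step in range(params['max_steps']):
#       action, greedy = agent.takeAction(state)
#       next_state, reward, done, _ = env.step(action[0])
#       agent.addTransition((state, action[0], reward, next_state, done))
#       agent.trainNet()
#       agent.updateTarget()  # soft update: theta' <- tau*theta + (1-tau)*theta'
#       state = env.reset() if done else next_state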