                         num_masks=0, device=info['DEVICE'])
if args.model_loadpath != '':
    # what about random states - they will be wrong now???
    # TODO - what about target net update cnt
    target_net.load_state_dict(model_dict['target_net_state_dict'])
    policy_net.load_state_dict(model_dict['policy_net_state_dict'])
    opt.load_state_dict(model_dict['optimizer'])
    print("loaded model state_dicts")
    # TODO cant load buffer yet
    if args.buffer_loadpath == '':
        args.buffer_loadpath = args.model_loadpath.replace(
            '.pkl', '_train_buffer.pkl')
        print("auto loading buffer from:%s" % args.buffer_loadpath)
        rbuffer.load(args.buffer_loadpath)

info['args'] = args
write_info_file(info, model_base_filepath, total_steps)
random_state = np.random.RandomState(info["SEED"])
board_logger = TensorBoardLogger(model_base_filedir)
last_target_update = 0
print("Starting training")
all_rewards = []

# exponential epsilon decay from EPSILON_MAX toward EPSILON_MIN
epsilon_by_frame = lambda frame_idx: info['EPSILON_MIN'] + (
    info['EPSILON_MAX'] - info['EPSILON_MIN']) * math.exp(
        -1. * frame_idx / info['EPSILON_DECAY'])

for epoch_num in range(epoch_start, info['N_EPOCHS']):
    ep_reward, total_steps, etime = run_training_episode(
        epoch_num, total_steps)
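
# A standalone sketch (not part of the script above) of how the exponential
# epsilon schedule behaves. The EPSILON_MIN/EPSILON_MAX/EPSILON_DECAY values
# here are illustrative assumptions, not this script's configured settings.
import math

def epsilon_by_frame_demo(frame_idx, eps_min=0.01, eps_max=1.0, decay=30000):
    # same shape as the lambda above: starts at eps_max and decays
    # exponentially toward eps_min as frame_idx grows
    return eps_min + (eps_max - eps_min) * math.exp(-1. * frame_idx / decay)

# epsilon_by_frame_demo(0) == 1.0, ~0.374 after 30k frames, -> 0.01 in the limit
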
class dqnRunner():
    def __init__(self, sess, params, out_dir=None, agentB_sess=None):
        self.params = params
        self.sess = sess
        self.agentB_sess = agentB_sess
        self.lock = threading.Lock()
        self.modelStoreIntv = 150
        self.bufferStoreIntv = 150
        self.annealSteps = params['annealSteps']
        self.state_dim = params['pxRes']

        if self.params['verbose']:
            printT("tensorflow version: {}".format(tf.__version__))

        # create environment
        self.env = Environment(sess, params, self)
        self.numActions = self.env.numActions

        # load classifier for reward calculation
        if self.params['classNN'] is not None:
            with tf.device("/device:CPU:0"):
                self.rewardClassNet = ClassConvNetEval(self.sess, params)
            self.env.rewardClassNet = self.rewardClassNet

        # just gets or resets global_step
        self.global_step = None
        variables = tf.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
        for v in variables:
            if "global_step" in v.name:
                self.global_step = v
        if self.global_step is None:
            self.global_step = tf.Variable(0, name='global_step',
                                           trainable=False)
        self.resetGlStep = tf.assign(self.global_step, 0)

        # load actual dqn
        self.q = DQN(self.sess, self.params['out_dir'], self.global_step,
                     self.params, self.numActions)

        self.evalMethods = ["agent", "random"]
        self.evalMethod = "agent"
        self.qAgentB = None
        if (self.params['agentB'] is not None) and self.params['interEval']:
            self.qAgentB = DQN(self.agentB_sess, self.params['out_dir'],
                               self.global_step, self.params,
                               self.numActions, agentB=True)
            self.evalMethod = "agentA"
            self.evalMethods = ["agentA", "random", "fixed", "agentB"]

        self.sess.as_default()

        # replay buffer (size and type)
        if self.params['replaySz'] is None:
            self.replayBufferSize = 1000000
        else:
            self.replayBufferSize = self.params['replaySz']
        self.replay = ReplayBuffer(self.replayBufferSize)

        # variables for exploration decay
        self.action_step = tf.Variable(0, name='action_step',
                                       trainable=False, dtype=tf.int32)
        self.increment_ac_step_op = tf.assign(self.action_step,
                                              self.action_step + 1)
        self.global_action_step = tf.Variable(0, name='global_action_step',
                                              trainable=False,
                                              dtype=tf.int32)
        self.increment_gac_step_op = tf.assign(self.global_action_step,
                                               self.global_action_step + 1)
        self.episode_step = tf.Variable(0, name='episode_step',
                                        trainable=False, dtype=tf.int32)
        self.increment_ep_step_op = tf.assign(self.episode_step,
                                              self.episode_step + 1)
        self.resetEpStep = tf.assign(self.episode_step, 0)
        self.resetAcStep = tf.assign(self.action_step, 0)
        self.resetGAcStep = tf.assign(self.global_action_step, 0)

        # save state
        self.saver = tf.train.Saver(
            max_to_keep=self.params['keepNewestModels'])

        fn = os.path.join(self.params['out_dir'], "mainLoopTime.txt")
        self.mainLoopTimeFile = open(fn, "a")
        fn_ = os.path.join(self.params['out_dir'], "learnLoopTime.txt")
        self.learnLoopTimeFile = open(fn_, "a")

    # main function, runs the learning process
    def run(self):
        # debugging variables, for tensorboard
        if self.params['evaluation']:
            # evaluation episodes, no exploration
            eval_reward = tf.Variable(0., name="evalReward")
            eval_reward_op = tf.summary.scalar("Eval-Reward", eval_reward)
            eval_disc_reward = tf.Variable(0., name="evalDiscReward")
            eval_disc_reward_op = tf.summary.scalar(
                "Eval-Reward_discounted", eval_disc_reward)
            eval_stepCount = tf.Variable(0., name="evalStepCount")
            eval_stepCount_op = tf.summary.scalar("Eval-StepCount",
                                                  eval_stepCount)
            eval_sum_vars = [eval_reward, eval_disc_reward, eval_stepCount]
            eval_sum_op = tf.summary.merge(
                [eval_reward_op, eval_disc_reward_op, eval_stepCount_op])

        # (discounted) reward per episode
        episode_reward = tf.Variable(0., name="episodeReward")
        episode_reward_op = tf.summary.scalar("Reward", episode_reward)
        episode_disc_reward = tf.Variable(0., name="episodeDiscReward")
        episode_disc_reward_op = tf.summary.scalar("Reward_discounted",
                                                   episode_disc_reward)

        # average (max q)
        episode_ave_max_q = tf.Variable(0., name='episodeAvgMaxQ')
        episode_ave_max_q_op = tf.summary.scalar("Qmax_Value",
                                                 episode_ave_max_q)

        # number of steps for episode
        stepCount = tf.Variable(0., name="stepCount")
        stepCount_op = tf.summary.scalar("StepCount", stepCount)

        # number of learning iterations (total number of mini batches so far)
        global_step_op = tf.summary.scalar("GlobalStep", self.global_step)

        # current exploration epsilon
        epsilonVar = tf.Variable(0., name="epsilon")
        epsilonVar_op = tf.summary.scalar("Epsilon", epsilonVar)

        summary_vars = [
            episode_reward, episode_disc_reward, episode_ave_max_q,
            stepCount, epsilonVar
        ]
        summary_ops = tf.summary.merge([
            episode_reward_op, episode_disc_reward_op,
            episode_ave_max_q_op, stepCount_op, epsilonVar_op
        ])

        self.writer = tf.summary.FileWriter(
            os.path.join(self.params['out_dir'], "train"), self.sess.graph)

        self.action_vars = []
        self.action_ops = []
        for a in range(self.numActions):
            action = tf.Variable(0., name="qval_action_" + str(a))
            action_op = tf.summary.scalar("Q-Value_Action_" + str(a), action)
            self.action_vars.append(action)
            self.action_ops.append(action_op)
        self.action_ops = tf.summary.merge(self.action_ops)

        # initialize all tensorflow variables
        # and finalize graph (cannot be modified anymore)
        self.sess.run(tf.initialize_all_variables())
        self.sess.graph.finalize()

        # for debugging, variable values before and after
        if self.params['veryveryverbose']:
            variables = tf.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
                                          scope="DQN")
            for v in variables:
                if v.name.endswith("conv1_2/weights:0"):
                    print(v.name, self.sess.run(v))

        # do we want to use pretrained weights for the dqn
        # from the classifier or a pretrained agent?
        if self.params['resume']:
            pass
        elif self.params['useClassNN']:
            print("restoring dqn net from classNN: {}".format(
                self.params['classNN']))
            if "ckpt" in self.params['classNN']:
                self.q.saver.restore(self.sess, self.params['classNN'])
            else:
                self.q.saver.restore(
                    self.sess,
                    tf.train.latest_checkpoint(self.params['classNN']))
        elif self.params['dqnNN'] is not None:
            print("restoring dqn net from dqnNN: {}".format(
                self.params['dqnNN']))
            if "ckpt" in self.params['dqnNN']:
                self.q.saver.restore(self.sess, self.params['dqnNN'])
            else:
                self.q.saver.restore(
                    self.sess,
                    tf.train.latest_checkpoint(self.params['dqnNN']))

        # main network weights are set, now run target init op
        self.sess.run(self.q.target_nn_init_op)

        if (self.params['agentB'] is not None) and self.params['interEval']:
            print("restoring agentB net from {}".format(
                self.params['agentB']))
            if "ckpt" in self.params['agentB']:
                self.qAgentB.saver.restore(self.agentB_sess,
                                           self.params['agentB'])
            else:
                self.qAgentB.saver.restore(
                    self.agentB_sess,
                    tf.train.latest_checkpoint(self.params['agentB']))

        # for debugging, variable values before and after
        if self.params['veryveryverbose']:
            variables = tf.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
                                          scope="DQN")
            for v in variables:
                if v.name.endswith("conv1_2/weights:0"):
                    print(v.name, self.sess.run(v))

        print("initialize classifier network")
        if self.params['classNN'] is not None:
            print("restoring reward class net from classNN: {}".format(
                self.params['classNN']))
            if "ckpt" in self.params['classNN']:
                self.rewardClassNet.saver.restore(self.sess,
                                                  self.params['classNN'])
            else:
                self.rewardClassNet.saver.restore(
                    self.sess,
                    tf.train.latest_checkpoint(self.params['classNN']))

        # load previously trained model
        if not self.params['resume'] and self.params['loadModel']:
            if "ckpt" in self.params['loadModel']:
                self.saver.restore(self.sess, self.params['loadModel'])
            else:
                self.saver.restore(
                    self.sess,
                    tf.train.latest_checkpoint(self.params['loadModel']))
            printT("Model {} restored.".format(self.params['loadModel']))

        # load previously filled replay buffer
        if not self.params['resume'] and \
                self.params['loadReplay'] is not None:
            self.replay.load(self.params['loadReplay'])
            printT("Buffer {} restored.".format(self.params['loadReplay']))

        # resume old run
        if self.params['resume']:
            self.saver.restore(
                self.sess,
                tf.train.latest_checkpoint(
                    os.path.join(self.params['out_dir'], "models")))
            printT("Model {} restored.".format(
                tf.train.latest_checkpoint(
                    os.path.join(self.params['out_dir'], "models"))))
            # if not self.params['interEval']:
            self.replay.load(
                os.path.join(self.params['out_dir'], "replayBuffer"))
            printT("Buffer {} restored.".format(self.params['out_dir']))
        else:
            self.sess.run(self.resetGlStep)

        # start immediately for interactive test runs
        try:
            if os.environ['IS_INTERACTIVE'] == 'true' \
                    and not self.params['sleep']:
                self.params['startLearning'] = 1
        except KeyError:
            pass

        # exploration variables
        self.startEpsilon = self.params['epsilonStart']
        self.endEpsilon = self.params['epsilonStop']
        self.epsilon = self.sess.run(epsilonVar)

        # evaluation/learning/exploration flags
        self.evalEp = False
        self.learning = True
        self.pauseLearning = False
        self.pauseExploring = False
        self.stopLearning = False
        self.stopExploring = False

        self.qValFileExpl = open(
            os.path.join(self.params['out_dir'], "qValExpl.txt"), "a")
        self.qValFileEval = open(
            os.path.join(self.params['out_dir'], "qValEval.txt"), "a")
        self.actionLogFile = open(
            os.path.join(self.params['out_dir'], "actionLog.txt"), "a")
        self.episodeLogFile = open(
            os.path.join(self.params['out_dir'],
"episodeLog.txt"), "a") self.episodeEvalLogFile = open( os.path.join(self.params['out_dir'], "episodeEvalLog.txt"), "a") # remove stop/termination file if os.path.exists("stop"): os.remove(os.path.join(params['out_dir'], "stop")) # reset if self.params['onlyLearn']: sess.run(self.resetEpStep) sess.run(self.resetAcStep) if self.params['onlyLearn']: self.learn() exit() # multi-threaded # learning and exploration threads act independently? if self.params['async']: t = threading.Thread(target=self.learnWrap) t.daemon = True t.start() if self.params['evaluation']: # evaluate this often evalEpReward = 0 evalEpDiscReward = 0 evalEpStepCount = 0 evalIntv = 25 evalCnt = 40 evalOc = 0 # start exploration self.episode = sess.run(self.episode_step) if self.params['verbose']: printT("start Episode: {}".format(self.episode)) acs = sess.run(self.action_step) if self.params['verbose']: printT("start action step: {}".format(acs)) self.globActStep = acs gacs = sess.run(self.global_action_step) if self.params['verbose']: printT("start global action step: {}".format(gacs)) self.gac = gacs while self.episode < self.params['numEpisodes']: self.episode = sess.run(self.episode_step) sess.run(self.increment_ep_step_op) if self.params['verbose']: print("STARTING NEW EPISODE:" + str(self.episode)) # do we want to explore/gather samples? while self.stopExploring: time.sleep(1) # evaluation episode (no exploration?) if self.params['evaluation'] and self.episode % ( evalIntv + evalCnt) < evalCnt: self.evalEp = True if self.episode % (evalIntv + evalCnt) == 0: if self.params['verbose']: printT("Start Eval Episodes!") evalOc += 1 elif self.params['onlyLearn'] or \ (self.params['limitExploring'] is not None \ and self.replay.size() >= self.params['limitExploring']): self.pauseExploring = True self.evalEp = False else: self.evalEp = False # reset simulation/episode state terminal = False ep_reward = 0 ep_disc_reward = 0 ep_ave_max_q = 0 self.inEpStep = 0 if self.params['interEval']: self.evalMethod = self.evalMethods[self.episode % (len(self.evalMethods))] # reset environment # set start state and allowed actions nextState, allowedActions, terminal = self.env.reset( self.episode, self.evalEp, globActStep=self.globActStep) allowedV = self.calcAllowedActionsVector(allowedActions) if nextState is None: # unable to get state # restart with new episode continue lastTime = time.time() # step forward until terminal while not terminal: if os.path.exists(os.path.join(params['out_dir'], "stop")): self.terminate() if self.params['async']: if not t.isAlive(): printT("alive {}".format(t.isAlive())) printT("Exception in user code:") printT('-' * 60) traceback.print_exc(file=sys.stdout) printT('-' * 60) sys.stdout.flush() t.join(timeout=None) os._exit(-1) # state <- nextstate state = nextState # choose action # random or according to dqn (depending on epsilon) self.inEpStep += 1 if not self.evalEp: sess.run(self.increment_ac_step_op) self.globActStep += 1 sess.run(self.increment_gac_step_op) self.gac += 1 epsStep = max( 0, self.globActStep - (self.params['startLearning'] / 4.0)) tmp_step = min(epsStep, self.annealSteps) self.epsilon = (self.startEpsilon - self.endEpsilon) * \ (1 - tmp_step / self.annealSteps) + \ self.endEpsilon action = self.getActionID(state, allowedV) if self.evalMethod == "fixed": action = self.params['fixedAction'] # We choose a random action in these cases rnm = np.random.rand() if self.params['veryveryverbose']: printT("rnm:" + str(rnm) + " self.epsilon:" + str(self.epsilon) + " |self.params['randomEps']:" + 
                           str(self.params['randomEps']) +
                           " e:" + str(self.episode))
                if (self.evalMethod == "random") or \
                        (not self.pauseExploring) and (not self.evalEp) and \
                        (self.episode < self.params['randomEps'] or
                         rnm < self.epsilon):
                    if self.params['verbose']:
                        printT("randomly selecting action")
                    action = np.random.choice(allowedActions)
                    if self.params['verbose']:
                        printT("\nEpisode: {}, Step: {}, Time: {}, "
                               "Next action (e-greedy {}): {}".format(
                                   self.episode, self.globActStep,
                                   time.ctime(), self.epsilon, action))
                else:
                    # We let the DQN choose the action
                    if self.params['verbose']:
                        printT("Greedily selecting action:")
                    if self.params['verbose']:
                        printT("\nEpisode: {}, Step: {}, Time: {}, "
                               "Next action: {}".format(
                                   self.episode, self.globActStep,
                                   time.ctime(), action))

                # perform selected action and
                # get new state, reward, and termination-info
                nextState, reward, terminal, terminalP, allowedActions = \
                    self.env.act(action, self.episode, self.inEpStep,
                                 self.globActStep, self.evalEp)
                if self.params['veryveryverbose']:
                    print('ACTIONLOG:', str(self.globActStep),
                          str(self.episode), str(self.inEpStep), action,
                          self.evalEp, terminal, terminalP, reward,
                          self.epsilon, self.evalMethod)
                self.actionLogFile.write(
                    "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                        time.time(), str(self.globActStep),
                        str(self.episode), str(self.inEpStep), action,
                        self.evalEp, terminal, terminalP, reward,
                        self.epsilon, self.evalMethod))
                self.actionLogFile.flush()
                allowedV = self.calcAllowedActionsVector(allowedActions)

                # accumulate episode reward
                ep_disc_reward += pow(self.params['gamma'],
                                      self.inEpStep - 1) * reward
                ep_reward += reward

                if (self.evalMethod == "agent") and not self.evalEp and \
                        not self.pauseExploring:
                    self.insertSamples(np.copy(state), action, reward,
                                       terminal, np.copy(nextState),
                                       np.copy(allowedV))

                # do logging inside of one episode,
                # we do not want to lose any data
                if self.params['storeModel'] and \
                        ((self.globActStep + 1) % self.modelStoreIntv) == 0:
                    logDqn.logModel(self)
                if self.params['storeBuffer'] and \
                        ((self.globActStep + 1) % self.bufferStoreIntv) == 0:
                    logDqn.logBuffer(self)

                # if training/exploration not decoupled,
                # do one learning step
                if not self.params['async']:
                    for i in range(8):
                        self.learn()
                sys.stdout.flush()

                cTime = time.time()
                usedTime = cTime - lastTime

                # do we want to pause the exploration thread?
                # (to simulate slower stm)
                if not self.pauseExploring and \
                        not self.evalEp and \
                        self.params['sleep'] and \
                        self.params['async'] and \
                        (self.replay.size() >=
                         self.params['startLearning']) and \
                        (self.replay.size() >=
                         self.params['miniBatchSize']):
                    if self.params['sleepA'] is not None:
                        sleepingTime = self.params['sleepA'] - usedTime
                        if sleepingTime > 0:
                            time.sleep(sleepingTime)
                    else:
                        time.sleep(60)

                cTime = time.time()
                usedTime = cTime - lastTime
                lastTime = cTime
                self.mainLoopTimeFile.write(
                    str(cTime) + " " + str(usedTime) + "\n")
                self.mainLoopTimeFile.flush()

                # terminate episode after x steps,
                # even if no good state has been reached
                if self.inEpStep == self.params['stepsTillTerm']:
                    self.env.switchApproachArea()
                    break

            # end of episode:
            # store episode summaries and print log
            if self.evalEp:
                evalEpReward += ep_reward
                evalEpDiscReward += ep_disc_reward
                evalEpStepCount += self.inEpStep
                if self.episode % (evalIntv + evalCnt) == (evalCnt - 1):
                    summary_str = self.sess.run(
                        eval_sum_op,
                        feed_dict={
                            eval_sum_vars[0]:
                                evalEpReward / float(evalCnt),
                            eval_sum_vars[1]:
                                evalEpDiscReward / float(evalCnt),
                            eval_sum_vars[2]:
                                evalEpStepCount / float(evalCnt)
                        })
                    self.writer.add_summary(summary_str, evalOc - 1)
                    evalEpReward = 0.0
                    evalEpDiscReward = 0.0
                    evalEpStepCount = 0.0
                if self.params['veryveryverbose']:
                    printT("step count-eval: {}".format(self.inEpStep))
                if self.params['veryverbose']:
                    printT('Time: {} | Reward: {} | Discounted Reward: {} '
                           '| Eval-Episode {}'.format(
                               time.ctime(), ep_reward, ep_disc_reward,
                               self.episode))
                self.episodeEvalLogFile.write(
                    "{}\t{}\t{}\t{}\t{}\t{}\n".format(
                        time.time(), self.episode, ep_reward,
                        ep_disc_reward, self.inEpStep, self.epsilon))
                self.episodeEvalLogFile.flush()
            else:
                if self.params['evaluation']:
                    et = self.episode - (evalOc * evalCnt)
                else:
                    et = self.episode
                summary_str = self.sess.run(
                    summary_ops,
                    feed_dict={
                        summary_vars[0]: ep_reward,
                        summary_vars[1]: ep_disc_reward,
                        summary_vars[2]:
                            ep_ave_max_q / float(max(self.inEpStep, 1)),
                        summary_vars[3]: self.inEpStep,
                        summary_vars[4]: self.epsilon
                    })
                self.writer.add_summary(summary_str, et)
                self.writer.flush()
                if self.params['veryveryverbose']:
                    printT("step count: {}".format(self.inEpStep))
                if self.params['veryveryverbose']:
                    printT('Time: {} | Reward: {} | Discounted Reward: {} '
                           '| Episode {} | Buffersize: {}'.format(
                               time.ctime(), ep_reward, ep_disc_reward,
                               self.episode, self.replay.size()))
                self.episodeLogFile.write(
                    "{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                        time.time(), self.episode, ep_reward,
                        ep_disc_reward, self.inEpStep, self.epsilon,
                        self.evalMethod))
                self.episodeLogFile.flush()

            # log some stuff
            if self.params['storeModel'] and \
                    ((self.episode + 1) % self.modelStoreIntv) == 0:
                logDqn.logModel(self)
            if self.params['storeBuffer'] and \
                    ((self.episode + 1) % self.bufferStoreIntv) == 0:
                logDqn.logBuffer(self)
            statsIntv = 100
            sys.stdout.flush()

        # stop learning after last episode
        self.learning = False
        sys.stdout.flush()

    def terminate(self):
        printT("terminating...........")
        sys.stdout.flush()
        self.logStuff()
        sys.stdout.flush()
        printT("EXIT NOW!")
        sys.stdout.flush()
        exit(0)

    def learnWrap(self):
        try:
            self.learn()
        except:
            printT("learn wrap failed")
            printT("Exception in user code:")
            printT('-' * 60)
            traceback.print_exc(file=sys.stdout)
            printT('-' * 60)
            sys.stdout.flush()
            os._exit(-1)

    def learn(self):
        y_batch = np.zeros((self.params['miniBatchSize'], 1))
        tmp = np.zeros((self.params['miniBatchSize'], self.numActions))
        lastTime = time.time()
        count = 0
        while self.learning:
            # throttling to give the other thread a chance
            count += 1
            cTime = time.time()
            loopTime = cTime - lastTime
            lastTime = cTime
            self.learnLoopTimeFile.write(
                str(cTime) + " " + str(loopTime) + "\n")
            self.learnLoopTimeFile.flush()

            if self.stopLearning:
                time.sleep(5.0)
                continue

            if self.replay.size() < self.params['startLearning'] or \
                    self.replay.size() < self.params['miniBatchSize'] or \
                    self.evalEp:
                if self.params['async']:
                    time.sleep(5.0)
                    continue
                else:
                    return

            s_batch, a_batch, r_batch, t_batch, ns_batch, allowed_batch = \
                self.replay.sample_batch(self.params['miniBatchSize'])

            if self.params['doubleDQN']:
                qValsNewState = self.estimate_ddqn(ns_batch, allowed_batch,
                                                   p=False, mem=tmp)
            else:
                qValsNewState = self.predict_target_nn(ns_batch)

            for i in range(self.params['miniBatchSize']):
                if t_batch[i]:
                    y_batch[i] = r_batch[i]
                else:
                    y_batch[i] = r_batch[i] + \
                        self.params['gamma'] * qValsNewState[i]

            gS, qs, delta = self.update(s_batch, a_batch, y_batch)

            if self.params['noHardResetDQN']:
                self.update_targets()
            elif (gS + 1) % self.params['resetFreq'] == 0:
                self.update_targets()

            if not self.params['async']:
                return

            if self.params['onlyLearn']:
                if (gS + 1) % 1000 == 0:
                    logDqn.logModel(self)

    # Returns vector of length 'self.numActions' containing
    # zeros for allowed actions and
    # '-inf' for forbidden actions
    def calcAllowedActionsVector(self, allowedActions):
        allowedV = np.zeros(shape=(self.numActions))
        allowedV[:] = float("-inf")  # init all actions as forbidden
        for i in allowedActions:
            allowedV[i] = 0  # mark actions as allowed
        return allowedV

    # get action id for max q
    def getActionID(self, state, allowedActionsV):
        if self.params['interEval'] and self.evalMethod == 'agentB':
            if self.params['verbose']:
                print("PREDICTING WITH AGENTB:")
            qs = self.qAgentB.run_predict(state)
            print(qs)
        else:
            if self.params['verbose']:
                print("PREDICTING WITH AGENT:")
            qs = self.q.run_predict(state)
        if self.evalEp:
            self.qValFileEval.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                time.time(), str(self.globActStep), str(self.episode),
                str(self.inEpStep), qs[0], allowedActionsV))
            self.qValFileEval.flush()
        else:
            self.qValFileExpl.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                time.time(), str(self.globActStep), str(self.episode),
                str(self.inEpStep), qs[0], allowedActionsV))
            self.qValFileExpl.flush()
        var_dict = {}
        for a in range(self.numActions):
            var_dict[self.action_vars[a]] = qs[0][a]
        summary_str = self.sess.run(self.action_ops, feed_dict=var_dict)
        self.writer.add_summary(summary_str, self.gac)
        self.writer.flush()
        printT("Q-values: " + str(qs))
        qs = qs + allowedActionsV
        return np.argmax(qs, axis=1)[0]

    # update dqn main network
    def update(self, states, actionIDs, targets):
        step, out, delta, loss = self.q.run_train(states, actionIDs,
                                                  targets)
        # network diverged?
        if np.isnan(loss):
            printT("ABORT: NaN")
            sys.stdout.flush()
            os._exit(-1)
        return step, out, delta

    # update dqn target network
    def update_targets(self):
        self.q.run_update_target_nn()

    # estimate q values using double dqn:
    # take the target network's values for the actions
    # on which the main network is maximal
    def estimate_ddqn(self, states, allowedActionsV, p=False, mem=None):
        qs = self.q.run_predict(states)
        if p:
            if self.params['veryveryverbose']:
                print("allowedActionsV.shape" + str(allowedActionsV.shape))
                print("qs.shape" + str(qs.shape))
        qs += allowedActionsV  # add '-inf' to the q values of forbidden actions
        if p:
            if self.params['veryveryverbose']:
                print(states)
                print(qs.shape)
                print(states.shape)
                printT("qs: {}".format(qs))
        maxA = np.argmax(qs, axis=1)
        qs = self.q.run_predict_target(states)
        mem.fill(0)
        mem[np.arange(maxA.size), maxA] = 1
        mem = mem * qs
        mem = np.sum(mem, axis=1)
        return mem

    # predict dqns
    def predict_target_nn(self, states):
        qs = self.q.run_predict_target(states)
        return np.max(qs, axis=1)

    def predict_nn(self, states):
        qs = self.q.run_predict(states)
        return np.max(qs, axis=1)

    # insert samples into replay buffer
    def insertSamples(self, stateScaled, action, reward, terminal,
                      newStateScaled, allowedActionsV):
        stateScaled.shape = (stateScaled.shape[1], stateScaled.shape[2],
                             stateScaled.shape[3])
        newStateScaled.shape = (newStateScaled.shape[1],
                                newStateScaled.shape[2],
                                newStateScaled.shape[3])
        states = (stateScaled,
                  np.rot90(stateScaled, 2),
                  np.fliplr(stateScaled),
                  np.flipud(stateScaled))
        newStates = (newStateScaled,
                     np.rot90(newStateScaled, 2),
                     np.fliplr(newStateScaled),
                     np.flipud(newStateScaled))
        if self.params['fullAugmentation']:
            self.lock.acquire()
            for i in range(4):
                for j in range(4):
                    self.replay.add(states[i], action, reward, terminal,
                                    allowedActionsV, newStates[j])
            self.lock.release()
        else:
            self.lock.acquire()
            self.replay.add(stateScaled, action, reward, terminal,
                            allowedActionsV, newStateScaled)
            self.replay.add(np.ascontiguousarray(np.rot90(stateScaled, 2)),
                            action, reward, terminal, allowedActionsV,
                            np.ascontiguousarray(
                                np.rot90(newStateScaled, 2)))
            self.replay.add(np.ascontiguousarray(np.fliplr(stateScaled)),
                            action, reward, terminal, allowedActionsV,
                            np.ascontiguousarray(np.fliplr(newStateScaled)))
            self.replay.add(np.ascontiguousarray(np.flipud(stateScaled)),
                            action, reward, terminal, allowedActionsV,
                            np.ascontiguousarray(np.flipud(newStateScaled)))
            self.lock.release()

        # if we want to stop when the buffer is full,
        # or limit exploration
        if self.pauseExploring == False and \
                self.replay.size() == self.replayBufferSize:
            if self.params['termAtFull']:
                printT("Buffer FULL!")
                self.logStuff()
                self.pauseExploring = True
                # exit()
        elif self.pauseExploring == False and \
                self.params['limitExploring'] is not None and \
                self.replay.size() >= self.params['limitExploring']:
            if self.params['termAtFull']:
                printT("Buffer FULL!")
                self.logStuff()
                self.pauseExploring = True

    def logStuff(self):
        logDqn.logModel(self)
        logDqn.logBuffer(self)
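
# A minimal numpy sketch (names are illustrative, not from the class above)
# of the double-DQN target that learn() and estimate_ddqn() compute: the main
# network selects the argmax action, the target network supplies its value.
import numpy as np

def ddqn_targets(r_batch, t_batch, q_main_next, q_target_next, gamma=0.99):
    # q_main_next, q_target_next: arrays of shape (batch, numActions)
    best = np.argmax(q_main_next, axis=1)             # argmax via main net
    boot = q_target_next[np.arange(best.size), best]  # valued by target net
    # terminal transitions bootstrap nothing: y = r
    not_done = 1.0 - np.asarray(t_batch, dtype=np.float32)
    return np.asarray(r_batch) + gamma * not_done * boot
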
def main():
    args = parser.parse_args()

    # Set sequence length & visualization dir
    args.seq_len = args.target_num if args.seq_len is None else args.seq_len
    args.visualization_dir = os.path.join('exp', args.exp, 'visualization')
    utils.mkdir(args.visualization_dir)

    # Set exp directory and tensorboard writer
    writer_dir = os.path.join('exp', args.exp)
    utils.mkdir(writer_dir)
    writer = SummaryWriter(writer_dir)

    # Save arguments
    str_list = []
    for key in vars(args):
        print('[{0}] = {1}'.format(key, getattr(args, key)))
        str_list.append('--{0}={1} \\'.format(key, getattr(args, key)))
    with open(os.path.join('exp', args.exp, 'args.txt'), 'w+') as f:
        f.write('\n'.join(str_list))

    # Set directories, e.g. replay buffer, visualization, model snapshots
    args.replay_buffer_dir = os.path.join('exp', args.exp, 'replay_buffer')
    args.model_dir = os.path.join('exp', args.exp, 'models')
    utils.mkdir(args.model_dir)

    # Reset random seeds
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Set device
    device = torch.device('cpu') if args.gpu == '-1' else torch.device(
        f'cuda:{args.gpu}')

    # Set replay buffer
    replay_buffer = ReplayBuffer(args.replay_buffer_dir,
                                 args.replay_buffer_size)
    if args.load_replay_buffer is not None:
        print(f'==> Loading replay buffer from {args.load_replay_buffer}')
        replay_buffer.load(
            os.path.join('exp', args.load_replay_buffer, 'replay_buffer'))
        print(f'==> Loaded replay buffer from {args.load_replay_buffer} '
              f'[size = {replay_buffer.length}]')

    # Set model and optimizer
    if args.model_type == 'adagrasp':
        model = GraspingModel(num_rotations=args.num_rotations,
                              gripper_final_state=True)
    elif args.model_type == 'adagrasp_init_only':
        model = GraspingModel(num_rotations=args.num_rotations,
                              gripper_final_state=False)
    elif args.model_type == 'scene_only':
        model = GraspingModelSceneOnly(num_rotations=args.num_rotations,
                                       gripper_final_state=True)
    else:
        raise NotImplementedError(f'Does not support {args.model_type}')
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 betas=(0.9, 0.95))
    model = model.to(device)

    # Check cuda memory allocation
    if args.gpu != '-1':
        bytes_allocated = torch.cuda.memory_allocated(device)
        print("Model size: {:.3f} MB".format(bytes_allocated / (1024**2)))

    # Load checkpoint
    if args.load_checkpoint is not None:
        print(f'==> Loading checkpoint from {args.load_checkpoint}')
        if args.load_checkpoint.endswith('.pth'):
            checkpoint = torch.load(args.load_checkpoint,
                                    map_location=device)
        else:
            checkpoint = torch.load(
                os.path.join('exp', args.load_checkpoint, 'models',
                             'latest.pth'),
                map_location=device)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint[
            'epoch'] if args.load_replay_buffer is not None else 0
        print(f'==> Loaded checkpoint from {args.load_checkpoint}')
    else:
        start_epoch = 0

    # Launch processes for each env
    args.num_envs = 1 if args.gui else args.num_envs
    processes, conns = [], []
    ctx = mp.get_context('spawn')
    for rank in range(args.num_envs):
        conn_main, conn_env = ctx.Pipe()
        reset_args = {
            'num_open_scale': args.num_open_scale,
            'max_open_scale': args.max_open_scale,
            'min_open_scale': args.min_open_scale,
            'gripper_final_state': args.model_type == 'adagrasp',
            'target_num': args.target_num,
            'obstacle_num': args.obstacle_num
        }
        p = ctx.Process(target=env_process,
                        args=(rank, start_epoch + args.seed + rank,
                              conn_env, args.gui, args.num_cam,
                              args.seq_len, reset_args))
        p.daemon = True
        p.start()
        processes.append(p)
        conns.append(conn_main)

    # Initialize exit signal handler (for graceful exits)
    def save_and_exit(signal, frame):
        print('Warning: keyboard interrupt! Cleaning up...')
        for p in processes:
            p.terminate()
        replay_buffer.dump()
        writer.close()
        print('Finished. Now exiting gracefully.')
        sys.exit(0)

    signal.signal(signal.SIGINT, save_and_exit)

    for epoch in range(start_epoch, args.epoch):
        print(f'---------- epoch-{epoch + 1} ----------')
        timestamp = time.time()

        # Linearly anneal epsilon from max_epsilon down to min_epsilon
        assert args.min_epsilon <= args.max_epsilon
        m1, m2 = args.min_epsilon, args.max_epsilon
        epsilon = max(m1, m2 - (m2 - m1) * epoch / args.exploration_epoch)

        # Data collection
        data = collect_data(
            conns, model, device, n_steps=1, epsilon=epsilon,
            gripper_final_state=(args.model_type == 'adagrasp'))
        for d in data.values():
            replay_buffer.save_data(d)
        average_reward = np.mean([d['reward'] for d in data.values()])
        average_score = np.mean([d['score'] for d in data.values()])
        print(f'Mean reward = {average_reward:.3f}, '
              f'Mean score = {average_score:.3f}')
        writer.add_scalar('Data Collection/Reward', average_reward,
                          epoch + 1)
        writer.add_scalar('Data Collection/Score', average_score, epoch + 1)
        time_data_collection = time.time() - timestamp

        # Replay buffer statistics
        reward_data = np.array(replay_buffer.scalar_data['reward'])
        print(f'Replay buffer size = {len(reward_data)} '
              f'(positive = {len(np.argwhere(reward_data == 1))}, '
              f'negative = {len(np.argwhere(reward_data == 0))})')

        # Policy training
        model.train()
        torch.set_grad_enabled(True)
        sum_loss = 0
        score_statics = {'positive': list(), 'negative': list()}
        for _ in range(args.iter_per_epoch):
            iter_loss, iter_score_statics = train(
                model, device, replay_buffer, optimizer, args.batch_size,
                gripper_final_state=(args.model_type == 'adagrasp'))
            sum_loss += iter_loss
            score_statics['positive'].append(iter_score_statics[1])
            score_statics['negative'].append(iter_score_statics[0])
        average_loss = sum_loss / args.iter_per_epoch
        positive_score_prediction = np.mean(score_statics['positive'])
        negative_score_prediction = np.mean(score_statics['negative'])
        print(f'Training loss = {average_loss:.5f}, '
              f'positive_mean = {positive_score_prediction:.3f}, '
              f'negative_mean = {negative_score_prediction:.3f}')
        writer.add_scalar('Policy Training/Loss', average_loss, epoch + 1)
        writer.add_scalar('Policy Training/Positive Score Prediction',
                          positive_score_prediction, epoch + 1)
        writer.add_scalar('Policy Training/Negative Score Prediction',
                          negative_score_prediction, epoch + 1)

        # Save model and optimizer
        if (epoch + 1) % args.snapshot_gap == 0:
            model.eval()
            torch.set_grad_enabled(False)

            # Visualization
            [conn.send("reset") for conn in conns]
            data = collect_data(
                conns, model, device, n_steps=args.seq_len, epsilon=0,
                gripper_final_state=(args.model_type == 'adagrasp'))
            vis_path = os.path.join(args.visualization_dir,
                                    'epoch_%06d' % (epoch + 1))
            utils.visualization(data, args.num_envs, args.seq_len,
                                args.num_open_scale, args.num_rotations,
                                args.num_vis, vis_path)

            save_state = {
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch + 1
            }
            torch.save(save_state,
                       os.path.join(args.model_dir, 'latest.pth'))
            shutil.copyfile(
                os.path.join(args.model_dir, 'latest.pth'),
                os.path.join(args.model_dir,
                             'epoch_%06d.pth' % (epoch + 1)))

            # Save replay buffer
            replay_buffer.dump()

        # Print elapsed time for an epoch
        time_all = time.time() - timestamp
        time_training = time_all - time_data_collection
        print(f'Elapsed time = {time_all:.2f}: '
              f'(collect) {time_data_collection:.2f} + '
              f'(train) {time_training:.2f}')

    save_and_exit(None, None)
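
# A small self-contained check (assumed settings, not the parser defaults) of
# the linear epsilon anneal used in the epoch loop above: epsilon falls from
# max_epsilon to min_epsilon over exploration_epoch epochs, then stays flat.
def linear_epsilon_demo(epoch, m1=0.1, m2=1.0, exploration_epoch=100):
    return max(m1, m2 - (m2 - m1) * epoch / exploration_epoch)

# linear_epsilon_demo(0) == 1.0, (50) == 0.55, (100) and beyond == 0.1
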
def trainer(env, outdir, epochs=100, MINIBATCH_SIZE=64, GAMMA=0.99,
            epsilon=0.01, min_epsilon=0.01, BUFFER_SIZE=10000,
            train_indicator=False, render=False):
    tf.reset_default_graph()
    with tf.Session(config=config) as sess:
        # configuring environment
        # env = gym.make(ENV_NAME)

        # configuring the random processes
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        # info of the environment to pass to the agent
        state_dim = env.observation_space
        action_dim = env.action_space
        # chosen because the continuous mountain car has no action boundary
        action_bound = np.float64(1)

        # Creating agent
        # For the RNN: tf.contrib.rnn.core_rnn_cell.BasicLSTMCell,
        # from https://github.com/tensorflow/tensorflow/issues/8771
        # cell = tf.contrib.rnn.BasicLSTMCell(num_units=300,
        #                                     state_is_tuple=True,
        #                                     reuse=None)
        # cell_target = tf.contrib.rnn.BasicLSTMCell(num_units=300,
        #                                            state_is_tuple=True,
        #                                            reuse=None)

        # the Ornstein-Uhlenbeck exploration noise
        ruido = OUNoise(action_dim, mu=0.4)
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU, outdir)
        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars(), outdir)

        # sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        actor.update_target_network()
        critic.update_target_network()

        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
        replay_buffer.load()

        # goal = 0
        max_state = -1.

        try:
            critic.recover_critic()
            actor.recover_actor()
            print('********************************')
            print('models restored successfully')
            print('********************************')
        except Exception as e:
            print('********************************')
            print(e)
            print('********************************')
            # critic.recover_critic()
            # actor.recover_actor()

        for i in range(epochs):
            state = env.reset()
            # state = np.hstack(state)
            ep_reward = 0
            ep_ave_max_q = 0
            done = False
            step = 0
            max_state_episode = -1
            epsilon -= epsilon / EXPLORE
            if epsilon < min_epsilon:
                epsilon = min_epsilon

            while not done:
                if render:
                    env.render()

                # print('step', step)
                np.set_printoptions(precision=4)
                # remove comment if you want to see a step by step update
                # print(step, 'a', action_original, action, 's', state[0],
                #       'max state', max_state_episode)

                # 1. get action with actor, and add noise
                action_original = actor.predict(
                    np.reshape(state, (1, actor.s_dim)))
                # + (10. / (10. + i)) * np.random.randn(1)
                action = action_original  # + max(epsilon, 0) * ruido.noise()
                '''
                for j in range(action.shape[1]):
                    if abs(action[0, j]) > 1:
                        act = action[0, j]
                        action[0, j] = act / abs(act)
                    else:
                        continue
                '''
                action = np.reshape(action, (actor.a_dim,))

                # 2. take action, see next state and reward
                next_state, reward, done, info = env.step(action)

                if train_indicator:
                    # 3. Save in replay buffer:
                    replay_buffer.add(np.reshape(state, (actor.s_dim,)),
                                      np.reshape(action, (actor.a_dim,)),
                                      reward, done,
                                      np.reshape(next_state,
                                                 (actor.s_dim,)))

                    # Keep adding experience to the memory until
                    # there are at least minibatch size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:
                        # 4. sample random minibatch of transitions:
                        s_batch, a_batch, r_batch, t_batch, s2_batch = \
                            replay_buffer.sample_batch(MINIBATCH_SIZE)

                        # Calculate targets
                        # 5. Train critic network
                        #    (states, actions, R + gamma * V(s', a')):
                        # 5.1 Get critic prediction = V(s', a'),
                        #     where a' is obtained using the actor's
                        #     target prediction, in other words:
                        #     a' = actor(s')
                        target_q = critic.predict_target(
                            s2_batch, actor.predict_target(s2_batch), 20)

                        # 5.2 get y_t where:
                        #     y_i = r_i                       if terminal
                        #     y_i = r_i + GAMMA * target_q_i  otherwise
                        y_i = []
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                y_i.append(r_batch[k])
                            else:
                                y_i.append(r_batch[k] + GAMMA * target_q[k])

                        # 5.3 Train critic!
                        predicted_q_value, _ = critic.train(
                            s_batch, a_batch,
                            np.reshape(y_i, (MINIBATCH_SIZE, 1)), 20)
                        ep_ave_max_q += np.amax(predicted_q_value)

                        # 6. Compute critic gradient
                        #    (depends on states and actions)
                        # 6.1 first calculate the actions the current
                        #     actor would take
                        a_outs = actor.predict(s_batch)
                        # 6.2 calculate the gradients
                        grads = critic.action_gradients(s_batch, a_outs, 20)
                        c = np.array(grads)
                        # print(c.shape)
                        # print('...', c[0].shape)
                        actor.train(s_batch, grads[0])

                        # Update target networks
                        actor.update_target_network()
                        critic.update_target_network()

                state = next_state
                if next_state[0] > max_state_episode:
                    max_state_episode = next_state[0]
                ep_reward = ep_reward + reward
                step += 1

            if max_state_episode > max_state:
                max_state = max_state_episode
            print('th', i + 1, 'Step', step, 'Reward:', ep_reward,
                  'Pos', next_state[0], next_state[1], 'epsilon', epsilon)

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
        replay_buffer.save()
        # proc = Popen(['rosclean', 'purge'], stdout=PIPE, stdin=PIPE,
        #              stderr=PIPE, universal_newlines=True)
        # out, err = proc.communicate(input="{}\n".format("y"))
        # print('maximum state reached', max_state)
        # print('the reward at the end of the episode,', reward)
        # print('Efficiency', 100. * ((goal) / (i + 1.)))
        '''
        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
        replay_buffer.save()
        # env.close()
        '''
        sess.close()
    return 0
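
# A hedged sketch of an Ornstein-Uhlenbeck process like the OUNoise used for
# exploration above; mu matches the call site (mu=0.4), while theta and sigma
# are assumed values, not taken from the actual OUNoise implementation.
import numpy as np

class OUNoiseSketch:
    def __init__(self, action_dim, mu=0.4, theta=0.15, sigma=0.2):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.ones(action_dim) * mu

    def noise(self):
        # mean-reverting drift toward mu plus Gaussian diffusion; successive
        # samples are temporally correlated, which suits continuous control
        self.state += self.theta * (self.mu - self.state) + \
            self.sigma * np.random.randn(len(self.state))
        return self.state
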
def trainer(epochs=1000, MINIBATCH_SIZE=32, GAMMA=0.99, save=1,
            save_image=1, epsilon=1.0, min_epsilon=0.05, BUFFER_SIZE=15000,
            train_indicator=True, render=True):
    with tf.Session() as sess:
        # configuring the random processes
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        # environment
        env = gym.make('CartPole-v1')
        print('action ', env.action_space)
        print('obs ', env.observation_space)
        observation_space = 4
        action_space = 2
        '''
        env = gym.make('FrozenLake8x8-v0')
        print('action ', env.action_space)
        print('obs ', env.observation_space)
        observation_space = 64
        action_space = 4
        '''

        # agent
        agent = Network(sess, observation_space, action_space,
                        LEARNING_RATE, DEVICE, layer_norm=False)

        # worker_summary = tf.Summary()
        writer = tf.summary.FileWriter('./train', sess.graph)

        # TENSORFLOW init session
        sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        agent.update_target_network()

        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
        replay_buffer.load()
        print('buffer size is now', replay_buffer.count)

        # this is for loading the net
        if save:
            try:
                agent.recover()
                print('********************************')
                print('models restored successfully')
                print('********************************')
            except:
                print('********************************')
                print('Failed to restore models')
                print('********************************')

        loss = 0.
        j = 0
        for i in range(epochs):
            if (i % 500 == 0) and (i != 0):
                print('*************************')
                print('now we save the model')
                agent.save()
                # replay_buffer.save()
                print('model saved successfully')
                print('*************************')

            if i % 200 == 0:
                agent.update_target_network()
                print('update_target_network')

            state = env.reset()
            # state = to_one_hot(state, observation_space)
            # print('state', state)
            q0 = np.zeros(action_space)
            ep_reward = 0.
            done = False
            step = 0
            loss_vector = deque()
            lr = 0.
            while not done:
                j = j + 1
                epsilon -= 0.0000051
                epsilon = np.maximum(min_epsilon, epsilon)

                # Get action with e-greedy
                if np.random.random_sample() < epsilon:
                    # Explore!
                    action = np.random.randint(0, action_space)
                else:
                    # Just stick to what you know bro
                    q0 = agent.predict(
                        np.reshape(state, (1, observation_space)))
                    action = np.argmax(q0)

                next_state, reward, done, info = env.step(action)
                # next_state = to_one_hot(next_state, observation_space)

                # I made a change to the reward
                reward = np.cos(2 * next_state[3])

                if train_indicator:
                    # Keep adding experience to the memory until
                    # there are at least minibatch size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:
                        # 4. sample random minibatch of transitions:
                        s_batch, a_batch, r_batch, t_batch, s2_batch = \
                            replay_buffer.sample_batch(MINIBATCH_SIZE)

                        q_eval = agent.predict_target(
                            np.reshape(s2_batch,
                                       (MINIBATCH_SIZE,
                                        observation_space)))
                        q_target = np.zeros(MINIBATCH_SIZE)
                        # q_target = q_eval.copy()
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                q_target[k] = r_batch[k]
                            else:
                                q_target[k] = r_batch[k] + GAMMA * np.max(
                                    q_eval[k])

                        # 5.3 Train agent!
                        summary, loss, _ = agent.train(
                            np.reshape(a_batch, (MINIBATCH_SIZE, 1)),
                            np.reshape(q_target, (MINIBATCH_SIZE, 1)),
                            np.reshape(s_batch,
                                       (MINIBATCH_SIZE,
                                        observation_space)))
                        loss_vector.append(loss)
                        writer.add_summary(summary, j)

                        # this function is there so you can see the
                        # gradients and the updates, for debugging
                        # actiones, action_one_hot, out, target_q_t, \
                        #     q_acted_0, q_acted, delta, loss, _ = \
                        #     agent.train_v2(
                        #         np.reshape(a_batch, (MINIBATCH_SIZE, 1)),
                        #         np.reshape(q_target, (MINIBATCH_SIZE, 1)),
                        #         np.reshape(s_batch, (MINIBATCH_SIZE,
                        #                              observation_space)))
                        # print('action', actiones, 'action one hot',
                        #       action_one_hot, 'out', out,
                        #       'q acted 0', q_acted_0, 'q acted', q_acted,
                        #       'target', target_q_t, 'loss', loss,
                        #       'delta', delta)

                    # 3. Save in replay buffer:
                    replay_buffer.add(state, action, reward, done,
                                      next_state)

                # prepare for next state
                state = next_state
                ep_reward = ep_reward + reward
                step += 1

            print('th', i + 1, 'Step', step, 'Reward:',
                  round(ep_reward, 0), 'epsilon', round(epsilon, 3),
                  'loss', round(np.mean(loss_vector), 3), lr)

        print('*************************')
        print('now we save the model')
        agent.save()
        # replay_buffer.save()
        print('model saved successfully')
        print('*************************')
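
# The per-sample Q-target loop above can be vectorized; a sketch under the
# same semantics (function and argument names here are illustrative only).
import numpy as np

def q_targets(r_batch, t_batch, q_eval, gamma):
    # max over next-state Q-values; terminal transitions keep only the reward
    not_done = 1.0 - np.asarray(t_batch, dtype=np.float32)
    return np.asarray(r_batch) + gamma * not_done * np.max(q_eval, axis=1)
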