def addToMemory(self, gameState, pastState):
    if type(pastState) in (np.ndarray, list, tuple):  # after reset/start, pastState is simply False
        past_conv_inputs, past_other_inputs, _ = self.getAgentState(*pastState)
        s = (past_conv_inputs, past_other_inputs)
        a = self.getAction(*pastState)  # the (throttle, brake, steer) tuple
        r = self.calculateReward(*gameState)
        conv_inputs, other_inputs, _ = self.getAgentState(*gameState)
        s2 = (conv_inputs, other_inputs)
        markovtuple = [s, a, r, s2, False]  # not actually a tuple, because punish & endEpisode require something mutable
        self.memory.append(markovtuple)
        print("adding to Memory:", a, r, level=4)
        # values for evaluation:
        # statesample = np.array(self.model.getstatecountfeaturevec(self.makeInferenceUsable(s), [self.makeNetUsableAction(a)])[0])
        # relativeNums = np.zeros_like(statesample)
        # for i in range(len(statesample)):
        #     relativeNums[i] = (self.CountsByElement[i][statesample[i]] + 0.5) / (self.allN + 1)
        # count = np.prod(np.array(relativeNums)) * 1e+23
        count = 0
        stateval = self.model.statevalue(self.makeInferenceUsable(s))[0]
        qval = self.model.qvalue(self.makeInferenceUsable(s), [self.makeNetUsableAction(a)])[0]
        self.episode_statevals.append(stateval)
        return a, r, qval, count, self.humantakingcontrolstring  # so that agents can print these if they want to
    return None, 0, 0, 0, ""
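# Shape of one memory entry as built above: a five-element list
#   [s, a, r, s2, done]
# where s = (conv_inputs, other_inputs) is the previous agent state, a is the
# (throttle, brake, steer) tuple, r the scalar reward, s2 the successor state,
# and done starts as False so that punishLastAction/endEpisode can mutate it later.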
def dauerLearnANN(self, steps):
    i = 0
    res = 0
    while self.containers.KeepRunning and self.model.run_inferences() <= self.conf.train_for and i < steps:
        cando = True
        # This freezes learning until inference has caught up (if update_frequency is set).
        if self.conf.ForEveryInf and self.conf.ComesALearn and self.conf.learnMode == "parallel":
            if self.numLearnAfterInference == self.conf.ComesALearn and self.numInferencesAfterLearn == self.conf.ForEveryInf:
                self.numLearnAfterInference = self.numInferencesAfterLearn = 0
                self.unFreezeLearn("updateFrequency")
                self.unFreezeInf("updateFrequency")
            # Every ComesALearn learn-steps, wait until inference has run ForEveryInf times in the meantime.
            if self.numLearnAfterInference >= self.conf.ComesALearn:
                self.unFreezeInf("updateFrequency")
                if self.numInferencesAfterLearn < self.conf.ForEveryInf and self.canLearn():
                    self.freezeLearn("updateFrequency")
                    print("FREEZELEARN", self.numLearnAfterInference, self.numInferencesAfterLearn, level=2)
                    cando = False
                else:
                    self.numInferencesAfterLearn = 0
        if cando and not self.containers.freezeLearn and self.canLearn():
            res += self.learnANN()
            if self.conf.ForEveryInf and self.conf.ComesALearn and self.conf.learnMode == "parallel":
                self.numLearnAfterInference += 1
            i += 1
    # print(res / steps)
    self.unFreezeInf("updateFrequency")  # safe to call here, since it only unfreezes if it was frozen for this very reason
    if self.model.run_inferences() >= self.conf.train_for:  # if you exited because you're completely done
        self.saveNet()
        print("Stopping learning because I'm done after", self.model.run_inferences(), "inferences", level=10)
def performAction(self, gameState, pastState):
    if self.checkIfAction():
        self.numsteps += 1
        self.repeated_action_for += 1
        self.stepsAfterStart += 1
        self.addToMemory(gameState, pastState)
        if self.stepsAfterStart <= self.conf.headstart_num:
            # otherwise it always wants to drive to the edge at the start, since the first states have a completely atypical history
            toUse, toSave = self.headstartAction()
        elif self.repeated_action_for < self.action_repeat:
            toUse, toSave = self.last_action
        else:
            agentState = self.getAgentState(*gameState)  # may be overridden
            if len(self.memory) >= self.conf.replaystartsize or self.epsilon == 0:
                anneal = (self.model.run_inferences() - self.conf.replaystartsize) / self.finalepsilonframe
                self.epsilon = min(round(max(self.startepsilon - (self.startepsilon - self.minepsilon) * anneal, self.minepsilon), 5), 1)
                if np.random.random() < self.epsilon:
                    toUse, toSave = self.randomAction(agentState)
                else:
                    toUse, toSave = self.policyAction(agentState)
            else:
                toUse, toSave = self.randomAction(agentState)
        self.last_action = toUse, toSave
        self.containers.outputval.update(toUse, toSave, self.containers.inputval.CTimestamp, self.containers.inputval.STimestamp)  # note that this happens BEFORE it learns <- parallel
        if self.conf.learnMode == "between":
            if self.numsteps % self.conf.ForEveryInf == 0 and self.canLearn():
                print("freezing python because after", self.model.run_inferences(), "iterations I need to learn (between)", level=2)
                self.freezeInf("LearningComes")
                self.dauerLearnANN(self.conf.ComesALearn)
                self.unFreezeInf("LearningComes")
    else:
        agentState = self.getAgentState(*gameState)
        toUse, toSave = self.randomAction(agentState)
        self.containers.outputval.update(toUse, toSave, self.containers.inputval.CTimestamp, self.containers.inputval.STimestamp)
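# The epsilon schedule above is a linear anneal from startepsilon down to
# minepsilon over finalepsilonframe inferences (counted once replaystartsize is
# reached), clamped to [minepsilon, 1]. A standalone sketch of the same formula,
# with made-up default values standing in for the attributes used above:
def _linear_epsilon(run_inferences, replaystartsize=1000, startepsilon=1.0,
                    minepsilon=0.1, finalepsilonframe=100000):
    # fraction of the annealing window that has elapsed so far
    progress = (run_inferences - replaystartsize) / finalepsilonframe
    eps = startepsilon - (startepsilon - minepsilon) * progress
    return min(round(max(eps, minepsilon), 5), 1)  # e.g. 1.0 at start, 0.55 halfway, 0.1 at the end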
def returnRelevant(self):
    print("Removed 4 elements from speedsteer here, seems necessary", level=-1)
    return (list(self.CenterDistVec) + [0] * 4 + list(self.SpeedSteer[4:])
            + list(self.StatusVector) + list(self.WallDistVec) + list(self.LookAheadVec))
def eval_episodeVals(self, endReason):
    # a bit cheated to grab this from here, but whatever
    _, _, otherinput_hist, _ = self.containers.inputval.read()
    progress = round(otherinput_hist[0].ProgressVec.Progress * 100 if endReason != "lapdone" else 100, 2)
    laptime = round(otherinput_hist[0].ProgressVec.Laptime, 1)
    valid = otherinput_hist[0].ProgressVec.fValidLap
    evalstring = "progress:", progress, "laptime:", laptime, "(valid)" if valid else ""
    print(evalstring, level=8)
    if self.use_evaluator:
        self.evaluator.add_episode([progress, laptime])
def getAgentState(self, *gameState):
    vvec1_hist, vvec2_hist, otherinput_hist, action_hist = gameState
    assert self.conf.use_cameras, "You disabled cameras in the config, which is impossible for this agent!"
    conv_inputs = np.concatenate([vvec1_hist, vvec2_hist]) if vvec2_hist is not None else vvec1_hist
    # other_inputs = [otherinput_hist[0].SpeedSteer.velocity, action_hist]
    other_inputs = [otherinput_hist[0].SpeedSteer.velocity, [np.zeros_like(i) if i is not None else None for i in action_hist]]
    print("Removed actions as input to network, as it only learns from them then", level=-1)
    stands_inputs = otherinput_hist[0].SpeedSteer.velocity < 0.04
    return conv_inputs, other_inputs, stands_inputs
def save(self, session):
    folder = self.conf.pretrain_checkpoint_dir if self.isPretrain else self.conf.checkpoint_dir
    checkpoint_file = os.path.join(self.agent.folder(folder), 'model.ckpt')
    session.run(self.pretrain_episode_tf.assign(self.pretrain_episode))
    session.run(self.run_inferences_tf.assign(self.run_inferences))
    self.saver.save(session, checkpoint_file, global_step=self.pretrain_step_tf if self.isPretrain else self.step_tf)
    print("Saved Model.", level=6)
def getAgentState(self, *gameState):
    vvec1_hist, vvec2_hist, otherinput_hist, action_hist = gameState
    flat_actions = flatten([i if i is not None else (0, 0, 0) for i in action_hist])
    # other_inputs = np.ravel([i.returnRelevant() for i in otherinput_hist])
    other_inputs = np.ravel([i.returnRelevant() for i in otherinput_hist[:2]])
    flat_actions = list(np.zeros_like(flat_actions))
    print("Removed actions as input to network, as it only learns from them then", level=-1)
    other_inputs = np.concatenate((other_inputs, flat_actions))
    stands_inputs = otherinput_hist[0].SpeedSteer.velocity < 0.04
    return None, other_inputs, stands_inputs
def freezeInf(self, reason):
    if self.containers.UnityConnected:
        if reason not in self.freezeInfReasons:
            print("freezing Unity because", reason, level=10)
            self.containers.freezeInf = True
            self.freezeInfReasons.append(reason)
            try:
                self.containers.outputval.freezeUnity()
            except:
                pass
def learnANN(self):
    tmp = super().learnANN()
    print("ReinfLearnSteps:", self.model.step(), level=3)
    if self.containers.showscreen:
        infoscreen.print(self.model.step(), "Iterations: >" + str(self.model.run_inferences()), containers=self.containers, wname="ReinfLearnSteps")
    return tmp
def handle_commands(self, command, wasValid=False):
    if command == "wallhit":
        self.punishLastAction(self.wallhitPunish)  # isn't this redundant with the fact that it keeps the "if punish > 10" anyway?
        self.endEpisode("wallhit", self.containers.inputval.read())
    if command == "lapdone":
        print("Lap finished", level=6)
        # if wasValid, give +1000 reward?
        self.endEpisode("lapdone", self.containers.inputval.read())
    if command == "timeover":
        self.endEpisode("timeover", self.containers.inputval.read())
    if command == "turnedaround":
        self.punishLastAction(self.wrongDirPunish)
        self.endEpisode("turnedaround", self.containers.inputval.read())
def showqvals(self, qvals):
    amount = self.conf.steering_steps * 4 if self.conf.INCLUDE_ACCPLUSBREAK else self.conf.steering_steps * 3
    b = []
    for i in range(amount):
        a = [0] * amount
        a[i] = 1
        b.append(str(self.dediscretize(a)))
    b = list(zip(b, qvals))
    toprint = [str(i[0])[1:-1] + ": " + str(i[1]) for i in b]
    toprint = "\n".join(toprint)
    print(b, level=3)
    if self.containers.showscreen:
        infoscreen.print(toprint, containers=self.containers, wname="Current Q Vals")
def unFreezeInf(self, reason):
    if self.containers.UnityConnected:
        try:
            self.freezeInfReasons.remove(reason)
            if len(self.freezeInfReasons) == 0:
                self.containers.freezeInf = False
                try:  # TODO: have a variable unity_connected instead!
                    print("unfreezing Unity because", reason, level=10)
                    self.containers.outputval.unFreezeUnity()
                except:
                    pass
        except ValueError:
            pass  # nothing to do if it wasn't in there anyway
def readOneDArrayFromString(string):
    tmpstrings = string.split(",")
    tmpfloats = []
    for i in tmpstrings:
        tmp = i.replace(" ", "")
        if len(tmp) > 0:
            try:
                tmp = "1" if tmp == "T" else "0" if tmp == "F" else tmp
                tmpfloats.append(float(tmp))
            except ValueError:
                print("I'm crying")  # cry.
    return tmpfloats
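# Usage sketch (my example, not from the original source): booleans arrive as
# "T"/"F" and are mapped to 1.0/0.0, empty tokens are skipped.
#   readOneDArrayFromString("0.5, T, F, 2")  ->  [0.5, 1.0, 0.0, 2.0]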
def update(self, toSend, toSave, CTimestamp, STimestamp):
    self.lock.acquire()
    try:
        if int(self.STimestamp) < int(STimestamp):
            self.value = toSend
            self.containers.inputval.addAction(toSave)
            self.CTimestamp, self.STimestamp = CTimestamp, STimestamp  # what matters is not now, but the time the ANN was started
            print("Updated output-value to", toSend, level=4)
            self.send_via_senderthread(self.value, self.CTimestamp, self.STimestamp)
        else:
            print("Didn't update output-value because the new one wouldn't be newer", level=10)
            # raise
    finally:
        self.lock.release()
def readTwoDArrayFromString(string):
    tmpstrings = string.split(",")
    tmpreturn = []
    for i in tmpstrings:
        tmp = i.replace(" ", "")
        if len(tmp) > 0:
            try:
                currline = [int(j) for j in tmp]
                tmpreturn.append(currline)
            except ValueError:
                print("I'm crying")  # cry.
    return np.array(tmpreturn)
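# Usage sketch (my example): each comma-separated token becomes one row, parsed
# digit by digit.
#   readTwoDArrayFromString("101, 010")  ->  array([[1, 0, 1], [0, 1, 0]])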
def showhelp():
    print("""Command-line arguments:
    "-DQN" to run with the DQN-config
    "-nolearn" to store the agent's results to the memory, but not perform reinforcement learning (sets the random-action-chance to 0)
    "-noscreen" to turn off the screen showing Q-vals etc.
    "-noplot" to turn off the plots evaluating each episode
    "-startfresh" to use an RL-agent without any (supervised or reinforcement) pretraining
    "-nomemorykeep" to not save the memory for this run
    "-nomemoryload" to not LOAD the memory, which can save a lot of time
    "-help" shows this help and exits
    Concerning agents:
    Without any arguments, the agent defined in "dqn_rl_agent" is used
    With the "-svplay"-argument, the agent defined in "dqn_sv_agent" is used
    With the argument "--agent xyz", the agent defined in "xyz.py" is used""", level=999)
def send_via_senderthread(self, value, CTimestamp, STimestamp):
    # take the first connection that doesn't throw an error!
    print("PYTHON SENDING TIME:", STimestamp, time.time() * 1000, level=4)
    if self.containers.KeepRunning:
        assert len(self.containers.senderthreads) > 0, "There is no senderthread at all! How will I send?"
        for i in range(len(self.containers.senderthreads)):
            try:
                self.containers.senderthreads[i].send(value, CTimestamp, STimestamp)
            except (ConnectionResetError, ConnectionAbortedError):
                # if Unity restarted, the old connection is now useless and should be deleted
                print("I assume you just restarted Unity.")
                self.containers.senderthreads[i].delete_me()
                self.containers.senderthreads[i].join()
            if i >= len(self.containers.senderthreads) - 1:
                break
def eval_episodeVals(self, mem_epi_slice, gameState, endReason):
    vvec1_hist, vvec2_hist, otherinput_hist, action_hist = gameState
    avg_rewards = round(self.memory.average_rewards(mem_epi_slice[0], mem_epi_slice[1]), 3)
    avg_values = round(np.mean(np.array(self.episode_statevals)), 3)
    self.episode_statevals = []
    # Other evaluation values we need are the time the agent took and the percentage it made. However, because those values are
    # not necessarily officially known to the agent (since agentstate != environmentstate), we take them from the environment-state.
    progress = round(otherinput_hist[0].ProgressVec.Progress * 100 if endReason != "lapdone" else 100, 2)
    laptime = round(otherinput_hist[0].ProgressVec.Laptime, 1)
    valid = otherinput_hist[0].ProgressVec.fValidLap
    evalstring = "Avg-r:", avg_rewards, "Avg-Q:", avg_values, "progress:", progress, "laptime:", laptime, "(valid)" if valid else ""
    print(evalstring, level=8)
    if self.use_evaluator:
        self.evaluator.add_episode([avg_rewards, avg_values, progress, laptime], nr=self.episodes, startMemoryEntry=mem_epi_slice[0], endMemoryEntry=mem_epi_slice[1], endIteration=self.model.run_inferences(), reinfNetSteps=self.model.step(), endEpsilon=self.epsilon)
    return evalstring
def save_memory(self):
    with self._lock:
        if self.agent.keep_memory:
            self.agent.freezeEverything("saveMem")
            self.psave(self.memorypath + SAVENAME + 'TMP.pkl')
            print("Saving Memory at", time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()), level=6)
            if os.path.exists(self.memorypath + SAVENAME + 'TMP.pkl'):
                # only use it as memory if you weren't disturbed while writing
                if os.path.getsize(self.memorypath + SAVENAME + 'TMP.pkl') > 1024:
                    shutil.copyfile(self.memorypath + SAVENAME + 'TMP.pkl', self.memorypath + SAVENAME + '.pkl')
            self.lastsavetime = current_milli_time()
            self.agent.unFreezeEverything("saveMem")
def save(self):
    folder = self.conf.pretrain_checkpoint_dir if self.isPretrain else self.conf.checkpoint_dir
    critic_file = os.path.join(self.agent.folder(os.path.join(folder, "critic")), 'model.ckpt')
    self.critic.saver.save(self.session, critic_file, global_step=self.critic.pretrain_step_tf if self.isPretrain else self.critic.step_tf)
    actor_file = os.path.join(self.agent.folder(os.path.join(folder, "actor")), 'model.ckpt')
    self.session.run(self.actor.run_inferences_tf.assign(self.run_inf))
    self.session.run(self.actor.pretrain_episode_tf.assign(self.pretrain_ep))
    self.actor.saver.save(self.session, actor_file, global_step=self.actor.pretrain_step_tf if self.isPretrain else self.actor.step_tf)
    print("Saved Model.", level=6)
def extract_appropriate(self, TPList, TPmsperframe, wishmsperframe, filename):
    if float(TPmsperframe) > float(wishmsperframe) * 1.05:
        print("%s could not be used because it did not record enough frames!" % filename)
        return None
    elif float(wishmsperframe) * 0.95 < float(TPmsperframe) < float(wishmsperframe) * 1.05:
        returntp = TPList
    else:
        fraction = round(wishmsperframe / TPmsperframe * 100) / 100
        i = 0
        returntp = []
        while round(i) < len(TPList):
            returntp.append(TPList[round(i)])
            i += fraction
    returntp[-1].endedAfter = True
    return returntp
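# Worked example (mine, not from the original source): if the recording stored a
# frame every 50 ms (TPmsperframe=50) but we want one every 100 ms
# (wishmsperframe=100), then fraction = 2.0 and the loop keeps the frames at
# indices 0, 2, 4, ... — i.e. every second recorded frame.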
def update(self, visionvec, vvec2, othervecs, STimestamp, CTimestamp):
    self.lock.acquire()
    try:
        if not self.just_reset:
            assert self.action_hist[0] is not None, "the output-val didn't add the last action before running again!"
            self.has_past_state = True
        # 20.7.: deleted the "if is_new..." functionality, as I think it's absolutely not helpful
        otherinputs = make_otherinputs(othervecs).normalized()  # is now a namedtuple instead of an array
        if hasattr(self.containers.myAgent, "time_ends_episode") and self.containers.myAgent.time_ends_episode and otherinputs.ProgressVec.Laptime >= self.containers.myAgent.time_ends_episode:
            self.containers.myAgent.handle_commands("timeover")
        if self.conf.use_cameras and self.agent.usesConv:
            self._append_vvec_hist(visionvec, vvec2)
        self.otherinput_hist = self._append_other(otherinputs, self.otherinput_hist)
        self.containers.myAgent.humantakingcontrolstring = "(H)" if (
            self.action_hist[0] is None or otherinputs.Action is None
            or np.any([abs(self.action_hist[0][i] - otherinputs.Action[i]) > 0.1 for i in range(len(otherinputs.Action))])
        ) else ""
        self.action_hist[0] = tuple(otherinputs.Action)  # it was already added in addAction, and is only overwritten here if humantakingcontrol changed it
        self.action_hist = self._append_other(None, self.action_hist)  # will be updated in addAction
        # if otherinputs.CenterDist was >= 10 and no new action has come since, it must stay >= 10!
        # if self.otherinput_hist[0].CenterDist[0] >= 0.99:
        #     self.hit_a_wall = True
        # # is only set to False again once an action comes.. and as long as it is True:
        # if self.hit_a_wall:
        #     self.otherinput_hist[0] = self.otherinput_hist[0]._replace(CenterDist=[1])
        try:
            if not self.otherinput_hist[0].SpeedSteer.rightDirection:
                self.containers.wrongdirectiontime += self.containers.conf.msperframe
                if self.containers.wrongdirectiontime >= 2000:  # after 2 seconds in the wrong direction
                    self.containers.myAgent.handle_commands("turnedaround")
            else:
                self.containers.wrongdirectiontime = 0
        except IndexError:
            self.containers.wrongdirectiontime = 0
        self.alreadyread = False
        self.CTimestamp, self.STimestamp = CTimestamp, STimestamp
        print("Updated Input-Vec from", STimestamp, level=2)
        self.just_reset = False
    finally:
        self.lock.release()
def randomAction(self, agentState):
    print("Random Action", level=2)
    action = np.random.randint(4) if self.conf.INCLUDE_ACCPLUSBREAK else np.random.randint(3)
    if action == 0:
        brake, throttle = 0, 1
    if action == 1:
        brake, throttle = 0, 0
    if action == 2:
        brake, throttle = 1, 0
    if action == 3:
        brake, throttle = 1, 1
    if agentState[2]:  # "carstands"
        brake, throttle = 0, 1
    # alternative 1a: steer = (np.random.random() * 2) - 1
    # alternative 1b: steer = min(max(np.random.normal(scale=0.5), -1), 1)
    # for 1a and 1b: steer = read_supervised.dediscretize_steer(read_supervised.discretize_steering(steer, self.conf.steering_steps))
    # alternative 2:
    tmp = [0] * self.conf.steering_steps
    tmp[np.random.randint(self.conf.steering_steps)] = 1
    steer = read_supervised.dediscretize_steer(tmp)
    # throttle, brake, steer = 1, 0, 0
    result = "[" + str(throttle) + ", " + str(brake) + ", " + str(steer) + "]"
    return result, (throttle, brake, steer)  # always returns (toUse, toSave)
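# For illustration (a minimal sketch, assuming dediscretize_steer maps a one-hot
# vector to an evenly spaced steering value in [-1, 1]; the actual
# read_supervised implementation may differ): with steering_steps = 5, the
# one-hot [0, 0, 1, 0, 0] would decode to steer = -1 + 2 * (2 / 4) = 0.0, and
# [0, 0, 0, 0, 1] to steer = 1.0.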
def getAccuracy(self, batch, likeDDPG=True):  # dummy for consistency with DDDQN
    oldstates, actions, _, _, _ = batch
    predict = self.actor.predict(oldstates, useOnline=False, is_training=False)
    print("throt", np.mean(np.array([abs(np.linalg.norm(predict[i][0] - actions[i][0])) for i in range(len(actions))])))
    print("brake", np.mean(np.array([abs(np.linalg.norm(predict[i][1] - actions[i][1])) for i in range(len(actions))])))
    print("steer", np.mean(np.array([abs(np.linalg.norm(predict[i][2] - actions[i][2])) for i in range(len(actions))])))
    return np.mean(np.array([abs(np.linalg.norm(predict[i] - actions[i])) for i in range(len(actions))]))
def _load(self, from_pretrain=False):
    folder = self.conf.pretrain_checkpoint_dir if from_pretrain else self.conf.checkpoint_dir
    critic_ckpt = tf.train.get_checkpoint_state(self.agent.folder(os.path.join(folder, "critic")))
    actor_ckpt = tf.train.get_checkpoint_state(self.agent.folder(os.path.join(folder, "actor")))
    if critic_ckpt and actor_ckpt and critic_ckpt.model_checkpoint_path and actor_ckpt.model_checkpoint_path:
        self.critic.saver.restore(self.session, critic_ckpt.model_checkpoint_path)
        self.actor.saver.restore(self.session, actor_ckpt.model_checkpoint_path)
        self.run_inf = self.actor.run_inferences_tf.eval(self.session)
        self.pretrain_ep = self.actor.pretrain_episode_tf.eval(self.session)
    else:
        print("Couldn't load", ("from pretrain" if from_pretrain else "from RL-train"), level=10)
        return False
    print("Loaded", ("from pretrain" if from_pretrain else "from RL-train"), level=10)
    print("Pretrain-Step:", self.actor.pretrain_step_tf.eval(self.session), "Pretrain-Episode:", self.pretrain_ep, "Main-Step:", self.step(), "Run'n Iterations:", self.run_inf, level=10)
    return True
def load(self, session, from_pretrain=False):
    folder = self.conf.pretrain_checkpoint_dir if from_pretrain else self.conf.checkpoint_dir
    ckpt = tf.train.get_checkpoint_state(self.agent.folder(folder))
    if ckpt and ckpt.model_checkpoint_path:
        self.saver.restore(session, ckpt.model_checkpoint_path)
        print("Loaded", ("from pretrain" if from_pretrain else "from RL-train"), level=10)
        self.pretrain_step = self.pretrain_step_tf.eval(session)
        self.pretrain_episode = self.pretrain_episode_tf.eval(session)
        self.step = self.step_tf.eval(session)
        self.run_inferences = self.run_inferences_tf.eval(session)
        print("Pretrain-Step:", self.pretrain_step, "Pretrain-Episode:", self.pretrain_episode, "Main-Step:", self.step, "Run'n Iterations:", self.run_inferences, level=10)
        return True
    else:
        print("Couldn't load", ("from pretrain" if from_pretrain else "from RL-train"), level=10)
        return False
def run(self):
    print("Starting receiver_thread")
    while self.containers.KeepRunning and not self.killme:
        try:
            if not self.containers.freezeInf:
                data = self.clientsocket.myreceive()
                if data:
                    # print("received data:", data, level=10)
                    if self.handle_special_commands(copy.deepcopy(data)):
                        continue
                    elif data[:6] == "STime(":
                        # we MUST have the inputval, otherwise there wouldn't be the possibility for historyframes
                        STime, CTime, visionvec, vvec2, allOneDs = cutoutandreturnvectors(data)
                        self.CTimestamp, self.STimestamp = CTime, STime
                        for i in self.containers.receiverthreads:
                            if int(i.STimestamp) < int(self.STimestamp):
                                i.killme = True
                        print("PYTHON RECEIVES TIME:", STime, time.time() * 1000, level=4)
                        self.containers.inputval.update(visionvec, vvec2, allOneDs, STime, CTime)  # note that visionvec and vvec2 can both be None
                        self.containers.myAgent.performAction(self.containers.inputval.read(), self.containers.inputval.read(pastState=True))
        except TimeoutError:
            if len(self.containers.receiverthreads) >= 2:
                break
    self.containers.receiverthreads.remove(self)
    print("stopping receiver_thread")
def checkIfAction(self):
    if self.containers.freezeInf:
        return False
    # This freezes inference until learning has caught up (if update_frequency is set).
    if self.conf.ForEveryInf and self.conf.ComesALearn and self.canLearn() and self.conf.learnMode == "parallel":
        if self.numLearnAfterInference == self.conf.ComesALearn and self.numInferencesAfterLearn == self.conf.ForEveryInf:
            self.numLearnAfterInference = self.numInferencesAfterLearn = 0
            self.unFreezeLearn("updateFrequency")
            self.unFreezeInf("updateFrequency")
        # Every ForEveryInf inferences, wait until learning happened ComesALearn times in the meantime.
        if self.numInferencesAfterLearn == self.conf.ForEveryInf:
            # check whether it learned ComesALearn times in the meantime; if not, freeze inference
            self.unFreezeLearn("updateFrequency")
            if self.numLearnAfterInference < self.conf.ComesALearn:
                self.freezeInf("updateFrequency")
                print("FREEZEINF", self.numLearnAfterInference, self.numInferencesAfterLearn, level=2)
                return super().checkIfAction()
        self.numLearnAfterInference = 0
        self.numInferencesAfterLearn += 1
        # print(self.numLearnAfterInference, self.numInferencesAfterLearn, level=10)
    if self.model.run_inferences() >= self.conf.train_for:
        return False
    else:
        return super().checkIfAction()
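# For intuition (my reading of the two counters, not guaranteed by the original
# author): with ForEveryInf = 4 and ComesALearn = 1, inference is frozen here
# whenever 4 inferences have happened without at least 1 learn step in between,
# while dauerLearnANN symmetrically freezes learning until inference catches up,
# keeping the learn/inference ratio near 1:4 in "parallel" mode.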
def preTrain(self, dataset, iterations, supervised=False):
    assert self.model.step() == 0, "I don't pretrain if the model already learned on real data!"
    iterations = self.conf.pretrain_iterations if iterations is None else iterations
    if supervised:
        raise ValueError("A DDPG-Model cannot learn supervisedly!")
    print("Starting pretraining", level=10)
    for i in range(iterations):
        start_time = time.time()
        self.model.inc_episode()
        dataset.reset_batch()
        while dataset.has_next(self.conf.pretrain_batch_size):
            trainBatch = self.make_trainbatch(dataset, self.conf.pretrain_batch_size, 0.3)
            self.model.q_train_step(trainBatch, False)
        if (i + 1) % 25 == 0:
            self.model.save()
        dataset.reset_batch()
        trainBatch = self.make_trainbatch(dataset, dataset.numsamples)
        print('Iteration %3d: Closeness = %.2f (%.1f sec)' % (self.model.pretrain_episode(), self.model.getAccuracy(trainBatch), time.time() - start_time), level=10)