import random

import tensorflow as tf
from pynput import keyboard
from pynput.keyboard import Key

# The DQN class used below is the project's Q-network wrapper; it is defined in a
# separate module and imported alongside the dependencies above (import omitted here).


# Two-player version: a Player is either a human (reading actions from the keyboard
# listener) or a bot driven by a Q-network, with a separate target network for the TD targets.
class Player:

    def __init__(self, name, isBot):
        self.name = name
        self.isBot = isBot
        if not self.isBot:
            # Human player: actions come from the keyboard listener
            self.chosenAction = 0
            self.defineKeyboardListener()
        self.initializeProperties()
        self.QNetwork = DQN("QN{}".format(name), self.miniBatchSize)
        self.TDTarget = DQN("TD{}".format(name), self.miniBatchSize)
        self.sess = tf.Session()
        self.QNetwork.setSess(self.sess)
        self.TDTarget.setSess(self.sess)
        self.sess.run(tf.global_variables_initializer())
        self.synchronise()

    def initializeProperties(self):
        self.synchronisationPeriod = 100
        self.explorationRate = 0.999
        # Behaviour when playing & training
        self.trainable = True
        self.exploiting = False
        # Statistics
        self.gamesWon = 0
        self.gamesLost = 0
        # Training
        self.trainingData = []
        self.maxBatchSize = 50000   # trainingData will not hold more than maxBatchSize elements
        self.miniBatchSize = 32
        self.miniBatch = []
        self.startTraining = 1000   # training only starts once trainingData holds more than startTraining elements
        print("Properties initialized")

    def defineKeyboardListener(self):
        def on_press(key):
            try:
                if key == Key.up:
                    self.chosenAction = 1
                elif key == Key.down:
                    self.chosenAction = 2
                else:
                    self.chosenAction = 0
            except AttributeError:
                self.chosenAction = 0

        def on_release(key):
            self.chosenAction = 0
            if key == keyboard.Key.esc:
                # Stop listener
                return False

        self.listener = keyboard.Listener(on_press=on_press, on_release=on_release)
        self.listener.start()

    def training(self, step):
        if not self.trainable or len(self.trainingData) < self.startTraining:
            return
        if step % self.synchronisationPeriod == 0:
            self.synchronise()
        self.miniBatch = random.sample(self.trainingData, self.miniBatchSize)
        states, actions, rewards, nextStates = zip(*self.miniBatch)
        output = self.TDTarget.computeTarget(nextStates, rewards)
        self.QNetwork.training(states, output, actions)

    def play(self):
        if self.isBot:
            # Epsilon-greedy: exploit the Q-network, otherwise pick a random action
            if self.exploiting or random.random() > self.explorationRate:
                return self.QNetwork.evaluate(self.buffer)
            else:
                return random.randint(0, 1)
        else:
            return self.chosenAction

    def updateConstants(self, learningRate=None, explorationRate=None):
        self.QNetwork.updateConstants(learningRate)
        if explorationRate is not None:
            self.explorationRate = explorationRate

    def resetStats(self):
        self.gamesWon = 0
        self.gamesLost = 0

    def updateStats(self, reward):
        if reward == 1:
            self.gamesWon += 1
        elif reward == -1:
            self.gamesLost += 1

    def displayStats(self):
        # print("{} victories & {} defeats".format(self.gamesWon, self.gamesLost))
        print(self.gamesWon, self.gamesLost)

    def addStateSequence(self, action, reward, nextState):
        # Store the transition (state, action, reward, next state) in the replay memory
        if self.trainable:
            self.trainingData.append([self.buffer, action, reward, nextState])
            while len(self.trainingData) > self.maxBatchSize:
                self.trainingData.pop(0)
        self.buffer = nextState

    def saveQNetwork(self, path, global_step=None):
        self.QNetwork.saveQNetwork(path, global_step)

    def restoreQNetwork(self, path, global_step=None):
        self.QNetwork.restoreQNetwork(path, global_step)

    def setBehaviour(self, isTraining):
        self.trainable = isTraining
        self.exploiting = not isTraining

    def synchronise(self):
        # Copy the Q-network weights into the TD-target network
        e1_params = [t for t in tf.trainable_variables() if t.name.startswith(self.QNetwork.scope)]
        e1_params = sorted(e1_params, key=lambda v: v.name)
        e2_params = [t for t in tf.trainable_variables() if t.name.startswith(self.TDTarget.scope)]
        e2_params = sorted(e2_params, key=lambda v: v.name)
        update_ops = []
        for e1_v, e2_v in zip(e1_params, e2_params):
            op = e2_v.assign(e1_v)
            update_ops.append(op)
        self.sess.run(update_ops)
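
# --- Usage sketch (added for illustration; not part of the original class) ---
# A minimal example of how this two-player version could be driven by a game loop.
# The `game` object and its reset()/step() methods are hypothetical placeholders
# standing in for the project's own game code, which is not shown here.

def run_episode(game, human, bot, step):
    """Play one episode between a human Player and a bot Player."""
    state = game.reset()
    bot.buffer = state                   # the bot needs an initial observation before play()
    done = False
    while not done:
        humanAction = human.play()       # read from the keyboard listener
        botAction = bot.play()           # epsilon-greedy action from the Q-network
        nextState, reward, done = game.step(humanAction, botAction)
        bot.addStateSequence(botAction, reward, nextState)
        bot.updateStats(reward)
        bot.training(step)               # only trains once enough transitions are stored
        step += 1
    return step

# Hypothetical instantiation of the two players:
# human = Player("left", isBot=False)
# bot = Player("right", isBot=True)
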
import random

import tensorflow as tf

# The DQN class used below is the project's Q-network wrapper, defined in a
# separate module (import omitted here); in this version its constructor also
# takes the input image size.


# Single-player version: the Player is always a DQN-controlled bot whose network
# input is an imageSize x imageSize frame, and statistics reduce to a single score.
class Player:

    def __init__(self, name):
        self.name = name
        self.initializeProperties()
        self.QNetwork = DQN(self.imageSize, "QN", self.miniBatchSize)
        self.TDTarget = DQN(self.imageSize, "TD", self.miniBatchSize)
        self.sess = tf.Session()
        self.QNetwork.setSess(self.sess)
        self.TDTarget.setSess(self.sess)
        self.sess.run(tf.global_variables_initializer())
        self.synchronise()

    def initializeProperties(self):
        # Q-network constants
        self.imageSize = 80
        self.synchronisationPeriod = 500
        # Constants
        self.explorationRate = 0.999
        # Behaviour when playing & training
        self.trainable = True
        self.exploiting = False
        # Statistics
        self.score = 0
        # Training
        self.trainingData = []
        self.maxBatchSize = 10000   # trainingData will not hold more than maxBatchSize elements
        self.miniBatchSize = 32
        self.miniBatch = []
        self.startTraining = 1000   # training only starts once trainingData holds more than startTraining elements
        print("Properties initialized")

    def training(self, step):
        if not self.trainable or len(self.trainingData) < self.startTraining:
            return
        if step % self.synchronisationPeriod == 0:
            self.synchronise()
        self.miniBatch = random.sample(self.trainingData, self.miniBatchSize)
        states, actions, rewards, nextStates = zip(*self.miniBatch)
        output = self.TDTarget.computeTarget(nextStates, rewards)
        self.QNetwork.training(states, output, actions)

    def play(self):
        # Epsilon-greedy: exploit the Q-network, otherwise pick a biased random action
        if self.exploiting or random.random() > self.explorationRate:
            return self.QNetwork.evaluate(self.buffer)
        else:
            # Random action: 1 with probability 0.9, 0 with probability 0.1
            return int(random.random() < 0.9)

    def updateConstants(self, learningRate=None, explorationRate=None):
        self.QNetwork.updateConstants(learningRate)
        if explorationRate is not None:
            self.explorationRate = explorationRate

    def resetStats(self):
        self.score = 0

    def updateStats(self, reward):
        if reward == 1:
            self.score += 1

    def displayStats(self):
        print(self.score)

    def addStateSequence(self, action, reward, nS):
        # nS = np.transpose(nS, [1, 2, 0])
        # Store the transition (state, action, reward, next state) in the replay memory
        if self.trainable:
            self.trainingData.append([self.buffer, action, reward, nS])
            while len(self.trainingData) > self.maxBatchSize:
                del self.trainingData[0]
        self.buffer = nS

    def saveQNetwork(self, path, global_step=None):
        self.QNetwork.saveQNetwork(path, global_step)

    def restoreQNetwork(self, path, global_step=None):
        self.QNetwork.restoreQNetwork(path, global_step)

    def setBehaviour(self, isTraining):
        self.trainable = isTraining
        self.exploiting = not isTraining

    def synchronise(self):
        # Copy the Q-network weights into the TD-target network
        e1_params = [t for t in tf.trainable_variables() if t.name.startswith(self.QNetwork.scope)]
        e1_params = sorted(e1_params, key=lambda v: v.name)
        e2_params = [t for t in tf.trainable_variables() if t.name.startswith(self.TDTarget.scope)]
        e2_params = sorted(e2_params, key=lambda v: v.name)
        update_ops = []
        for e1_v, e2_v in zip(e1_params, e2_params):
            op = e2_v.assign(e1_v)
            update_ops.append(op)
        self.sess.run(update_ops)
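
# --- Interface sketch (added for illustration; not part of the original code) ---
# Both versions of Player delegate all TensorFlow work to a DQN wrapper class that
# lives in another module of the project. The stub below only documents the
# interface that Player relies on; the argument names, the constructor signature
# and the docstrings describing what each method computes are assumptions inferred
# from how Player calls these methods, not the real implementation.

class DQNInterfaceSketch:

    def __init__(self, imageSize, scope, miniBatchSize):
        self.scope = scope          # variable scope name; Player.synchronise() filters variables by it

    def setSess(self, sess):
        """Share one tf.Session between the Q-network and the TD-target network."""

    def evaluate(self, state):
        """Return the greedy action for a single state (used by Player.play())."""

    def computeTarget(self, nextStates, rewards):
        """Return the TD targets, presumably reward + gamma * max_a Q_target(nextState, a)."""

    def training(self, states, targets, actions):
        """Run one gradient step of the Q-network on a mini-batch."""

    def updateConstants(self, learningRate=None):
        """Optionally update the learning rate."""

    def saveQNetwork(self, path, global_step=None):
        """Save the network weights (checkpointing is handled inside the DQN class)."""

    def restoreQNetwork(self, path, global_step=None):
        """Restore the network weights from a saved checkpoint."""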