# Model-based ("imagination") DQN variant: one forward model per action predicts the next
# state, and a separate reward model scores states. External project helpers assumed:
# Memory (experience replay buffer) and Tree (simple binary tree node).
# Written against Keras 1.x / Python 2 (xrange, print statements), as in the original.
import random

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers.advanced_activations import LeakyReLU
from keras import optimizers


class DeepQ:
    def __init__(self, environment, inputs):
        self.environment = environment
        self.state_size = inputs
        self.nr_actions = environment.action_space.n
        self.memory = Memory(30000)
        self.discountFactor = 0.975
        self.predictionModels = []

    def initImaginationNetworks(self):
        # one state-prediction ("imagination") network per action
        for t in xrange(self.nr_actions):
            self.predictionModels.insert(t, self.createModel(
                self.state_size, self.state_size,
                [self.state_size, self.state_size, self.state_size], "relu", 0.01))

    def initRewardNetwork(self):
        # maps a state to its predicted value
        self.rewardModel = self.createModel(
            self.state_size, 1,
            [self.state_size, self.state_size, self.state_size], "relu", 0.01)

    def createModel(self, inputs, outputs, hiddenLayers, activationType, learningRate):
        model = Sequential()
        if len(hiddenLayers) == 0:
            model.add(Dense(outputs, input_shape=(inputs,), init='lecun_uniform'))
            model.add(Activation("linear"))
        else:
            model.add(Dense(hiddenLayers[0], input_shape=(inputs,), init='lecun_uniform'))
            if activationType == "LeakyReLU":
                model.add(LeakyReLU(alpha=0.01))
            else:
                model.add(Activation(activationType))
            for index in range(1, len(hiddenLayers) - 1):
                layerSize = hiddenLayers[index]
                model.add(Dense(layerSize, init='lecun_uniform'))
                if activationType == "LeakyReLU":
                    model.add(LeakyReLU(alpha=0.01))
                else:
                    model.add(Activation(activationType))
            model.add(Dense(outputs, init='lecun_uniform'))
            model.add(Activation("linear"))
        optimizer = optimizers.RMSprop(lr=learningRate, rho=0.9, epsilon=1e-06)
        model.compile(loss="mse", optimizer=optimizer)
        return model

    def backupNetwork(self, model, backup):
        # copy weights layer by layer from model into backup
        weightMatrix = []
        for layer in model.layers:
            weightMatrix.append(layer.get_weights())
        i = 0
        for layer in backup.layers:
            layer.set_weights(weightMatrix[i])
            i += 1

    def getStatePrediction(self, state, action):
        predicted = self.predictionModels[action].predict(state.reshape(1, len(state)))
        return predicted[0]

    def getPredictedStates(self, state):
        predictedStates = []
        for a in xrange(self.nr_actions):
            predictedStates.insert(a, self.getStatePrediction(state, a))
        return predictedStates

    def getStateValuePrediction(self, state):
        predictedReward = self.rewardModel.predict(state.reshape(1, len(state)))
        return predictedReward[0][0]

    def getPredictedActionValues(self, state):
        # value of each action = predicted value of the state it is imagined to lead to
        predictedActionValues = []
        for a in xrange(self.nr_actions):
            predictedActionValues.insert(a, self.getStateValuePrediction(self.getStatePrediction(state, a)))
        return predictedActionValues

    def getMaxValue(self, array):
        return np.max(array)

    def getMaxIndex(self, array):
        return np.argmax(array)

    def getTarget(self, state, reward, isFinal):
        if isFinal:
            return reward
        else:
            predictedActionValues = self.getPredictedActionValues(state)
            # return reward + self.discountFactor * (sum(predictedActionValues) / len(predictedActionValues))
            return reward + self.discountFactor * np.max(predictedActionValues)

    def printStatePredictionTree(self, state):
        root = Tree()
        # first layer
        predicted1 = self.getPredictedStates(state)
        root.data = state
        root.left = Tree()
        root.left.data = predicted1[0]
        root.right = Tree()
        root.right.data = predicted1[1]
        # second layer
        predicted2left = self.getPredictedStates(predicted1[0])
        root.left.left = Tree()
        root.left.left.data = predicted2left[0]
        root.left.right = Tree()
        root.left.right.data = predicted2left[1]
        predicted2right = self.getPredictedStates(predicted1[1])
        root.right.left = Tree()
        root.right.left.data = predicted2right[0]
        root.right.right = Tree()
        root.right.right.data = predicted2right[1]
        print ""
        print "\t\t\t\t\t\t\t\t\t\t", root.data
        print "\t\t\t\t", root.left.data, "\t\t\t\t\t\t\t", root.right.data
        print root.left.left.data, "\t", root.left.right.data, "\t", root.right.left.data, "\t", root.right.right.data

    def printStateValueTree(self, state):
        root = Tree()
        # first layer
        predicted1 = self.getPredictedStates(state)
        root.data = state
        root.left = Tree()
        root.left.data = predicted1[0]
        root.right = Tree()
        root.right.data = predicted1[1]
        # second layer
        predicted2left = self.getPredictedStates(predicted1[0])
        root.left.left = Tree()
        root.left.left.data = predicted2left[0]
        root.left.right = Tree()
        root.left.right.data = predicted2left[1]
        predicted2right = self.getPredictedStates(predicted1[1])
        root.right.left = Tree()
        root.right.left.data = predicted2right[0]
        root.right.right = Tree()
        root.right.right.data = predicted2right[1]
        print ""
        print "\t\t\t\t\t\t\t\t\t\t", self.getStateValuePrediction(root.data)
        print "\t\t\t\t", self.getStateValuePrediction(root.left.data), "\t\t\t\t\t\t\t\t\t\t\t", self.getStateValuePrediction(root.right.data)
        print self.getStateValuePrediction(root.left.left.data), "\t\t\t\t\t", self.getStateValuePrediction(root.left.right.data), "\t\t\t\t\t", self.getStateValuePrediction(root.right.left.data), "\t\t\t\t\t", self.getStateValuePrediction(root.right.right.data)

    # select the action with the highest predicted value
    def selectAction(self, state, explorationRate):
        rand = random.random()
        if rand < explorationRate:
            action = np.random.randint(0, self.nr_actions)
        else:
            action = self.getMaxIndex(self.getPredictedActionValues(state))
        return action

    def selectActionStepsForward(self, state, depth):
        # two-step lookahead for a binary action space: imagine both next states, then look
        # one step further and keep the best predicted value found on each branch
        predicted1 = self.getPredictedStates(state)
        leftMax = self.getStateValuePrediction(predicted1[0])
        rightMax = self.getStateValuePrediction(predicted1[1])
        predicted2left = self.getPredictedActionValues(predicted1[0])
        leftMax = max(leftMax, np.max(predicted2left))
        predicted2right = self.getPredictedActionValues(predicted1[1])
        rightMax = max(rightMax, np.max(predicted2right))
        if rightMax > leftMax:
            return 1
        else:
            return 0

    def addMemory(self, state, action, reward, newState, isFinal):
        self.memory.addMemory(state, action, reward, newState, isFinal)

    def trainStatePredictionOnLastState(self):
        X_batch = np.empty((0, self.state_size), dtype=np.float64)
        Y_batch = np.empty((0, self.state_size), dtype=np.float64)
        lastMemory = self.memory.getLastMemory()
        state = lastMemory['state']
        action = lastMemory['action']
        newState = lastMemory['newState']
        X_batch = np.append(X_batch, [state], axis=0)
        Y_batch = np.append(Y_batch, [newState], axis=0)
        self.predictionModels[action].fit(X_batch, Y_batch, batch_size=len(X_batch), verbose=0)

    def trainStatePreditions(self, miniBatchSize):
        # group samples by action and fit each action's prediction model on its own batch
        X_batches = []
        Y_batches = []
        for t in xrange(self.nr_actions):
            X_batches.append(np.empty((0, self.state_size), dtype=np.float64))
            Y_batches.append(np.empty((0, self.state_size), dtype=np.float64))
        miniBatch = self.memory.getMiniBatch(miniBatchSize)
        for sample in miniBatch:
            state = sample['state']
            action = sample['action']
            newState = sample['newState']
            X_batches[action] = np.append(X_batches[action], np.array([state.copy()]), axis=0)
            Y_batches[action] = np.append(Y_batches[action], np.array([newState.copy()]), axis=0)
        for a in xrange(self.nr_actions):
            if len(X_batches[a]) > 0:
                self.predictionModels[a].fit(
                    X_batches[a].reshape(len(X_batches[a]), self.state_size),
                    Y_batches[a], batch_size=len(X_batches[a]), verbose=0)

    def trainRewardModel(self, miniBatchSize):
        miniBatch = self.memory.getMiniBatch(miniBatchSize)
        X_batch = np.empty((0, self.state_size), dtype=np.float64)
        Y_batch = np.empty((0, 1), dtype=np.float64)
        for sample in miniBatch:
            isFinal = sample['isFinal']
            reward = sample['reward']
            newState = sample['newState']
            targetValue = [self.getTarget(newState, reward, isFinal)]
            X_batch = np.append(X_batch, np.array([newState.copy()]), axis=0)
            Y_batch = np.append(Y_batch, [targetValue], axis=0)
        self.rewardModel.fit(X_batch, Y_batch, batch_size=len(miniBatch), verbose=0)
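# A minimal usage sketch for the imagination-based DeepQ above, assuming a Gym environment
# with a discrete action space (e.g. CartPole-v0) and the project's Memory class. The loop
# structure, the constants EPISODES/EXPLORATION, and the per-step training cadence are
# illustrative assumptions, not part of the original code.
import gym

env = gym.make("CartPole-v0")
agent = DeepQ(env, env.observation_space.shape[0])
agent.initImaginationNetworks()
agent.initRewardNetwork()

EPISODES = 200
EXPLORATION = 0.1
for episode in xrange(EPISODES):
    state = env.reset()
    done = False
    while not done:
        action = agent.selectAction(state, EXPLORATION)
        newState, reward, done, _ = env.step(action)
        agent.addMemory(state, action, reward, newState, done)
        # fit the per-action forward models and the reward/value model on replayed samples
        # (in practice you would wait until the memory holds enough samples)
        agent.trainStatePreditions(32)
        agent.trainRewardModel(32)
        state = newState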
# DDPG agent (actor-critic with target networks and Ornstein-Uhlenbeck exploration noise).
# External project pieces assumed: AbstractAgent, ActorNetwork, CriticNetwork, OUProcess,
# Memory, Grapher, and the configuration constants (BATCH_SIZE, TAU, LRA, LRC, GAMMA,
# EPISODE_LENGTH, EPSILON_RANGE, SAVE_WEIGHTS_PREFIX, ...).
import json
import math
import time

import cv2
import numpy as np
from keras import backend as K


class APLDDPGAgent(AbstractAgent):
    name = "apl_ddpg"

    def __init__(self, env, iter=200000, *args, **kwargs):
        self.env = env
        # count the controllable outputs exposed by the environment
        self.action_dim = sum(sum(1 for i in row if i) for row in self.env.action_space.sample())
        self.observation = env.reset()
        self.state_dim = self.observation.shape
        print("state dim " + str(self.state_dim))
        self.nn_action_dim = 6  # limit the DDPG network output to 3 DOF
        self.noise = OUProcess(self.nn_action_dim, mu=OU_MEAN, theta=OU_THETA, sigma=EPSILON_RANGE[0])

    def fit(self, *args, **kwargs):
        MEM_SZ = MEM_SIZE_FCL

        sess = K.get_session()
        K.set_learning_phase(1)

        # create the actor and critic models (each with its own target copy)
        self.actor = ActorNetwork(sess, self.state_dim, self.nn_action_dim, BATCH_SIZE, TAU, LRA,
                                  convolutional=CONVOLUTIONAL, output_activation=ACTION_ACTIVATION)
        self.critic = CriticNetwork(sess, self.state_dim, self.nn_action_dim, BATCH_SIZE, TAU, LRC,
                                    convolutional=CONVOLUTIONAL)
        self.memory = Memory(MEM_SZ)

        self.actor.target_model.summary()
        self.critic.target_model.summary()

        if LOAD_WEIGHTS:
            self.actor.model.load_weights(LOAD_WEIGHTS_PREFIX + "actor_model_" + LOAD_WEIGHTS_EPISODE + ".h5")
            self.critic.model.load_weights(LOAD_WEIGHTS_PREFIX + "critic_model_" + LOAD_WEIGHTS_EPISODE + ".h5")
            self.actor.target_model.load_weights(LOAD_WEIGHTS_PREFIX + "actor_target_model_" + LOAD_WEIGHTS_EPISODE + ".h5")
            self.critic.target_model.load_weights(LOAD_WEIGHTS_PREFIX + "critic_target_model_" + LOAD_WEIGHTS_EPISODE + ".h5")
            print("Weights Loaded!")

        # (an earlier version kept one OUProcess per action in self.noise_procs;
        #  a single vector-valued OUProcess is used instead)

        PRE_LEARNING_EPISODES = STARTING_EPISODE + PRE_LEARNING_EPS
        steps = STARTING_EPISODE * EPISODE_LENGTH
        start_time = time.time()
        last_ep_time = time.time()
        if MAKE_PLOT:
            reward_graph = Grapher()

        for ep in range(STARTING_EPISODE, EPISODES):
            # reset the exploration-noise process at the start of every episode
            self.noise.reset()

            # restart the time counter once the pre-learning phase is over
            if ep == PRE_LEARNING_EPISODES:
                start_time = time.time()

            print("Episode: " + str(ep) + " Frames: " + str(ep * EPISODE_LENGTH) +
                  " Uptime: " + str((time.time() - start_time) / 3600.0) + " hrs ===========")

            state = self.env.reset()
            play_only = (ep % 10 == 0)
            total_reward = 0

            if play_only or ALREADY_TRAINED:
                # evaluation episode: greedy policy from the target actor, no learning
                for step in range(TEST_EPISODE_LENGTH):
                    state = np.reshape(state, state.shape + (1,))
                    action, control_action = self.selectAction(state, can_be_random=False, use_target=True)
                    nstate, reward, done, info = self.env.step(control_action)
                    total_reward += reward
                    state = nstate
            else:
                for step in range(EPISODE_LENGTH):
                    # ACT ==============================
                    epsilon = (float(steps) / float(EPSILON_STEPS)) * (EPSILON_RANGE[1] - EPSILON_RANGE[0]) + EPSILON_RANGE[0]
                    state = np.reshape(state, state.shape + (1,))
                    action, control_action = self.selectAction(state, epsilon=epsilon)
                    new_state, reward, done, info = self.env.step(control_action)
                    done = done or (step >= EPISODE_LENGTH - 1)  # mark the last step of the episode as terminal
                    self.memory.addMemory(state, action, reward, new_state, done)
                    state = new_state

                    # LEARN ============================
                    if ep > PRE_LEARNING_EPISODES:
                        batch, idxs = self.memory.getMiniBatch(BATCH_SIZE)
                        self.learnFromBatch(batch)

                    if done:
                        break

                    # CLEANUP ==========================
                    steps += 1

            # only the noise-free episodes tell us how the system is actually doing
            if play_only and MAKE_PLOT:
                reward_graph.addSample(total_reward)
                reward_graph.displayPlot()

            # frames per hour over all learning frames so far
            total_frames = (ep - PRE_LEARNING_EPISODES) * EPISODE_LENGTH
            elapsed = time.time() - start_time
            fps = total_frames / elapsed
            fph = fps * 3600.0

            # re-calculate fps on this episode alone, so it updates quickly
            fps = EPISODE_LENGTH / (time.time() - last_ep_time)
            last_ep_time = time.time()
            print("fps: " + str(fps) + " fph: " + str(fph) + "\n")

            # save plot and weights
            if (ep > 0 and ep % EPISODE_SAVE_FREQUENCY == 0) and not ALREADY_TRAINED:
                if MAKE_PLOT:
                    reward_graph.savePlot(SAVE_WEIGHTS_PREFIX + "graph_" + str(ep) + ".jpg")
                self.actor.model.save_weights(SAVE_WEIGHTS_PREFIX + "actor_model_" + str(ep) + ".h5", overwrite=True)
                self.actor.target_model.save_weights(SAVE_WEIGHTS_PREFIX + "actor_target_model_" + str(ep) + ".h5", overwrite=True)
                self.critic.model.save_weights(SAVE_WEIGHTS_PREFIX + "critic_model_" + str(ep) + ".h5", overwrite=True)
                self.critic.target_model.save_weights(SAVE_WEIGHTS_PREFIX + "critic_target_model_" + str(ep) + ".h5", overwrite=True)
                # network structures (rarely needed, but kept for completeness)
                with open(SAVE_WEIGHTS_PREFIX + "actor_model_" + str(ep) + ".json", "w") as outfile:
                    json.dump(self.actor.model.to_json(), outfile)
                with open(SAVE_WEIGHTS_PREFIX + "actor_target_model_" + str(ep) + ".json", "w") as outfile:
                    json.dump(self.actor.target_model.to_json(), outfile)
                with open(SAVE_WEIGHTS_PREFIX + "critic_model_" + str(ep) + ".json", "w") as outfile:
                    json.dump(self.critic.model.to_json(), outfile)
                with open(SAVE_WEIGHTS_PREFIX + "critic_target_model_" + str(ep) + ".json", "w") as outfile:
                    json.dump(self.critic.target_model.to_json(), outfile)

    def learnFromBatch(self, miniBatch):
        dones = np.asarray([sample['isFinal'] for sample in miniBatch])
        states = np.asarray([sample['state'] for sample in miniBatch])
        actions = np.asarray([sample['action'] for sample in miniBatch])
        new_states = np.asarray([sample['newState'] for sample in miniBatch])
        Y_batch = np.asarray([sample['reward'] for sample in miniBatch])

        new_states = np.reshape(new_states, new_states.shape + (1,))
        target_q_values = self.critic.target_model.predict(
            [new_states, self.actor.target_model.predict(new_states)])

        # Bellman backup: r + gamma * Q'(s', mu'(s')) for non-terminal transitions
        for i in range(len(miniBatch)):
            if not dones[i]:
                Y_batch[i] = Y_batch[i] + GAMMA * target_q_values[i]

        self.critic.model.train_on_batch([states, actions], Y_batch)

        # additional operations to train the actor along the critic's action-gradients
        temp_actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, temp_actions)
        self.actor.train(states, grads)

        # soft-update the target networks
        self.actor.target_train()
        self.critic.target_train()

    def clip(self, x, minx, maxx):
        return max(minx, min(maxx, x))

    def selectAction(self, state, can_be_random=True, use_target=False, epsilon=1.0, permutation_num=0):
        state = np.array([state])  # add a dimension to make a "batch" of 1
        if use_target:
            actions = self.actor.target_model.predict(state)
        else:
            actions = self.actor.model.predict(state)
        actions = np.squeeze(actions)

        if can_be_random:
            # add OU exploration noise, scaled by the current epsilon
            self.noise.sigma = epsilon
            noise = self.noise.noise()
            for i in range(len(actions)):
                actions[i] = actions[i] + noise[i]
                actions[i] = self.clip(actions[i], -3.14, 3.14)  # assign back to actions[i], not to a loop copy

        # fill in zeros for all non-learned outputs
        control_actions = np.pad(actions, (0, self.action_dim - len(actions)), 'constant')
        return actions, control_actions

    # constructs an image from the state vector
    def constructImageRepresentation(self, state):
        img = np.empty([IMAGE_SIDE_LENGTH, IMAGE_SIDE_LENGTH], dtype=np.uint8)
        img.fill(128)

        color = 255
        delta_color = int(math.floor(128 / NUM_TARGETS))
        for j in range(NUM_TARGETS):
            tar = [state[2 * j], state[2 * j + 1]]
            cv2.circle(img, (int(tar[0] * IMAGE_SIDE_LENGTH), int(tar[1] * IMAGE_SIDE_LENGTH)), 5, 0, -1)
            cv2.circle(img, (int(tar[0] * IMAGE_SIDE_LENGTH), int(tar[1] * IMAGE_SIDE_LENGTH)), 4, color, -1)
            color -= delta_color

        color = 0
        for j in range(NUM_AGENTS):
            offset = 2 * NUM_TARGETS
            agent = [state[offset + 2 * j], state[offset + 2 * j + 1]]
            # draw a blank agent, no thrust display
            cv2.rectangle(img,
                          (int(agent[0] * IMAGE_SIDE_LENGTH) - 4, int(agent[1] * IMAGE_SIDE_LENGTH) - 1),
                          (int(agent[0] * IMAGE_SIDE_LENGTH) + 4, int(agent[1] * IMAGE_SIDE_LENGTH) + 1), color, -1)
            cv2.rectangle(img,
                          (int(agent[0] * IMAGE_SIDE_LENGTH) - 1, int(agent[1] * IMAGE_SIDE_LENGTH) - 4),
                          (int(agent[0] * IMAGE_SIDE_LENGTH) + 1, int(agent[1] * IMAGE_SIDE_LENGTH) + 4), color, -1)
            # the first agent is 0 since we control it; the others share the same color
            color = 64

        img = np.array([np.subtract(img, 128)], dtype=np.float32)  # zero center
        img = np.multiply(img, 1.0 / 128.0)  # scale to [-1, 1]
        img = np.transpose(img, (1, 2, 0))
        return img

    # for the co-op case, get an arrangement of the state vector for each agent
    def getStatePermutations(self, state):
        perms = []
        for i in range(NUM_AGENTS):
            if CONVOLUTIONAL and not DRAW_STATE:
                perms.append(state)
            else:
                pstate = []
                # copy over target data
                for j in range(NUM_TARGETS * 2):
                    pstate.append(state[j])
                # copy agent data, rotated per agent
                for j in range(NUM_AGENTS * 2):
                    rot_j = (j + (i * 2)) % (NUM_AGENTS * 2) + (NUM_TARGETS * 2)
                    pstate.append(state[rot_j])
                if DRAW_STATE:
                    perms.append(self.constructImageRepresentation(pstate))
                else:
                    perms.append(np.asarray(pstate, dtype=np.float32))
        return perms
# Simple DQN on the TensorFlow 1.x graph API: a single hidden fully connected layer Q-network
# with an experience-replay buffer (the Memory class comes from the surrounding project).
import numpy as np
import tensorflow as tf


class dqn:
    def __init__(self, learning_rate, minibatch_size, gamma, state_space, action_space):
        self.state_space = state_space
        self.action_space = action_space
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.minibatch_size = minibatch_size
        self.replay_memory_size = 10000
        self.experience_buffer = Memory(self.replay_memory_size)
        self.init_network()
        tf.summary.FileWriter("logs/", self.sess.graph)
        self.sess.run(tf.global_variables_initializer())
        self.current_loss = 0.0

    def init_network(self):
        tf.reset_default_graph()

        # --- evaluation Q-network ---
        # the input is the observation
        self.s = tf.placeholder(dtype=tf.float32, shape=[None, self.state_space], name="observation")

        # fully connected layer 1
        w_fc1 = tf.Variable(tf.truncated_normal([self.state_space, 1024], stddev=0.01))
        b_fc1 = tf.Variable(tf.zeros([1024]))
        layer1 = tf.nn.relu(tf.matmul(self.s, w_fc1) + b_fc1)

        # output layer: one Q-value per action
        w_out = tf.Variable(tf.truncated_normal([1024, self.action_space], stddev=0.01))
        b_out = tf.Variable(tf.zeros([self.action_space]))
        self.Qout = tf.matmul(layer1, w_out) + b_out

        # target Q-values computed from the next state
        self.Qout_next = tf.placeholder(tf.float32, [None, self.action_space])

        # --- loss function and optimizer ---
        self.loss = tf.reduce_mean(tf.square(self.Qout_next - self.Qout))
        optimizer = tf.train.RMSPropOptimizer(self.learning_rate)
        self.trainer = optimizer.minimize(self.loss)

        self.saver = tf.train.Saver()
        self.sess = tf.Session()

    def get_Q_values(self, state):
        return self.sess.run(self.Qout, feed_dict={self.s: [state]})[0]

    def store_experience(self, state, action, reward, nextState, done):
        self.experience_buffer.addMemory(state, action, reward, nextState, done)

    def replay_experience(self):
        if self.experience_buffer.getCurrentSize() > self.minibatch_size:
            state_miniBatch = []
            qout_miniBatch = []
            size = min(self.experience_buffer.getCurrentSize(), self.minibatch_size)
            miniBatch = self.experience_buffer.getMiniBatch(size)
            for sample in miniBatch:
                done = sample['isFinal']
                state = sample['state']
                action = sample['action']
                reward = sample['reward']
                newState = sample['newState']
                qValues = self.get_Q_values(state)
                # Bellman target for the action that was actually taken
                if done:
                    qValues[action] = reward
                else:
                    qValues[action] = reward + self.gamma * np.max(self.get_Q_values(newState))
                state_miniBatch.append(state)
                qout_miniBatch.append(qValues)
            # train on the minibatch and record the current loss
            self.sess.run(self.trainer,
                          feed_dict={self.s: state_miniBatch, self.Qout_next: qout_miniBatch})
            self.current_loss = self.sess.run(self.loss,
                                              feed_dict={self.s: state_miniBatch, self.Qout_next: qout_miniBatch})

    def load_model(self, model_path=None):
        if model_path:
            # load from an explicit checkpoint file
            self.saver.restore(self.sess, model_path)
        else:
            # load the latest checkpoint from the default directory
            checkpoint = tf.train.get_checkpoint_state('/home/kin/python/q_learning/checkpoint')
            if checkpoint and checkpoint.model_checkpoint_path:
                self.saver.restore(self.sess, checkpoint.model_checkpoint_path)

    def save_model(self):
        self.saver.save(self.sess, '/home/kin/python/q_learning/saved_model.ckpt')
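# A minimal usage sketch for the TensorFlow `dqn` class above, assuming a Gym environment with a
# flat observation vector and the project's Memory class. The epsilon-greedy loop and the constants
# EPISODES/EPSILON are illustrative assumptions, not part of the original code.
import random

import gym
import numpy as np

env = gym.make("CartPole-v0")
agent = dqn(learning_rate=0.00025, minibatch_size=32, gamma=0.99,
            state_space=env.observation_space.shape[0],
            action_space=env.action_space.n)

EPISODES = 500
EPSILON = 0.1
for episode in range(EPISODES):
    state = env.reset()
    done = False
    while not done:
        if random.random() < EPSILON:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(agent.get_Q_values(state)))
        next_state, reward, done, _ = env.step(action)
        agent.store_experience(state, action, reward, next_state, done)
        agent.replay_experience()  # trains once the buffer holds more than a minibatch
        state = next_state

agent.save_model()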
# DQN with a separate target network, written against the tf.keras API. The Memory replay
# buffer comes from the surrounding project.
import random

import numpy as np
import tensorflow as tf


class DeepQ:
    def __init__(self, inputs, outputs, memorySize, discountFactor, learningRate, learnStart):
        self.input_size = inputs
        self.output_size = outputs
        self.memory = Memory(memorySize)
        self.discountFactor = discountFactor
        self.learnStart = learnStart
        self.learningRate = learningRate

    def initNetwork(self, hiddenLayers):
        model = self.createModel(self.input_size, self.output_size, hiddenLayers, "relu", self.learningRate)
        self.model = model
        targetModel = self.createModel(self.input_size, self.output_size, hiddenLayers, "relu", self.learningRate)
        self.targetModel = targetModel

    def createModel(self, inputs, outputs, hiddenLayers, activationType, learningRate):
        bias = True
        dropout = 0
        regularizationFactor = 0.01
        model = tf.keras.models.Sequential()
        if len(hiddenLayers) == 0:
            model.add(tf.keras.layers.Dense(self.input_size, input_shape=(self.input_size,),
                                            kernel_initializer='lecun_uniform', use_bias=bias))
            model.add(tf.keras.layers.Activation("linear"))
        else:
            model.add(tf.keras.layers.Dense(hiddenLayers[0], input_shape=(self.input_size,),
                                            kernel_initializer='lecun_uniform',
                                            kernel_regularizer=tf.keras.regularizers.l2(regularizationFactor),
                                            use_bias=bias))
            if activationType == "LeakyReLU":
                model.add(tf.keras.layers.LeakyReLU(alpha=0.01))
            else:
                model.add(tf.keras.layers.Activation(activationType))
            for index in range(1, len(hiddenLayers) - 1):
                layerSize = hiddenLayers[index]
                model.add(tf.keras.layers.Dense(layerSize, kernel_initializer='lecun_uniform',
                                                kernel_regularizer=tf.keras.regularizers.l2(regularizationFactor),
                                                use_bias=bias))
                if dropout > 0:
                    model.add(tf.keras.layers.Dropout(dropout))
                if activationType == "LeakyReLU":
                    model.add(tf.keras.layers.LeakyReLU(alpha=0.01))
                else:
                    model.add(tf.keras.layers.Activation(activationType))
            model.add(tf.keras.layers.Dense(self.output_size, kernel_initializer='lecun_uniform', use_bias=bias))
            model.add(tf.keras.layers.Activation("linear"))
        optimizer = tf.keras.optimizers.RMSprop(lr=learningRate, rho=0.9, epsilon=1e-06)
        model.compile(loss="mse", optimizer=optimizer)
        return model

    def printNetwork(self):
        i = 0
        for layer in self.model.layers:
            weights = layer.get_weights()
            print("layer ", i, " : ", weights)
            i += 1

    # copy the current network into the backup (target) model
    def backupNetwork(self, model, backup):
        weightMatrix = []
        for layer in model.layers:
            weightMatrix.append(layer.get_weights())
        i = 0
        for layer in backup.layers:
            layer.set_weights(weightMatrix[i])
            i += 1

    def updateTargetNetwork(self):
        self.backupNetwork(self.model, self.targetModel)

    def getQValues(self, state):
        predicted = self.model.predict(state.reshape(1, len(state)))
        return predicted[0]

    def getTargetQValues(self, state):
        predicted = self.targetModel.predict(state.reshape(1, len(state)))
        return predicted[0]

    def getMaxQ(self, qValues):
        return np.max(qValues)

    def getMaxIndex(self, qValues):
        return np.argmax(qValues)

    # calculate the target function
    def calculateTarget(self, qValuesNewState, reward, isFinal):
        if isFinal:
            return reward
        else:
            return reward + self.discountFactor * self.getMaxQ(qValuesNewState)

    # select the action with the highest Q value
    def selectAction(self, qValues, explorationRate):
        rand = random.random()
        if rand < explorationRate:
            action = np.random.randint(0, self.output_size)
        else:
            action = self.getMaxIndex(qValues)
        return action

    def selectionActionByProbability(self, qValues, bias):
        # shift Q-values to be positive, raise them to `bias`, and sample proportionally
        qValueSum = 0
        shiftBy = 0
        for value in qValues:
            if value + shiftBy < 0:
                shiftBy = -(value + shiftBy)
        shiftBy += 1e-06
        for value in qValues:
            qValueSum += (value + shiftBy) ** bias
        probabilitySum = 0
        qValueProbabilities = []
        for value in qValues:
            probability = ((value + shiftBy) ** bias) / float(qValueSum)
            qValueProbabilities.append(probability + probabilitySum)
            probabilitySum += probability
        qValueProbabilities[len(qValueProbabilities) - 1] = 1
        rand = random.random()
        i = 0
        for value in qValueProbabilities:
            if rand <= value:
                return i
            i += 1

    def addMemory(self, state, action, reward, newState, isFinal):
        self.memory.addMemory(state, action, reward, newState, isFinal)

    def learnOnLastState(self):
        if self.memory.getCurrentSize() >= 1:
            return self.memory.getMemory(self.memory.getCurrentSize() - 1)

    def learnOnMiniBatch(self, miniBatchSize):
        if self.memory.getCurrentSize() > self.learnStart:
            miniBatch = self.memory.getMiniBatch(miniBatchSize)
            X_batch = np.empty((0, self.input_size), dtype=np.float64)
            Y_batch = np.empty((0, self.output_size), dtype=np.float64)
            for sample in miniBatch:
                isFinal = sample['isFinal']
                state = sample['state']
                action = sample['action']
                reward = sample['reward']
                newState = sample['newState']
                qValues = self.getQValues(state)
                qValuesNewState = self.getTargetQValues(newState)
                targetValue = self.calculateTarget(qValuesNewState, reward, isFinal)
                X_batch = np.append(X_batch, np.array([state.copy()]), axis=0)
                Y_sample = qValues.copy()
                Y_sample[action] = targetValue
                Y_batch = np.append(Y_batch, np.array([Y_sample]), axis=0)
                if isFinal:
                    X_batch = np.append(X_batch, np.array([newState.copy()]), axis=0)
                    Y_batch = np.append(Y_batch, np.array([[reward] * self.output_size]), axis=0)
            self.model.fit(X_batch, Y_batch, batch_size=len(miniBatch), epochs=1, verbose=0)

    def saveModel(self):
        model_json = self.model.to_json()
        with open("model.json", "w") as json_file:
            json_file.write(model_json)
        self.model.save_weights("model.h5")
        target_model_json = self.targetModel.to_json()
        with open("target_model.json", "w") as json_file:
            json_file.write(target_model_json)
        self.targetModel.save_weights("target_model.h5")

    def loadModel(self):
        json_file = open("model.json", "r")
        loaded_model_json = json_file.read()
        json_file.close()
        loaded_model = tf.keras.models.model_from_json(loaded_model_json)
        loaded_model.load_weights("model.h5")
        self.model = loaded_model

        json_file = open("target_model.json", "r")
        loaded_model_json = json_file.read()
        json_file.close()
        loaded_model = tf.keras.models.model_from_json(loaded_model_json)
        loaded_model.load_weights("target_model.h5")
        self.targetModel = loaded_model
# DQN with a convolutional Q-network on image input (Keras), plus a target network and
# TensorBoard logging. External project pieces assumed: Memory (replay buffer) and utils
# (provides INPUT_SHAPE).
import os
import random
import time

import numpy as np
from keras.models import Sequential, load_model
from keras.layers import Lambda, Conv2D, Flatten, Dense
from keras import optimizers
from keras.callbacks import TensorBoard

import utils


class DeepQ:
    def __init__(self, outputs, memorySize, discountFactor, learningRate, learnStart):
        """
        Parameters:
        - outputs: output size (number of actions)
        - memorySize: size of the replay memory that stores each transition
        - discountFactor: the discount factor (gamma)
        - learningRate: learning rate
        - learnStart: number of steps to collect before learning starts (e.g. 128)
        """
        self.output_size = outputs
        self.memory = Memory(memorySize)
        self.discountFactor = discountFactor
        self.learnStart = learnStart
        self.learningRate = learningRate
        self.model = self.createModel(True)
        self.targetModel = self.createModel()  # target network, kept to add stability to training

    def createModel(self, record=False):
        model = Sequential()
        # normalize the image to avoid saturation and make the gradients work better
        # (127.5 - 1.0 is an experimental value from the Udacity self-driving-car course)
        model.add(Lambda(lambda x: x / 127.5 - 1.0, input_shape=utils.INPUT_SHAPE))
        # 32 8x8 convolution kernels with 4x4 stride and ReLU activation
        model.add(Conv2D(32, 8, strides=4, activation="relu", kernel_initializer='lecun_uniform'))
        model.add(Conv2D(64, 4, strides=2, activation="relu", kernel_initializer='lecun_uniform'))
        model.add(Conv2D(64, 3, strides=1, activation="relu", kernel_initializer='lecun_uniform'))
        model.add(Flatten())
        model.add(Dense(512, activation="relu", kernel_initializer='lecun_uniform'))
        model.add(Dense(3, activation="linear"))  # 3 outputs for the 3 different actions
        # alternative from deepq.py: optimizers.RMSprop(lr=self.learningRate, rho=0.9, epsilon=1e-06)
        model.compile(loss="mean_squared_error", optimizer=optimizers.Adam(self.learningRate))
        if record:
            timeStamp = time.time()
            path = os.path.dirname(os.path.realpath(__file__))  # get this python file's path
            self.tensorboard = TensorBoard(log_dir="{}/logs/{}".format(path, timeStamp))
            print("Run `tensorboard --logdir={}/logs/{}` to see CNN status".format(path, timeStamp))
        model.summary()
        return model

    # To keep training stable, we back up a target network and use it to provide a consistent
    # policy while the online network is being trained.
    def backupNetwork(self, model, backup):
        weightMatrix = []
        for layer in model.layers:
            weightMatrix.append(layer.get_weights())
        i = 0
        for layer in backup.layers:
            layer.set_weights(weightMatrix[i])
            i += 1

    def updateTargetNetwork(self):
        self.backupNetwork(self.model, self.targetModel)
        print("Target model updated")

    # Train the network to approximate the Bellman equation `r + gamma * max_a' Q(s', a')`
    # using minibatches drawn from the replay memory (experience replay).
    def learn(self, size):
        # X = numpy array of input states, Y = numpy array of target Q-values
        # batch_size = samples per gradient update
        # do not learn until we've got self.learnStart samples
        if self.memory.getCurrentSize() > self.learnStart:
            batch = self.memory.getMiniBatch(size)
            X_batch = np.empty((0, utils.INPUT_SHAPE[0], utils.INPUT_SHAPE[1], utils.INPUT_SHAPE[2]), dtype=np.float64)
            Y_batch = np.empty((0, self.output_size), dtype=np.float64)
            for sample in batch:
                state = sample['state']
                qValues = self.getQValues(state)                            # model-predicted Q(s, a)
                qTargetValues = self.getTargetQValues(sample['newState'])   # target-model-predicted Q'(s', a')
                targetValue = self.calculateTarget(qTargetValues, sample['reward'], sample['isFinal'])  # estimated Bellman target
                # input states with the corresponding target values for training
                X_batch = np.append(X_batch, np.array(state.copy()), axis=0)
                # teach the network to predict the discounted reward of taking the optimal action at state s:
                # every output keeps its predicted Q(s, a) except the action taken, so the error on the other
                # actions stays 0
                Y_sample = qValues.copy()
                Y_sample[0][sample['action']] = targetValue
                Y_batch = np.append(Y_batch, np.array(Y_sample), axis=0)
                # for terminal transitions, also train the new state against the final reward
                # (instead of the discounted Bellman estimate)
                if sample["isFinal"]:
                    X_batch = np.append(X_batch, np.array(sample['newState'].copy()), axis=0)
                    Y_batch = np.append(Y_batch, np.full((1, 3), sample['reward']), axis=0)  # 3 = number of output neurons
            history = self.model.fit(X_batch, Y_batch, batch_size=len(batch), epochs=1, verbose=0,
                                     callbacks=[self.tensorboard])
            print("Loss: " + str(history.history['loss']))
            # monitor progress via `tensorboard --logdir=logs/`

    # predict Q values for all the actions
    def getQValues(self, state):
        return self.model.predict(state)

    def getTargetQValues(self, state):
        return self.targetModel.predict(state)

    def saveModel(self, filepath):
        self.model.save(filepath)

    def loadModel(self, filepath):
        self.model = load_model(filepath)

    def loadWeights(self, filepath):
        self.model.set_weights(load_model(filepath).get_weights())

    def getMaxQ(self, qValues):
        return np.max(qValues)

    def getMaxIndex(self, qValues):
        return np.argmax(qValues)

    # calculate the target function
    def calculateTarget(self, qValuesNewState, reward, isFinal):
        """
        Target = reward(s, a) + gamma * max(Q(s'))   (Bellman equation)
        """
        if isFinal:
            return reward
        else:
            # gamma * max(Q(s')) is an approximation, but it improves as the network is trained
            return reward + self.discountFactor * self.getMaxQ(qValuesNewState)

    # select the action with the highest Q value
    def selectAction(self, qValues, explorationRate):
        # explorationRate is in [0, 1]
        rand = random.random()
        if rand < explorationRate:
            action = np.random.randint(0, self.output_size)
        else:
            action = self.getMaxIndex(qValues)
        return action

    def addMemory(self, state, action, reward, newState, isFinal):
        self.memory.addMemory(state, action, reward, newState, isFinal)
# DQN variant that keeps several independently trained Q-networks (an ensemble) and offers
# different strategies for combining their Q-values when picking an action. The Memory replay
# buffer comes from the surrounding project; Keras 1.x style as in the other listings.
import random

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers.advanced_activations import LeakyReLU
from keras import optimizers


class DeepQ:
    def __init__(self, environment, inputs):
        self.input_size = inputs
        self.output_size = environment.action_space.n
        self.memory = Memory(2000)
        self.discountFactor = 0.975
        self.learnStart = 36
        self.models = [None] * 5

    def initNetwork(self, hiddenLayers):
        model = self.createModel(self.input_size, self.output_size, hiddenLayers, "relu", 0.01)
        self.models[0] = model
        model2 = self.createModel(self.input_size, self.output_size, hiddenLayers, "relu", 0.01)
        self.models[1] = model2
        model3 = self.createModel(self.input_size, self.output_size, hiddenLayers, "relu", 0.01)
        self.models[2] = model3

    def createModel(self, inputs, outputs, hiddenLayers, activationType, learningRate):
        model = Sequential()
        if len(hiddenLayers) == 0:
            model.add(Dense(self.output_size, input_shape=(self.input_size,), init='lecun_uniform'))
            model.add(Activation("linear"))
        else:
            model.add(Dense(hiddenLayers[0], input_shape=(self.input_size,), init='lecun_uniform'))
            if activationType == "LeakyReLU":
                model.add(LeakyReLU(alpha=0.01))
            else:
                model.add(Activation(activationType))
            for index in range(1, len(hiddenLayers) - 1):
                layerSize = hiddenLayers[index]
                model.add(Dense(layerSize, init='lecun_uniform'))
                if activationType == "LeakyReLU":
                    model.add(LeakyReLU(alpha=0.01))
                else:
                    model.add(Activation(activationType))
            model.add(Dense(self.output_size, init='lecun_uniform'))
            model.add(Activation("linear"))
        optimizer = optimizers.RMSprop(lr=learningRate, rho=0.9, epsilon=1e-06)
        model.compile(loss="mse", optimizer=optimizer)
        return model

    def backupNetwork(self, model, backup):
        weightMatrix = []
        for layer in model.layers:
            weightMatrix.append(layer.get_weights())
        i = 0
        for layer in backup.layers:
            layer.set_weights(weightMatrix[i])
            i += 1

    # predict Q values for all the actions with one of the ensemble models
    def getQValues(self, state, modelNr=0):
        predicted = self.models[modelNr].predict(state.reshape(1, len(state)))
        return predicted[0]

    def getMaxQ(self, qValues):
        return np.max(qValues)

    def getMaxIndex(self, qValues):
        return np.argmax(qValues)

    # calculate the target function
    def calculateTarget(self, qValuesNewState, reward, isFinal):
        if isFinal:
            return reward
        else:
            return reward + self.discountFactor * self.getMaxQ(qValuesNewState)

    # select the action with the highest Q value
    def selectAction(self, qValues, explorationRate):
        rand = random.random()
        if rand < explorationRate:
            action = np.random.randint(0, self.output_size)
        else:
            action = self.getMaxIndex(qValues)
        return action

    # pick the action from whichever model is most confident (largest absolute max Q)
    def selectActionMostConfident(self, qValues, qValues2, explorationRate):
        rand = random.random()
        if rand < explorationRate:
            action = np.random.randint(0, self.output_size)
        else:
            maxQ1 = self.getMaxQ(qValues)
            maxQ2 = self.getMaxQ(qValues2)
            if abs(maxQ1) > abs(maxQ2):
                action = self.getMaxIndex(qValues)
            else:
                action = self.getMaxIndex(qValues2)
        return action

    # average the two models' Q-values and pick the best action
    def selectActionAverage(self, qValues, qValues2, explorationRate):
        rand = random.random()
        if rand < explorationRate:
            action = np.random.randint(0, self.output_size)
        else:
            avgQValues = []
            for i in range(len(qValues)):
                avgQValues.append((qValues[i] + qValues2[i]) / 2.0)
            action = self.getMaxIndex(avgQValues)
        return action

    # sum the two models' Q-values and pick the best action
    def selectActionAdded(self, qValues, qValues2, explorationRate):
        rand = random.random()
        if rand < explorationRate:
            action = np.random.randint(0, self.output_size)
        else:
            addedQValues = qValues + qValues2
            action = self.getMaxIndex(addedQValues)
        return action

    # majority vote over three models (binary action space)
    def selectActionMostPreferred(self, qValues, qValues2, qValues3, explorationRate):
        rand = random.random()
        if rand < explorationRate:
            action = np.random.randint(0, self.output_size)
        else:
            action1 = self.getMaxIndex(qValues)
            action2 = self.getMaxIndex(qValues2)
            action3 = self.getMaxIndex(qValues3)
            actionsChosen = [0, 0]
            actionsChosen[action1] += 1
            actionsChosen[action2] += 1
            actionsChosen[action3] += 1
            if actionsChosen[0] > actionsChosen[1]:
                action = 0
            else:
                action = 1
        return action

    def selectActionByProbability(self, qValues, bias):
        qValueSum = 0
        shiftBy = 0
        for value in qValues:
            if value + shiftBy < 0:
                shiftBy = -(value + shiftBy)
        shiftBy += 1e-06
        for value in qValues:
            qValueSum += (value + shiftBy) ** bias
        probabilitySum = 0
        qValueProbabilities = []
        for value in qValues:
            probability = ((value + shiftBy) ** bias) / float(qValueSum)
            qValueProbabilities.append(probability + probabilitySum)
            probabilitySum += probability
        qValueProbabilities[len(qValueProbabilities) - 1] = 1
        rand = random.random()
        i = 0
        for value in qValueProbabilities:
            if rand <= value:
                return i
            i += 1

    def addMemory(self, state, action, reward, newState, isFinal):
        self.memory.addMemory(state, action, reward, newState, isFinal)

    def learnOnLastState(self):
        if self.memory.getCurrentSize() >= 1:
            return self.memory.getMemory(self.memory.getCurrentSize() - 1)

    def learnOnMiniBatch(self, miniBatchSize, modelNr=0):
        if self.memory.getCurrentSize() > self.learnStart:
            miniBatch = self.memory.getMiniBatch(miniBatchSize)
            X_batch = np.empty((0, self.input_size), dtype=np.float64)
            Y_batch = np.empty((0, self.output_size), dtype=np.float64)
            for sample in miniBatch:
                isFinal = sample['isFinal']
                state = sample['state']
                action = sample['action']
                reward = sample['reward']
                newState = sample['newState']
                qValues = self.getQValues(state, modelNr)
                qValuesNewState = self.getQValues(newState, modelNr)
                targetValue = self.calculateTarget(qValuesNewState, reward, isFinal)
                X_batch = np.append(X_batch, np.array([state]), axis=0)
                Y_sample = qValues.copy()
                Y_sample[action] = targetValue
                Y_batch = np.append(Y_batch, np.array([Y_sample]), axis=0)
            self.models[modelNr].fit(X_batch, Y_batch, batch_size=1, verbose=0)
# Standard DQN with a target network. The Memory replay buffer comes from the surrounding
# project; Keras 1.x / Python 2 style as in the other listings.
import random

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers.advanced_activations import LeakyReLU
from keras.regularizers import l2
from keras import optimizers


class DeepQ:
    def __init__(self, size_state, nr_actions, memorySize, discountFactor, learningRate, learnStart):
        self.input_size = size_state
        self.output_size = nr_actions
        self.memory = Memory(memorySize)
        self.discountFactor = discountFactor
        self.learnStart = learnStart
        self.learningRate = learningRate

    def initNetworks(self, hiddenLayers):
        model = self.createModel(self.input_size, self.output_size, hiddenLayers, "relu", self.learningRate)
        self.model = model
        targetModel = self.createModel(self.input_size, self.output_size, hiddenLayers, "relu", self.learningRate)
        self.targetModel = targetModel

    def createRegularizedModel(self, inputs, outputs, hiddenLayers, activationType, learningRate):
        bias = True
        dropout = 0
        regularizationFactor = 0.01
        model = Sequential()
        if len(hiddenLayers) == 0:
            model.add(Dense(self.output_size, input_shape=(self.input_size,), init='lecun_uniform', bias=bias))
            model.add(Activation("linear"))
        else:
            if regularizationFactor > 0:
                model.add(Dense(hiddenLayers[0], input_shape=(self.input_size,), init='lecun_uniform',
                                W_regularizer=l2(regularizationFactor), bias=bias))
            else:
                model.add(Dense(hiddenLayers[0], input_shape=(self.input_size,), init='lecun_uniform', bias=bias))
            if activationType == "LeakyReLU":
                model.add(LeakyReLU(alpha=0.01))
            else:
                model.add(Activation(activationType))
            for index in range(1, len(hiddenLayers) - 1):
                layerSize = hiddenLayers[index]
                if regularizationFactor > 0:
                    model.add(Dense(layerSize, init='lecun_uniform', W_regularizer=l2(regularizationFactor), bias=bias))
                else:
                    model.add(Dense(layerSize, init='lecun_uniform', bias=bias))
                if activationType == "LeakyReLU":
                    model.add(LeakyReLU(alpha=0.01))
                else:
                    model.add(Activation(activationType))
                if dropout > 0:
                    model.add(Dropout(dropout))
            model.add(Dense(self.output_size, init='lecun_uniform', bias=bias))
            model.add(Activation("linear"))
        optimizer = optimizers.RMSprop(lr=learningRate, rho=0.9, epsilon=1e-06)
        model.compile(loss="mse", optimizer=optimizer)
        return model

    def createModel(self, inputs, outputs, hiddenLayers, activationType, learningRate):
        model = Sequential()
        if len(hiddenLayers) == 0:
            model.add(Dense(self.output_size, input_shape=(self.input_size,), init='lecun_uniform'))
            model.add(Activation("linear"))
        else:
            model.add(Dense(hiddenLayers[0], input_shape=(self.input_size,), init='lecun_uniform'))
            if activationType == "LeakyReLU":
                model.add(LeakyReLU(alpha=0.01))
            else:
                model.add(Activation(activationType))
            for index in range(1, len(hiddenLayers) - 1):
                layerSize = hiddenLayers[index]
                model.add(Dense(layerSize, init='lecun_uniform'))
                if activationType == "LeakyReLU":
                    model.add(LeakyReLU(alpha=0.01))
                else:
                    model.add(Activation(activationType))
            model.add(Dense(self.output_size, init='lecun_uniform'))
            model.add(Activation("linear"))
        optimizer = optimizers.RMSprop(lr=learningRate, rho=0.9, epsilon=1e-06)
        model.compile(loss="mse", optimizer=optimizer)
        return model

    def printNetwork(self):
        i = 0
        for layer in self.model.layers:
            weights = layer.get_weights()
            print "layer ", i, ": ", weights
            i += 1

    def backupNetwork(self, model, backup):
        weightMatrix = []
        for layer in model.layers:
            weightMatrix.append(layer.get_weights())
        i = 0
        for layer in backup.layers:
            layer.set_weights(weightMatrix[i])
            i += 1

    def updateTargetNetwork(self):
        self.backupNetwork(self.model, self.targetModel)

    # predict Q values for all the actions
    def getQValues(self, state):
        predicted = self.model.predict(state.reshape(1, len(state)))
        return predicted[0]

    def getTargetQValues(self, state):
        predicted = self.targetModel.predict(state.reshape(1, len(state)))
        return predicted[0]

    def getMaxQ(self, qValues):
        return np.max(qValues)

    def getMaxIndex(self, qValues):
        return np.argmax(qValues)

    # calculate the target function
    def calculateTarget(self, qValuesNewState, reward, isFinal):
        if isFinal:
            return reward
        else:
            return reward + self.discountFactor * self.getMaxQ(qValuesNewState)

    # select the action with the highest Q value
    def selectAction(self, qValues, explorationRate):
        rand = random.random()
        if rand < explorationRate:
            action = np.random.randint(0, self.output_size)
        else:
            action = self.getMaxIndex(qValues)
        return action

    def selectActionByProbability(self, qValues, bias):
        qValueSum = 0
        shiftBy = 0
        for value in qValues:
            if value + shiftBy < 0:
                shiftBy = -(value + shiftBy)
        shiftBy += 1e-06
        for value in qValues:
            qValueSum += (value + shiftBy) ** bias
        probabilitySum = 0
        qValueProbabilities = []
        for value in qValues:
            probability = ((value + shiftBy) ** bias) / float(qValueSum)
            qValueProbabilities.append(probability + probabilitySum)
            probabilitySum += probability
        qValueProbabilities[len(qValueProbabilities) - 1] = 1
        rand = random.random()
        i = 0
        for value in qValueProbabilities:
            if rand <= value:
                return i
            i += 1

    def addMemory(self, state, action, reward, newState, isFinal):
        self.memory.addMemory(state, action, reward, newState, isFinal)

    def learnOnLastState(self):
        if self.memory.getCurrentSize() >= 1:
            return self.memory.getMemory(self.memory.getCurrentSize() - 1)

    def learnOnMiniBatch(self, miniBatchSize, useTargetNetwork=True):
        # do not learn until the memory holds at least learnStart samples
        if self.memory.getCurrentSize() > self.learnStart:
            miniBatch = self.memory.getMiniBatch(miniBatchSize)
            X_batch = np.empty((0, self.input_size), dtype=np.float64)
            Y_batch = np.empty((0, self.output_size), dtype=np.float64)
            for sample in miniBatch:
                isFinal = sample['isFinal']
                state = sample['state']
                action = sample['action']
                reward = sample['reward']
                newState = sample['newState']
                qValues = self.getQValues(state)
                if useTargetNetwork:
                    qValuesNewState = self.getTargetQValues(newState)
                else:
                    qValuesNewState = self.getQValues(newState)
                targetValue = self.calculateTarget(qValuesNewState, reward, isFinal)
                X_batch = np.append(X_batch, np.array([state.copy()]), axis=0)
                Y_sample = qValues.copy()
                Y_sample[action] = targetValue
                Y_batch = np.append(Y_batch, np.array([Y_sample]), axis=0)
                # if isFinal:
                #     X_batch = np.append(X_batch, np.array([newState.copy()]), axis=0)
                #     Y_batch = np.append(Y_batch, np.array([[reward] * self.output_size]), axis=0)
            self.model.fit(X_batch, Y_batch, batch_size=len(miniBatch), nb_epoch=1, verbose=0)
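# A minimal training-loop sketch for the target-network DeepQ above, assuming a Gym environment
# with a flat state vector and the project's Memory class. The hyperparameters, exploration decay,
# and target-update interval used here are illustrative assumptions, not values from the original code.
import gym
import numpy as np

env = gym.make("CartPole-v0")
agent = DeepQ(size_state=env.observation_space.shape[0], nr_actions=env.action_space.n,
              memorySize=10000, discountFactor=0.99, learningRate=0.00025, learnStart=128)
agent.initNetworks([30, 30])

explorationRate = 1.0
stepCounter = 0
for episode in xrange(1000):
    state = env.reset()
    done = False
    while not done:
        qValues = agent.getQValues(state)
        action = agent.selectAction(qValues, explorationRate)
        newState, reward, done, _ = env.step(action)
        agent.addMemory(state, action, reward, newState, done)
        agent.learnOnMiniBatch(128, useTargetNetwork=True)
        state = newState
        stepCounter += 1
        if stepCounter % 1000 == 0:
            agent.updateTargetNetwork()  # refresh the frozen target network periodically
    explorationRate = max(0.05, explorationRate * 0.995)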