import numpy as np

# QNet is defined elsewhere in this repository


class RLsys:
    """
    RL class constructor.

    @param actions: the possible actions of the system.
           state_size: the size of the state matrix.
    """

    def __init__(self, actions, state_size, reward_decay=0.9, e_greedy=0.9):
        # Save parameters for later use
        self.state_size = state_size
        self.actions = actions
        self.gamma = reward_decay
        self.epsilon = e_greedy

        # Produce neural network
        self.qnet = QNet(self.state_size)

    """
    Method which returns the action based on specified state and error.

    @param observation: the current state of the system, centered around the
           errors. Dimensionality: NxNxE, where E is the amount of errors we
           wish to evaluate actions for.
    @return int: the given action based on the state.
            int: the associated error.
    """

    def choose_action(self, observation):
        # The z-dimension gives the number of errors
        numErrors = observation.shape[2]

        # Q-values for all errors
        predQ = self.predQ(observation)

        # Check the epsilon-greedy criterion
        if np.random.uniform() > self.epsilon:
            # Select the best action and its associated error
            index = np.unravel_index(predQ.argmax(), predQ.shape)
            action = index[0]
            error = index[1]
        else:
            # Choose random action and error
            action = np.random.choice(self.actions)
            error = np.random.choice(range(numErrors))

        return action, error

    """
    Returns the predicted Q-value for each error in each direction.

    @param observation: the current state of the system, centered around the
           errors. Dimensionality: NxNxE, where E is the amount of errors we
           wish to evaluate actions for.
    @return predQ: 2D-vector with Q-values for each error in the observation.
    """

    def predQ(self, observation):
        numErrors = observation.shape[2]

        # Q-values for all errors
        predQ = np.zeros([4, numErrors])

        # Evaluate Q for each error
        for x in range(numErrors):
            state = observation[:, :, x]
            predQ[:, x] = self.qnet.predictQ(state)

        return predQ

    """
    Trains the neural network given the outcome of the action.

    @param state: the previous state of the system.
           action: the action taken.
           reward: the immediate reward received.
           observation_p: the resulting observation.
    """

    def learn(self, state, action, reward, observation_p):
        # Current Q-value estimates for the previous state
        Q = self.qnet.predictQ(state)[0, :]

        # Check if we are at a terminal state
        if observation_p != 'terminal':
            # Q-values for all errors in the resulting observation
            predQ = self.predQ(observation_p)

            # Update the approximation of Q
            Q[action] = reward + self.gamma * predQ.max()
        else:
            # Update the approximation of Q
            Q[action] = reward

        # Update the neural network
        self.qnet.improveQ(state, Q)

    """
    Changes the epsilon in the epsilon-greedy policy.

    @param epsilon: the new epsilon.
    """

    def changeEpsilon(self, epsilon):
        self.epsilon = epsilon
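# Usage sketch (added for illustration): a minimal interaction loop driving the
# agent above, assuming the repo's QNet is importable. The random observations
# and the +/-1 reward are placeholders standing in for a real environment.
state_size = 5
agent = RLsys(actions=[0, 1, 2, 3], state_size=state_size)

for step in range(100):
    # Placeholder NxNxE observation with E = 2 errors
    observation = np.random.randint(0, 2, size=(state_size, state_size, 2))

    action, error = agent.choose_action(observation)
    state = observation[:, :, error]  # NxN slice the chosen action acts on

    # Placeholder reward and resulting observation
    reward = np.random.choice([-1.0, 1.0])
    observation_p = np.random.randint(0, 2, size=(state_size, state_size, 2))

    agent.learn(state, action, reward, observation_p)

# Reduce exploration as training progresses
agent.changeEpsilon(0.05)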
import pickle
import random
from copy import deepcopy
from pathlib import Path

import torch
from torch.autograd import Variable

# QNet is defined elsewhere in this repository


class Approximator():
    #batch_size - Number of transitions per update
    #learning_rate - Step size
    #initial_epsilon - Exploration parameter (probability of random action)
    #experience_stored - Number of states cached for experience replay
    #step_delta - Step difference between the target NN and the NN currently being updated
    def __init__(self, batch_size, learning_rate, initial_epsilon,
                 epsilon_decay, experience_stored, step_delta, runOnGPU):
        self.network = QNet()
        if (runOnGPU):
            self.network.cuda()
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.targetNetwork = deepcopy(self.network)  #Fixed model used for target values for error calculation
        self.loss_fn = torch.nn.MSELoss(size_average=False)
        self.optimizer = torch.optim.SGD(self.network.parameters(), lr=learning_rate)
        self.batch_size = batch_size
        self.step_delta = step_delta
        #Array of latest states visited for performing experience replay
        self.experience_stored = experience_stored
        self.experience = []
        self.index = 0  #Index used when replacing transitions seen in memory
        self.currentStepDelta = 0  #Step difference between fixed model and current model

    #Add visited state to the memory
    def addExperience(self, transition):
        #Keep adding experience information
        if (len(self.experience) < self.experience_stored):
            self.experience.append(transition)
        #Replace information of old transitions
        else:
            self.experience[self.index] = transition
            self.index = (self.index + 1) % self.experience_stored

    #Select action with epsilon-greedy policy
    def epsilonGreedy(self, x, avMoves):
        #Act greedily
        if (random.random() > self.epsilon):
            return self.bestAction(x, avMoves)
        #Act randomly
        else:
            return self.randomAction(avMoves)

    #Get the best action from the network forward pass to be performed by the agent
    def bestAction(self, x, avMoves):
        y_pred = self.network(x)  #Action values for all actions in the current state
        #Sorted action Q values in descending order
        sortedY, indices = torch.sort(y_pred, 1, True)
        i = 0
        while (not avMoves[indices[0][i].data[0]]):  #Pick best move available
            i += 1
        return indices[0][i].data[0]  #Return best action

    #Select random action from all available moves
    def randomAction(self, avMoves):
        possibleActions = []
        for i in range(len(avMoves)):
            if (avMoves[i]):
                possibleActions.append(i)
        index = random.randrange(0, len(possibleActions))
        return possibleActions[index]  #Return random action

    #Get optimal actions to be performed on S' states according to the frozen model
    def optimalFutureActions(self, x, avMoves):
        actions = []
        y_pred = self.targetNetwork(x)
        #Sorted action Q values in descending order
        sortedY, indices = torch.sort(y_pred, 1, True)
        for i in range(len(y_pred)):
            if (not avMoves[i]):  #If there are no more actions available, return -1
                actions.append(-1)
            else:
                j = 0
                while (j < 7 and (not avMoves[i][indices[i][j].data[0]])):  #Best move available according to frozen targets
                    j += 1
                if (j < 7):
                    actions.append(indices[i][j].data[0])
                else:
                    actions.append(-1)
        return y_pred, actions

    #Update target model with the current model, which defines the agent's behaviour
    def updateTargets(self):
        if (self.currentStepDelta == self.step_delta):
            self.targetNetwork = deepcopy(self.network)
            self.currentStepDelta = 0

    #Get a batch_size sample from previous experience
    def sampleExperience(self):
        transitions = []
        rewards = []
        avMoves = []
        actions = []
        inputTensor = torch.FloatTensor(self.batch_size, 2, 6, 7).zero_()  #Initialize tensor for input states
        inputTensor2 = torch.FloatTensor(self.batch_size, 2, 6, 7).zero_()  #Initialize tensor for states reached (used for calculating optimal future reward with frozen parameters)
        #Sample random transitions from experience
        for i in range(self.batch_size):
            transitions.append(random.choice(self.experience))
            rewards.append(transitions[i].reward)
            avMoves.append(transitions[i].avMoves2)
            actions.append(transitions[i].action)
            inputTensor[i] = torch.FloatTensor(transitions[i].state1)
            inputTensor2[i] = torch.FloatTensor(transitions[i].state2)
        return inputTensor, inputTensor2, rewards, avMoves, actions

    #Update weights using experience replay and fixed Q-targets (CPU)
    def updateWeightsCPU(self, discount):
        #Update target parameters after a defined number of steps
        self.updateTargets()
        #Only update weights when the required batch size has been reached
        if (len(self.experience) >= self.batch_size):
            inputTensor, inputTensor2, rewards, avMoves, actions = self.sampleExperience()
            prevQ = self.network(Variable(inputTensor))
            targetQ = prevQ.clone()
            Qvals, actions2 = self.optimalFutureActions(Variable(inputTensor2), avMoves)
            #Modify target variable only for values corresponding to actions that were actually taken
            for i in range(len(targetQ)):
                targetQ[i][actions[i]].data[0] = rewards[i] + discount * Qvals[i][actions2[i]].data[0]
            self.optimizer.zero_grad()
            loss = self.loss_fn(prevQ, Variable(targetQ.data, requires_grad=False))
            loss.backward()
            self.optimizer.step()
            self.currentStepDelta += 1
            #Return loss from this update, as well as a 0 or a 1 to indicate whether an
            #update was made (useful for calculating avg loss after many updates)
            return loss.data[0], 1
        return 0, 0

    #Update weights using experience replay and fixed Q-targets (GPU)
    def updateWeightsGPU(self, discount):
        #Update target parameters after a defined number of steps
        self.updateTargets()
        #Only update weights when the required batch size has been reached
        if (len(self.experience) >= self.batch_size):
            inputTensor, inputTensor2, rewards, avMoves, actions = self.sampleExperience()
            prevQ = self.network(Variable(inputTensor).cuda())
            targetQ = prevQ.clone()
            Qvals, actions2 = self.optimalFutureActions(Variable(inputTensor2).cuda(), avMoves)
            #Modify target variable only for values corresponding to actions that were actually taken
            for i in range(len(targetQ)):
                targetQ[i][actions[i]].data[0] = rewards[i] + discount * Qvals[i][actions2[i]].data[0]
            self.optimizer.zero_grad()
            loss = self.loss_fn(prevQ, Variable(targetQ.data, requires_grad=False).cuda())
            loss.backward()
            self.optimizer.step()
            self.currentStepDelta += 1
            #Return loss from this update, as well as a 0 or a 1 to indicate whether an
            #update was made (useful for calculating avg loss after many updates)
            return loss.data[0], 1
        return 0, 0

    #Multiplicatively decay the exploration rate
    def decayEpsilon(self):
        self.epsilon = self.epsilon * self.epsilon_decay

    #Serialize the stored transitions to disk
    def saveExperience(self, filename):
        with open(filename, 'wb') as output:
            for i in range(len(self.experience)):
                pickle.dump(self.experience[i], output, pickle.HIGHEST_PROTOCOL)
        print("Saved transitions: ", len(self.experience))

    #Load previously stored transitions from disk, if the file exists
    def loadExperience(self, filename):
        expfile = Path(filename)
        if expfile.is_file():
            with open(filename, 'rb') as input:
                while True:
                    try:
                        self.experience.append(pickle.load(input))
                    except EOFError:
                        break
            print("Loaded transitions: ", len(self.experience))
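# Usage sketch (added for illustration): one interaction and update with the
# Approximator above, assuming the repo's QNet (a 2x6x7-input, 7-output
# network written against the old Variable-based PyTorch API). The Transition
# namedtuple and the board states below are placeholders standing in for the
# real game environment.
import collections

import numpy as np

Transition = collections.namedtuple(
    'Transition', ['state1', 'action', 'reward', 'state2', 'avMoves2'])

agent = Approximator(batch_size=32, learning_rate=1e-3,
                     initial_epsilon=1.0, epsilon_decay=0.995,
                     experience_stored=10000, step_delta=500,
                     runOnGPU=False)

# Placeholder board states: two 6x7 planes (one per player)
state1 = np.zeros((2, 6, 7))
state2 = np.zeros((2, 6, 7))
avMoves = [True] * 7  # all seven columns playable

# Choose a move for the current state
x = Variable(torch.FloatTensor(state1).unsqueeze(0))
action = agent.epsilonGreedy(x, avMoves)

# Store the observed transition and run one replay update (the update is a
# no-op until batch_size transitions have been stored)
agent.addExperience(Transition(state1, action, -0.01, state2, avMoves))
loss, updated = agent.updateWeightsCPU(discount=0.99)
agent.decayEpsilon()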
import gym
import numpy as np
import torch

# QNet and utils are defined elsewhere in this repository


#Normalize the raw (position, velocity) observation and add a batch dimension
def prepare_state(state):
    ret = np.array([(state[0] + .3) / .9, state[1] / .07])
    ret = np.expand_dims(ret, axis=0)
    return ret


if __name__ == '__main__':
    #Engine init
    env = gym.make('MountainCar-v0')

    #Load names
    models_dir = 'models'
    state_file = 'state'

    #Loading model
    Q_targ = QNet(3)
    weights_file_path, _ = utils.find_prev_state_files(models_dir, state_file)
    weights = torch.load(weights_file_path)
    Q_targ.load_state_dict(weights['Q_targ'])

    #Demonstration
    while True:
        env.reset()
        env.render()
        raw_state, reward, done, info = env.step(1)
        while not done:
            state = prepare_state(raw_state)
            #Greedy action from the loaded target network
            action = int(np.argmax(
                Q_targ(torch.from_numpy(state).float()).data.numpy()))
            raw_state, reward, done, info = env.step(action)
            env.render()
import os
import time

import gym
import numpy as np
import torch

# QNet and utils are defined elsewhere in this repository


#Normalize the raw (position, velocity) observation and add a batch dimension
def prepare_state(state):
    ret = np.array([(state[0] + .3) / .9, state[1] / .07])
    ret = np.expand_dims(ret, axis=0)
    return ret


if __name__ == '__main__':
    #Save/load dirs
    models_dir = './models'
    state_file = 'state'
    out_dir = os.path.join(models_dir, str(int(time.time())))

    #Engine parameters
    env = gym.make('MountainCar-v0')
    num_actions = 3

    #Initializing neural networks and loading the previous state, if it exists
    Q = QNet(num_actions=num_actions)
    Q_targ = QNet(num_actions=num_actions)
    prev_state = None
    if os.listdir(models_dir) == []:
        Q.apply(utils.init_weights)
        Q_targ.load_state_dict(Q.state_dict())
    else:
        weights_file_path, state_file_path = utils.find_prev_state_files(
            models_dir, state_file)
        weights = torch.load(weights_file_path)
        prev_state = torch.load(state_file_path)
        Q.load_state_dict(weights['Q'])
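# Training-step sketch (added for illustration): the original training loop is
# not part of this excerpt. Below is a minimal example of one DQN update with
# an online network Q and a frozen target network Q_targ, the arrangement set
# up above. The small MLP is a hypothetical stand-in for the repo's QNet, and
# the tensors are placeholders for a sampled replay batch.
import torch
import torch.nn as nn


def make_stand_in_qnet(num_actions=3):
    # Hypothetical stand-in: 2 inputs (position, velocity) -> Q-values
    return nn.Sequential(nn.Linear(2, 64), nn.ReLU(), nn.Linear(64, num_actions))


gamma = 0.99
Q = make_stand_in_qnet()
Q_targ = make_stand_in_qnet()
Q_targ.load_state_dict(Q.state_dict())
optimizer = torch.optim.Adam(Q.parameters(), lr=1e-3)

# Placeholder replay batch: states, actions, rewards, next states, done flags
s = torch.rand(32, 2)
a = torch.randint(0, 3, (32,))
r = torch.rand(32)
s2 = torch.rand(32, 2)
done = torch.zeros(32)

# TD target uses the frozen network; gradients flow only through Q(s, a)
with torch.no_grad():
    target = r + gamma * (1 - done) * Q_targ(s2).max(dim=1)[0]
q_sa = Q(s).gather(1, a.unsqueeze(1)).squeeze(1)
loss = nn.functional.mse_loss(q_sa, target)

optimizer.zero_grad()
loss.backward()
optimizer.step()

# Periodically sync the target network with the online network
Q_targ.load_state_dict(Q.state_dict())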
import numpy as np
from keras.models import clone_model

# QNet is defined elsewhere in this repository


class RLsys:
    """
    RL class constructor.

    @param actions: the possible actions of the system.
           state_size: the size of the state matrix.
    """

    def __init__(self, actions, state_size, optt='adam', miniBatchSize=32,
                 TNRate=100, memorySize=1000000, reward_decay=0.95,
                 e_greedy=0.9):
        # Save parameters for later use
        self.state_size = state_size
        self.actions = actions
        self.gamma = reward_decay
        self.epsilon = e_greedy

        # Produce neural network and its fixed target copy
        self.qnet = QNet(self.state_size, optt)
        self.targetNet = QNet(self.state_size, optt)
        self.targetNet.network = clone_model(self.qnet.network)
        # clone_model copies the architecture only, so sync the weights as well
        self.targetNet.network.set_weights(self.qnet.network.get_weights())

        # Replay memory and target-network update settings
        self.memorySize = memorySize
        self.memory = list()
        self.TNRate = TNRate
        self.count = 0
        self.miniBatchSize = miniBatchSize

    def storeTransition(self, oldState, action, reward, newState):
        # Replace a random transition once the memory is full
        if len(self.memory) >= self.memorySize:
            i = np.random.randint(0, self.memorySize)
            self.memory[i] = (oldState, action, reward, newState)
        else:
            self.memory.append((oldState, action, reward, newState))

    """
    Method which returns the action based on specified state and error.

    @param observation: the current state of the system, centered around the
           errors. Dimensionality: NxNxE, where E is the amount of errors we
           wish to evaluate actions for.
    @return int: the given action based on the state.
            int: the associated error.
    """

    def choose_action(self, observation):
        numErrors = observation.shape[2]
        predQ = self.predQ(observation)

        # Check the epsilon-greedy criterion
        if np.random.uniform() > self.epsilon:
            index = np.unravel_index(predQ.argmax(), predQ.shape)
            action = index[0]
            error = index[1]
        else:
            action = np.random.choice(self.actions)
            error = np.random.choice(range(numErrors))

        return action, error

    """
    Returns the predicted Q-value for each error in each direction.

    @param observation: the current state of the system, centered around the
           errors. Dimensionality: NxNxE, where E is the amount of errors we
           wish to evaluate actions for.
    @return predQ: 2D-vector with Q-values for each error in the observation.
    """

    def predTargetQ(self, observation):
        numErrors = observation.shape[2]
        predQ = np.zeros([4, numErrors])
        for x in range(numErrors):
            state = observation[:, :, x, np.newaxis]
            predQ[:, x] = self.targetNet.predictQ(state)
        return predQ

    def predQ(self, observation):
        numErrors = observation.shape[2]
        predQ = np.zeros([4, numErrors])
        for x in range(numErrors):
            state = observation[:, :, x, np.newaxis]
            predQ[:, x] = self.qnet.predictQ(state)
        return predQ

    """
    Trains the neural network on a random minibatch sampled from the replay
    memory. Each stored transition holds the previous state, the action taken,
    the immediate reward received, and the resulting observation.
    """

    def learn(self):
        # Sample a random minibatch of transitions
        B = list()
        for i in range(self.miniBatchSize):
            j = np.random.randint(0, len(self.memory))
            B.append(self.memory[j])

        state = np.zeros((self.miniBatchSize, self.state_size, self.state_size, 1))
        Q = np.zeros((self.miniBatchSize, 4))

        for i in range(self.miniBatchSize):
            transition = B[i]
            state_ = transition[0]
            action = transition[1]
            reward = transition[2]
            observation_p = transition[3]

            state_ = state_[:, :, np.newaxis]
            state[i, :, :, :] = state_

            # Current Q-value estimates for the stored state
            Q_ = self.qnet.predictQ(state_)[0, :]
            if observation_p != 'terminal':
                predQ = self.predTargetQ(observation_p)
                Q_[action] = reward + self.gamma * predQ.max()
            else:
                Q_[action] = reward
            Q[i, :] = Q_

        self.qnet.improveQ(state, Q)

        # Periodically copy the online network weights into the target network
        self.count += 1
        if self.count % self.TNRate == 0:
            self.targetNet.network.set_weights(self.qnet.network.get_weights())

    """
    Changes the epsilon in the epsilon-greedy policy.

    @param epsilon: the new epsilon.
    """

    def changeEpsilon(self, epsilon):
        self.epsilon = epsilon