Example #1
 def __init__(self, actions, state_size, reward_decay=0.9, e_greedy=0.9):
     # Save parameters for later use
     self.state_size = state_size
     self.actions = actions
     self.gamma = reward_decay
     self.epsilon = e_greedy
     # Produce neural network
     self.qnet = QNet(self.state_size)
Example #2
    def __init__(self, batch_size, learning_rate, initial_epsilon,
                 epsilon_decay, experience_stored, step_delta, runOnGPU):

        self.network = QNet()
        if (runOnGPU):
            self.network.cuda()
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        #Fixed (target) model used to compute the target values for the error calculation
        self.targetNetwork = deepcopy(self.network)
        self.loss_fn = torch.nn.MSELoss(size_average=False)
        self.optimizer = torch.optim.SGD(self.network.parameters(),
                                         lr=learning_rate)

        self.batch_size = batch_size
        self.step_delta = step_delta

        #Buffer of the latest transitions visited, used for experience replay
        self.experience_stored = experience_stored
        self.experience = []
        self.index = 0  #Index used when replacing transitions seen in memory
        self.currentStepDelta = 0  #Step difference between fixed model and current model
Example #3
 def __init__(self,
              actions,
              state_size,
              optt='adam',
              miniBatchSize=32,
              TNRate=100,
              memorySize=1000000,
              reward_decay=0.95,
              e_greedy=0.9):
     # Save parameters for later use
     self.state_size = state_size
     self.actions = actions
     self.gamma = reward_decay
     self.epsilon = e_greedy
     # Produce neural network
     self.qnet = QNet(self.state_size, optt)
     self.targetNet = QNet(self.state_size, optt)
     self.targetNet.network = clone_model(self.qnet.network)
     self.memorySize = memorySize
     self.memory = list()
     self.TNRate = TNRate
     self.count = 0
     self.miniBatchSize = miniBatchSize
Example #4
class RLsys:
    """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """
	RL class constructor.
		@param
			actions: the possible actions of the system.
			state_size: the size of the state matrix.
	""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """"""

    def __init__(self, actions, state_size, reward_decay=0.9, e_greedy=0.9):
        # Save parameters for later use
        self.state_size = state_size
        self.actions = actions
        self.gamma = reward_decay
        self.epsilon = e_greedy
        # Produce neural network
        self.qnet = QNet(self.state_size)

    """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """
	Method which returns the action based on specified state and error.
		@param
			observation: the current state of the system, centered
			around the errors. Dimensionality: NxNxE, where E is the
			amount of errors we wish to evaluate actions for.
		@return
			int: the given action based on the state.
			int: the associated error.
	""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """"""

    def choose_action(self, observation):

        # The number of errors is the z-dimension of the observation
        numErrors = observation.shape[2]
        # Q-values for every error
        predQ = self.predQ(observation)

        # Check the epsilon-greedy criterion
        if np.random.uniform() > self.epsilon:
            # Select the best action
            index = np.unravel_index(predQ.argmax(), predQ.shape)
            # Take the best action for the corresponding error
            action = index[0]
            error = index[1]
        else:
            # Choose a random action and a random error
            action = np.random.choice(self.actions)
            error = np.random.choice(range(numErrors))

        # Return the chosen action and the associated error
        return action, error

    """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """"
	Returns the predicted Q-value for each error in each direction
		@param:
			observation: the current state of the system, centered
			around the errors. Dimensionality: NxNxE, where E is the
			amount of errors we wish to evaluate actions for.
		
		@return:
			predQ: 2D-vector with Q-values for each error in the
			observation.
	""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """"""

    def predQ(self, observation):
        numErrors = observation.shape[2]
        # Q-values for every error
        predQ = np.zeros([4, numErrors])
        # Evaluate Q for each error
        for x in range(numErrors):
            state = observation[:, :, x]
            predQ[:, x] = self.qnet.predictQ(state)
        return predQ

    """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """
	Trains the neural network given the outcome of the action.
		@param
			state: the previous state of the system.
			action: the action taken.
			reward: the immediate reward received.
			observation_p: the resulting observation.
	""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """"""

    def learn(self, state, action, reward, observation_p):
        # Current Q-value estimates for this state; only the taken action's entry is updated
        Q = self.qnet.predictQ(state)[0, :]
        # Check if we are at terminal state
        if observation_p != 'terminal':
            # Predicted Q-values for the next observation
            predQ = self.predQ(observation_p)
            # Update the approximation of Q
            Q[action] = reward + self.gamma * predQ.max()
        else:
            # Update the approximation of Q
            Q[action] = reward

        # Update the neural network
        self.qnet.improveQ(state, Q)

    """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """
	Changes the epsilon in the epsilon-greedy policy.
		@param
			epsilon: the new epsilon.
	""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """"""

    def changeEpsilon(self, epsilon):
        self.epsilon = epsilon
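
A minimal sketch of how the RLsys agent above might be driven. The environment object, its reset/step interface, the concrete actions list, and the state size are assumptions for illustration; the class itself only prescribes the NxNxE observation layout described in its docstrings.

# Hypothetical driver loop for RLsys; `env` and its API are assumed, not part of the example
agent = RLsys(actions=[0, 1, 2, 3], state_size=5)

for episode in range(500):
    observation = env.reset()                    # assumed: returns an NxNxE observation
    done = False
    while not done:
        action, error = agent.choose_action(observation)
        state = observation[:, :, error]         # the NxN slice the action was chosen for
        observation, reward, done = env.step(action, error)   # assumed env API
        agent.learn(state, action, reward, 'terminal' if done else observation)
    agent.changeEpsilon(max(0.05, agent.epsilon * 0.99))       # decay schedule not given in the source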
Example #5
class Approximator():

    #batch_size - Batch size used for each weight update
    #learning_rate - Step size
    #epsilon - Exploration parameter (probability of a random action)
    #experience_stored - Number of transitions cached for experience replay
    #step_delta - Step difference between the target NN and the NN currently being updated
    def __init__(self, batch_size, learning_rate, initial_epsilon,
                 epsilon_decay, experience_stored, step_delta, runOnGPU):

        self.network = QNet()
        if (runOnGPU):
            self.network.cuda()
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        #Fixed (target) model used to compute the target values for the error calculation
        self.targetNetwork = deepcopy(self.network)
        self.loss_fn = torch.nn.MSELoss(size_average=False)
        self.optimizer = torch.optim.SGD(self.network.parameters(),
                                         lr=learning_rate)

        self.batch_size = batch_size
        self.step_delta = step_delta

        #Buffer of the latest transitions visited, used for experience replay
        self.experience_stored = experience_stored
        self.experience = []
        self.index = 0  #Index used when replacing transitions seen in memory
        self.currentStepDelta = 0  #Step difference between fixed model and current model

    #Add visited state to the memory
    def addExperience(self, transition):
        #Keep adding experience information
        if (len(self.experience) < self.experience_stored):
            self.experience.append(transition)

        #Replace information of old transitions
        else:
            self.experience[self.index] = transition
            self.index = (self.index + 1) % self.experience_stored

    #Select action with epsilon greedy policy
    def epsilonGreedy(self, x, avMoves):
        #Act greedily
        if (random.random() > self.epsilon):
            return self.bestAction(x, avMoves)

        #Act Randomly
        else:
            return self.randomAction(avMoves)

    #Get the best action from nn forward pass to be performed by the agent
    def bestAction(self, x, avMoves):
        #Action values for all actions in the current state
        y_pred = self.network(x)

        #Sorted action Q values in descending order
        sortedY, indices = torch.sort(y_pred, 1, True)
        i = 0
        while (not avMoves[indices[0][i].data[0]]):  #Pick best move available
            i += 1
        return indices[0][i].data[0]  #Return best action

    #Select random action from all available moves
    def randomAction(self, avMoves):
        possibleActions = []
        for i in range(len(avMoves)):
            if (avMoves[i]):
                possibleActions.append(i)
        index = random.randrange(0, len(possibleActions))
        return possibleActions[index]  #Return random action

    #Get optimal Actions to be performed on S' states according to the frozen model
    def optimalFutureActions(self, x, avMoves):
        actions = []
        y_pred = self.targetNetwork(x)

        #Sorted action Q values in descending order
        sortedY, indices = torch.sort(y_pred, 1, True)
        for i in range(len(y_pred)):
            #If there are no actions available, return -1 for this state
            if (not avMoves[i]):
                actions.append(-1)
            else:
                j = 0
                #Find the best available move according to the frozen targets
                while (j < 7 and (not avMoves[i][indices[i][j].data[0]])):
                    j += 1
                if (j < 7):
                    actions.append(indices[i][j].data[0])
                else:
                    actions.append(-1)
        return y_pred, actions

    #Update target model with the current model which defines the agent's behaviour
    def updateTargets(self):
        if (self.currentStepDelta == self.step_delta):
            self.targetNetwork = deepcopy(self.network)
            self.currentStepDelta = 0

    #Get batch_size sample from previous experience
    def sampleExperience(self):
        transitions = []
        rewards = []
        avMoves = []
        actions = []
        #Initialize tensor for the input states
        inputTensor = torch.FloatTensor(self.batch_size, 2, 6, 7).zero_()
        #Initialize tensor for the states reached (used for calculating the optimal
        #future reward with the frozen parameters)
        inputTensor2 = torch.FloatTensor(self.batch_size, 2, 6, 7).zero_()

        #Sample random transitions from experience
        for i in range(self.batch_size):
            transitions.append(random.choice(self.experience))
            rewards.append(transitions[i].reward)
            avMoves.append(transitions[i].avMoves2)
            actions.append(transitions[i].action)
            inputTensor[i] = torch.FloatTensor(transitions[i].state1)
            inputTensor2[i] = torch.FloatTensor(transitions[i].state2)

        return inputTensor, inputTensor2, rewards, avMoves, actions

    #Update weights using experience replay and fixed-Q values
    def updateWeightsCPU(self, discount):

        #Update target parameters after a defined number of steps
        self.updateTargets()

        #Only update weights when the required batch size has been reached
        if (len(self.experience) >= self.batch_size):
            inputTensor, inputTensor2, rewards, avMoves, actions = \
                self.sampleExperience()

            prevQ = self.network(Variable(inputTensor))
            targetQ = prevQ.clone()
            Qvals, actions2 = self.optimalFutureActions(
                Variable(inputTensor2), avMoves)

            #Modify target variable only for values corresponding to actions that were actually taken
            for i in range(len(targetQ)):
                targetQ[i][actions[i]].data[0] = \
                    rewards[i] + discount * Qvals[i][actions2[i]].data[0]

            self.optimizer.zero_grad()
            loss = self.loss_fn(prevQ,
                                Variable(targetQ.data, requires_grad=False))
            loss.backward()
            self.optimizer.step()

            self.currentStepDelta += 1
            #Return the loss from this update, plus a 0/1 flag indicating whether an
            #update was made (useful for calculating the average loss over many updates)
            return loss.data[0], 1
        return 0, 0

    #Update weights using experience replay and fixed-Q values
    def updateWeightsGPU(self, discount):

        #Update target parameters after a defined number of steps
        self.updateTargets()

        #Only update weights when the required batch size has been reached
        if (len(self.experience) >= self.batch_size):

            inputTensor, inputTensor2, rewards, avMoves, actions = \
                self.sampleExperience()
            prevQ = self.network(Variable(inputTensor).cuda())
            targetQ = prevQ.clone()
            Qvals, actions2 = self.optimalFutureActions(
                Variable(inputTensor2).cuda(), avMoves)

            #Modify target variable only for values corresponding to actions that were actually taken
            for i in range(len(targetQ)):
                targetQ[i][actions[i]].data[0] = \
                    rewards[i] + discount * Qvals[i][actions2[i]].data[0]

            self.optimizer.zero_grad()
            loss = self.loss_fn(
                prevQ,
                Variable(targetQ.data, requires_grad=False).cuda())
            loss.backward()
            self.optimizer.step()

            self.currentStepDelta += 1
            #Return the loss from this update, plus a 0/1 flag indicating whether an
            #update was made (useful for calculating the average loss over many updates)
            return loss.data[0], 1
        return 0, 0

    def decayEpsilon(self):
        self.epsilon = self.epsilon * self.epsilon_decay

    def saveExperience(self, filename):
        with open(filename, 'wb') as output:
            for i in range(len(self.experience)):
                pickle.dump(self.experience[i], output,
                            pickle.HIGHEST_PROTOCOL)
        print("Saved transitions: ", len(self.experience))

    def loadExperience(self, filename):
        expfile = Path(filename)
        if expfile.is_file():
            with open(filename, 'rb') as infile:
                while True:
                    try:
                        self.experience.append(pickle.load(infile))
                    except EOFError:
                        break
        print("Loaded transitions: ", len(self.experience))
Example #6

def prepare_state(state):
    # Normalize the MountainCar observation (position, velocity) to roughly [-1, 1]
    ret = np.array([(state[0] + .3) / .9, state[1] / .07])
    ret = np.expand_dims(ret, axis=0)
    return ret


if __name__ == '__main__':
    #Engine init
    env = gym.make('MountainCar-v0')

    #Load names
    models_dir = 'models'
    state_file = 'state'

    #Loading model
    Q_targ = QNet(3)
    weights_file_path, _ = utils.find_prev_state_files(models_dir, state_file)
    weights = torch.load(weights_file_path)
    Q_targ.load_state_dict(weights['Q_targ'])

    #Demonstration
    while True:
        env.reset()
        env.render()

        raw_state, reward, done, info = env.step(1)

        while not done:
            state = prepare_state(raw_state)

            action = np.argmax(
Example #7
    return ret


if __name__ == '__main__':
    #Save/load dirs
    models_dir = './models'
    state_file = 'state'
    out_dir = os.path.join(models_dir, str(int(time.time())))

    #Engine parameters
    env = gym.make('MountainCar-v0')

    num_actions = 3

    #Initializing the neural networks and loading the previous state, if one exists
    Q = QNet(num_actions=num_actions)
    Q_targ = QNet(num_actions=num_actions)

    prev_state = None
    if os.listdir(models_dir) == []:
        Q.apply(utils.init_weights)

        Q_targ.load_state_dict(Q.state_dict())
    else:
        weights_file_path, state_file_path = utils.find_prev_state_files(
            models_dir, state_file)

        weights = torch.load(weights_file_path)
        prev_state = torch.load(state_file_path)

        Q.load_state_dict(weights['Q'])
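
Examples #6 and #7 load network weights from a dict keyed by 'Q' and 'Q_targ', located through the project-specific utils.find_prev_state_files helper. A checkpoint with that layout could plausibly be written as sketched below; the output file name is an assumption, since the saving code is not part of the excerpt, and the accompanying training-state file ('state') is likewise not shown here.

# Hypothetical checkpoint writer matching the keys the loading code above expects
import os
import time
import torch

out_dir = os.path.join('./models', str(int(time.time())))
os.makedirs(out_dir, exist_ok=True)

torch.save({'Q': Q.state_dict(), 'Q_targ': Q_targ.state_dict()},
           os.path.join(out_dir, 'weights.pt'))   # file name assumed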
Example #8
class RLsys:
    """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """
	RL class constructor.
		@param
			actions: the possible actions of the system.
			state_size: the size of the state matrix.
	""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """"""

    def __init__(self,
                 actions,
                 state_size,
                 optt='adam',
                 miniBatchSize=32,
                 TNRate=100,
                 memorySize=1000000,
                 reward_decay=0.95,
                 e_greedy=0.9):
        # Save parameters for later use
        self.state_size = state_size
        self.actions = actions
        self.gamma = reward_decay
        self.epsilon = e_greedy
        # Produce neural network
        self.qnet = QNet(self.state_size, optt)
        self.targetNet = QNet(self.state_size, optt)
        self.targetNet.network = clone_model(self.qnet.network)
        self.memorySize = memorySize
        self.memory = list()
        self.TNRate = TNRate
        self.count = 0
        self.miniBatchSize = miniBatchSize

    def storeTransition(self, oldState, action, reward, newState):
        if len(self.memory) >= self.memorySize:
            i = np.random.randint(0, self.memorySize)
            self.memory[i] = (oldState, action, reward, newState)
        else:
            self.memory.append((oldState, action, reward, newState))

    """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """
	Method which returns the action based on specified state and error.
		@param
			observation: the current state of the system, centered
			around the errors. Dimensionality: NxNxE, where E is the
			amount of errors we wish to evaluate actions for.
		@return
			int: the given action based on the state.
			int: the associated error.
	""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """"""

    def choose_action(self, observation):

        numErrors = observation.shape[2]
        predQ = self.predQ(observation)

        # Check the epsilon-greedy criterion
        if np.random.uniform() > self.epsilon:
            index = np.unravel_index(predQ.argmax(), predQ.shape)
            action = index[0]
            error = index[1]
        else:
            action = np.random.choice(self.actions)
            error = np.random.choice(range(numErrors))

        return action, error

    """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """"
	Returns the predicted Q-value for each error in each direction
		@param:
			observation: the current state of the system, centered
			around the errors. Dimensionality: NxNxE, where E is the
			amount of errors we wish to evaluate actions for.
		
		@return:
			predQ: 2D-vector with Q-values for each error in the
			observation.
	""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """"""

    def predTargetQ(self, observation):
        numErrors = observation.shape[2]
        predQ = np.zeros([4, numErrors])
        for x in range(numErrors):
            state = observation[:, :, x, np.newaxis]
            predQ[:, x] = self.targetNet.predictQ(state)
        return predQ

    def predQ(self, observation):
        numErrors = observation.shape[2]
        predQ = np.zeros([4, numErrors])
        for x in range(numErrors):
            state = observation[:, :, x, np.newaxis]
            predQ[:, x] = self.qnet.predictQ(state)
        return predQ

    """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """
	Trains the neural network given the outcome of the action.
		@param
			state: the previous state of the system.
			action: the action taken.
			reward: the immediate reward received.
			observation_p: the resulting observation.
	""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """"""

    def learn(self):
        # Sample a random minibatch of transitions from memory
        B = list()
        for i in range(self.miniBatchSize):
            j = np.random.randint(0, len(self.memory))
            B.append(self.memory[j])

        state = np.zeros(
            (self.miniBatchSize, self.state_size, self.state_size, 1))
        Q = np.zeros((self.miniBatchSize, 4))

        for i in range(self.miniBatchSize):

            transition = B[i]

            state_ = transition[0]
            action = transition[1]
            reward = transition[2]
            observation_p = transition[3]

            state_ = state_[:, :, np.newaxis]

            state[i, :, :, :] = state_

            Q_ = self.qnet.predictQ(state_)[0, :]
            if observation_p != 'terminal':
                predQ = self.predTargetQ(observation_p)
                Q_[action] = reward + self.gamma * predQ.max()
            else:
                Q_[action] = reward

            Q[i, :] = Q_
        self.qnet.improveQ(state, Q)
        self.count += 1
        if self.count % self.TNRate == 0:
            self.targetNet.network.set_weights(self.qnet.network.get_weights())

    """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """
	Changes the epsilon in the epsilon-greedy policy.
		@param
			epsilon: the new epsilon.
	""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """"""

    def changeEpsilon(self, epsilon):
        self.epsilon = epsilon
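
A minimal sketch of how the replay-buffer variant in Example #8 might be used; the env object and its API are assumed, as in the sketch after Example #4. The main differences from that loop are that transitions go through storeTransition and that learn() takes no arguments, sampling its own minibatch and syncing the target network every TNRate calls.

# Hypothetical driver loop for the replay-buffer RLsys; `env` and its API are assumed
agent = RLsys(actions=[0, 1, 2, 3], state_size=5, miniBatchSize=32, TNRate=100)

for episode in range(500):
    observation = env.reset()                    # assumed: returns an NxNxE observation
    done = False
    while not done:
        action, error = agent.choose_action(observation)
        state = observation[:, :, error]
        observation, reward, done = env.step(action, error)   # assumed env API
        agent.storeTransition(state, action, reward,
                              'terminal' if done else observation)
        if len(agent.memory) >= agent.miniBatchSize:
            agent.learn()
    agent.changeEpsilon(max(0.05, agent.epsilon * 0.99))       # decay schedule not given in the source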