Example #1

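The example assumes the imports below. DeepQNetwork, ReplayMemory and RMSprop are project-local modules, so the module paths shown here are assumptions for illustration only.

# Imports needed by this example (local module paths are assumed)
import cv2
import numpy as np
import theano
import theano.tensor as T
from theano import shared, function

from network import DeepQNetwork         # assumed module path
from replay_memory import ReplayMemory   # assumed module path
from optimizers import RMSprop           # assumed module path

floatX = theano.config.floatX
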
class NeuralQLearner:

    def __init__(self, settings):

        assert isinstance(settings, dict)

        # Random number generator (e.g. a seeded numpy.random.RandomState)
        self.rng = settings['RNG']

        # Epsilon
        self.epsilon_start = settings['EPSILON_START']
        self.epsilon_end = settings['EPSILON_END']
        self.epsilon_end_time = settings['EPSILON_END_TIME']
        self.testing_epsilon = settings['TESTING_EPSILON']
        # Per-step decrement for linear epsilon annealing
        self.epsilon_decay = (self.epsilon_start-self.epsilon_end)/float(self.epsilon_end_time)

        # Training
        self.learning_rate = settings['LEARNING_RATE']
        self.rmsprop_rho = settings['RMSPROP_RHO']
        self.rmsprop_epsilon = settings['RMSPROP_EPSILON']
        self.target_net_update = settings['TARGET_NET_UPDATE']
        self.min_reward = settings['MIN_REWARD']
        self.max_reward = settings['MAX_REWARD']

        # Q-Learning Parameters
        self.n_actions = settings['N_ACTIONS']
        self.discount_factor = settings['DISCOUNT_FACTOR']
        self.update_frequency = settings['UPDATE_FREQUENCY']
        self.learn_start = settings['LEARN_START']
        self.agent_history_length = settings['AGENT_HISTORY_LENGTH']
        self.batch_size = settings['BATCH_SIZE']

        # Preprocess
        self.resize_width = settings['RESIZE_WIDTH']
        self.resize_height = settings['RESIZE_HEIGHT']
        self.resize_dims = (self.resize_width, self.resize_height)

        self.net = DeepQNetwork(
            self.n_actions,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )
        self.target_net = DeepQNetwork(
            self.n_actions,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )
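        # Start the target network as an exact copy of the online network's weights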
        self.target_net.setWeights(self.net.getWeights())

        self.memory = ReplayMemory(settings)

        self.numSteps = 0
        self.lastState = None
        self.lastAction = None
        self.lastTerminal = None

        self.compile()

    def compile(self):

        # Minibatch input layout: (batch_size, history_length, height, width)
        input_shape = (
            self.batch_size,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )

        # Same layout with a batch of one, used when predicting Q-values for a single state
        pred_input_shape = (
            1,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )

        # Shared buffers: each minibatch is copied into these once and handed to the
        # compiled functions through 'givens', so no inputs are passed per call
        self.pred_input = shared(np.zeros(pred_input_shape, dtype=floatX))
        self.net_input = shared(np.zeros(input_shape, dtype=floatX))
        self.target_net_input = shared(np.zeros(input_shape, dtype=floatX))

        self.shared_actions = shared(np.zeros((self.batch_size,), dtype='int32'))
        self.shared_rewards = shared(np.zeros((self.batch_size,), dtype='int32'))
        self.shared_terminals = shared(np.zeros((self.batch_size,), dtype=floatX))

        actions = T.ivector()
        rewards = T.ivector()
        terminals = T.vector()

        # Q-learning target: r + (1 - terminal) * gamma * max_a' Q_target(s', a')
        targets = (rewards +
                   (T.ones_like(terminals) - terminals) *
                   self.discount_factor * T.max(self.target_net.qvalues, axis=1))

        # TD error of the action actually taken in each transition
        diff = targets - self.net.qvalues[T.arange(self.batch_size), actions]

        # Huber loss: quadratic for |diff| <= 1, linear beyond, summed over the batch
        qp = T.minimum(abs(diff), 1.0)
        lp = abs(diff) - qp
        delta = 0.5 * qp ** 2 + lp
        cost = T.sum(delta)

        # Project-local RMSprop optimizer; getUpdates() returns the parameter update rules
        optimizer = RMSprop(
            cost,
            self.net.params,
            lr=self.learning_rate,
            rho=self.rmsprop_rho,
            epsilon=self.rmsprop_epsilon
        )

        # Bind the symbolic inputs to the shared buffers filled in qLearnMinibatch()
        givens = {
            self.net.input: self.net_input,
            self.target_net.input: self.target_net_input,
            actions: self.shared_actions,
            rewards: self.shared_rewards,
            terminals: self.shared_terminals
        }

        # train(): one optimization step on the minibatch currently held in the shared buffers
        self.train = function(
            inputs=[],
            outputs=cost,
            updates=optimizer.getUpdates(),
            givens=givens
        )

        # prediction(): Q-values for the single state currently held in pred_input
        self.prediction = function(
            inputs=[],
            outputs=self.net.qvalues.flatten(1),
            givens={
                self.net.input: self.pred_input
            }
        )

    def preprocess(self, rawstate):

        # Down-sample the raw frame to (resize_width, resize_height); cv2.resize takes (width, height)
        return cv2.resize(rawstate, self.resize_dims, interpolation=cv2.INTER_LINEAR)

    def getEpsilon(self):

        # Current epsilon under linear annealing, clamped at epsilon_end
        current_epsilon = self.epsilon_start - (self.numSteps * self.epsilon_decay)
        return max(self.epsilon_end, current_epsilon)

    def qLearnMinibatch(self):

        s1, a, r, t, s2 = self.memory.sampleMinibatch()
        # borrow=True so Theano does not copy the array, which is faster
        self.net_input.set_value(s1, borrow=True)
        self.shared_actions.set_value(a, borrow=True)
        self.shared_rewards.set_value(r, borrow=True)
        self.shared_terminals.set_value(t, borrow=True)
        self.target_net_input.set_value(s2, borrow=True)
        return self.train()

    def perceive(self, rawstate, reward, terminal, testing):

        state = self.preprocess(rawstate)
        # Clip the reward to [min_reward, max_reward]
        reward = max(reward, self.min_reward)
        reward = min(reward, self.max_reward)

        self.memory.storeRecentState(state, terminal)

        # Store the completed transition (s, a, r, terminal) only while training
        if((not testing) and (self.lastState is not None)):
            self.memory.storeTransition(self.lastState, self.lastAction, reward, self.lastTerminal)

        # Default to action 0 on terminal frames; otherwise select epsilon-greedily
        actionIndex = 0
        if(not terminal):
            actionIndex = self.eGreedy(testing)

        flag1 = (self.numSteps > self.learn_start)
        flag2 = (self.numSteps % self.update_frequency == 0)

        # Train only after the warm-up period (learn_start) and then only every
        # update_frequency steps; never train while testing
        if((not testing) and flag1 and flag2):
            cost = self.qLearnMinibatch()

        # Periodically copy the online network's weights into the target network
        if(self.numSteps % self.target_net_update == 0):
            self.target_net.setWeights(self.net.getWeights())

        self.lastState = state
        self.lastAction = actionIndex
        self.lastTerminal = terminal

        if(not testing):
            self.numSteps += 1

        return actionIndex

    def eGreedy(self, testing):

        # With probability epsilon take a random action, otherwise act greedily
        epsilon = self.testing_epsilon if(testing) else self.getEpsilon()
        if(self.rng.uniform(0,1) < epsilon):
            return self.rng.randint(0, self.n_actions)
        else:
            return self.greedy()

    def greedy(self):

        # Fetch the most recent stacked frames and add a leading batch dimension of 1
        curState = self.memory.getRecentState()
        curState = curState.reshape(1, curState.shape[0], curState.shape[1], curState.shape[2])
        self.pred_input.set_value(curState, borrow=True)
        q = self.prediction()

        # Arg-max over the Q-values, breaking ties uniformly at random
        maxq = q[0]
        besta = [0]
        for a in range(1, self.n_actions):
            if(q[a] > maxq):
                maxq = q[a]
                besta = [a]
            elif(q[a] == maxq):
                besta.append(a)

        r = self.rng.randint(0, len(besta))
        return besta[r]
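
For reference, a minimal settings dictionary covering every key the constructor above reads might look like the sketch below. Only the keys are taken from the code; the values, the choice of a seeded numpy RandomState, and the interaction step in the trailing comments are illustrative assumptions (ReplayMemory receives the same dict and may require additional keys of its own).

# Sketch of a settings dict for NeuralQLearner (values are assumptions)
settings = {
    'RNG': np.random.RandomState(123),  # needs uniform() and randint(), as used in eGreedy()/greedy()
    'EPSILON_START': 1.0,
    'EPSILON_END': 0.1,
    'EPSILON_END_TIME': 1000000,
    'TESTING_EPSILON': 0.05,
    'LEARNING_RATE': 0.00025,
    'RMSPROP_RHO': 0.95,
    'RMSPROP_EPSILON': 0.01,
    'TARGET_NET_UPDATE': 10000,
    'MIN_REWARD': -1.0,
    'MAX_REWARD': 1.0,
    'N_ACTIONS': 4,
    'DISCOUNT_FACTOR': 0.99,
    'UPDATE_FREQUENCY': 4,
    'LEARN_START': 50000,
    'AGENT_HISTORY_LENGTH': 4,
    'BATCH_SIZE': 32,
    'RESIZE_WIDTH': 84,
    'RESIZE_HEIGHT': 84,
    # ...plus any keys expected by the project-local ReplayMemory
}

agent = NeuralQLearner(settings)
# Typical interaction step: hand the raw frame, reward and terminal flag to
# perceive() and execute the returned action index in the environment, e.g.
# action = agent.perceive(frame, reward, terminal, testing=False)

The RMSprop class used in compile() is also project-local, so its implementation is not part of the example. A minimal sketch that is consistent with how it is called there (a constructor taking cost, params, lr, rho and epsilon, plus a getUpdates() method) could be:

# Hypothetical RMSprop helper, inferred only from its usage in compile() above
class RMSprop(object):

    def __init__(self, cost, params, lr, rho, epsilon):
        self.updates = []
        grads = T.grad(cost, params)
        for param, grad in zip(params, grads):
            # Running average of squared gradients, one accumulator per parameter
            acc = shared(np.zeros(param.get_value().shape, dtype=floatX))
            acc_new = rho * acc + (1.0 - rho) * grad ** 2
            self.updates.append((acc, acc_new))
            self.updates.append((param, param - lr * grad / T.sqrt(acc_new + epsilon)))

    def getUpdates(self):
        return self.updates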