Example 1
class Shallow_Q_Learner(object):
    def __init__(self,
                 state_shape,
                 action_shape,
                 learning_rate=0.005,
                 gamma=0.98,
                 memory=Memory(capacity=2000)):
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.gamma = gamma  # Agent's discount factor
        self.learning_rate = learning_rate  # Agent's Q-learning rate
        # self.Q is the Action-Value function. This agent represents Q using a
        # Neural Network.
        print(self.state_shape, self.action_shape)
        self.Q = DQNAgent().build_model(self.state_shape[0], self.action_shape,
                                        0.01, 0.01)
        self.tQ = DQNAgent().build_model(self.state_shape[0],
                                         self.action_shape, 0.01, 0.01)
        # self.policy is the policy followed by the agent. This agent follows
        # an epsilon-greedy policy w.r.t. its Q estimate.
        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * MAX_STEPS_PER_EPISODE)
        self.step_num = 0
        self.update_steps = 64
        # self.memory = deque(maxlen=2000)
        self.memory = memory
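
Each example builds its network through a DQNAgent().build_model(...) helper that is not shown here. The sketch below is purely illustrative: it is one Keras builder consistent with the call signature above, and it assumes (without confirmation from the source) that the two trailing 0.01 arguments are the optimizer's learning rate and decay.

# Hypothetical sketch of the DQNAgent.build_model helper used above; the two
# trailing 0.01 arguments are assumed to be the Adam learning rate and decay.
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam


class DQNAgent:
    def build_model(self, input_dim, output_dim, learning_rate, decay):
        # Small fully connected network mapping a state vector to one
        # Q-value per discrete action.
        model = Sequential()
        model.add(Dense(24, input_dim=input_dim, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(output_dim, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=learning_rate, decay=decay))
        return model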
Example 2
class Shallow_Q_Learner(object):
    def __init__(self, state_shape, action_shape, learning_rate=0.005, gamma=0.98):
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.gamma = gamma # Agent's discount factor
        self.learning_rate = learning_rate # Agent's Q-learning rate
        # self.Q is the Action-Value function. This agent represents Q using a
        # Neural Network.
        print(self.state_shape, self.action_shape)
        self.Q = DQNAgent().build_model(self.state_shape[0], self.action_shape, 0.01, 0.01)
        # self.policy is the policy followed by the agent. This agent follows
        # an epsilon-greedy policy w.r.t. its Q estimate.
        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * MAX_STEPS_PER_EPISODE)
        self.step_num = 0

    def get_action(self, observation):
        return self.policy(observation)

    def epsilon_greedy_Q(self, observation):
        # Decay epsilon/exploration as per the schedule
        self.step_num += 1
        if random.random() < self.epsilon_decay(self.step_num):
            return random.choice([i for i in range(self.action_shape)])
        return np.argmax(self.Q.predict(observation))


    def learn(self, s, a, r, s_next):
        y_target = self.Q.predict(s)
        y_target[0][a] = r + self.gamma * np.max(self.Q.predict(s_next)[0])

        self.Q.fit(np.array(s), np.array(y_target), batch_size=1, verbose=0)
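
All four examples construct the exploration schedule as LinearDecaySchedule(initial_value=..., final_value=..., max_steps=...) and later call it with the current step number. The actual implementation is defined elsewhere; a minimal sketch consistent with that usage is:

# Minimal LinearDecaySchedule consistent with how the examples call it:
# constructed with initial_value, final_value and max_steps, then called
# with the current step number to get the decayed epsilon.
class LinearDecaySchedule(object):
    def __init__(self, initial_value, final_value, max_steps):
        assert initial_value > final_value, "initial_value must exceed final_value"
        self.initial_value = initial_value
        self.final_value = final_value
        self.decay_factor = (initial_value - final_value) / max_steps

    def __call__(self, step_num):
        # Linearly decay, then clamp at the final value.
        current_value = self.initial_value - self.decay_factor * step_num
        return max(current_value, self.final_value)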
Example 3
class Shallow_Q_Learner(object):
    def __init__(self,
                 state_shape,
                 action_shape,
                 learning_rate=0.005,
                 gamma=0.98):
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.gamma = gamma  # Agent's discount factor
        self.learning_rate = learning_rate  # Agent's Q-learning rate
        # self.Q is the Action-Value function. This agent represents Q using a
        # Neural Network.
        print(self.state_shape, self.action_shape)
        self.Q = DQNAgent().build_model(self.state_shape[0], self.action_shape,
                                        0.01, 0.01)
        # self.policy is the policy followed by the agent. This agent follows
        # an epsilon-greedy policy w.r.t. its Q estimate.
        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * MAX_STEPS_PER_EPISODE)
        self.step_num = 0
        self.memory = deque(maxlen=2000)

    def get_action(self, observation):
        return self.policy(observation)

    def epsilon_greedy_Q(self, observation):
        # Decay epsilon/exploration as per the schedule
        self.step_num += 1
        if random.random() < self.epsilon_decay(self.step_num):
            return random.choice([i for i in range(self.action_shape)])
        return np.argmax(self.Q.predict(observation))

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size):
        x_batch, y_batch = [], []
        minibatch = random.sample(self.memory, min(len(self.memory),
                                                   batch_size))
        for state, action, reward, next_state, done in minibatch:
            y_target = self.Q.predict(state)
            if done:
                y_target[0][action] = reward
            else:
                y_target[0][action] = reward + self.gamma * np.max(
                    self.Q.predict(next_state)[0])
            x_batch.append(state[0])
            y_batch.append(y_target[0])

        self.Q.fit(np.array(x_batch),
                   np.array(y_batch),
                   batch_size=len(x_batch),
                   verbose=0)
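
Example 3 adds a plain experience-replay buffer (a deque) with remember() and replay(). A hypothetical driver loop for it might look as follows; the environment name (CartPole-v0) and the batch size of 32 are illustrative choices, not part of the original code, and the module-level MAX_NUM_EPISODES / MAX_STEPS_PER_EPISODE constants referenced by the class are assumed to be defined.

# Hypothetical training loop for Example 3's agent. States are reshaped to
# (1, state_dim) batches because Q.predict()/Q.fit() expect batched inputs.
import gym
import numpy as np

env = gym.make("CartPole-v0")  # assumed environment; any discrete-action env works
agent = Shallow_Q_Learner(env.observation_space.shape, env.action_space.n)
batch_size = 32  # illustrative value

for episode in range(MAX_NUM_EPISODES):
    obs = env.reset()
    state = np.reshape(obs, [1, env.observation_space.shape[0]])
    for step in range(MAX_STEPS_PER_EPISODE):
        action = agent.get_action(state)
        next_obs, reward, done, _ = env.step(action)
        next_state = np.reshape(next_obs, [1, env.observation_space.shape[0]])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            break
    # Learn from a random minibatch once enough transitions are stored.
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)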
Example 4
class Shallow_Q_Learner(object):
    def __init__(self,
                 state_shape,
                 action_shape,
                 learning_rate=0.005,
                 gamma=0.98,
                 memory=Memory(capacity=2000)):
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.gamma = gamma  # Agent's discount factor
        self.learning_rate = learning_rate  # Agent's Q-learning rate
        # self.Q is the Action-Value function. This agent represents Q using a
        # Neural Network.
        print(self.state_shape, self.action_shape)
        self.Q = DQNAgent().build_model(self.state_shape[0], self.action_shape,
                                        0.01, 0.01)
        self.tQ = DQNAgent().build_model(self.state_shape[0],
                                         self.action_shape, 0.01, 0.01)
        # self.policy is the policy followed by the agent. This agent follows
        # an epsilon-greedy policy w.r.t. its Q estimate.
        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * MAX_STEPS_PER_EPISODE)
        self.step_num = 0
        self.update_steps = 64
        # self.memory = deque(maxlen=2000)
        self.memory = memory

    def get_action(self, observation):
        return self.policy(observation)

    def epsilon_greedy_Q(self, observation):
        # Decay epsilon/exploration as per the schedule
        self.step_num += 1
        if random.random() < self.epsilon_decay(self.step_num):
            return random.choice([i for i in range(self.action_shape)])
        return np.argmax(self.Q.predict(observation))

    def compute_td_error(self, next_state, reward):
        # TD error estimated from the next state alone: the bootstrapped
        # target (discounted by gamma, bootstrapping from the target network
        # tQ) minus the online network's value estimate for that state.
        q_estimate = np.max(self.Q.predict(next_state)[0])
        target = reward + self.gamma * np.max(self.tQ.predict(next_state)[0])
        td_error = abs(target - q_estimate)
        return td_error

    def remember(self, experience):
        self.memory.add(experience)

    def replay(self, batch_size):
        batch = self.memory.sample(batch_size)
        x_batch, y_batch, errors = [], [], []
        for i in range(len(batch)):
            # Each sampled entry is a (buffer index, experience, weight) triple.
            state, action, reward, next_state, done = batch[i][1]

            y_target = self.Q.predict(state)
            target_init = y_target[0][action]
            if done:
                y_target[0][action] = reward
            else:
                y_target[0][action] = reward + self.gamma * np.max(
                    self.tQ.predict(next_state)[0])

            x_batch.append(state[0])
            y_batch.append(y_target[0])

            # Weight the TD error by the sampled importance weight before
            # updating the transition's priority in memory.
            td_error = abs(target_init - y_target[0][action])
            errors.append(td_error * batch[i][2])

        for i in range(len(batch)):
            idx = batch[i][0]
            self.memory.update(idx, errors[i])

        # `tensorboard` is assumed to be a Keras TensorBoard callback defined
        # elsewhere in the module.
        self.Q.fit(np.array(x_batch),
                   np.array(y_batch),
                   batch_size=len(x_batch),
                   verbose=0,
                   callbacks=[tensorboard])

    def target_train(self):
        # Soft (Polyak) update: move the target network tQ a small step
        # towards the online network Q; self.learning_rate is used as the
        # mixing coefficient (tau).
        weights = self.Q.get_weights()
        target_weights = self.tQ.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = (weights[i] * self.learning_rate +
                                 target_weights[i] * (1 - self.learning_rate))
        self.tQ.set_weights(target_weights)
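
Example 4 replaces the deque with a Memory object whose interface is only implied by the code: add() stores an experience, sample() returns (index, experience, weight) triples, and update() re-prioritizes a transition from its TD error. The sketch below is one simplified, list-based way to satisfy that interface (proportional prioritization with importance-sampling weights); production implementations typically use a sum-tree instead, and the epsilon/alpha/beta defaults here are conventional, assumed values.

# Simplified prioritized replay buffer matching the interface Example 4 assumes:
# add(experience), sample(batch_size) -> [(index, experience, weight), ...],
# update(index, error).
import random


class Memory(object):
    def __init__(self, capacity, epsilon=0.01, alpha=0.6, beta=0.4):
        self.capacity = capacity
        self.epsilon = epsilon  # keeps every priority strictly positive
        self.alpha = alpha      # how strongly TD errors shape the sampling
        self.beta = beta        # importance-sampling correction strength
        self.data = []
        self.priorities = []

    def _priority(self, error):
        return (abs(error) + self.epsilon) ** self.alpha

    def add(self, experience, error=1.0):
        if len(self.data) >= self.capacity:
            self.data.pop(0)
            self.priorities.pop(0)
        self.data.append(experience)
        self.priorities.append(self._priority(error))

    def sample(self, batch_size):
        total = sum(self.priorities)
        probs = [p / total for p in self.priorities]
        k = min(batch_size, len(self.data))
        indices = random.choices(range(len(self.data)), weights=probs, k=k)
        # Normalized importance-sampling weights, largest weight scaled to 1.
        weights = [(len(self.data) * probs[i]) ** (-self.beta) for i in indices]
        max_w = max(weights)
        return [(i, self.data[i], w / max_w) for i, w in zip(indices, weights)]

    def update(self, idx, error):
        self.priorities[idx] = self._priority(error)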