Code example #1
import os

import numpy as np
import tensorflow as tf

# RLModel, ExperienceReplay, ACTIONS and the AGENT_INPUT_SIZE, ER_SIZE,
# TRAIN_RATE, BATCH_SIZE and SAVE_PATH constants are defined elsewhere
# in the project.


class DeepQlearner:
    def __init__(self,
                 random_action_method,
                 future_discount=0.75,
                 learning_rate=0.001,
                 load_path=None):
        # Rescale the learning rate relative to a reference discount factor of 0.8
        learning_rate = learning_rate * (1 - future_discount) / (1 - 0.8)

        self.model = RLModel()
        self.model.build((None, AGENT_INPUT_SIZE))
        self.load_path = load_path
        if load_path is not None and os.path.isfile(load_path):
            print("Loading")
            self.model.load_weights(load_path)

        self.exp_rep = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)

        self.random_action_method = random_action_method

        self.learning_rate = learning_rate
        self.future_discount = future_discount

        self.loss_measure = tf.losses.MeanSquaredError()
        self.opt = tf.optimizers.Adam(learning_rate=self.learning_rate)

        self.n_since_last_train = 0

        self.latestLoss = tf.add(0, 0)

    def getActions(self, agentInputs):
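        # With a probability decided by random_action_method, every agent takes
        # the same random action this step; otherwise each agent acts greedily
        # on the model's predicted Q-values.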
        rand_action = self.random_action_method.get_random_action()
        if rand_action is not None:
            return [rand_action] * agentInputs.shape[0]
        else:
            pred = self.model(agentInputs)
            #print(pred[0])
            return [ACTIONS[x] for x in np.argmax(pred, axis=1)]

    def update(self, oldAgentInputs, actions, newAgentInputs, rewards):
        # Add the transitions to the experience replay buffer

        actions = np.array([ACTIONS.index(action) for action in actions])
        #print(["LEFT","RIGHT","JUMP","NONE"][actions[0]],rewards[0])
        self.exp_rep.add_experinces(oldAgentInputs, actions, newAgentInputs,
                                    rewards)

        self.n_since_last_train += oldAgentInputs.shape[0]

        if self.n_since_last_train > TRAIN_RATE:
            loss = self.train_on_random_minibatch()

            self.n_since_last_train = 0

    def train_on_random_minibatch(self):
        input, action, new_input, reward = self.exp_rep.get_random_minibatch(
            BATCH_SIZE)

        loss = self.train_on_batch(input, action, new_input, reward)

        #if self.load_path is not None:
        #    self.save(self.load_path)

        return loss.numpy()

    def train_on_batch(self, agent_input_before, action, agent_input_after,
                       reward):
        # Q-learning target: reward plus the discounted maximum predicted
        # Q-value in the next state.
        q_after = self.model(agent_input_after)
        wanted_q = reward + self.future_discount * tf.reduce_max(q_after,
                                                                 axis=1)
        #wanted_q = reward

        tvars = self.model.trainable_variables

        with tf.GradientTape() as tape:
            pred_q_for_all_actions = self.model(agent_input_before)

            # Index out the predicted Q-values for the actions that were taken
            action_ind = tf.transpose(
                [tf.range(agent_input_before.shape[0]), action])
            pred_q_for_action = tf.gather_nd(pred_q_for_all_actions,
                                             action_ind)

            loss = self.loss_measure(wanted_q, pred_q_for_action)

        gradients = tape.gradient(loss, tvars)
        self.opt.apply_gradients(zip(gradients, tvars))

        self.latestLoss = loss
        return loss

    def save(self, path=SAVE_PATH):
        self.model.save_weights(path)
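
For context, here is a minimal usage sketch of DeepQlearner. The environment object env, its get_agent_inputs() and step() methods, and the EpsilonGreedy helper are hypothetical placeholders for whatever the surrounding project provides; only the DeepQlearner calls themselves come from the class above.

# Hypothetical driver loop for DeepQlearner (env and EpsilonGreedy are
# illustrative stand-ins, not part of the original project code).
learner = DeepQlearner(random_action_method=EpsilonGreedy(0.1),
                       future_discount=0.75,
                       learning_rate=0.001,
                       load_path="weights.h5")

for episode in range(100):
    agent_inputs = env.get_agent_inputs()   # shape (n_agents, AGENT_INPUT_SIZE)
    for _ in range(1000):
        actions = learner.getActions(agent_inputs)
        new_agent_inputs, rewards = env.step(actions)
        learner.update(agent_inputs, actions, new_agent_inputs, rewards)
        agent_inputs = new_agent_inputs
    learner.save()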
Code example #2
import os
import random

import numpy as np
import tensorflow as tf

# RLModel, ExperienceReplay, ACTIONS and the AGENT_INPUT_SIZE, ER_SIZE,
# TRAIN_RATE, BATCH_SIZE, SAVE_PATH_A and SAVE_PATH_B constants are defined
# elsewhere in the project.


class DoubleDeepQlearner:
    def __init__(self,
                 random_action_method,
                 future_discount=0.75,
                 learning_rate=0.001,
                 saveAndLoad=True):
        # Rescale the learning rate relative to a reference discount factor of 0.8
        learning_rate = learning_rate * (1 - future_discount) / (1 - 0.8)

        self.model_a = RLModel()
        self.model_a.build((None, AGENT_INPUT_SIZE))

        self.model_b = RLModel()
        self.model_b.build((None, AGENT_INPUT_SIZE))

        self.saveAndLoad = saveAndLoad

        if os.path.isfile(SAVE_PATH_A) and os.path.isfile(
                SAVE_PATH_B) and saveAndLoad:
            print("Loading")
            self.model_a.load_weights(SAVE_PATH_A)
            self.model_b.load_weights(SAVE_PATH_B)

        self.exp_rep_a = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)
        self.exp_rep_b = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)

        self.random_action_method = random_action_method

        self.learning_rate = learning_rate
        self.future_discount = future_discount

        self.loss_measure = tf.losses.MeanSquaredError()
        self.opt = tf.optimizers.Adam(learning_rate=self.learning_rate)

        self.n_since_last_train = 0

        self.latestLoss = tf.add(0, 0)

    def getAction(self, agentInput):
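        # Either take a random action (decided by random_action_method) or pick
        # one of the two networks at random and act greedily on its predicted
        # Q-values.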
        rand_action = self.random_action_method.get_random_action()
        if rand_action is not None:
            return rand_action
        else:
            model = random.choice([self.model_a, self.model_b])

            pred = model.call_fast(agentInput)
            return ACTIONS[np.argmax(pred)]

    def update(self, oldAgentInput, action, newAgentInput, reward):
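        # Store each transition in exactly one of the two replay buffers, chosen
        # uniformly at random, and train once roughly every TRAIN_RATE transitions.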
        if random.random() < 0.5:
            self.exp_rep_a.add_experince(oldAgentInput, ACTIONS.index(action),
                                         newAgentInput, reward)
        else:
            self.exp_rep_b.add_experince(oldAgentInput, ACTIONS.index(action),
                                         newAgentInput, reward)

        self.n_since_last_train += 1

        if self.n_since_last_train > TRAIN_RATE:
            # print("Training")
            loss = self.train_on_random_minibatch()
            #print("Loss =", loss)
            if self.saveAndLoad:
                self.model_a.save_weights(SAVE_PATH_A)
                self.model_b.save_weights(SAVE_PATH_B)

            self.n_since_last_train = 0

    def train_on_random_minibatch(self):
        train_a = random.random() < 0.5

        if train_a:
            input, action, new_input, reward = self.exp_rep_a.get_random_minibatch(
                BATCH_SIZE)
        else:
            input, action, new_input, reward = self.exp_rep_b.get_random_minibatch(
                BATCH_SIZE)

        loss = self.train_on_batch(input, action, new_input, reward, train_a)
        return loss.numpy()

    def train_on_batch(self, agent_input_before, action, agent_input_after,
                       reward, train_a):

        if train_a:
            model_t = self.model_a
            model_p = self.model_b
        else:
            model_t = self.model_b
            model_p = self.model_a

        # Double Q-learning target: the network being trained (model_t) selects
        # the greedy action in the next state...
        t_best_action = tf.math.argmax(model_t(agent_input_after), axis=1)
        tba_ind = tf.transpose([
            tf.range(agent_input_before.shape[0]),
            tf.cast(t_best_action, "int32")
        ])

        # ...and the other network (model_p) evaluates that action, which
        # reduces the overestimation bias of standard Q-learning.
        q_after = model_p(agent_input_after)
        q_after_max = tf.gather_nd(q_after, tba_ind)
        wanted_q = reward + self.future_discount * q_after_max
        #wanted_q = reward

        tvars = model_t.trainable_variables

        with tf.GradientTape() as tape:
            pred_q_for_all_actions = model_t(agent_input_before)

            # Index out the predicted Q-values for the actions that were taken
            action_ind = tf.transpose(
                [tf.range(agent_input_before.shape[0]), action])
            pred_q_for_action = tf.gather_nd(pred_q_for_all_actions,
                                             action_ind)

            loss = self.loss_measure(wanted_q, pred_q_for_action)

        gradients = tape.gradient(loss, tvars)
        self.opt.apply_gradients(zip(gradients, tvars))

        self.latestLoss = loss
        return loss
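
DoubleDeepQlearner exposes a per-transition interface (getAction/update on single observations) rather than the batched one above. A correspondingly minimal sketch, again with a hypothetical single-agent env and an illustrative EpsilonGreedy exploration helper:

# Hypothetical per-step driver loop for DoubleDeepQlearner (env and
# EpsilonGreedy are illustrative stand-ins, not part of the original code).
learner = DoubleDeepQlearner(random_action_method=EpsilonGreedy(0.1),
                             future_discount=0.75,
                             learning_rate=0.001,
                             saveAndLoad=True)

for episode in range(100):
    agent_input = env.reset()                # vector of length AGENT_INPUT_SIZE
    for _ in range(1000):
        action = learner.getAction(agent_input)
        new_agent_input, reward = env.step(action)
        learner.update(agent_input, action, new_agent_input, reward)
        agent_input = new_agent_input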