Example no. 1
    def __init__(self, actions, gamma=0.1, e_greedy=0.9):
        self.state_size = 4
        neurons = 24

        self.actions = actions
        self.gamma = gamma
        self.epsilon = e_greedy
        self.lr = 0.1
        self.count = 0
        self.epochs = 50

        self.v_max = 10
        self.v_min = -10
        self.atoms = 51
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
        self.z = [self.v_min + i * self.delta_z for i in range(self.atoms)]

        self.m = Build_Model(self.state_size,
                             neurons,
                             len(actions),
                             atoms=self.atoms)
        self.m.build()
        self.model = self.m.model
        self.dump_model = copy.copy(self.model)
        self.optimizer = tf.optimizers.Adam(lr=self.lr)
        self.batch_size = 100

        self.capacity = 300
        self.memory = Memory(self.capacity)

        self.record_size = self.capacity
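As a quick sanity check on the distributional support set up above: with v_min = -10, v_max = 10 and 51 atoms, delta_z = (10 - (-10)) / (51 - 1) = 0.4, so self.z is the evenly spaced grid [-10.0, -9.6, ..., 9.6, 10.0]. A standalone sketch of the same computation:

v_min, v_max, atoms = -10, 10, 51
delta_z = (v_max - v_min) / (atoms - 1)            # (10 - (-10)) / 50 = 0.4
z = [v_min + i * delta_z for i in range(atoms)]    # [-10.0, -9.6, ..., 9.6, 10.0]
assert len(z) == atoms and abs(z[-1] - v_max) < 1e-9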
Example no. 2
    def __init__(self, actions, gamma=0.95, e_greedy=0.9):
        self.actions = actions  # a list
        self.gamma = gamma
        self.epsilon = e_greedy
        self.record = []
        self.lr = 0.4
        self.count = 0

        self.m = Build_Model(1, 10, len(actions))
        self.model = self.m.model
        self.dump_model = copy.copy(self.model)
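A side note on the dump/target model seen in this and several other examples: copy.copy(self.model) is a shallow copy, so the resulting "dump" model most likely shares the same layers and weights as the online network instead of being an independent, periodically frozen target network. A common TF2 alternative (a sketch, not what Build_Model actually does, assuming the model is a Sequential or functional Keras model) is to clone the architecture and copy the weights explicitly:

import tensorflow as tf

def make_target(model):
    # architecturally identical but independent copy of a Keras model
    target = tf.keras.models.clone_model(model)
    target.set_weights(model.get_weights())   # start from the same parameters
    return target

def sync_target(model, target):
    # periodically copy the online weights into the target network
    target.set_weights(model.get_weights())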
Example no. 3
    def __init__(self, actions, gamma=0.7, e_greedy=0.7):
        self.actions = actions  # a list
        self.gamma = gamma
        self.epsilon = e_greedy
        self.lr = 0.01
        self.count = 0
        self.epochs = 50
        self.bar = Progbar(self.epochs)
        self.epoch_loss_avg = tf.keras.metrics.Mean()

        self.batch_size = 100
        self.state_size = 2
        self.record_size = 200

        # initialize the models: the hard-working (online) one and the frozen 'dump' (target) one
        M = Build_Model(self.state_size, 16, len(actions))
        self.model = M.build()
        self.dump_model = copy.copy(self.model)
        self.optimizer = tf.optimizers.Adam(lr=self.lr)
        # initialize the replay memory backed by a sum tree
        self.capacity = 200
        self.memory = Memory(self.capacity)
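The Memory(self.capacity) class used throughout these examples is not shown on this page; going by the "initialize the replay memory backed by a sum tree" comment and the add / sample / update calls in Example no. 6, it is a prioritized-experience-replay buffer. The following is only a rough sketch of what such a sum-tree buffer typically looks like, written to make the interface concrete; the real Memory class may differ in its priority exponent, importance-sampling weights and edge-case handling:

import numpy as np

class SumTree:
    """Binary tree whose parent nodes hold the sum of their children's priorities."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)   # internal nodes followed by leaves
        self.data = [None] * capacity            # stored transitions
        self.write = 0                           # index of the next leaf to overwrite

    def total(self):
        return self.tree[0]

    def add(self, priority, data):
        idx = self.write + self.capacity - 1
        self.data[self.write] = data
        self.update(idx, priority)
        self.write = (self.write + 1) % self.capacity

    def update(self, idx, priority):
        change = priority - self.tree[idx]
        self.tree[idx] = priority
        while idx != 0:                          # propagate the change up to the root
            idx = (idx - 1) // 2
            self.tree[idx] += change

    def get(self, s):
        idx = 0
        while 2 * idx + 1 < len(self.tree):      # descend until a leaf is reached
            left, right = 2 * idx + 1, 2 * idx + 2
            if s <= self.tree[left]:
                idx = left
            else:
                s -= self.tree[left]
                idx = right
        return idx, self.tree[idx], self.data[idx - self.capacity + 1]


class Memory:
    """Prioritized replay buffer with the add / sample / update interface used above."""

    e = 0.01     # small constant so zero-error transitions still get sampled
    a = 0.6      # priority exponent (assumed value)
    beta = 0.4   # importance-sampling exponent (assumed value)

    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = SumTree(capacity)

    def _priority(self, error):
        return (abs(error) + self.e) ** self.a

    def add(self, error, sample):
        self.tree.add(self._priority(error), sample)

    def sample(self, n):
        batch, idxs, priorities = [], [], []
        segment = self.tree.total() / n          # stratified sampling over the priorities
        for i in range(n):
            s = np.random.uniform(segment * i, segment * (i + 1))
            idx, p, data = self.tree.get(s)
            batch.append(data)
            idxs.append(idx)
            priorities.append(p)
        probs = np.maximum(np.array(priorities) / self.tree.total(), 1e-8)
        is_weights = (self.capacity * probs) ** (-self.beta)
        return batch, idxs, is_weights / is_weights.max()

    def update(self, idx, error):
        self.tree.update(idx, self._priority(error))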
Example no. 4
    def __init__(self,
                 actions,
                 learning_rate=0.001,
                 reward_decay=0.9,
                 e_greedy=0.4):
        self.actions = actions  # a list
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.batch_size = 25
        self.state_size = 4

        # neural network
        M = Build_Model(4, 4, 4)
        self.model = M.build()
        self.target_model = copy.copy(self.model)
        self.optimizer = tf.optimizers.Adam(lr=self.lr)
        self.epochs = 1

        # memory
        self.capacity = 200
        self.memory = Memory(self.capacity)
        self.store_times = 0
Example no. 5
    def __init__(self, actions, gamma=0.1, e_greedy=0.9):
        self.actions = actions  # a list
        self.gamma = gamma
        self.epsilon = e_greedy
        self.lr = 0.1
        self.count = 0
        self.epochs = 5
        # initialize the models: the hard-working (online) one and the frozen 'dump' (target) one
        self.m = Build_Model(1, 10, len(actions))
        self.model = self.m.model
        self.dump_model = copy.copy(self.model)
        # initialize the replay memory backed by a sum tree
        self.capacity = 200
        self.memory = Memory(self.capacity)
Example no. 6
class Agent:
    def __init__(self, actions, gamma=0.1, e_greedy=0.9):
        self.state_size = 4
        neurons = 24

        self.actions = actions
        self.gamma = gamma
        self.epsilon = e_greedy
        self.lr = 0.1
        self.count = 0
        self.epochs = 50

        self.v_max = 10
        self.v_min = -10
        self.atoms = 51
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
        self.z = [self.v_min + i * self.delta_z for i in range(self.atoms)]

        self.m = Build_Model(self.state_size,
                             neurons,
                             len(actions),
                             atoms=self.atoms)
        self.m.build()
        self.model = self.m.model
        self.dump_model = copy.copy(self.model)
        self.optimizer = tf.optimizers.Adam(lr=self.lr)
        self.batch_size = 100

        self.capacity = 300
        self.memory = Memory(self.capacity)

        self.record_size = self.capacity

    @timecost
    def choose_action(self, s):
        if np.random.uniform() < self.epsilon:
            # choose the best action
            state_action = []
            for i in self.model.predict([[s]]):
                state_action.append(
                    np.sum([self.z[j] * i[0][j] for j in range(self.atoms)]))
            action = np.random.choice([
                i for i in range(len(state_action))
                if state_action[i] == max(state_action)
            ])
        else:
            # choose action randomly
            action = np.random.choice(self.actions)
        return action

    #@timecost
    def learn(self, s, a, r, s_, done):
        loss, q_distribution = self.get_q_value(s, a, r, s_, done)
        self.memory.add(loss, [s, a, r, s_, q_distribution])
        self.count += 1

        # train once every record_size stored transitions
        if self.count % self.record_size == 0:
            batch, idxs, is_weights = self.memory.sample(self.batch_size)
            X_train = np.zeros((self.batch_size, self.state_size))
            Y_train = [
                np.zeros((self.batch_size, self.atoms))
                for i in range(len(self.actions))
            ]

            for i in range(self.batch_size):
                X_train[i] = batch[i][0]
                for i_ in range(len(self.actions)):
                    Y_train[i_][i][:] = batch[i][4][i_][:]

            print('-----training-----')
            for i in range(self.epochs):
                self.train(X_train, Y_train)

            # update prioritized experience
            for i in range(self.batch_size):
                _s, _a, _r, _s_, is_weight = batch[i][0], batch[i][1], batch[
                    i][2], batch[i][3], is_weights[i]
                loss = self.get_q_value(_s, _a, _r, _s_, done)[0]
                self.memory.update(idxs[i], is_weight * loss)

    #@timecost
    def get_q_value(self, s, a, r, s_, done):
        p = self.model.predict([[s]])
        old_q = np.sum(np.multiply(np.vstack(p), np.array(self.z)), axis=1)
        # Double DQN here as well: the online network selects the next action, the 'dump' network supplies its distribution
        p_next = self.model.predict([[s_]])
        q = np.sum(np.multiply(np.vstack(p_next), np.array(self.z)), axis=1)

        p_d_next = self.dump_model.predict([[s_]])
        next_action_idxs = np.argmax(q)
        # initialize the projected distribution m
        m_prob = [np.zeros((1, self.atoms))]
        # update m after the action: project the Bellman target onto the support
        if done:  # Distribution collapses to a single point
            Tz = min(self.v_max, max(self.v_min, r))
            bj = (Tz - self.v_min) / self.delta_z
            m_l, m_u = math.floor(bj), math.ceil(bj)
            m_prob[0][0][int(m_l)] += (m_u - bj)
            m_prob[0][0][int(m_u)] += (bj - m_l)
        else:
            for j in range(self.atoms):
                Tz = min(self.v_max, max(self.v_min,
                                         r + self.gamma * self.z[j]))
                bj = (Tz - self.v_min) / self.delta_z
                m_l, m_u = math.floor(bj), math.ceil(bj)
                m_prob[0][0][int(
                    m_l)] += p_d_next[next_action_idxs][0][j] * (m_u - bj)
                m_prob[0][0][int(
                    m_u)] += p_d_next[next_action_idxs][0][j] * (bj - m_l)
        # write the updated distribution back into p as the training target
        p[a][0][:] = m_prob[0][0][:]
        # compute the new Q estimate
        new_q = np.sum(np.multiply(np.vstack(p), np.array(self.z)), axis=1)
        # compute the TD error used as the PER priority
        error = abs(old_q[a] - new_q[a])
        return error, p

    def _loss(self, model, x, y):
        y_ = self.model(x)
        #loss = sum(sum(tf.nn.softmax_cross_entropy_with_logits(y, y_)))
        loss = tf.nn.softmax_cross_entropy_with_logits(y, y_)

        return loss

    def _grad(self, model, inputs, targets):
        with tf.GradientTape() as tape:
            loss_value = self._loss(self.model, inputs, targets)
        return loss_value, tape.gradient(loss_value,
                                         self.model.trainable_variables)

    def train(self, s, q):
        loss_value, grads = self._grad(self.model, s, q)
        # tf.optimizers apply_gradients expects only the (gradient, variable) pairs
        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables))
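Example no. 6 is a distributional (C51-style) double-DQN agent: get_q_value projects the Bellman target r + gamma * z onto the fixed support z and uses the change in the expected Q value as the PER priority. A standalone sketch of just that projection step, written for illustration (it also handles the edge case where the target lands exactly on an atom, which the loop above leaves with zero mass):

import numpy as np

def project_distribution(r, next_dist, gamma, z, v_min, v_max, delta_z, done):
    """Project the Bellman target r + gamma * z onto the fixed support z."""
    atoms = len(z)
    m = np.zeros(atoms)
    if done:
        # terminal transition: the target distribution collapses onto the reward
        Tz = min(v_max, max(v_min, r))
        bj = (Tz - v_min) / delta_z
        l, u = int(np.floor(bj)), int(np.ceil(bj))
        if l == u:
            m[l] += 1.0
        else:
            m[l] += (u - bj)
            m[u] += (bj - l)
    else:
        for j in range(atoms):
            # shrink and shift atom j, clip it into [v_min, v_max] ...
            Tz = min(v_max, max(v_min, r + gamma * z[j]))
            bj = (Tz - v_min) / delta_z
            l, u = int(np.floor(bj)), int(np.ceil(bj))
            if l == u:
                m[l] += next_dist[j]
            else:
                # ... and split its probability mass between the two nearest bins
                m[l] += next_dist[j] * (u - bj)
                m[u] += next_dist[j] * (bj - l)
    return m

# with the 51-atom support from the example, the projection keeps the mass normalised
z = [-10 + 0.4 * i for i in range(51)]
m = project_distribution(1.0, np.full(51, 1 / 51), 0.1, z, -10, 10, 0.4, done=False)
print(round(m.sum(), 6), round(float(np.dot(m, z)), 3))   # 1.0 and the expected Q value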
Example no. 7
    def _loss(self, model, x, y):
        x = np.array(x)
        y_ = model(x)
        loss = huber_loss(y, y_)
        return loss

    def _grad(self, model, inputs, targets):
        with tf.GradientTape() as tape:
            loss_value = self._loss(model, inputs, targets)
        # compute the gradients after leaving the tape context
        return tape.gradient(loss_value, self.model.trainable_variables)

    def train(self, model, s, q):
        grads = self._grad(model, s, q)
        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables))


if __name__ == "__main__":
    M = Build_Model(2, 10, 2)
    model = M.build()
    x = [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1], [1, 3], [2, 0],
         [2, 3], [3, 0], [3, 1], [3, 2], [3, 3]]
    y = [[1, 1], [0, 1], [0, 1], [1, 0], [1, 0], [-1, -1], [1, 0], [1, 0],
         [0, -1], [0, 1], [0, 1], [-1, 0], [-1, -1]]
    for i in range(200):
        M.train(model, x, y)
    print(model.predict([x]))
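Example no. 7 uses a huber_loss function that is not defined in the snippet. In TF2, tf.keras.losses.Huber provides the same loss, so a minimal self-contained version of this training step (a sketch under that assumption, with the inputs converted to tensors) might look like:

import tensorflow as tf

huber = tf.keras.losses.Huber()                      # stand-in for the snippet's huber_loss
optimizer = tf.optimizers.Adam(learning_rate=0.001)

def train_step(model, x, y):
    # one gradient step: forward pass, Huber loss, backprop, parameter update
    x = tf.convert_to_tensor(x, dtype=tf.float32)
    y = tf.convert_to_tensor(y, dtype=tf.float32)
    with tf.GradientTape() as tape:
        y_pred = model(x, training=True)
        loss = huber(y, y_pred)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss

This mirrors the _loss / _grad / train split above in a single function; the actual M.train used in the __main__ block may of course differ.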