Example #1
class Worker(object):
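    """A3C-style worker for layer-wise architecture search: it rolls out
    episodes that grow a one-hot encoded network description, scores each
    candidate with NetEvaluator and accumulates gradients for the shared
    ACNet weights."""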
    def __init__(self, sess, state_len, actions_no, max_depth, weights,
                 workers_no):
        from ac_net import ACNet
        from net_evaluator import NetEvaluator
        global BATCH_SIZE
        self.processes = workers_no + 1

        self.state_len = state_len
        self.actions_no = actions_no
        self.eps = EPS
        self.evaluator = NetEvaluator(trainable=True)
        self.state = np.zeros(state_len)
        self.state[0] = 1
        self.t = 1
        self.model = None

        self.max_depth = max_depth
        self.prev_acc = 0

        self.current_max_depth = max_depth
        self.old_weights = weights
        self.grads = []
        self.samples = []
        self.memory = dict()
        self.ac_net = ACNet(sess, self.state_len, self.actions_no, 'worker')
        self.ac_net.set_weights(self.old_weights)

    def update_ac_weights(self, weights):
        self.old_weights = weights
        if self.ac_net is not None:
            self.ac_net.set_weights(weights)

    def get_grads(self):
        return self.grads

    def calculate_gradients(self):
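        # Approximate the update as the element-wise difference between the
        # locally trained weights and the last weights synced from the master.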
        grads = []
        weights = self.ac_net.get_weights()
        for i in range(len(weights)):
            grads.append(weights[i] - self.old_weights[i])
        return grads

    def fetch_from_memory(self, state):
        if state in self.memory:
            return self.memory[state]
        else:
            return None

    def add_to_memory(self, state, acc):
        self.memory[state] = acc

    def play(self):
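        # Evaluation rollout: reset the state, follow the greedy (argmax)
        # policy for current_max_depth steps and return the final accuracy.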
        self.state = np.zeros(len(self.state))
        self.state[0] = 1
        self.prev_acc = self.evaluator.baseline
        del self.model
        self.model = None
        t_start = self.t

        episode_flag = True
        while episode_flag:

            policy, value = self.ac_net.predict(
                self.state.reshape(1, self.state_len))
            policy = policy[0]

            action = np.argmax(policy)
            reward, new_state = self.perform_action(action)
            self.state = new_state
            self.t += 1
            if self.t - t_start >= self.current_max_depth:
                episode_flag = False

        return self.prev_acc, self.state

    def run(self):
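        # Collect BATCH_SIZE steps of eps-greedy experience across episodes,
        # fit the local ACNet on the discounted returns and accumulate the
        # returned gradients for the master to apply.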

        self.grads = []
        self.t = 1
        self.episodes = 0
        self.samples = []
        self.eps = self.eps * EPS_RED_FACTOR

        while self.t <= BATCH_SIZE:

            self.state = np.zeros(len(self.state))
            self.state[0] = 1
            self.prev_acc = self.evaluator.baseline
            del self.model

            self.model = None

            t_start = self.t
            s_buffer = []
            r_buffer = []
            a_buffer = []

            episode_flag = True
            while episode_flag:

                policy, value = self.ac_net.predict(
                    self.state.reshape(1, self.state_len))

                policy = policy[0]
                value = value[0]
                action = np.random.choice(self.actions_no, p=policy)
                if np.random.uniform() < self.eps:
                    action = np.random.choice(self.actions_no)

                reward, new_state = self.perform_action(action)

                s_buffer.append(self.state)
                r_buffer.append(reward)
                a_buffer.append(action)

                self.state = new_state
                self.t += 1
                self.print_episode(policy, action, value, reward)
                if self.t - t_start >= self.current_max_depth:
                    episode_flag = False

            self.episodes += 1

            R = 0.0
            rev_rewards = []
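            # Compute discounted returns by walking the episode rewards backwards.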
            counter = 0
            for r in reversed(r_buffer):
                if counter == self.current_max_depth:
                    counter = 0
                    R = 0
                R = R * gamma + r
                rev_rewards.append(R)
                counter += 1

            for reward, state, action in zip(rev_rewards, reversed(s_buffer),
                                             reversed(a_buffer)):
                self.samples.append((state, action, reward))

            np.random.shuffle(self.samples)

            # Unzip the samples into batched state / action / reward arrays
            state, action, reward = list(map(np.array, zip(*self.samples)))
            v_l, p_l, e, g_n, v_n, grads = self.ac_net.fit(
                state, action, reward)
            self.samples = []

            for i in range(len(grads)):
                if len(self.grads) == i:
                    self.grads.append(grads[i])
                else:
                    self.grads[i] = self.grads[i] + grads[i]

        if self.current_max_depth < self.max_depth:
            self.current_max_depth += 1

        return self.prev_acc, self.state
        # return self.play()

    def print_episode(self, policy, action, value, reward):
        if DEBUG:
            print('Policy :\n', np.array2string(policy, precision=3))
            print('Action :\n', action)
            print('Value  :\n', np.array2string(value, precision=3))
            print('State :', self.state)
            print('Reward : %.3f' % reward, 'Accuracy : %.3f' % self.prev_acc)

    def perform_action(self, action):
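        # Apply the action, evaluate the resulting architecture (memoised by
        # its state string) and reward the change in accuracy.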
        # Get new state
        new_state = self.update_state(action)
        # Expand model and evaluate
        acc = self.fetch_from_memory(str(new_state))
        if acc is None:
            acc = self.evaluator.evaluate_model(new_state,
                                                epochs=TRAINING_EPOCHS)
            self.add_to_memory(str(new_state), acc)
        # Get the reward
        reward = acc - self.prev_acc

        self.prev_acc = acc
        return reward, new_state

    def update_state(self, action, old_state=None):
        '''
            Update the state, based on the action taken
        '''

        if old_state is None:
            old_state = np.copy(self.state)

        new_state = np.copy(old_state)

        # If we added a layer
        if action != 0:
            onehot_action = np.zeros(self.actions_no - 1)
            onehot_action[action - 1] = 1
            index = 1
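            # Find the first layer block whose one-hot slot is still empty.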
            for depth in range(self.max_depth):
                start = depth * (self.actions_no - 1) + 1
                actives = 0
                for i in range(self.actions_no - 1):
                    actives += old_state[start + i]
                if actives == 0:
                    index = start
                    break
            for i in range(self.actions_no - 1):
                new_state[index + i] = onehot_action[i]
        return new_state
Example #2
class Worker(object):
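    """Continuous-action variant of the worker: the ACNet predicts a mean and
    sigma per layer, sampled actions fill the state layer by layer and the
    resulting architecture is scored by NetEvaluator on the given dataset."""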
    def __init__(self, sess, state_len, actions_no, actions_bounds, max_depth,
                 weights, workers_no, dataset, trainable):
        from ac_net import ACNet
        from net_evaluator import NetEvaluator

        self.processes = workers_no + 1
        self.actions_no = actions_no
        self.eps = EPS
        self.evaluator = NetEvaluator(ACTIONS_NO,
                                      trainable=trainable,
                                      dataset=dataset)
        self.state = STARTING_STATE.copy()
        self.state_len = state_len
        self.max_depth = max_depth
        self.t = 1
        self.prev_acc = self.evaluator.baseline
        self.model = None
        self.current_max_depth = self.max_depth
        self.old_weights = weights
        self.grads = []
        self.samples = []
        self.best_samples = []
        self.best_reward = -1000
        self.memory = dict()
        self.ac_net = ACNet(sess, self.state_len, self.actions_no,
                            actions_bounds, 'worker')
        self.ac_net.set_weights(self.old_weights)

    def update_ac_weights(self, weights):
        self.old_weights = weights
        if self.ac_net is not None:
            self.ac_net.set_weights(weights)

    def get_grads(self):
        return self.grads

    def fetch_from_memory(self, state):
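        # Round the continuous state entries so near-identical architectures
        # share the same memoisation key.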
        state_repr = state.copy()
        for i in range(len(state)):
            if i % ACTIONS_NO == ACTIONS_NO - 1:
                state_repr[i] = np.round(state_repr[i], 1)
            else:
                state_repr[i] = np.round(state_repr[i], 0)
        state_repr = str(state_repr)
        if state_repr in self.memory:
            return self.memory[state_repr]
        else:
            return None

    def add_to_memory(self, state, acc):
        state_repr = state.copy()
        for i in range(len(state)):
            if i % ACTIONS_NO == ACTIONS_NO - 1:
                state_repr[i] = np.round(state_repr[i], 1)
            else:
                state_repr[i] = np.round(state_repr[i], 0)
        state_repr = str(state_repr)
        self.memory[state_repr] = acc

    def play(self):
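        # Evaluation rollout with the builder temporarily set to trainable;
        # uses the actions returned by the local ACNet directly, without the
        # eps exploration applied in run().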
        prev_trainable = self.evaluator.builder.trainable
        self.evaluator.builder.trainable = True
        self.state = STARTING_STATE.copy()
        self.prev_acc = 0
        t_start = self.t
        episode_flag = True
        self.current_layer = 0
        while episode_flag:

            action, policy_mean, policy_sigma, value = self.ac_net.predict(
                self.state.reshape(1, self.state_len // self.actions_no,
                                   self.actions_no))

            value = value[self.current_layer]
            reward, new_state = self.perform_action(action)

            self.state = new_state
            self.t += 1
            self.current_layer += 1
            if self.t - t_start >= self.current_max_depth:
                episode_flag = False

        self.evaluator.builder.trainable = prev_trainable
        return self.prev_acc, self.state

    def run(self):
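        # Gather one batch of episodes with eps exploration over the sampled
        # actions, fit the local ACNet on the discounted returns and expose
        # its gradients via get_grads().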

        self.grads = []
        self.samples = []
        t_start = self.t
        # Gather experiences
        self.eps = self.eps * EPS_RED_FACTOR

        while self.t - t_start < self.max_depth:
            self.current_layer = 0
            R = 0.0
            self.state = STARTING_STATE.copy()
            self.prev_acc = self.evaluator.baseline
            del self.model

            self.model = None
            self.d_theta = 0
            self.d_theta_v = 0
            self.alive = True

            s_buffer = []
            r_buffer = []
            a_buffer = []
            v_buffer = []

            episode_flag = True
            while episode_flag:

                action, policy_mean, policy_sigma, value = self.ac_net.predict(
                    self.state.reshape(1, self.state_len // self.actions_no,
                                       self.actions_no))

                action = action[self.current_layer]

                if np.random.uniform() < self.eps:
                    action = (np.random.uniform() *
                              (ACTIONS_BOUNDS[1] - ACTIONS_BOUNDS[0])) // 1

                value = value[self.current_layer]

                reward, new_state = self.perform_action(action)

                r_buffer.append(reward)
                a_buffer.append([action])
                v_buffer.append([value])
                R = reward + gamma * R
                self.state = new_state
                self.t += 1
                self.current_layer += 1

                self.print_episode(policy_mean, policy_sigma, action, value,
                                   reward)
                if self.current_layer >= self.current_max_depth:
                    episode_flag = False

            # Kill grads
            r_buffer.append(0)
            a_buffer.append([policy_mean[-1]])
            v_buffer.append([0])
            # Add state
            s_buffer.append(
                self.state.reshape(1, self.state_len // self.actions_no,
                                   self.actions_no))

            R = 0.0
            rev_rewards = []
            for r in reversed(r_buffer):
                R = R * gamma + r
                rev_rewards.append(R)

            rev_rewards.reverse()
            reward = np.array(rev_rewards).reshape((-1, 1))
            action = np.array(a_buffer).reshape((-1, self.actions_no))

            self.samples.append((self.state, action, reward))

        np.random.shuffle(self.samples)

        # Unzip the samples into batched state / action / reward arrays
        state, action, reward = list(map(np.array, zip(*self.samples)))

        v_l, p_l, e, grads = self.ac_net.fit(state, action, reward)
        self.samples = []
        self.grads = grads

        if self.current_max_depth < self.max_depth and self.t > 100:
            self.current_max_depth += 1

        self.grads = self.ac_net.get_grads()
        if return_episode:
            return self.prev_acc, self.state
        else:
            return self.play()

    def perform_action(self, action, search_mem=True):
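        # Apply the action, evaluate the resulting architecture (optionally
        # bypassing the memo cache) and reward the change in accuracy.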
        def get_acc(new_state):

            return self.evaluator.evaluate_model(new_state,
                                                 epochs=TRAIN_EPOCHS)

        # Get new state
        new_state = self.update_state(action)

        # Build the model and evaluate
        acc = self.fetch_from_memory(new_state)
        if not search_mem:
            acc = get_acc(new_state)
        else:
            if acc is None:
                acc = get_acc(new_state)
                self.add_to_memory(new_state, acc)
        # Get the reward
        reward = (acc - self.prev_acc)
        self.prev_acc = acc
        return reward, new_state

    def update_state(self, action, old_state=None):
        '''
            Update the state, based on the action taken
        '''

        if old_state is None:
            old_state = np.copy(self.state)

        new_state = np.copy(old_state)
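        # Each layer occupies a contiguous ACTIONS_NO-wide slice of the state;
        # write the action vector into the slice that follows the current layer.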
        index = (self.current_layer + 1) * ACTIONS_NO

        for i in range(self.actions_no):
            new_state[index + i] = action[i]

        return new_state

    def print_episode(self, policy_mean, policy_sigma, action, value, reward):
        if DEBUG:
            print('Policy_mean :\n', np.array2string(policy_mean, precision=3))
            print('Policy_sigma :\n', np.array2string(policy_sigma,
                                                      precision=3))
            print('Action :\n', action)
            print('Value  :\n', np.array2string(value, precision=3))
            print('Layer :', self.current_layer)
            print('State :', self.state)
            print('Reward : %.3f' % reward, 'Accuracy : %.3f' % self.prev_acc)