# train data
X = np.array([i[0] for i in train]).reshape(-1, WIDTH, HEIGHT, 1)
Y = [i[1] for i in train]

# val data
val_x = np.array([i[0] for i in val]).reshape(-1, WIDTH, HEIGHT, 1)
val_y = [i[1] for i in val]

# test data
test_x = np.array([i[0] for i in test]).reshape(-1, WIDTH, HEIGHT, 1)
test_y = [i[1] for i in test]

# train model
history = model.fit(x=X,
                    y=np.array(Y),
                    batch_size=512,
                    validation_data=(val_x, np.array(val_y)),
                    epochs=EPOCHS)

# plot accuracy over epochs
plt.plot(history.history['categorical_accuracy'])
plt.plot(history.history['val_categorical_accuracy'])
plt.title('Model Accuracy (Categorical Accuracy) - {}'.format(MODEL_NAME))
plt.ylabel('Categorical Accuracy')
plt.xlabel('Epoch')
plt.legend(['training', 'validation'], loc='upper left')
plt.show()

# plot loss over epochs
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss - {}'.format(MODEL_NAME))
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['training', 'validation'], loc='upper left')
plt.show()
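# The test split prepared above is not used in this snippet. Below is a
# minimal, illustrative sketch of how it could be scored with the standard
# Keras model.evaluate() API. It assumes the model was compiled with a
# single 'categorical_accuracy' metric (as implied by the history keys
# plotted above); the batch size is simply reused from model.fit().
test_loss, test_acc = model.evaluate(np.array(test_x),
                                     np.array(test_y),
                                     batch_size=512,
                                     verbose=0)
print('Test loss: %.4f - test categorical accuracy: %.4f'
      % (test_loss, test_acc))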
class Worker(object):

    def __init__(self, sess, state_len, actions_no, max_depth, weights,
                 workers_no):
        from ac_net import ACNet
        from net_evaluator import NetEvaluator

        self.processes = workers_no + 1
        self.state_len = state_len
        self.actions_no = actions_no
        self.eps = EPS
        self.evaluator = NetEvaluator(trainable=True)
        self.state = np.zeros(state_len)
        self.state[0] = 1
        self.max_depth = max_depth
        self.prev_acc = 0
        self.current_max_depth = max_depth
        self.old_weights = weights
        self.grads = []
        self.samples = []
        self.memory = dict()
        self.t = 1
        self.model = None  # candidate model; rebuilt every episode
        self.ac_net = ACNet(sess, self.state_len, self.actions_no, 'worker')
        self.ac_net.set_weights(self.old_weights)

    def update_ac_weights(self, weights):
        self.old_weights = weights
        if self.ac_net is not None:
            self.ac_net.set_weights(weights)

    def get_grads(self):
        return self.grads

    def calculate_gradients(self):
        grads = []
        weights = self.ac_net.get_weights()
        for i in range(len(weights)):
            grads.append(weights[i] - self.old_weights[i])
        return grads

    def fetch_from_memory(self, state):
        if state in self.memory:
            return self.memory[state]
        return None

    def add_to_memory(self, state, acc):
        self.memory[state] = acc

    def play(self):
        self.state = np.zeros(len(self.state))
        self.state[0] = 1
        self.prev_acc = self.evaluator.baseline
        del self.model
        self.model = None
        t_start = self.t
        episode_flag = True
        while episode_flag:
            policy, value = self.ac_net.predict(
                self.state.reshape(1, self.state_len))
            policy = policy[0]
            action = np.argmax(policy)
            reward, new_state = self.perform_action(action)
            self.state = new_state
            self.t += 1
            if self.t - t_start >= self.current_max_depth:
                episode_flag = False
        return self.prev_acc, self.state

    def run(self):
        self.grads = []
        self.t = 1
        self.episodes = 0
        self.samples = []
        self.eps = self.eps * EPS_RED_FACTOR
        while self.t <= BATCH_SIZE:
            self.state = np.zeros(len(self.state))
            self.state[0] = 1
            self.prev_acc = self.evaluator.baseline
            del self.model
            self.model = None
            t_start = self.t
            s_buffer = []
            r_buffer = []
            a_buffer = []
            episode_flag = True
            while episode_flag:
                policy, value = self.ac_net.predict(
                    self.state.reshape(1, self.state_len))
                policy = policy[0]
                value = value[0]
                action = np.random.choice(self.actions_no, p=policy)
                # epsilon exploration: occasionally take a random action
                if np.random.uniform() < self.eps:
                    action = np.random.choice(self.actions_no)
                reward, new_state = self.perform_action(action)
                s_buffer.append(self.state)
                r_buffer.append(reward)
                a_buffer.append(action)
                self.state = new_state
                self.t += 1
                self.print_episode(policy, action, value, reward)
                if self.t - t_start >= self.current_max_depth:
                    episode_flag = False
                    self.episodes += 1

            # Discounted returns, computed backwards over the buffer
            R = 0.0
            rev_rewards = []
            counter = 0
            for r in reversed(r_buffer):
                if counter == self.current_max_depth:
                    # episode boundary: restart the discounted return
                    counter = 0
                    R = 0
                R = R * gamma + r
                rev_rewards.append(R)
                counter += 1

            for reward, state, action in zip(rev_rewards, reversed(s_buffer),
                                             reversed(a_buffer)):
                self.samples.append((state, action, reward))

            np.random.shuffle(self.samples)
            # Transform to column vectors
            state, action, reward = list(map(np.array, zip(*self.samples)))
            v_l, p_l, e, g_n, v_n, grads = self.ac_net.fit(state, action,
                                                           reward)
            self.samples = []
            # Accumulate gradients across episodes
            for i in range(len(grads)):
                if len(self.grads) == i:
                    self.grads.append(grads[i])
                else:
                    self.grads[i] = self.grads[i] + grads[i]

        if self.current_max_depth < self.max_depth:
            self.current_max_depth += 1
        return self.prev_acc, self.state
        # return self.play()

    def print_episode(self, policy, action, value, reward):
        if DEBUG:
            print('Policy :\n', np.array2string(policy, precision=3))
            print('Action :\n', action)
            print('Value :\n', np.array2string(value, precision=3))
            print('State :', self.state)
            print('Reward : %.3f' % reward, 'Accuracy : %.3f' % self.prev_acc)

    def perform_action(self, action):
        # Get new state
        new_state = self.update_state(action)
        # Expand model and evaluate
        acc = self.fetch_from_memory(str(new_state))
        if acc is None:
            acc = self.evaluator.evaluate_model(new_state,
                                                epochs=TRAINING_EPOCHS)
            self.add_to_memory(str(new_state), acc)
        # Get the reward
        reward = acc - self.prev_acc
        self.prev_acc = acc
        return reward, new_state

    def update_state(self, action, old_state=None):
        '''
        Update the state, based on the action taken
        '''
        if old_state is None:
            old_state = np.copy(self.state)
        new_state = np.copy(old_state)
        # If we added a layer
        if action != 0:
            onehot_action = np.zeros(self.actions_no - 1)
            onehot_action[action - 1] = 1
            # Find the first empty layer slot in the flat state vector
            index = 1
            for depth in range(self.max_depth):
                start = depth * (self.actions_no - 1) + 1
                actives = 0
                for i in range(self.actions_no - 1):
                    actives += old_state[start + i]
                if actives == 0:
                    index = start
                    break
            # Write the one-hot encoded layer into that slot
            for i in range(self.actions_no - 1):
                new_state[index + i] = onehot_action[i]
        return new_state
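# A hypothetical driver loop sketching how this Worker could be used. It is
# an illustration only: the global ACNet scope name ('global'), the constant
# values, and the rule for folding the worker's accumulated gradients back
# into the shared weights are assumptions, not part of the original project.
# Only the Worker/ACNet methods called here (run, get_grads,
# update_ac_weights, get_weights, set_weights) appear in the code above.
import tensorflow as tf

from ac_net import ACNet

STATE_LEN = 31      # placeholder flat-state length
ACTIONS_NO = 7      # placeholder number of discrete actions
MAX_DEPTH = 5       # placeholder maximum architecture depth
ITERATIONS = 50     # placeholder number of training iterations

with tf.Session() as sess:
    global_net = ACNet(sess, STATE_LEN, ACTIONS_NO, 'global')
    worker = Worker(sess, STATE_LEN, ACTIONS_NO, MAX_DEPTH,
                    global_net.get_weights(), workers_no=1)

    for iteration in range(ITERATIONS):
        acc, state = worker.run()
        # Fold the worker's accumulated per-layer deltas into the shared
        # weights (the exact combination rule is an assumption here).
        new_weights = [w + g for w, g in
                       zip(global_net.get_weights(), worker.get_grads())]
        global_net.set_weights(new_weights)
        worker.update_ac_weights(new_weights)
        print('Iteration %d - accuracy %.3f' % (iteration, acc))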
class Worker(object):

    def __init__(self, sess, state_len, actions_no, actions_bounds, max_depth,
                 weights, workers_no, dataset, trainable):
        from ac_net import ACNet
        from net_evaluator import NetEvaluator

        self.processes = workers_no + 1
        self.actions_no = actions_no
        self.eps = EPS
        self.evaluator = NetEvaluator(ACTIONS_NO,
                                      trainable=trainable,
                                      dataset=dataset)
        self.state = STARTING_STATE.copy()
        self.state_len = state_len
        self.max_depth = max_depth
        self.t = 1
        self.prev_acc = self.evaluator.baseline
        self.model = None
        self.current_max_depth = self.max_depth
        self.old_weights = weights
        self.grads = []
        self.samples = []
        self.best_samples = []
        self.best_reward = -1000
        self.memory = dict()
        self.ac_net = ACNet(sess, self.state_len, self.actions_no,
                            actions_bounds, 'worker')
        self.ac_net.set_weights(self.old_weights)

    def update_ac_weights(self, weights):
        self.old_weights = weights
        if self.ac_net is not None:
            self.ac_net.set_weights(weights)

    def get_grads(self):
        return self.grads

    def fetch_from_memory(self, state):
        # Round the state to a coarse, hashable string representation
        state_repr = state.copy()
        for i in range(len(state)):
            if i % ACTIONS_NO == ACTIONS_NO - 1:
                state_repr[i] = np.round(state_repr[i], 1)
            else:
                state_repr[i] = np.round(state_repr[i], 0)
        state_repr = str(state_repr)
        if state_repr in self.memory:
            return self.memory[state_repr]
        return None

    def add_to_memory(self, state, acc):
        state_repr = state.copy()
        for i in range(len(state)):
            if i % ACTIONS_NO == ACTIONS_NO - 1:
                state_repr[i] = np.round(state_repr[i], 1)
            else:
                state_repr[i] = np.round(state_repr[i], 0)
        state_repr = str(state_repr)
        self.memory[state_repr] = acc

    def play(self):
        prev_trainable = self.evaluator.builder.trainable
        self.evaluator.builder.trainable = True
        self.state = STARTING_STATE.copy()
        self.prev_acc = 0
        t_start = self.t
        episode_flag = True
        self.current_layer = 0
        while episode_flag:
            action, policy_mean, policy_sigma, value = self.ac_net.predict(
                self.state.reshape(1, self.state_len // self.actions_no,
                                   self.actions_no))
            value = value[self.current_layer]
            reward, new_state = self.perform_action(action)
            self.state = new_state
            self.t += 1
            self.current_layer += 1
            if self.t - t_start >= self.current_max_depth:
                episode_flag = False
        self.evaluator.builder.trainable = prev_trainable
        return self.prev_acc, self.state

    def run(self, return_episode=True):
        self.grads = []
        self.samples = []
        t_start = self.t
        # Gather experiences
        self.eps = self.eps * EPS_RED_FACTOR
        while self.t - t_start < self.max_depth:
            self.current_layer = 0
            R = 0.0
            self.state = STARTING_STATE.copy()
            self.prev_acc = self.evaluator.baseline
            del self.model
            self.model = None
            self.d_theta = 0
            self.d_theta_v = 0
            self.alive = True
            s_buffer = []
            r_buffer = []
            a_buffer = []
            v_buffer = []
            episode_flag = True
            while episode_flag:
                action, policy_mean, policy_sigma, value = self.ac_net.predict(
                    self.state.reshape(1, self.state_len // self.actions_no,
                                       self.actions_no))
                action = action[self.current_layer]
                # epsilon exploration: draw a random value per action
                # dimension so the action keeps its expected shape
                if np.random.uniform() < self.eps:
                    action = (np.random.uniform(size=self.actions_no) *
                              (ACTIONS_BOUNDS[1] - ACTIONS_BOUNDS[0])) // 1
                value = value[self.current_layer]
                reward, new_state = self.perform_action(action)
                r_buffer.append(reward)
                a_buffer.append([action])
                v_buffer.append([value])
                R = reward + gamma * R
                self.state = new_state
                self.t += 1
                self.current_layer += 1
                self.print_episode(policy_mean, policy_sigma, action, value,
                                   reward)
                if self.current_layer >= self.current_max_depth:
                    episode_flag = False
                    # Kill grads
                    r_buffer.append(0)
                    a_buffer.append([policy_mean[-1]])
                    v_buffer.append([0])
                    # Add state
                    s_buffer.append(
                        self.state.reshape(1,
                                           self.state_len // self.actions_no,
                                           self.actions_no))

            # Discounted returns, computed backwards over the episode
            R = 0.0
            rev_rewards = []
            for r in reversed(r_buffer):
                R = R * gamma + r
                rev_rewards.append(R)
            rev_rewards.reverse()
            reward = np.array(rev_rewards).reshape((-1, 1))
            action = np.array(a_buffer).reshape((-1, self.actions_no))
            self.samples.append((self.state, action, reward))

        np.random.shuffle(self.samples)
        # Transform to column vectors
        state, action, reward = list(map(np.array, zip(*self.samples)))
        v_l, p_l, e, grads = self.ac_net.fit(state, action, reward)
        self.samples = []
        self.grads = grads
        if self.current_max_depth < self.max_depth and self.t > 100:
            self.current_max_depth += 1
        self.grads = self.ac_net.get_grads()
        if return_episode:
            return self.prev_acc, self.state
        return self.play()

    def perform_action(self, action, search_mem=True):

        def get_acc(new_state):
            return self.evaluator.evaluate_model(new_state,
                                                 epochs=TRAIN_EPOCHS)

        # Get new state
        new_state = self.update_state(action)
        # Build the model and evaluate
        acc = self.fetch_from_memory(new_state)
        if not search_mem:
            acc = get_acc(new_state)
        else:
            if acc is None:
                acc = get_acc(new_state)
                self.add_to_memory(new_state, acc)
        # Get the reward
        reward = acc - self.prev_acc
        self.prev_acc = acc
        return reward, new_state

    def update_state(self, action, old_state=None):
        '''
        Update the state, based on the action taken
        '''
        if old_state is None:
            old_state = np.copy(self.state)
        new_state = np.copy(old_state)
        # Write the action vector into the slot for the layer being added
        index = (self.current_layer + 1) * ACTIONS_NO
        for i in range(self.actions_no):
            new_state[index + i] = action[i]
        return new_state

    def print_episode(self, policy_mean, policy_sigma, action, value, reward):
        if DEBUG:
            print('Policy_mean :\n',
                  np.array2string(policy_mean, precision=3))
            print('Policy_sigma :\n',
                  np.array2string(policy_sigma, precision=3))
            print('Action :\n', action)
            print('Value :\n', np.array2string(value, precision=3))
            print('Layer :', self.current_layer)
            print('State :', self.state)
            print('Reward : %.3f' % reward, 'Accuracy : %.3f' % self.prev_acc)
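# A small standalone illustration of the flat state layout that
# update_state() above assumes: the state is read as consecutive slots of
# ACTIONS_NO values, slot 0 holding the starting state and slot k + 1
# holding the continuous action chosen for layer k. All sizes and action
# values below are placeholders for illustration, not values from the
# original project.
import numpy as np

ACTIONS_NO = 3                               # assumed action dimensions
MAX_DEPTH = 2                                # assumed maximum depth

state = np.zeros((MAX_DEPTH + 1) * ACTIONS_NO)
state[:ACTIONS_NO] = [1.0, 0.0, 0.0]         # placeholder STARTING_STATE slot

current_layer = 0
action = np.array([32.0, 3.0, 0.5])          # placeholder continuous action

# Same indexing as update_state(): the action fills the next layer's slot.
index = (current_layer + 1) * ACTIONS_NO
state[index:index + ACTIONS_NO] = action

print(state)   # slot 1 (indices 3..5) now holds the chosen action values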