def __init__(self, sess, state_len, actions_no, action_bounds, workers_no):
    from ac_net import ACNet

    self.T = 0
    self.processes = workers_no + 1
    # Build the master (global) actor-critic network.
    self.ac_net = ACNet(sess, state_len, actions_no, action_bounds, 'Master')
def __init__(self, sess, state_len, actions_no, workers_no):
    from ac_net import ACNet

    self.T = 0
    self.processes = workers_no + 1
    # Build the master (global) actor-critic network.
    self.ac_net = ACNet(sess, state_len, actions_no, 'Master')
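# Sketch (not from the source): the first Worker class below approximates its
# "gradients" as weight deltas against the last synchronised master weights
# (see calculate_gradients). A minimal illustration of how a master could
# average those deltas and fold them back into its own network, assuming ACNet
# exposes the get_weights / set_weights pair used elsewhere in these snippets;
# apply_worker_updates and lr are hypothetical names.
import numpy as np


def apply_worker_updates(master_net, worker_deltas, lr=1.0):
    # worker_deltas: one list of per-layer weight deltas per worker.
    weights = master_net.get_weights()
    for delta in worker_deltas:
        for i, d in enumerate(delta):
            weights[i] = weights[i] + lr * np.asarray(d) / len(worker_deltas)
    master_net.set_weights(weights)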
class Worker(object):

    def __init__(self, sess, state_len, actions_no, max_depth, weights,
                 workers_no):
        from ac_net import ACNet
        from net_evaluator import NetEvaluator
        global BATCH_SIZE

        self.processes = workers_no + 1
        self.state_len = state_len
        self.actions_no = actions_no
        self.eps = EPS
        self.evaluator = NetEvaluator(trainable=True)
        self.state = np.zeros(state_len)
        self.state[0] = 1
        self.max_depth = max_depth
        self.t = 1
        self.prev_acc = 0
        self.model = None
        self.current_max_depth = max_depth
        self.old_weights = weights
        self.grads = []
        self.samples = []
        self.memory = dict()
        self.ac_net = ACNet(sess, self.state_len, self.actions_no, 'worker')
        self.ac_net.set_weights(self.old_weights)

    def update_ac_weights(self, weights):
        self.old_weights = weights
        if self.ac_net is not None:
            self.ac_net.set_weights(weights)

    def get_grads(self):
        return self.grads

    def calculate_gradients(self):
        # Approximate the gradients as the difference between the current
        # weights and the last synchronised (master) weights.
        grads = []
        weights = self.ac_net.get_weights()
        for i in range(len(weights)):
            grads.append(weights[i] - self.old_weights[i])
        return grads

    def fetch_from_memory(self, state):
        if state in self.memory:
            return self.memory[state]
        return None

    def add_to_memory(self, state, acc):
        self.memory[state] = acc

    def play(self):
        # Greedy rollout: always follow the arg-max of the policy.
        self.state = np.zeros(len(self.state))
        self.state[0] = 1
        self.prev_acc = self.evaluator.baseline
        del self.model
        self.model = None
        t_start = self.t
        episode_flag = True
        while episode_flag:
            policy, value = self.ac_net.predict(
                self.state.reshape(1, self.state_len))
            policy = policy[0]
            action = np.argmax(policy)
            reward, new_state = self.perform_action(action)
            self.state = new_state
            self.t += 1
            if self.t - t_start >= self.current_max_depth:
                episode_flag = False
        return self.prev_acc, self.state

    def run(self):
        self.grads = []
        self.t = 1
        self.episodes = 0
        self.samples = []
        self.eps = self.eps * EPS_RED_FACTOR
        while self.t <= BATCH_SIZE:
            self.state = np.zeros(len(self.state))
            self.state[0] = 1
            self.prev_acc = self.evaluator.baseline
            del self.model
            self.model = None
            t_start = self.t
            s_buffer = []
            r_buffer = []
            a_buffer = []
            episode_flag = True
            while episode_flag:
                policy, value = self.ac_net.predict(
                    self.state.reshape(1, self.state_len))
                policy = policy[0]
                value = value[0]
                # Sample from the policy, with epsilon-greedy exploration.
                action = np.random.choice(self.actions_no, p=policy)
                if np.random.uniform() < self.eps:
                    action = np.random.choice(self.actions_no)
                reward, new_state = self.perform_action(action)
                s_buffer.append(self.state)
                r_buffer.append(reward)
                a_buffer.append(action)
                self.state = new_state
                self.t += 1
                self.print_episode(policy, action, value, reward)
                if self.t - t_start >= self.current_max_depth:
                    episode_flag = False
                    self.episodes += 1

            # Compute the discounted returns for this episode.
            R = 0.0
            rev_rewards = []
            counter = 0
            for r in reversed(r_buffer):
                if counter == self.current_max_depth:
                    counter = 0
                    R = 0
                R = R * gamma + r
                rev_rewards.append(R)
                counter += 1
            for reward, state, action in zip(rev_rewards,
                                             reversed(s_buffer),
                                             reversed(a_buffer)):
                self.samples.append((state, action, reward))

            np.random.shuffle(self.samples)
            # Transform to column vectors
            state, action, reward = list(map(np.array, zip(*self.samples)))
            v_l, p_l, e, g_n, v_n, grads = self.ac_net.fit(
                state, action, reward)
            self.samples = []
            # Accumulate the gradients over the gathered episodes.
            for i in range(len(grads)):
                if len(self.grads) == i:
                    self.grads.append(grads[i])
                else:
                    self.grads[i] = self.grads[i] + grads[i]

        if self.current_max_depth < self.max_depth:
            self.current_max_depth += 1
        return self.prev_acc, self.state
        # return self.play()

    def print_episode(self, policy, action, value, reward):
        if DEBUG:
            print('Policy :\n', np.array2string(policy, precision=3))
            print('Action :\n', action)
            print('Value :\n', np.array2string(value, precision=3))
            print('State :', self.state)
            print('Reward : %.3f' % reward,
                  'Accuracy : %.3f' % self.prev_acc)

    def perform_action(self, action):
        # Get the new state.
        new_state = self.update_state(action)
        # Expand the model and evaluate it, with memoisation.
        acc = self.fetch_from_memory(str(new_state))
        if acc is None:
            acc = self.evaluator.evaluate_model(new_state,
                                                epochs=TRAINING_EPOCHS)
            self.add_to_memory(str(new_state), acc)
        # The reward is the accuracy improvement.
        reward = acc - self.prev_acc
        self.prev_acc = acc
        return reward, new_state

    def update_state(self, action, old_state=None):
        '''Update the state, based on the action taken.'''
        if old_state is None:
            old_state = np.copy(self.state)
        new_state = np.copy(old_state)
        # Action 0 is a no-op; any other action writes a one-hot layer
        # encoding into the first empty slot of the state.
        if action != 0:
            onehot_action = np.zeros(self.actions_no - 1)
            onehot_action[action - 1] = 1
            index = 1
            for depth in range(self.max_depth):
                start = depth * (self.actions_no - 1) + 1
                actives = 0
                for i in range(self.actions_no - 1):
                    actives += old_state[start + i]
                if actives == 0:
                    index = start
                    break
            for i in range(self.actions_no - 1):
                new_state[index + i] = onehot_action[i]
        return new_state
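# Sketch (not from the source): a minimal driver for the discrete Worker
# above, assuming the module-level constants it relies on (EPS, EPS_RED_FACTOR,
# BATCH_SIZE, gamma, DEBUG, TRAINING_EPOCHS) are defined, and using the ACNet
# constructor signature shown for the master. STATE_LEN and ACTIONS_NO are
# placeholder values.
import tensorflow as tf
from ac_net import ACNet

STATE_LEN = 31   # max_depth * (actions_no - 1) + 1 with max_depth=5, actions_no=7
ACTIONS_NO = 7

sess = tf.Session()
master = ACNet(sess, STATE_LEN, ACTIONS_NO, 'Master')
# Variable initialisation is assumed to happen inside ACNet / NetEvaluator,
# or via sess.run(tf.global_variables_initializer()) once the graph is built.
worker = Worker(sess, STATE_LEN, ACTIONS_NO, max_depth=5,
                weights=master.get_weights(), workers_no=1)

for _ in range(10):
    acc, state = worker.run()       # gather episodes and fit the local net
    deltas = worker.get_grads()     # accumulated updates for the master
    # ... fold `deltas` into the master (e.g. apply_worker_updates above) ...
    worker.update_ac_weights(master.get_weights())   # re-sync with the master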
# data = np.load('3_training_data_2_balanced_400k.npy', allow_pickle=True)

WIDTH = 64
HEIGHT = 48
VERSION = 3
EPOCHS = 50
SAMPLES = 400
MODEL = 'acnet'
MODEL_NAME = '{}-v{}-{}-epochs-{}k-samples.model'.format(
    MODEL, VERSION, EPOCHS, SAMPLES)

# Initialize a new model with the given width and height.
model = ACNet(WIDTH, HEIGHT)

# train-test split = 75-5-20 (TRAIN, VAL, TEST) %
# 400k samples balanced data - FROM 87939, TRAIN = 65954, VAL = 4397, TEST = 17588 (65954 + 4397 = 70351)
# 400k samples balanced data - FROM 92820, TRAIN = 69615, VAL = 4641, TEST = 18564 (69615 + 4641 = 74556)
train = data[:-18564]   # removed testing data from total
val = train[-4641:]     # removed val from train data
test = data[-18564:]    # reserved test data from total

# train data
X = np.array([i[0] for i in train]).reshape(-1, WIDTH, HEIGHT, 1)
Y = [i[1] for i in train]

# val data
val_x = np.array([i[0] for i in val]).reshape(-1, WIDTH, HEIGHT, 1)
val_y = [i[1] for i in val]
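# Sketch (not from the source): how the arrays above would typically be fed to
# training if ACNet wraps a Keras-style model; the fit/save interface here is
# an assumption, not taken from the snippet.
model.fit(X, np.array(Y),
          validation_data=(val_x, np.array(val_y)),
          epochs=EPOCHS,
          batch_size=64)
model.save(MODEL_NAME)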
class Worker(object):

    def __init__(self, sess, state_len, actions_no, actions_bounds, max_depth,
                 weights, workers_no, dataset, trainable):
        from ac_net import ACNet
        from net_evaluator import NetEvaluator

        self.processes = workers_no + 1
        self.actions_no = actions_no
        self.eps = EPS
        self.evaluator = NetEvaluator(ACTIONS_NO, trainable=trainable,
                                      dataset=dataset)
        self.state = STARTING_STATE.copy()
        self.state_len = state_len
        self.max_depth = max_depth
        self.t = 1
        self.prev_acc = self.evaluator.baseline
        self.model = None
        self.current_max_depth = self.max_depth
        self.old_weights = weights
        self.grads = []
        self.samples = []
        self.best_samples = []
        self.best_reward = -1000
        self.memory = dict()
        self.ac_net = ACNet(sess, self.state_len, self.actions_no,
                            actions_bounds, 'worker')
        self.ac_net.set_weights(self.old_weights)

    def update_ac_weights(self, weights):
        self.old_weights = weights
        if self.ac_net is not None:
            self.ac_net.set_weights(weights)

    def get_grads(self):
        return self.grads

    def fetch_from_memory(self, state):
        # Round the state so that near-identical architectures share the
        # same memoisation key.
        state_repr = state.copy()
        for i in range(len(state)):
            if i % ACTIONS_NO == ACTIONS_NO - 1:
                state_repr[i] = np.round(state_repr[i], 1)
            else:
                state_repr[i] = np.round(state_repr[i], 0)
        state_repr = str(state_repr)
        if state_repr in self.memory:
            return self.memory[state_repr]
        return None

    def add_to_memory(self, state, acc):
        state_repr = state.copy()
        for i in range(len(state)):
            if i % ACTIONS_NO == ACTIONS_NO - 1:
                state_repr[i] = np.round(state_repr[i], 1)
            else:
                state_repr[i] = np.round(state_repr[i], 0)
        state_repr = str(state_repr)
        self.memory[state_repr] = acc

    def play(self):
        # Greedy rollout with a fully trainable evaluator.
        prev_trainable = self.evaluator.builder.trainable
        self.evaluator.builder.trainable = True
        self.state = STARTING_STATE.copy()
        self.prev_acc = 0
        t_start = self.t
        episode_flag = True
        self.current_layer = 0
        while episode_flag:
            action, policy_mean, policy_sigma, value = self.ac_net.predict(
                self.state.reshape(1, self.state_len // self.actions_no,
                                   self.actions_no))
            action = action[self.current_layer]
            value = value[self.current_layer]
            reward, new_state = self.perform_action(action)
            self.state = new_state
            self.t += 1
            self.current_layer += 1
            if self.t - t_start >= self.current_max_depth:
                episode_flag = False
        self.evaluator.builder.trainable = prev_trainable
        return self.prev_acc, self.state

    def run(self, return_episode=True):
        self.grads = []
        self.samples = []
        t_start = self.t
        # Gather experiences
        self.eps = self.eps * EPS_RED_FACTOR
        while self.t - t_start < self.max_depth:
            self.current_layer = 0
            R = 0.0
            self.state = STARTING_STATE.copy()
            self.prev_acc = self.evaluator.baseline
            del self.model
            self.model = None
            self.d_theta = 0
            self.d_theta_v = 0
            self.alive = True
            s_buffer = []
            r_buffer = []
            a_buffer = []
            v_buffer = []
            episode_flag = True
            while episode_flag:
                action, policy_mean, policy_sigma, value = self.ac_net.predict(
                    self.state.reshape(1, self.state_len // self.actions_no,
                                       self.actions_no))
                action = action[self.current_layer]
                # Epsilon-greedy exploration within the action bounds.
                if np.random.uniform() < self.eps:
                    action = (np.random.uniform() *
                              (ACTIONS_BOUNDS[1] - ACTIONS_BOUNDS[0])) // 1
                value = value[self.current_layer]
                reward, new_state = self.perform_action(action)
                r_buffer.append(reward)
                a_buffer.append([action])
                v_buffer.append([value])
                R = reward + gamma * R
                self.state = new_state
                self.t += 1
                self.current_layer += 1
                self.print_episode(policy_mean, policy_sigma, action, value,
                                   reward)
                if self.current_layer >= self.current_max_depth:
                    episode_flag = False
                    # Kill the gradients at the end of the episode.
                    r_buffer.append(0)
                    a_buffer.append([policy_mean[-1]])
                    v_buffer.append([0])
                    # Add the final state.
                    s_buffer.append(
                        self.state.reshape(
                            1, self.state_len // self.actions_no,
                            self.actions_no))

            # Compute the discounted returns for this episode.
            R = 0.0
            rev_rewards = []
            for r in reversed(r_buffer):
                R = R * gamma + r
                rev_rewards.append(R)
            rev_rewards.reverse()
            reward = np.array(rev_rewards).reshape((-1, 1))
            action = np.array(a_buffer).reshape((-1, self.actions_no))
            self.samples.append((self.state, action, reward))

        np.random.shuffle(self.samples)
        # Transform to column vectors
        state, action, reward = list(map(np.array, zip(*self.samples)))
        v_l, p_l, e, grads = self.ac_net.fit(state, action, reward)
        self.samples = []
        self.grads = grads
        if self.current_max_depth < self.max_depth and self.t > 100:
            self.current_max_depth += 1
        self.grads = self.ac_net.get_grads()
        if return_episode:
            return self.prev_acc, self.state
        else:
            return self.play()

    def perform_action(self, action, search_mem=True):

        def get_acc(new_state):
            return self.evaluator.evaluate_model(new_state,
                                                 epochs=TRAIN_EPOCHS)

        # Get the new state.
        new_state = self.update_state(action)
        # Build the model and evaluate it, with optional memoisation.
        acc = self.fetch_from_memory(new_state)
        if not search_mem:
            acc = get_acc(new_state)
        elif acc is None:
            acc = get_acc(new_state)
            self.add_to_memory(new_state, acc)
        # The reward is the accuracy improvement.
        reward = acc - self.prev_acc
        self.prev_acc = acc
        return reward, new_state

    def update_state(self, action, old_state=None):
        '''Update the state, based on the action taken.'''
        if old_state is None:
            old_state = np.copy(self.state)
        new_state = np.copy(old_state)
        # Write the continuous action vector into the slot of the next layer.
        index = (self.current_layer + 1) * ACTIONS_NO
        for i in range(self.actions_no):
            new_state[index + i] = action[i]
        return new_state

    def print_episode(self, policy_mean, policy_sigma, action, value, reward):
        if DEBUG:
            print('Policy_mean :\n',
                  np.array2string(policy_mean, precision=3))
            print('Policy_sigma :\n',
                  np.array2string(policy_sigma, precision=3))
            print('Action :\n', action)
            print('Value :\n', np.array2string(value, precision=3))
            print('Layer :', self.current_layer)
            print('State :', self.state)
            print('Reward : %.3f' % reward,
                  'Accuracy : %.3f' % self.prev_acc)
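# Sketch (not from the source): fetch_from_memory and add_to_memory above
# duplicate the key-rounding logic; a small refactor keeping the same rule
# (round the last component of each ACTIONS_NO-sized slot to one decimal,
# everything else to an integer). _memory_key is a hypothetical Worker method.
def _memory_key(self, state):
    state_repr = state.copy()
    for i in range(len(state_repr)):
        decimals = 1 if i % ACTIONS_NO == ACTIONS_NO - 1 else 0
        state_repr[i] = np.round(state_repr[i], decimals)
    return str(state_repr)
# With this helper, both methods reduce to a dictionary lookup / assignment
# keyed on self._memory_key(state).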
from ac_net import ACNet
# from resnet import ResNet
# from dense_net import DenseNet

WIDTH = 64
HEIGHT = 48
VERSION = 4
EPOCHS = 50
SAMPLES = 400
MODEL = 'acnet'
MODEL_NAME = '{}-v{}-{}-epochs-{}k-samples.model'.format(MODEL, VERSION,
                                                         EPOCHS, SAMPLES)

model = ACNet(WIDTH, HEIGHT)
# model = ResNet(WIDTH, HEIGHT)
# model = DenseNet(WIDTH, HEIGHT)
model = load_model(MODEL_NAME)

t_time = 0.05


# Defining the possible movements (key presses sent to the game window).
def straight():
    PressKey(W)
    ReleaseKey(A)
    ReleaseKey(D)


def left():
    PressKey(W)
def run(render=False):
    env = gym.make(GAME)
    s = env.reset()
    N_S, N_A = env.observation_space.shape[0], env.action_space.shape[0]
    A_BOUND = [env.action_space.low, env.action_space.high]
    env.close()

    sess = tf.InteractiveSession()

    OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
    # OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')

    # We only need the global network's parameters.
    GLOBAL_AC = ACNet(sess, GLOBAL_NET_SCOPE, N_S, N_A, A_BOUND, OPT_A,
                      entropy_beta=ENTROPY_BETA)

    # Create the training workers.
    workers = []
    for i in range(N_WORKERS):
        i_name = 'W_%i' % i  # worker name
        env = gym.make(GAME)
        ac = ACNet(sess, i_name, N_S, N_A, A_BOUND, OPT_A,
                   global_ac=GLOBAL_AC, entropy_beta=ENTROPY_BETA)
        workers.append(ACWorker(ac, env, GAMMA, name=i_name))

    # Create the test worker.
    env = gym.make(GAME)
    ac = ACNet(sess, 'test', N_S, N_A, A_BOUND, OPT_A,
               global_ac=GLOBAL_AC, entropy_beta=ENTROPY_BETA)
    tester = ACWorker(ac, env, GAMMA, name="test")

    # Create the save worker.
    saver = SaveWorker(sess)

    # Initialize the variables.
    sess.run(tf.global_variables_initializer())

    '''
    if OUTPUT_GRAPH:
        if os.path.exists(LOG_DIR):
            shutil.rmtree(LOG_DIR)
        tf.summary.FileWriter(LOG_DIR, sess.graph)
    '''

    worker_threads = []

    # Start the training workers; bind each worker to its own lambda so the
    # loop variable is not shared across threads.
    for worker in workers:
        job = lambda worker=worker: worker.train()
        t = threading.Thread(target=job)
        t.start()
        worker_threads.append(t)

    # Start the test worker.
    job = lambda: tester.test(render=render)
    t = threading.Thread(target=job)
    t.start()
    worker_threads.append(t)

    # Start the save worker.
    job = lambda: saver()
    t = threading.Thread(target=job)
    t.start()
    worker_threads.append(t)

    # Wait for all threads to finish.
    COORD = tf.train.Coordinator()
    COORD.join(worker_threads)
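# Minimal entry point for the script above, assuming the module-level
# constants (GAME, N_WORKERS, LR_A, GAMMA, GLOBAL_NET_SCOPE, ENTROPY_BETA)
# are defined elsewhere in the file.
if __name__ == '__main__':
    run(render=False)   # pass render=True to watch the test worker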