import logging

import numpy as np

# CNNModel, self_play, bot_fight, NUM_TRAINS and BOT_GAMES are defined
# elsewhere in this project.


def training_pipeline():
    """Iteratively train contender models and promote any that beat the champion."""
    model_num = 1
    best_model_num = 0
    best_model = CNNModel(best_model_num)
    best_model.save_weights()
    for _ in range(NUM_TRAINS):
        # Generate training data by self-play with the current best model.
        states, valids, improved_policy, win_loss = self_play(best_model_num)
        contender = CNNModel(model_num)
        contender.train_model(np.array(states, np.uint32),
                              np.array(valids, np.float32),
                              np.array(win_loss),
                              np.array(improved_policy))
        # Evaluate the contender against the current champion.
        contender_wins = bot_fight(best_model.model_num, contender.model_num)
        # Promote the contender only if it wins at least 55% of the games.
        if contender_wins >= np.ceil(BOT_GAMES * 0.55):
            best_model = contender
            best_model_num = contender.model_num
            logging.info(
                f'best model: {best_model_num}, new model won {contender_wins}')
            best_model.save_weights(best=True)
        model_num += 1
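# Illustration (not part of the pipeline): a minimal sketch of the promotion
# gate above. BOT_GAMES = 20 is an assumed value for this example only; the
# real constant is defined elsewhere in this project. The contender replaces
# the champion only if it wins at least ceil(0.55 * BOT_GAMES) evaluation
# games, i.e. a clearly-better-than-even score against the current best model.
def _promotion_gate_example():
    import numpy as np
    BOT_GAMES = 20                           # assumed value, illustration only
    contender_wins = 12                      # hypothetical bot_fight() result
    threshold = np.ceil(BOT_GAMES * 0.55)    # ceil(11.0) -> 11.0
    return contender_wins >= threshold       # True -> contender is promoted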
import os

import numba
import numpy as np
import tensorflow as tf

# The hyperparameters (use_RNN, batch_size, IMG_H, IMG_W, k, hidden_unit_num,
# process_num, a_num, gamma, lr, epochs, learning_batch, max_learning_times)
# and the names RNNModel, CNNModel, CustomSchedule and optim come from this
# project's config and model modules.


class ACBrain:
    def __init__(self, talker):
        super().__init__()
        # Run one dummy forward pass so the model's weights are built.
        if use_RNN:
            self.model = RNNModel()
            self.model.call(
                np.random.random(
                    (batch_size, IMG_H, IMG_W, k)).astype(np.float32),
                np.zeros((batch_size, hidden_unit_num), dtype=np.float32),
                np.zeros((batch_size, hidden_unit_num), dtype=np.float32))
        else:
            self.model = CNNModel()
            self.model.call(
                np.random.random(
                    (batch_size, IMG_H, IMG_W, k)).astype(np.float32))
        self.talker = talker
        self.i = 1
        self.optimizer = optim.Adam(learning_rate=CustomSchedule(lr))
        self.states_list = self.talker.states_list
        self.one_episode_reward_index = 0

    def cnn_forward_calc(self, state):
        a_prob, v = self.model.call(state)
        return a_prob.numpy(), v.numpy()  # [(*, a_num), (*, 1)] np.ndarray

    def rnn_forward_calc(self, state, h, c):
        a_prob, v, hc = self.model.call(state, h, c)
        return a_prob.numpy(), v.numpy(), hc[0].numpy(), hc[1].numpy()

    def run(self):
        print("brain", os.getpid())
        total_obs = np.zeros((batch_size, process_num, IMG_H, IMG_W, k),
                             dtype=np.float32)
        total_v = np.zeros((batch_size + 1, process_num), dtype=np.float32)
        total_as = np.zeros((batch_size, process_num), dtype=np.int32)
        total_rs = np.zeros((batch_size, process_num), dtype=np.float32)
        total_is_done = np.zeros((batch_size, process_num), dtype=np.float32)
        total_old_ap = np.zeros((batch_size, process_num, a_num),
                                dtype=np.float32)
        if use_RNN:
            total_h = np.zeros((batch_size, process_num, hidden_unit_num),
                               dtype=np.float32)
            total_c = np.zeros((batch_size, process_num, hidden_unit_num),
                               dtype=np.float32)
        temp_obs = np.zeros((process_num, IMG_H, IMG_W, k), dtype=np.float32)
        while True:
            if use_RNN:
                # Carry the recurrent state over from the previous batch.
                temp_h = total_h[-1, :, :]
                temp_c = total_c[-1, :, :]
            for i in range(batch_size):
                # Collect one observation from every actor process.
                for j in range(process_num):
                    child_id, data = self.talker.recv()
                    temp_obs[child_id, :, :, :] = np.array(data[0],
                                                           dtype=np.float32)
                    if use_RNN and data[1]:
                        # Episode ended in this process: reset its LSTM state.
                        temp_h[child_id, :] = 0
                        temp_c[child_id, :] = 0
                total_obs[i, :, :, :, :] = temp_obs
                if use_RNN:
                    total_h[i, :, :] = temp_h
                    total_c[i, :, :] = temp_c
                    a_prob, v, temp_h, temp_c = self.rnn_forward_calc(
                        temp_obs, temp_h, temp_c)
                else:
                    a_prob, v = self.cnn_forward_calc(temp_obs)
                # Send each actor its action distribution.
                for child_id in range(process_num):
                    self.talker.send(a_prob[child_id], child_id)
                v.resize((process_num, ))
                total_v[i, :] = v
                total_old_ap[i, :, :] = a_prob
            # One extra observation per process to bootstrap the final value.
            for j in range(process_num):
                child_id, data = self.talker.recv()
                temp_obs[child_id, :, :, :] = np.array(data[0],
                                                       dtype=np.float32)
            if use_RNN:
                a_prob, v, _, _ = self.rnn_forward_calc(temp_obs, temp_h,
                                                        temp_c)
            else:
                a_prob, v = self.cnn_forward_calc(temp_obs)
            for child_id in range(process_num):
                self.talker.send(a_prob[child_id], child_id)
            v.resize((process_num, ))
            total_v[-1, :] = v
            # Collect the actors' trajectories:
            # data = [send_as, send_rs, send_is_done, episode_reward].
            for j in range(process_num):
                child_id, data = self.talker.recv()
                total_as[:, child_id] = data[0]
                total_rs[:, child_id] = data[1]
                total_is_done[:, child_id] = data[2]
                # Record each finished episode's reward (e.g. for TensorBoard).
                for one_episode_reward in data[3]:
                    self.model.record(name='one_episode_reward',
                                      data=one_episode_reward,
                                      step=self.one_episode_reward_index)
                    self.one_episode_reward_index += 1
            total_realv, total_adv = self.calc_realv_and_adv_GAE(
                total_v, total_rs, total_is_done)
            # Flatten (batch_size, process_num, ...) into one sample axis.
            total_obs.resize((process_num * batch_size, IMG_H, IMG_W, k))
            total_as.resize((process_num * batch_size, ))
            total_old_ap.resize((process_num * batch_size, a_num))
            total_adv.resize((process_num * batch_size, ))
            total_realv = total_realv.reshape((process_num * batch_size, ))
            if use_RNN:
                total_h.resize((process_num * batch_size, hidden_unit_num))
                total_c.resize((process_num * batch_size, hidden_unit_num))
                self.rnn_learn(total_obs,
                               tf.one_hot(total_as, depth=a_num).numpy(),
                               total_old_ap, total_adv, total_realv,
                               total_h, total_c)
            else:
                self.cnn_learn(total_obs,
                               tf.one_hot(total_as, depth=a_num).numpy(),
                               total_old_ap, total_adv, total_realv)
            if self.i == max_learning_times:
                from model import weight_dir
                self.model.save_weights(weight_dir +
                                        str(self.model.total_index),
                                        save_format='tf')
                print("learning done")
                return 0
            self.i += 1
            print("-------------------------")
            print(self.i)
            # Tell the agents they can start acting in the env again.
            for child_id in range(process_num):
                self.states_list[child_id] = 0
                self.talker.send("ok", child_id)
            # Restore the (batch_size, process_num, ...) shapes for the
            # next batch.
            total_obs.resize((batch_size, process_num, IMG_H, IMG_W, k))
            total_as.resize((batch_size, process_num))
            total_old_ap.resize((batch_size, process_num, a_num))
            total_adv.resize((batch_size, process_num))
            if use_RNN:
                total_h.resize((batch_size, process_num, hidden_unit_num))
                total_c.resize((batch_size, process_num, hidden_unit_num))

    def cnn_learn(self, total_obs, total_as, total_old_ap, total_adv,
                  total_real_v):
        for _ in range(epochs):
            # Sample a minibatch (with replacement) and take one clipped
            # gradient step.
            sample_index = np.random.choice(total_as.shape[0],
                                            size=learning_batch)
            grads, loss = self.model.total_grad(total_obs[sample_index],
                                                total_as[sample_index],
                                                total_adv[sample_index],
                                                total_real_v[sample_index],
                                                total_old_ap[sample_index])
            grads, grad_norm = tf.clip_by_global_norm(grads, 0.5)
            self.optimizer.apply_gradients(
                zip(grads, self.model.trainable_weights))

    def rnn_learn(self, total_obs, total_as, total_old_ap, total_adv,
                  total_real_v, total_h, total_c):
        for _ in range(epochs):
            sample_index = np.random.choice(total_as.shape[0],
                                            size=learning_batch)
            grads, loss = self.model.total_grad(
                total_obs[sample_index], total_as[sample_index],
                total_adv[sample_index], total_real_v[sample_index],
                total_old_ap[sample_index], total_h[sample_index],
                total_c[sample_index])
            grads, grad_norm = tf.clip_by_global_norm(grads, 0.5)
            self.optimizer.apply_gradients(
                zip(grads, self.model.trainable_weights))

    @staticmethod
    @numba.jit(nopython=True)
    def calc_realv_and_adv(v, r, done):
        # Plain discounted returns; advantage = return - value baseline.
        length = r.shape[0]
        num = r.shape[1]
        realv = np.zeros((length + 1, num), dtype=np.float32)
        adv = np.zeros((length, num), dtype=np.float32)
        realv[-1, :] = v[-1, :] * (1 - done[-1, :])
        for t in range(length - 1, -1, -1):
            realv[t, :] = realv[t + 1, :] * gamma * (1 - done[t, :]) + r[t, :]
            adv[t, :] = realv[t, :] - v[t, :]
        return realv[:-1, :], adv  # the bootstrap value is not needed

    @staticmethod
    @numba.jit(nopython=True)
    def calc_realv_and_adv_GAE(v, r, done):
        # Generalized Advantage Estimation with lambda = 0.95.
        length = r.shape[0]
        num = r.shape[1]
        adv = np.zeros((length + 1, num), dtype=np.float32)
        for t in range(length - 1, -1, -1):
            delta = r[t, :] + v[t + 1, :] * gamma * (1 - done[t, :]) - v[t, :]
            adv[t, :] = delta + gamma * 0.95 * adv[t + 1, :] * (1 - done[t, :])
        adv = adv[:-1, :]
        realv = adv + v[:-1, :]
        return realv, adv
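# Illustration (not part of ACBrain): a self-contained, hand-checkable run of
# the GAE recursion implemented in calc_realv_and_adv_GAE above. The discount
# gamma = 0.99 is an assumed value for this example only; lam = 0.95 matches
# the hard-coded 0.95 in the class.
def _gae_example():
    import numpy as np
    gamma, lam = 0.99, 0.95
    # One environment (num = 1), three steps plus the bootstrap value v[3].
    v = np.array([[1.0], [1.0], [1.0], [1.0]], dtype=np.float32)  # (length + 1, num)
    r = np.array([[0.0], [0.0], [1.0]], dtype=np.float32)         # (length, num)
    done = np.array([[0.0], [0.0], [1.0]], dtype=np.float32)      # episode ends at t = 2
    length = r.shape[0]
    adv = np.zeros((length + 1, 1), dtype=np.float32)
    for t in range(length - 1, -1, -1):
        # TD error; (1 - done) blocks bootstrapping across episode boundaries.
        delta = r[t] + gamma * v[t + 1] * (1 - done[t]) - v[t]
        adv[t] = delta + gamma * lam * adv[t + 1] * (1 - done[t])
    adv = adv[:-1]
    realv = adv + v[:-1]  # value targets = advantage + baseline, as in the class
    # Here adv[2] = 0, adv[1] = -0.01, adv[0] = -0.01 + 0.99 * 0.95 * (-0.01).
    return realv, adv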
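# Illustration (not part of ACBrain): model.total_grad is defined in this
# project's model module; judging from its arguments (actions, advantages,
# value targets, old action probabilities) it presumably computes a PPO-style
# clipped surrogate loss. The sketch below is a generic version of such a loss
# under that assumption, not the project's actual implementation; the clip
# range 0.2 and the value-loss weight 0.5 are assumed values.
def _ppo_loss_sketch(new_ap, old_ap, actions_onehot, adv, real_v, v_pred,
                     clip=0.2):
    import tensorflow as tf
    # Probability ratio of the taken action under the new vs. old policy.
    new_pa = tf.reduce_sum(new_ap * actions_onehot, axis=-1)
    old_pa = tf.reduce_sum(old_ap * actions_onehot, axis=-1)
    ratio = new_pa / (old_pa + 1e-8)
    # Clipped surrogate objective (maximized, hence negated for a loss).
    surrogate = tf.minimum(ratio * adv,
                           tf.clip_by_value(ratio, 1 - clip, 1 + clip) * adv)
    policy_loss = -tf.reduce_mean(surrogate)
    value_loss = tf.reduce_mean(tf.square(real_v - v_pred))  # critic regression
    return policy_loss + 0.5 * value_loss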