def work(self):
    try:
        self._loop()
    except ConnectionResetError:
        logger.error(f"<{self.worker_id}> Connection to the client was reset, connection info: {self.addr}")
    except TimeoutError:
        logger.error(f"<{self.worker_id}> Connection to the client timed out, connection info: {self.addr}")
    except Exception:
        logger.exception(f"<{self.worker_id}> Unexpected exception inside the worker, loop count: {self.loop_counter}")
    finally:
        logger.info(f"<{self.worker_id}> Worker stopped")
        self.handler.handle_stop()
        self.conn.close()
def _loop(self):
    logger.info(f"<{self.worker_id}> Worker starting")
    while self._running:
        recv = self.conn.recv_json()
        if self.record_transmission:
            self.transmission_logger.debug(json.dumps(recv))
        if recv['type'] == INITIALIZE:
            self.handler.handle_init_data(recv['data'])
        elif recv['type'] == RESET:
            self.handler.handle_reset_data()
        elif recv['type'] == STEP:
            result = self.handler.handle_step_data(recv['data'])
            self.conn.send_json(result)
        elif recv['type'] == STOP:
            break
        self.loop_counter += 1
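# The loop above defines a simple JSON request protocol: every message carries a
# 'type' field (INITIALIZE / RESET / STEP / STOP) and an optional 'data' payload,
# and only STEP messages get a reply. A minimal client-side sketch follows; it
# assumes a connection object exposing the same send_json()/recv_json() pair used
# above, since the wire framing lives inside that wrapper and is not shown here.

def run_episode(conn, init_payload, observations):
    """Drive one episode against the worker loop above (illustrative sketch)."""
    conn.send_json({'type': INITIALIZE, 'data': init_payload})
    conn.send_json({'type': RESET})
    results = []
    for obs in observations:
        conn.send_json({'type': STEP, 'data': obs})
        results.append(conn.recv_json())  # reply produced by handle_step_data
    conn.send_json({'type': STOP})  # asks the worker loop to exit cleanly
    return results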
def run(self):
    listener = socket.socket()
    listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    listener.bind((self.host, self.port))
    listener.listen(10)
    logger.info(f"Ready to accept remote connections, listening on {self.host}:{self.port}")
    worker = None  # only set in single-worker mode; guards the interrupt handler below
    try:
        if self.multi_worker:
            self.thread_controller = ThreadController()
            while True:
                conn, addr = listener.accept()
                self.thread_controller.create_work(addr, conn, self.handler_factory)
        else:
            conn, addr = listener.accept()
            worker = Worker(addr, conn, self.handler_factory)
            worker.work()
    except KeyboardInterrupt:
        logger.info("AIServer terminated by the user")
        if self.thread_controller:
            self.thread_controller.request_stop()
        elif worker:
            worker.stop()
    finally:
        logger.info("AIServer is shutting down, please wait...")
        listener.close()
        if self.thread_controller:
            self.thread_controller.join()
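# Usage sketch: how the server above might be stood up. The AIServer constructor
# is not shown in this excerpt, so the keyword arguments below (host, port,
# multi_worker, handler_factory) are assumptions mirroring the attributes that
# run() reads; adjust them to the real signature.

server = AIServer(host='0.0.0.0', port=9999,
                  multi_worker=True,
                  handler_factory=lambda: TransmissionTest())
server.run()  # blocks until Ctrl+C; KeyboardInterrupt is handled inside run()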
def load_model(self):
    # Restore the latest checkpoint from the model directory set up in __init__.
    ckpt = tf.train.get_checkpoint_state(self.model_path + str(self.load_model_num) + '/')
    if ckpt and ckpt.model_checkpoint_path:
        self.saver.restore(self.sess, ckpt.model_checkpoint_path)
        logger.info('Neural network loaded')
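# Saving counterpart: the configuration below defines save_net and
# save_interval_steps, but no save routine appears in this excerpt. A sketch
# built on the standard tf.train.Saver API, mirroring the directory layout
# load_model() reads from (the checkpoint filename is an assumption):

def save_model(self, step):
    path = self.model_path + str(self.load_model_num) + '/model.ckpt'
    self.saver.save(self.sess, path, global_step=step)  # writes model.ckpt-<step>
    logger.info('Neural network checkpoint saved')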
save_net = False
save_model_num = '2010211129_red_new'
load_net = False
load_model_num = '2010211129_red_new'
save_interval_steps = 10000
is_Train = True
max_map_size_x = 77 * 2
max_map_size_y = 92 * 2
step_num = 2
move_space = sum(range(step_num + 1)) * 6
map_chanel = 1
input_entity_size = 384
batch = 128
IS_MULTI_PROCESSING = True

logger.info("*** SERVER RUNNING ***")
cf = configparser.ConfigParser()
cf.read("./config/server.ini")
host = cf.get('Server', 'host')
port = cf.getint('Server', 'port')
transmission_test = cf.getboolean('Handler', 'transmission_test')

ppo = None
if transmission_test:
    func = lambda: TransmissionTest()
else:
    ppo = ParallelPPO(max_map_size_x=max_map_size_x,
                      max_map_size_y=max_map_size_y,
                      move_space=move_space,
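# Example of the expected ./config/server.ini, limited to the keys read above
# (the values are placeholders):
#
#   [Server]
#   host = 0.0.0.0
#   port = 9999
#
#   [Handler]
#   transmission_test = false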
def update(self, current_step, spatial_actor, spatial_critic, entity_actor,
           entity_critic, scalar_state, action, my_id, obj_id, move,
           move_type, r):
    if self.is_ppo_move:
        self.ppo_move.update(current_step, entity_critic, move_type, r)
        # return
    logger.info('Updating the network')
    with tf.device(self._device_str2):
        # sync the old policy with the current one, then compute the advantage
        self.sess.run(self.update_oldpi_op)
        adv = self.sess.run(
            self.advantage, {
                self.tf_entity_critic: entity_critic,
                self.tf_scalar_state: scalar_state,
                self.tfdc_r: r
            })
        # adv = (adv - adv.mean()) / (adv.std() + 1e-6)  # sometimes helpful

    # update actor
    with tf.device(self._device_str1):
        if self.METHOD['name'] == 'kl_pen':
            for _ in range(self.A_UPDATE_STEPS):
                _, kl = self.sess.run(
                    [self.atrain_op, self.kl_mean], {
                        self.tf_spatial_critic: spatial_critic,
                        self.tf_entity_critic: entity_critic,
                        self.tfaction_type: action,
                        self.tfadv: adv,
                        self.tflam: self.METHOD['lam']
                    })
                if kl > 4 * self.METHOD['kl_target']:  # this is in Google's paper
                    break
            if kl < self.METHOD['kl_target'] / 1.5:  # adaptive lambda, this is in OpenAI's paper
                self.METHOD['lam'] /= 2
            elif kl > self.METHOD['kl_target'] * 1.5:
                self.METHOD['lam'] *= 2
            # lambda sometimes explodes; this clipping is my solution
            self.METHOD['lam'] = np.clip(self.METHOD['lam'], 1e-4, 10)
        else:  # clipping method; works better in practice (OpenAI's paper)
            for _ in range(self.A_UPDATE_STEPS):
                self.sess.run(
                    self.atrain_op, {
                        self.tf_entity_critic: entity_critic,
                        self.tf_scalar_state: scalar_state,
                        self.tfaction_type: action,
                        self.tfmy_id: my_id,
                        self.tfobj_id: obj_id,
                        self.tfmove: move,
                        self.tfadv: adv
                    })

    # update critic
    with tf.device(self._device_str2):
        for _ in range(self.C_UPDATE_STEPS):
            self.sess.run(
                self.ctrain_op, {
                    self.tf_entity_critic: entity_critic,
                    self.tf_scalar_state: scalar_state,
                    self.tfdc_r: r
                })

    # track squared advantage and value estimates for the loss display
    adv_lis = np.squeeze(np.square(adv), axis=1)
    self.adv_list += adv_lis.tolist()
    critic_list = np.squeeze(self.get_v(entity_critic, scalar_state), axis=1)
    self.critic_list += critic_list.tolist()
    if self.display_loss_counter == self.display_loss_interval:
        self.display_loss(self.adv_list, self.critic_list)
        self.display_loss_counter = 0
    self.display_loss_counter += 1
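# Standalone NumPy sketch of the clipped-surrogate rule optimized by the actor
# update above, written per head (the graph applies it to action type, friendly
# unit, enemy unit, and move, then sums the four terms). eps corresponds to
# METHOD['epsilon'] = 0.2.

import numpy as np

def clipped_surrogate_loss(pi_prob, old_pi_prob, adv, eps=0.2):
    # probability ratio of the new policy to the old one for the taken action
    ratio = pi_prob / np.maximum(old_pi_prob, 1e-5)
    # pessimistic minimum of the unclipped and clipped objectives
    surr = np.minimum(ratio * adv, np.clip(ratio, 1. - eps, 1. + eps) * adv)
    return -np.mean(surr)  # negated because the optimizer minimizes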
def __init__(self, actions_space, max_map_size_x, max_map_size_y, move,
             map_chanel, input_entity_size, load_net, is_train, batch,
             load_model_num, use_gpu):
    self._device_str1 = '/gpu:0' if use_gpu else '/cpu:0'
    self._device_str2 = '/gpu:1' if use_gpu else '/cpu:0'
    logger.info("Initializing PPO")
    logger.info(f"Using device: {self._device_str1}")
    logger.info(f"Using device: {self._device_str2}")
    self.A_LR = 0.0001
    self.C_LR = 0.0002
    self.batch = batch
    self.A_UPDATE_STEPS = 10
    self.C_UPDATE_STEPS = 10
    self.METHOD = [
        dict(name='kl_pen', kl_target=0.01, lam=0.5),  # KL penalty
        dict(name='clip', epsilon=0.2),  # clipped surrogate objective; works better in practice
    ][1]  # choose the optimization method

    # session configuration
    config = tf.ConfigProto()
    # config.gpu_options.per_process_gpu_memory_fraction = 0.5  # fraction of GPU memory to use
    config.gpu_options.allow_growth = True  # allocate GPU memory on demand
    config.log_device_placement = False
    self.sess = tf.Session(config=config)

    self.C_DIM = 1
    # experience buffers, filled during rollout collection
    self.memory_entity_critic, self.memory_scalar_state, self.memory_r = None, None, None
    self.memory_action, self.memory_my_id, self.memory_obj_id, self.memory_move = None, None, None, None
    self.actions_space = actions_space
    self.max_map_size_x = max_map_size_x
    self.max_map_size_y = max_map_size_y
    self.move = move
    self.map_chanel = map_chanel
    self.action_type_size = 14
    self.my_id_size = 34
    self.obj_id_size = 34
    self.input_entity_size = input_entity_size
    self.move_type = 10
    self.input_scalar_size = 20
    self.update_interval = 1
    self.lstm_batch_size = 1
    self.eps = 0.000001

    # saving and loading the network
    self.model_path = 'PredictAI_model/'
    self.load_model_num = load_model_num
    self.save_interval = 0

    # loss-display state
    self.adv_list = []
    self.critic_list = []
    self.fig = plt.figure()
    self.display_loss_interval = 10
    self.display_loss_counter = 0

    # movement sub-network
    # self.ppo_move = PPO_MOVE(batch, step_num=2)
    self.ppo_move = PPONet(step_num=3)
    self.is_ppo_move = False

    plt.ion()  # comment out for a single run; keep enabled when running continuously
    plt.show()
    self.get_on_flage = False

    # actor & critic
    logger.info('Building the network')
    with tf.variable_scope('ACnet'):
        self.ACNet = ACNet(self.max_map_size_y, self.max_map_size_x,
                           self.map_chanel, self.lstm_batch_size,
                           self.actions_space, self.action_type_size,
                           self.my_id_size, self.obj_id_size, self.move,
                           self.A_LR, use_gpu)
        self.tf_entity_critic = tf.placeholder(
            tf.float32, [None, self.input_entity_size], name='inputs_entity')
        self.tf_scalar_state = tf.placeholder(
            tf.float32, [None, self.input_scalar_size], name='inputs_scale')
        self.tf_spatial_critic = tf.placeholder(
            tf.float32,
            [None, self.max_map_size_y, self.max_map_size_x, self.map_chanel],
            name='actor_inputs')
        self.tfdc_r = tf.placeholder(tf.float32, [None, self.C_DIM],
                                     'discounted_r')
        self.ANet, self.old_ANet, self.critic = self.ACNet.build_ACNet(
            self.tf_entity_critic, self.tf_scalar_state, True)

        # critic loss: mean squared advantage
        self.advantage = self.tfdc_r - self.critic
        self.closs = tf.reduce_mean(tf.square(self.advantage))
        with tf.device('/cpu:0'):
            tf.summary.scalar('closs', self.closs)  # tensorflow >= 0.12
        self.ctrain_op = tf.train.AdamOptimizer(self.C_LR).minimize(self.closs)

    with tf.variable_scope('update_oldpi'):
        self.update_oldpi_op = [
            oldp.assign(p) for p, oldp in zip(
                self.ANet['actor_params'], self.old_ANet['actor_params'])
        ]

    # gather the probability of the chosen action type for the update
    self.tfaction_type = tf.placeholder(tf.int32, [None, 1], 'action')
    action_type_indices = tf.stack([
        tf.range(tf.shape(self.tfaction_type)[0], dtype=tf.int32),
        tf.squeeze(self.tfaction_type, axis=1)
    ], axis=1)
    action_type_prob = tf.gather_nd(
        params=self.ANet['action_type_prob'],
        indices=action_type_indices)  # shape=(None,)
    oldaction_type_prob = tf.gather_nd(
        params=self.old_ANet['action_type_prob'],
        indices=action_type_indices)  # shape=(None,)

    # gather the probability of the chosen friendly unit
    self.tfmy_id = tf.placeholder(tf.int32, [None, 1], 'my_id')
    my_id_indices = tf.stack([
        tf.range(tf.shape(self.tfmy_id)[0], dtype=tf.int32),
        tf.squeeze(self.tfmy_id, axis=1)
    ], axis=1)
    my_id_prob = tf.gather_nd(params=self.ANet['my_id_prob'],
                              indices=my_id_indices)  # shape=(None,)
    oldmy_id_prob = tf.gather_nd(params=self.old_ANet['my_id_prob'],
                                 indices=my_id_indices)  # shape=(None,)

    # gather the probability of the chosen enemy unit
    self.tfobj_id = tf.placeholder(tf.int32, [None, 1], 'obj_id')
    obj_id_indices = tf.stack([
        tf.range(tf.shape(self.tfobj_id)[0], dtype=tf.int32),
        tf.squeeze(self.tfobj_id, axis=1)
    ], axis=1)
    obj_id_prob = tf.gather_nd(params=self.ANet['obj_id_prob'],
                               indices=obj_id_indices)  # shape=(None,)
    oldobj_id_prob = tf.gather_nd(params=self.old_ANet['obj_id_prob'],
                                  indices=obj_id_indices)  # shape=(None,)

    # gather the probability of the chosen move position
    self.tfmove = tf.placeholder(tf.int32, [None, 1], 'move_x')
    move_indices = tf.stack([
        tf.range(tf.shape(self.tfmove)[0], dtype=tf.int32),
        tf.squeeze(self.tfmove, axis=1)
    ], axis=1)
    move_prob = tf.gather_nd(params=self.ANet['move_prob'],
                             indices=move_indices)  # shape=(None,)
    oldmove_prob = tf.gather_nd(params=self.old_ANet['move_prob'],
                                indices=move_indices)  # shape=(None,)

    # actor loss
    self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
    with tf.variable_scope('loss'):
        with tf.variable_scope('surrogate'):
            # Continuous-case alternatives:
            #   ratio = tf.exp(pi.log_prob(a) - oldpi.log_prob(a))  # OpenAI
            #   ratio = pi.prob(a) / oldpi.prob(a)                  # DeepMind
            # Discrete case: compute each head's ratio separately and sum
            # the four losses into the total.
            ratio_action_type = tf.divide(
                action_type_prob, tf.maximum(oldaction_type_prob, 1e-5))
            ratio_my_id = tf.divide(my_id_prob,
                                    tf.maximum(oldmy_id_prob, 1e-5))
            ratio_obj_id = tf.divide(obj_id_prob,
                                     tf.maximum(oldobj_id_prob, 1e-5))
            ratio_move = tf.divide(move_prob,
                                   tf.maximum(oldmove_prob, 1e-5))
            surr_action_type = ratio_action_type * self.tfadv
            surr_my_id = ratio_my_id * self.tfadv
            surr_obj_id = ratio_obj_id * self.tfadv
            surr_move = ratio_move * self.tfadv
        if self.METHOD['name'] == 'kl_pen':
            self.tflam = tf.placeholder(tf.float32, None, 'lambda')
            kl = tf.distributions.kl_divergence(
                self.old_ANet['action_type_prob'],
                self.ANet['action_type_prob'])
            self.kl_mean = tf.reduce_mean(kl)
            with tf.name_scope('aloss'):
                self.aloss = -(tf.reduce_mean(surr_action_type -
                                              self.tflam * kl))
        else:  # clipping method; works better in practice
            self.aloss = -(
                tf.reduce_mean(
                    tf.minimum(
                        surr_action_type,
                        tf.clip_by_value(ratio_action_type,
                                         1. - self.METHOD['epsilon'],
                                         1. + self.METHOD['epsilon']) *
                        self.tfadv)) +
                tf.reduce_mean(
                    tf.minimum(
                        surr_my_id,
                        tf.clip_by_value(ratio_my_id,
                                         1. - self.METHOD['epsilon'],
                                         1. + self.METHOD['epsilon']) *
                        self.tfadv)) +
                tf.reduce_mean(
                    tf.minimum(
                        surr_obj_id,
                        tf.clip_by_value(ratio_obj_id,
                                         1. - self.METHOD['epsilon'],
                                         1. + self.METHOD['epsilon']) *
                        self.tfadv)) +
                tf.reduce_mean(
                    tf.minimum(
                        surr_move,
                        tf.clip_by_value(ratio_move,
                                         1. - self.METHOD['epsilon'],
                                         1. + self.METHOD['epsilon']) *
                        self.tfadv)))

    with tf.variable_scope('atrain'):
        with tf.device(self._device_str1):
            self.atrain_op = tf.train.AdamOptimizer(self.A_LR).minimize(
                self.aloss)

    self.merged = tf.summary.merge_all()
    self.writer = tf.summary.FileWriter("log/tensorflow/", self.sess.graph)

    # initialize all variables
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver()

    # load a saved model if requested
    if load_net:
        self.load_model()
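# The tfdc_r placeholder above expects discounted returns computed outside this
# class; that pipeline is not shown in this excerpt. A common way to produce
# them, bootstrapping from the critic's value of the state after the last step
# (gamma is an assumed hyperparameter here):

import numpy as np

def discounted_returns(rewards, v_final, gamma=0.9):
    # walk the trajectory backwards, accumulating gamma-discounted reward
    running = v_final
    out = []
    for r in reversed(rewards):
        running = r + gamma * running
        out.append(running)
    out.reverse()
    # shape (T, 1) matches the tfdc_r placeholder
    return np.asarray(out, dtype=np.float32).reshape(-1, 1)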