def generate_ner(args) -> None:
    """
    Two steps in total:
        Step 1: run the NER model over each article.
        Step 2: split each article on Chinese sentence-final punctuation.
    Args:
        args:
            --file_root : root path of the data
    """
    file_names = scan_files(args.file_root)  # type: List[str]
    for file in file_names:
        data = load_file(args.file_root, file, "txt")

        # Part 1: compute the entity recognition result for the current article
        prepare_data = prepare(data)  # type: np.ndarray
        result = predict(prepare_data)  # type: np.ndarray
        _, ner_result = decode_result(result=result, sent_pre=prepare_data, sent=data)
        pickle.dump(ner_result, open(args.file_root + file + "_ner.pkl", 'wb'))

        # Part 2: split the current article on sentence-final punctuation
        # (full stop / question mark / exclamation mark) and record the (start, end) offsets
        start, end = 0, 0
        sentence_split_result = []
        stop_tokens = ["。", "!", "?"]
        for idx, c in enumerate(data):
            if c in stop_tokens:
                end = idx
                sentence_split_result.append((start, end))
                start = end + 1
        pickle.dump(sentence_split_result,
                    open(args.file_root + file + "_sentence_split.pkl", 'wb'))
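# Minimal, self-contained sketch (demo data only, not part of the pipeline above) of the
# Part 2 splitting logic: each recorded (start, end) pair marks a sentence whose closing
# punctuation sits at index `end`, so passage[start:end + 1] recovers the full sentence
# including its terminator. `demo_sentence_split` is a hypothetical helper for illustration.
def demo_sentence_split(passage: str):
    stop_tokens = ["。", "!", "?"]
    start, spans = 0, []
    for idx, c in enumerate(passage):
        if c in stop_tokens:
            spans.append((start, idx))
            start = idx + 1
    return spans

# -> [(0, 7), (8, 16), (17, 21)]
print(demo_sentence_split("糖尿病是常见病。患者需要控制血糖!如何预防?"))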
def prepare_data(self):
    with open(self._w2v_file_path, 'w', encoding='utf-8') as f:
        file_names = scan_files(self.root)
        for name in file_names:
            data = load_file(self.root, name, "txt")
            data = " ".join(data)
            f.write(data)
            f.write("\n")
    return
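# Hedged sketch: the corpus file written above (one space-separated article per line) matches
# the format gensim's LineSentence expects, so training embeddings from it could look roughly
# like this. gensim itself, the function name, and the vector_size/window/min_count values are
# illustrative assumptions (gensim >= 4.0 API), not settings taken from this repo.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

def train_w2v_sketch(corpus_path: str, out_path: str) -> None:
    # one whitespace-tokenized sentence per line of the corpus file
    model = Word2Vec(LineSentence(corpus_path), vector_size=100, window=5,
                     min_count=1, workers=4)
    model.wv.save_word2vec_format(out_path, binary=False)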
def put_neg_cells(background, label, size):
    # collect all negative cells for specific label
    neg_cells = []
    sub_paths = neg_cells_path_map[label]
    for sub_path in sub_paths:
        # neg_cells += scan_files(os.path.join(neg_cells_paths[sub_path[0]], sub_path[1]), postfix=".bmp")
        if sub_path[1] == "all":
            sub_neg_cells = scan_files(neg_cells_paths[sub_path[0]], postfix=".bmp")
        else:
            sub_neg_cells = scan_files(os.path.join(neg_cells_paths[sub_path[0]], sub_path[1]), postfix=".bmp")
        neg_cells += random.sample(sub_neg_cells, min(sub_path[2], len(sub_neg_cells)))

    # get number and names of negative cells
    neg_cells_cnt = random.randint(neg_cells_num[0], neg_cells_num[1])
    neg_cells_for_patch = random.sample(neg_cells, min(neg_cells_cnt, len(neg_cells)))
    # print("total", len(neg_cells), "choose", len(neg_cells_for_patch))

    # get possible cell positions in background
    neg_cells_possible = []
    dets = []
    for neg_cell in neg_cells_for_patch:
        # tokens = re.findall(r"\d+", neg_cell)  # should follow ..._w123_h234.bmp format
        # neg_w, neg_h = math.ceil(int(tokens[-2])/2), math.ceil(int(tokens[-1])/2)
        neg_img = cv2.imread(neg_cell)
        neg_h, neg_w, _ = neg_img.shape
        neg_x = random.randint(0, size - neg_w)
        neg_y = random.randint(0, size - neg_h)
        neg_cells_possible.append([neg_cell, (neg_x, neg_y, neg_w, neg_h)])
        dets.append([neg_x, neg_y, neg_x + neg_w, neg_y + neg_h, 1])

    # remove overlapping cells
    keep = py_cpu_nms(np.array(dets), thresh=0.1)
    neg_cells_ready = [neg_cells_possible[i] for i in keep]

    # put cells on background
    for neg_cell in neg_cells_ready:
        neg_img = cv2.imread(neg_cell[0])
        # neg_img = cv2.pyrDown(neg_img)
        neg_x, neg_y, neg_w, neg_h = neg_cell[1]
        background[neg_y:neg_y + neg_h, neg_x:neg_x + neg_w, :] = neg_img

    return background
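# Self-contained illustration of the overlap filtering above: py_cpu_nms is assumed here to be
# the standard greedy IoU-based non-maximum suppression, and `nms_sketch` is an equivalent
# written out for clarity (boxes are [x1, y1, x2, y2, score] rows); it is not the repo's code.
import numpy as np

def nms_sketch(dets: np.ndarray, thresh: float) -> list:
    x1, y1, x2, y2, scores = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3], dets[:, 4]
    areas = (x2 - x1) * (y2 - y1)
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(int(i))
        # intersection of the kept box with the remaining boxes
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        # drop boxes whose IoU with the kept box exceeds the threshold
        order = order[1:][iou <= thresh]
    return keep

# two heavily overlapping boxes and one separate box -> only two survive
demo = np.array([[0, 0, 50, 50, 1], [5, 5, 55, 55, 1], [100, 100, 150, 150, 1]], dtype=float)
print(nms_sketch(demo, thresh=0.1))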
def scan_param_files(path):
    if not path.endswith('/'):
        path = path + '/'
    model_files = scan_files(path, r'dict_model(.*)\.py', raise_not_found_error=False)
    optimizer_files = scan_files(path, r'dict_optimizer(.*)\.py', raise_not_found_error=False)
    trainer_files = scan_files(path, r'dict_trainer(.*)\.py', raise_not_found_error=False)
    data_loader_files = scan_files(path, r'dict_data_loader(.*)\.py', raise_not_found_error=False)
    config_files = scan_files(path, r'config(.*)\.py', raise_not_found_error=False)
    '''
    if raise_not_found_error:
        # raise error if did not find any param dict
        if len(model_files) == 0:
            raise Exception('No available model param dict in %s' % str(path))
        if len(optimizer_files) == 0:
            raise Exception('No available optimizer param dict in %s' % str(path))
        if len(trainer_files) == 0:
            raise Exception('No available trainer param dict in %s' % str(path))
        if len(data_loader_files) == 0:
            raise Exception('No available data_loader param dict in %s' % str(path))
    '''
    return {
        'model_files': model_files,
        'optimizer_files': optimizer_files,
        'trainer_files': trainer_files,
        'data_loader_files': data_loader_files,
        'config_files': config_files
    }
def put_cells(pos_cells_path, save_path, postfix=".bmp"):
    os.makedirs(save_path, exist_ok=True)

    files = scan_files(pos_cells_path, postfix=postfix)
    print("# files:", len(files))

    executor = ProcessPoolExecutor(max_workers=cpu_count())
    tasks = []

    batch_size = 100
    for i in range(0, len(files), batch_size):
        batch = files[i : i + batch_size]
        tasks.append(executor.submit(batch_put_cell, batch, save_path))

    job_count = len(tasks)
    for future in as_completed(tasks):
        # result = future.result()  # get the returning result from the called function
        job_count -= 1
        print("One Job Done, Remaining Job Count: %s" % (job_count))
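# Minimal, self-contained sketch of the batching pattern used above: split the work list into
# fixed-size batches, submit one job per batch to a process pool, and count jobs down as they
# complete. `square_batch` and `run_batched` are stand-ins for illustration, not repo functions.
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import cpu_count

def square_batch(batch):
    # dummy worker standing in for batch_put_cell
    return [x * x for x in batch]

def run_batched(items, batch_size=100):
    results = []
    with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
        tasks = [executor.submit(square_batch, items[i:i + batch_size])
                 for i in range(0, len(items), batch_size)]
        job_count = len(tasks)
        for future in as_completed(tasks):
            results.extend(future.result())
            job_count -= 1
            print("One Job Done, Remaining Job Count: %s" % job_count)
    return results

if __name__ == "__main__":
    print(len(run_batched(list(range(1000)))))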
def get_background(cell_name, useback, size):
    if useback == "white":
        # use uint8 255s so "white" matches the 0-255 range of the cv2-read cell images
        background = np.ones((size, size, 3), dtype=np.uint8) * 255
    elif useback == "black":
        background = np.zeros((size, size, 3), dtype=np.uint8)
    elif useback == "positive":
        # use negative images from positive wsis
        neg_files = []
        for sub_dir in os.listdir(neg_background_path):
            if os.path.basename(cell_name).startswith(sub_dir):
                neg_files = scan_files(os.path.join(neg_background_path, sub_dir), postfix=".bmp")
                break
        if len(neg_files) >= 1:
            neg_randf = random.sample(neg_files, 1)[0]
            background = cv2.imread(neg_randf)
        else:
            background = np.zeros((size, size, 3), dtype=np.uint8)
    else:
        background = np.zeros((size, size, 3), dtype=np.uint8)
    return background
# cut tif file to 608x608 images. For each tif, it will generate a folder to put 608 images ######################
print(colorama.Fore.GREEN + "[INFO] cut 608 images from tif file" + colorama.Fore.WHITE)
os.makedirs(output_tif_608s, exist_ok=True)
tif_name_ext = get_unrunned_tif(input_tif_files, save_path)
if len(tif_name_ext) == 0:
    print(colorama.Fore.RED + "[INFO] data processing finished" + colorama.Fore.WHITE)
    sys.exit()
asap_to_image(os.path.join(input_tif_files, tif_name_ext), output_tif_608s)

# get the list of 608 image full pathnames ########################################################################
tif_name = os.path.splitext(tif_name_ext)[0]
image_path = os.path.join(output_tif_608s, tif_name)
images = scan_files(image_path)

# generate txt file for current tif ###############################################################################
print(colorama.Fore.GREEN + "[INFO] generate txt for " + tif_name + colorama.Fore.WHITE)
gen_txt_for_dir(images, output_tif_608s, tif_name)

# run darknet test ################################################################################################
darknet_path = darknet_dir
segment(darknet_path, image_path)
os.remove(image_path + ".txt")

# evaluate predictions and convert coordinates to xmls ############################################################
print(colorama.Fore.GREEN + "[INFO] evaluate predictions and write coordinates into xmls" + colorama.Fore.WHITE)
def generate_tuples(self, data_dir: str):
    """
    Generate tuples from the source data, using multiple threads.
    Args:
        data_dir: path of the stored data, which includes, e.g.:
            source article          __ data/round2/0.txt
            NER result              __ data/round2/0_ner.pkl
            sentence split result   __ data/round2/0_sentence_split.pkl
    """
    # Step 1: load word2idx and emb_matrix
    self.config.load_word2idx_embmatrix()

    # Step 2: generate candidate relation pairs
    instances = list()
    file_names = scan_files(data_dir)
    for file in file_names:
        passage = load_file(data_dir, file, "txt")  # type: str
        sent_split = pickle.load(
            open(data_dir + file + "_sentence_split.pkl", "rb"))  # type: List[tuple]
        ner_result = pickle.load(open(data_dir + file + "_ner.pkl", "rb"))  # type: List[tuple]
        sent_split.sort(key=lambda x: x[0])

        # Step 2.1: collect the entities of type e1 and of type e2
        e1_entities, e2_entities = list(), list()
        for e in ner_result:
            # e is a 4-tuple, e.g. ('Disease', 1, 10, '糖尿病下肢动脉病变')
            if e[0] == self.config.e1_type:
                e1_entities.append(e)
            elif e[0] == self.config.e2_type:
                e2_entities.append(e)
        e1_entities.sort(key=lambda x: x[1])
        e2_entities.sort(key=lambda x: x[1])

        # Step 2.2: for each e1, find candidate e2 entities and build the tuple
        # <BEF, BET, AFT, sequence_tag>
        for e1 in e1_entities:
            e1_start, e1_end = e1[1], e1[2]
            cur_sentence_idx = -1
            for idx, s in enumerate(sent_split):
                if s[0] <= e1_start and s[1] >= e1_end:
                    cur_sentence_idx = idx
                    break

            # the search window for e2 is derived from the current entity's position:
            # previous sentence + current sentence + next sentence
            search_e2_start = sent_split[
                cur_sentence_idx - 1 if cur_sentence_idx > 1 else 0][0]
            search_e2_end = sent_split[cur_sentence_idx + 1 if cur_sentence_idx < len(sent_split) - 1 \
                else len(sent_split) - 1][1]

            for i in range(len(e2_entities)):
                e2 = e2_entities[i]
                e2_start = e2[1]
                e2_end = e2[2]
                if e2_end < search_e2_start:
                    continue
                elif e2_start > search_e2_end:
                    break
                elif e2_start >= search_e2_start and e2_end <= search_e2_end:
                    if e1_end == e2_start:
                        # Case (1): e1 precedes e2 and they are adjacent
                        before = passage[search_e2_start:e1_start]
                        between = ""
                        after = passage[e2_end:search_e2_end]
                        t = Tuple(e1[3], e2[3], sequence_tag=True, before=before,
                                  between=between, after=after, config=self.config)
                        instances.append(t)
                    elif e2_end == e1_start:
                        # Case (2): e1 follows e2 and they are adjacent
                        before = passage[search_e2_start:e2_start]
                        between = ""
                        after = passage[e1_end:search_e2_end]
                        t = Tuple(e1[3], e2[3], sequence_tag=False, before=before,
                                  between=between, after=after, config=self.config)
                        instances.append(t)
                    elif e1_end < e2_start:
                        # Case (3): e1 precedes e2, not adjacent
                        before = passage[search_e2_start:e1_start]
                        between = passage[e1_end:e2_start]
                        after = passage[e2_end:search_e2_end]
                        t = Tuple(e1[3], e2[3], sequence_tag=True, before=before,
                                  between=between, after=after, config=self.config)
                        instances.append(t)
                    elif e2_end < e1_start:
                        # Case (4): e1 follows e2, not adjacent
                        before = passage[search_e2_start:e2_start]
                        between = passage[e2_end:e1_start]
                        after = passage[e1_end:search_e2_end]
                        t = Tuple(e1[3], e2[3], sequence_tag=False, before=before,
                                  between=between, after=after, config=self.config)
                        instances.append(t)

    # Step 3: persist to disk
    pickle.dump(
        instances,
        open("./saved_model_files/RE_candidate_instances.pkl", "wb"))
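# Self-contained sketch (demo offsets only) of the Step 2.2 search window: given the sorted
# (start, end) sentence offsets, the window for e2 runs from the start of the previous sentence
# to the end of the next one, clamped at the passage boundaries. `e2_search_window` is a
# hypothetical helper that mirrors the two index expressions above, written out for clarity.
def e2_search_window(sent_split, cur_sentence_idx):
    lo = sent_split[cur_sentence_idx - 1 if cur_sentence_idx > 1 else 0][0]
    hi = sent_split[cur_sentence_idx + 1
                    if cur_sentence_idx < len(sent_split) - 1
                    else len(sent_split) - 1][1]
    return lo, hi

# three sentences covering offsets 0-9, 10-24 and 25-39
spans = [(0, 9), (10, 24), (25, 39)]
assert e2_search_window(spans, 0) == (0, 24)   # first sentence: current + next
assert e2_search_window(spans, 1) == (0, 39)   # middle sentence: previous + current + next
assert e2_search_window(spans, 2) == (10, 39)  # last sentence: previous + current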