def ceshi_run(stm_dict):
    """Test helper: run BioBERT NER on an existing tmVar 2.0 output file."""
    # output_tmvar2 = 'tmVarJava/output/ceshi.txt.PubTator'
    output_tmvar2 = 'tmVarJava/output/29848cb18c2db29141bae9c5f7cc97b1d5175f4960eed341cca78cd9.PubTator.PubTator'
    dict_list = pubtator2dict_list(output_tmvar2, is_raw_text=True)
    is_raw_text, cur_thread_name = True, threading.current_thread().getName()

    ner_start_time = time.time()
    tagged_docs, num_entities = biobert_recognize(stm_dict, dict_list,
                                                  is_raw_text, cur_thread_name)
    ner_time = time.time() - ner_start_time
    print(datetime.now().strftime(stm_dict['time_format']),
          '[%s] NER %.3f sec, #entities: %d'
          % (cur_thread_name, ner_time, num_entities))
    return tagged_docs, num_entities
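# Hypothetical smoke test for ceshi_run (not part of the original code).
# ceshi_run itself only reads 'time_format' from stm_dict, but
# biobert_recognize also needs the model state that the BERN server loads
# into stm_dict at startup, so this sketch only works inside a fully
# initialized server process; the format string below is an assumption.
#
#     stm_dict['time_format'] = '[%d/%b/%Y %H:%M:%S.%f]'
#     tagged_docs, num_entities = ceshi_run(stm_dict)
#     print('#entities:', num_entities)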
def tag_entities(self, text, cur_thread_name, is_raw_text, reuse=False):
    assert self.stm_dict is not None

    n_ascii_letters = 0
    for l in text:
        if l not in string.ascii_letters:
            continue
        n_ascii_letters += 1
    if n_ascii_letters == 0:
        text = 'No ascii letters. Please enter your text in English.'

    text_hash = hashlib.sha224(text.encode('utf-8')).hexdigest()
    print(datetime.now().strftime(self.stm_dict['time_format']),
          '[{}] text_hash: {}'.format(cur_thread_name, text_hash))

    bern_output_path = './output/bern_demo_{}.json'.format(text_hash)
    if reuse and os.path.exists(bern_output_path):
        print(datetime.now().strftime(self.stm_dict['time_format']),
              '[{}] Found prev. output'.format(cur_thread_name))
        with open(bern_output_path, 'r', encoding='utf-8') as f_out:
            return json.load(f_out)

    home_gnormplus = self.stm_dict['gnormplus_home']
    input_gnormplus = os.path.join(home_gnormplus, 'input',
                                   '{}.PubTator'.format(text_hash))
    output_gnormplus = os.path.join(home_gnormplus, 'output',
                                    '{}.PubTator'.format(text_hash))

    home_tmvar2 = self.stm_dict['tmvar2_home']
    input_dir_tmvar2 = os.path.join(home_tmvar2, 'input')
    input_tmvar2 = os.path.join(input_dir_tmvar2,
                                '{}.PubTator'.format(text_hash))
    output_tmvar2 = os.path.join(home_tmvar2, 'output',
                                 '{}.PubTator.PubTator'.format(text_hash))

    # Write input str to a .PubTator format file
    with open(input_gnormplus, 'w', encoding='utf-8') as f:
        # only title
        f.write(text_hash + '|t|')
        f.write('\n')
        f.write(text_hash + '|a|' + text + '\n\n')

    # Run GNormPlus
    gnormplus_start_time = time.time()
    tell_inputfile(self.stm_dict['gnormplus_host'],
                   self.stm_dict['gnormplus_port'],
                   '{}.PubTator'.format(text_hash))
    print(datetime.now().strftime(self.stm_dict['time_format']),
          '[{}] GNormPlus {:.3f} sec'.format(
              cur_thread_name, time.time() - gnormplus_start_time))

    # Move a GNormPlus output file to the tmVar2 input directory
    shutil.move(output_gnormplus, input_tmvar2)

    # Run tmVar 2.0
    tmvar2_start_time = time.time()
    tell_inputfile(self.stm_dict['tmvar2_host'],
                   self.stm_dict['tmvar2_port'],
                   '{}.PubTator'.format(text_hash))
    print(datetime.now().strftime(self.stm_dict['time_format']),
          '[{}] tmVar 2.0 {:.3f} sec'.format(
              cur_thread_name, time.time() - tmvar2_start_time))

    # Convert tmVar 2.0 outputs (?.PubTator.PubTator) to python dict
    dict_list = pubtator2dict_list(output_tmvar2, is_raw_text=True)

    # Delete temp files
    os.remove(input_gnormplus)
    os.remove(input_tmvar2)
    os.remove(output_tmvar2)

    # error
    if type(dict_list) is str:
        print(dict_list)
        return None

    # Run BioBERT of Lee et al., 2019
    start_time = time.time()
    tagged_docs, num_entities = \
        self.biobert_recognize(dict_list, is_raw_text, cur_thread_name)
    if tagged_docs is None:
        return None

    assert len(tagged_docs) == 1
    print(datetime.now().strftime(self.stm_dict['time_format']),
          '[%s] NER %.3f sec, #entities: %d'
          % (cur_thread_name, time.time() - start_time, num_entities))

    # Normalization models
    if num_entities > 0:
        # print(datetime.now().strftime(time_format),
        #       '[{}] Normalization models..'.format(cur_thread_name))
        tagged_docs = self.normalizer.normalize(text_hash, tagged_docs,
                                                cur_thread_name,
                                                is_raw_text=is_raw_text)

    # Convert to PubAnnotation JSON
    tagged_docs[0] = get_pub_annotation(tagged_docs[0],
                                        is_raw_text=is_raw_text)

    # Save a BERN result
    with open(bern_output_path, 'w', encoding='utf-8') as f_out:
        json.dump(tagged_docs[0], f_out)

    return tagged_docs[0]
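# A minimal sketch (hypothetical helper, not in the original class) of the
# .PubTator input record that tag_entities writes for GNormPlus above: an
# empty title line plus the raw text in the abstract slot, both keyed by the
# sha224 hash of the text. Useful for inspecting the intermediate format by
# hand; the output directory argument is illustrative only.
def _write_pubtator_input_sketch(self, text, out_dir='.'):
    text_hash = hashlib.sha224(text.encode('utf-8')).hexdigest()
    path = os.path.join(out_dir, '{}.PubTator'.format(text_hash))
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text_hash + '|t|' + '\n')           # title line left empty
        f.write(text_hash + '|a|' + text + '\n\n')  # text goes into the abstract slot
    return path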
def tag_entities(self, cur_thread_name, is_raw_text, reuse=False):
    assert self.stm_dict is not None
    get_start_t = time.time()
    elapsed_time_dict = dict()

    home_gnormplus = self.stm_dict['gnormplus_home']
    input_gnormplus = os.path.join(home_gnormplus, 'input')
    output_gnormplus = os.path.join(home_gnormplus, 'output')

    home_tmvar2 = self.stm_dict['tmvar2_home']
    input_dir_tmvar2 = os.path.join(home_tmvar2, 'input')
    input_tmvar2 = os.path.join(input_dir_tmvar2)
    output_tmvar2 = os.path.join(home_tmvar2, 'output')

    # Run GNormPlus
    gnormplus_start_time = time.time()
    # This will definitely need to be changed ================================
    shell_script = '''cd GNormPlusJava;java -Xmx12G -Xms12G -jar GNormPlus.jar input output setup.txt;cd -;'''
    # % (input_gnormplus, output_gnormplus)
    print(shell_script)
    os.system(shell_script)
    gnormplus_time = time.time() - gnormplus_start_time
    elapsed_time_dict['gnormplus'] = round(gnormplus_time, 3)
    print(datetime.now().strftime(self.stm_dict['time_format']),
          '[{}] GNormPlus {:.3f} sec'.format(cur_thread_name, gnormplus_time))

    # The GNormPlus output is used as the tmVar input; both paths are in fact
    # already hard-coded in the shell commands above and below.
    input_tmvar2 = output_gnormplus

    # Run tmVar 2.0
    tmvar2_start_time = time.time()
    # This will definitely need to be changed ================================
    shell_script = '''cd tmVarJava; java -Xmx12G -Xms12G -jar tmVar.jar ../GNormPlusJava/input output; cd -;'''
    # % (input_tmvar2, output_tmvar2)
    os.system(shell_script)
    tmvar2_time = time.time() - tmvar2_start_time
    elapsed_time_dict['tmvar2'] = round(tmvar2_time, 3)
    print(datetime.now().strftime(self.stm_dict['time_format']),
          '[{}] tmVar 2.0 {:.3f} sec'.format(cur_thread_name, tmvar2_time))

    # Convert tmVar 2.0 outputs (?.PubTator.PubTator) to python dicts
    file_list = glob.glob(output_tmvar2 + "/*.PubTator.PubTator")
    dict_list = [
        pubtator2dict_list(i, is_raw_text=True) for i in file_list
    ]
    # At this point all pipeline results have been collected in dict_list.

    # Run BioBERT of Lee et al., 2019
    ner_start_time = time.time()
    # Setting this to False raises an error downstream.
    is_raw_text = True
    tagged_docs_list = []
    for dict_l in dict_list:
        tagged_docs, num_entities = \
            biobert_recognize(self.stm_dict, dict_l, is_raw_text,
                              cur_thread_name)
        tagged_docs_list.append((tagged_docs, num_entities))
    if not tagged_docs_list:
        return None
    ner_time = time.time() - ner_start_time
    elapsed_time_dict['ner'] = round(ner_time, 3)
    print(datetime.now().strftime(self.stm_dict['time_format']),
          '[%s] NER %.3f sec, #entities: %d'
          % (cur_thread_name, ner_time, num_entities))

    # print(tagged_docs_list)
    # with open("ceshi_normllllll.list", 'w') as ceshide:
    #     ceshide.write(json.dumps(tagged_docs_list))
    # return
    # for tagged_docs in tagged_docs_list:
    #     if tagged_docs is None:
    #         return None
    #     assert len(tagged_docs) == 1

    # Normalization models
    # The three Python scripts and the two jars started by load_dicts.sh must
    # all be running, otherwise normalization fails.
    os.system('sh load_dicts.sh')
    # The services need time to start up, so wait one minute.
    time.sleep(60)
    normalization_time = 0.
    new_tagged_docs_list = []
    for tagged_docs, num_entities in tagged_docs_list:
        if tagged_docs is None:
            continue
        text_hash = tagged_docs[0]['pmid']

        if num_entities > 0:
            normalization_start_time = time.time()
            tagged_docs = self.normalizer.normalize(
                text_hash, tagged_docs, cur_thread_name,
                is_raw_text=is_raw_text)
            normalization_time = time.time() - normalization_start_time
        elapsed_time_dict['normalization'] = round(normalization_time, 3)

        # Convert to PubAnnotation JSON
        elapsed_time_dict['total'] = round(time.time() - get_start_t, 3)
        tagged_docs[0] = get_pub_annotation(
            tagged_docs[0], is_raw_text=is_raw_text,
            elapsed_time_dict=elapsed_time_dict)
        new_tagged_docs_list.append(tagged_docs[0])

        # Save a BERN result
        bern_output_path = './output/bern_demo_{}.json'.format(text_hash)
        if reuse and os.path.exists(bern_output_path):
            print(datetime.now().strftime(self.stm_dict['time_format']),
                  '[{}] Found prev. output'.format(cur_thread_name))
            with open(bern_output_path, 'r', encoding='utf-8') as f_out:
                return json.load(f_out)
        with open(bern_output_path, 'w', encoding='utf-8') as f_out:
            json.dump(tagged_docs[0], f_out, sort_keys=True)

    # Return the PubAnnotation results for all processed documents.
    return new_tagged_docs_list
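# Hypothetical alternative (not in the original code) to the fixed
# time.sleep(60) used above while the load_dicts.sh services start: poll a
# service port until it accepts TCP connections. Host and port values are
# placeholders and would have to match this deployment's normalizer
# configuration.
def _wait_for_port(self, host, port, timeout=120.0, interval=1.0):
    """Return True once (host, port) accepts a TCP connection, False on timeout."""
    import socket
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with socket.create_connection((host, port), timeout=interval):
                return True
        except OSError:
            time.sleep(interval)
    return False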