def main(_):
    """Run the BioBERT NER model over a fixed list of PubMed articles.

    Fetches the hard-coded PMIDs through ``pubtator_biocxml2dict_list``,
    dumps every returned document to ``<data dir>/<pmid>.json`` and passes
    it to ``BioBERT.recognize``. Profiling data is printed at the end.

    Args:
        _: Unused positional argument supplied by the caller.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = "0"

    # Function-scope imports, as in the original debugging entry point.
    import json

    from convert import pubtator_biocxml2dict_list

    biobert = BioBERT(FLAGS)
    try:
        dl = pubtator_biocxml2dict_list(
            [26658955, 24189420, 22579007, 29185436])

        # do_GET treats a None result and a str result of this helper as
        # tmtool errors; iterating over either here would fail with an
        # unrelated TypeError, so report it and stop instead.
        if dl is None or isinstance(dl, str):
            print('tmtool error:', 'no response' if dl is None else dl)
            return

        for d in dl:
            print(d['pmid'])
            with open('/media/donghyeon/f7c53837-2156-4793-b2b1-4b0578dffef1'
                      '/biobert/BioBert_NER/BioBERTNER/data/'
                      + d['pmid'] + '.json', 'w', encoding='utf-8') as f_out:
                json.dump(d, f_out)
            biobert.recognize([d])
    finally:
        # Always release the model, even when a document fails midway.
        biobert.close()

    show_prof_data()
def do_GET(self):
    """Handle ``GET /?pmid=<id>[,<id>...][&format=<fmt>][&indent=true]``.

    Query parameters:
        pmid:   required, comma-separated list of PMIDs; at most
                ``self.stm_dict['n_pmid_limit']`` entries are accepted.
        format: optional, one of ``self.stm_dict['available_formats']``;
                the first available format is the default and is also
                used when an unknown format is requested.
        indent: optional; ``true`` (case-insensitive) pretty-prints the
                JSON output with a 4-space indent and sorted keys.

    Pipeline: fetch documents with ``pubtator_biocxml2dict_list``, tag them
    with ``self.biobert_recognize``, normalize the entities with
    ``self.normalizer`` (only when at least one entity was found) and write
    the result to ``self.wfile`` in the requested format.

    Request-validation failures are answered with a plain-text dump of the
    request values followed by the error message. Pipeline failures are
    answered with a single error document in the requested format.
    """
    get_start_t = time.time()
    parsed_path = urlparse.urlparse(self.path)
    cur_thread_name = threading.current_thread().name

    message = '\n'.join([
        'CLIENT VALUES:',
        'client_address=%s (%s)' % (self.client_address,
                                    self.address_string()),
        'command=%s' % self.command,
        'path=%s' % self.path,
        'real path=%s' % parsed_path.path,
        'query=%s' % parsed_path.query,
        'request_version=%s' % self.request_version,
        '',
        'SERVER VALUES:',
        'server_version=%s' % self.server_version,
        'sys_version=%s' % self.sys_version,
        'protocol_version=%s' % self.protocol_version,
        'thread_name=%s' % cur_thread_name,
    ])

    # NOTE(review): status 200 is sent before any validation, so invalid
    # requests are also answered with 200. Kept as-is for existing clients.
    self.send_response(200)
    self.end_headers()

    elapsed_time_dict = dict()
    time_format = self.stm_dict['time_format']
    available_formats = self.stm_dict['available_formats']

    def log(text):
        # Timestamped, thread-tagged line on stdout.
        print(datetime.now().strftime(time_format),
              '[' + cur_thread_name + ']', text)

    def reject(err_msg):
        # Validation failure: log it and reply with the request dump
        # followed by the error message.
        log(err_msg)
        self.wfile.write((message + '\n' + err_msg).encode('utf-8'))

    # urlparse() yields '' (never None) when the URL has no query string.
    if not parsed_path.query:
        reject('No url query')
        return

    qs_dict = urlparse.parse_qs(parsed_path.query)

    if not qs_dict.get('pmid'):
        reject('No pmid param')
        return

    pmid_list = qs_dict['pmid'][0].split(',')
    if len(pmid_list) > self.stm_dict['n_pmid_limit']:
        reject('Too many (> {}) pmids: {}'.format(
            self.stm_dict['n_pmid_limit'], len(pmid_list)))
        return

    out_format = available_formats[0]
    requested_format = qs_dict.get('format')
    if requested_format:
        if requested_format[0] in available_formats:
            out_format = requested_format[0]
        else:
            print('Unavailable format', requested_format[0])

    indent = None
    requested_indent = qs_dict.get('indent')
    if requested_indent and requested_indent[0].lower() == 'true':
        indent = 4

    # Hash of the raw pmid parameter; passed to the normalizer below.
    # (Re-using previous outputs keyed by this hash is currently disabled.)
    text_hash = \
        hashlib.sha224(qs_dict['pmid'][0].encode('utf-8')).hexdigest()
    log('text_hash: {}'.format(text_hash))

    is_raw_text = False

    def write_error(abstract):
        # Pipeline failure: reply with one error document in out_format.
        error_dict = self.get_err_dict()
        error_dict['pmid'] = pmid_list[0] if len(pmid_list) == 1 else ''
        error_dict['abstract'] = abstract
        log(abstract)
        if out_format == available_formats[0]:
            self.wfile.write(
                json.dumps([
                    get_pub_annotation(error_dict,
                                       is_raw_text=is_raw_text)
                ], indent=indent,
                    sort_keys=indent is not None).encode('utf-8'))
        elif out_format == available_formats[1]:
            self.wfile.write(get_pubtator([error_dict]).encode('utf-8'))

    tmtool_start_t = time.time()
    dict_list = pubtator_biocxml2dict_list(pmid_list)
    tmtool_time = time.time() - tmtool_start_t
    elapsed_time_dict['tmtool'] = round(tmtool_time, 3)

    if dict_list is None:
        write_error('error: tmtool: no response')
        return
    if isinstance(dict_list, str):
        # A string result carries an error message instead of documents.
        if 'currently unavailable' in dict_list:
            abstract = 'error: tmtool: currently unavailable'
        elif 'invalid version format' in dict_list:
            abstract = 'error: tmtool: invalid version format'
        else:
            abstract = 'error: tmtool: {}'.format(
                dict_list.replace('\n', ''))
        write_error(abstract)
        return

    log('tmTool: PubMed & GNormPlus & tmVar {:.3f} sec'.format(tmtool_time))

    # Run BioBERT NER models of Lee et al., 2019
    ner_start_time = time.time()
    tagged_docs, num_entities = \
        self.biobert_recognize(dict_list, is_raw_text, cur_thread_name)
    ner_time = time.time() - ner_start_time
    elapsed_time_dict['ner'] = round(ner_time, 3)

    if tagged_docs is None:
        write_error('error: BioBERT NER, out of index range')
        return

    log('NER %.3f sec, #entities: %d, #articles: %d'
        % (ner_time, num_entities, len(tagged_docs)))

    # Normalization models (skipped when nothing was tagged)
    normalization_time = 0.
    if num_entities > 0:
        normalization_start_time = time.time()
        tagged_docs = self.normalizer.normalize(text_hash, tagged_docs,
                                                cur_thread_name,
                                                is_raw_text=is_raw_text)
        normalization_time = time.time() - normalization_start_time
    elapsed_time_dict['normalization'] = round(normalization_time, 3)

    # Apply output format
    if out_format == available_formats[0]:
        # PubAnnotation JSON
        elapsed_time_dict['total'] = round(time.time() - get_start_t, 3)
        pubannotation_res = [
            get_pub_annotation(d, is_raw_text=is_raw_text,
                               elapsed_time_dict=elapsed_time_dict)
            for d in tagged_docs
        ]
        self.wfile.write(
            json.dumps(pubannotation_res, indent=indent,
                       sort_keys=indent is not None).encode('utf-8'))
    elif out_format == available_formats[1]:
        # PubTator
        self.wfile.write(get_pubtator(tagged_docs).encode('utf-8'))

    log('Done. Total {:.3f} sec\n'.format(time.time() - get_start_t))
    return
t = t.replace('\r', '') t = t.replace('\n', ' ') t = t.replace('\t', ' ') t = t.replace(' \u2028 ', ' ') t = t.replace('\u2029', ' ') return t if __name__ == '__main__': from convert import pubtator_biocxml2dict_list # testpmids = [25681199, 29446767] testpmids = [29446767, 25681199] # print(query_pubtator2([29446767,25681199])) # print(query_pubtator2([25681199])) print(pubtator_biocxml2dict_list(testpmids)) import sys sys.exit(0) get_pmc_archive( os.path.expanduser('~') + '/bestplus/pmc/oa_non_comm_use_pdf.txt', os.path.expanduser('~') + '/bestplus/pmc/non_comm', os.path.expanduser('~') + '/bestplus/pmc/oa_comm_use_file_list.txt', os.path.expanduser('~') + '/bestplus/pmc/comm' ) from convert import pubtatorstr2dict_list # raw texts & demo bern_raw_res = query_raw_bern('Results We identified five activating mutations in the PIK3CA gene in affected tissues from 6 of the 9 patients studied; one of the variants (NM_006218.2:c.248T>C; p.Phe83Ser) has not been previously described in developmental disorders.')