import os
from collections import defaultdict

import read     # project-local I/O helpers (readfrom_txt, readfrom_json, savein_json, ...)
import process  # project-local text/XML preprocessing helpers


def evaluate(xml_path, output_pred_path, raw_data_path, doc_list, output_format):
    # Compare predicted Anafora tags against gold tags and report corpus-level
    # counts. `get_gold_dict` and `calculate_score` are defined elsewhere.
    gold_count = 0
    pred_count = 0
    true_count = 0
    print('xml_path: %s' % xml_path)
    print('doc_list: %s' % len(doc_list))
    for file_id in range(len(doc_list)):
        gold_tag_path = os.path.join(xml_path, doc_list[file_id] + "_tag")
        print('path: %s' % gold_tag_path)
        print('path exists: %s' % os.path.exists(gold_tag_path + ".txt"))
        if os.path.exists(gold_tag_path + ".txt"):
            gold_tag_dict = get_gold_dict(read.readfrom_json(gold_tag_path))
            output_path = os.path.join(output_pred_path, doc_list[file_id],
                                       doc_list[file_id] + output_format)
            raw_text_path = os.path.join(raw_data_path, doc_list[file_id])
            pre_tag_dict = process.extract_xmltag_anafora_pred(
                output_path, read.readfrom_txt(raw_text_path))
            scores = calculate_score(gold_tag_dict, pre_tag_dict)
            gold_count += scores[0]
            pred_count += scores[1]
            true_count += scores[2]
    metrics(true_count, pred_count, gold_count)
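# `metrics` is not defined in this file; the sketch below is a minimal,
# hypothetical implementation assuming it computes standard precision,
# recall, and F1 from the aggregated gold/predicted/true-positive counts:
def metrics(true_count, pred_count, gold_count):
    precision = true_count / pred_count if pred_count else 0.0
    recall = true_count / gold_count if gold_count else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    print('precision: %.4f  recall: %.4f  f1: %.4f' % (precision, recall, f1))
    return precision, recall, f1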
def document_level_2_sentence_level(file_dir, raw_data_path, preprocessed_path,
                                    xml_path, file_format):
    # Split each document into sentences, save the sentence spans (and the
    # gold tags when an XML directory is given), and record sentence lengths.
    max_len_all = list()
    char_vocab = defaultdict(float)
    for data_id in range(0, len(file_dir)):
        raw_text_path = os.path.join(raw_data_path, file_dir[data_id],
                                     file_dir[data_id])
        preprocessed_file_path = os.path.join(preprocessed_path,
                                              file_dir[data_id],
                                              file_dir[data_id])
        raw_text = read.readfrom_txt(raw_text_path)
        raw_text = process.text_normalize(raw_text)
        sent_span_list_file, max_len_file, char_vocab = split_by_sentence(
            raw_text, char_vocab)
        max_len_all += max_len_file
        read.savein_json(preprocessed_file_path + "_sent", sent_span_list_file)
        if xml_path != "":
            xml_file_path = os.path.join(xml_path, file_dir[data_id],
                                         file_dir[data_id] + file_format)
            posi_info_dict = process.extract_xmltag_anafora(xml_file_path,
                                                            raw_text)
            sent_tag_list_file = xml_tag_in_sentence(sent_span_list_file,
                                                     posi_info_dict)
            read.savein_json(preprocessed_file_path + "_tag",
                             sent_tag_list_file)
    max_len_all.sort(reverse=True)
    max_len_file_name = "/".join(
        preprocessed_path.split('/')[:-1]) + "/max_len_sent"
    read.savein_json(max_len_file_name, max_len_all)
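def _example_run():
    # Hypothetical usage of document_level_2_sentence_level; the document IDs,
    # directory paths, and XML suffix below are illustrative assumptions, not
    # taken from the original code.
    doc_ids = ["doc_0001", "doc_0002"]
    document_level_2_sentence_level(
        doc_ids,
        raw_data_path="data/raw",
        preprocessed_path="data/preprocessed",
        xml_path="data/anafora_xml",  # pass "" to skip gold-tag extraction
        file_format=".TimeNorm.gold.completed.xml")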
def textfile2list_twa_st():
    # Map each TwADR-L CUI to its semantic-type field from MRSTY.RRF.
    # `file_path_st` and `twa_cuis` are module-level globals defined elsewhere.
    data = read.readfrom_txt(file_path_st)
    cui_st = {}
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in twa_cuis:
            cui_st[line[0]] = line[2]
    read.save_in_json("data/TwADR-L/cui_st_dict", cui_st)
def document_level_2_sentence_level(file_dir, raw_data_path, preprocessed_path,
                                    xml_path, file_format):
    # Variant that additionally extracts POS, word, and Unicode-category
    # features per sentence and saves each as its own JSON file.
    max_len_all = list()
    char_vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    unicode_vocab = defaultdict(float)
    word_vocab = defaultdict(float)
    for data_id in range(0, len(file_dir)):
        raw_text_path = os.path.join(raw_data_path, file_dir[data_id],
                                     file_dir[data_id])
        preprocessed_file_path = os.path.join(preprocessed_path,
                                              file_dir[data_id],
                                              file_dir[data_id])
        raw_text = read.readfrom_txt(raw_text_path)
        raw_text = process.text_normalize(raw_text)
        sent_span_list_file, max_len_file, char_vocab = split_by_sentence(
            raw_text, char_vocab)
        max_len_all += max_len_file
        pos_sentences, pos_vocab = process.get_pos_sentence(
            sent_span_list_file, pos_vocab)
        word_sentences, word_vocab = process.get_words(sent_span_list_file,
                                                       word_vocab)
        pos_sentences_character = process.word_pos_2_character_pos(
            sent_span_list_file, pos_sentences)
        unicode_sentences_character, unicode_vocab = process.get_unicode(
            sent_span_list_file, unicode_vocab)
        read.savein_json(preprocessed_file_path + "_sent", sent_span_list_file)
        read.savein_json(preprocessed_file_path + "_pos",
                         pos_sentences_character)
        read.savein_json(preprocessed_file_path + "_unicodecategory",
                         unicode_sentences_character)
        read.savein_json(preprocessed_file_path + "_words", word_sentences)
        if xml_path != "":
            xml_file_path = os.path.join(xml_path, file_dir[data_id],
                                         file_dir[data_id] + file_format)
            posi_info_dict = process.extract_xmltag_anafora(xml_file_path,
                                                            raw_text)
            sent_tag_list_file = xml_tag_in_sentence(sent_span_list_file,
                                                     posi_info_dict)
            read.savein_json(preprocessed_file_path + "_tag",
                             sent_tag_list_file)
    max_len_all.sort(reverse=True)
    max_len_file_name = "/".join(
        preprocessed_path.split('/')[:-1]) + "/max_len_sent"
    read.savein_json(max_len_file_name, max_len_all)
def textfile2list_smm4h_st():
    # Map each SMM4H CUI to its semantic-type field from MRSTY.RRF.
    file_path_st = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRSTY.RRF"
    code_cuis = read.read_from_json("data/SMM4H/code_cuis")
    cuis = {cui_list[0] for cui_list in code_cuis.values()}  # set for fast lookup
    data = read.readfrom_txt(file_path_st)
    cui_st = {}
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in cuis:
            cui_st[line[0]] = line[2]
    read.save_in_json("data/SMM4H/cui_st_dict", cui_st)
def textfile2list_twa():
    # Collect all synonym strings (MRCONSO field 14) for each TwADR-L CUI.
    # `file_path_synonym` and `twa_cuis` are module-level globals defined elsewhere.
    data = read.readfrom_txt(file_path_synonym)
    cui_dict = {}
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in twa_cuis:
            if line[0] not in cui_dict:
                cui_dict[line[0]] = [line[14]]
            else:
                cui_dict[line[0]] += [line[14]]
    read.save_in_json("data/TwADR-L/cui_dict", cui_dict)
def textfile2list_smm4h():
    # Collect the full MRCONSO rows for every SMM4H CUI.
    code_cuis = read.read_from_json("data/SMM4H/code_cuis")
    cuis = {cui_list[0] for cui_list in code_cuis.values()}  # set for fast lookup
    file_path_synonym = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
    data = read.readfrom_txt(file_path_synonym)
    synonym_rows = []
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in cuis:
            synonym_rows.append(line)
    read.save_in_json("data/SMM4H/synonyms_all", synonym_rows)
def textfile2list_twa():
    # Variant: collect the raw MRCONSO rows and the CUI set for TwADR-L terms.
    # `file_path` and `twa` are module-level globals defined elsewhere.
    data = read.readfrom_txt(file_path)
    cuis_twa = []
    synonym_rows = []
    for line in data.splitlines():
        if "SNO" in line:  # debug: surface SNOMED rows while scanning
            print(line)
        line = line.split('|')
        if line[0] in twa:
            cuis_twa.append(line[0])
            synonym_rows.append(line)
    read.save_in_json("data/TwADR-L/synonyms", synonym_rows)
    read.save_in_json("data/TwADR-L/cuis", list(set(cuis_twa)))
def textfile2list_smm4h():
    # Variant: collect all MedDRA rows (source vocabulary "MDR") and their CUIs.
    file_path = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
    smm4h = read.read_from_json("data/SMM4H/labels_ori")  # loaded but not used below
    data = read.readfrom_txt(file_path)
    cuis_smm4h = []
    synonym_rows = []
    for line in data.splitlines():
        line = line.split('|')
        if "MDR" in line[11]:  # field 11 = source vocabulary
            cuis_smm4h.append(line[0])
            synonym_rows.append(line)
    read.save_in_json("data/SMM4H/synonyms", synonym_rows)
    read.save_in_json("data/SMM4H/cuis", list(set(cuis_smm4h)))
def textfile2list_ask():
    # Map each CUI to the AskAPatient source codes (MRCONSO field 13) it covers.
    # `file_path` and `ask` are module-level globals defined elsewhere.
    data = read.readfrom_txt(file_path)
    cui_code_ask = {}
    codes = []
    for line in data.splitlines():
        line = line.split('|')
        if line[13] in ask:
            codes.append(line[13])
            if line[0] not in cui_code_ask:
                cui_code_ask[line[0]] = [line[13]]
            else:
                cui_code_ask[line[0]] += [line[13]]
    read.save_in_json("data/AskAPatient/cui_codes", cui_code_ask)
    read.save_in_json("data/AskAPatient/codes", list(set(codes)))
def get_snomed_rxnorm_umls():
    # Gather the MRCONSO rows for every CUI that appears in either the RxNorm
    # or the SNOMED dictionary.
    rxnorm_term = read.read_from_json(
        "/extra/dongfangxu9/umls/processed/rxnorm_dict")
    snomed_term = read.read_from_json(
        "/extra/dongfangxu9/umls/processed/snomed_dict")
    cui_all = set(rxnorm_term.keys()) | set(snomed_term.keys())
    cui_all_synonyms = {}
    data = read.readfrom_txt(
        "/extra/dongfangxu9/umls/umls_2017_subset/2017AB/META/MRCONSO.RRF")
    for line in data.splitlines():
        line_split = line.split('|')
        if line_split[0] in cui_all:
            cui_all_synonyms = add_dict(cui_all_synonyms, line_split[0],
                                        line_split)
    return cui_all_synonyms  # collected MRCONSO rows per CUI
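# `add_dict` is not defined in this file; a minimal sketch consistent with how
# it is called above and in process_umls below (append `value` to the list
# stored under `key`, then hand the dict back):
def add_dict(dictionary, key, value):
    dictionary.setdefault(key, []).append(value)
    return dictionary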
def document_level_2_sentence_level(file_dir, raw_data_path, preprocessed_path,
                                    xml_path, file_format):
    # Variant for a flat directory layout: raw files live directly under
    # raw_data_path rather than in per-document subdirectories.
    max_len_all = list()
    char_vocab = defaultdict(float)
    for data_id in range(0, len(file_dir)):
        raw_text_path = os.path.join(raw_data_path, file_dir[data_id])
        preprocessed_file_path = os.path.join(preprocessed_path,
                                              file_dir[data_id])
        raw_text = read.readfrom_txt(raw_text_path)
        raw_text = process.text_normalize(raw_text)
        sent_span_list_file, max_len_file, char_vocab = split_by_sentence(
            raw_text, char_vocab)
        max_len_all += max_len_file
        read.savein_json(preprocessed_file_path + "_sent", sent_span_list_file)
        if xml_path != "":
            xml_file_path = os.path.join(xml_path, file_dir[data_id],
                                         file_dir[data_id] + file_format)
            print('xml_file_path - %s' % xml_file_path)
            posi_info_dict = process.extract_xmltag_anafora(xml_file_path,
                                                            raw_text)
            print('posi_info_dict - ')
            for key, value in posi_info_dict.items():  # debug: dump gold spans
                print(key, value)
            sent_tag_list_file = xml_tag_in_sentence(sent_span_list_file,
                                                     posi_info_dict)
            read.savein_json(preprocessed_file_path + "_tag",
                             sent_tag_list_file)
    print('max_len_all - %s' % max_len_all)
    max_len_all.sort(reverse=True)
    max_len_file_name = "/".join(
        preprocessed_path.split('/')[:-1]) + "/max_len_sent"
    read.savein_json(max_len_file_name, max_len_all)
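# `split_by_sentence` is defined elsewhere in the project; the sketch below is
# a hypothetical reconstruction inferred only from its call sites (it must
# return per-sentence spans, per-sentence lengths, and an updated character-
# frequency vocab). The naive newline-based split and the exact span format
# are assumptions, not the project's actual logic:
def split_by_sentence(raw_text, char_vocab):
    sent_span_list = []
    sent_len_list = []
    offset = 0
    for sent in raw_text.split('\n'):
        if sent.strip():
            start = raw_text.index(sent, offset)
            sent_span_list.append([sent, start, start + len(sent)])
            sent_len_list.append(len(sent))
            for ch in sent:            # accumulate character frequencies
                char_vocab[ch] += 1
        offset += len(sent) + 1        # +1 for the consumed newline
    return sent_span_list, sent_len_list, char_vocab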
def process_umls():
    # Build CUI -> [term name, ...] dictionaries for SNOMED CT and RxNorm,
    # keeping only unsuppressed entries (suppress flag "N").
    # MRCONSO fields: 0 = CUI, 11 = source vocabulary, 12 = term type,
    # 14 = term name, 16 = suppress flag.
    cui_all_snomed = {}
    cui_all_rxnorm = {}
    data = read.readfrom_txt(
        "/extra/dongfangxu9/umls/umls_2017_subset/2017AB/META/MRCONSO.RRF")
    for line in data.splitlines():
        line_split = line.split('|')
        if "SNOMEDCT" in line_split[11] and line_split[16] == "N":
            cui_all_snomed = add_dict(cui_all_snomed, line_split[0],
                                      line_split[14])
        if "RXNORM" in line_split[11] and line_split[16] == "N":
            cui_all_rxnorm = add_dict(cui_all_rxnorm, line_split[0],
                                      line_split[14])
    read.save_in_json("/extra/dongfangxu9/umls/processed/snomed_dict",
                      cui_all_snomed)
    read.save_in_json("/extra/dongfangxu9/umls/processed/rxnorm_dict",
                      cui_all_rxnorm)
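# Named MRCONSO.RRF field indices used throughout the functions above, taken
# from the comment in process_umls; the constant names themselves are
# illustrative, not part of the original code:
CUI = 0        # concept unique identifier
SAB = 11       # source vocabulary (e.g., SNOMEDCT, RXNORM, MDR)
TTY = 12       # term type
CODE = 13      # source-asserted code (used by textfile2list_ask)
STR = 14       # term string / synonym
SUPPRESS = 16  # suppress flag; "N" = not suppressed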