def init_features ():
    # Load the numeric-feature lexicon (regex alternation strings, one per
    # CSV row) and publish every column as a module-level global; the other
    # Valx_core extraction routines read these globals directly.
    feature_set = ufile.read_csv_as_dict ('data\\numeric_features.csv', 0, 1, True)
    global greater, greater_equal, greater_equal2, lower, lower_equal, lower_equal2, equal, between, selects, connect, features, temporal, temporal_con, error1, error2, symbols, numbers, unit_special, unit_ori, unit_ori_s, unit_exp, negation
    greater, greater_equal, greater_equal2, lower, lower_equal, lower_equal2, equal, between, selects, connect, features, temporal, temporal_con, error1, error2, symbols, numbers, unit_special, unit_ori, unit_ori_s, unit_exp, negation = \
        feature_set["greater"], feature_set["greater_equal"], feature_set["greater_equal2"], feature_set["lower"], feature_set["lower_equal"], feature_set["lower_equal2"], feature_set["equal"], feature_set["between"], feature_set["selects"], feature_set["connect"], feature_set["features"], feature_set["temporal"], feature_set["temporal_con"], feature_set["error1"], feature_set["error2"], feature_set["symbols"], feature_set["numbers"], feature_set["unit_special"], feature_set["unit_ori"], feature_set["unit_ori_s"], feature_set["unit_exp"], feature_set["negation"]
    # Extend the temporal alternation with plural forms, e.g. "day|week"
    # becomes "day|week|days|weeks".
    temporal = temporal + '|' + temporal.replace('|', 's|') + 's'
    # NOTE(review): `unit` is NOT listed in the global statement above, so this
    # assignment only creates a function-local that is discarded on return —
    # confirm whether it was meant to be a global like the others.
    unit = (unit_ori + "|" + unit_ori_s.replace("|", "s|") + "s|" + unit_ori_s + "|" + temporal)
    return ""
def extract_variables (fdin, ffea, ffea2, var, cores):
    """Extract numeric variable expressions from a trials CSV in parallel.

    fdin  -- input CSV of trials (row[0] = trial id, row[1] = criteria text)
    ffea  -- CSV of domain-knowledge variable features
    ffea2 -- CSV of UMLS variable features
    var   -- variable name to extract, or "All" for every known variable
    cores -- number of worker processes to fan the rows out to

    Writes results to "<fdin basename>_exp_<var>.csv" and returns True;
    returns False on any missing/empty input.
    """
    # read input dataset
    if fdin is None or fdin == "":
        return False
    trials = ufile.read_csv(fdin)
    if trials is None or len(trials) <= 0:
        # BUG FIX: the original `print(ext_print)` printed the function object
        # itself; ext_print must be *called* with the message.
        print(ext_print('input data error, please check either no such file or no data --- interrupting'))
        return False
    print(ext_print('found a total of %d data items' % len(trials)))

    # read feature list - domain knowledge
    if ffea is None or ffea == "":
        return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items(ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print(ext_print('no feature data available --- interrupting'))
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]  # drop the CSV header row
    elif var in fea_dict_dk:
        features = {var: fea_dict_dk[var]}
    # Map every '|'-separated alias (lower-cased) back to its canonical key.
    # items() works on both Python 2 and 3; the original iteritems() is Py2-only
    # and clashed with this block's own print()/range() usage.
    for key, value in fea_dict_dk.items():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '':
                feature_dict_dk[name.strip()] = key

    # read feature list - umls
    if ffea2 is None or ffea2 == "":
        return False
    fea_dict_umls = ufile.read_csv_as_dict(ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print(ext_print('no feature data available --- interrupting'))
        return False

    # Fan contiguous slices of trials out to `cores` worker processes; the
    # Manager list collects results across process boundaries.
    output = Manager().list()
    jobs = []
    for i in range(1, cores + 1):
        # Floor division keeps the slice bounds integral on Python 3 as well
        # (plain `/` would hand Process a float index).
        t = Process(target=worker,
                    args=(trials,
                          len(trials) * (i - 1) // cores,
                          len(trials) * i // cores - 1,
                          var, features, feature_dict_dk, fea_dict_umls, output))
        jobs.append(t)
        t.start()
    for j in jobs:
        j.join()

    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv(fout, output)
    print(ext_print('saved processed results into: %s' % fout))
    return True
def extract_variables (fdin, ffea, ffea2, var, cores):
    # Multiprocessing variant (Python 2: `print` statements, xrange, iteritems).
    # Extracts numeric variable expressions for `var` from the trials CSV
    # `fdin`, splitting the rows across `cores` worker processes, and writes
    # the collected results to "<fdin basename>_exp_<var>.csv".
    # Returns True on success, False on any missing/empty input.
    # read input dataset
    if fdin is None or fdin =="": return False
    trials = ufile.read_csv (fdin)
    if trials is None or len(trials) <= 0:
        print ext_print ('input data error, please check either no such file or no data --- interrupting')
        return False
    print ext_print ('found a total of %d data items' % len(trials))
    # read feature list - domain knowledge
    if ffea is None or ffea =="": return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items (ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False
    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        # drop the CSV header row so it is not treated as a variable
        del features["Variable name"]
    elif var in fea_dict_dk:
        features = {var: fea_dict_dk[var]}
    # map every '|'-separated alias (lower-cased) to its canonical variable key
    for key, value in fea_dict_dk.iteritems():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '':
                feature_dict_dk[name.strip()] = key
    # read feature list - umls
    if ffea2 is None or ffea2 =="": return False
    fea_dict_umls = ufile.read_csv_as_dict (ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False
    # fan contiguous slices of trials out to `cores` worker processes;
    # the Manager list gathers results across process boundaries
    # (Py2 int division makes the slice bounds integral)
    output = Manager().list()
    jobs = []
    for i in xrange(1,cores+1):
        t = Process(target=worker, args=(trials, len(trials)*(i-1)/cores,len(trials)*i/cores-1, var, features, feature_dict_dk, fea_dict_umls, output))
        jobs.append(t)
        t.start()
    for j in jobs:
        j.join()
    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv (fout, output)
    print ext_print ('saved processed results into: %s' % fout)
    return True
def extract_variables(fdin, ffea, ffea2, var):
    """Single-process extraction of numeric variable expressions.

    fdin  -- input CSV of trials (row[0] = trial id, row[1] = eligibility
             criteria text)
    ffea  -- CSV of domain-knowledge variable features
    ffea2 -- CSV of UMLS variable features (can be replaced by full UMLS)
    var   -- variable name to extract, or "All" for every known variable

    Writes results to "<fdin basename>_exp_<var>.csv" and returns True;
    returns False on any missing/empty input.
    """
    # read input data
    if fdin is None or fdin == "":
        return False
    trials = ufile.read_csv(fdin)
    if trials is None or len(trials) <= 0:
        # BUG FIX: the original `print(ext_print)` printed the function object
        # itself; ext_print must be *called* with the message.
        print(ext_print('input data error, please check either no such file or no data --- interrupting'))
        return False
    print(ext_print('found a total of %d data items' % len(trials)))

    # read feature list - domain knowledge
    if ffea is None or ffea == "":
        return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items(ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print(ext_print('no feature data available --- interrupting'))
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]  # drop the CSV header row
    elif var in fea_dict_dk:
        features = {var: fea_dict_dk[var]}
    # Map every '|'-separated alias (lower-cased) back to its canonical key.
    # items() works on both Python 2 and 3; the original iteritems() is Py2-only
    # and clashed with this block's own print()/range() usage.
    for key, value in fea_dict_dk.items():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '':
                feature_dict_dk[name.strip()] = key

    # read feature list - UMLS (can be replaced by full UMLS)
    if ffea2 is None or ffea2 == "":
        return False
    fea_dict_umls = ufile.read_csv_as_dict(ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print(ext_print('no feature data available --- interrupting'))
        return False

    # load numeric feature list
    Valx_core.init_features()
    output = []
    for i in range(len(trials)):
        if i % 1000 == 0:
            print('processing %d' % i)  # progress heartbeat
        # pre-processing eligibility criteria text
        text = Valx_core.preprocessing(trials[i][1])  # trials[i][1] is the eligibility criteria text
        # extract candidates containing numeric features
        (sections_num, candidates_num) = Valx_core.extract_candidates_numeric(text)
        for j in range(len(candidates_num)):  # for each candidate
            # identify and formalize values
            exp_text = Valx_core.formalize_expressions(candidates_num[j])
            # identify variable mentions and map them to names
            (exp_text, key_ngrams) = Valx_core.identify_variable(exp_text, feature_dict_dk, fea_dict_umls)
            (variables, vars_values) = Valx_core.associate_variable_values(exp_text)
            all_exps = []
            for k in range(len(variables)):
                curr_var = variables[k]
                curr_exps = vars_values[k]
                if curr_var in features:
                    fea_list = features[curr_var]
                    curr_exps = Valx_core.context_validation(curr_exps, fea_list[1], fea_list[2])
                    # unit conversion and value normalization
                    curr_exps = Valx_core.normalization(fea_list[3], curr_exps)
                    # heuristic rule-based validation
                    curr_exps = Valx_core.hr_validation(curr_exps, float(fea_list[4]), float(fea_list[5]))
                if len(curr_exps) > 0:
                    # keep only expressions whose variable matches the request
                    if var == "All" or var.lower() == curr_var.lower() or var.lower() in curr_var.lower():
                        all_exps += curr_exps
            if len(all_exps) > 0:
                # strip Py2 unicode markers from the repr for clean CSV output
                output.append((trials[i][0], sections_num[j], candidates_num[j],
                               exp_text, str(all_exps).replace("u'", "'")))

    # output result
    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv(fout, output)
    print(ext_print('saved processed results into: %s' % fout))
    return True
def extract_variables (fdin, ffea, ffea2, var):
    # Single-process variant (Python 2: `print` statements, xrange, iteritems).
    # Extracts numeric variable expressions for `var` (or "All") from the
    # trials CSV `fdin` using the domain-knowledge (`ffea`) and UMLS (`ffea2`)
    # feature lists, then writes results to "<fdin basename>_exp_<var>.csv".
    # Returns True on success, False on any missing/empty input.
    # read input data
    if fdin is None or fdin =="": return False
    trials = ufile.read_csv (fdin)
    if trials is None or len(trials) <= 0:
        print ext_print ('input data error, please check either no such file or no data --- interrupting')
        return False
    print ext_print ('found a total of %d data items' % len(trials))
    # read feature list - domain knowledge
    if ffea is None or ffea =="": return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items (ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False
    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        # drop the CSV header row so it is not treated as a variable
        del features["Variable name"]
    elif var in fea_dict_dk:
        features = {var:fea_dict_dk[var]}
    # map every '|'-separated alias (lower-cased) to its canonical variable key
    for key, value in fea_dict_dk.iteritems():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '':
                feature_dict_dk[name.strip()] =key
    # read feature list - UMLS (can be replaced by full UMLS)
    if ffea2 is None or ffea2 =="": return False
    fea_dict_umls = ufile.read_csv_as_dict (ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False
    #load numeric feature list
    Valx_core.init_features()
    output = []
    for i in xrange(len(trials)):
        # progress heartbeat every 1000 rows
        if i%1000 == 0:
            print ('processing %d' % i)
        # pre-processing eligibility criteria text
        text = Valx_core.preprocessing(trials[i][1]) # trials[i][1] is the eligibility criteria text
        (sections_num, candidates_num) = Valx_core.extract_candidates_numeric(text) # extract candidates containing numeric features
        for j in xrange(len(candidates_num)): # for each candidate
            exp_text = Valx_core.formalize_expressions(candidates_num[j]) # identify and formalize values
            (exp_text, key_ngrams) = Valx_core.identify_variable(exp_text, feature_dict_dk, fea_dict_umls) # identify variable mentions and map them to names
            (variables, vars_values) = Valx_core.associate_variable_values(exp_text)
            all_exps = []
            for k in xrange(len(variables)):
                curr_var = variables[k]
                curr_exps = vars_values[k]
                if curr_var in features:
                    fea_list = features[curr_var]
                    curr_exps = Valx_core.context_validation(curr_exps, fea_list[1], fea_list[2])
                    curr_exps = Valx_core.normalization(fea_list[3], curr_exps) # unit conversion and value normalization
                    curr_exps = Valx_core.hr_validation (curr_exps, float(fea_list[4]), float(fea_list[5])) # heuristic rule-based validation
                # keep only expressions whose variable matches the request
                if len(curr_exps) > 0:
                    if var == "All" or var.lower() == curr_var.lower() or var.lower() in curr_var.lower():
                        all_exps += curr_exps
            if len(all_exps) > 0:
                # "u'" -> "'" strips Py2 unicode markers from the repr
                output.append((trials[i][0], sections_num[j], candidates_num[j], exp_text, str(all_exps).replace("u'", "'"))) # output result
    # output result
    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv (fout, output)
    print ext_print ('saved processed results into: %s' % fout)
    return True
def compare_all(fin1, fdin2):
    # Evaluate system candidate rankings against gold annotations (Python 2).
    # fin1  -- ';'-separated list of gold CSV files (row: key, sentence,
    #          target word, "word:count;word:count;..." gold substitutes)
    # fdin2 -- CSV mapping each key to the system's ranked candidate list
    #          (a Python-literal string, parsed with ast.literal_eval)
    # Writes per-item rows plus Accuracy@N, best/oot precision-recall-F1 and
    # a candidate-generation rate to "<fdin2>_Evaluation.csv"; returns True.
    # read input data
    if fin1 is None or fin1 == "":
        return False
    fin_files = fin1.split(';')
    # read input data
    if fdin2 is None or fdin2 == "":
        return False
    words_sims = ufile.read_csv_as_dict(fdin2, 0, 2)  # a specific file or a directory
    output, output_performance = [], []
    output.append(("ID", "Sentence", "Target word", "By Gold", "By system"))
    for fin_file in fin_files:
        texts = ufile.read_csv(fin_file)  # a specific file or a directory
        final_golds, final_system = [], []
        for text in texts:
            key = text[0]
            sentence = text[1]  # get all sentences
            target_word = text[2]
            golds = {}  # gold word
            gold_temps = text[3].split(';')
            for gold_temp in gold_temps:
                tems = gold_temp.split(':')
                golds[tems[0]] = int(tems[1])
            final_golds.append(golds)  # all golds form one list; each target word's gold dict is one element
            if key not in words_sims:
                exit("No key in processed similarity file!")
            wordnet_result = ast.literal_eval(words_sims[key])
            final_system.append(wordnet_result[:])
            output.append(
                (key, sentence, target_word, golds, wordnet_result[:]))
        #print final_golds
        output.append(())
        # ===========evaluation
        output_performance.append(("=====Accuracy@N=======", ))
        # Accuracy@N: fraction of targets whose top-(N+1) candidates hit a gold word
        for N in xrange(10):
            num_correct = 0
            for i in xrange(len(final_golds)):
                gold = final_golds[i]  # dictionary
                sys = final_system[i]  # array
                for j in xrange(len(sys)):
                    if j > N:
                        break
                    if sys[j][0] in gold:  # sys = "finally:0.2"
                        num_correct += 1
                        break
            accuracy = round(num_correct / float(len(final_golds)), 3)
            print("Accuracy@" + str(N + 1), accuracy,
                  "%d of %d are correct" % (num_correct, len(final_golds)))
            output_performance.append(
                ("Accuracy@" + str(N + 1), accuracy,
                 "%d of %d are correct" % (num_correct, len(final_golds))))
        output_performance.append(("=====best P&R=======", ))
        # "best" scoring: only the top-ranked candidate of each target counts;
        # fenzi = numerator (credit), num_resp = targets with any candidates
        fenzi, num_resp, = 0.0, 0
        for i in xrange(len(final_golds)):
            gold = final_golds[i]  # dictionary
            sys = final_system[i]  # candidate list for this target word
            if len(sys) > 0:
                num_resp += 1  # number of target words that produced candidates
                best_sys = sys[0][0]
                if best_sys in gold:  # sys = "finally:0.2"
                    # partial credit: gold count of the hit over total gold counts
                    fenzi += float(gold[best_sys]) / sum(gold.values())
        print("best P fenmu is %d,fenzi is %f" % (num_resp, fenzi))
        P = round(fenzi / float(num_resp), 3)
        R = round(fenzi / float(len(final_golds)), 3)
        output_performance.append(("Best Precision", P))
        output_performance.append(("Best Recall", R))
        output_performance.append(("Best F1", F1(P, R)))
        output_performance.append(("=====oot P&R=======", ))
        # "oot" (out-of-ten style) scoring: every returned candidate may add credit
        fenzi, num_resp, = 0.0, 0
        for i in xrange(len(final_golds)):
            gold = final_golds[i]  # dictionary
            sys = final_system[i]  # array
            if len(sys) > 0:
                num_resp += 1
                for each_sys in sys:
                    if each_sys[0] in gold:  # each_sys = "finally:0.2"
                        fenzi += float(gold[each_sys[0]]) / sum(gold.values())
        print("Oot P fenmu is %d,fenzi is %f" % (num_resp, fenzi))
        P = round(fenzi / float(num_resp), 3)
        R = round(fenzi / float(len(final_golds)), 3)
        output_performance.append(("oot Precision", P))
        output_performance.append(("oot Recall", R))
        output_performance.append(("oot F1", F1(P, R)))
        output_performance.append(())
        output_performance.append(("=====Candidates generation rate=======", ))
        # share of targets for which the system produced any candidates at all
        rate = round(num_resp / float(len(final_golds)), 3)
        print rate
        output_performance.append(("Candidates generation rate", rate))
    output.extend(output_performance)
    # get output data directory
    fout = fdin2.replace(".csv", "_Evaluation.csv")
    ufile.write_csv(fout, output)
    print 'saved result into: %s' % fout
    return True
# Valx: A system for extracting and structuring numeric lab test comparison statements from text # Created by Tony HAO, [email protected] # Please kindly cite the paper: Tianyong Hao, Hongfang Liu, Chunhua Weng. Valx: A system for extracting and structuring numeric lab test comparison statements from text. Methods of Information in Medicine. Vol. 55: Issue 3, pp. 266-275, 2016 import W_utility.file as ufile from W_utility.log import ext_print import os, sys, re import Valx_core fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items( 'data/variable_features_dk.csv') fea_dict_umls = ufile.read_csv_as_dict('data/variable_features_umls.csv') var = 'All' #load numeric feature list Valx_core.init_features() def extract_values(text): # read input data if text is None or text == "": return False # trials = ufile.read_csv (fdin) trials = [text] if trials is None or len(trials) == 0: return False # read feature list - domain knowledge if fea_dict_dk is None or len(fea_dict_dk) <= 0: return False # get feature info