Example #1
0
File: Valx_core.py  Project: wabc1994/Valx
def init_features ():
    feature_set = ufile.read_csv_as_dict ('data\\numeric_features.csv', 0, 1, True)
    global greater, greater_equal, greater_equal2, lower, lower_equal, lower_equal2, equal, between, selects, connect, features, temporal, temporal_con, error1, error2, symbols, numbers, unit_special, unit_ori, unit_ori_s, unit_exp, negation
    greater, greater_equal, greater_equal2, lower, lower_equal, lower_equal2, equal, between, selects, connect, features, temporal, temporal_con, error1, error2, symbols, numbers, unit_special, unit_ori, unit_ori_s, unit_exp, negation = \
    feature_set["greater"], feature_set["greater_equal"], feature_set["greater_equal2"], feature_set["lower"], feature_set["lower_equal"], feature_set["lower_equal2"], feature_set["equal"], feature_set["between"], feature_set["selects"], feature_set["connect"], feature_set["features"], feature_set["temporal"], feature_set["temporal_con"], feature_set["error1"], feature_set["error2"], feature_set["symbols"], feature_set["numbers"], feature_set["unit_special"], feature_set["unit_ori"], feature_set["unit_ori_s"], feature_set["unit_exp"], feature_set["negation"]
    temporal = temporal + '|' + temporal.replace('|', 's|') + 's'
    unit = (unit_ori + "|" + unit_ori_s.replace("|", "s|") + "s|" + unit_ori_s + "|" + temporal)
    return ""
Example #2
0
def extract_variables (fdin, ffea, ffea2, var, cores):
    """Extract numeric variable expressions from a trial CSV using worker processes.

    fdin  -- input CSV of trials (passed whole to each worker)
    ffea  -- domain-knowledge feature CSV (name -> multi-item row)
    ffea2 -- UMLS feature CSV
    var   -- variable name to extract, or "All"
    cores -- number of worker processes to spawn
    Returns True on success, False on any missing/empty input.
    """
    # read input dataset
    if fdin is None or fdin == "":
        return False
    trials = ufile.read_csv(fdin)
    if trials is None or len(trials) <= 0:
        # FIX: ext_print formats the log message; the previous 2-to-3
        # conversion printed the bare function object and lost the text.
        print(ext_print('input data error, please check either no such file or no data --- interrupting'))
        return False
    print(ext_print('found a total of %d data items' % len(trials)))

    # read feature list - domain knowledge
    if ffea is None or ffea == "":
        return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items(ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print(ext_print('no feature data available --- interrupting'))
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]  # drop the CSV header row
    elif var in fea_dict_dk:
        features = {var: fea_dict_dk[var]}
    # Build a reverse index: every '|'-separated synonym -> canonical key.
    # FIX: dict.iteritems() does not exist in Python 3; use items().
    for key, value in fea_dict_dk.items():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '':
                feature_dict_dk[name.strip()] = key

    # read feature list - umls
    if ffea2 is None or ffea2 == "":
        return False
    fea_dict_umls = ufile.read_csv_as_dict(ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print(ext_print('no feature data available --- interrupting'))
        return False

    # Fan the trials out over `cores` worker processes; results accumulate
    # in a Manager-backed shared list.
    output = Manager().list()
    jobs = []
    for i in range(1, cores + 1):
        # FIX: use integer division — in Python 3 '/' yields floats, which
        # are invalid as the worker's slice bounds.
        start = len(trials) * (i - 1) // cores
        end = len(trials) * i // cores - 1
        t = Process(target=worker, args=(trials, start, end, var, features, feature_dict_dk, fea_dict_umls, output))
        jobs.append(t)
        t.start()
    for j in jobs:
        j.join()

    # Write results next to the input file.
    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv(fout, output)
    print(ext_print('saved processed results into: %s' % fout))
    return True
Example #3
0
def extract_variables (fdin, ffea, ffea2, var, cores):
    """Extract numeric variable expressions using `cores` worker processes.

    Python 2 variant (print statements, xrange, dict.iteritems).
    fdin  -- input CSV of trials
    ffea  -- domain-knowledge feature CSV (name -> multi-item row)
    ffea2 -- UMLS feature CSV
    var   -- variable name to extract, or "All"
    cores -- number of worker processes to spawn
    Returns True on success, False on any missing/empty input.
    """
    # read input dataset
    if fdin is None or fdin =="": return False
    trials = ufile.read_csv (fdin)
    if trials is None or len(trials) <= 0:
        print ext_print ('input data error, please check either no such file or no data --- interrupting')
        return False
    print ext_print ('found a total of %d data items' % len(trials))
    
    # read feature list - domain knowledge
    if ffea is None or ffea =="": return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items (ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]  # drop the CSV header row
    elif var in fea_dict_dk:
        features = {var: fea_dict_dk[var]}
    # reverse index: every '|'-separated synonym -> canonical variable key
    for key, value in fea_dict_dk.iteritems():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '': feature_dict_dk[name.strip()] = key

    # read feature list - umls
    if ffea2 is None or ffea2 =="": return False
    fea_dict_umls = ufile.read_csv_as_dict (ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False

    # fan trials out over `cores` processes; results go to a shared Manager list
    output = Manager().list()
    jobs = []
    for i in xrange(1,cores+1):
        # Python 2 '/' on ints is floor division, so the slice bounds are ints
        t = Process(target=worker, args=(trials, len(trials)*(i-1)/cores,len(trials)*i/cores-1, var, features, feature_dict_dk, fea_dict_umls, output))
        jobs.append(t)
        t.start()    
    for j in jobs: j.join()

    # write results next to the input file
    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv (fout, output)
    print ext_print ('saved processed results into: %s' % fout)
    return True
Example #4
0
def extract_variables(fdin, ffea, ffea2, var):
    """Extract numeric variable expressions from eligibility-criteria text.

    fdin  -- input CSV; row[0] is the trial id, row[1] the criteria text
    ffea  -- domain-knowledge feature CSV (name -> multi-item row)
    ffea2 -- UMLS feature CSV (can be replaced by full UMLS)
    var   -- variable name to extract, or "All"
    Writes <fdin>_exp_<var>.csv and returns True, or False on bad input.
    """
    # read input data
    if fdin is None or fdin == "":
        return False
    trials = ufile.read_csv(fdin)
    if trials is None or len(trials) <= 0:
        # FIX: ext_print formats the log message; the previous 2-to-3
        # conversion printed the bare function object and lost the text.
        print(ext_print('input data error, please check either no such file or no data --- interrupting'))
        return False
    print(ext_print('found a total of %d data items' % len(trials)))

    # read feature list - domain knowledge
    if ffea is None or ffea == "":
        return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items(ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print(ext_print('no feature data available --- interrupting'))
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]  # drop the CSV header row
    elif var in fea_dict_dk:
        features = {var: fea_dict_dk[var]}
    # Reverse index: each '|'-separated synonym -> canonical variable key.
    # FIX: dict.iteritems() does not exist in Python 3; use items().
    for key, value in fea_dict_dk.items():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '':
                feature_dict_dk[name.strip()] = key

    # read feature list - UMLS (can be replaced by full UMLS)
    if ffea2 is None or ffea2 == "":
        return False
    fea_dict_umls = ufile.read_csv_as_dict(ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print(ext_print('no feature data available --- interrupting'))
        return False

    # load numeric feature list
    Valx_core.init_features()

    output = []
    for i in range(len(trials)):
        if i % 1000 == 0:
            print('processing %d' % i)
        # pre-process the eligibility criteria text (trials[i][1])
        text = Valx_core.preprocessing(trials[i][1])
        # extract candidate sentences containing numeric features
        (sections_num, candidates_num) = Valx_core.extract_candidates_numeric(text)
        for j in range(len(candidates_num)):  # for each candidate
            # identify and formalize values
            exp_text = Valx_core.formalize_expressions(candidates_num[j])
            # identify variable mentions and map them to names
            (exp_text, key_ngrams) = Valx_core.identify_variable(
                exp_text, feature_dict_dk, fea_dict_umls)
            (variables, vars_values) = Valx_core.associate_variable_values(exp_text)
            all_exps = []
            for k in range(len(variables)):
                curr_var = variables[k]
                curr_exps = vars_values[k]
                if curr_var in features:
                    fea_list = features[curr_var]
                    curr_exps = Valx_core.context_validation(
                        curr_exps, fea_list[1], fea_list[2])
                    # unit conversion and value normalization
                    curr_exps = Valx_core.normalization(fea_list[3], curr_exps)
                    # heuristic rule-based validation against [min, max]
                    curr_exps = Valx_core.hr_validation(
                        curr_exps, float(fea_list[4]), float(fea_list[5]))
                if len(curr_exps) > 0:
                    if var == "All" or var.lower() == curr_var.lower() \
                            or var.lower() in curr_var.lower():
                        all_exps += curr_exps

            if len(all_exps) > 0:
                # output result (strip Python 2 unicode reprs for readability)
                output.append((trials[i][0], sections_num[j], candidates_num[j],
                               exp_text, str(all_exps).replace("u'", "'")))

    # output result
    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv(fout, output)
    print(ext_print('saved processed results into: %s' % fout))
    return True
Example #5
0
File: Valx_CTgov.py  Project: Tony-Hao/Valx
def extract_variables (fdin, ffea, ffea2, var):
    """Extract numeric variable expressions from eligibility-criteria text.

    Python 2 variant (print statements, xrange, dict.iteritems).
    fdin  -- input CSV; row[0] is the trial id, row[1] the criteria text
    ffea  -- domain-knowledge feature CSV (name -> multi-item row)
    ffea2 -- UMLS feature CSV (can be replaced by full UMLS)
    var   -- variable name to extract, or "All"
    Writes <fdin>_exp_<var>.csv and returns True, or False on bad input.
    """
    # read input data
    if fdin is None or fdin =="": return False
    trials = ufile.read_csv (fdin)
    if trials is None or len(trials) <= 0:
        print ext_print ('input data error, please check either no such file or no data --- interrupting')
        return False
    print ext_print ('found a total of %d data items' % len(trials))
    
    # read feature list - domain knowledge
    if ffea is None or ffea =="": return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items (ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]  # drop the CSV header row
    elif var in fea_dict_dk:
        features = {var:fea_dict_dk[var]}
    # reverse index: each '|'-separated synonym -> canonical variable key
    for key, value in fea_dict_dk.iteritems():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '': feature_dict_dk[name.strip()] =key

    # read feature list - UMLS (can be replaced by full UMLS)
    if ffea2 is None or ffea2 =="": return False
    fea_dict_umls = ufile.read_csv_as_dict (ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False

    #load numeric feature list
    Valx_core.init_features()

    output = []
    for i in xrange(len(trials)):
        if i%1000 == 0:
            print ('processing %d' % i)
        # pre-processing eligibility criteria text
        text = Valx_core.preprocessing(trials[i][1]) # trials[i][1] is the eligibility criteria text
        (sections_num, candidates_num) = Valx_core.extract_candidates_numeric(text) # extract candidates containing numeric features
        for j in xrange(len(candidates_num)): # for each candidate
            exp_text = Valx_core.formalize_expressions(candidates_num[j]) # identify and formalize values
            (exp_text, key_ngrams) = Valx_core.identify_variable(exp_text, feature_dict_dk, fea_dict_umls) # identify variable mentions and map them to names
            (variables, vars_values) = Valx_core.associate_variable_values(exp_text)
            all_exps = []
            for k in xrange(len(variables)):
                curr_var = variables[k]
                curr_exps = vars_values[k]
                if curr_var in features:
                    fea_list = features[curr_var]
                    curr_exps = Valx_core.context_validation(curr_exps, fea_list[1], fea_list[2])                           
                    curr_exps = Valx_core.normalization(fea_list[3], curr_exps) # unit conversion and value normalization
                    curr_exps = Valx_core.hr_validation (curr_exps, float(fea_list[4]), float(fea_list[5])) # heuristic rule-based validation
                if len(curr_exps) > 0:
                    # keep expressions matching the requested variable (or all)
                    if var == "All" or var.lower() == curr_var.lower() or var.lower() in curr_var.lower(): all_exps += curr_exps                     
                 
            if len(all_exps) > 0: output.append((trials[i][0], sections_num[j], candidates_num[j], exp_text, str(all_exps).replace("u'", "'"))) # output result

    # output result
    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv (fout, output)
    print ext_print ('saved processed results into: %s' % fout)
    return True
Example #6
0
def compare_all(fin1, fdin2):
    """Evaluate system lexical-substitution output against gold annotations.

    fin1  -- ';'-separated list of gold CSV files; each row is
             (id, sentence, target word, "word:count;word:count;...")
    fdin2 -- CSV mapping ids (column 0) to the system's ranked candidate
             list (column 2, a Python-literal string parsed via ast)
    Writes <fdin2 stem>_Evaluation.csv with per-item rows plus Accuracy@N,
    best / out-of-ten precision-recall-F1, and candidate generation rate.
    Returns True on success, False on missing inputs.

    FIX: the original mixed Python 2 constructs (xrange, print statements)
    with Python 3 print() calls; normalized here to plain Python 3.
    """
    # read input data
    if fin1 is None or fin1 == "":
        return False
    fin_files = fin1.split(';')

    # read input data
    if fdin2 is None or fdin2 == "":
        return False
    words_sims = ufile.read_csv_as_dict(fdin2, 0,
                                        2)  # a specific file or a directory
    output, output_performance = [], []
    output.append(("ID", "Sentence", "Target word", "By Gold", "By system"))
    for fin_file in fin_files:
        texts = ufile.read_csv(fin_file)  # a specific file or a directory
        final_golds, final_system = [], []
        for text in texts:
            key = text[0]
            sentence = text[1]  # get all sentences
            target_word = text[2]
            golds = {}  # gold word -> annotator count
            gold_temps = text[3].split(';')
            for gold_temp in gold_temps:
                tems = gold_temp.split(':')
                golds[tems[0]] = int(tems[1])
            # all golds form one list; each target word's gold dict is one element
            final_golds.append(golds)
            if key not in words_sims:
                exit("No key in processed similarity file!")
            wordnet_result = ast.literal_eval(words_sims[key])
            final_system.append(wordnet_result[:])
            output.append(
                (key, sentence, target_word, golds, wordnet_result[:]))
        output.append(())
        # =========== evaluation
        output_performance.append(("=====Accuracy@N=======", ))
        # FIX: xrange is Python 2 only; use range.
        for N in range(10):
            num_correct = 0
            for i in range(len(final_golds)):
                gold = final_golds[i]  # dictionary
                sys = final_system[i]  # ranked candidate list
                for j in range(len(sys)):
                    if j > N:
                        break
                    if sys[j][0] in gold:  # sys entries look like ("finally", 0.2)
                        num_correct += 1
                        break

            accuracy = round(num_correct / float(len(final_golds)), 3)
            print("Accuracy@" + str(N + 1), accuracy,
                  "%d of %d are correct" % (num_correct, len(final_golds)))
            output_performance.append(
                ("Accuracy@" + str(N + 1), accuracy,
                 "%d of %d are correct" % (num_correct, len(final_golds))))

        output_performance.append(("=====best P&R=======", ))
        # fenzi = numerator of P/R; num_resp = #targets with any candidate
        fenzi, num_resp = 0.0, 0
        for i in range(len(final_golds)):
            gold = final_golds[i]  # dictionary
            sys = final_system[i]  # candidate list for this target word
            if len(sys) > 0:
                num_resp += 1
                best_sys = sys[0][0]
                if best_sys in gold:
                    # credit proportional to the gold vote share
                    fenzi += float(gold[best_sys]) / sum(gold.values())
        print("best P fenmu is %d,fenzi is %f" % (num_resp, fenzi))
        # NOTE(review): raises ZeroDivisionError if no target has candidates.
        P = round(fenzi / float(num_resp), 3)
        R = round(fenzi / float(len(final_golds)), 3)
        output_performance.append(("Best Precision", P))
        output_performance.append(("Best Recall", R))
        output_performance.append(("Best F1", F1(P, R)))

        output_performance.append(("=====oot P&R=======", ))
        fenzi, num_resp = 0.0, 0
        for i in range(len(final_golds)):
            gold = final_golds[i]  # dictionary
            sys = final_system[i]  # array
            if len(sys) > 0:
                num_resp += 1
                for each_sys in sys:
                    if each_sys[0] in gold:  # out-of-ten: credit every hit
                        fenzi += float(gold[each_sys[0]]) / sum(gold.values())
        print("Oot P fenmu is %d,fenzi is %f" % (num_resp, fenzi))
        P = round(fenzi / float(num_resp), 3)
        R = round(fenzi / float(len(final_golds)), 3)
        output_performance.append(("oot Precision", P))
        output_performance.append(("oot Recall", R))
        output_performance.append(("oot F1", F1(P, R)))
        output_performance.append(())
        output_performance.append(("=====Candidates generation rate=======", ))
        rate = round(num_resp / float(len(final_golds)), 3)
        # FIX: Python 2 print statement converted to print().
        print(rate)
        output_performance.append(("Candidates generation rate", rate))
    output.extend(output_performance)
    # get output data directory
    fout = fdin2.replace(".csv", "_Evaluation.csv")
    ufile.write_csv(fout, output)
    # FIX: Python 2 print statement converted to print().
    print('saved result into: %s' % fout)
    return True
Example #7
0
# Valx: A system for extracting and structuring numeric lab test comparison statements from text
# Created by Tony HAO, [email protected]
# Please kindly cite the paper: Tianyong Hao, Hongfang Liu, Chunhua Weng. Valx: A system for extracting and structuring numeric lab test comparison statements from text. Methods of Information in Medicine. Vol. 55: Issue 3, pp. 266-275, 2016

import W_utility.file as ufile
from W_utility.log import ext_print
import os, sys, re
import Valx_core

# Module-level setup: feature dictionaries are loaded once at import time.
# NOTE(review): relative 'data/...' paths assume the process CWD is the
# project root — confirm before packaging.
fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items(
    'data/variable_features_dk.csv')
fea_dict_umls = ufile.read_csv_as_dict('data/variable_features_umls.csv')
# Default: extract every variable.
var = 'All'

#load numeric feature list
Valx_core.init_features()


def extract_values(text):
    # read input data
    if text is None or text == "": return False
    # trials = ufile.read_csv (fdin)
    trials = [text]
    if trials is None or len(trials) == 0:
        return False

    # read feature list - domain knowledge
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        return False

    # get feature info