def compare_all(fin1, fdin2):
    # read input data
    if fin1 is None or fin1 =="":
        return False
    texts = ufile.read_csv(fin1) # a specific file or a directory
    # read input data
    if fdin2 is None or fdin2 =="":
        return False
    EDBlist = ufile.load_files (fdin2) # a specific file or a directory
    result = []
    cur = 0
    for text in texts:
        cur += 1
        result_items_new =[]
        result_items = ast.literal_eval(text[2])
        #print result_items
        for result_item in result_items:
            #print result_item[0] in EDBlist
            if result_item[0] in EDBlist:
                result_items_new.append(result_item)
        result.append((text[0], text[1], str(result_items_new)))
       
    # output result
    fout = os.path.splitext(fin1)[0] + "_EDB.csv"
    ufile.write_csv(fout, result)
    print 'saved result into: %s' % fout
    
    print ext_print ('all tasks completed\n')
    return True
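
# A minimal, self-contained sketch (toy data, an assumption) of the filtering step
# performed above: the third CSV column holds a stringified candidate list, which is
# parsed with ast.literal_eval and filtered against the EDB word list. In the real
# code the rows and word list come from the project-specific ufile helpers.
import ast

toy_row = ("id1", "sentence.n", "[('car', 0.91), ('vehicle', 0.85), ('zxqwv', 0.10)]")
toy_EDBlist = {"car", "vehicle"}  # stand-in for the loaded simple-word dictionary

candidates = ast.literal_eval(toy_row[2])              # parse the stringified candidate list
kept = [c for c in candidates if c[0] in toy_EDBlist]  # keep only dictionary words
print(kept)  # [('car', 0.91), ('vehicle', 0.85)]
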
Example #2
def extract_variables (fdin, ffea, ffea2, var, cores):
    # read input dataset
    if fdin is None or fdin =="": return False
    trials = ufile.read_csv (fdin)
    if trials is None or len(trials) <= 0:
        print(ext_print('input data error, please check either no such file or no data --- interrupting'))
        return False
    print(ext_print('found a total of %d data items' % len(trials)))
    
    # read feature list - domain knowledge
    if ffea is None or ffea =="": return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items (ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print(ext_print('no feature data available --- interrupting'))
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]
    elif var in fea_dict_dk:
        features = {var: fea_dict_dk[var]}
    for key, value in fea_dict_dk.items():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '': feature_dict_dk[name.strip()] = key

    # read feature list - umls
    if ffea2 is None or ffea2 =="": return False
    fea_dict_umls = ufile.read_csv_as_dict (ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print(ext_print('no feature data available --- interrupting'))
        return False


    output = Manager().list()
    jobs = []
    for i in range(1,cores+1):
        t = Process(target=worker, args=(trials, len(trials)*(i-1)//cores, len(trials)*i//cores-1, var, features, feature_dict_dk, fea_dict_umls, output))
        jobs.append(t)
        t.start()    
    for j in jobs: j.join()

    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv (fout, output)
    print(ext_print('saved processed results into: %s' % fout))
    return True
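
# A small illustration (toy sizes, an assumption) of how the loop above partitions
# len(trials) items into `cores` contiguous chunks by start/end index before handing
# each chunk to a worker process.
n_items, cores = 10, 3
for i in range(1, cores + 1):
    start = n_items * (i - 1) // cores
    end = n_items * i // cores - 1  # inclusive end index, as passed to the worker
    print("worker %d: items %d..%d" % (i, start, end))
# worker 1: items 0..2
# worker 2: items 3..5
# worker 3: items 6..9
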
Example #3
def compare_all(fin1, fin2, fout1=None):
    # Merge the two ranked candidate lists with a beta-weighted combination of their scores
    if fin1 is None or fin1 == "":
        return False
    texts1 = ufile.read_csv(fin1)  # a specific file or a directory
    texts2 = ufile.read_csv(fin2)
    beta = 0.55  # interpolation weight between the two candidate lists
    ranked_result = []
    for i in range(len(texts1)):
        # print texts1[i]
        can_words1 = ast.literal_eval(texts1[i][2])
        # print can_words1
        can_words2 = ast.literal_eval(texts2[i][2])
        can1 = dict(can_words1)
        for key, value in can1.items():
            can1[key] = round(value*float(beta), 20)
        # can1 = sorted(can1.items(), key=operator.itemgetter(1), reverse=True)
        can2 = dict(can_words2)
        for key, value in can2.items():
            can2[key] = round(value*float(1-beta), 20)
        for k, v in can2.items():
            if k in can1.keys():
                # can1[k]=round((can1[k]+v)/float(2),20)
                can1[k] = round((can1[k]+v), 20)
                can2.pop(k)
            else:
                can2[k] = can2[k] / 2
        for k2, v2 in can1.items():
            if k2 not in can2.keys():
                can1[k2] = can1[k2] / 2
        can1.update(can2)
        sorted_ranks = sorted(can1.items(), key=operator.itemgetter(1), reverse=True)
        ranked_result.append((texts1[i][0], texts1[i][1], sorted_ranks))
    fout = os.path.splitext(fin1)[0] + "_" + str(beta) + "_merged.csv"
    ufile.write_csv(fout, ranked_result)
    print('saved result into: %s' % fout)
    return True
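
# Toy sketch (assumed data) of the beta-weighted merge above: scores from the two
# candidate lists are combined as beta*s1 + (1-beta)*s2 and, as in the loops above,
# every resulting score is then halved before the final ranking.
import operator

beta = 0.55
can1 = {"car": 0.8, "auto": 0.6}      # candidates from the first file
can2 = {"car": 0.4, "vehicle": 0.9}   # candidates from the second file

merged = {}
for w in set(can1) | set(can2):
    merged[w] = (beta * can1.get(w, 0.0) + (1 - beta) * can2.get(w, 0.0)) / 2
print(sorted(merged.items(), key=operator.itemgetter(1), reverse=True))
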
def extract_variables (fdin, ffea, ffea2, var, cores):
    # read input dataset
    if fdin is None or fdin =="": return False
    trials = ufile.read_csv (fdin)
    if trials is None or len(trials) <= 0:
        print ext_print ('input data error, please check either no such file or no data --- interrupting')
        return False
    print ext_print ('found a total of %d data items' % len(trials))
    
    # read feature list - domain knowledge
    if ffea is None or ffea =="": return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items (ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]
    elif var in fea_dict_dk:
        features = {var: fea_dict_dk[var]}
    for key, value in fea_dict_dk.iteritems():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '': feature_dict_dk[name.strip()] = key

    # read feature list - umls
    if ffea2 is None or ffea2 =="": return False
    fea_dict_umls = ufile.read_csv_as_dict (ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False


    output = Manager().list()
    jobs = []
    for i in xrange(1,cores+1):
        t = Process(target=worker, args=(trials, len(trials)*(i-1)/cores,len(trials)*i/cores-1, var, features, feature_dict_dk, fea_dict_umls, output))
        jobs.append(t)
        t.start()    
    for j in jobs: j.join()

    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv (fout, output)
    print ext_print ('saved processed results into: %s' % fout)
    return True
Example #5
def compare_all(fin1):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    texts = ufile.read_csv(fin1)  # a specific file or a directory
    result = []
    start_time = time.time()
    cur = 0
    for text in texts:
        simi_valuesList = []
        cur += 1
        if len(text[1].split('.')) > 1:
            target_word, pos = text[1].split('.')[0], text[1].split('.')[1]
        else:
            target_word, pos = text[1], None
        print "%d of %d" % (cur, len(texts)), target_word
        candidatewords = text[2]
        candidatewords = ast.literal_eval(candidatewords)
        simi_values = []
        for candidate in candidatewords:
            #print "candidate:"
            #print candidate
            word2 = candidate[0]
            # print word2
            try:
                simi_values = gensim_model.similarity(target_word, word2)
            except KeyError:
                simi_values = 0
            # word_sim[word2] = round(float(simi_values), 5)
            simi_valuesList.append((word2, round(float(simi_values), 5)))
        simi_valuesList.sort(key=operator.itemgetter(1),
                             reverse=True)  # sort by rank value
        print "simi_valuesList:"
        print simi_valuesList[:30]
        result.append((text[0], text[1], simi_valuesList[:30]))
        print result
    print("--- %s seconds ---" % (time.time() - start_time))
    fout = os.path.splitext(fin1)[0] + "_rank.csv"
    ufile.write_csv(fout, result)
    print 'saved result into: %s' % fout
    print ext_print('all tasks completed\n')
    return True
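
# The function above assumes a preloaded `gensim_model` exposing a word-vector
# .similarity() method. One way such a model might be set up (the model path is an
# assumption, not taken from the original code):
from gensim.models import KeyedVectors

gensim_model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True)  # hypothetical vector file
print(round(float(gensim_model.similarity('car', 'automobile')), 5))
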
def POS_tagging(fdin, fout=None):
    # read input data
    if fdin is None or fdin == "":
        return False
    texts = ufile.read_csv(fdin)  # a specific file or a directory
    #nlp=spacy.load("en")
    result = []
    for text in texts:
        sentence = text[1].lower()
        print text[0]
        target_word = text[2]
        if len(target_word.split('.')) == 1:
            print nltk.word_tokenize(sentence)
            pos_tags = nltk.pos_tag(nltk.word_tokenize(sentence))
            print pos_tags

            for tag in pos_tags:
                if target_word in tag:
                    if (tag[1] in ['NN', 'NNS', 'NNP', 'NNPS']):
                        target_word += "." + 'n'
                    elif (tag[1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']):
                        target_word += "." + 'v'
                    elif (tag[1] in ['RB', 'RBR', 'RBS', 'WRB']):
                        target_word += "." + 'r'
                    elif (tag[1] in ['JJ', 'JJR', 'JJS']):
                        target_word += "." + 'a'
                    print target_word
                    break

        result.append((text[0], text[1], target_word, text[3]))

    # get output data directory
    if fout is None:
        fout = fdin.replace('.csv', '_pos.csv')
    ufile.write_csv(fout, result)
    print 'saved result into: %s' % fout

    return True
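
# Minimal sketch (toy sentence, an assumption) of the tag mapping used above: NLTK's
# Penn Treebank tags are collapsed to WordNet-style letters n/v/r/a. Requires the
# 'punkt' and 'averaged_perceptron_tagger' NLTK data packages.
import nltk

sentence = "the bright light hurt her eyes"
pos_tags = nltk.pos_tag(nltk.word_tokenize(sentence))
penn_to_wn = {'NN': 'n', 'NNS': 'n', 'NNP': 'n', 'NNPS': 'n',
              'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',
              'RB': 'r', 'RBR': 'r', 'RBS': 'r', 'WRB': 'r',
              'JJ': 'a', 'JJR': 'a', 'JJS': 'a'}
print([(w, penn_to_wn.get(t, '')) for w, t in pos_tags])
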
def N_fold(fin1):
    if fin1 is None or fin1 =="":
        return False
    orig_texts = ufile.read_csv (fin1)
    Train,Test=[],[]
    train, test = train_test_split(orig_texts, test_size=0.666, random_state=1)
    #train, validation = train_test_split(train, test_size=0.5, random_state=1)
    #print "Train_dataset:",train
    print "Train_length:",len(train)
    ufile.write_csv(r'E:\Simplify\_Results/train_set.csv', train)
    #print "Test_dataset:",test
    print "Test_length:",len(test)
    ufile.write_csv(r'E:\Simplify\_Results/test_set.csv', test)

    #print "Validation:",validation
    #print "Validation_length:",len(validation)
    #ufile.write_csv('E:\Simplify\_Results/validation_set.csv', validation)

    #kfold = KFold(n_splits=2, shuffle=True, random_state=1)
    #X_train, X_test = train_test_split(orig_texts, test_size = 0.66, random_state = 42)
    #print X_train
    #print len(X_train)

    # for train, test in kfold.split(orig_texts):
def Insert_DB(fin):
    for root, dir, files in os.walk(fin):
        for f in files:
            if not f.endswith(".csv"):
                continue
            print ext_print(f)

            output = []
            # read input data
            fdin = os.path.join(root, f)
            rows = ufile.read_csv(fdin)
            for row in rows:
                # param = (PMID, JournalTitle, PubDate, ArticleTitle, Abstract, Keywords)
                PubDate = row[2]

                if PubDate != '2009':  # to split tasks across different machines with MySQL, process the tables separately
                    continue
                table = "article_" + PubDate
                #
                # if (row[2] == '' or row[2] is None): PubDate = 0
                # PubDate = int(PubDate)
                # table = 'article_0-1950'
                # if 2000 >= PubDate >= 1951:
                #     table = 'article_1951-2000'
                # if 2005 >= PubDate >= 2001:
                #     table = 'article_2001-2005'
                # elif PubDate > 2005:
                #     table = 'article_'+ str(PubDate)
				
                param = (row[0], row[1], PubDate, row[3], row[4], row[5])
                sql = "INSERT INTO `" + table + "` (`PMID`, `JournalTitle`, `PubDate`, `ArticleTitle`, `Abstract`, `Keywords`) VALUES(%s, %s, %s, %s, %s, %s);"
                msg = db.execute(sql, param)
                if msg != 1: print msg

    print ext_print('all tasks completed\n')
    return True
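
# The `db.execute(sql, param)` call above is a project-specific wrapper around a MySQL
# connection. As a rough, self-contained illustration of the same parameterized-insert
# idea, here is a sketch with the standard-library sqlite3 module (sqlite3 uses '?'
# placeholders where the MySQL driver uses '%s'; the table and values are toy assumptions).
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute("CREATE TABLE article_2009 (PMID TEXT, JournalTitle TEXT, PubDate TEXT, "
             "ArticleTitle TEXT, Abstract TEXT, Keywords TEXT)")
param = ('12345', 'Some Journal', '2009', 'A title', 'An abstract', 'kw1;kw2')
conn.execute("INSERT INTO article_2009 (PMID, JournalTitle, PubDate, ArticleTitle, Abstract, Keywords) "
             "VALUES (?, ?, ?, ?, ?, ?)", param)
conn.commit()
print(conn.execute("SELECT COUNT(*) FROM article_2009").fetchone()[0])  # 1
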
Example #9
def extract_variables(fdin, ffea, ffea2, var):
    # read input data
    if fdin is None or fdin == "": return False
    trials = ufile.read_csv(fdin)
    if trials is None or len(trials) <= 0:
        print(ext_print('input data error, please check either no such file or no data --- interrupting'))
        return False
    print(ext_print('found a total of %d data items' % len(trials)))

    # read feature list - domain knowledge
    if ffea is None or ffea == "": return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items(ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print(ext_print('no feature data available --- interrupting'))
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]
    elif var in fea_dict_dk:
        features = {var: fea_dict_dk[var]}
    for key, value in fea_dict_dk.items():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '': feature_dict_dk[name.strip()] = key

    # read feature list - UMLS (can be replaced by full UMLS)
    if ffea2 is None or ffea2 == "": return False
    fea_dict_umls = ufile.read_csv_as_dict(ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print(ext_print('no feature data available --- interrupting'))
        return False

    #load numeric feature list
    Valx_core.init_features()

    output = []
    for i in range(len(trials)):
        if i % 1000 == 0:
            print('processing %d' % i)
        # pre-processing eligibility criteria text
        text = Valx_core.preprocessing(
            trials[i][1])  # trials[i][1] is the eligibility criteria text
        (sections_num, candidates_num) = Valx_core.extract_candidates_numeric(
            text)  # extract candidates containing numeric features
        for j in range(len(candidates_num)):  # for each candidate
            exp_text = Valx_core.formalize_expressions(
                candidates_num[j])  # identify and formalize values
            (exp_text, key_ngrams) = Valx_core.identify_variable(
                exp_text, feature_dict_dk, fea_dict_umls
            )  # identify variable mentions and map them to names
            (variables,
             vars_values) = Valx_core.associate_variable_values(exp_text)
            all_exps = []
            for k in range(len(variables)):
                curr_var = variables[k]
                curr_exps = vars_values[k]
                if curr_var in features:
                    fea_list = features[curr_var]
                    curr_exps = Valx_core.context_validation(
                        curr_exps, fea_list[1], fea_list[2])
                    curr_exps = Valx_core.normalization(
                        fea_list[3],
                        curr_exps)  # unit conversion and value normalization
                    curr_exps = Valx_core.hr_validation(
                        curr_exps, float(fea_list[4]),
                        float(fea_list[5]))  # heuristic rule-based validation
                if len(curr_exps) > 0:
                    if var == "All" or var.lower() == curr_var.lower(
                    ) or var.lower() in curr_var.lower():
                        all_exps += curr_exps

            if len(all_exps) > 0:
                output.append((trials[i][0], sections_num[j], candidates_num[j],
                               exp_text, str(all_exps).replace("u'", "'")))  # output result

    # output result
    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv(fout, output)
    print(ext_print('saved processed results into: %s' % fout))
    return True
def compare_all(fin1, fin2, n, fin3, fout1=None):
    # Collect words related to the target words so that only relevant n-grams are kept in memory
    if fin1 is None or fin1 == "":
        return False
    orig_texts = ufile.read_csv(fin1)  # a specific file or a directory
    print orig_texts
    Related_words = {}
    for text in orig_texts:
        target_word = text[2].split('.')[0]
        words = text[1].lower().split()  # sentence words
        if (target_word in words) and len(words) > 1:
            temp_ngrams = find_ngrams(
                words, n)  # get all of sentence ngram candidates
            for ngram in temp_ngrams:
                if target_word in ngram:  # get target_word`s candidates
                    for te in ngram:
                        if te != target_word:
                            Related_words[te] = 1  # key(te)value=1
    print ext_print("Identified all related words")

    # Collect candidate words so that only relevant n-grams are kept in memory
    if fin3 is None or fin3 == "":
        return False
    candidate_words = {}
    for fin3_each in fin3.split(";"):
        test_data = ufile.read_csv(fin3_each)  # a specific file or a directory
        for i in range(len(test_data)):
            can_words = ast.literal_eval(
                test_data[i][2])  # parse string to array
            for can_word in can_words:
                if can_word[0] not in candidate_words:
                    candidate_words[can_word[0]] = 1
    print ext_print("Identified all candidate words")

    # read Google 1T corpus
    print ext_print("start to load Google 1T grams")
    Goole_grams, count, max_fre, c1, c2 = {}, 0, 0, 0, 0
    if fin2 is None or fin2 == "":
        return False
    fid = open(fin2, 'r')
    for line in fid:
        line = line.lower()
        count += 1
        if count % 10000000 == 0:
            print count
        if len(line) > 0:
            tem = line.split('\t')
            '''if len(tem) > 1:
                if tem[0] not in Goole_grams:
                    Goole_grams[tem[0]] = tem[1]
                    if long(tem[1]) > max_fre:  # reduce ordering calculations
                        max_fre = long(tem[1])'''
            if len(tem) == 1:
                c1 += 1
            if len(tem) > 1:
                c2 += 1
                temws = tem[0].split()
                find_candidate, find_related = False, False  # reduce memory usage
                for temw in temws:
                    if temw in candidate_words:
                        find_candidate = True
                    elif temw in Related_words:
                        find_related = True
                if find_candidate and find_related:
                    Goole_grams[tem[0]] = tem[1]
                    if long(tem[1]) > max_fre:  # reduce ordering calculations
                        max_fre = long(tem[1])

    fid.close()
    print count
    print("c1=%d,c2=%d" % (c1, c2))
    print ext_print("all files loaded")
    #     max_fre = max(map(float, Goole_grams.values())) # reduce memory usage
    if max_fre == 0:
        print ext_print("Data error! please check!")
        return
    else:
        print ext_print("Total number is %d" % len(Goole_grams))
    lemmatizer = WordNetLemmatizer()

    #betas = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
    # betas = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35]
    betas = [0.5]
    m, t, p = 0, 0, 0
    for beta in betas:
        # read candidate words
        for fin3_each in fin3.split(";"):
            candidate_words = ufile.read_csv(
                fin3_each)  # a specific file or a directory

            ranked_result = []
            for i in xrange(len(orig_texts)):
                text = orig_texts[i]
                can_words = ast.literal_eval(
                    candidate_words[i][2])  # parse string to array
                words = text[1].lower().split()
                target_word = text[2].split('.')[0]
                # print target_word
                if (target_word in words) and len(words) > 1:
                    candiate_ngrams, temp_ngrams = [], find_ngrams(
                        words, n)  # get ngram candidates
                    for ngram in temp_ngrams:
                        if target_word in ngram:
                            candiate_ngrams.append(
                                (ngram, ngram.index(target_word)))
                    ranks = {}
                    for can_word in can_words:
                        # each can_word is a (word, score) pair; split it into the word and its score
                        can_word, can_word_value, fre_can_word, max_context = can_word[0], float(can_word[1]), 0.0, 0.0
                        lemma_can_word = lemmatizer.lemmatize(can_word)
                        for (ngram, k) in candiate_ngrams:  # k is the position of target_word in the ngram
                            lst = list(ngram)
                            le_lst = list(ngram)
                            lst[k] = can_word
                            can_context = ' '.join(lst)  # replace target_word with the candidate word
                            le_lst[k] = lemma_can_word
                            le_context = ' '.join(le_lst)  # same context with the lemmatized candidate
                            t += 1
                            if can_context in Goole_grams:
                                m += 1
                                fre_can_word = float(Goole_grams[can_context])
                                max_context = max(max_context, fre_can_word)
                            elif le_context in Goole_grams:
                                p += 1
                                fre_can_word = float(Goole_grams[le_context])
                                max_context = max(max_context, fre_can_word)
                        # change strategies for calculating 1gram, 2gram, 3gram, or their combination
                        ranks[can_word] = (
                            1 - beta) * can_word_value + beta * math.sqrt(
                                max_context / float(max_fre))
                    sorted_ranks = sorted(ranks.items(),
                                          key=operator.itemgetter(1),
                                          reverse=True)  # sort by rank value
                    ranked_result.append((text[0], text[2], sorted_ranks))

                    # print ranked_result

                else:
                    ranked_result.append((text[0], text[2], can_words))
            # get output data directory
            fout1 = fin3_each.replace(
                ".csv", "_Rank" + str(n) + "gram+" + str(beta) + ".csv")
            ufile.write_csv(fout1, ranked_result)
            print ext_print('saved result into: %s' % fout1)

    return True
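
# Worked example (toy numbers, assumptions) of the ranking formula used above: the
# original candidate score is interpolated with a normalized Google 1T context frequency.
import math

beta = 0.5
can_word_value = 0.72   # score of the candidate from the earlier ranking step
max_context = 4.0e6     # best n-gram frequency found for this candidate in context
max_fre = 1.0e8         # largest frequency seen in the loaded Google 1T slice

rank = (1 - beta) * can_word_value + beta * math.sqrt(max_context / float(max_fre))
print(round(rank, 4))  # 0.46
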
Example #11
def extract_variables (fdin, ffea, ffea2, var):
    # read input data
    if fdin is None or fdin =="": return False
    trials = ufile.read_csv (fdin)
    if trials is None or len(trials) <= 0:
        print ext_print ('input data error, please check either no such file or no data --- interrupting')
        return False
    print ext_print ('found a total of %d data items' % len(trials))
    
    # read feature list - domain knowledge
    if ffea is None or ffea =="": return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items (ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]
    elif var in fea_dict_dk:
        features = {var:fea_dict_dk[var]}
    for key, value in fea_dict_dk.iteritems():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '': feature_dict_dk[name.strip()] =key

    # read feature list - UMLS (can be replaced by full UMLS)
    if ffea2 is None or ffea2 =="": return False
    fea_dict_umls = ufile.read_csv_as_dict (ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False

    #load numeric feature list
    Valx_core.init_features()

    output = []
    for i in xrange(len(trials)):
        if i%1000 == 0:
            print ('processing %d' % i)
        # pre-processing eligibility criteria text
        text = Valx_core.preprocessing(trials[i][1]) # trials[i][1] is the eligibility criteria text
        (sections_num, candidates_num) = Valx_core.extract_candidates_numeric(text) # extract candidates containing numeric features
        for j in xrange(len(candidates_num)): # for each candidate
            exp_text = Valx_core.formalize_expressions(candidates_num[j]) # identify and formalize values
            (exp_text, key_ngrams) = Valx_core.identify_variable(exp_text, feature_dict_dk, fea_dict_umls) # identify variable mentions and map them to names
            (variables, vars_values) = Valx_core.associate_variable_values(exp_text)
            all_exps = []
            for k in xrange(len(variables)):
                curr_var = variables[k]
                curr_exps = vars_values[k]
                if curr_var in features:
                    fea_list = features[curr_var]
                    curr_exps = Valx_core.context_validation(curr_exps, fea_list[1], fea_list[2])                           
                    curr_exps = Valx_core.normalization(fea_list[3], curr_exps) # unit conversion and value normalization
                    curr_exps = Valx_core.hr_validation (curr_exps, float(fea_list[4]), float(fea_list[5])) # heuristic rule-based validation
                if len(curr_exps) > 0:
                    if var == "All" or var.lower() == curr_var.lower() or var.lower() in curr_var.lower(): all_exps += curr_exps                     
                 
            if len(all_exps) > 0: output.append((trials[i][0], sections_num[j], candidates_num[j], exp_text, str(all_exps).replace("u'", "'"))) # output result

    # output result
    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv (fout, output)
    print ext_print ('saved processed results into: %s' % fout)
    return True
def compare_all(fin1, fdin2):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    fin_files = fin1.split(';')

    # read input data
    if fdin2 is None or fdin2 == "":
        return False
    words_sims = ufile.read_csv_as_dict(fdin2, 0,
                                        2)  # a specific file or a directory
    output, output_performance = [], []
    output.append(("ID", "Sentence", "Target word", "By Gold", "By system"))
    for fin_file in fin_files:
        texts = ufile.read_csv(fin_file)  # a specific file or a directory
        final_golds, final_system = [], []
        for text in texts:
            key = text[0]
            sentence = text[1]  # get all sentences
            target_word = text[2]
            golds = {}  # gold word
            gold_temps = text[3].split(';')
            for gold_temp in gold_temps:
                tems = gold_temp.split(':')
                golds[tems[0]] = int(tems[1])
            final_golds.append(golds)  # all gold sets form one list; each target word's golds are one element
            if key not in words_sims:
                exit("No key in processed similarity file!")
            wordnet_result = ast.literal_eval(words_sims[key])
            final_system.append(wordnet_result[:])
            output.append(
                (key, sentence, target_word, golds, wordnet_result[:]))
        #print final_golds
        output.append(())
        # ===========evaluation
        output_performance.append(("=====Accuracy@N=======", ))
        for N in xrange(10):
            num_correct = 0
            for i in xrange(len(final_golds)):
                gold = final_golds[i]  # dictionary
                sys = final_system[i]  # array
                for j in xrange(len(sys)):
                    if j > N:
                        break
                    if sys[j][0] in gold:  # sys = "finally:0.2"
                        num_correct += 1
                        break

            accuracy = round(num_correct / float(len(final_golds)), 3)
            print("Accuracy@" + str(N + 1), accuracy,
                  "%d of %d are correct" % (num_correct, len(final_golds)))
            output_performance.append(
                ("Accuracy@" + str(N + 1), accuracy,
                 "%d of %d are correct" % (num_correct, len(final_golds))))

        output_performance.append(("=====best P&R=======", ))
        fenzi, num_resp, = 0.0, 0
        for i in xrange(len(final_golds)):
            gold = final_golds[i]  # dictionary
            sys = final_system[i]  # candidate word list for each target word
            if len(sys) > 0:
                num_resp += 1  # number of target words that have at least one candidate
                best_sys = sys[0][0]
                if best_sys in gold:  # sys = "finally:0.2"
                    fenzi += float(gold[best_sys]) / sum(gold.values())
        print("best P fenmu is %d,fenzi is %f" % (num_resp, fenzi))
        P = round(fenzi / float(num_resp), 3)
        R = round(fenzi / float(len(final_golds)), 3)
        output_performance.append(("Best Precision", P))
        output_performance.append(("Best Recall", R))
        output_performance.append(("Best F1", F1(P, R)))

        output_performance.append(("=====oot P&R=======", ))
        fenzi, num_resp, = 0.0, 0
        for i in xrange(len(final_golds)):
            gold = final_golds[i]  # dictionary
            sys = final_system[i]  # array
            if len(sys) > 0:
                num_resp += 1
                for each_sys in sys:
                    if each_sys[0] in gold:  # each_sys = "finally:0.2"
                        fenzi += float(gold[each_sys[0]]) / sum(gold.values())
        print("Oot P fenmu is %d,fenzi is %f" % (num_resp, fenzi))
        P = round(fenzi / float(num_resp), 3)
        R = round(fenzi / float(len(final_golds)), 3)
        output_performance.append(("oot Precision", P))
        output_performance.append(("oot Recall", R))
        output_performance.append(("oot F1", F1(P, R)))
        output_performance.append(())
        output_performance.append(("=====Candidates generation rate=======", ))
        rate = round(num_resp / float(len(final_golds)), 3)
        print rate
        output_performance.append(("Candidates generation rate", rate))
    output.extend(output_performance)
    # get output data directory
    fout = fdin2.replace(".csv", "_Evaluation.csv")
    ufile.write_csv(fout, output)
    print 'saved result into: %s' % fout
    return True
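
# Toy example (assumed gold/system data) of the Accuracy@N computation above: an item
# counts as correct at N if any of its top-N ranked candidates appears in its gold set.
final_golds = [{"car": 3, "auto": 1}, {"quick": 2}]                  # gold word -> annotator count
final_system = [[("vehicle", 0.9), ("car", 0.8)], [("fast", 0.7)]]   # ranked candidates per item

N = 1  # Accuracy@2 uses candidate indices 0..N
num_correct = 0
for gold, sys_cands in zip(final_golds, final_system):
    if any(w in gold for w, _ in sys_cands[:N + 1]):
        num_correct += 1
print(round(num_correct / float(len(final_golds)), 3))  # 0.5
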
Example #13
def compare_all(fin1, fdin2, fdin3):
    # read input data
    if fin1 is None or fin1 =="":
        return False
    texts = ufile.read_csv(fin1) # a specific file or a directory
    # read input data
    if fdin2 is None or fdin2 =="":
        return False
    EDBlist = ufile.load_files (fdin2) # a specific file or a directory
    # read input data
    if fdin3 is None or fdin3 =="":
        return False
    FreCorpus = ufile.read_file_dict_tokenized(fdin3, '\t')
    
    result = []
    words_sims = {}
    cur = 0
    for text in texts:
        cur += 1
        if len(text[2].split('.')) > 1:
            target_word, pos = text[2].split('.')[0], text[2].split('.')[1]
        else:
            target_word, pos = text[2], None
        print "%d of %d" % (cur, len(texts)), target_word
        simi_values = []
        if target_word not in words_sims:
            processed = []
            processed.append(target_word)
            # step 1 ============== 
            can_words =[]
            syn = wordnet.synsets(target_word)
            if len(syn) > 0:
                for l in syn[0].lemmas():
                    if l.name() not in can_words:
                        can_words.append(l.name())
            word_fre = {}
            for word_each in can_words:
                if word_each in EDBlist and word_each not in processed:
                    word_each_fre = 0 
                    if (word_each in FreCorpus):
                        word_each_fre = int(FreCorpus[word_each])
                    word_fre[word_each] = word_each_fre
                    processed.append(word_each)
            word_fre = sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True) # sort by rank value
            simi_values.extend(word_fre)
            # step 2 ==============  
            can_words =[]
            syn = wordnet.synsets(target_word)
            if len(syn) > 0:
                syn_word = syn[0].hypernyms()
                for l in syn_word:
                    if (l.pos() in ['v', 'n', 'a']):
                        for k in l.lemmas():
                            if k.name() not in can_words:
                                can_words.append(k.name())
            word_fre = {}
            for word_each in can_words:
                if word_each in EDBlist and word_each not in processed:
                    word_each_fre = 0 
                    if (word_each in FreCorpus):
                        word_each_fre = int(FreCorpus[word_each])
                    word_fre[word_each] = word_each_fre
                    processed.append(word_each)
            word_fre = sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True) # sort by rank value
            simi_values.extend(word_fre)
            # step 3 ==============  
            can_words =[]
            for syn in wordnet.synsets(target_word):
                for l in syn.lemmas():
                    if l.name() not in can_words:
                        can_words.append(l.name())
            word_fre = {}
            for word_each in can_words:
                if word_each in EDBlist and word_each not in processed:
                    word_each_fre = 0 
                    if (word_each in FreCorpus):
                        word_each_fre = int(FreCorpus[word_each])
                    word_fre[word_each] = word_each_fre
                    processed.append(word_each)
            word_fre = sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True) # sort by rank value
            simi_values.extend(word_fre)
            # step 4 ==============  
            can_words =[]
            for syn in wordnet.synsets(target_word):
                syn_word = syn.hypernyms()
                for l in syn_word:
                    if (l.pos() in ['v', 'n', 'a']):
                        for k in l.lemmas():
                            if k.name() not in can_words:
                                can_words.append(k.name())
            word_fre = {}
            for word_each in can_words:
                if word_each in EDBlist and word_each not in processed:
                    word_each_fre = 0 
                    if (word_each in FreCorpus):
                        word_each_fre = int(FreCorpus[word_each])
                    word_fre[word_each] = word_each_fre
                    processed.append(word_each)
            word_fre = sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True) # sort by rank value
            simi_values.extend(word_fre)                  
            #=================================
            words_sims[target_word] = simi_values
            print simi_values[:2]
        else:
            simi_values = words_sims[target_word]
        result.append((text[0], text[2], simi_values))
       
    # output result
    fout = os.path.splitext(fin1)[0] + "_4steps.csv"
    ufile.write_csv(fout, result)
    print 'saved result into: %s' % fout
    
    print ext_print ('all tasks completed\n')
    return True
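
# Small sketch of the WordNet lookups the four steps above are built on (requires the
# NLTK 'wordnet' corpus; the word 'begin' is a toy assumption): steps 1 and 3 collect
# lemma names of the synsets, steps 2 and 4 add lemmas of their hypernyms.
from nltk.corpus import wordnet

syns = wordnet.synsets('begin')
synonyms = [l.name() for l in syns[0].lemmas()]
hypernyms = [k.name() for h in syns[0].hypernyms() if h.pos() in ['v', 'n', 'a']
             for k in h.lemmas()]
print(synonyms, hypernyms)
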
Example #14
def compare_all(fin1, fin2, fin3, fout1=None):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    orig_texts = ufile.read_csv(fin1, '\t')  # a specific file or a directory
    # read candidate words
    if fin2 is None or fin2 == "":
        return False
    candidate_words = ufile.read_file_tokenized(
        fin2, '\t')  # a specific file or a directory
    candidates = {}
    for candidate in candidate_words:
        if candidate[0] not in candidates:
            if len(candidate) > 1:
                candidates[candidate[0]] = candidate[1]
            else:
                candidates[candidate[0]] = ""
    #print candidates
    # read Google 1T corpus
    if fin3 is None or fin3 == "":
        return False
    GooleCorpus = {}
    fid = open(fin3, 'r')
    for line in fid:
        line = line.strip().lower()
        if len(line) > 0:
            tem = line.split('\t')
            if tem[0] not in GooleCorpus:
                GooleCorpus[tem[0]] = tem[1]
    #print GooleCorpus
    fid.close()
    # main program running
    ranked_result = []
    for text in orig_texts:
        print text
        sentence = text[1]  # get all sentences
        target_word = text[2].split(".")[0]
        #print target_word
        # get compact context window
        can_phrases = sentence.lower().split()
        words = []
        if target_word in can_phrases:
            can_phrases.remove(target_word)
            for word in can_phrases:
                if word_checking_stop(word) == 0:
                    words.append(word)
        # vector of target_word is words
        ranks = {}
        for fin2_each in fin2.split(";"):
            test_data = ufile.read_csv(
                fin2_each)  # a specific file or a directory
            for i in xrange(len(test_data)):
                can_words = ast.literal_eval(test_data[i][2])
        print can_words
        #can_words = candidates[target_word].strip(',').split(',')
        for can_word in can_words:
            context_weights, can_weights = 0, 0
            #context_weights = []
            #can_weights = []  # for each can_word, get a vector
            can_word = can_word[0]
            fre_can_word = 1
            if can_word in GooleCorpus:
                fre_can_word = GooleCorpus[can_word]  # frequency of the candidate word
            #print fre_can_word
            fre_both = 1  # avoid x/0 problem
            for word in words:
                for key, value in GooleCorpus.items():
                    tems = key.split(' ')
                    if can_word in tems and word in tems:
                        fre_both += int(value)
                context_weights = 1
                can_weights = (float(fre_both) / float(fre_can_word) / 3.0)
            print context_weights, can_weights
            ranks[can_word] = cosine_distance(context_weights, can_weights)
            # print can_word, can_weights, ranks[can_word]

        sorted_ranks = sorted(ranks.items(),
                              key=operator.itemgetter(1),
                              reverse=True)  # sort by rank value
        print sorted_ranks
        sorted_rank_str = ""
        for sorted_item in sorted_ranks:
            sorted_rank_str += sorted_item[0] + ":" + str(sorted_item[1]) + ";"
        ranked_result.append((text[0], text[2], sorted_ranks[:]))

    # get output data directory
    fout1 = os.path.splitext(fin2)[0] + "_ranked.csv"
    ufile.write_csv(fout1, ranked_result)
    print 'saved result into: %s' % fout1
Example #15
def compare_all(fin1, fdin2, method, threasholds):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    texts = ufile.read_csv(fin1)  # a specific file or a directory
    # read input data
    if fdin2 is None or fdin2 == "":
        return False
    EDBlist = ufile.load_files(fdin2)  # a specific file or a directory

    threasholds = threasholds.split(';')

    # filter out the target word itself and words that share its stem or lemma
    porter_stemmer = PorterStemmer()
    wnl = WordNetLemmatizer()
    gold, fre = [], []
    for threashold in threasholds:
        result = []
        words_sims = {}
        start_time = time.time()
        cur = 0
        for text in texts:
            cur += 1
            for i in range(len(text[3].split(";"))):
                fre.append(text[3].split(";")[i].split(":")[1])
                gold.append(text[3].split(";")[i].split(":")[0])
            if len(text[2].split('.')) > 1:
                target_word, pos = text[2].split('.')[0], text[2].split('.')[1]
                stemming_tw = porter_stemmer.stem(target_word)
                lemma_tw = wordnet.morphy(target_word, pos=pos)
                #lemma_tw = wnl.lemmatize(target_word, pos)
                print(lemma_tw)

            else:
                target_word, pos = text[2], None
                stemming_tw = porter_stemmer.stem(target_word)
                lemma_tw = wordnet.morphy(target_word)
                #lemma_tw = wnl.lemmatize(target_word, pos)

            print("%d of %d" % (cur, len(texts)), target_word)
            simi_values = []

            if target_word not in words_sims:
                word_sim = {}
                for word2 in EDBlist:
                    stemming_cw = porter_stemmer.stem(word2)
                    lemma_word = wordnet.morphy(word2)
                    if word2 not in word_sim:
                        #if target_word !=word2:
                        if target_word != word2 and stemming_cw != stemming_tw and lemma_word != lemma_tw:
                            # simi_value=compare_allsynsets(method, target_word, word2, TWpos, SYNpos, pos)
                            # simi_value = compare_allsynsets(method, target_word, word2, TWpos, pos)
                            # simi_value = compare_allsynsets(method, target_word, word2, SYNpos)
                            simi_value = compare_allsynsets(
                                method, target_word, word2)
                            if simi_value > float(threashold):
                                word_sim[word2] = round(float(simi_value), 3)
                simi_values = sorted(word_sim.items(),
                                     key=operator.itemgetter(1),
                                     reverse=True)  # sort by rank value
                words_sims[target_word] = simi_values
            else:
                simi_values = words_sims[target_word]
            result.append((text[0], text[2], simi_values))
        print("--- %s seconds ---" % (time.time() - start_time))
        # output result
        fout = os.path.splitext(fin1)[0] + "_%s_%s.csv" % (method, threashold)
        # if SYNpos:
        #     fout = fout.replace(".csv", "_SYNpos.csv")
        # if TWpos:
        #     fout = fout.replace(".csv", "_TWpos.csv")
        ufile.write_csv(fout, result)
        print('saved result into: %s' % fout)

    print(ext_print('all tasks completed\n'))
    return True
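
# compare_allsynsets above is project-specific; as a rough stand-in (an assumption,
# not the original implementation), a maximum pairwise WordNet similarity between two
# words can be computed like this and then compared against the threshold.
from nltk.corpus import wordnet

def max_synset_similarity(w1, w2):
    best = 0.0
    for s1 in wordnet.synsets(w1):
        for s2 in wordnet.synsets(w2):
            sim = s1.path_similarity(s2)
            if sim is not None and sim > best:
                best = sim
    return best

print(max_synset_similarity('car', 'automobile'))  # 1.0, since they share a synset
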