def compare_all(fin1, fdin2):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    texts = ufile.read_csv(fin1)  # a specific file or a directory

    # read input data
    if fdin2 is None or fdin2 == "":
        return False
    EDBlist = ufile.load_files(fdin2)  # a specific file or a directory

    result = []
    cur = 0
    for text in texts:
        cur += 1
        result_items_new = []
        result_items = ast.literal_eval(text[2])
        for result_item in result_items:
            # keep only candidates that appear in the EDB word list
            if result_item[0] in EDBlist:
                result_items_new.append(result_item)
        result.append((text[0], text[1], str(result_items_new)))

    # output result
    fout = os.path.splitext(fin1)[0] + "_EDB.csv"
    ufile.write_csv(fout, result)
    print 'saved result into: %s' % fout
    print ext_print('all tasks completed\n')
    return True
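# NOTE: ext_print is used throughout this file but defined elsewhere in the
# project (e.g. in a shared utility module). A minimal sketch, assuming it only
# prefixes the message with a timestamp before it is printed; the real helper
# may format messages differently.
import datetime

def ext_print(message):
    # return the message prefixed with the current time
    return "[%s] %s" % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), message)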
def extract_variables(fdin, ffea, ffea2, var, cores):
    # read input dataset
    if fdin is None or fdin == "":
        return False
    trials = ufile.read_csv(fdin)
    if trials is None or len(trials) <= 0:
        print(ext_print('input data error, please check either no such file or no data --- interrupting'))
        return False
    print(ext_print('found a total of %d data items' % len(trials)))

    # read feature list - domain knowledge
    if ffea is None or ffea == "":
        return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items(ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print(ext_print('no feature data available --- interrupting'))
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]
    elif var in fea_dict_dk:
        features = {var: fea_dict_dk[var]}
    for key, value in fea_dict_dk.items():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '':
                feature_dict_dk[name.strip()] = key

    # read feature list - UMLS
    if ffea2 is None or ffea2 == "":
        return False
    fea_dict_umls = ufile.read_csv_as_dict(ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print(ext_print('no feature data available --- interrupting'))
        return False

    # split the trials evenly across worker processes
    output = Manager().list()
    jobs = []
    for i in range(1, cores + 1):
        t = Process(target=worker,
                    args=(trials, len(trials) * (i - 1) // cores, len(trials) * i // cores - 1,
                          var, features, feature_dict_dk, fea_dict_umls, output))
        jobs.append(t)
        t.start()
    for j in jobs:
        j.join()

    # output result
    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv(fout, output)
    print(ext_print('saved processed results into: %s' % fout))
    return True
def compare_all(fin1, fin2, fout1=None):
    # merge two ranked candidate lists using a weighted linear combination (beta)
    if fin1 is None or fin1 == "":
        return False
    texts1 = ufile.read_csv(fin1)  # a specific file or a directory
    texts2 = ufile.read_csv(fin2)

    ranked_result = []
    beta = 0.55
    for i in range(len(texts1)):
        can_words1 = ast.literal_eval(texts1[i][2])
        can_words2 = ast.literal_eval(texts2[i][2])

        can1 = dict(can_words1)
        for key, value in can1.items():
            can1[key] = round(value * float(beta), 20)
        can2 = dict(can_words2)
        for key, value in can2.items():
            can2[key] = round(value * float(1 - beta), 20)

        # words found in both lists get the sum of the two weighted scores;
        # words found in only one list are halved
        for k, v in list(can2.items()):
            if k in can1:
                can1[k] = round(can1[k] + v, 20)
                can2.pop(k)
            else:
                can2[k] = can2[k] / 2
        for k2, v2 in can1.items():
            if k2 not in can2:
                can1[k2] = can1[k2] / 2
        can1.update(can2)

        sorted_ranks = sorted(can1.items(), key=operator.itemgetter(1), reverse=True)
        ranked_result.append((texts1[i][0], texts1[i][1], sorted_ranks))

    fout = os.path.splitext(fin1)[0] + "_" + str(beta) + "_merged.csv"
    ufile.write_csv(fout, ranked_result)
    print('saved result into: %s' % fout)
    return True
def extract_variables(fdin, ffea, ffea2, var, cores):
    # read input dataset
    if fdin is None or fdin == "":
        return False
    trials = ufile.read_csv(fdin)
    if trials is None or len(trials) <= 0:
        print ext_print('input data error, please check either no such file or no data --- interrupting')
        return False
    print ext_print('found a total of %d data items' % len(trials))

    # read feature list - domain knowledge
    if ffea is None or ffea == "":
        return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items(ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print ext_print('no feature data available --- interrupting')
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]
    elif var in fea_dict_dk:
        features = {var: fea_dict_dk[var]}
    for key, value in fea_dict_dk.iteritems():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '':
                feature_dict_dk[name.strip()] = key

    # read feature list - UMLS
    if ffea2 is None or ffea2 == "":
        return False
    fea_dict_umls = ufile.read_csv_as_dict(ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print ext_print('no feature data available --- interrupting')
        return False

    # split the trials evenly across worker processes
    output = Manager().list()
    jobs = []
    for i in xrange(1, cores + 1):
        t = Process(target=worker,
                    args=(trials, len(trials) * (i - 1) / cores, len(trials) * i / cores - 1,
                          var, features, feature_dict_dk, fea_dict_umls, output))
        jobs.append(t)
        t.start()
    for j in jobs:
        j.join()

    # output result
    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv(fout, output)
    print ext_print('saved processed results into: %s' % fout)
    return True
def compare_all(fin1):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    texts = ufile.read_csv(fin1)  # a specific file or a directory

    result = []
    start_time = time.time()
    cur = 0
    for text in texts:
        simi_valuesList = []
        cur += 1
        if len(text[1].split('.')) > 1:
            target_word, pos = text[1].split('.')[0], text[1].split('.')[1]
        else:
            target_word, pos = text[1], None
        print "%d of %d" % (cur, len(texts)), target_word

        # score every candidate word by its embedding similarity to the target word
        candidatewords = ast.literal_eval(text[2])
        for candidate in candidatewords:
            word2 = candidate[0]
            try:
                simi_value = gensim_model.similarity(target_word, word2)
            except KeyError:
                simi_value = 0
            simi_valuesList.append((word2, round(float(simi_value), 5)))

        simi_valuesList.sort(key=operator.itemgetter(1), reverse=True)  # sort by similarity value
        result.append((text[0], text[1], simi_valuesList[:30]))

    print("--- %s seconds ---" % (time.time() - start_time))

    # output result
    fout = os.path.splitext(fin1)[0] + "_rank.csv"
    ufile.write_csv(fout, result)
    print 'saved result into: %s' % fout
    print ext_print('all tasks completed\n')
    return True
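# NOTE: gensim_model is used by compare_all above but never initialized in this
# file; it is assumed to be a pre-trained word-embedding model loaded elsewhere.
# A minimal sketch of how such a model is typically loaded with gensim's
# KeyedVectors; the file name below is a placeholder, not a resource shipped
# with this project.
from gensim.models import KeyedVectors

gensim_model = KeyedVectors.load_word2vec_format('path/to/embeddings.bin', binary=True)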
def POS_tagging(fdin, fout=None):
    # read input data
    if fdin is None or fdin == "":
        return False
    texts = ufile.read_csv(fdin)  # a specific file or a directory

    result = []
    for text in texts:
        sentence = text[1].lower()
        print text[0]
        target_word = text[2]
        if len(target_word.split('.')) == 1:
            # tag the sentence and append a coarse POS suffix (n/v/r/a) to the target word
            pos_tags = nltk.pos_tag(nltk.word_tokenize(sentence))
            for tag in pos_tags:
                if target_word in tag:
                    if tag[1] in ['NN', 'NNS', 'NNP', 'NNPS']:
                        target_word += ".n"
                    elif tag[1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
                        target_word += ".v"
                    elif tag[1] in ['RB', 'RBR', 'RBS', 'WRB']:
                        target_word += ".r"
                    elif tag[1] in ['JJ', 'JJR', 'JJS']:
                        target_word += ".a"
                    print target_word
                    break
        result.append((text[0], text[1], target_word, text[3]))

    # get output data directory
    if fout is None:
        fout = fdin.replace('.csv', '_pos.csv')
    ufile.write_csv(fout, result)
    print 'saved result into: %s' % fout
    return True
def N_fold(fin1):
    if fin1 is None or fin1 == "":
        return False
    orig_texts = ufile.read_csv(fin1)

    # split the data into a training set and a test set (fixed seed for reproducibility)
    train, test = train_test_split(orig_texts, test_size=0.666, random_state=1)

    print "Train_length:", len(train)
    ufile.write_csv('E:\\Simplify\\_Results/train_set.csv', train)
    print "Test_length:", len(test)
    ufile.write_csv('E:\\Simplify\\_Results/test_set.csv', test)
    # (a validation split and a KFold-based split were also experimented with here)
    return True
def Insert_DB(fin):
    for root, dirs, files in os.walk(fin):
        for f in files:
            if not f.endswith(".csv"):
                continue
            print ext_print(f)
            # read input data
            fdin = os.path.join(root, f)
            rows = ufile.read_csv(fdin)
            for row in rows:
                # row = (PMID, JournalTitle, PubDate, ArticleTitle, Abstract, Keywords)
                PubDate = row[2]
                if PubDate != '2009':  # to split tasks across different machines with MySQL, process tables separately
                    continue
                table = "article_" + PubDate
                # (alternative: bucket articles into year-range tables such as article_0-1950 or article_1951-2000)
                param = (row[0], row[1], PubDate, row[3], row[4], row[5])
                sql = ("INSERT INTO `" + table + "` (`PMID`, `JournalTitle`, `PubDate`, `ArticleTitle`, "
                       "`Abstract`, `Keywords`) VALUES(%s, %s, %s, %s, %s, %s);")
                msg = db.execute(sql, param)
                if msg != 1:
                    print msg
    print ext_print('all tasks completed\n')
    return True
def extract_variables(fdin, ffea, ffea2, var):
    # read input data
    if fdin is None or fdin == "":
        return False
    trials = ufile.read_csv(fdin)
    if trials is None or len(trials) <= 0:
        print(ext_print('input data error, please check either no such file or no data --- interrupting'))
        return False
    print(ext_print('found a total of %d data items' % len(trials)))

    # read feature list - domain knowledge
    if ffea is None or ffea == "":
        return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items(ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print(ext_print('no feature data available --- interrupting'))
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]
    elif var in fea_dict_dk:
        features = {var: fea_dict_dk[var]}
    for key, value in fea_dict_dk.items():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '':
                feature_dict_dk[name.strip()] = key

    # read feature list - UMLS (can be replaced by full UMLS)
    if ffea2 is None or ffea2 == "":
        return False
    fea_dict_umls = ufile.read_csv_as_dict(ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print(ext_print('no feature data available --- interrupting'))
        return False

    # load numeric feature list
    Valx_core.init_features()

    output = []
    for i in range(len(trials)):
        if i % 1000 == 0:
            print('processing %d' % i)
        # pre-process the eligibility criteria text (trials[i][1])
        text = Valx_core.preprocessing(trials[i][1])
        # extract candidate sentences containing numeric features
        (sections_num, candidates_num) = Valx_core.extract_candidates_numeric(text)
        for j in range(len(candidates_num)):  # for each candidate
            # identify and formalize values
            exp_text = Valx_core.formalize_expressions(candidates_num[j])
            # identify variable mentions and map them to names
            (exp_text, key_ngrams) = Valx_core.identify_variable(exp_text, feature_dict_dk, fea_dict_umls)
            # associate each variable with its values
            (variables, vars_values) = Valx_core.associate_variable_values(exp_text)
            all_exps = []
            for k in range(len(variables)):
                curr_var = variables[k]
                curr_exps = vars_values[k]
                if curr_var in features:
                    fea_list = features[curr_var]
                    curr_exps = Valx_core.context_validation(curr_exps, fea_list[1], fea_list[2])
                    # unit conversion and value normalization
                    curr_exps = Valx_core.normalization(fea_list[3], curr_exps)
                    # heuristic rule-based validation
                    curr_exps = Valx_core.hr_validation(curr_exps, float(fea_list[4]), float(fea_list[5]))
                if len(curr_exps) > 0:
                    if var == "All" or var.lower() == curr_var.lower() or var.lower() in curr_var.lower():
                        all_exps += curr_exps
            if len(all_exps) > 0:
                output.append((trials[i][0], sections_num[j], candidates_num[j], exp_text,
                               str(all_exps).replace("u'", "'")))  # output result

    # output result
    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv(fout, output)
    print(ext_print('saved processed results into: %s' % fout))
    return True
def compare_all(fin1, fin2, n, fin3, fout1=None):
    # collect words related to the target words first, so that only the relevant
    # part of the Google 1T corpus needs to be kept in memory
    if fin1 is None or fin1 == "":
        return False
    orig_texts = ufile.read_csv(fin1)  # a specific file or a directory

    Related_words = {}
    for text in orig_texts:
        target_word = text[2].split('.')[0]
        words = text[1].lower().split()  # sentence words
        if (target_word in words) and len(words) > 1:
            temp_ngrams = find_ngrams(words, n)  # get all n-gram candidates of the sentence
            for ngram in temp_ngrams:
                if target_word in ngram:  # keep words that co-occur with the target word
                    for te in ngram:
                        if te != target_word:
                            Related_words[te] = 1
    print ext_print("Identified all related words")

    # collect candidate words, again to reduce the size of the loaded corpus in memory
    if fin3 is None or fin3 == "":
        return False
    candidate_words = {}
    for fin3_each in fin3.split(";"):
        test_data = ufile.read_csv(fin3_each)  # a specific file or a directory
        for i in range(len(test_data)):
            can_words = ast.literal_eval(test_data[i][2])  # parse string to array
            for can_word in can_words:
                if can_word[0] not in candidate_words:
                    candidate_words[can_word[0]] = 1
    print ext_print("Identified all candidate words")

    # read Google 1T corpus
    print ext_print("start to load Google 1T grams")
    Goole_grams, count, max_fre, c1, c2 = {}, 0, 0, 0, 0
    if fin2 is None or fin2 == "":
        return False
    fid = open(fin2, 'r')
    for line in fid:
        line = line.lower()
        count += 1
        if count % 10000000 == 0:
            print count
        if len(line) > 0:
            tem = line.split('\t')
            if len(tem) == 1:
                c1 += 1
            if len(tem) > 1:
                c2 += 1
                temws = tem[0].split()
                # keep an n-gram only if it contains both a candidate word and a related word (reduces memory usage)
                find_candidate, find_related = False, False
                for temw in temws:
                    if temw in candidate_words:
                        find_candidate = True
                    elif temw in Related_words:
                        find_related = True
                if find_candidate and find_related:
                    Goole_grams[tem[0]] = tem[1]
                    if long(tem[1]) > max_fre:  # track the maximum frequency to avoid a later sorting pass
                        max_fre = long(tem[1])
    fid.close()
    print count
    print("c1=%d, c2=%d" % (c1, c2))
    print ext_print("all files loaded")

    if max_fre == 0:
        print ext_print("Data error! please check!")
        return
    else:
        print ext_print("Total number is %d" % len(Goole_grams))

    lemmatizer = WordNetLemmatizer()
    betas = [0.5]  # weight between the original candidate score and the n-gram frequency score
    m, t, p = 0, 0, 0
    for beta in betas:
        # read candidate words
        for fin3_each in fin3.split(";"):
            candidate_words = ufile.read_csv(fin3_each)  # a specific file or a directory
            ranked_result = []
            for i in xrange(len(orig_texts)):
                text = orig_texts[i]
                can_words = ast.literal_eval(candidate_words[i][2])  # parse string to array
                words = text[1].lower().split()
                target_word = text[2].split('.')[0]
                if (target_word in words) and len(words) > 1:
                    # collect the sentence n-grams that contain the target word, with its position
                    candiate_ngrams, temp_ngrams = [], find_ngrams(words, n)
                    for ngram in temp_ngrams:
                        if target_word in ngram:
                            candiate_ngrams.append((ngram, ngram.index(target_word)))
                    ranks = {}
                    for can_word in can_words:
                        # can_word[0] is the candidate word, can_word[1] its original score
                        can_word, can_word_value, fre_can_word, max_context = can_word[0], float(can_word[1]), 0.0, 0.0
                        lemma_can_word = lemmatizer.lemmatize(can_word)
                        for (ngram, k) in candiate_ngrams:  # k is the position of the target word
                            lst = list(ngram)
                            le_lst = list(ngram)
                            lst[k] = can_word
                            can_context = ' '.join(lst)  # context with the candidate word substituted for the target word
                            le_lst[k] = lemma_can_word
                            le_context = ' '.join(le_lst)  # the same context with the lemmatized candidate
                            t += 1
                            if can_context in Goole_grams:
                                m += 1
                                fre_can_word = float(Goole_grams[can_context])
                                max_context = max(max_context, fre_can_word)
                            elif le_context in Goole_grams:
                                p += 1
                                fre_can_word = float(Goole_grams[le_context])
                                max_context = max(max_context, fre_can_word)
                        # change strategies for calculating 1-gram, 2-gram, 3-gram, or their combination
                        ranks[can_word] = (1 - beta) * can_word_value + beta * math.sqrt(max_context / float(max_fre))
                    sorted_ranks = sorted(ranks.items(), key=operator.itemgetter(1), reverse=True)  # sort by rank value
                    ranked_result.append((text[0], text[2], sorted_ranks))
                else:
                    ranked_result.append((text[0], text[2], can_words))
            # get output data directory
            fout1 = fin3_each.replace(".csv", "_Rank" + str(n) + "gram+" + str(beta) + ".csv")
            ufile.write_csv(fout1, ranked_result)
            print ext_print('saved result into: %s' % fout1)
    return True
def extract_variables(fdin, ffea, ffea2, var):
    # read input data
    if fdin is None or fdin == "":
        return False
    trials = ufile.read_csv(fdin)
    if trials is None or len(trials) <= 0:
        print ext_print('input data error, please check either no such file or no data --- interrupting')
        return False
    print ext_print('found a total of %d data items' % len(trials))

    # read feature list - domain knowledge
    if ffea is None or ffea == "":
        return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items(ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print ext_print('no feature data available --- interrupting')
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]
    elif var in fea_dict_dk:
        features = {var: fea_dict_dk[var]}
    for key, value in fea_dict_dk.iteritems():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '':
                feature_dict_dk[name.strip()] = key

    # read feature list - UMLS (can be replaced by full UMLS)
    if ffea2 is None or ffea2 == "":
        return False
    fea_dict_umls = ufile.read_csv_as_dict(ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print ext_print('no feature data available --- interrupting')
        return False

    # load numeric feature list
    Valx_core.init_features()

    output = []
    for i in xrange(len(trials)):
        if i % 1000 == 0:
            print 'processing %d' % i
        # pre-processing eligibility criteria text (trials[i][1])
        text = Valx_core.preprocessing(trials[i][1])
        # extract candidates containing numeric features
        (sections_num, candidates_num) = Valx_core.extract_candidates_numeric(text)
        for j in xrange(len(candidates_num)):  # for each candidate
            # identify and formalize values
            exp_text = Valx_core.formalize_expressions(candidates_num[j])
            # identify variable mentions and map them to names
            (exp_text, key_ngrams) = Valx_core.identify_variable(exp_text, feature_dict_dk, fea_dict_umls)
            # associate each variable with its values
            (variables, vars_values) = Valx_core.associate_variable_values(exp_text)
            all_exps = []
            for k in xrange(len(variables)):
                curr_var = variables[k]
                curr_exps = vars_values[k]
                if curr_var in features:
                    fea_list = features[curr_var]
                    curr_exps = Valx_core.context_validation(curr_exps, fea_list[1], fea_list[2])
                    # unit conversion and value normalization
                    curr_exps = Valx_core.normalization(fea_list[3], curr_exps)
                    # heuristic rule-based validation
                    curr_exps = Valx_core.hr_validation(curr_exps, float(fea_list[4]), float(fea_list[5]))
                if len(curr_exps) > 0:
                    if var == "All" or var.lower() == curr_var.lower() or var.lower() in curr_var.lower():
                        all_exps += curr_exps
            if len(all_exps) > 0:
                output.append((trials[i][0], sections_num[j], candidates_num[j], exp_text,
                               str(all_exps).replace("u'", "'")))  # output result

    # output result
    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv(fout, output)
    print ext_print('saved processed results into: %s' % fout)
    return True
def compare_all(fin1, fdin2):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    fin_files = fin1.split(';')

    # read input data
    if fdin2 is None or fdin2 == "":
        return False
    words_sims = ufile.read_csv_as_dict(fdin2, 0, 2)  # a specific file or a directory

    output, output_performance = [], []
    output.append(("ID", "Sentence", "Target word", "By Gold", "By system"))
    for fin_file in fin_files:
        texts = ufile.read_csv(fin_file)  # a specific file or a directory
        final_golds, final_system = [], []
        for text in texts:
            key = text[0]
            sentence = text[1]  # get all sentences
            target_word = text[2]
            golds = {}  # gold substitutions with their annotation counts
            gold_temps = text[3].split(';')
            for gold_temp in gold_temps:
                tems = gold_temp.split(':')
                golds[tems[0]] = int(tems[1])
            final_golds.append(golds)  # all golds form one list; the golds of each target word are one element of it
            if key not in words_sims:
                exit("No key in processed similarity file!")
            wordnet_result = ast.literal_eval(words_sims[key])
            final_system.append(wordnet_result[:])
            output.append((key, sentence, target_word, golds, wordnet_result[:]))
        output.append(())

        # =========== evaluation
        output_performance.append(("=====Accuracy@N=======",))
        for N in xrange(10):
            num_correct = 0
            for i in xrange(len(final_golds)):
                gold = final_golds[i]  # dictionary
                sys = final_system[i]  # array
                for j in xrange(len(sys)):
                    if j > N:
                        break
                    if sys[j][0] in gold:  # e.g. sys item = ("finally", 0.2)
                        num_correct += 1
                        break
            accuracy = round(num_correct / float(len(final_golds)), 3)
            print "Accuracy@" + str(N + 1), accuracy, "%d of %d are correct" % (num_correct, len(final_golds))
            output_performance.append(("Accuracy@" + str(N + 1), accuracy,
                                       "%d of %d are correct" % (num_correct, len(final_golds))))

        output_performance.append(("=====best P&R=======",))
        fenzi, num_resp = 0.0, 0  # fenzi: numerator; num_resp: number of target words with candidates
        for i in xrange(len(final_golds)):
            gold = final_golds[i]  # dictionary
            sys = final_system[i]  # candidate list of each target word
            if len(sys) > 0:
                num_resp += 1
                best_sys = sys[0][0]
                if best_sys in gold:
                    fenzi += float(gold[best_sys]) / sum(gold.values())
        print "best P: denominator is %d, numerator is %f" % (num_resp, fenzi)
        P = round(fenzi / float(num_resp), 3)
        R = round(fenzi / float(len(final_golds)), 3)
        output_performance.append(("Best Precision", P))
        output_performance.append(("Best Recall", R))
        output_performance.append(("Best F1", F1(P, R)))

        output_performance.append(("=====oot P&R=======",))
        fenzi, num_resp = 0.0, 0
        for i in xrange(len(final_golds)):
            gold = final_golds[i]  # dictionary
            sys = final_system[i]  # array
            if len(sys) > 0:
                num_resp += 1
                for each_sys in sys:
                    if each_sys[0] in gold:
                        fenzi += float(gold[each_sys[0]]) / sum(gold.values())
        print "oot P: denominator is %d, numerator is %f" % (num_resp, fenzi)
        P = round(fenzi / float(num_resp), 3)
        R = round(fenzi / float(len(final_golds)), 3)
        output_performance.append(("oot Precision", P))
        output_performance.append(("oot Recall", R))
        output_performance.append(("oot F1", F1(P, R)))
        output_performance.append(())

        output_performance.append(("=====Candidates generation rate=======",))
        rate = round(num_resp / float(len(final_golds)), 3)
        print rate
        output_performance.append(("Candidates generation rate", rate))

    output.extend(output_performance)
    # get output data directory
    fout = fdin2.replace(".csv", "_Evaluation.csv")
    ufile.write_csv(fout, output)
    print 'saved result into: %s' % fout
    return True
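# NOTE: F1 is called above but defined elsewhere in the project. A minimal
# sketch under the usual assumption that it is the harmonic mean of precision
# and recall.
def F1(P, R):
    if P + R == 0:
        return 0.0
    return round(2 * P * R / (P + R), 3)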
def compare_all(fin1, fdin2, fdin3):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    texts = ufile.read_csv(fin1)  # a specific file or a directory

    # read input data
    if fdin2 is None or fdin2 == "":
        return False
    EDBlist = ufile.load_files(fdin2)  # a specific file or a directory

    # read input data
    if fdin3 is None or fdin3 == "":
        return False
    FreCorpus = ufile.read_file_dict_tokenized(fdin3, '\t')

    def rank_by_frequency(can_words, processed):
        # keep candidates that are in the EDB list and not seen yet, then sort them by corpus frequency
        word_fre = {}
        for word_each in can_words:
            if word_each in EDBlist and word_each not in processed:
                word_each_fre = int(FreCorpus[word_each]) if word_each in FreCorpus else 0
                word_fre[word_each] = word_each_fre
                processed.append(word_each)
        return sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True)  # sort by frequency

    result = []
    words_sims = {}
    cur = 0
    for text in texts:
        cur += 1
        if len(text[2].split('.')) > 1:
            target_word, pos = text[2].split('.')[0], text[2].split('.')[1]
        else:
            target_word, pos = text[2], None
        print "%d of %d" % (cur, len(texts)), target_word

        simi_values = []
        if target_word not in words_sims:
            processed = [target_word]

            # step 1: synonyms of the first (most frequent) synset
            can_words = []
            syn = wordnet.synsets(target_word)
            if len(syn) > 0:
                for l in syn[0].lemmas():
                    if l.name() not in can_words:
                        can_words.append(l.name())
            simi_values.extend(rank_by_frequency(can_words, processed))

            # step 2: hypernyms of the first synset
            can_words = []
            if len(syn) > 0:
                for l in syn[0].hypernyms():
                    if l.pos() in ['v', 'n', 'a']:
                        for k in l.lemmas():
                            if k.name() not in can_words:
                                can_words.append(k.name())
            simi_values.extend(rank_by_frequency(can_words, processed))

            # step 3: synonyms of all synsets
            can_words = []
            for syn_each in wordnet.synsets(target_word):
                for l in syn_each.lemmas():
                    if l.name() not in can_words:
                        can_words.append(l.name())
            simi_values.extend(rank_by_frequency(can_words, processed))

            # step 4: hypernyms of all synsets
            can_words = []
            for syn_each in wordnet.synsets(target_word):
                for l in syn_each.hypernyms():
                    if l.pos() in ['v', 'n', 'a']:
                        for k in l.lemmas():
                            if k.name() not in can_words:
                                can_words.append(k.name())
            simi_values.extend(rank_by_frequency(can_words, processed))

            words_sims[target_word] = simi_values
        else:
            simi_values = words_sims[target_word]

        result.append((text[0], text[2], simi_values))

    # output result
    fout = os.path.splitext(fin1)[0] + "_4steps.csv"
    ufile.write_csv(fout, result)
    print 'saved result into: %s' % fout
    print ext_print('all tasks completed\n')
    return True
def compare_all(fin1, fin2, fin3, fout1=None):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    orig_texts = ufile.read_csv(fin1, '\t')  # a specific file or a directory

    # read candidate words
    if fin2 is None or fin2 == "":
        return False
    candidate_words = ufile.read_file_tokenized(fin2, '\t')  # a specific file or a directory
    candidates = {}
    for candidate in candidate_words:
        if candidate[0] not in candidates:
            if len(candidate) > 1:
                candidates[candidate[0]] = candidate[1]
            else:
                candidates[candidate[0]] = ""

    # read Google 1T corpus
    if fin3 is None or fin3 == "":
        return False
    GooleCorpus = {}
    fid = open(fin3, 'r')
    for line in fid:
        line = line.strip().lower()
        if len(line) > 0:
            tem = line.split('\t')
            if tem[0] not in GooleCorpus:
                GooleCorpus[tem[0]] = tem[1]
    fid.close()

    # main program running
    ranked_result = []
    for text in orig_texts:
        print text
        sentence = text[1]  # get all sentences
        target_word = text[2].split(".")[0]

        # get compact context window: the sentence's content words without the target word
        can_phrases = sentence.lower().split()
        words = []
        if target_word in can_phrases:
            can_phrases.remove(target_word)
        for word in can_phrases:
            if word_checking_stop(word) == 0:
                words.append(word)

        # `words` is the context vector of the target word
        ranks = {}
        for fin2_each in fin2.split(";"):
            test_data = ufile.read_csv(fin2_each)  # a specific file or a directory
            for i in xrange(len(test_data)):
                can_words = ast.literal_eval(test_data[i][2])
                for can_word in can_words:
                    # for each can_word, build its weights
                    context_weights, can_weights = 0, 0
                    can_word = can_word[0]
                    fre_can_word = 1
                    if can_word in GooleCorpus:
                        fre_can_word = GooleCorpus[can_word]  # frequency of the candidate word
                    fre_both = 1  # avoid division by zero
                    for word in words:
                        for key, value in GooleCorpus.items():
                            tems = key.split(' ')
                            if can_word in tems and word in tems:
                                fre_both += int(value)
                    context_weights = 1
                    can_weights = float(fre_both) / float(fre_can_word) / 3.0
                    ranks[can_word] = cosine_distance(context_weights, can_weights)

        sorted_ranks = sorted(ranks.items(), key=operator.itemgetter(1), reverse=True)  # sort by rank value
        sorted_rank_str = ""
        for sorted_item in sorted_ranks:
            sorted_rank_str += sorted_item[0] + ":" + str(sorted_item[1]) + ";"
        ranked_result.append((text[0], text[2], sorted_ranks[:]))

    # get output data directory
    fout1 = os.path.splitext(fin2)[0] + "_ranked.csv"
    ufile.write_csv(fout1, ranked_result)
    print 'saved result into: %s' % fout1
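# NOTE: word_checking_stop is called above but defined elsewhere in the project.
# A minimal sketch, assuming it returns 0 for ordinary content words and a
# non-zero code for stop words or punctuation (requires the NLTK stopwords
# corpus); the real helper may apply additional checks.
import string
from nltk.corpus import stopwords

def word_checking_stop(word):
    if not word or word[0] in string.punctuation:
        return 1
    if word in stopwords.words('english'):
        return 2
    return 0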
def compare_all(fin1, fdin2, method, threasholds):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    texts = ufile.read_csv(fin1)  # a specific file or a directory

    # read input data
    if fdin2 is None or fdin2 == "":
        return False
    EDBlist = ufile.load_files(fdin2)  # a specific file or a directory

    threasholds = threasholds.split(';')
    # used to filter out candidates that share a stem or lemma with the target word
    porter_stemmer = PorterStemmer()
    wnl = WordNetLemmatizer()
    gold, fre = [], []
    for threashold in threasholds:
        result = []
        words_sims = {}
        start_time = time.time()
        cur = 0
        for text in texts:
            cur += 1
            for i in range(len(text[3].split(";"))):
                fre.append(text[3].split(";")[i].split(":")[1])
                gold.append(text[3].split(";")[i].split(":")[0])
            if len(text[2].split('.')) > 1:
                target_word, pos = text[2].split('.')[0], text[2].split('.')[1]
                stemming_tw = porter_stemmer.stem(target_word)
                lemma_tw = wordnet.morphy(target_word, pos=pos)
            else:
                target_word, pos = text[2], None
                stemming_tw = porter_stemmer.stem(target_word)
                lemma_tw = wordnet.morphy(target_word)
            print "%d of %d" % (cur, len(texts)), target_word

            simi_values = []
            if target_word not in words_sims:
                word_sim = {}
                for word2 in EDBlist:
                    stemming_cw = porter_stemmer.stem(word2)
                    lemma_word = wordnet.morphy(word2)
                    if word2 not in word_sim:
                        # skip candidates identical to the target word or sharing its stem or lemma
                        if word2 != target_word and stemming_cw != stemming_tw and lemma_word != lemma_tw:
                            simi_value = compare_allsynsets(method, target_word, word2)
                            if simi_value > float(threashold):
                                word_sim[word2] = round(float(simi_value), 3)
                simi_values = sorted(word_sim.items(), key=operator.itemgetter(1), reverse=True)  # sort by similarity value
                words_sims[target_word] = simi_values
            else:
                simi_values = words_sims[target_word]
            result.append((text[0], text[2], simi_values))

        print "--- %s seconds ---" % (time.time() - start_time)

        # output result
        fout = os.path.splitext(fin1)[0] + "_%s_%s.csv" % (method, threashold)
        ufile.write_csv(fout, result)
        print 'saved result into: %s' % fout
    print ext_print('all tasks completed\n')
    return True
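# NOTE: compare_allsynsets is called above but defined elsewhere in the project.
# A minimal sketch, assuming it returns the highest WordNet similarity found
# over all synset pairs of the two words, with `method` selecting the measure;
# the method names handled here ("path", "wup") are assumptions.
from nltk.corpus import wordnet

def compare_allsynsets(method, word1, word2):
    best = 0.0
    for syn1 in wordnet.synsets(word1):
        for syn2 in wordnet.synsets(word2):
            if method == "path":
                sim = syn1.path_similarity(syn2)
            elif method == "wup":
                sim = syn1.wup_similarity(syn2)
            else:
                sim = None
            if sim is not None and sim > best:
                best = sim
    return best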