import ast
import operator
import os
import time

from nltk.corpus import wordnet
from nltk.stem import PorterStemmer

# Note: 'ufile', 'ext_print', 'GAXer_Ggender', and 'compare_allsynsets' are
# assumed to be provided by project-local modules in this repository.


def compare_all(fin1, fdin2):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    texts = ufile.read_csv(fin1)  # a specific file or a directory

    # read input data
    if fdin2 is None or fdin2 == "":
        return False
    EDBlist = ufile.load_files(fdin2)  # a specific file or a directory

    result = []
    for text in texts:
        result_items_new = []
        result_items = ast.literal_eval(text[2])
        for result_item in result_items:
            # keep only the items whose word appears in the EDB word list
            if result_item[0] in EDBlist:
                result_items_new.append(result_item)
        result.append((text[0], text[1], str(result_items_new)))

    # output result
    fout = os.path.splitext(fin1)[0] + "_EDB.csv"
    ufile.write_csv(fout, result)
    print 'saved result into: %s' % fout
    print ext_print('all tasks completed\n')
    return True
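
# Example usage (a sketch; the paths below are hypothetical placeholders, not
# files shipped with this repository):
#   compare_all('matched_results.csv', 'EDB_wordlist.txt')
# keeps only the result items whose word appears in the EDB list and writes
# them to 'matched_results_EDB.csv'.
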
def Extract_nonGT(fdin, fout, fin_, fout_, c):
    # ---------------------------------- initialize and load supporting data
    # read input data
    all_texts = []
    if fdin is None or fdin == "":
        return False
    elif fdin.endswith(".txt"):
        all_texts = ufile.load_files(fdin)
        if all_texts is None or len(all_texts) <= 0:
            print ext_print('input data error, please check either no such file or no data --- interrupting')
            return
        print ext_print('found a total of %d data items' % len(all_texts))

        output = []
        for text in all_texts:
            text = text.lower()
            output.append(GAXer_Ggender(text))

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.txt"
        ufile.write_file(fout, output, False)
    elif fdin.endswith(".csv"):
        all_texts = ufile.load_files(fdin)  # a specific file or a directory
        all_texts_ = ufile.load_files(fin_)  # a specific file or a directory
        if all_texts is None or len(all_texts) <= 0:
            print ext_print('input data error, please check either no such file or no data --- interrupting')
            return
        print ext_print('found a total of %d data items' % len(all_texts))

        output = []
        output_ = []
        i = 0
        cnt = 0
        # IDs that already appear in the gold-standard file
        goldList = {}
        for t in all_texts_:
            goldList[t[0]] = 1

        for texts in all_texts:
            if i % 1000 == 0:
                print ext_print('processing %d' % i)
            i += 1
            # keep only the inclusion criteria (text before 'exclusi...')
            inclusive = texts[5].lower()
            cut = inclusive.find('exclusi')
            if cut >= 0:
                inclusive = inclusive[:cut]
            combine_texts = texts[2].lower() + ". " + texts[3].lower() + ". " + texts[4].lower() + ". " + inclusive
            pre_label = 'Biological ' + texts[1][0].upper() + texts[1][1:]
            result = GAXer_Ggender(combine_texts, pre_label)

            # collect up to c records that are neither labeled Transgender nor
            # already covered by the gold standard
            if 'Transgender' not in str(result) and texts[0] not in goldList:
                output_.append((texts[0], texts[1], texts[2], texts[3], texts[4], texts[5]))
                cnt += 1
                if cnt == c:
                    break

            if len(result) == 0 or (len(texts[1]) > 0 and len(result) == 1 and pre_label in result):
                continue
            t = str(texts[0]).replace('"', '')
            output.append((t, texts[1], str(result)))

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.csv"
        ufile.write_csv(fout, output)
        ufile.write_csv(fout_, output_)
        print ext_print('saved processed results into: %s' % fout)

    print ext_print('all tasks completed\n')
    return True
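
# Example usage (a sketch; paths and the sample count are hypothetical):
#   Extract_nonGT('trials.csv', None, 'gold_standard.csv', 'non_gt_sample.csv', 100)
# collects up to 100 non-Transgender records that are absent from the gold
# standard into 'non_gt_sample.csv', and writes the remaining gender
# predictions to 'trials_gender.csv' (the default output name).
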
def GAXer_wrapper(fdin, fout=None):
    # ---------------------------------- initialize and load supporting data
    # read input data
    all_texts = []
    if fdin is None or fdin == "":
        return False
    elif fdin.endswith(".txt"):
        all_texts = ufile.load_files(fdin)
        if all_texts is None or len(all_texts) <= 0:
            print ext_print('input data error, please check either no such file or no data --- interrupting')
            return
        print ext_print('found a total of %d data items' % len(all_texts))

        output = []
        for text in all_texts:
            text = text.lower()
            output.append(GAXer_Ggender(text))

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.txt"
        ufile.write_file(fout, output, False)
    elif fdin.endswith(".csv"):
        all_texts = ufile.load_files(fdin)  # a specific file or a directory
        if all_texts is None or len(all_texts) <= 0:
            print ext_print('input data error, please check either no such file or no data --- interrupting')
            return
        print ext_print('found a total of %d data items' % len(all_texts))

        output = []
        i = 0
        for texts in all_texts:
            if i % 1000 == 0:
                print ext_print('processing %d' % i)
            i += 1
            # keep only the inclusion criteria (text before 'exclusi...')
            inclusive = texts[5].lower()
            cut = inclusive.find('exclusi')
            if cut >= 0:
                inclusive = inclusive[:cut]
            # unlike Extract_nonGT, texts[2] is left out of the combined text here
            combine_texts = texts[3].lower() + ". " + texts[4].lower() + ". " + inclusive
            pre_label = 'Biological ' + texts[1][0].upper() + texts[1][1:]
            result = GAXer_Ggender(combine_texts, pre_label)
            if len(result) == 0:
                continue
            t = str(texts[0]).replace('"', '')
            output.append((t, texts[1], str(result)))

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.csv"
        ufile.write_csv(fout, output)
        print ext_print('saved processed results into: %s' % fout)

    print ext_print('all tasks completed\n')
    return True
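
# Example usage (a sketch; the input path is a hypothetical placeholder):
#   GAXer_wrapper('trials.csv')
# with no fout given, predictions are written to 'trials_gender.csv'.
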
def compare_all(fin1, fdin2, fdin3):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    texts = ufile.read_csv(fin1)  # a specific file or a directory

    # read input data
    if fdin2 is None or fdin2 == "":
        return False
    EDBlist = ufile.load_files(fdin2)  # a specific file or a directory

    # read input data
    if fdin3 is None or fdin3 == "":
        return False
    FreCorpus = ufile.read_file_dict_tokenized(fdin3, '\t')

    def rank_by_frequency(can_words, processed):
        # keep candidates that are in the EDB list and not yet processed,
        # then sort them by corpus frequency (highest first)
        word_fre = {}
        for word_each in can_words:
            if word_each in EDBlist and word_each not in processed:
                word_each_fre = 0
                if word_each in FreCorpus:
                    word_each_fre = int(FreCorpus[word_each])
                word_fre[word_each] = word_each_fre
                processed.append(word_each)
        return sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True)

    result = []
    words_sims = {}
    cur = 0
    for text in texts:
        cur += 1
        if len(text[2].split('.')) > 1:
            target_word, pos = text[2].split('.')[0], text[2].split('.')[1]
        else:
            target_word, pos = text[2], None
        print "%d of %d" % (cur, len(texts)), target_word

        simi_values = []
        if target_word not in words_sims:
            processed = [target_word]

            # step 1: synonyms of the first (most frequent) synset
            can_words = []
            syn = wordnet.synsets(target_word)
            if len(syn) > 0:
                for l in syn[0].lemmas():
                    if l.name() not in can_words:
                        can_words.append(l.name())
            simi_values.extend(rank_by_frequency(can_words, processed))

            # step 2: hypernyms of the first synset
            can_words = []
            if len(syn) > 0:
                for l in syn[0].hypernyms():
                    if l.pos() in ['v', 'n', 'a']:
                        for k in l.lemmas():
                            if k.name() not in can_words:
                                can_words.append(k.name())
            simi_values.extend(rank_by_frequency(can_words, processed))

            # step 3: synonyms of all synsets
            can_words = []
            for syn in wordnet.synsets(target_word):
                for l in syn.lemmas():
                    if l.name() not in can_words:
                        can_words.append(l.name())
            simi_values.extend(rank_by_frequency(can_words, processed))

            # step 4: hypernyms of all synsets
            can_words = []
            for syn in wordnet.synsets(target_word):
                for l in syn.hypernyms():
                    if l.pos() in ['v', 'n', 'a']:
                        for k in l.lemmas():
                            if k.name() not in can_words:
                                can_words.append(k.name())
            simi_values.extend(rank_by_frequency(can_words, processed))

            words_sims[target_word] = simi_values
            print simi_values[:2]
        else:
            simi_values = words_sims[target_word]
        result.append((text[0], text[2], simi_values))

    # output result
    fout = os.path.splitext(fin1)[0] + "_4steps.csv"
    ufile.write_csv(fout, result)
    print 'saved result into: %s' % fout
    print ext_print('all tasks completed\n')
    return True
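
# Example usage (a sketch; all three paths are hypothetical placeholders):
#   compare_all('targets.csv', 'EDB_wordlist.txt', 'word_frequencies.tsv')
# expands each target word through the four WordNet steps (first-synset
# synonyms, first-synset hypernyms, all-synset synonyms, all-synset hypernyms)
# and writes the frequency-ranked candidates to 'targets_4steps.csv'.
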
def compare_all(fin1, fdin2, method, thresholds):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    texts = ufile.read_csv(fin1)  # a specific file or a directory

    # read input data
    if fdin2 is None or fdin2 == "":
        return False
    EDBlist = ufile.load_files(fdin2)  # a specific file or a directory

    thresholds = thresholds.split(';')

    # used to filter out candidates that share a stem or lemma with the target word
    porter_stemmer = PorterStemmer()
    gold, fre = [], []
    for threshold in thresholds:
        result = []
        words_sims = {}
        start_time = time.time()
        cur = 0
        for text in texts:
            cur += 1
            # collect the gold words and their frequencies
            # (kept for inspection; not used further in this function)
            for pair in text[3].split(";"):
                gold.append(pair.split(":")[0])
                fre.append(pair.split(":")[1])
            if len(text[2].split('.')) > 1:
                target_word, pos = text[2].split('.')[0], text[2].split('.')[1]
                lemma_tw = wordnet.morphy(target_word, pos=pos)
            else:
                target_word, pos = text[2], None
                lemma_tw = wordnet.morphy(target_word)
            stemming_tw = porter_stemmer.stem(target_word)
            print "%d of %d" % (cur, len(texts)), target_word

            simi_values = []
            if target_word not in words_sims:
                word_sim = {}
                for word2 in EDBlist:
                    stemming_cw = porter_stemmer.stem(word2)
                    lemma_word = wordnet.morphy(word2)
                    if word2 not in word_sim:
                        # skip the target word itself and any same-stem or same-lemma variant
                        if target_word != word2 and stemming_cw != stemming_tw and lemma_word != lemma_tw:
                            simi_value = compare_allsynsets(method, target_word, word2)
                            if simi_value > float(threshold):
                                word_sim[word2] = round(float(simi_value), 3)
                simi_values = sorted(word_sim.items(), key=operator.itemgetter(1), reverse=True)  # sort by similarity value
                words_sims[target_word] = simi_values
            else:
                simi_values = words_sims[target_word]
            result.append((text[0], text[2], simi_values))
        print "--- %s seconds ---" % (time.time() - start_time)

        # output one result file per threshold
        fout = os.path.splitext(fin1)[0] + "_%s_%s.csv" % (method, threshold)
        ufile.write_csv(fout, result)
        print 'saved result into: %s' % fout

    print ext_print('all tasks completed\n')
    return True
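
# Example usage (a sketch; the paths, similarity-method name, and threshold
# values are hypothetical placeholders -- 'method' must be one accepted by the
# project's compare_allsynsets):
#   compare_all('targets.csv', 'EDB_wordlist.txt', 'wup', '0.8;0.9')
# writes one CSV per threshold, e.g. 'targets_wup_0.8.csv' and
# 'targets_wup_0.9.csv'.
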