def temporal_processing(fin, fout=None, type="testing", fin_t=None, rep_enable=False, rep_word="", event=False, X3=False):
    """Train temporal patterns from a text file, or apply patterns to it.

    fin        -- path of the input text file (read with ufile.read_file).
    fout       -- output path; when None or "" a name is derived from fin
                  ("_pat" suffix for training, "_TEXer" suffix or
                  "_TEXer.xml" for testing).
    type       -- "training" or "testing"; any other value skips both
                  branches and the function still reports completion.
    fin_t      -- path of the pattern file, required for testing.
    rep_enable, rep_word, event -- forwarded unchanged to temporal_testing.
    X3         -- when true, the testing result is passed through
                  using_TimeX3 and the default output name ends in ".xml".

    Returns True on completion and None when the run is interrupted
    because an input is missing or empty.
    """
    # ---- load the input text
    if fin is None:
        print(ext_print('no input file found --- interrupting'))
        return
    texts = ufile.read_file(fin, 1, False)
    if texts is None or len(texts) == 0:
        print(ext_print('no text available for processing --- interrupting'))
        return
    print(ext_print('start to process temporal information in text file %s' % fin))

    if type == "training":
        patterns = temporal_training(texts)

        # ---- write the patterns, ordered by descending value
        if fout is None or fout == "":
            stem, ext = os.path.splitext(fin)
            fout = stem + "_pat" + ext
        ranked = sorted(patterns, key=patterns.get, reverse=True)
        ufile.write_file(fout, ranked, False)
        print(ext_print('saved trained patterns into: %s' % fout))

    elif type == "testing":
        # ---- load the pattern data
        if fin_t is None:
            print(ext_print('no pattern file found --- interrupting'))
            return
        patterns = ufile.read_file(fin_t, 1, False)
        if patterns is None or len(patterns) == 0:
            print(ext_print('no patterns available for processing --- interrupting'))
            return

        result = temporal_testing(texts, patterns, rep_enable, rep_word, event)
        if X3:
            result = using_TimeX3(result)

        # ---- write the processed text
        if fout is None or fout == "":
            stem, ext = os.path.splitext(fin)
            fout = stem + ("_TEXer.xml" if X3 else "_TEXer" + ext)
        ufile.write_file(fout, result, False)
        print(ext_print('saved processed results into: %s' % fout))

    print(ext_print('all tasks completed\n'))
    return True
def file_merge(fdin, fout, columns, format):
    """Collect selected columns from the input rows and write them out.

    fdin    -- input file or directory, read with ufile.read.
    fout    -- output path; when None it is derived from fdin by appending
               "_merged" plus format.
    columns -- "all" to keep every row unchanged, or 1-based column numbers
               separated by '|' (e.g. "2" or "1|3").  The selected cells of
               every row are appended to one flat result list.
    format  -- ".csv" writes with ufile.write_csv; every other value
               (including "" and ".txt") writes with ufile.write_file.

    Returns False when fdin is None or "", True otherwise.
    """
    # read input data
    if fdin is None or fdin == "":
        return False
    texts = ufile.read(fdin)  # a specific file or a directory
    print(texts)

    if columns == "all":
        result = texts
    else:
        # The original called int() on the split list itself for a single
        # column, which raised TypeError; convert each entry instead.
        indexes = [int(col) - 1 for col in columns.split('|')]
        result = [text[idx] for text in texts for idx in indexes]
    print(ext_print('get %d in total' % len(result)))

    # get output data directory
    if fout is None:
        fout = os.path.splitext(fdin)[0] + "_merged" + format

    # output detailed result into file
    # The original condition `format == "" or ".txt"` was always true, so
    # the csv writer could never be reached.
    if format == ".csv":
        ufile.write_csv(fout, result)
    else:
        ufile.write_file(fout, result, False)
    print(ext_print('saved result into: %s' % fout))
    print(ext_print('all tasks completed\n'))
    return True
def Extract_nonGT(fdin, fout, fin_, fout_, c):
    """Run GAXer_Ggender over the input and also collect non-'Transgender' rows.

    fdin  -- input path; ".txt" and ".csv" inputs are handled differently.
    fout  -- output path for the gender results; when None or "" it is
             derived from fdin ("_gender.txt" / "_gender.csv").
    fin_  -- (csv only) reference table; rows whose first column appears in
             it are excluded from the secondary output.
    fout_ -- (csv only) output path for the secondary rows.
    c     -- (csv only) processing stops once this many secondary rows
             have been collected.

    For csv input each row is expected to have at least six columns:
    column 0 is used as the identifier, column 1 as the label, columns 2-4
    and the part of column 5 before 'exclusi' as the text to analyse.

    Returns False when fdin is None or "", None when the input cannot be
    loaded or is empty, and True otherwise.
    """
    #----------------------------------initialize and load supporting data
    if fdin is None or fdin == "":
        return False

    if fdin.endswith(".txt"):
        all_texts = ufile.load_files(fdin)
        if all_texts is None or len(all_texts) <= 0:
            print(ext_print('input data error, please check either no such file or no data --- interrupting'))
            return
        print(ext_print('found a total of %d data items' % len(all_texts)))

        output = [GAXer_Ggender(text.lower()) for text in all_texts]

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.txt"
        ufile.write_file(fout, output, False)

    elif fdin.endswith(".csv"):
        all_texts = ufile.load_files(fdin)  # a specific file or a directory
        all_texts_ = ufile.load_files(fin_)  # a specific file or a directory
        if all_texts is None or len(all_texts) <= 0:
            print(ext_print('input data error, please check either no such file or no data --- interrupting'))
            return
        print(ext_print('found a total of %d data items' % len(all_texts)))

        output = []
        output_ = []
        cnt = 0
        # A set gives constant-time membership tests; an unreadable
        # reference table is treated as empty instead of crashing.
        known_ids = set(row[0] for row in (all_texts_ or []))

        for i, texts in enumerate(all_texts):
            if i % 1000 == 0:
                print(ext_print('processing %d' % i))

            # Keep only the text before the exclusion section.  partition()
            # returns the whole string when the marker is absent, whereas
            # slicing with find() == -1 silently dropped the last character.
            inclusive = texts[5].lower().partition('exclusi')[0]
            combine_texts = (texts[2].lower() + ". " + texts[3].lower() +
                             ". " + texts[4].lower() + ". " + inclusive)
            # [:1] instead of [0] so an empty label does not raise IndexError.
            pre_label = 'Biological ' + texts[1][:1].upper() + texts[1][1:]
            result = GAXer_Ggender(combine_texts, pre_label)

            if 'Transgender' not in str(result) and texts[0] not in known_ids:
                output_.append(tuple(texts[k] for k in range(6)))
                cnt += 1
            if cnt == c:
                break

            # Skip rows with no result, or whose only result is the label
            # that was passed in.
            if len(result) == 0 or (len(texts[1]) > 0 and len(result) == 1
                                    and pre_label in result):
                continue
            ident = str(texts[0].replace('"', ''))
            output.append((ident, texts[1], str(result)))

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.csv"
        ufile.write_csv(fout, output)
        ufile.write_csv(fout_, output_)

    print(ext_print('saved processed results into: %s' % fout))
    print(ext_print('all tasks completed\n'))
    return True
def GAXer_wrapper(fdin, fout=None):
    """Run GAXer_Ggender over every item of a ".txt" or ".csv" input.

    fdin -- input path; ".txt" and ".csv" inputs are handled differently.
    fout -- output path; when None or "" it is derived from fdin
            ("_gender.txt" / "_gender.csv").

    For ".txt" input every loaded text is lower-cased, analysed and the
    results are written with ufile.write_file.  For ".csv" input each row
    is expected to have at least six columns: column 0 is used as the
    identifier, column 1 as the label, and columns 3, 4 plus the part of
    column 5 before 'exclusi' as the text to analyse.  Rows with an empty
    result are skipped; the rest are written with ufile.write_csv as
    (identifier, label, str(result)).

    Returns False when fdin is None or "", None when the input cannot be
    loaded or is empty, and True otherwise.
    """
    #----------------------------------initialize and load supporting data
    if fdin is None or fdin == "":
        return False

    if fdin.endswith(".txt"):
        all_texts = ufile.load_files(fdin)
        if all_texts is None or len(all_texts) <= 0:
            print(ext_print('input data error, please check either no such file or no data --- interrupting'))
            return
        print(ext_print('found a total of %d data items' % len(all_texts)))

        output = [GAXer_Ggender(text.lower()) for text in all_texts]

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.txt"
        ufile.write_file(fout, output, False)

    elif fdin.endswith(".csv"):
        all_texts = ufile.load_files(fdin)  # a specific file or a directory
        if all_texts is None or len(all_texts) <= 0:
            print(ext_print('input data error, please check either no such file or no data --- interrupting'))
            return
        print(ext_print('found a total of %d data items' % len(all_texts)))

        output = []
        for i, texts in enumerate(all_texts):
            if i % 1000 == 0:
                print(ext_print('processing %d' % i))

            # Keep only the text before the exclusion section.  partition()
            # returns the whole string when the marker is absent, whereas
            # slicing with find() == -1 silently dropped the last character.
            inclusive = texts[5].lower().partition('exclusi')[0]
            combine_texts = (texts[3].lower() + ". " + texts[4].lower() +
                             ". " + inclusive)
            # [:1] instead of [0] so an empty label does not raise IndexError.
            pre_label = 'Biological ' + texts[1][:1].upper() + texts[1][1:]
            result = GAXer_Ggender(combine_texts, pre_label)

            if len(result) == 0:
                continue
            ident = str(texts[0].replace('"', ''))
            output.append((ident, texts[1], str(result)))

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.csv"
        ufile.write_csv(fout, output)

    print(ext_print('saved processed results into: %s' % fout))
    print(ext_print('all tasks completed\n'))
    return True
def compare_all(fin1, fdin2, fout1=None):
    """Build a context-word list and filter corpus files down to it.

    fin1  -- tab-separated input read with ufile.read_file_tokenized; each
             row holds a sentence (column 0) and a target word (column 1).
    fdin2 -- directory walked with os.walk; every file found is filtered
             line by line.
    fout1 -- output path of the word list; when None it is derived from
             fin1 with the suffix "_wordFeatures.csv".

    Step 1: for each row, the first phrase of the sentence that contains
    the target word supplies the context words; those for which
    word_checking_stop returns 0 are added (once) to the word list, which
    is written with ufile.write_csv.

    Step 2: for each file under fdin2, a lower-cased line is kept when one
    of the space-separated tokens of its first tab-separated field is in
    the word list; scanning of a line stops early at the first token for
    which word_checking_speical returns a positive value.  The kept lines
    are written to fdin2 + "_" + filename.

    Returns False when fin1 or fdin2 is None or "", True otherwise.
    """
    # read input data
    if fin1 is None or fin1 == "":
        return False
    texts = ufile.read_file_tokenized(fin1, '\t')  # a specific file or a directory

    word_list = []      # keeps first-seen order for the output file
    known = set()       # same words, for constant-time membership tests
    for text in texts:
        sentence = text[0]
        target_word = text[1]
        # get compact context window: only the first phrase that contains
        # the target word is used
        for phrase in NLP_sent.phrase_splitting(sentence):
            all_words = NLP_word.word_splitting(phrase.lower())
            if target_word not in all_words:
                continue
            all_words.remove(target_word)
            for word in all_words:
                if word_checking_stop(word) == 0 and word not in known:
                    known.add(word)
                    word_list.append(word)
            break

    # get output data directory
    # The original stored the default in `fout`, so write_csv received None
    # and the message below raised NameError whenever fout1 was supplied.
    if fout1 is None:
        fout1 = os.path.splitext(fin1)[0] + "_wordFeatures.csv"
    ufile.write_csv(fout1, word_list)
    print('saved result into: %s' % fout1)

    # read 1T corpus data
    if fdin2 is None or fdin2 == "":
        return False

    # judge a single file or a directory
    for root, _dirs, files in os.walk(fdin2):
        for filename in files:
            path = os.path.join(root, filename)
            print(path)
            New1T = []
            # `with` closes the file even if processing raises
            with open(path, 'r') as fid:
                for cur, line in enumerate(fid, 1):
                    if cur % 1000000 == 0:
                        print('%s %s' % (filename, cur))
                    line = line.strip().lower()
                    if len(line) == 0:
                        continue
                    for tem_word in line.split('\t')[0].split(' '):
                        if word_checking_speical(tem_word) > 0:
                            break
                        if tem_word in known:
                            New1T.append(line)
                            break

            # get output data directory
            fout2 = fdin2 + "_" + filename
            ufile.write_file(fout2, New1T, False)
            print('saved result into: %s' % fout2)

    print('all tasks completed\n')
    return True