def txt2matrix_fortrain(ann_dir, mytrain, tag_included, filename, curpath):
    """Write tab-separated training feature rows for every text file in ann_dir.

    For each path returned by ``readfromdir.get_file_list(ann_dir, ['txt'])``
    the text file and its sibling ``<name>_new.ann`` file are read, merged by
    ``labeling.ann_tagging``, split into sentences with
    ``nltk.sent_tokenize`` and converted to parallel term/tag/index lists by
    ``t2c.txt2conll``.  One row per term is then written to ``mytrain``::

        term, st.stem(term), POS.pos_tagging value, label_umls_cui value,
        BrownClustering.bc_indexing value, index, tag

    An empty line is written after each sentence.  Progress messages go to
    standard output.

    NOTE: a second definition of ``txt2matrix_fortrain`` appears later in
    this file; being defined last, that one is what the module name refers
    to after import.

    NOTE(review): ``st`` and ``bc_index`` are not defined in this block and
    are read from module scope -- confirm they are initialised before use.

    Args:
        ann_dir: directory passed to ``readfromdir.get_file_list``.
        mytrain: object with a ``write`` method that receives the rows.
        tag_included: forwarded unchanged to ``labeling.ann_tagging``.
        filename: forwarded to ``umls_identify.formating_for_metamap``.
        curpath: forwarded to ``umls_identify.formating_for_metamap``.

    Raises:
        AttributeError: if a returned path does not contain ``.txt``.
    """
    txt_files = readfromdir.get_file_list(ann_dir, ['txt'])
    print("there's " + str(len(txt_files)) + " in total!")

    for file_no, txt_file in enumerate(txt_files, 1):
        # Read the raw text; the handle is closed as soon as it is consumed.
        with codecs.open(txt_file) as raw_handle:
            myraw = raw_handle.read()

        name = re.search(r'^(.*)\.txt', txt_file).group(1)
        ann_file = name + '_new.ann'
        print("reading file from %s %s ..." % (txt_file, ann_file))

        # The tagging result is used as a string below, so the annotation
        # handle is no longer needed once ann_tagging has returned.
        with codecs.open(ann_file, "r") as myann:
            text_tagged = labeling.ann_tagging(myann, myraw, tag_included)

        # Replace ';' and newlines by spaces.  str.split() takes a literal
        # separator, not a character class, so a regex substitution is needed.
        lines = re.sub(r'[;\n]', ' ', text_tagged)
        sents = nltk.sent_tokenize(lines)
        lines = " ### ".join(sents)

        # "1" marks training text with annotations; "0" would mark raw text.
        term_list, tag_list, index_list = t2c.txt2conll(lines, 1)
        sents = " ".join(term_list).split("###")

        # Extract UMLS concepts sentence by sentence.  ``j`` walks the flat
        # term/tag/index lists while ``sent_id`` indexes the per-sentence
        # POS and type lists.
        j = 0
        for sent in sents:
            if j >= len(term_list):
                break
            metamap_output = umls_identify.formating_for_metamap(
                curpath, sent, filename)
            one_sent_term, type_list = umls_identify.label_umls_cui(
                metamap_output, sent)
            pos_list = POS.pos_tagging(one_sent_term)
            # One extra trailing entry is appended to each per-sentence list.
            pos_list.append(".")
            type_list.append("O")

            for sent_id in range(len(sent.split())):
                # Step over the sentence separator in the flat term list.
                if term_list[j] == "###":
                    j += 1
                term = term_list[j]
                lemma = st.stem(term)
                bc = BrownClustering.bc_indexing(term.lower(), bc_index)
                row = "\t".join([
                    term,
                    lemma,
                    pos_list[sent_id],
                    type_list[sent_id],
                    bc,
                    index_list[j],
                    tag_list[j],
                ])
                mytrain.write(row + "\n")
                j += 1
            # Blank line terminates the sentence.
            mytrain.write("\n")

        if file_no % 5 == 0:
            print(str(file_no) + " files finished")
def txt2matrix_fortest(sent, crf_matrix, filename):
    """Write the feature rows for one unannotated sentence to crf_matrix.

    The right-stripped sentence is passed through
    ``umls_identify.formating_for_metamap`` and
    ``umls_identify.label_umls_cui``; the resulting terms are tagged with
    ``POS.pos_tagging``.  One tab-separated row per term is written::

        term, st.stem(term), POS value, type value, bc_indexing value

    followed by a single empty line.

    NOTE: a second definition of ``txt2matrix_fortest`` appears later in
    this file; being defined last, that one is what the module name refers
    to after import.

    NOTE(review): ``curpath``, ``st`` and ``bc_index`` are not parameters or
    locals here; they are read from module scope -- confirm they exist.

    Args:
        sent: the sentence text.
        crf_matrix: object with a ``write`` method that receives the rows.
        filename: forwarded to ``umls_identify.formating_for_metamap``.
    """
    stripped = sent.rstrip()
    metamap_result = umls_identify.formating_for_metamap(
        curpath, stripped, filename)
    tokens, type_labels = umls_identify.label_umls_cui(
        metamap_result, stripped)
    pos_tags = POS.pos_tagging(tokens)

    # One extra trailing entry is appended to each label list.
    pos_tags.append(".")
    type_labels.append("O")

    for position, token in enumerate(tokens):
        stem = st.stem(token)
        cluster = BrownClustering.bc_indexing(token.lower(), bc_index)
        row = "\t".join([
            token,
            stem,
            pos_tags[position],
            type_labels[position],
            cluster,
        ])
        crf_matrix.write(row + "\n")

    # Blank line terminates the sentence.
    crf_matrix.write("\n")
def txt2matrix_fortest(sent, crf_matrix, filename):
    """Write the feature rows for one unannotated sentence to crf_matrix.

    The right-stripped sentence is passed through
    ``umls_identify.formating_for_metamap`` and
    ``umls_identify.label_umls_cui``; the resulting terms are tagged with
    ``POS.pos_tagging``.  One tab-separated row per term is written::

        term, st.stem(term), POS value, type value, bc_indexing value

    followed by a single empty line.

    NOTE: an earlier definition of ``txt2matrix_fortest`` exists in this
    file; this later one is what the module name refers to after import.

    NOTE(review): ``curpath``, ``st`` and ``bc_index`` are not parameters or
    locals here; they are read from module scope -- confirm they exist.

    Args:
        sent: the sentence text.
        crf_matrix: object with a ``write`` method that receives the rows.
        filename: forwarded to ``umls_identify.formating_for_metamap``.
    """
    stripped = sent.rstrip()
    metamap_result = umls_identify.formating_for_metamap(
        curpath, stripped, filename)
    tokens, type_labels = umls_identify.label_umls_cui(
        metamap_result, stripped)
    pos_tags = POS.pos_tagging(tokens)

    # One extra trailing entry is appended to each label list.
    pos_tags.append(".")
    type_labels.append("O")

    for position, token in enumerate(tokens):
        stem = st.stem(token)
        cluster = BrownClustering.bc_indexing(token.lower(), bc_index)
        row = "\t".join([
            token,
            stem,
            pos_tags[position],
            type_labels[position],
            cluster,
        ])
        crf_matrix.write(row + "\n")

    # Blank line terminates the sentence.
    crf_matrix.write("\n")
def txt2matrix_fortrain(ann_dir, mytrain, tag_included, filename, curpath):
    """Write tab-separated training feature rows for every text file in ann_dir.

    For each path returned by ``readfromdir.get_file_list(ann_dir, ['txt'])``
    the text file and its sibling ``<name>_new.ann`` file are read, merged by
    ``labeling.ann_tagging``, split into sentences with
    ``nltk.sent_tokenize`` and converted to parallel term/tag/index lists by
    ``t2c.txt2conll``.  One row per term is then written to ``mytrain``::

        term, st.stem(term), POS.pos_tagging value, label_umls_cui value,
        BrownClustering.bc_indexing value, index, tag

    An empty line is written after each sentence.  Progress messages go to
    standard output.

    NOTE: an earlier definition of ``txt2matrix_fortrain`` exists in this
    file; this later one is what the module name refers to after import.

    NOTE(review): ``st`` and ``bc_index`` are not defined in this block and
    are read from module scope -- confirm they are initialised before use.

    Args:
        ann_dir: directory passed to ``readfromdir.get_file_list``.
        mytrain: object with a ``write`` method that receives the rows.
        tag_included: forwarded unchanged to ``labeling.ann_tagging``.
        filename: forwarded to ``umls_identify.formating_for_metamap``.
        curpath: forwarded to ``umls_identify.formating_for_metamap``.

    Raises:
        AttributeError: if a returned path does not contain ``.txt``.
    """
    txt_files = readfromdir.get_file_list(ann_dir, ['txt'])
    print("there's " + str(len(txt_files)) + " in total!")

    for file_no, txt_file in enumerate(txt_files, 1):
        # Read the raw text; the handle is closed as soon as it is consumed.
        with codecs.open(txt_file) as raw_handle:
            myraw = raw_handle.read()

        name = re.search(r'^(.*)\.txt', txt_file).group(1)
        ann_file = name + '_new.ann'
        print("reading file from %s %s ..." % (txt_file, ann_file))

        # The tagging result is used as a string below, so the annotation
        # handle is no longer needed once ann_tagging has returned.
        with codecs.open(ann_file, "r") as myann:
            text_tagged = labeling.ann_tagging(myann, myraw, tag_included)

        # Replace ';' and newlines by spaces.  str.split() takes a literal
        # separator, not a character class, so a regex substitution is needed.
        lines = re.sub(r'[;\n]', ' ', text_tagged)
        sents = nltk.sent_tokenize(lines)
        lines = " ### ".join(sents)

        # "1" marks training text with annotations; "0" would mark raw text.
        term_list, tag_list, index_list = t2c.txt2conll(lines, 1)
        sents = " ".join(term_list).split("###")

        # Extract UMLS concepts sentence by sentence.  ``j`` walks the flat
        # term/tag/index lists while ``sent_id`` indexes the per-sentence
        # POS and type lists.
        j = 0
        for sent in sents:
            if j >= len(term_list):
                break
            metamap_output = umls_identify.formating_for_metamap(
                curpath, sent, filename)
            one_sent_term, type_list = umls_identify.label_umls_cui(
                metamap_output, sent)
            pos_list = POS.pos_tagging(one_sent_term)
            # One extra trailing entry is appended to each per-sentence list.
            pos_list.append(".")
            type_list.append("O")

            for sent_id in range(len(sent.split())):
                # Step over the sentence separator in the flat term list.
                if term_list[j] == "###":
                    j += 1
                term = term_list[j]
                lemma = st.stem(term)
                bc = BrownClustering.bc_indexing(term.lower(), bc_index)
                row = "\t".join([
                    term,
                    lemma,
                    pos_list[sent_id],
                    type_list[sent_id],
                    bc,
                    index_list[j],
                    tag_list[j],
                ])
                mytrain.write(row + "\n")
                j += 1
            # Blank line terminates the sentence.
            mytrain.write("\n")

        if file_no % 5 == 0:
            print(str(file_no) + " files finished")