def _append_doc_sentences(doc_node, mail, dash_run):
    """Append the subjects and sentences found under one 'DOC' element to *mail*.

    Sentences are grouped into threads: a new ``thread_struct`` is started
    whenever the part of the sentence index before the first '.' changes.
    """
    current_thread = -1  # sentinel; never equal to a (string) thread prefix
    for section in doc_node:
        if section.tag == 'Subject':
            mail.subject.append(section.text)
        elif section.tag == 'Text':
            for sent_node in section:
                # Element.text is None for an empty element; treat as blank.
                raw = sent_node.text or ''
                cleaned = dash_run.sub('', raw.replace('\n', ' '))
                # strip() also catches the empty string, which isspace()
                # does not (e.g. a line consisting only of dashes).
                if not cleaned.strip():
                    continue
                sentence = sentence_struct(cleaned, sent_node.attrib['id'])
                mail.text.append(sentence)
                thread_prefix = sentence.index.split('.')[0]
                if thread_prefix != current_thread:
                    current_thread = thread_prefix
                    mail.thread.append(thread_struct(int(current_thread)))
                mail.thread[-1].sentences.append(sentence)


def parse_file(file):
    """Parse an XML mail corpus into a list of ``mail_struct`` objects.

    Each child of the XML root becomes one mail. Inside it:

    * a 'name' element sets ``mail.name`` (spaces replaced by '_') and
      ``mail.remove_stop_ver_title`` (via ``stop_word_remove``);
    * a 'DOC' element contributes 'Subject' texts to ``mail.subject`` and,
      for every child of a 'Text' element, one ``sentence_struct`` built
      from the child's text (newlines turned into spaces, runs of '-'
      removed) and its 'id' attribute. Blank sentences are skipped.

    After parsing, every sentence's ``remove_stop_ver`` word list is
    filtered: words shorter than two characters and the tokens 'gt' /
    'gtgt' are dropped (presumably leftovers of '>' quote markers --
    TODO confirm).

    Args:
        file: Path or file object accepted by ``ET.parse``.

    Returns:
        list of ``mail_struct``, in document order.
    """
    dash_run = re.compile(r'\-+', re.S)
    noise_tokens = ('gt', 'gtgt')

    mails = []
    for mail_node in ET.parse(file).getroot():
        mail = mail_struct(len(mails))
        for field in mail_node:
            if field.tag == 'name':
                mail.name = field.text.replace(' ', '_')
                mail.remove_stop_ver_title = stop_word_remove(field.text)
            elif field.tag == 'DOC':
                _append_doc_sentences(field, mail, dash_run)
        mails.append(mail)

    for mail in mails:
        for sentence in mail.text:
            # Build a real list: on Python 3 filter() would leave a lazy
            # iterator here, breaking later len()/indexing of the words.
            sentence.remove_stop_ver = [
                word for word in sentence.remove_stop_ver
                if len(word) >= 2 and word not in noise_tokens
            ]
    return mails
def _min_max_normalize(scores):
    """Rescale *scores* linearly to [0, 1]; constant or empty input maps to zeros."""
    scores = numpy.asarray(scores, dtype=float)
    if scores.size == 0:
        return scores
    low = min(scores)
    span = max(scores) - low
    if span == 0:
        # All scores equal: avoid 0/0 producing NaN for every entry.
        return numpy.zeros_like(scores)
    return (scores - low) / span


def process_document(title, sen2=1, exist=1):
    """Score the sentences of a crawled article by LexRank and title similarity.

    The article is fetched with ``crawl_article(title)`` and split into
    sentence vectors (``construct_2sen_vec`` when ``sen2 == 1``, otherwise
    ``construct_sentence_vec``). The vectors are wrapped in a ``mail_struct``
    so that ``cal_similarity`` can produce the similarity matrix and the
    per-sentence title similarity; ``lexrank`` then ranks the sentences.
    Both score vectors are min-max normalized to [0, 1].

    Args:
        title: Article title; underscores are replaced by spaces before
            stop-word removal for the title representation.
        sen2: ``1`` selects ``construct_2sen_vec``; any other value selects
            ``construct_sentence_vec``.
        exist: Unused; kept for backward compatibility with callers.

    Returns:
        tuple ``(vec, important, title_sim)`` where ``vec`` is the sentence
        vector list, ``important`` the normalized LexRank scores (NaN
        scores are treated as 0 before normalizing) and ``title_sim`` the
        normalized title similarities.
    """
    article, _ = crawl_article(title)
    if sen2 == 1:
        vec = construct_2sen_vec(article)
    else:
        vec = construct_sentence_vec(article)

    doc = mail_struct()
    doc.name = title
    doc.text = vec
    doc.remove_stop_ver_title = stop_word_remove(title.replace('_', ' '))

    matrix, title_sim = cal_similarity(doc)
    important = numpy.asarray(lexrank(matrix, len(vec), 0.001), dtype=float)
    # Replace NaN ranks with 0 so they do not poison min/max below.
    important = numpy.where(numpy.isnan(important), 0.0, important)

    important = _min_max_normalize(important)
    title_sim = _min_max_normalize(title_sim)
    return vec, important, title_sim