def construct_2sen_vec(article):
    """Build ``sentence_struct`` objects for adjacent item pairs.

    For every element of ``article`` (presumably a paragraph, i.e. a
    sequence of sentence strings -- confirm against the caller), each item is
    joined with its immediate successor using ``+`` and wrapped in a
    ``sentence_struct``. Pairs never span two elements of ``article``.

    The second argument passed to ``sentence_struct`` is a running counter
    that starts at 0 and grows by one per pair, across the whole article.

    Returns the list of created ``sentence_struct`` objects, in order.
    """
    paired = []
    for paragraph in article:
        for current, following in zip(paragraph, paragraph[1:]):
            # len(paired) equals the number of pairs emitted so far, which
            # is exactly the running index of the pair being created.
            paired.append(sentence_struct(current + following, len(paired)))
    return paired
def construct_sentence_vec(article):
    """Flatten ``article`` into a list of ``sentence_struct`` objects.

    ``article`` is iterated two levels deep (presumably paragraphs containing
    sentences -- confirm against the caller). Every inner item is wrapped in
    a ``sentence_struct`` together with its 0-based position in the flattened
    sequence.

    Returns the list of created ``sentence_struct`` objects, in order.
    """
    flattened = (item for paragraph in article for item in paragraph)
    return [
        sentence_struct(item, position)
        for position, item in enumerate(flattened)
    ]
def parse_file(file):
    """Parse an XML corpus file into a list of ``mail_struct`` records.

    Every child of the XML root produces one ``mail_struct`` (constructed
    with its 0-based position in the result list). Its children are handled
    by tag:

    * ``name``: sets ``name`` to the element text with spaces replaced by
      underscores, and ``remove_stop_ver_title`` to
      ``stop_word_remove(text)``.
    * ``DOC``: each ``Subject`` child's text is appended to ``subject``;
      each child of a ``Text`` element becomes a ``sentence_struct`` built
      from the cleaned text and the element's ``id`` attribute.

    Sentence text is cleaned by replacing newlines with spaces and deleting
    runs of ``-``. Elements with no text, or whose cleaned text is only
    whitespace, are skipped. Sentences are appended to ``text`` and also
    grouped into ``thread``: a new ``thread_struct`` is started whenever the
    part of the sentence index before the first ``.`` changes (the grouping
    restarts at every ``DOC``).

    After parsing, each sentence's ``remove_stop_ver`` list is stripped of
    tokens shorter than two characters and of the tokens ``'gt'`` and
    ``'gtgt'``.

    Args:
        file: Path or file object accepted by ``ET.parse``.

    Returns:
        The list of populated ``mail_struct`` objects, in document order.
    """
    dash_run = re.compile(r'-+')
    # Presumably leftovers of '>' quote markers (e.g. from "&gt;") -- the
    # source only shows that these two tokens are discarded.
    noise_tokens = ('gt', 'gtgt')

    mails = []
    for mail_node in ET.parse(file).getroot():
        mail = mail_struct(len(mails))
        for child in mail_node:
            if child.tag == 'name':
                mail.name = child.text.replace(' ', '_')
                mail.remove_stop_ver_title = stop_word_remove(child.text)
            elif child.tag == 'DOC':
                # Reset per DOC so the first sentence always opens a thread.
                current_thread = None
                for part in child:
                    if part.tag == 'Subject':
                        mail.subject.append(part.text)
                    if part.tag != 'Text':
                        continue
                    for sent_node in part:
                        raw_text = sent_node.text
                        if raw_text is None:
                            # An empty element has no text at all; skip it
                            # rather than crash on None.
                            continue
                        cleaned = dash_run.sub('', raw_text.replace('\n', ' '))
                        if cleaned.isspace():
                            continue
                        sentence = sentence_struct(cleaned,
                                                   sent_node.attrib['id'])
                        mail.text.append(sentence)
                        thread_id = sentence.index.split('.')[0]
                        if thread_id != current_thread:
                            current_thread = thread_id
                            mail.thread.append(thread_struct(int(thread_id)))
                        mail.thread[-1].sentences.append(sentence)
        mails.append(mail)

    for mail in mails:
        for sentence in mail.text:
            # Build a real list: on Python 3 filter() would leave a lazy,
            # one-shot iterator here instead of a list.
            sentence.remove_stop_ver = [
                token for token in sentence.remove_stop_ver
                if len(token) >= 2 and token not in noise_tokens
            ]
    return mails