Esempio n. 1
0
def construct_2sen_vec(article):
    """Build one ``sentence_struct`` per pair of adjacent items in each group.

    ``article`` is iterated as a sequence of groups.  Within a group, every
    item is combined with its successor using ``+`` and the result is wrapped
    in a ``sentence_struct`` together with a running index.  The index starts
    at 0 and keeps counting across groups.

    Args:
        article: iterable of sized, indexable groups whose items support
            ``+`` (presumably lists of sentence strings -- confirm against
            the callers).

    Returns:
        list of ``sentence_struct`` objects in encounter order.  Groups with
        fewer than two items contribute nothing.
    """
    # NOTE: the body is indented with spaces only.  The previous version mixed
    # a tab-indented line with space-indented lines, which Python 3 rejects
    # with TabError.
    index = 0
    sentence_vec = []
    for group in article:
        for pos in range(len(group) - 1):
            combined = group[pos] + group[pos + 1]
            sentence_vec.append(sentence_struct(combined, index))
            index += 1
    return sentence_vec
Esempio n. 2
0
def construct_sentence_vec(article):
    """Wrap every item of every group in ``article`` in a ``sentence_struct``.

    Items are visited group by group, in order, and each one is paired with
    a running index that starts at 0 and continues across groups.

    Args:
        article: iterable of iterables (presumably lists of sentence
            strings -- confirm against the callers).

    Returns:
        list of ``sentence_struct`` objects, one per item, in encounter order.
    """
    # Flatten lazily so the numbering simply follows the encounter order.
    flattened = (item for group in article for item in group)
    return [
        sentence_struct(item, position)
        for position, item in enumerate(flattened)
    ]
def parse_file(file):
    """Parse an XML mail corpus into a list of ``mail_struct`` records.

    Every child of the XML root becomes one ``mail_struct`` (constructed with
    its position in the result list).  Its children are handled by tag:

    * ``name``: sets ``name`` to the element text with spaces replaced by
      underscores, and ``remove_stop_ver_title`` to
      ``stop_word_remove(text)``.
    * ``DOC``: the text of each ``Subject`` child is appended to ``subject``.
      Each child of a ``Text`` child becomes a ``sentence_struct`` built from
      its cleaned text and its ``id`` attribute.  The sentence is appended to
      ``text`` and to the ``sentences`` of the current ``thread_struct``.  A
      new ``thread_struct`` is started whenever the part of the sentence
      index before the first ``.`` changes.

    After parsing, every sentence's ``remove_stop_ver`` is replaced by a list
    without tokens shorter than two characters and without the tokens
    ``gt`` / ``gtgt``.

    Args:
        file: passed unchanged to ``ET.parse`` (presumably a path or an open
            file object accepted by ``xml.etree.ElementTree`` -- confirm what
            ``ET`` is bound to in this module).

    Returns:
        list of ``mail_struct`` objects, in document order.

    Raises:
        KeyError: if a sentence element has no ``id`` attribute.
        ValueError: if the thread part of a sentence index is not an integer.
    """
    root = ET.parse(file).getroot()
    space = re.compile(r' ', re.S)
    dashes = re.compile(r'\-+', re.S)
    junk_tokens = ('gt', 'gtgt')

    mails = []
    for mail_node in root:
        mail = mail_struct(len(mails))
        for field in mail_node:
            if field.tag == 'name':
                mail.name = space.sub('_', field.text)
                mail.remove_stop_ver_title = stop_word_remove(field.text)
            elif field.tag == 'DOC':
                # Thread prefixes are strings, so None never matches and the
                # first sentence of a DOC always opens a thread.
                thread_num = None
                for part in field:
                    if part.tag == 'Subject':
                        mail.subject.append(part.text)
                    elif part.tag == 'Text':
                        for sent_node in part:
                            # An element without text has .text == None;
                            # there is nothing to turn into a sentence.
                            if sent_node.text is None:
                                continue
                            cleaned = sent_node.text.replace('\n', ' ')
                            cleaned = dashes.sub('', cleaned)
                            # ''.isspace() is False, so test for emptiness
                            # explicitly: a dash-only line collapses to ''
                            # and must be skipped like any other blank line.
                            if not cleaned or cleaned.isspace():
                                continue
                            sentence = sentence_struct(
                                cleaned, sent_node.attrib['id'])
                            mail.text.append(sentence)

                            prefix = sentence.index.split('.')[0]
                            if prefix != thread_num:
                                thread_num = prefix
                                mail.thread.append(
                                    thread_struct(int(thread_num)))
                            mail.thread[-1].sentences.append(sentence)
        mails.append(mail)

    for mail in mails:
        for sentence in mail.text:
            # Build a real list (not a lazy filter object) so the result can
            # still be indexed and measured with len() on Python 3.
            sentence.remove_stop_ver = [
                token for token in sentence.remove_stop_ver
                if len(token) >= 2 and token not in junk_tokens
            ]
    return mails