Example #1
0
def shorten(msg, MAX_CHARS=300):
	"""Return a summary of *msg* no longer than MAX_CHARS characters.

	msg: mapping with at least 'subject' and 'plain_body' keys — TODO confirm
	full schema against callers.
	MAX_CHARS: maximum length of the returned summary.

	If every sentence already fits within the budget, they are returned
	joined as-is; with no sentences at all, the subject (or, failing that,
	the stripped body) is truncated and returned.  Otherwise sentences are
	taken greedily in rank order until the budget is filled.
	"""
	sentences = segment(msg)

	# Fast path: total sentence text plus one joining space per sentence
	# already fits inside the budget.
	if sum(len(s) for s in sentences) + len(sentences) < MAX_CHARS:
		if sentences:
			return ' '.join(sentences)

		# No usable sentences: fall back to the subject, then the body.
		summary = msg['subject']
		if not summary:
			summary = strip(msg['plain_body'])
		return summary[:MAX_CHARS]

	ranks = rank_sentences(sentences)

	# Greedily accumulate ranked sentences until the budget is reached.
	# `length` mirrors the length of ' '.join-style concatenation
	# (one leading space per sentence, as the original built it).
	parts = []
	length = 0

	for entry in ranks:
		if length >= MAX_CHARS:
			break

		# assumes each rank entry carries the sentence text at index 2
		# — TODO confirm against rank_sentences
		sentence = entry[2]
		parts.append(sentence)
		length += len(sentence) + 1

	# Collapse any internal runs of whitespace into single spaces.
	summary = ' '.join(' '.join(parts).split())

	return summary[:MAX_CHARS]
Example #2
0
def segment(msg):
	"""Split *msg* into candidate sentences for summarization.

	msg: mapping with at least 'subject' and 'plain_body' keys — TODO confirm
	full schema against callers.

	Returns a list beginning with the subject, followed by the body
	sentences that pass is_valid_sentence.  Quoted reply lines, the
	"On <date>, <name> <email> wrote:" attribution, and (when more than
	two sentences exist) the trailing sentence are stripped out.
	"""
	text = strip(msg['plain_body'])

	# Drop empty lines and quoted-reply lines (those starting with '>').
	text = '\n'.join(line for line in text.split('\n') if line and line[0] != '>')

	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
	sentences = tokenizer.tokenize(text, realign_boundaries=True)

	# Drop the final sentence when enough remain — presumably to shed a
	# signature or sign-off; verify against callers.
	if len(sentences) > 2:
		sentences = sentences[:-1]

	# Matches reply attributions such as
	# "On Mon, Jan 1, 2024 at 9:30 AM, Jane Doe <jane@example.com> wrote:".
	# Raw string: byte-identical to the original pattern, but avoids the
	# invalid \s and \. escape sequences in a non-raw literal.
	reply_pattern = re.compile(
		r"On [A-Z][a-z]+, [A-Z][a-z]+(.)? [0-9]+, [0-9]+"
		r"( at [0-9]+:[0-9]+ [AMP]+)?, [A-Za-z\s]+ <[^@]+@[^@]+\.[^@]+>.?wrote:"
	)

	output = [msg['subject']]

	for sentence in sentences:
		match = reply_pattern.search(sentence)

		# Truncate the sentence at the start of a quoted-reply attribution.
		if match is not None:
			sentence = sentence[:match.start()]

		# Very long "sentences" are likely multiple paragraphs the
		# tokenizer failed to split; break them on blank CRLF lines.
		if len(sentence) > 300:
			output.extend(sentence.split('\r\n\r\n'))
		else:
			output.append(sentence)

	return [sentence for sentence in output if is_valid_sentence(sentence)]