def shorten(msg, MAX_CHARS=300):
    """Return a summary of *msg* at most MAX_CHARS characters long.

    If all segmented sentences fit under the limit (counting one joining
    space per sentence) they are returned joined as-is.  With no usable
    sentences, fall back to the subject line, then to the stripped body.
    Otherwise, concatenate sentences in rank order until the limit is hit.

    Args:
        msg: mapping with at least 'subject' and 'plain_body' keys
            (schema assumed from usage here -- confirm against callers).
        MAX_CHARS: maximum length of the returned summary.

    Returns:
        Summary string of at most MAX_CHARS characters.
    """
    sentences = segment(msg)
    # Total character count plus one separating space per sentence.
    if sum(len(s) for s in sentences) + len(sentences) < MAX_CHARS:
        if sentences:
            return ' '.join(sentences)
        # No sentences survived segmentation: fall back to subject/body.
        summary = msg['subject']
        if not summary:
            summary = strip(msg['plain_body'])
        return summary[:MAX_CHARS]

    # Too long: take sentences in rank order until the limit is reached.
    # rank_sentences() items carry the sentence text at index 2
    # (presumably (score, index, sentence) -- verify against its definition).
    ranks = rank_sentences(sentences)
    parts = []
    accumulated = 0  # length the original tracked: 1 leading space + sentence, per pick
    for rank in ranks:
        if accumulated >= MAX_CHARS:
            break
        sentence = rank[2]
        # TODO: segment sentence into clauses and remove spurious ones
        parts.append(sentence)
        accumulated += len(sentence) + 1
    # Collapse all runs of whitespace (and the leading separator) to single spaces.
    summary = ' '.join(' '.join(parts).split())
    return summary[:MAX_CHARS]
def segment(msg):
    """Split a message body into candidate summary sentences.

    Removes quoted reply lines (those starting with '>'), tokenizes the
    remaining text with NLTK's Punkt sentence tokenizer, trims quoted-reply
    headers of the form "On <date> ... <email> wrote:", breaks apart
    over-long pseudo-sentences on blank CRLF lines, and keeps only
    sentences accepted by is_valid_sentence().

    Args:
        msg: mapping with 'subject' and 'plain_body' keys
            (schema assumed from usage here -- confirm against callers).

    Returns:
        List of valid sentences, beginning with the subject line.
    """
    text = strip(msg['plain_body'])
    # Drop empty lines and quoted reply lines ("> ...").
    text = '\n'.join(l for l in text.split('\n') if l and l[0] != '>')
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenizer.tokenize(text, realign_boundaries=True)
    # Drop the last sentence (likely a signature/sign-off) when there is
    # enough other text to summarize.
    if len(sentences) > 2:
        sentences = sentences[:-1]
    # Compiled once, and as a raw string: the original non-raw literal
    # relied on unrecognized escapes (\s, \.) passing through unchanged,
    # which is deprecated in modern Python.  Pattern bytes are unchanged.
    reply_pattern = re.compile(
        r"On [A-Z][a-z]+, [A-Z][a-z]+(.)? [0-9]+, [0-9]+"
        r"( at [0-9]+:[0-9]+ [AMP]+)?, [A-Za-z\s]+ <[^@]+@[^@]+\.[^@]+>.?wrote:"
    )
    output = [msg['subject']]
    for sentence in sentences:
        match = reply_pattern.search(sentence)
        if match is not None:
            # Cut off everything from the quoted-reply header onward.
            sentence = sentence[:match.start()]
        if len(sentence) > 300:
            # Over-long "sentence": the tokenizer likely merged paragraphs;
            # split on blank CRLF-delimited lines instead.
            output.extend(sentence.split('\r\n\r\n'))
        else:
            output.append(sentence)
    return [s for s in output if is_valid_sentence(s)]