def get_important_words(self, emails, path=None):
    """Return the words of *emails*, ranked by TF-IDF score, minus stop-words.

    For each email the tag-stripped ``Topic`` header line and body are
    concatenated into one text, whitespace is normalized, and every word is
    scored with ``self.tfidf`` against the (single-document) blob list.
    Words are returned highest-score first.

    Args:
        emails: iterable of objects with ``.header`` and ``.body`` string
            attributes (HTML tags are removed via ``Cleaner.delete_tags``).
        path: optional file path; when given, the resulting words are also
            written there, one word per line.

    Returns:
        list[str]: words sorted by descending TF-IDF score, stop-words removed.
    """
    cleaner = Cleaner()
    parts = []
    for email in emails:
        email_header = cleaner.delete_tags(email.header)
        email_body = cleaner.delete_tags(email.body)
        # Pull the "Topic: ..." line out of the header; the original code
        # indexed [0] unconditionally and raised IndexError when absent.
        topic_matches = re.findall(r'Topic.*\n', email_header)
        # Drop the "Topic:" prefix (6 chars) and surrounding whitespace.
        topic_line = topic_matches[0][6:].strip() if topic_matches else ''
        parts.append(topic_line + '\n' + email_body + '\n')
    # join() instead of repeated += — avoids quadratic string building.
    complete_email_text = ''.join(parts)

    # Normalize: collapse every run of whitespace (incl. newlines) to one
    # space — equivalent to the old '\n' / '\s' / ' +' substitution chain,
    # and uses a raw string (bare '\s' is a SyntaxWarning in modern Python).
    complete_email_text = re.sub(r'\s+', ' ', complete_email_text)

    complete_email_text = tb(complete_email_text)
    bloblist = [complete_email_text]

    words = []
    for blob in bloblist:
        scores = {word: self.tfidf(word, blob, bloblist) for word in blob.words}
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        words.extend(word for word, _score in sorted_words)

    # Delete Stop-Words
    words = self.delete_stopwords(words)

    if path is not None:
        with open(path, 'w') as current_file:
            for word in words:
                current_file.write('{}\n'.format(word))
    return words