Example 1
    # Assumed module-level imports for this snippet:
    #   import re
    #   from textblob import TextBlob as tb
    def get_important_words(self, emails, path=None):
        """Extract the words of a list of emails, ranked by tf-idf score."""
        cleaner = Cleaner()

        complete_email_text = ''

        for email in emails:
            email_header = cleaner.delete_tags(email.header)
            email_body = cleaner.delete_tags(email.body)

            # Pull the "Topic" line out of the header; skip emails without one
            # instead of raising an IndexError.
            topic_lines = re.findall(r'Topic.*\n', email_header)
            if not topic_lines:
                continue
            topic_line = topic_lines[0][6:].strip()  # drop the "Topic:" prefix

            complete_email_text += topic_line + '\n' + email_body + '\n'

        # Clean the text: collapse newlines and runs of whitespace into single spaces.
        complete_email_text = re.sub(r'\s+', ' ', complete_email_text)

        complete_email_text = tb(complete_email_text)
        bloblist = [complete_email_text]

        words = []

        for blob in bloblist:
            # Score every word with tf-idf and collect them in descending order of score.
            scores = {word: self.tfidf(word, blob, bloblist) for word in blob.words}
            sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            for word, score in sorted_words:
                words.append(word)

        # Delete stop-words from the ranked list.
        words = self.delete_stopwords(words)

        # Optionally write the ranked words to a file, one per line.
        if path is not None:
            with open(path, 'w') as current_file:
                for word in words:
                    current_file.write('{}\n'.format(word))

        return words
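
The method relies on `Cleaner`, `self.tfidf`, and `self.delete_stopwords`, none of which are shown here. Below is a minimal sketch of how the two helper methods might look, following the common TextBlob tf-idf pattern; the class name `TfidfMixin`, the intermediate methods `tf`, `n_containing`, and `idf`, and the use of NLTK's English stop-word list are all assumptions, not part of the original code.

import math

from nltk.corpus import stopwords


class TfidfMixin:
    """Hypothetical helpers assumed by get_important_words (not in the original code)."""

    def tf(self, word, blob):
        # Term frequency: occurrences of `word` in `blob`, normalised by blob length.
        return blob.words.count(word) / len(blob.words)

    def n_containing(self, word, bloblist):
        # Number of documents in `bloblist` that contain `word`.
        return sum(1 for blob in bloblist if word in blob.words)

    def idf(self, word, bloblist):
        # Inverse document frequency, smoothed with +1 to avoid division by zero.
        return math.log(len(bloblist) / (1 + self.n_containing(word, bloblist)))

    def tfidf(self, word, blob, bloblist):
        # tf-idf score used to rank the words in get_important_words.
        return self.tf(word, blob) * self.idf(word, bloblist)

    def delete_stopwords(self, words):
        # Assumes English stop-words; requires nltk.download('stopwords') once.
        stop = set(stopwords.words('english'))
        return [word for word in words if word.lower() not in stop]

Note that with a single blob in `bloblist`, as in get_important_words, the idf term is the same for every word, so the ranking effectively reduces to plain term frequency; tf-idf only becomes discriminative once several documents are compared.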