Exemple #1
0
    def test_from_file_obj(self):
        """Parse ``mail_test_2`` from an open file object and check the API.

        The mail is parsed with ``mailparser.parse_from_file_obj`` and every
        public accessor used below is checked for the type (and, where the
        fixture pins it, the value) it exposes.
        """
        with ported_open(mail_test_2) as fp:
            mail = mailparser.parse_from_file_obj(fp)
        trust = "smtp.customers.net"

        self.assertEqual(False, mail.has_defects)

        # The aggregated dict carries the defect flag but not the raw
        # defect/anomaly lists.
        result = mail.mail
        self.assertIsInstance(result, dict)
        self.assertNotIn("defects", result)
        self.assertNotIn("anomalies", result)
        self.assertIn("has_defects", result)

        result = mail.get_server_ipaddress(trust)
        self.assertIsInstance(result, six.text_type)

        result = mail.mail_json
        self.assertIsInstance(result, six.text_type)

        result = mail.headers
        self.assertIsInstance(result, dict)

        result = mail.headers_json
        self.assertIsInstance(result, six.text_type)

        result = mail.body
        self.assertIsInstance(result, six.text_type)

        result = mail.date
        self.assertIsInstance(result, datetime.datetime)

        result = mail.from_
        self.assertIsInstance(result, list)

        # Recipients are a list of 2-tuples; the fixture has two of them.
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        result = mail.to
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 2)
        self.assertIsInstance(result[0], tuple)
        self.assertEqual(len(result[0]), 2)

        result = mail.subject
        self.assertIsInstance(result, six.text_type)

        result = mail.message_id
        self.assertIsInstance(result, six.text_type)

        result = mail.attachments
        self.assertIsInstance(result, list)

        result = mail.defects
        self.assertIsInstance(result, list)

        result = mail.timezone
        self.assertEqual(result, "+1")
Exemple #2
0
def main():
    """Command-line entry point: parse one mail and print the requested parts.

    The mail comes from a file (optionally an Outlook ``.msg``), from a
    string, or from standard input, depending on the parsed arguments.
    Each output flag that is set prints the matching piece of the parsed
    mail through ``safe_print``.

    Raises:
        MailParserOutlookError: if Outlook parsing is requested together
            with standard input.
    """
    options = get_args().parse_args()

    # Pick the parsing entry point that matches the input source.
    if options.file:
        load = (mailparser.parse_from_file_msg
                if options.outlook else mailparser.parse_from_file)
        message = load(options.file)
    elif options.string:
        message = mailparser.parse_from_string(options.string)
    elif options.stdin:
        if options.outlook:
            raise MailParserOutlookError(
                "You can't use stdin with msg Outlook")
        message = mailparser.parse_from_file_obj(sys.stdin)

    # Flags that map one-to-one onto an attribute of the parsed mail,
    # listed in the order they are printed.
    direct_outputs = (
        ("json", "mail_json"),
        ("body", "body"),
        ("headers", "headers_json"),
        ("to", "to_json"),
        ("delivered_to", "delivered_to_json"),
        ("from_", "from_json"),
        ("subject", "subject"),
        ("receiveds", "received_json"),
    )
    for flag, attribute in direct_outputs:
        if getattr(options, flag):
            safe_print(getattr(message, attribute))

    if options.defects:
        for category in message.defects_categories:
            safe_print(category)

    if options.senderip:
        address = message.get_server_ipaddress(options.senderip)
        safe_print(address or "Not Found")

    if options.attachments or options.attachments_hash:
        print_attachments(message.attachments, options.attachments_hash)

    if options.mail_hash:
        print_mail_fingerprints(message.body.encode("utf-8"))
Exemple #3
0
 def parse_mail(self, path):
     """Parse the mail stored at *path*, trying several text encodings.

     The file is opened as text with "latin1" first and "utf8" second; the
     first attempt that parses without raising is returned. Each failed
     attempt is reported on standard output.

     NOTE(review): latin1 can decode any byte sequence, so the utf8 attempt
     is presumably only reached when opening or parsing itself fails.

     Returns:
         The object returned by ``mailparser.parse_from_file_obj``, or
         ``None`` when every attempt raised.
     """
     for codec in ("latin1", "utf8"):
         try:
             with open(path, "r", encoding=codec) as handle:
                 parsed = mailparser.parse_from_file_obj(handle)
         except Exception as error:
             print("Failed opening {} with mode {}".format(path, codec))
             print(error)
         else:
             return parsed
     return None
Exemple #4
0
def main():
    """Command-line entry point with logging: parse a mail and print parts.

    The input source (file, Outlook ``.msg`` file, string or standard
    input) is chosen from the parsed arguments. Every output flag that is
    set prints the matching part of the parsed mail through ``safe_print``;
    some steps also emit a debug log line first.

    Raises:
        MailParserOutlookError: if Outlook parsing is requested together
            with standard input.
    """
    args = get_args().parse_args()
    log = custom_log(level=args.log_level)

    # --- load the mail from the selected source ---
    if args.file:
        if not args.outlook:
            mail = mailparser.parse_from_file(args.file)
        else:
            log.debug("Analysis Outlook mail")
            mail = mailparser.parse_from_file_msg(args.file)
    elif args.string:
        mail = mailparser.parse_from_string(args.string)
    elif args.stdin:
        if args.outlook:
            raise MailParserOutlookError(
                "You can't use stdin with msg Outlook")
        mail = mailparser.parse_from_file_obj(sys.stdin)

    # --- outputs that are a single attribute of the parsed mail ---
    plain_outputs = (
        (args.json, "mail_json"),
        (args.body, "body"),
        (args.headers, "headers_json"),
        (args.to, "to_json"),
        (args.delivered_to, "delivered_to_json"),
        (args.from_, "from_json"),
        (args.subject, "subject"),
        (args.receiveds, "received_json"),
    )
    for requested, attribute in plain_outputs:
        if requested:
            safe_print(getattr(mail, attribute))

    # --- outputs that need extra work ---
    if args.defects:
        log.debug("Printing defects")
        for category in mail.defects_categories:
            safe_print(category)

    if args.senderip:
        log.debug("Printing sender IP")
        sender_ip = mail.get_server_ipaddress(args.senderip)
        if not sender_ip:
            safe_print("Not Found")
        else:
            safe_print(sender_ip)

    if args.attachments or args.attachments_hash:
        log.debug("Printing attachments details")
        print_attachments(mail.attachments, args.attachments_hash)

    if args.mail_hash:
        log.debug("Printing also mail fingerprints")
        print_mail_fingerprints(mail.body.encode("utf-8"))
Exemple #5
0
def main(dir: str):
    """Analyse every ``.eml`` file in *dir* and write the results there.

    For each mail the subject, the body text and the text extracted from
    its attachments (OCR for images, text extraction for PDFs, raw payload
    otherwise) are combined into one string. That string is grammar-checked
    and scored for readability and sentiment, and the SPF/DKIM/DMARC
    verdicts are read from the ``Authentication-Results`` header.

    Three files are written into *dir*: ``words.txt`` (all lower-cased
    texts), ``analysis.json`` (one record per mail) and ``analysis.csv``
    (the same records flattened; skipped when no mail could be processed).

    Args:
        dir: Directory that holds the ``.eml`` files. Attachments are
            temporarily written into it and removed again.

    NOTE(review): ``save_body`` is not defined in this function; presumably
    it is a module-level flag - confirm.
    """
    checker = language_tool_python.LanguageTool('en-US')
    emails = {}
    lowered_bodies = []

    filenames = [
        filename for filename in os.listdir(dir) if filename.endswith('.eml')
    ]
    for filename in filenames:
        print()
        print('[INFO] Processing {}...'.format(filename))

        with open(os.path.join(dir, filename), 'r', encoding='latin1') as file:
            try:
                mail = mailparser.parse_from_file_obj(file)
            except Exception as e:
                print('[WARNING] Error while parsing: {}'.format(e))
                continue

        # Disabled filter: skip duplicates based on subject.
        # if mail.subject in emails:
        #     print('[WARNING] This email seems to be a duplicate of "{}"! Skipping...'
        #           .format(emails[mail.subject]['filename']))
        #     continue

        # Disabled filter: skip mails without an authentication results header.
        # if 'Authentication-Results' not in mail.headers:
        #     print('[WARNING] This email is missing an authentication results header! Skipping...')
        #     continue

        # Angle brackets are stripped from attachment names before the
        # attachments are written to disk.
        for attachment in mail.attachments:
            attachment['filename'] = re.sub(r'<|>', '',
                                            attachment['filename'])

        attachments = ''
        try:
            mail.write_attachments(dir)
            for attachment in mail.attachments:
                # os.path.join instead of a hard-coded '\\' keeps this
                # working outside Windows.
                # NOTE(review): the file name comes from the mail itself;
                # confirm it cannot point outside *dir*.
                attachment_path = os.path.join(dir, attachment['filename'])
                content_type = attachment['mail_content_type']
                if 'image' in content_type:
                    if 'gif' in content_type:
                        # Only the first frame of a GIF is used.
                        images, _, _ = gif2numpy.convert(attachment_path)
                        img = images[0]
                    else:
                        img = cv2.imread(attachment_path)
                    # Upscale slightly before running OCR.
                    img = cv2.resize(img,
                                     None,
                                     fx=1.2,
                                     fy=1.2,
                                     interpolation=cv2.INTER_CUBIC)
                    attachments += pytesseract.image_to_string(img)
                elif 'pdf' in content_type:
                    # Extract once and reuse the bytes for both encoding
                    # detection and decoding.
                    pdf_bytes = pdf_to_text(attachment_path)
                    encoding = chardet.detect(pdf_bytes)['encoding']
                    attachments += pdf_bytes.decode(encoding)
                else:
                    attachments += attachment['payload']
                os.remove(attachment_path)
        except Exception as e:
            print('[WARNING] Error while parsing attachments: {}'.format(e))
            # Best-effort cleanup: some files were already removed above or
            # were never written, so a failing removal must not abort the run.
            for attachment in mail.attachments:
                try:
                    os.remove(os.path.join(dir, attachment['filename']))
                except OSError:
                    pass

        body = mail.subject + ' ' + \
               remove_noise(BeautifulSoup(mail.body, 'lxml').get_text(separator=' ', strip=True) +
                            BeautifulSoup(attachments, 'lxml').get_text())
        blob = TextBlob(body)
        lowered_bodies.append(body.lower())
        grammarErrors = checker.check(body)

        if 'Authentication-Results' in mail.headers:
            auth_results = mail.headers['Authentication-Results']
            spf = re.findall(r'spf=(\S*)', auth_results)
            dkim = re.findall(r'dkim=(\S*)', auth_results)
            dmarc = re.findall(r'dmarc=(\S*)', auth_results)
        else:
            spf = dkim = dmarc = ''

        emails[filename] = {
            'filename': filename,
            # 'hops': mail.received[-1]['hop'],
            # 'totalDelay': sum([hop['delay']/60 for hop in mail.received]),
            'spf': spf[0] if spf else None,
            'dkim': dkim[0] if dkim else None,
            'dmarc': dmarc[0] if dmarc else None,
            'subject': mail.subject,
            'from': mail.from_[0][1],
            'to': [tup[1] for tup in mail.to],
            'replyTo': [tup[1] for tup in mail.reply_to],
            'attachments': [x['filename'] for x in mail.attachments],
            'grammarErrors': len(grammarErrors),
            'counts': {
                'characterCount': len(body),
                'wordCount': textstat.lexicon_count(body),
                'sentenceCount': textstat.sentence_count(body)
            },
            'readability': {
                'flesch_kincaid':
                textstat.flesch_kincaid_grade(body),
                'gunning_fog':
                textstat.gunning_fog(body),
                'smog_index':
                textstat.smog_index(body),
                'automated_readability_index':
                textstat.automated_readability_index(body),
                'coleman_liau_index':
                textstat.coleman_liau_index(body),
                'linsear_write':
                textstat.linsear_write_formula(body),
            },
            'sentiment': {
                'polarity': blob.sentiment.polarity,
                'subjectivity': blob.sentiment.subjectivity
            }
        }

        if save_body:
            emails[filename]['body'] = body

    ## writing all words to file ##
    # Every text is preceded by a single space, as before.
    with open(os.path.join(dir, 'words.txt'), 'w', encoding='utf-8') as file:
        file.write(''.join(' ' + text for text in lowered_bodies))

    ## output json ##
    with open(os.path.join(dir, 'analysis.json'), 'w') as jsonFile:
        json.dump(emails, jsonFile, indent=2)

    ## build and output csv ##
    if not emails:
        # Without a first record there are no column headers to derive.
        print('[WARNING] No emails were processed; skipping analysis.csv.')
    else:
        # generate and output headers using first email
        first_record = next(iter(emails.values()))
        column_headers = list(flatten_json(first_record).keys())
        with open(os.path.join(dir, 'analysis.csv'), 'w',
                  encoding='utf-8') as csvFile:
            csvFile.write(',{}\n'.format(','.join(column_headers)))

            # generate and output one line per email
            for email, record in emails.items():
                # flatten json to 1 layer deep
                flattened_email = flatten_json(record)
                # A mail may flatten to fewer columns than the first one;
                # missing columns are written as empty values instead of
                # raising KeyError.
                csv_values = [
                    '"' + str(flattened_email.get(column_header, '')) + '"'
                    for column_header in column_headers
                ]
                # add email name and join w/ commas, then write out
                csvFile.write('{},{}\n'.format('"' + email + '"',
                                               ','.join(csv_values)))

    # print out stats
    print('{}/{} processed. The remaining failed for some reason.'.format(
        len(emails), len(filenames)))
    def train(self):
        """Parse the dataset, split it, and fit the word probabilities.

        Steps:
          1. Every file in ``self.dataset_dir`` is parsed as a mail; its body
             becomes the message and the part of the file name after the
             last dot becomes the class label. Files that fail to parse are
             reported and skipped.
          2. ``self.email_array`` is split 70/30 into ``self.email_train``
             and ``self.email_test`` (``random_state=42``).
          3. A ``CountVectorizer`` is fitted on the training messages; the
             vocabulary keeps only purely alphabetic tokens.
          4. Priors ``self.p_ham`` / ``self.p_spam`` and the per-word
             likelihoods in ``self.p_words`` are computed. Every training
             mail whose class is not "ham" is counted as spam.
          5. ``self.save_classifier()`` is called.

        NOTE(review): the likelihood denominator adds ``len(self.vocabulary)``
        rather than ``laplace_smoothing * len(self.vocabulary)``; that only
        matches standard additive smoothing when the smoothing value is 1 -
        confirm this is intended.
        """
        # Local imports keep the method self-contained; both packages are
        # already required by the DataFrame / scikit-learn usage below.
        import numpy as np
        from pandas import concat

        #PARSE EMAIL DATASET
        print('====PARSING EMAILS====')
        start_parse = time.time()

        parsed_rows = []
        for email_file in os.listdir(self.dataset_dir):
            # os.path.join also works when dataset_dir has no trailing
            # separator.
            email_path = os.path.join(self.dataset_dir, email_file)
            with open(email_path, encoding="utf8",
                      errors="ignore") as email_fp:
                try:
                    email = mailparser.parse_from_file_obj(email_fp)
                    message = email.body
                except Exception as exc:
                    # Still best-effort, but no longer silent.
                    print(email_file, "skipped:", exc)
                    continue
            parsed_rows.append({
                "message": message,
                "class": email_file.split(".")[-1]
            })
            print(email_file, "appended into array")

        # DataFrame.append was removed in pandas 2.0; add all rows at once.
        if parsed_rows:
            self.email_array = concat(
                [self.email_array, DataFrame(parsed_rows)],
                ignore_index=True)

        end_parse = time.time()
        print('====PARSING FINISHED====')
        print(end_parse - start_parse, "Seconds")

        #SPLIT TRAINING SET AND TEST SET
        print('====SPLITTING TRAIN-TEST====')
        self.email_train, self.email_test = train_test_split(self.email_array,
                                                             test_size=0.30,
                                                             random_state=42)
        print('====DONE SPLITTING TRAIN-TEST===')

        #TRAIN EMAIL DATASET
        print('====TRAINING START====')
        start_train = time.time()
        vectorizer = CountVectorizer()

        # Matrix m x n (m: email instance, n: word in the vocabulary); each
        # entry is how often word n appears in email m.
        self.word_counts = vectorizer.fit_transform(
            self.email_train['message'].values)

        # get_feature_names was removed in scikit-learn 1.2; prefer the
        # replacement and fall back for older releases.
        if hasattr(vectorizer, 'get_feature_names_out'):
            feature_names = vectorizer.get_feature_names_out()
        else:
            feature_names = vectorizer.get_feature_names()

        # CLEAN VOCABULARY: keep purely alphabetic tokens only.
        self.vocabulary = [word for word in feature_names if word.isalpha()]
        vocabulary_size = len(self.vocabulary)

        # Calculate the prior probabilities [p(ham), p(spam)]
        email_count = len(self.email_train.index)
        ham_spam_count = self.email_train['class'].value_counts()
        ham_count = ham_spam_count['ham']
        spam_count = ham_spam_count['spam']
        self.p_ham = ham_count / email_count
        self.p_spam = spam_count / email_count

        # Row masks in training-set order, which is also the row order of
        # word_counts. Anything that is not "ham" counts as spam.
        is_ham = self.email_train['class'].values == "ham"
        is_spam = ~is_ham

        # Find n_spam and n_ham: total number of counted words per class.
        words_per_email = np.asarray(self.word_counts.sum(axis=1)).ravel()
        self.n_ham = self.n_ham + words_per_email[is_ham].sum()
        self.n_spam = self.n_spam + words_per_email[is_spam].sum()

        # Per-class occurrence count of every vectorizer column, computed
        # once instead of walking all emails for every word.
        ham_word_totals = np.asarray(
            self.word_counts[is_ham].sum(axis=0)).ravel()
        spam_word_totals = np.asarray(
            self.word_counts[is_spam].sum(axis=0)).ravel()

        # Calculate the likelihood probabilities of each word in the
        # vocabulary in each class: p(word|ham), p(word|spam).
        vocab_map = vectorizer.vocabulary_

        for n, word in enumerate(self.vocabulary):
            print('Training for word:', word)
            column = vocab_map[word]

            p_word_ham = (ham_word_totals[column] + self.laplace_smoothing) / (
                self.n_ham + vocabulary_size)
            p_word_spam = (spam_word_totals[column] +
                           self.laplace_smoothing) / (self.n_spam +
                                                      vocabulary_size)

            self.p_words['ham'][word] = p_word_ham
            self.p_words['spam'][word] = p_word_spam
            print('Done training for word:', word)
            print('Progress:', n + 1, "/", vocabulary_size)

        end_train = time.time()
        print('====TRAINING END====')
        print(end_train - start_train, "Seconds")

        print("Train size:", len(self.email_train.index))
        print("Test size:", len(self.email_test.index))
        print(len(words_per_email))

        #Save classifier attributes
        self.save_classifier()
Exemple #7
0
        })
        files.update({photo['filename']: photo['data']})

    params = (('chat_id', chat_id), ('media', json.dumps(media)))

    if len(media) > 0:
        response = requests.post(url + "/sendMediaGroup",
                                 params=params,
                                 files=files)
        print(response.json())
    else:
        print("Nothing to send. Skip REST call.")


# Parse the raw email from STDIN
mail = mailparser.parse_from_file_obj(sys.stdin)

print(mail.from_)
print(mail.delivered_to)
print(mail.to)
#print(mail.body)

# Determine Telegram Bot API key and ID of the user who will get the message we will send
try:
    key = mail.from_[0][1]
    #user_key = re.search('\+(.+?)\@', key).group(1) # use +something part in the sender's email address as key
    user_key = key  # use sender's email address as key
except (AttributeError, IndexError):
    # IndexError: the mail has no sender entry to index.
    # AttributeError: kept for the regex variant above, where a failed
    # match makes .group() fail on None.
    print("Use default user key (if set)")
    user_key = 'default'