def test_from_file_obj(self):
    """Parse a sample mail via a file object and check every public accessor.

    Fix: ``assertEquals`` is a deprecated alias of ``assertEqual`` (it emits a
    DeprecationWarning and was removed in Python 3.12); all three uses are
    replaced with the canonical ``assertEqual``.
    """
    with ported_open(mail_test_2) as fp:
        mail = mailparser.parse_from_file_obj(fp)
    trust = "smtp.customers.net"

    self.assertEqual(False, mail.has_defects)

    # The dict form must hide raw defect details but expose the flag.
    result = mail.mail
    self.assertIsInstance(result, dict)
    self.assertNotIn("defects", result)
    self.assertNotIn("anomalies", result)
    self.assertIn("has_defects", result)

    result = mail.get_server_ipaddress(trust)
    self.assertIsInstance(result, six.text_type)

    result = mail.mail_json
    self.assertIsInstance(result, six.text_type)

    result = mail.headers
    self.assertIsInstance(result, dict)

    result = mail.headers_json
    self.assertIsInstance(result, six.text_type)

    result = mail.body
    self.assertIsInstance(result, six.text_type)

    result = mail.date
    self.assertIsInstance(result, datetime.datetime)

    result = mail.from_
    self.assertIsInstance(result, list)

    # "To" must carry exactly two (name, address) pairs for this fixture.
    result = mail.to
    self.assertIsInstance(result, list)
    self.assertEqual(len(result), 2)
    self.assertIsInstance(result[0], tuple)
    self.assertEqual(len(result[0]), 2)

    result = mail.subject
    self.assertIsInstance(result, six.text_type)

    result = mail.message_id
    self.assertIsInstance(result, six.text_type)

    result = mail.attachments
    self.assertIsInstance(result, list)

    result = mail.date
    self.assertIsInstance(result, datetime.datetime)

    result = mail.defects
    self.assertIsInstance(result, list)

    result = mail.timezone
    self.assertEqual(result, "+1")
def main():
    """CLI entry point: parse a mail from file/string/stdin, print requested parts."""
    args = get_args().parse_args()

    # Select the input source (argparse is expected to enforce exactly one).
    if args.file:
        if args.outlook:
            parser = mailparser.parse_from_file_msg(args.file)
        else:
            parser = mailparser.parse_from_file(args.file)
    elif args.string:
        parser = mailparser.parse_from_string(args.string)
    elif args.stdin:
        if args.outlook:
            raise MailParserOutlookError(
                "You can't use stdin with msg Outlook")
        parser = mailparser.parse_from_file_obj(sys.stdin)

    # Simple flag -> parser attribute printouts, in the original order.
    for flag, attr in (
            ("json", "mail_json"),
            ("body", "body"),
            ("headers", "headers_json"),
            ("to", "to_json"),
            ("delivered_to", "delivered_to_json"),
            ("from_", "from_json"),
            ("subject", "subject"),
            ("receiveds", "received_json")):
        if getattr(args, flag):
            safe_print(getattr(parser, attr))

    if args.defects:
        for category in parser.defects_categories:
            safe_print(category)

    if args.senderip:
        address = parser.get_server_ipaddress(args.senderip)
        safe_print(address if address else "Not Found")

    if args.attachments or args.attachments_hash:
        print_attachments(parser.attachments, args.attachments_hash)

    if args.mail_hash:
        print_mail_fingerprints(parser.body.encode("utf-8"))
def parse_mail(self, path):
    """Parse the mail file at *path*, trying latin1 first and utf8 second.

    Returns the parsed mail object, or None when every encoding attempt
    fails (each failure is reported on stdout).
    """
    for encoding in ("latin1", "utf8"):
        try:
            with open(path, "r", encoding=encoding) as handle:
                return mailparser.parse_from_file_obj(handle)
        except Exception as exc:
            # Best-effort: report and fall through to the next encoding.
            print("Failed opening {} with mode {}".format(path, encoding))
            print(exc)
    return None
def main():
    """CLI entry point with debug logging: parse a mail and print requested parts."""
    args = get_args().parse_args()
    log = custom_log(level=args.log_level)

    # Select the input source (argparse is expected to enforce exactly one).
    if args.file:
        if args.outlook:
            log.debug("Analysis Outlook mail")
            parser = mailparser.parse_from_file_msg(args.file)
        else:
            parser = mailparser.parse_from_file(args.file)
    elif args.string:
        parser = mailparser.parse_from_string(args.string)
    elif args.stdin:
        if args.outlook:
            raise MailParserOutlookError(
                "You can't use stdin with msg Outlook")
        parser = mailparser.parse_from_file_obj(sys.stdin)

    # Simple flag -> parser attribute printouts, in the original order.
    for flag, attr in (
            ("json", "mail_json"),
            ("body", "body"),
            ("headers", "headers_json"),
            ("to", "to_json"),
            ("delivered_to", "delivered_to_json"),
            ("from_", "from_json"),
            ("subject", "subject"),
            ("receiveds", "received_json")):
        if getattr(args, flag):
            safe_print(getattr(parser, attr))

    if args.defects:
        log.debug("Printing defects")
        for category in parser.defects_categories:
            safe_print(category)

    if args.senderip:
        log.debug("Printing sender IP")
        address = parser.get_server_ipaddress(args.senderip)
        safe_print(address if address else "Not Found")

    if args.attachments or args.attachments_hash:
        log.debug("Printing attachments details")
        print_attachments(parser.attachments, args.attachments_hash)

    if args.mail_hash:
        log.debug("Printing also mail fingerprints")
        print_mail_fingerprints(parser.body.encode("utf-8"))
def main(dir: str):
    """Analyse every ``.eml`` file in *dir* and emit words.txt, analysis.json
    and analysis.csv with per-email authentication, readability, grammar and
    sentiment features.

    NOTE(review): *dir* shadows the ``dir`` builtin, and attachment paths are
    joined with ``'\\'`` — this presumably only works on Windows; confirm.
    ``save_body`` is read but not defined here — presumably a module-level
    flag; verify.
    """
    checker = language_tool_python.LanguageTool('en-US')
    emails = {}          # filename -> extracted feature dict
    totalWords = ''      # running concatenation of all message bodies
    filenames = [
        filename for filename in os.listdir(dir) if filename.endswith('.eml')
    ]
    for filename in filenames:
        print()
        print('[INFO] Processing {}...'.format(filename))
        # latin1 never raises on decode, so parsing failures come from the
        # parser itself; those files are skipped.
        with open(os.path.join(dir, filename), 'r', encoding='latin1') as file:
            try:
                mail = mailparser.parse_from_file_obj(file)
            except Exception as e:
                print('[WARNING] Error while parsing: {}'.format(e))
                continue
        # filter duplicates based on subject
        #if mail.subject in emails:
        #    print('[WARNING] This email seems to be a duplicate of "{}"! Skipping...'
        #          .format(emails[mail.subject]['filename']))
        #    continue
        # don't process if auth results missing
        # if 'Authentication-Results' not in mail.headers:
        #     print('[WARNING] This email is missing an authentication results header! Skipping...')
        #     continue
        attachments = ''  # text recovered from all attachments of this mail
        # Strip angle brackets so the filename is usable on disk.
        for attachment in mail.attachments:
            attachment['filename'] = re.sub(r'<|>', '', attachment['filename'])
        try:
            # Write attachments to disk, extract text (OCR for images,
            # pdf-to-text for PDFs, raw payload otherwise), then delete each
            # file after it is consumed.
            mail.write_attachments(dir)
            for attachment in mail.attachments:
                if re.search('image', attachment['mail_content_type']):
                    if re.search('gif', attachment['mail_content_type']):
                        # cv2.imread cannot load GIFs; decode frames via gif2numpy.
                        images, _, _ = gif2numpy.convert(
                            dir + '\\' + attachment['filename'])
                        img = images[0]
                    else:
                        img = cv2.imread(dir + '\\' + attachment['filename'])
                    # Upscale slightly before OCR — presumably improves
                    # tesseract accuracy on small text; confirm.
                    img = cv2.resize(img,
                                     None,
                                     fx=1.2,
                                     fy=1.2,
                                     interpolation=cv2.INTER_CUBIC)
                    text = pytesseract.image_to_string(img)
                    attachments += text
                elif re.search('pdf', attachment['mail_content_type']):
                    # pdf_to_text returns bytes; sniff the encoding first.
                    encoding = chardet.detect(
                        pdf_to_text(dir + '\\' +
                                    attachment['filename']))['encoding']
                    attachments += pdf_to_text(
                        dir + '\\' + attachment['filename']).decode(encoding)
                # elif re.search('text', attachment['mail_content_type']):
                #     #print(chardet.detect((attachment['payload']).encode()))
                #     #encoding = chardet.detect(base64.b64decode(attachment['payload']).encode())['encoding']
                #     #attachments += base64.b64decode(attachment['payload']).decode(encoding)
                #     #print(codecs.encode(base64.b64decode(attachment['payload']), encoding=attachment['content_transfer_encoding']))
                #     attachments += attachment['payload']
                else:
                    attachments += attachment['payload']
                os.remove(dir + '\\' + attachment['filename'])
            except Exception as e:
                print(
                    '[WARNING] Error while parsing attachments: {}'.format(e))
                # Best-effort cleanup of any files already written.
                [
                    os.remove(dir + '\\' + attachment['filename'])
                    for attachment in mail.attachments
                ]
        # Combine subject, HTML-stripped body and attachment text into one
        # plain-text blob for all downstream metrics.
        body = mail.subject + ' ' + \
            remove_noise(BeautifulSoup(mail.body, 'lxml').get_text(separator=' ', strip=True) +
                         BeautifulSoup(attachments, 'lxml').get_text())
        blob = TextBlob(body)
        totalWords = totalWords + " " + body.lower()
        grammarErrors = checker.check(body)
        # Pull spf/dkim/dmarc verdicts out of the Authentication-Results
        # header when present.
        if 'Authentication-Results' in mail.headers:
            spf = re.findall('spf=(\S*)',
                             mail.headers['Authentication-Results'])
            dkim = re.findall('dkim=(\S*)',
                              mail.headers['Authentication-Results'])
            dmarc = re.findall('dmarc=(\S*)',
                               mail.headers['Authentication-Results'])
        else:
            spf = dkim = dmarc = ''
        emails[filename] = {
            'filename': filename,
            # 'hops': mail.received[-1]['hop'],
            # 'totalDelay': sum([hop['delay']/60 for hop in mail.received]),
            'spf': spf[0] if len(spf) else None,
            'dkim': dkim[0] if len(dkim) else None,
            'dmarc': dmarc[0] if len(dmarc) else None,
            'subject': mail.subject,
            'from': mail.from_[0][1],
            'to': [tup[1] for tup in mail.to],
            'replyTo': [tup[1] for tup in mail.reply_to],
            'attachments': [x['filename'] for x in mail.attachments],
            'grammarErrors': len(grammarErrors),
            'counts': {
                'characterCount': len(body),
                'wordCount': textstat.lexicon_count(body),
                'sentenceCount': textstat.sentence_count(body)
            },
            'readability': {
                'flesch_kincaid': textstat.flesch_kincaid_grade(body),
                'gunning_fog': textstat.gunning_fog(body),
                'smog_index': textstat.smog_index(body),
                'automated_readability_index':
                textstat.automated_readability_index(body),
                'coleman_liau_index': textstat.coleman_liau_index(body),
                'linsear_write': textstat.linsear_write_formula(body),
            },
            'sentiment': {
                'polarity': blob.sentiment.polarity,
                'subjectivity': blob.sentiment.subjectivity
            }
        }
        if save_body:
            emails[filename]['body'] = body

    ## quit if nothing found ##
    # if not emails:
    #     print('[WARNING] No files were found in "{}"!'.format(dir))
    #     return

    ## writing all words to file ##
    with open(os.path.join(dir, 'words.txt'), 'w', encoding='utf-8') as file:
        file.write(totalWords.lower())

    ## output json ##
    with open(os.path.join(dir, 'analysis.json'), 'w') as jsonFile:
        json.dump(emails, jsonFile, indent=2)

    ## build and output csv ##
    # generate and output headers using first email
    column_headers = list(flatten_json(emails[list(emails.keys())[0]]).keys())
    csvFile = open(os.path.join(dir, 'analysis.csv'), 'w', encoding='utf-8')
    csvFile.write(',{}\n'.format(','.join(column_headers)))
    # generate and output one line per email
    for email in emails.keys():
        # flatten json to 1 layer deep
        flattened_email = flatten_json(emails[email])
        # generate the values for this row
        csv_values = [
            '"' + str(flattened_email[column_header]) + '"'
            for column_header in column_headers
        ]
        # add email name and join w/ commas, then write out
        csvFile.write('{},{}\n'.format('"' + email + '"',
                                       ','.join(csv_values)))
    csvFile.close()

    # print out stats
    print('{}/{} processed. The remaining failed for some reason.'.format(
        len(emails), len(filenames)))
def train(self):
    """Train a Naive-Bayes spam/ham classifier from the mails in
    ``self.dataset_dir``.

    Populates ``self.email_array``, the train/test split, the class priors
    ``self.p_ham``/``self.p_spam``, word totals ``self.n_ham``/``self.n_spam``
    and the per-word likelihoods in ``self.p_words``, then persists them via
    ``self.save_classifier()``.

    NOTE(review): the class label is taken from the file extension
    (``email_file.split(".")[-1]``) — presumably files are named ``*.ham`` /
    ``*.spam``; confirm against the dataset layout.
    """
    #PARSE EMAIL DATASET
    print('====PARSING EMAILS====')
    start_parse = time.time()
    for email_file in os.listdir(self.dataset_dir):
        #array[start:end]
        with open(self.dataset_dir + email_file,
                  encoding="utf8",
                  errors="ignore") as email_fp:
            # NOTE(review): bare except silently drops unparseable mails —
            # deliberate best-effort, but it also hides real bugs.
            try:
                email = mailparser.parse_from_file_obj(email_fp)
                email_row = DataFrame({
                    "message": [email.body],
                    "class": [email_file.split(".")[-1]]
                })
                self.email_array = self.email_array.append(
                    email_row, ignore_index=True)
                print(email_file, "appended into array")
            except:
                pass
    end_parse = time.time()
    print('====PARSING FINISHED====')
    print(end_parse - start_parse, "Seconds")

    #SPLIT TRAINING SET AND TEST SET
    print('====SPLITTING TRAIN-TEST====')
    # Fixed random_state keeps the split reproducible across runs.
    self.email_train, self.email_test = train_test_split(self.email_array,
                                                         test_size=0.30,
                                                         random_state=42)
    print('====DONE SPLITTING TRAIN-TEST===')
    #print(self.email_train.index.values)

    #TRAIN EMAIL DATASET
    print('====TRAINING START====')
    start_train = time.time()
    vectorizer = CountVectorizer()
    # This will generate a matrix m x n (m: email instance, n: word in the vocabulary)
    # Each entry in the matrix represents how many times a word n appeared in a particular email instance m
    self.word_counts = vectorizer.fit_transform(
        self.email_train['message'].values)
    self.vocabulary = vectorizer.get_feature_names()

    # CLEAN VOCABULARY
    # Drop non-alphabetic tokens; iterate over a copy so removal is safe.
    for word in list(self.vocabulary):
        if not word.isalpha():
            self.vocabulary.remove(word)

    # Calculate the prior probabilites [p(ham), p(spam)]
    email_indexes = self.email_train.index.values
    email_count = len(email_indexes)
    ham_spam_count = self.email_train['class'].value_counts()
    ham_count = ham_spam_count['ham']
    spam_count = ham_spam_count['spam']
    self.p_ham = ham_count / email_count
    self.p_spam = spam_count / email_count

    # Find n_spam and n_ham
    # (total word occurrences per class, summed across all training mails)
    num_of_words_array = self.word_counts.sum(axis=1)
    for index, email_num in enumerate(email_indexes):
        if self.email_train.loc[email_num, 'class'] == "ham":
            self.n_ham = self.n_ham + num_of_words_array[index, 0]
        else:
            self.n_spam = self.n_spam + num_of_words_array[index, 0]

    # Calculate the likelihood probabilities of each word in the vocabulary in each class
    # [p(word_1|ham), p(word_1|spam), p(word_2|spam) ....]
    vocab_map = vectorizer.vocabulary_
    for n, word in enumerate(self.vocabulary):
        print('Training for word:', word)
        p_word_ham = 0.0
        p_word_spam = 0.0
        n_word_ham = 0
        n_word_spam = 0
        # Count this word's occurrences separately per class.
        for index, email_num in enumerate(email_indexes):
            if self.email_train.loc[email_num, 'class'] == "ham":
                n_word_ham = n_word_ham + self.word_counts[index,
                                                           vocab_map[word]]
            else:
                n_word_spam = n_word_spam + self.word_counts[
                    index, vocab_map[word]]
        # Laplace-smoothed likelihoods so unseen words never get probability 0.
        p_word_ham = (n_word_ham + self.laplace_smoothing) / (
            self.n_ham + len(self.vocabulary))
        p_word_spam = (n_word_spam + self.laplace_smoothing) / (
            self.n_spam + len(self.vocabulary))
        self.p_words['ham'][word] = p_word_ham
        self.p_words['spam'][word] = p_word_spam
        print('Done training for word:', word)
        print('Progress:', n + 1, "/", len(self.vocabulary))
    end_train = time.time()
    print('====TRAINING END====')
    print(end_train - start_train, "Seconds")
    print("Train size:", len(self.email_train.index))
    print("Test size:", len(self.email_test.index))
    print(len(num_of_words_array))

    #Save classifier attributes
    self.save_classifier()
}) files.update({photo['filename']: photo['data']}) params = (('chat_id', chat_id), ('media', json.dumps(media))) if len(media) > 0: response = requests.post(url + "/sendMediaGroup", params=params, files=files) print(response.json()) else: print("Nothing to send. Skip REST call.") # Parase RAW email from STDIN mail = mailparser.parse_from_file_obj(sys.stdin) print(mail.from_) print(mail.delivered_to) print(mail.to) #print(mail.body) # Determine Telegram Bot API key and ID of the user who will get the message we will send try: key = mail.from_[0][1] #user_key = re.search('\+(.+?)\@', key).group(1) # use +something part in the sender's email address as key user_key = key # use sender's email address as key except AttributeError: print("Use default user key (if set)") user_key = 'default'