def test_reply_from_gmail_2_ptBr(self):
    """Check reply text and fragment split for the second pt-BR Gmail fixture."""
    fixture_path = "test/emails/email_gmail2_ptBr.txt"

    # The visible reply must contain the author's answer.
    with open(fixture_path) as fixture:
        reply = EmailReplyParser.parse_reply(fixture.read())
        self.assertIn(
            "entendi, muito obrigado pela informação, vou verificar aqui se tenho outras opções.",
            reply,
        )

    # The second fragment must carry the pt-BR Gmail quote header.
    with open(fixture_path) as fixture:
        fragments = EmailReplyParser.read(fixture.read()).fragments
        self.assertIn("Em sex., 18 de dez. de 2020 às 14:12", fragments[1].content)

    # The first fragment must carry the top of the message.
    with open(fixture_path) as fixture:
        fragments = EmailReplyParser.read(fixture.read()).fragments
        self.assertIn("Já viu o link desse anuncio?", fragments[0].content)
def test_reply_from_gmail_ptBr(self):
    """Check reply text and fragment split for the pt-BR Gmail fixture."""
    fixture_path = "test/emails/email_gmail_ptBr.txt"
    expected_reply = "Esta é uma resposta para mensagens github."

    # parse_reply must return exactly the reply sentence.
    with open(fixture_path) as fixture:
        reply = EmailReplyParser.parse_reply(fixture.read())
        self.assertEqual("Esta é uma resposta para mensagens github.", reply)

    # The second fragment must carry the pt-BR Gmail quote header.
    with open(fixture_path) as fixture:
        fragments = EmailReplyParser.read(fixture.read()).fragments
        self.assertIn(
            "Em qua., 18 de mai. de 2016 às 11:10 Someone",
            fragments[1].content,
        )

    # The first fragment must carry the reply sentence.
    with open(fixture_path) as fixture:
        fragments = EmailReplyParser.read(fixture.read()).fragments
        self.assertIn(expected_reply, fragments[0].content)
def __init__(self, email_text):
    """Decode a base64 email, parse it, and extract the reply text.

    On success this sets ``self.email`` (the parsed message) and
    ``self.reply_text``. If base64 decoding raises ``TypeError`` it sets
    ``self.decode_error = True``, logs the failure, and returns without
    assigning the other two attributes.

    :param email_text: URL-quoted, base64-encoded raw email text.
    """
    try:
        email_text = base64.standard_b64decode(
            urllib2.unquote(email_text.rstrip()))
    except TypeError:
        # Corrupt or invalid base 64.
        self.decode_error = True
        log.info('Decoding error for CommEmailParser')
        return

    self.email = message_from_string(email_text)
    payload = self.email.get_payload()

    if isinstance(payload, list):
        # If multipart, get the plain text part.
        for part in payload:
            # Nested multipart. Go deeper.
            if part.get_content_type() == 'multipart/alternative':
                payload = part.get_payload()
                # This inner loop deliberately (or accidentally) rebinds
                # `part`: when it breaks on a text/plain sub-part, the
                # check below sees that sub-part and ends the outer loop.
                for part in payload:
                    if part.get_content_type() == 'text/plain':
                        # Found the plain text part.
                        payload = part.get_payload()
                        break

            if part.get_content_type() == 'text/plain':
                # Found the plain text part.
                payload = part.get_payload()
                break

    # NOTE(review): if no text/plain part was found, `payload` is still a
    # list at this point -- confirm quopri.decodestring is never reached
    # with such input.
    # Decode quoted-printable data and remove non-breaking spaces.
    payload = (quopri.decodestring(payload).replace('\xc2\xa0', ' '))
    payload = self.extra_email_reply_parse(payload)
    self.reply_text = EmailReplyParser.read(payload).reply
def set_content_and_type(self):
    """Choose the content and MIME type, preferring HTML over plain text.

    With HTML content present it is used verbatim as 'text/html'.
    Otherwise the plain text is run through EmailReplyParser and every
    newline is doubled.
    """
    # Placeholder values; both branches below overwrite them.
    self.content = '[Blank Email]'
    self.content_type = 'text/plain'

    if self.html_content:
        self.content = self.html_content
        self.content_type = 'text/html'
        return

    parsed = EmailReplyParser.read(self.text_content)
    self.content = parsed.text.replace("\n", "\n\n")
    self.content_type = 'text/plain'
def __init__(self, message):
    """Validate the incoming message and extract the reply text.

    Sets ``self.email`` to the message dict and ``self.reply`` to the reply
    portion of its 'TextBody', after ``_extra_email_reply_parse`` has
    pre-processed it.

    :param message: dict that contains a 'TextBody' key.
    :raises ActivityEmailEncodingError: if ``message`` is not a dict or has
        no 'TextBody' key.
    """
    if not isinstance(message, dict) or 'TextBody' not in message:
        # No exception is being handled here, so log.exception() would only
        # attach an empty "NoneType: None" traceback; log a plain error.
        log.error('ActivityEmailParser didn\'t get a valid message.')
        raise ActivityEmailEncodingError(
            'Invalid or malformed json message object.')

    self.email = message
    reply = self._extra_email_reply_parse(self.email['TextBody'])
    self.reply = EmailReplyParser.read(reply).reply
def __init__(self, message):
    """Check the message shape, then store it and its parsed reply.

    :param message: dict that contains a 'TextBody' key.
    :raises ActivityEmailEncodingError: if ``message`` is not a dict or has
        no 'TextBody' key.
    """
    is_valid = isinstance(message, dict) and 'TextBody' in message
    if not is_valid:
        # log.exception() is meant for use inside an ``except`` block; here
        # there is no active exception, so report with log.error() instead.
        log.error('ActivityEmailParser didn\'t get a valid message.')
        raise ActivityEmailEncodingError(
            'Invalid or malformed json message object.')

    self.email = message
    # Project-specific pre-processing runs before the generic reply parser.
    reply = self._extra_email_reply_parse(self.email['TextBody'])
    self.reply = EmailReplyParser.read(reply).reply
def set_content_and_type(self):
    """Populate ``content`` and ``content_type`` from the available body.

    HTML content wins when present; otherwise the plain text is parsed by
    EmailReplyParser and each newline is doubled.
    """
    # Initial placeholder; always replaced by one of the branches below.
    self.content, self.content_type = "[Blank Email]", "text/plain"

    if not self.html_content:
        plain_text = EmailReplyParser.read(self.text_content).text
        spaced_text = plain_text.replace("\n", "\n\n")
        self.content, self.content_type = spaced_text, "text/plain"
    else:
        self.content, self.content_type = self.html_content, "text/html"
def test_parse_out_just_top_for_outlook_with_reply_directly_above_line_ptBr(self):
    """Check reply text and fragment contents for the pt-BR Outlook fixture."""
    fixture_path = "test/emails/email_2_2_ptBr.txt"

    # parse_reply must return exactly the top reply.
    with open(fixture_path) as fixture:
        reply = EmailReplyParser.parse_reply(fixture.read())
        self.assertEqual("um novo dia testando !! navegador!", reply)

    # Fragment 0 holds the reply text.
    with open(fixture_path) as fixture:
        fragments = EmailReplyParser.read(fixture.read()).fragments
        self.assertIn("um novo dia testando", fragments[0].content)

    # Fragment 1 mentions Outlook.
    with open(fixture_path) as fixture:
        fragments = EmailReplyParser.read(fixture.read()).fragments
        self.assertIn("Outlook", fragments[1].content)

    # Fragment 3 holds the pt-BR "De:" header of the quoted message.
    with open(fixture_path) as fixture:
        fragments = EmailReplyParser.read(fixture.read()).fragments
        self.assertIn("De: Store <*****@*****.**>", fragments[3].content)
def __init__(self, email_text):
    """Decode a base64 email, parse it, and keep its reply text.

    If decoding raises ``TypeError``, only ``self.decode_error`` is set (to
    True) and the method returns early.
    """
    try:
        unquoted = urllib2.unquote(email_text.rstrip())
        email_text = base64.standard_b64decode(unquoted)
    except TypeError:
        # Corrupt or invalid base 64.
        self.decode_error = True
        return

    self.email = message_from_string(email_text)
    body = self.email.get_payload()
    self.reply_text = EmailReplyParser.read(body).reply
def __init__(self, email_text):
    """Decode the URL-quoted base64 email and extract the reply portion.

    A ``TypeError`` during decoding sets ``self.decode_error = True`` and
    nothing else is assigned.
    """
    try:
        decoded = base64.standard_b64decode(
            urllib2.unquote(email_text.rstrip()))
    except TypeError:
        # Corrupt or invalid base 64.
        self.decode_error = True
    else:
        self.email = message_from_string(decoded)
        parsed = EmailReplyParser.read(self.email.get_payload())
        self.reply_text = parsed.reply
def __init__(self, message):
    """Validate the incoming message and extract the reply text.

    Sets ``self.email`` to the message dict and ``self.reply`` to the reply
    portion of its 'TextBody'.

    :param message: dict with a non-empty 'TextBody' value.
    :raises ActivityEmailEncodingError: if ``message`` is not a dict or its
        'TextBody' is missing or falsy.
    """
    invalid_email = not isinstance(message, dict) or not message.get(
        'TextBody', None)

    if invalid_email:
        # There is no active exception at this point, so log.exception()
        # would only append an empty traceback; use log.error() instead.
        log.error("ActivityEmailParser didn't get a valid message.")
        raise ActivityEmailEncodingError(
            'Invalid or malformed json message object.')

    self.email = message
    reply = self._extra_email_reply_parse(self.email['TextBody'])
    self.reply = EmailReplyParser.read(reply).reply
def get_mail_corpus(nlon_cleaning=False):
    """Load the mail corpus and group reply texts by sender address.

    Reads 'data/mailcorpus.json', keeps only records whose
    'type_of_recipient' is 'From', extracts the reply part of each message
    body with EmailReplyParser, and collects the non-empty texts per
    'email_address'.

    :param nlon_cleaning: when true, every line of the reply is classified
        with the NLoN model and lines predicted as 'Not' are dropped.
    :return: dict mapping email address to a list of reply texts.
    """
    if nlon_cleaning:
        nlon, nlon_model = training_nlon()

    # Path to mail's corpus
    corpus_file = 'data/mailcorpus.json'
    with open(corpus_file) as data_file:
        corpus = json.load(data_file)

    print('Reading and cleaning emails corpus. Number of emails: ' + str(len(corpus)))

    # Named so it does not shadow the builtin ``dict``.
    mails_by_address = {}
    n = 0

    # Text cleaning
    for d in corpus:
        if d['type_of_recipient'] == 'From':
            # The corpus stores newlines as the two characters "\n".
            res = EmailReplyParser.read(d['message_body'].replace('\\n', '\n'))
            text = res.reply
            n += 1

            if nlon_cleaning:
                # NOTE(review): the BS4 result was never used by the original
                # code (it was overwritten by the line filter below); only
                # the error report is kept. Confirm whether the filter was
                # meant to run on the HTML-stripped text instead.
                try:
                    BS4(text, 'html.parser').text
                except Exception as e:
                    print('Error with BS4 on text:\n\n%s\n\n' % text, str(e))

                # Keep only the lines the model does not label 'Not'.
                # A single pass replaces the index-and-delete loop.
                kept_lines = [
                    line for line in text.splitlines()
                    if nlon.NLoNPredict(
                        nlon_model, robjects.StrVector([line]))[0] != 'Not'
                ]
                text = '\n'.join(kept_lines)

            if text != '':
                mails_by_address.setdefault(d['email_address'], []).append(text)

            # Progress line every 50 retrieved mails.
            print(str(n)+'/'+str(len(corpus))+'\n' if n%50==0 else '', end='')

    print('Mails retrieved: '+ str(n))
    print('Email addresses: '+ str(len(mails_by_address)))
    return mails_by_address
def extract_alert(msg):
    """Return the sanitized alert text found in an email thread.

    Each text/plain part of the message is split into fragments; the first
    fragment whose raw content differs from ``extract_reply(msg)`` is passed
    through ``sanitize_email_fragment`` and returned. Returns an empty
    string when no such fragment exists.
    """
    plain_parts = (
        candidate for candidate in msg.walk()
        if candidate.get_content_type() == 'text/plain'
    )
    for part in plain_parts:
        parsed = EmailReplyParser.read(part.get_payload(decode=True))
        for fragment in parsed.fragments:
            fragment_text = fragment._content
            if fragment_text == extract_reply(msg):
                # This fragment is the reply itself, not the alert.
                continue
            return sanitize_email_fragment(fragment_text)
    return ''
def extract_alert(msg):
    """Extract the alert text from an email thread.

    Walks the message parts, parses every text/plain part into fragments,
    and returns the sanitized content of the first fragment that is not
    equal to ``extract_reply(msg)``. Returns '' if nothing qualifies.
    """
    for part in msg.walk():
        if part.get_content_type() != 'text/plain':
            continue

        raw_payload = part.get_payload(decode=True)
        thread = EmailReplyParser.read(raw_payload)
        for fragment in thread.fragments:
            text = fragment._content
            if text != extract_reply(msg):
                return sanitize_email_fragment(text)

    return ''
def __init__(self, email_text):
    """Decode a base64 email, parse it, and keep its reply text.

    For a multipart message the payload of the first text/plain part is
    used. If decoding raises ``TypeError``, ``self.decode_error`` is set to
    True and the method returns early.
    """
    try:
        unquoted = urllib2.unquote(email_text.rstrip())
        email_text = base64.standard_b64decode(unquoted)
    except TypeError:
        # Corrupt or invalid base 64.
        self.decode_error = True
        return

    self.email = message_from_string(email_text)

    # A non-multipart payload is already a string; a multipart one is a list.
    payload = self.email.get_payload()
    if isinstance(payload, list):
        plain_parts = (
            part for part in payload
            if part.get_content_type() == 'text/plain'
        )
        first_plain = next(plain_parts, None)
        if first_plain is not None:
            payload = first_plain.get_payload()

    self.reply_text = EmailReplyParser.read(payload).reply
def get_mail_corpus():
    """Load the mail corpus and collect cleaned reply texts per address.

    Each record's message body is parsed with EmailReplyParser and passed
    through the module's cleaning helpers. Non-empty results are stored in
    a set keyed by the record's 'email_address'. A record that raises any
    exception is reported with ``print`` and skipped.

    :return: dict mapping email address to a set of cleaned message bodies.
    """
    # Path to mail corpus
    with open('dataset/raw/mailcorpus.json') as data_file:
        corpus = json.load(data_file)

    print('Reading and cleaning emails corpus. Number of emails: ' + str(len(corpus)))

    _dict = {}
    n = 0

    # Text cleaning
    for d in corpus:
        try:
            # The corpus stores newlines as the two characters "\n".
            raw_body = d['message_body'].replace('\\n', '\n')
            res = EmailReplyParser.read(raw_body)
            cleaned = EmailReplyParser.parse_reply(res.text)
            n += 1

            cleaned = _remove_lines_of_code(cleaned)
            cleaned = _clean_body(cleaned)
            cleaned = _remove_stopwords_nonenglish_punctuation(cleaned)

            if cleaned != '':
                _dict.setdefault(d['email_address'], set()).add(cleaned)

            # Progress line every 50 retrieved mails.
            progress = str(n) + '/' + str(len(corpus)) + '\n' if n % 50 == 0 else ''
            print(progress, end='')
        except Exception as e:
            # Best effort: report the failing record and move on.
            print(e)

    print('Mails retrieved: ' + str(n))
    print('Email addresses: ' + str(len(_dict)))
    return _dict
def __init__(self, email_text):
    """Log, decode, and parse a base64 email, then keep its reply text.

    For a multipart message the payload of the first text/plain part is
    used. A ``TypeError`` raised while logging or decoding sets
    ``self.decode_error = True``, is logged, and ends the method early.
    """
    try:
        log.info('CommEmailParser received email: ' + email_text)
        unquoted = urllib2.unquote(email_text.rstrip())
        email_text = base64.standard_b64decode(unquoted)
    except TypeError:
        # Corrupt or invalid base 64.
        self.decode_error = True
        log.info('Decoding error for CommEmailParser')
        return

    self.email = message_from_string(email_text)

    # If not multipart, the payload is a string.
    payload = self.email.get_payload()
    if isinstance(payload, list):
        # If multipart, get the plaintext part.
        for candidate in payload:
            if candidate.get_content_type() != 'text/plain':
                continue
            payload = candidate.get_payload()
            break

    self.reply_text = EmailReplyParser.read(payload).reply
def __init__(self, email_text):
    """Log, decode, and parse a base64 email, then extract the reply text.

    On success this sets ``self.email`` (the parsed message) and
    ``self.reply_text``. If a ``TypeError`` is raised inside the ``try``
    block it sets ``self.decode_error = True``, logs the failure, and
    returns without assigning the other two attributes.

    :param email_text: URL-quoted, base64-encoded raw email text.
    """
    try:
        # The string concatenation is inside the ``try``, so a non-string
        # ``email_text`` is also reported as a decoding error.
        log.info('CommEmailParser received email: ' + email_text)
        email_text = base64.standard_b64decode(
            urllib2.unquote(email_text.rstrip()))
    except TypeError:
        # Corrupt or invalid base 64.
        self.decode_error = True
        log.info('Decoding error for CommEmailParser')
        return

    self.email = message_from_string(email_text)
    payload = self.email.get_payload()

    if isinstance(payload, list):
        # If multipart, get the plain text part.
        for part in payload:
            # Nested multipart. Go deeper.
            if part.get_content_type() == 'multipart/alternative':
                payload = part.get_payload()
                # This inner loop rebinds `part`: when it breaks on a
                # text/plain sub-part, the check below sees that sub-part
                # and ends the outer loop as well.
                for part in payload:
                    if part.get_content_type() == 'text/plain':
                        # Found the plain text part.
                        payload = part.get_payload()
                        break

            if part.get_content_type() == 'text/plain':
                # Found the plain text part.
                payload = part.get_payload()
                break

    # NOTE(review): if no text/plain part was found, `payload` is still a
    # list at this point -- confirm quopri.decodestring is never reached
    # with such input.
    # Decode quoted-printable data and remove non-breaking spaces.
    payload = (quopri.decodestring(payload)
               .replace('\xc2\xa0', ' '))
    payload = self.extra_email_reply_parse(payload)
    self.reply_text = EmailReplyParser.read(payload).reply
def text(self):
    """Return the reply portion of ``self.raw_text``."""
    # Imported locally, as in the original, so the dependency is only
    # needed when this method is called.
    from email_reply_parser import EmailReplyParser

    return EmailReplyParser.read(self.raw_text).reply
def get_email(self, name):
    """Load the named fixture and return it as a parsed EmailMessage."""
    fixture_path = 'test/emails/%s.txt' % name
    with open(fixture_path) as fixture:
        raw_text = fixture.read()

    return EmailReplyParser.read(raw_text)
def get_email(self, name):
    """Read ``test/emails/<name>.txt`` and return the parsed EmailMessage."""
    with open("test/emails/%s.txt" % name) as handle:
        contents = handle.read()

    parsed_message = EmailReplyParser.read(contents)
    return parsed_message
def object_from_message(message, queue, logger):
    """Parse a raw email and hand the extracted data on for object creation.

    The message is parsed, its subject and sender are decoded, the sender
    is checked against ``IgnoreEmail`` entries, a tracking ID is looked for
    in the subject, and the MIME parts are walked to collect the body text
    and the attachments.

    :param message: RFC822 formatted message, as a string.
    :param queue: queue object; its ``slug`` is used to match the tracking
        ID and the queue itself filters the ``IgnoreEmail`` entries.
    :param logger: logger used for info/debug messages.
    :return: ``False`` or ``True`` when the sender matches an ignore entry
        (``False`` if that entry has ``keep_in_mailbox`` set); otherwise
        whatever ``create_object_from_email_message`` returns.
    """
    # 'message' must be an RFC822 formatted message.
    message = email.message_from_string(message)

    subject = message.get('subject', _('Comment from e-mail'))
    subject = decode_mail_headers(decodeUnknown(message.get_charset(), subject))
    # Remove every configured affix from the subject.
    for affix in STRIPPED_SUBJECT_STRINGS:
        subject = subject.replace(affix, "")
    subject = subject.strip()

    sender = message.get('from', _('Unknown Sender'))
    sender = decode_mail_headers(decodeUnknown(message.get_charset(), sender))
    # to address bug #832, we wrap all the text in front of the email address in
    # double quotes by using replace() on the email string. Then,
    # take first item of list, second item of tuple is the actual email address.
    # Note that the replace won't work on just an email with no real name,
    # but the getaddresses() function seems to be able to handle just unclosed quotes
    # correctly. Not ideal, but this seems to work for now.
    sender_email = email.utils.getaddresses(['\"' + sender.replace('<', '\" <')])[0][1]

    # NOTE(review): `cc` is computed here but not referenced again in this
    # function -- confirm whether it is still needed.
    cc = message.get_all('cc', None)
    if cc:
        # first, fixup the encoding if necessary
        cc = [decode_mail_headers(decodeUnknown(message.get_charset(), x)) for x in cc]
        # get_all checks if multiple CC headers, but individual emails may be comma separated too
        tempcc = []
        for hdr in cc:
            tempcc.extend(hdr.split(','))
        # use a set to ensure no duplicates
        cc = set([x.strip() for x in tempcc])

    # Ignore entries bound to this queue or bound to no queue at all.
    for ignore in IgnoreEmail.objects.filter(Q(queues=queue) | Q(queues__isnull=True)):
        if ignore.test(sender_email):
            if ignore.keep_in_mailbox:
                # By returning 'False' the message will be kept in the mailbox,
                # and the 'True' will cause the message to be deleted.
                return False
            return True

    # Look for "[<slug>-<id>]" anywhere in the subject.
    # NOTE(review): queue.slug is inserted into the pattern unescaped.
    matchobj = re.match(r".*\[" + queue.slug + r"-(?P<id>\d+)\]", subject)
    if matchobj:
        # This is a reply or forward.
        ticket = matchobj.group('id')
        logger.info("Matched tracking ID %s-%s" % (queue.slug, ticket))
    else:
        logger.info("No tracking ID matched.")
        ticket = None

    body = None
    full_body = None
    counter = 0
    files = []

    for part in message.walk():
        # Multipart containers carry no content of their own.
        if part.get_content_maintype() == 'multipart':
            continue

        name = part.get_param("name")
        if name:
            name = email.utils.collapse_rfc2231_value(name)

        # An unnamed text part is treated as message body, anything else
        # as an attachment.
        if part.get_content_maintype() == 'text' and name is None:
            if part.get_content_subtype() == 'plain':
                body = part.get_payload(decode=True)
                # https://github.com/django-helpdesk/django-helpdesk/issues/732
                if part['Content-Transfer-Encoding'] == '8bit' and part.get_content_charset() == 'utf-8':
                    body = body.decode('unicode_escape')
                body = decodeUnknown(part.get_content_charset(), body)
                # have to use django_settings here so overwriting it works in tests
                # the default value is False anyway
                if ticket is None and getattr(django_settings, 'HELPDESK_FULL_FIRST_MESSAGE_FROM_EMAIL', False):
                    # first message in thread, we save full body to avoid losing forwards and things like that
                    body_parts = []
                    for f in EmailReplyParser.read(body).fragments:
                        body_parts.append(f.content)
                    full_body = '\n\n'.join(body_parts)
                    body = EmailReplyParser.parse_reply(body)
                else:
                    # second and other reply, save only first part of the message
                    body = EmailReplyParser.parse_reply(body)
                    full_body = body
                # workaround to get unicode text out rather than escaped text
                try:
                    body = body.encode('ascii').decode('unicode_escape')
                except UnicodeEncodeError:
                    # NOTE(review): the result of this encode() is discarded,
                    # so `body` is left unchanged here.
                    body.encode('utf-8')
                logger.debug("Discovered plain text MIME part")
            else:
                # Non-plain text part: handled as HTML below.
                try:
                    email_body = encoding.smart_text(part.get_payload(decode=True))
                except UnicodeDecodeError:
                    email_body = encoding.smart_text(part.get_payload(decode=False))

                if not body and not full_body:
                    # no text has been parsed so far - try such deep parsing for some messages
                    altered_body = email_body.replace("</p>", "</p>\n").replace("<br", "\n<br")
                    mail = BeautifulSoup(str(altered_body), "html.parser")
                    full_body = mail.get_text()

                if "<body" not in email_body:
                    email_body = f"<body>{email_body}</body>"

                # Wrap the HTML in a minimal document and attach it as a file.
                payload = (
                    '<html>'
                    '<head>'
                    '<meta charset="utf-8" />'
                    '</head>'
                    '%s'
                    '</html>'
                ) % email_body
                files.append(
                    SimpleUploadedFile(_("email_html_body.html"), payload.encode("utf-8"), 'text/html')
                )
                logger.debug("Discovered HTML MIME part")
        else:
            # Attachment: build a file name that includes the part counter.
            if not name:
                # NOTE(review): guess_extension() can return None, which
                # would yield a name ending in "None" -- confirm.
                ext = mimetypes.guess_extension(part.get_content_type())
                name = "part-%i%s" % (counter, ext)
            else:
                name = ("part-%i_" % counter) + name

            # # FIXME: this code gets the paylods, then does something with it and then completely ignores it
            # # writing the part.get_payload(decode=True) instead; and then the payload variable is
            # # replaced by some dict later.
            # # the `payloadToWrite` has been also ignored so was commented
            # payload = part.get_payload()
            # if isinstance(payload, list):
            #     payload = payload.pop().as_string()
            # # payloadToWrite = payload
            # # check version of python to ensure use of only the correct error type
            # non_b64_err = TypeError
            # try:
            #     logger.debug("Try to base64 decode the attachment payload")
            #     # payloadToWrite = base64.decodebytes(payload)
            # except non_b64_err:
            #     logger.debug("Payload was not base64 encoded, using raw bytes")
            #     # payloadToWrite = payload
            files.append(SimpleUploadedFile(name, part.get_payload(decode=True), mimetypes.guess_type(name)[0]))
            logger.debug("Found MIME attachment %s" % name)

        # Counts every non-multipart part, body parts included.
        counter += 1

    if not body:
        # No usable body so far: try the <body> element of the whole
        # message rendered as a string.
        mail = BeautifulSoup(str(message), "html.parser")
        beautiful_body = mail.find('body')
        if beautiful_body:
            try:
                body = beautiful_body.text
                full_body = body
            except AttributeError:
                pass
        if not body:
            body = ""

    if getattr(django_settings, 'HELPDESK_ALWAYS_SAVE_INCOMING_EMAIL_MESSAGE', False):
        # save message as attachment in case of some complex markup renders wrong
        files.append(
            SimpleUploadedFile(
                _("original_message.eml").replace(
                    ".eml",
                    timezone.localtime().strftime("_%d-%m-%Y_%H:%M") + ".eml"
                ),
                str(message).encode("utf-8"),
                'text/plain'
            )
        )

    smtp_priority = message.get('priority', '')
    smtp_importance = message.get('importance', '')
    high_priority_types = {'high', 'important', '1', 'urgent'}
    # Priority 2 when either header exactly equals one of the values above
    # (the comparison is case-sensitive), otherwise 3.
    priority = 2 if high_priority_types & {smtp_priority, smtp_importance} else 3

    payload = {
        'body': body,
        'full_body': full_body or body,
        'subject': subject,
        'queue': queue,
        'sender_email': sender_email,
        'priority': priority,
        'files': files,
    }

    return create_object_from_email_message(message, ticket, payload, files, logger=logger)
def get_email(self, name):
    """Load ``<name>.txt`` from TEST_EMAILS_DIR and return it parsed."""
    file_name = '%s.txt' % name
    fixture_path = os.path.join(TEST_EMAILS_DIR, file_name)

    with open(fixture_path) as fixture:
        raw_text = fixture.read()

    return EmailReplyParser.read(raw_text)
def get_email(self, name):
    """Load the named fixture and return it as a parsed EmailMessage.

    :param name: fixture base name; the file read is
        ``test/emails/<name>.txt``.
    :return: the result of ``EmailReplyParser.read`` on the file contents.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open('test/emails/%s.txt' % name) as f:
        text = f.read()
    return EmailReplyParser.read(text)
def clean_email_body(raw_body):
    """Return the email's plain text body without its signature fragments.

    The body is split into fragments by EmailReplyParser; fragments flagged
    as signatures are dropped and the rest are joined.
    """
    parsed = EmailReplyParser.read(raw_body)

    kept_contents = []
    for fragment in parsed.fragments:
        if fragment.signature:
            continue
        kept_contents.append(fragment.content)

    # NOTE(review): the separator is a raw string, i.e. a literal backslash
    # followed by "n", not a newline character -- confirm this is intended.
    return r'\n'.join(kept_contents)
def set_content_and_type(self):
    """Set ``content`` and ``content_type``, preferring HTML when present.

    Without HTML content, the plain text is parsed by EmailReplyParser and
    every newline in the result is doubled.
    """
    # Starting values; one of the two branches below always replaces them.
    self.content, self.content_type = '[Blank Email]', 'text/plain'

    if self.html_content:
        self.content, self.content_type = self.html_content, 'text/html'
        return

    reply_text = EmailReplyParser.read(self.text_content).text
    self.content, self.content_type = reply_text.replace("\n","\n\n"), 'text/plain'
def __init__(self, email_text):
    """Parse the raw email text and keep the reply part of its payload."""
    self.email = message_from_string(email_text)

    body = self.email.get_payload()
    parsed_body = EmailReplyParser.read(body)
    self.reply_text = parsed_body.reply
def collect_data():
    """Download training data from the mailbox and save it.

    Searches the mailbox for messages whose body contains "@faqbot",
    extracts (template key, quoted original text) pairs from them, and
    stores the list under the 'smartreply_data' config name.

    Messy code, by the original author's own description.
    """
    c = load_config('templates')
    templates = c['templates']
    training_data = []

    # NOTE(review): the connection is never closed or logged out in this
    # function -- confirm whether that is handled elsewhere.
    mail = imaplib2.IMAP4_SSL(IMAP_SERVER)
    mail.login(MAIL_USER, MAIL_PASSWORD)
    mail.select("[Gmail]/All Mail", readonly=True)
    result, data = mail.search(None, '(BODY "%s")' % ("@faqbot"))
    ids = data[0]
    id_list = ids.split()

    for idx, r_id in enumerate(id_list):
        _, data = mail.fetch(r_id, "(RFC822)")
        # Progress: index, total, and integer percentage.
        print "%i / %i (%i%%)" % (idx, len(id_list), int(float(idx) / len(id_list) * 100))

        # "null" is the sentinel for "nothing found"; the last tuple whose
        # first element mentions RFC822 wins.
        raw_email = "null"
        for d in data:
            if type(d) is tuple:
                if "RFC822" in d[0]:
                    raw_email = d[1]

        flanker_msg = mime.from_string(raw_email)

        # Keep the ASCII-only body of the last text/plain part, if any.
        body = "null"
        try:
            for part in flanker_msg.parts:
                if str(part) == "(text/plain)":
                    pp = part.body.encode('ascii', 'ignore')
                    body = pp
        except Exception as _:
            # Best effort: any failure leaves `body` at its current value.
            pass

        if body == "null":
            continue

        parsed_body = EmailReplyParser.read(body)

        # Need at least the reply fragment and the quoted original.
        if len(parsed_body.fragments) >= 2:
            # NOTE(review): split()[0] / split()[1] raise IndexError when the
            # first fragment has fewer than one / two whitespace-separated
            # words -- confirm such messages cannot occur.
            if parsed_body.fragments[0].content.split()[0] == "@faqbot":
                # The word after "@faqbot" is used as the template key.
                fb = parsed_body.fragments[0].content.split()[1]
                original = parsed_body.fragments[1].content

                # Keep quoted lines ("> ...") that are non-empty and do not
                # start with "On" once the '>' characters are removed.
                lines = []
                for l in original.split('\n'):
                    if l.startswith('> '):
                        tl = l.replace('>', '').strip()
                        if tl != '' and not (tl.startswith('On')):
                            lines.append(l.replace('>', ''))

                key = fb
                original = '\n'.join(lines)

                # Now that we have this, let's make sure it's
                # valid and stuff and then save it.
                if key in templates:
                    training_data.append((key, original))

    save_config(training_data, 'smartreply_data')