def unpack_message(data):
    """
    Parse a raw email into a Bunch describing a single post

    @param data: raw message, passed straight to pyzmail.PyzMessage
    @returns: Bunch with name, email, date, id, reply_to, subj and body,
              or None when the message has no usable Date or Subject
              header, has no text part, or its body cannot be converted
              to unicode
    @raises Exception: whatever decoding the body raised, after logging it
    """
    msg = pyzmail.PyzMessage(data)

    # Get the name and email the message is coming from
    name, email = msg.get_address('from')
    email = email.lower()

    # Parse the date
    date = msg.get_decoded_header("Date")
    subj = msg.get_subject()
    if not date or not subj:
        return None

    # parsedate (presumably email.utils.parsedate -- confirm against the
    # file's imports) yields None for a header it cannot parse, and
    # datetime() rejects out-of-range fields; skip such posts like the
    # ones with no Date header at all instead of crashing.
    parsed = parsedate(date)
    if parsed is None:
        return None
    try:
        date = datetime(*parsed[:6])
    except ValueError:
        return None
    date = timezone.make_aware(date, timezone=utc)

    b = Bunch(name=name, email=email, date=date)
    b.id = msg.get_decoded_header("Message-ID")
    b.reply_to = msg.get_decoded_header('In-Reply-To')

    # Strip every configured pattern from the subject
    b.subj = subj
    for patt in REPLACE_PATT:
        b.subj = b.subj.replace(patt, "")

    # Get the body of the message
    if not msg.text_part:
        return None

    body = msg.text_part.get_payload()
    # Fall back to utf-8 when the detector reports no encoding
    charset = detect(body)['encoding'] or 'utf-8'
    try:
        body = body.decode(charset, "replace")
        body = fix_accents(body)
    except Exception:
        logger.error("error decoding message %s", b.id)
        # Bare raise keeps the original traceback
        raise

    # Checks for remote body for bioconductor import
    body = bioc_remote_body(body)

    # Reformat the body
    body = format_text(body)

    try:
        b.body = to_unicode_or_bust(body)
    except UnicodeDecodeError:
        # Ignore this post
        return None

    return b
def unpack_message(data):
    """
    Parse a raw email up to the point where its text body is decoded

    NOTE(review): this definition stops after the decode step and never
    returns the Bunch it builds, so every non-raising path yields None.
    It looks like a truncated copy -- confirm before relying on it.

    @param data: raw message, passed straight to pyzmail.PyzMessage
    @returns: None
    @raises Exception: whatever decoding the body raised, after logging it
    """
    msg = pyzmail.PyzMessage(data)

    # Get the name and email the message is coming from
    name, email = msg.get_address('from')
    email = email.lower()

    # Parse the date
    date = msg.get_decoded_header("Date")
    subj = msg.get_subject()
    if not date or not subj:
        return None

    # parsedate (presumably email.utils.parsedate -- confirm against the
    # file's imports) yields None for a header it cannot parse, and
    # datetime() rejects out-of-range fields; skip such posts instead of
    # crashing.
    parsed = parsedate(date)
    if parsed is None:
        return None
    try:
        date = datetime(*parsed[:6])
    except ValueError:
        return None
    date = timezone.make_aware(date, timezone=utc)

    b = Bunch(name=name, email=email, date=date)
    b.id = msg.get_decoded_header("Message-ID")
    b.reply_to = msg.get_decoded_header('In-Reply-To')

    # Strip every configured pattern from the subject
    b.subj = subj
    for patt in REPLACE_PATT:
        b.subj = b.subj.replace(patt, "")

    # Get the body of the message
    if not msg.text_part:
        return None

    body = msg.text_part.get_payload()
    # Fall back to utf-8 when the detector reports no encoding
    charset = detect(body)['encoding'] or 'utf-8'
    try:
        body = body.decode(charset, "replace")
        body = fix_accents(body)
    except Exception:
        logger.error("error decoding message %s", b.id)
        # Bare raise keeps the original traceback
        raise
def _normalize_charset(charset):
    """
    Reduce a declared charset name to its 'iso...' or 'utf...' part

    @param charset: charset as declared by the message, may be empty
    @returns: the lower-cased name starting at 'iso' or 'utf', None when
              the name contains neither, or the input itself when empty
    """
    if not charset:
        return charset

    charset = charset.lower()
    iso_at = charset.find('iso')
    utf_at = charset.find('utf')
    if iso_at > 0:
        return charset[iso_at:]
    if utf_at > 0:
        return charset[utf_at:]
    # Some old emails say it's ascii or unknown but in reality it is not:
    # do not use any charset that is neither iso nor utf
    if iso_at != 0 and utf_at != 0:
        return None
    return charset


def msg_to_dict(msg):
    """
    Convert a PyZmail message to a dictionary

    Every header becomes a key (with '.' replaced by ','), the text goes
    under 'Body', and nested messages or mail parts, converted by
    recursive calls, go under 'Attachments' when there are any.

    @type msg: PyzMessage
    @param msg: email to convert
    @returns: {'Header': 'content'}
    """
    # FIXME: any repeated header will be ignored
    # Usually it is only 'Received' header
    d = {}

    if msg.text_part:
        body = msg.text_part.get_payload()
        charset = msg.text_part.charset
    else:
        body = msg.get_payload()
        # NOTE(review): if PyzMessage inherits email.message.Message,
        # get_charset() yields a Charset object (or None), not a string;
        # confirm get_content_charset() was not what was meant here.
        charset = msg.get_charset()
    charset = _normalize_charset(charset)

    for header in msg.keys():
        value = msg.get_decoded_header(header)
        value, _ = pyzmail.decode_text(value, charset, None)
        value = value.encode('UTF-8')
        header = header.replace('.', ',')  # mongoDB doesn't like '.' in keys
        d[header] = value

    attach = []
    if isinstance(body, str):
        body, _ = pyzmail.decode_text(body, charset, None)
        body = body.encode('UTF-8')
    # On attachments of emails sometimes it ends up with a list of
    # email.message
    elif isinstance(body, list):
        for part in body:
            attach.append(msg_to_dict(pyzmail.PyzMessage(part)))
        # An empty payload list has no first part to borrow a body from
        body = attach[0]['Body'] if attach else ''
    d['Body'] = body

    if len(msg.mailparts) > 1:
        for mailpart in msg.mailparts:
            attach.append(msg_to_dict(pyzmail.PyzMessage(mailpart.part)))

    if attach:
        d['Attachments'] = attach

    return d