Ejemplo n.º 1
0
def get_email(num, conn):
    """Fetch one message and return its sender, subject, body and attachments.

    Args:
        num: Message identifier handed to ``conn.fetch``.
        conn: Connection object whose ``fetch(num, '(RFC822)')`` returns a
            ``(status, data)`` pair with the raw message at ``data[0][1]``.

    Returns:
        A dict with ``"From"`` and ``"Subject"`` (both passed through
        ``decode_str``), ``"File"`` (one ``UploadedFile`` per MIME part that
        carries a filename) and, when the message has a ``text/plain`` part,
        ``"Body"`` (the text of the last such part).
    """
    result = {}
    typ, content = conn.fetch(num, '(RFC822)')
    msg = BytesParser().parsebytes(content[0][1])
    result["From"] = decode_str(msg.get("From"), "From")
    result["Subject"] = decode_str(msg.get('Subject'), "Subject")
    result["File"] = []
    for part in msg.walk():
        if part.get_content_type() == "text/plain":
            body = part.get_payload(decode=True)
            # A part without a declared charset reports None, and
            # bytes.decode(None) raises TypeError, so fall back to UTF-8.
            charset = part.get_content_charset() or "utf-8"
            try:
                result["Body"] = body.decode(charset)
            except (LookupError, UnicodeDecodeError):
                # Unknown or mislabelled charset: keep whatever is readable.
                result["Body"] = body.decode("utf-8", errors="replace")
        file_name = part.get_filename()
        if file_name is not None:
            name = decode_str(file_name, "File")
            new_file = ContentFile(part.get_payload(decode=True))
            result["File"].append(
                UploadedFile(new_file, name, part.get_content_type(),
                             new_file.size, None, None))
    return result
Ejemplo n.º 2
0
 def get_content(self, data, _path) -> str:
     '''
     Parse the email file at ``_path`` and store its plain-text body.

     The text is written to ``data["Parsed"]`` and also returned, so the
     ``-> str`` annotation holds. An empty string is used when the message
     has no ``text/plain`` body.
     '''
     with open(_path, 'rb') as file:
         msg = BytesParser(policy=policy.default).parse(file)
     # preferencelist is a sequence of subtypes, hence the 1-tuple.
     body = msg.get_body(preferencelist=('plain',))
     data["Parsed"] = body.get_content() if body is not None else ''
     return data["Parsed"]
Ejemplo n.º 3
0
    def fetch_and_parse(uids):
        '''Fetch every UID in ``uids`` from ``imap_server`` and parse it.

        Returns a list with one dict per successfully fetched message. Each
        dict holds ``'Date'`` (a datetime, or None when the header is
        missing), the ``From``/``To``/``Delivered-To``/``Message-ID``/
        ``Subject`` headers, and ``'plain'`` / ``'html'`` with the text of
        the last matching non-attachment part (None when absent).
        '''
        # Local import: the message variable below must not shadow a module.
        from email.utils import parsedate_to_datetime

        result = []
        wanted_headers = ('From', 'To', 'Delivered-To', 'Message-ID',
                          'Subject')

        for uid in uids:
            reply, email_data = imap_server.uid('fetch', uid, '(RFC822)')
            if reply != 'OK':
                continue

            message = BytesParser(policy=default).parsebytes(email_data[0][1])
            email_dict = {}

            # parsedate_to_datetime accepts the RFC 5322 variants (optional
            # weekday, trailing comments) that a fixed strptime format rejects.
            raw_date = message['Date']
            email_dict['Date'] = (parsedate_to_datetime(str(raw_date))
                                  if raw_date is not None else None)

            for header in wanted_headers:
                email_dict[header] = message[header]

            email_dict['plain'] = None
            email_dict['html'] = None
            for part in message.walk():
                # get_body() returns None for attachment parts, so read the
                # leaf's own content and skip attachments explicitly.
                if part.is_attachment():
                    continue
                content_type = part.get_content_type()
                if content_type == 'text/html':
                    email_dict['html'] = part.get_content()
                elif content_type == 'text/plain':
                    email_dict['plain'] = part.get_content()
            result.append(email_dict)

        return result
Ejemplo n.º 4
0
Archivo: etm-n.py Proyecto: 0x024/etm
def get_content(num):
    """Fetch message ``num``, decode each leaf part as GBK and dispatch it.

    The value returned by ``time_formate`` for the message date selects the
    handler: ``'1'`` calls ``get_transfer_v1`` and ``'2'`` calls
    ``get_transfer_v2`` with the decoded text. TypeError and
    UnicodeDecodeError are reported on stdout instead of propagating.
    """
    print(num)
    _status, data = raw_conn.fetch(num, '(RFC822)')
    email_date = get_date(email_list[int(count)])
    try:
        message = BytesParser().parsebytes(data[0][1])
        for part in message.walk():
            if part.is_multipart():
                continue
            text = part.get_payload(decode=True).decode('GBK')
            selector = time_formate(email_date)
            print(selector)
            if selector == '1':
                print(selector)
                get_transfer_v1(text)
            elif selector == '2':
                print(selector)
                get_transfer_v2(text)
    except TypeError:
        print('empty-email')
    except UnicodeDecodeError:
        print('hahah')
def parse_body(body):
    """
        Parse raw email bytes and extract sender, recipient, subject, date
        and the plain-text body.

        Returns a dict with keys 'from', 'to', 'subject', 'date' (header
        values, None when a header is absent) and 'text' (the plain-text
        body, or '' when the message has no usable text/plain part).
    """
    msg = BytesParser(policy=policy.SMTP).parsebytes(body)
    print("This is the message: ", msg.keys())
    print("From : ",msg['From'])
    print("Date: ",msg['Date'])
    print("To: ",msg['To'])
    print("Subject : ",msg['Subject'])
    # Keep the result in its own name so a failure part-way through cannot
    # leave None (or a message object) in the returned 'text' field.
    text = ''
    try:
        part = msg.get_body(preferencelist=('plain',))
        if part is None:
            print('Incoming message does not have an plain text part - skipping this part.')
        else:
            text = part.get_content()
    except Exception:
        # Best effort: an undecodable body is reported and left empty.
        print('Incoming message does not have an plain text part - skipping this part.')

    return {
        'from': msg['From'],
        'to': msg['To'],
        'subject': msg['Subject'],
        'date': msg['Date'],
        'text': text
        }
Ejemplo n.º 6
0
    def Receive(self, index):
        """Retrieve up to six messages ending at ``index`` and print each one.

        Logs in to the POP3 server described by ``self.emailInfo`` (keys
        ``"pop3_server"``, ``"email"``, ``"pwd"``), then walks message
        numbers downward from ``index``, stopping after six messages or at
        message 1, whichever comes first. Every message is parsed and handed
        to ``self.print_info``. The session is closed even if retrieval fails.
        """
        self.server = poplib.POP3_SSL(self.emailInfo["pop3_server"])
        try:
            # Authenticate.
            self.server.user(self.emailInfo["email"])
            self.server.pass_(self.emailInfo["pwd"])

            # Newest first; never go below message number 1.
            stop_before = max(index - 6, 0)
            for i in range(index, stop_before, -1):
                resp, lines, octets = self.server.retr(i)

                # ``lines`` holds the raw message one line per item; joining
                # them restores the complete raw text.
                msg_content = b'\r\n'.join(lines)

                # Parse the message.
                msg = BytesParser().parsebytes(msg_content)
                self.print_info(msg)

                # A message can be deleted from the server by its index:
                # self.server.dele(i)
        finally:
            # Close the connection.
            self.server.quit()
Ejemplo n.º 7
0
def _get_email_content(uid, data):
    content = dict(text=None, html=None, attachments=[])
    email = BytesParser(policy=policy.default).parsebytes(data)

    for part in email.walk():
        if part.is_multipart():
            continue

        if part.is_attachment():
            content['attachments'].append(_read_attachment(part, uid))
            continue

        if part.get_content_type() == 'text/plain':
            content['text'] = _read_text(part)
            continue

        if part.get_content_type() == 'text/html':
            content['html'] = _read_html(part, uid)
            continue

    if content['html'] and not content['text']:
        tmp = open(content['html'], 'r')
        content['text'] = tmp.read()
        tmp.close()

    return content
Ejemplo n.º 8
0
def processEmail(emailBytes):
    """Return ``(subject, cleaned_text)`` for a raw email.

    The raw bytes are decoded as a whole (headers included). If the first
    line contains ``Subject:``, everything after its first eight characters
    is the subject and all lines are kept; otherwise the subject is empty
    and the first line is dropped. Links are removed, runs of spaces/tabs
    and literal ``\\n`` sequences collapse to one space, and whitespace
    before punctuation is removed.
    """
    text = emailBytes.decode()

    lines = text.split('\n')
    subject = lines[0][8:] if 'Subject:' in lines[0] else ''
    kept_lines = lines if subject != '' else lines[1:]
    text = ' '.join(kept_lines)

    # remove links
    text = re.sub(r'https?://\S+', '', text, flags=re.MULTILINE)
    # remove unnecessary spaces
    text = re.sub(r' +|\t+|\\n', ' ', text)
    # remove spaces before punctuation
    text = re.sub(r'\s([,?.!"](?:\s|$))', r'\1', text)

    return subject, text
Ejemplo n.º 9
0
    def parse(self, bytesfile):
        """Parse a message file into a dict of headers, bodies and attachments.

        ``bytesfile`` is handed to ``Parser().parse``. The returned dict
        contains the parsed message (``'msgobj'``), ``'message_id'``,
        ``'date'``, ``'subject'``, newline-joined ``'body'`` and ``'html'``,
        ``'from'`` as ``(name, lower-cased address)``, the ``'to'`` list
        (To plus X-Original-To), ``'cc'``, ``'resent_to'``, ``'resent_cc'``
        and ``'attachments'``.
        """
        msgobj = Parser().parse(bytesfile)

        def addresses(header_name):
            # Every occurrence of the header, run through get_address_list.
            return self.get_address_list(msgobj.get_all(header_name, []))

        subject = self.parse_header_field(msgobj['Subject'])
        body_chunks, html_chunks, attachments = self.parse_body(msgobj.walk())

        recipients = addresses('To')
        recipients.extend(addresses('X-Original-To'))
        copies = addresses('Cc')
        resent_recipients = addresses('resent-to')
        resent_copies = addresses('resent-cc')

        sender_name, sender_addr = parseaddr(self.get(msgobj.get('From')))
        sender = (self.parse_header_field(sender_name),
                  sender_addr.lower() if sender_addr else sender_addr)
        sent_at = self.parse_date(self.get(msgobj.get("Date")))

        return {
            'msgobj': msgobj,
            'message_id': msgobj.get('Message-Id'),
            'date': sent_at,
            'subject': subject,
            'body': '\n'.join(body_chunks),
            'html': '\n'.join(html_chunks),
            'from': sender,
            'to': recipients,
            'cc': copies,
            'resent_to': resent_recipients,
            'resent_cc': resent_copies,
            'attachments': attachments
        }
Ejemplo n.º 10
0
def make_person_schema(mailFile, outputDir, person_db):
    """Record the people mentioned in one mail file into ``person_db``.

    The sender (From) and the mailbox owner (Delivered-To, plus any
    X-Original-To / Resent-From addresses) are stored with ``update_db``;
    the owner is then linked to every To and Cc entry via ``link_people``.
    ``outputDir`` is not used by this function.
    """
    msg = BytesParser().parse(mailFile)

    # Sender of the message.
    sender_name, sender_addr = get_info_from_mail_field(msg['from'])
    update_db(person_db, Person(sender_name, sender_addr))

    # The mailbox owner ("ourself").
    owner_name, owner_addr = get_info_from_mail_field(msg['Delivered-To'])
    me = Person(owner_name, owner_addr)

    # Extra addresses that also belong to the owner.
    for field_name in ('X-Original-To', 'Resent-From'):
        _, alias = get_info_from_mail_field(msg[field_name])
        if alias:
            me.addEmail(alias)

    update_db(person_db, me)

    # Relations to the To and Cc recipients (excluding ourself).
    for header in ('to', 'cc'):
        link_people(person_db, me, msg.get_all(header, []))
Ejemplo n.º 11
0
    def analyse(self, sample, samplename):
        """Build a report from the email file at path ``sample``.

        The headers are decoded with ``eml_parser`` and round-tripped through
        JSON (datetimes become ISO strings); the plain-text body is extracted
        with the standard library parser. Both are passed to
        ``self.build_report``. Any exception is forwarded to
        ``self.unexpectedError``. ``samplename`` is not used here.
        """
        try:

            def json_serial(obj):
                # Datetimes are serialised as ISO 8601; any other
                # unsupported object falls through and becomes null.
                if isinstance(obj, datetime.datetime):
                    return obj.isoformat()

            with open(sample, 'rb') as fhdl:
                raw_email = fhdl.read()

            # Grab all the headers.
            ep = eml_parser.EmlParser()
            parsed_eml = ep.decode_email_bytes(raw_email)

            # Round-trip so the report only holds JSON-compatible values.
            jsonEML = json.loads(json.dumps(parsed_eml, default=json_serial))

            # Grab the email body and pass it into the report.
            with open(sample, 'rb') as fp:
                msg = BytesParser(policy=policy.default).parse(fp)
            # preferencelist is a sequence of subtypes, hence the 1-tuple.
            text = msg.get_body(preferencelist=('plain',)).get_content()

            self.build_report(jsonEML, text)

        except Exception as e:
            self.unexpectedError(e)
Ejemplo n.º 12
0
Archivo: ribbit.py Proyecto: 0xf4b1/keg
class RibbitResponse:
	"""
	A response to a RibbitRequest.

	The request that created that response is available on the .request attribute.
	"""

	def __init__(
		self, request: RibbitRequest, data: bytes, *, verify: bool = True
	) -> None:
		"""
		Parse ``data`` as a MIME message and optionally verify its checksum.

		The checksum is read from the message epilogue via ``parse_checksum``.
		When ``verify`` is true, the SHA-256 of everything before the
		epilogue must match it, otherwise IntegrityVerificationError is
		raised.
		"""
		self.request = request
		self.data = data
		self.date = datetime.utcnow()

		self.message = BytesParser().parsebytes(data)  # type: ignore # (typeshed#2502)
		self.checksum = parse_checksum(self.message.epilogue)

		# The bytes of everything except the checksum (the epilogue)
		# The checksum is of those bytes
		# Slice with an explicit end index: ``data[:-0]`` would be empty
		# rather than the whole payload when the epilogue is empty.
		epilogue_length = len(self.message.epilogue or "")
		self.content_bytes = data[:len(data) - epilogue_length]
		if verify:
			content_checksum = sha256(self.content_bytes).hexdigest()
			if self.checksum != content_checksum:
				raise IntegrityVerificationError("ribbit response", content_checksum, self.checksum)

		self.content = self.message.get_payload(0).get_payload()
		self.signature = self.message.get_payload(1).get_payload()
Ejemplo n.º 13
0
def process_email(raw_email):
    """Store an incoming reply as a ``Message`` linked to its thread.

    The plain-text body is decoded with its declared charset (or a detected
    one). The first ``To`` address of the form ``local+<80-char token>@host``
    supplies the token, which is resolved with ``process_new_token`` and,
    if that raises InvalidTokenException, ``process_old_token``.

    Raises:
        NoTokenFoundException: no ``To`` address carries a token.
    """
    msg = BytesParser(policy=policy.default).parsebytes(raw_email)
    body = msg.get_body(preferencelist=['plain'])
    content = body.get_payload(decode=True)

    charset = body.get_content_charset()
    if not charset:
        # chardet reports None when it cannot guess; decode(None) would
        # raise TypeError, so fall back to UTF-8.
        charset = chardet.detect(content)['encoding'] or 'utf-8'
    content = content.decode(charset)

    # Raw string: "\+" is an invalid escape in an ordinary string literal.
    regex = re.compile(r'^[^+@]+\+(?P<token>[a-zA-Z0-9]{80})@[^@]+$')

    m = None
    for addr in msg.get('To', '').split(','):
        m = regex.match(addr.strip())
        if m:
            break

    if not m:
        raise NoTokenFoundException

    token = m.group('token')

    try:
        in_reply_to, author = process_new_token(token)
    except InvalidTokenException:
        in_reply_to, author = process_old_token(token)

    subject = msg.get('Subject', '')

    Message.objects.create(thread=in_reply_to.thread,
                           in_reply_to=in_reply_to,
                           author=author,
                           subject=subject,
                           content=content)
Ejemplo n.º 14
0
    def _encode_parts(self, header_data, msg_data, encoder):
        """Re-encode every leaf MIME part whose payload is not pure ASCII.

        The message is parsed from ``header_data + msg_data``. For each
        non-multipart part whose payload cannot be encoded as ASCII, the
        ``Content-Transfer-Encoding`` header is deleted and ``encoder`` is
        applied to the part. The message is then passed to
        ``self.parse_msg``.

        :type header_data: :py:obj:`bytes`
        :type msg_data: :py:obj:`bytes`
        """
        self.headers = None
        self.message = None

        raw = header_data + msg_data
        msg = BytesParser().parsebytes(raw) if six.PY3 else Parser().parsestr(raw)

        for part in msg.walk():
            if part.is_multipart():
                continue
            try:
                part.get_payload().encode('ascii')
            except UnicodeError:
                # Drop the stale header before the encoder sets a new one.
                del part['Content-Transfer-Encoding']
                encoder(part)

        self.parse_msg(msg)
Ejemplo n.º 15
0
    def _encode_parts(self, header_data, msg_data, encoder):
        """Re-encode every leaf MIME part whose payload is not pure ASCII.

        The message is parsed from ``header_data + msg_data``. For each
        non-multipart part whose payload cannot be encoded as ASCII, the
        ``Content-Transfer-Encoding`` header is deleted and ``encoder`` is
        applied to the part. The message is then passed to
        ``self.parse_msg``.

        :type header_data: :py:obj:`bytes`
        :type msg_data: :py:obj:`bytes`
        """
        self.headers = None
        self.message = None

        raw = header_data + msg_data
        msg = BytesParser().parsebytes(raw) if six.PY3 else Parser().parsestr(raw)

        leaves = (p for p in msg.walk() if not p.is_multipart())
        for part in leaves:
            try:
                part.get_payload().encode('ascii')
            except UnicodeError:
                # Drop the stale header before the encoder sets a new one.
                del part['Content-Transfer-Encoding']
                encoder(part)

        self.parse_msg(msg)
def decode_email(msg_str, pos, key_map):
    """Build one record (a dict) describing a raw email.

    The record starts as the result of ``parse_parts(message, key_map)`` and
    gains ``'Size'`` (length of ``msg_str``), ``'Attachments'`` (filenames,
    only when any part has one) and either ``'text/plain'`` or, when no
    plain text was found, ``'text/html'``. ``pos`` is not used here.
    """
    message = BytesParser().parsebytes(msg_str)  # get header
    # add header parts specified in key_map
    parts = parse_parts(message, key_map)
    parts['Size'] = len(msg_str)

    plain_body = ''
    html_body = ''
    filenames = []
    for part in message.walk():
        plain_body += decode_part(part, 'text/plain')
        if plain_body:
            # Plain text wins: discard any HTML collected so far.
            html_body = ""
        else:
            html_body += decode_part(part, 'text/html')

        fn = part.get_filename()
        if fn:
            filenames.append(fn)

    if filenames:
        parts['Attachments'] = filenames
    if plain_body:
        parts['text/plain'] = plain_body
    elif html_body:
        parts['text/html'] = html_body
    return parts
Ejemplo n.º 17
0
    def get_text_with_eml(self) -> str:
        """Return the plain-text body of one ``*.eml`` file in the working directory.

        The file is the entry at index 2 of ``glob.glob('*.eml')``, so at
        least three matching files must exist.
        """
        file_list = glob.glob('*.eml')  # returns list of files
        # select a specific email file from the list
        with open(file_list[2], 'rb') as fp:
            msg = BytesParser(policy=policy.default).parse(fp)
        # preferencelist is a sequence of subtypes, hence the 1-tuple.
        return msg.get_body(preferencelist=('plain',)).get_content()
Ejemplo n.º 18
0
def process_email(raw_email):
    """Store an incoming reply as a ``Message`` on the thread its token names.

    The first ``To`` address of the form ``local+<80-char token>@host``
    supplies the token: characters 0-31 identify the thread, 32-63 the
    sender, and the remaining 16 are a key that must equal the first 16
    characters of ``hexdigest_sha256(SECRET_KEY, thread.token, sender.token)``.

    Raises:
        NoTokenFoundException: no ``To`` address carries a token.
        InvalidTokenException: the thread or sender lookup fails.
        InvalidKeyException: the key part of the token does not match.
    """
    import hmac

    msg = BytesParser(policy=policy.default).parsebytes(raw_email)
    body = msg.get_body(preferencelist=['plain'])
    content = body.get_payload(decode=True)

    charset = body.get_content_charset()
    if not charset:
        # chardet reports None when it cannot guess; decode(None) would
        # raise TypeError, so fall back to UTF-8.
        charset = chardet.detect(content)['encoding'] or 'utf-8'
    content = content.decode(charset)

    # Raw string: "\+" is an invalid escape in an ordinary string literal.
    regex = re.compile(r'^[^+@]+\+(?P<token>[a-zA-Z0-9]{80})@[^@]+$')

    m = None
    for addr in msg.get('To', '').split(','):
        m = regex.match(addr.strip())
        if m:
            break

    if not m:
        raise NoTokenFoundException

    token = m.group('token')
    key = token[64:]
    try:
        thread = MessageThread.objects.get(token=token[:32])
        sender = MessageCorrespondent.objects.get(token=token[32:64])
    except models.DoesNotExist:
        raise InvalidTokenException

    expected_key = hexdigest_sha256(settings.SECRET_KEY, thread.token,
                                    sender.token)[:16]
    # The key arrives in an untrusted email; compare in constant time so
    # the check does not leak how many leading characters matched.
    if not hmac.compare_digest(key, expected_key):
        raise InvalidKeyException

    Message.objects.create(thread=thread,
                           from_email=sender.email,
                           content=content)
Ejemplo n.º 19
0
    def preview_held_msg(self, datacomponent):
        # datacomponent as defined/used by simple_term_menu. see self.get_held_items()
        s = datacomponent.split(':')
        # we already know s[1]=="HELD"
        lista = s[0]
        rid = s[2]
        # Get the list
        mmlist = self.mmclient.get_list(lista)
        # From it, get held message by request_id
        msg = mmlist.get_held_message(rid)
        # Extract necessary details
        sender = msg._get('sender')
        subject = msg._get('subject')
        msgid = msg._get('message_id')
        reason = msg._get('reason')
        # MIMEparse msg._get('msg'):
        mp = BytesParser(policy=policy.default).parsebytes(
            msg._get('msg').encode('utf8'))
        preview_text = mp.get_body(preferencelist=('plain')).get_content()
        pre = """{t_full_subject}: {subject}
{t_msgid}: {msgid}
{t_reason}: {reason}

{preview_text}""".format(t_full_subject=_T('FULL SUBJECT'),
                         t_msgid=_T('MESSAGE ID'),
                         t_reason=_T('REASON'),
                         sender=sender,
                         subject=subject,
                         msgid=msgid,
                         reason=reason,
                         preview_text=preview_text)

        return (pre)
Ejemplo n.º 20
0
 def _get_content(self):
     # self.content is provided by __getattr__ through the cache var self._content
     p = BytesParser()
     content = self.content
     content_io = BytesIO(content)
     parsed_msg = p.parse(content_io)
     return parsed_msg
Ejemplo n.º 21
0
    def parse(self, bytesfile):
        """Parse a message file into a dict of headers, bodies and attachments.

        ``bytesfile`` is handed to ``Parser().parse``. ``self.parse_body``
        fills the attachment, body and HTML lists in place. The returned
        dict contains the parsed message (``"msgobj"``), ``"date"``,
        ``"subject"``, newline-joined ``"body"`` and ``"html"``, ``"from"``
        as ``(name, lower-cased address)``, the ``"to"`` list (To plus
        X-Original-To), ``"cc"``, ``"resent_to"``, ``"resent_cc"`` and
        ``"attachments"``.
        """
        msgobj = Parser().parse(bytesfile)

        def addresses(header_name):
            # Every occurrence of the header, run through get_address_list.
            return self.get_address_list(msgobj.get_all(header_name, []))

        subject = self.parse_header_field(msgobj["Subject"])

        attachments, body_chunks, html_chunks = [], [], []
        self.parse_body(msgobj.walk(), attachments, body_chunks, html_chunks)

        recipients = addresses("To")
        recipients.extend(addresses("X-Original-To"))
        copies = addresses("Cc")
        resent_recipients = addresses("resent-to")
        resent_copies = addresses("resent-cc")

        sender_name, sender_addr = parseaddr(self.get(msgobj.get("From")))
        sender = (self.parse_header_field(sender_name),
                  sender_addr.lower() if sender_addr else sender_addr)
        sent_at = self.parse_date(self.get(msgobj.get("Date")))

        return {
            "msgobj": msgobj,
            "date": sent_at,
            "subject": subject,
            "body": u"\n".join(body_chunks),
            "html": u"\n".join(html_chunks),
            "from": sender,
            "to": recipients,
            "cc": copies,
            "resent_to": resent_recipients,
            "resent_cc": resent_copies,
            "attachments": attachments,
        }
Ejemplo n.º 22
0
    def fillUp(self):
        """Dump the body of the latest 100 mailbox messages to ``Data/<uid>.html``.

        For each message the first leaf of the nested multipart structure is
        located; among its siblings the last one is used unless it carries a
        filename, in which case the first is used. The payload is decoded as
        UTF-8, falling back to windows-1251. If anything goes wrong a short
        notice with the subject is written instead.
        """
        self.connect()

        # search and return uids instead
        result, data = self.mail.uid('search', None, "ALL")
        id_list = data[0].split()

        for latest_email_uid in id_list[-100:]:
            result, data = self.mail.uid('fetch', latest_email_uid, '(RFC822)')
            # the raw text of the whole email, including headers and
            # alternate payloads
            raw_email = data[0][1]
            email_message = BytesParser().parsebytes(raw_email)

            try:
                # Descend through first children until the parts are leaves.
                message_juice = email_message.get_payload(decode=False)
                while (isinstance(message_juice, list) and isinstance(
                        message_juice[0].get_payload(decode=False), list)):
                    message_juice = message_juice[0].get_payload(decode=False)

                if isinstance(message_juice, list):
                    last_part = message_juice[-1]
                    chosen = (last_part if last_part.get_filename() is None
                              else message_juice[0])
                    html_message_juice = chosen.get_payload(decode=True)
                else:
                    html_message_juice = email_message.get_payload(decode=True)

                try:
                    text = html_message_juice.decode()
                except UnicodeDecodeError:
                    text = html_message_juice.decode('windows-1251')
            except Exception:
                # Best effort: record that the message could not be handled.
                # str() keeps this working when Subject is missing or is a
                # Header object rather than a plain string.
                text = ("This email could not be processed see what happened \n\nSubject: "
                        + str(email_message['Subject'] or ''))

            # NOTE(review): str() of a bytes UID yields names like
            # "b'42'.html"; kept so existing output paths do not change.
            with open("Data/" + str(latest_email_uid) + ".html", "w",
                      encoding="utf8") as ssd:
                ssd.write(text)
Ejemplo n.º 23
0
 def __init__(self, mail_data, mysql_creds, threshold, sensitivity, account,
              logger, mail_id, spam_folder):
     """Parse ``mail_data`` and prepare the state used by the spam checks.

     Opens a MySQL connection from ``mysql_creds`` (keys ``mysql_username``,
     ``mysql_password``, ``mysql_database``, ``mysql_host``), parses the raw
     mail, runs ``extract_message``, ``check_whitelist`` and
     ``check_blacklisted``, and collects the URLs found in ``self.message``
     into ``self.urls``.
     """
     self.JS_IMPORT_REGEX = r'/<script.*(?:src="(.*)").*>/s'
     self.JS_EXTRACT_REGEX = r'/<script.*>(.*?)<\/script>/s'
     # Backslashes before the parentheses are doubled so the literal has no
     # invalid escape sequences; the resulting pattern text is unchanged.
     self.URL_REGEX = "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|[^\x00-\x7F]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
     self.parser = BytesParser()
     self.sensitivity = sensitivity
     self.threshold = threshold
     self.log = logger
     self.spam_folder = spam_folder
     self.mysql_db = mysql.connector.connect(
         user=mysql_creds["mysql_username"],
         password=mysql_creds["mysql_password"],
         database=mysql_creds["mysql_database"],
         host=mysql_creds["mysql_host"])
     self.account = account
     self.spam_points = 0
     self.js_code = {}
     self.urls_in_document = []
     self.documents = {}
     self.mail_id = mail_id
     self.whitelisted = False
     self.blacklisted = False
     self.parsed_mail = self.parser.parsebytes(mail_data)
     self.header_data = dict(self.parsed_mail)
     self.message = ""
     self.extract_message()
     self._spam = -1
     self.check_whitelist()
     self.check_blacklisted()
     # The pattern has no capturing groups, so findall yields whole matches.
     self.urls = [url.strip()
                  for url in re.findall(self.URL_REGEX, self.message)]
Ejemplo n.º 24
0
    def _login_btn_clicked(self):
        """Log in to Gmail over POP3 and show the newest message in a new window.

        Credentials come from ``self.input_User`` and ``self.input_Pass``.
        Every message is retrieved in turn and the last one is parsed and
        passed to ``self.new_window``. The result of the last retrieval is
        also published in the module globals ``response``, ``headerLines``
        and ``bytes``. Nothing is shown when the mailbox is empty.
        """
        global response, headerLines, bytes

        username = self.input_User.get()
        password = self.input_Pass.get()

        # Never echo the password to the console.
        print(username)

        # Connect to the Gmail POP3 service.
        M = poplib.POP3_SSL('pop.gmail.com')
        try:
            M.user(username)
            M.pass_(password)
            # Number of messages in the mailbox.
            numero = len(M.list()[1])
            if numero == 0:
                # Nothing to retrieve; headerLines would be undefined below.
                return
            # NOTE(review): every message is downloaded although only the
            # last is used; kept because retrieval may have server-side
            # effects - confirm before narrowing this to the last message.
            for i in range(numero):
                # Read the message.
                response, headerLines, bytes = M.retr(i + 1)
        finally:
            # Close the session even when login or retrieval fails.
            M.quit()

        # Join the lines into one blob and parse it.
        mensaje = b'\n'.join(headerLines)
        email = BytesParser().parsebytes(mensaje)
        # Open the new window.
        self.new_window(email)
Ejemplo n.º 25
0
def make_person_schema(mailFile, outputDir, person_db):
  """Record the people mentioned in one mail file into ``person_db``.

  The sender (From) and the mailbox owner (Delivered-To, plus any
  X-Original-To / Resent-From addresses) are stored with ``update_db``;
  the owner is then linked to every To and Cc entry via ``link_people``.
  ``outputDir`` is not used by this function.
  """
  msg = BytesParser().parse(mailFile)

  # Sender of the message.
  sender_name, sender_addr = get_info_from_mail_field(msg['from'])
  update_db(person_db, Person(sender_name, sender_addr))

  # The mailbox owner ("ourself").
  owner_name, owner_addr = get_info_from_mail_field(msg['Delivered-To'])
  me = Person(owner_name, owner_addr)

  # Extra addresses that also belong to the owner.
  for field_name in ('X-Original-To', 'Resent-From'):
    _, alias = get_info_from_mail_field(msg[field_name])
    if alias:
      me.addEmail(alias)

  update_db(person_db, me)

  # Relations to the To and Cc recipients (excluding ourself).
  for header in ('to', 'cc'):
    link_people(person_db, me, msg.get_all(header, []))
Ejemplo n.º 26
0
 def get_mail_content(self, file_name):
     """Return the plain-text body of the email stored in ``file_name``."""
     # The with-block closes the file; no explicit close() is needed.
     with open(file_name, 'rb') as fp:
         msg = BytesParser(policy=policy.default).parse(fp)
     # preferencelist is a sequence of subtypes, hence the 1-tuple.
     return msg.get_body(preferencelist=('plain',)).get_content()
Ejemplo n.º 27
0
def query_S3(bucket, objkey):
    """Return the plain-text body of the email stored under ``objkey``.

    Args:
        bucket: Name of the S3 bucket.
        objkey: Exact key of the object holding the raw email.

    Returns:
        The text of the message's text/plain part, or ``''`` when the
        object is not found or the message has no usable plain-text part.
    """
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket)
    # Bytes, not str: parsebytes() needs bytes even when nothing is found.
    body = b""
    for obj in bucket.objects.all():
        if obj.key == objkey:
            body = obj.get()['Body'].read()
            # Keys are unique within a bucket; stop listing once matched.
            break
    msg = BytesParser(policy=policy.SMTP).parsebytes(body)

    # get the plain text version of the email
    text = ''
    try:
        part = msg.get_body(preferencelist=('plain',))
        if part is None:
            print(
                'Incoming message does not have an plain text part - skipping this part.'
            )
        else:
            text = part.get_content()
    except Exception:
        # Best effort: an undecodable body is reported and left empty.
        print(
            'Incoming message does not have an plain text part - skipping this part.'
        )

    return text
Ejemplo n.º 28
0
    def __init__(self, data):
        """
        Parse raw email bytes into sender, subject, body, recipients, time
        and attachments.

        Cribbed heavily from
        https://www.ianlewis.org/en/parsing-email-attachments-python
        """

        Loggable.__init__(self)

        self.raw = data
        self.attachments = []
        self.recipients = []

        parser = BytesParser(policy=policy.default)
        message = parser.parsebytes(self.raw)

        self.hash = hashlib.sha512(data).hexdigest()

        _, sender_address = parseaddr(str(message["From"]))
        self.sender = sender_address.lower()
        self.subject = str(message["Subject"]).replace("\r\n", "")

        # Prefer plain text and strip everything south of the signature. Note
        # that I'm not sure what will happen here if you send an HTML-only
        # email.
        body_part = message.get_body(
            preferencelist=('plain', 'related', 'html'))
        unsigned = re.sub(r"\r?\n\r?\n-- \r?\n.*", "", str(body_part),
                          flags=re.DOTALL)
        # The first blank-line-separated chunk is dropped (the part headers).
        self.body = "\n\n".join(unsigned.split("\n\n")[1:])

        self._set_recipients(message)
        self._set_time(message)
        self._set_attachments(message)

        self.logger.info('Consuming email: "{}"'.format(self.subject))
Ejemplo n.º 29
0
    def parse1(request_text=None, file_path=None):
        """Return the headers of a raw HTTP request as a dict.

        Args:
            request_text: Raw request bytes (request line, CRLF, headers).
            file_path: When given, takes precedence: the request text is
                loaded via ``ParseReqHeader().get_request_text_by_file``.

        Returns:
            ``{header_name: value}``; empty when the request has only a
            request line.
        """
        if file_path:
            request_text = ParseReqHeader().get_request_text_by_file(file_path)

        # partition() tolerates a request with no CRLF after the request
        # line, where unpacking split(..., 1) raises ValueError.
        request_line, _, headers_alone = request_text.partition(b'\r\n')
        headers = BytesParser().parsebytes(headers_alone)
        return dict(headers.items())
Ejemplo n.º 30
0
def test_prepend_headerfields_encoded():
    # Raw header bytes with a non-ASCII value are handled.
    raw = 'Subject: föö'.encode('utf-8')
    msg = BytesParser(policy=default_policy).parsebytes(raw)
    first_subject = msg.get_all("Subject")[0]
    assert first_subject == "föö"

    result = pgp.prepend_header_fields(msg, [("To", "foo"), ("From", "bar")])
    expected = [('To', 'foo'), ('From', 'bar'), ('Subject', 'föö')]
    assert result.items() == expected
Ejemplo n.º 31
0
def test_prepend_headerfields_as_header_objs():
    # Header values that are email.header.Header instances are handled.
    raw = 'Subject: föö'.encode('utf-8')
    msg = BytesParser(policy=compat32).parsebytes(raw)
    first_subject = msg.get_all("Subject")[0]
    assert not isinstance(first_subject, str)

    result = pgp.prepend_header_fields(msg, [("To", "foo"), ("From", "bar")])
    expected = [('To', 'foo'), ('From', 'bar'),
                ('Subject', '=?unknown-8bit?b?ZsO2w7Y=?=')]
    assert result.items() == expected
Ejemplo n.º 32
0
    def load_email_messages(self, message_set):
        """
        Lazily yield one parsed message for each raw email that
        load_raw_emails returns for message_set.
        """
        parser = BytesParser(policy=email.policy.default)
        yield from (parser.parsebytes(text=raw)
                    for raw in self.load_raw_emails(message_set))
Ejemplo n.º 33
0
def read_em():
    """Parse ``dict/em.txt`` and print its addressing headers and body."""
    parser = BytesParser(policy=default)
    with open("dict/em.txt", 'rb') as fp:
        content = parser.parse(fp, headersonly=False)

    print('To: {}'.format(content['to']))
    print('From: {}'.format(content['from']))
    print('Subject: {}'.format(content['subject']))

    first_recipient = content['to'].addresses[0]
    print('Recipient username: {}'.format(first_recipient.username))
    first_sender = content['from'].addresses[0]
    print('Sender name: {}'.format(first_sender.display_name))

    # Richest body available: related, then HTML, then plain text.
    body_part = content.get_body(preferencelist=('related', 'html', 'plain'))
    print('Body: {}'.format(body_part.get_content()))
Ejemplo n.º 34
0
def fillUp(modeladmin, request, queryset):
    """Create a ``Blog`` entry for each of the latest 100 emails of every object.

    For each object in ``queryset`` the mailbox is connected and searched.
    For every message the first leaf of the nested multipart structure is
    located; among its siblings the last one is used unless it carries a
    filename, in which case the first is used. The payload is decoded as
    UTF-8, falling back to windows-1251. If anything goes wrong a Blog with
    a short notice is saved instead. ``obj.setData`` is called with the
    repr of each processed UID.
    """
    for obj in queryset:
        obj.connect()

        # search and return uids instead
        result, data = obj.mail.uid('search', None, "ALL")
        id_list = data[0].split()

        for latest_email_uid in id_list[-100:]:
            result, data = obj.mail.uid('fetch', latest_email_uid, '(RFC822)')
            # the raw text of the whole email, including headers and
            # alternate payloads
            raw_email = data[0][1]
            email_message = BytesParser().parsebytes(raw_email)
            subject = email_message['Subject']

            try:
                # Descend through first children until the parts are leaves.
                message_juice = email_message.get_payload(decode=False)
                while (isinstance(message_juice, list) and isinstance(
                        message_juice[0].get_payload(decode=False), list)):
                    message_juice = message_juice[0].get_payload(decode=False)

                if isinstance(message_juice, list):
                    last_part = message_juice[-1]
                    chosen = (last_part if last_part.get_filename() is None
                              else message_juice[0])
                    html_message_juice = chosen.get_payload(decode=True)
                else:
                    html_message_juice = email_message.get_payload(decode=True)

                try:
                    body_text = html_message_juice.decode()
                except UnicodeDecodeError:
                    body_text = html_message_juice.decode('windows-1251')

                Blog(title=subject, body=body_text).save()
            except Exception:
                # Best effort: record that the message could not be handled.
                # str() keeps this working when Subject is missing or is a
                # Header object rather than a plain string.
                Blog(
                    title=subject,
                    body="This email could not be processed see what happened \n\nSubject: "
                    + str(subject or '')).save()
            obj.setData(repr(latest_email_uid))
Ejemplo n.º 35
0
def get_mail_body(file_path):
    """Return the plain-text body of the e-mail stored at *file_path*.

    The file is read in binary mode and parsed with ``policy.default``.

    Returns:
        The decoded ``text/plain`` body, or ``""`` when the message has no
        plain-text body part.

    Raises:
        SystemExit: with status 1 (after printing the error) when the file
            cannot be opened or read.
    """
    try:
        with open(file_path, "rb") as fp:
            msg = BytesParser(policy=policy.default).parse(fp)
    except IOError as error:
        print(error)
        # Exit with a failure status; 0 would report success to the caller.
        raise SystemExit(1)
    # ('plain') is just the string 'plain'; the API expects a tuple.
    body_part = msg.get_body(preferencelist=('plain',))
    if body_part is None:
        # No text/plain part (e.g. an HTML-only message).
        return ""
    return body_part.get_content()
Ejemplo n.º 36
0
def get_email_headers(message_bytes, headers=None):
    """Map header names to the list of their parsed values.

    *message_bytes* is wrapped in a ``BytesIO`` and handed to ``Parser``.
    When *headers* is None every header present in the message is reported;
    otherwise only the names yielded by iterating *headers* are looked up.
    Each raw value is passed through ``parse_header_field``; a requested
    header that is absent maps to an empty list.
    """
    parser = Parser()
    with closing(BytesIO(message_bytes)) as buffer:
        parsed = parser.parse(buffer)

    wanted = dict(parsed) if headers is None else headers

    collected = {}
    for name in wanted:
        raw_values = parsed.get_all(name, [])
        collected[name] = [parse_header_field(value) for value in raw_values]
    return collected
Ejemplo n.º 37
0
 def get_content(self, raw):
     """Decode a URL-safe base64 e-mail and return its headers plus body.

     *raw* is base64url-decoded and parsed with ``policy.default``.

     Returns:
         A dict built from the message headers with an extra ``'body'``
         key holding the payload of the preferred ``text/plain`` part as
         returned by ``get_payload()``, or None when there is no such part.
     """
     data = base64.urlsafe_b64decode(raw)
     # Named "message" so the local does not shadow the ``email`` package.
     message = EmailParser(policy=policy.default).parsebytes(data)
     plain = message.get_body(preferencelist=('plain',))
     # Test against None explicitly: a message object's truth value is its
     # header count, so a body part without headers would look "missing".
     body = plain.get_payload() if plain is not None else None
     email_dict = dict(message)
     email_dict['body'] = body
     return email_dict
Ejemplo n.º 38
0
    def __init__(self, data, verbosity=1):
        """
        Parse raw e-mail bytes and keep the message's single attachment.

        Cribbed heavily from
        https://www.ianlewis.org/en/parsing-email-attachments-python

        *data* is the raw message as bytes; *verbosity* is stored as-is.
        Sets ``subject``, ``body`` and ``attachment`` directly and calls
        ``check_subject()``, ``check_body()`` and ``_set_time(message)``.

        Raises InvalidMessageError when the message carries no attachment
        or more than one.
        """

        self.verbosity = verbosity

        self.subject = None
        self.time = None
        self.attachment = None

        message = BytesParser(policy=policy.default).parsebytes(data)
        self.subject = str(message["Subject"]).replace("\r\n", "")
        self.body = str(message.get_body())

        self.check_subject()
        self.check_body()

        self._set_time(message)

        Log.info(
            'Importing email: "{}"'.format(self.subject), Log.COMPONENT_MAIL)

        attachments = []
        for part in message.walk():

            # get_content_disposition() lower-cases the value and strips
            # its parameters; it returns None when the header is absent.
            if part.get_content_disposition() != "attachment":
                continue

            # Let the email package undo the Content-Transfer-Encoding:
            # blindly base64-decoding corrupts quoted-printable or
            # unencoded attachments.
            file_data = part.get_payload(decode=True)

            attachments.append(Attachment(
                file_data, content_type=part.get_content_type()))

        if not attachments:
            raise InvalidMessageError(
                "There don't appear to be any attachments to this message")

        if len(attachments) > 1:
            raise InvalidMessageError(
                "There's more than one attachment to this message. It cannot "
                "be indexed automatically."
            )

        self.attachment = attachments[0]
Ejemplo n.º 39
0
 def split_email(self, raw_email):
     """Separate the named attachments of a raw e-mail from its other parts.

     Returns a ``(to_keep, attachments, parsed_email)`` triple.  For a
     multipart message, top-level parts with a filename are wrapped in
     ``File(payload, filename)`` and the remaining parts are kept as-is.
     For a non-multipart message ``to_keep`` holds just the payload.
     """
     message = BytesParser().parsebytes(raw_email)

     # Single-part mail: nothing to split off.
     if not message.is_multipart():
         return [message.get_payload()], [], message

     kept_parts = []
     extracted = []
     for part in message.get_payload():
         if not part.get_filename():
             kept_parts.append(part)
             continue
         # Only the first fragment of the (possibly RFC 2047 encoded)
         # filename is considered.
         raw_name, charset = decode_header(part.get_filename())[0]
         name = raw_name.decode(charset) if charset else raw_name
         extracted.append(File(part.get_payload(decode=True), name))
     return kept_parts, extracted, message
Ejemplo n.º 40
0
  def process_mailbox(self):
    """Download every message newer than the last one in the history.

    Searches the mailbox for all UIDs, prunes deleted messages from the
    history, then fetches each UID greater than the last stored one, adds
    it to the history container and marks it as read.  Progress is posted
    at 50 after pruning and advanced by ``30 / (newest UID - last UID)``
    per downloaded message.
    """
    rv, data = self.imap.uid('search',None, "ALL")
    if rv != 'OK':
      dbgprint("No messages found!")
      return

    uids = data[0].split()

    #delete removed messages
    self.remove_deleted_msgs_from_history(uids)
    self.post_progress(50)

    #get last history uid
    lastmessage=0
    if self.histcontainer.get_nr_elements('email') > 0:
      lastmessage = self.histcontainer.get_last_element('email').get_uid()

    dbgprint("last element uid: " + str(lastmessage))

    # An empty mailbox yields no UIDs; indexing [-1] would raise IndexError.
    if not uids:
      return

    #loop over all messages and download new ones
    pending = int(uids[-1]) - lastmessage
    if pending <= 0:
      return

    progressstep = 30/pending
    progressactual = 50
    for uid in uids:
      if int(uid) <= lastmessage:
        continue
      # Use a separate name so the search result is not overwritten.
      rv, fetched = self.imap.uid('fetch', uid, '(RFC822)')
      if rv != 'OK':
        # Format via int(): uid may be bytes, which cannot be
        # concatenated to a str.
        dbgprint("ERROR getting message " + str(int(uid)))
        continue
      raw_message = fetched[0][1]
      # The message is parsed twice on purpose: once with the default
      # (legacy) settings for the history element and the attachments,
      # once with policy.default so get_body() is available.
      msg = email.message_from_bytes(raw_message)
      msg2 = BytesParser(policy=policy.default).parsebytes(raw_message)
      body = msg2.get_body(preferencelist=('plain', 'html'))

      el = self.histcontainer.make_element_from_message(int(uid),msg)
      self.process_body(str(body),el)
      self.process_attachments(msg,el)
      self.histcontainer.add_element(el)

      self.mark_msg_as_read(int(uid))
      progressactual+=progressstep
      self.post_progress(progressactual)
Ejemplo n.º 41
0
    def parse(self, bytesfile):
        """Parse *bytesfile* and return a ``ParsedEmail``.

        The text and HTML fragments returned by ``parse_email_body`` are
        each joined with newlines and stripped.  When there is HTML but no
        text, the text is derived from it with ``convert_html_to_text``.
        The main headers from ``parse_main_headers`` are combined with the
        body, HTML and attachments and passed on as keyword arguments.
        """
        message = Parser().parse(bytesfile)

        text_parts, html_parts, attachments = parse_email_body(message)
        text = '\n'.join(text_parts).strip()
        markup = '\n'.join(html_parts).strip()

        # Fall back to a text rendering of the HTML when no text exists.
        if markup and not text:
            text = convert_html_to_text(markup)

        fields = parse_main_headers(message)
        fields.update({
            'body': text,
            'html': markup,
            'attachments': attachments
        })
        return ParsedEmail(message, **fields)
    def parse_attachment(self, message_part):
        """Return *message_part* as a file-like attachment, or None.

        A part counts as an attachment when its Content-Disposition is
        ``attachment``, or ``inline`` with a ``filename`` parameter.

        Returns:
            A ``BytesIO`` holding the decoded payload with the extra
            attributes ``content_type``, ``size``, ``name``,
            ``create_date``, ``mod_date`` and ``read_date``; None when the
            part is not an attachment.
        """
        content_disposition = message_part.get("Content-Disposition", None)
        if not content_disposition:
            return None

        dispo_type, dispo_dict = self.parse_dispositions(content_disposition)
        if not (dispo_type == "attachment" or
                (dispo_type == 'inline' and 'filename' in dispo_dict)):
            return None

        content_type = message_part.get("Content-Type", None)
        file_data = message_part.get_payload(decode=True)
        if file_data is None:
            # decode=True yields None for container parts: serialise the
            # sub-parts instead.
            payloads = message_part.get_payload()
            file_data = '\n\n'.join([p.as_string() for p in payloads])
            # Always end up with bytes.  Silently skipping a failed encode
            # would only defer the error to BytesIO() below.
            file_data = file_data.encode('utf-8', errors='replace')

        attachment = BytesIO(file_data)
        attachment.content_type = message_part.get_content_type()
        attachment.size = len(file_data)
        attachment.name = None
        attachment.create_date = None
        attachment.mod_date = None
        attachment.read_date = None
        if "filename" in dispo_dict:
            attachment.name = dispo_dict['filename']
        if content_type:
            # A "name" parameter on Content-Type overrides the filename.
            _, content_dict = self.parse_dispositions(content_type)
            if 'name' in content_dict:
                attachment.name = content_dict['name']
        # Compare the parsed MIME type, not the raw header, which may
        # carry parameters or different letter case.
        if attachment.name is None and attachment.content_type == 'message/rfc822':
            # Name an attached e-mail after its own subject.
            p = Parser()
            msgobj = p.parse(BytesIO(attachment.getvalue()))
            subject = self.parse_header_field(msgobj['Subject'])
            if subject:
                attachment.name = '%s.eml' % subject[:45]
        if "create-date" in dispo_dict:
            attachment.create_date = dispo_dict['create-date']  # TODO: datetime
        if "modification-date" in dispo_dict:
            attachment.mod_date = dispo_dict['modification-date']  # TODO: datetime
        if "read-date" in dispo_dict:
            attachment.read_date = dispo_dict['read-date']  # TODO: datetime
        return attachment
Ejemplo n.º 43
0
def getMailAttachment(connection, mailID, AttachmentNr):
    """Fetch mail *mailID* and return one of its top-level parts.

    AttachmentNr starting with 1 (index 0 is the message text itself).

    Returns:
        ``(attachment, attachmentName, date_of_mail, subject)``.  The
        first two are None when the requested part does not exist or when
        the From header does not contain ``'Remotefox'``.
    """
    result, data = connection.fetch(mailID, "(RFC822)")
    raw_email = data[0][1]
    msg = BytesParser().parsebytes(raw_email)
    sender = msg.get('From')
    subject = msg.get('Subject')
    date_of_mail = msg.get('Date')
    mail_as_list = msg.get_payload()
    try:
        attachment = mail_as_list[AttachmentNr]  # 0 is the message itself
        attachmentName = attachment.get_filename()
    except (IndexError, TypeError, AttributeError):
        # IndexError: fewer parts than requested.  TypeError or
        # AttributeError: the payload is not a list of parts (a plain,
        # non-multipart mail) or the index is not usable.
        attachment = None
        attachmentName = None
        print('Anhang Nr. ', AttachmentNr, ' of mail ', mailID, ' does not exist.')
    # A mail without a From header has no acceptable sender either.
    if sender is None or 'Remotefox' not in str(sender):
        attachment = None
        attachmentName = None
    return attachment, attachmentName, date_of_mail, subject
Ejemplo n.º 44
0
 def parse_attachment(self, message_part):
     """Wrap an attachment part in a ``BytesIO`` carrying metadata.

     Returns None unless the part's Content-Disposition is ``attachment``
     or ``inline`` with a ``filename`` parameter.  Otherwise returns a
     ``BytesIO`` of the decoded payload with the attributes
     ``content_type``, ``size``, ``name``, ``create_date``, ``mod_date``
     and ``read_date`` set.
     """
     disposition_header = message_part.get("Content-Disposition", None)
     if not disposition_header:
         return None

     kind, params = self.parse_dispositions(disposition_header)
     is_named_inline = kind == "inline" and "filename" in params
     if not (kind == "attachment" or is_named_inline):
         return None

     type_header = message_part.get("Content-Type", None)
     payload = message_part.get_payload(decode=True)
     if payload is None:
         # No decodable payload: serialise the sub-parts instead.
         rendered = [sub.as_string() for sub in message_part.get_payload()]
         payload = "\n\n".join(rendered).encode("utf-8")

     result = BytesIO(payload)
     result.content_type = message_part.get_content_type()
     result.size = len(payload)

     # Name: the disposition filename first, overridden by a Content-Type
     # "name" parameter when one is present.
     result.name = params["filename"] if "filename" in params else None
     if type_header:
         _, type_params = self.parse_dispositions(type_header)
         if "name" in type_params:
             result.name = type_params["name"]
     if result.name is None and type_header == "message/rfc822":
         # Fall back to the subject of the attached message.
         inner = Parser().parse(BytesIO(result.getvalue()))
         subject = self.parse_header_field(inner["Subject"])
         if subject:
             result.name = "%s.eml" % subject[:45]

     # TODO: convert the three dates to datetime
     result.create_date = params["create-date"] if "create-date" in params else None
     result.mod_date = params["modification-date"] if "modification-date" in params else None
     result.read_date = params["read-date"] if "read-date" in params else None
     return result
Ejemplo n.º 45
0
	def extractMetaData(self,obj):
		"""Parse the binary file object *obj* as an e-mail and return its
		headers, as a dict, passed through convertMetaDataToSwiftFormat()."""
		parsed = BytesParser().parse(obj)
		return self.convertMetaDataToSwiftFormat(dict(parsed.items()))
	def extractMetaData(self, obj):
		"""Parse the binary file object *obj* as an e-mail and return its
		headers, as a dict, passed through cleanupMetaDataDict()."""
		parsed = BytesParser().parse(obj)
		return self.cleanupMetaDataDict(dict(parsed.items()))
Ejemplo n.º 47
0
class ArchivesParser(object):
    def __init__(self):
        """Create the reusable message parser."""
        # The legacy compat32 policy is requested explicitly.
        self.parser = BytesParser(policy=compat32)

    def parse(self, stream):
        self.rawtxt = stream.read()
        self.msg = self.parser.parse(io.BytesIO(self.rawtxt))

    def is_msgid(self, msgid):
        # Look for a specific messageid. This means we might parse it twice,
        # but so be it. Any exception means we know it's not this one...
        try:
            if self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID'))) == msgid:
                return True
        except Exception as e:
            return False

    def analyze(self, date_override=None):
        """Extract headers, date, body, attachments and parents of ``self.msg``.

        Sets ``msgid``, ``_from``, ``to``, ``cc``, ``subject``, ``date``,
        ``bodytxt``, ``attachments`` and ``parents`` on the instance.

        When *date_override* is given it is decoded and used as the date.
        Otherwise the Date header is used; if that lies more than 4 hours
        in the future, the earliest usable date found in the Received
        headers replaces it (the header date is kept when none is found).
        """
        self.msgid = self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID')))
        self._from = self.decode_mime_header(self.get_mandatory('From'), True)
        self.to = self.decode_mime_header(self.get_optional('To'), True)
        self.cc = self.decode_mime_header(self.get_optional('CC'), True)
        self.subject = self.decode_mime_header(self.get_optional('Subject'))
        if date_override:
            self.date = self.forgiving_date_decode(date_override)
        else:
            self.date = self.forgiving_date_decode(self.decode_mime_header(self.get_mandatory('Date')))

            # Accept times up to 4 hours in the future, for badly synced clocks
            maxdate = datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(hours=4)
            if self.date > maxdate:
                # Date is in the future, we don't trust that. Instead, let's see if we can find
                # it in the raw text of the message.
                def _extract_date(d):
                    m = _re_received.match(d)
                    if m:
                        try:
                            return self.forgiving_date_decode(m.group(1).strip())
                        except IgnorableException:
                            pass

                # get_all() returns None for a missing header unless given a
                # default, and min() raises ValueError on an empty sequence:
                # guard both so "nothing usable" falls through cleanly.
                received = self.msg.get_all('Received', [])
                lowdate = min((x for x in map(_extract_date, received) if x and x < maxdate), default=None)
                if lowdate:
                    self.date = lowdate
                # Else we're going to go with what we found
        self.bodytxt = self.get_body()
        self.attachments = []
        self.get_attachments()
        if self.attachments:
            log.status("Found %s attachments" % len(self.attachments))

        # Build a list of the message ids we are interested in
        self.parents = []
        # The first one is in-reply-to, if it exists
        if self.get_optional('in-reply-to'):
            m = self.clean_messageid(self.decode_mime_header(self.get_optional('in-reply-to')), True)
            if m:
                self.parents.append(m)

        # Then we add all References values, in backwards order
        if self.get_optional('references'):
            cleaned_msgids = [self.clean_messageid(x, True) for x in reversed(self.decode_mime_header(self.get_optional('references')).split())]
            # Can't do this with a simple self.parents.extend() due to broken
            # mailers that add the same reference more than once. And we can't
            # use a set() to make it unique, because order is very important
            for m in cleaned_msgids:
                if m and m not in self.parents:
                    self.parents.append(m)

    def clean_charset(self, charset):
        lcharset = charset.lower()
        if lcharset == 'unknown-8bit' or lcharset == 'x-unknown' or lcharset == 'unknown':
            # Special case where we don't know... We'll assume
            # us-ascii and use replacements
            return 'us-ascii'
        if lcharset == '0' or lcharset == 'x-user-defined' or lcharset == '_autodetect_all' or lcharset == 'default_charset':
            # Seriously broken charset definitions, map to us-ascii
            # and throw away the rest with replacements
            return 'us-ascii'
        if lcharset == 'x-gbk':
            # Some MUAs set it to x-gbk, but there is a valid
            # declaratoin as gbk...
            return 'gbk'
        if lcharset == 'iso-8859-8-i':
            # -I is a special logical version, but should be the
            # same charset
            return 'iso-8859-8'
        if lcharset == 'windows-874':
            # This is an alias for iso-8859-11
            return 'iso-8859-11'
        if lcharset == 'iso-88-59-1' or lcharset == 'iso-8858-1':
            # Strange way of saying 8859....
            return 'iso-8859-1'
        if lcharset == 'iso885915':
            return 'iso-8859-15'
        if lcharset == 'iso-latin-2':
            return 'iso-8859-2'
        if lcharset == 'iso-850':
            # Strange spelling of cp850 (windows charset)
            return 'cp850'
        if lcharset == 'koi8r':
            return 'koi8-r'
        if lcharset == 'cp 1252':
            return 'cp1252'
        if lcharset == 'iso-8859-1,iso-8859-2' or lcharset == 'iso-8859-1:utf8:us-ascii':
            # Why did this show up more than once?!
            return 'iso-8859-1'
        if lcharset == 'x-windows-949':
            return 'ms949'
        if lcharset == 'pt_pt' or lcharset == 'de_latin' or lcharset == 'de':
            # This is a locale, and not a charset, but most likely it's this one
            return 'iso-8859-1'
        if lcharset == 'iso-8858-15':
            # How is this a *common* mistake?
            return 'iso-8859-15'
        if lcharset == 'macintosh':
            return 'mac_roman'
        if lcharset == 'cn-big5':
            return 'big5'
        if lcharset == 'x-unicode-2-0-utf-7':
            return 'utf-7'
        if lcharset == 'tscii':
            # No support for this charset :S Map it down to ascii
            # and throw away all the rest. sucks, but we have to
            return 'us-ascii'
        return charset

    def get_payload_as_unicode(self, msg):
        try:
            b = msg.get_payload(decode=True)
        except AssertionError:
            # Badly encoded data can throw an exception here, where the python
            # libraries fail to handle it and enters a cannot-happen path.
            # In which case we just ignore it and hope for a better MIME part later.
            b = None

        if b:
            # Find out if there is a charset
            charset = None
            params = msg.get_params()
            if not params:
                # No content-type, so we assume us-ascii
                return str(b, 'us-ascii', errors='ignore')
            for k, v in params:
                if k.lower() == 'charset':
                    charset = v
                    break
            if charset:
                try:
                    return str(b, self.clean_charset(charset), errors='ignore')
                except LookupError as e:
                    raise IgnorableException("Failed to get unicode payload: %s" % e)
            else:
                # XXX: reasonable default?
                return str(b, errors='ignore')
        # Return None or empty string, depending on what we got back
        return b

    # Regular expression matching the PostgreSQL custom mail footer that
    # is appended to all emails.
    # Written as a raw string so that \s, \( and \. reach the regex engine
    # unchanged instead of being invalid string escape sequences.
    _re_footer = re.compile(r'(.*)--\s+\nSent via [^\s]+ mailing list \([^\)]+\)\nTo make changes to your subscription:\nhttp://www\.postgresql\.org/mailpref/[^\s]+\s*$', re.DOTALL)

    def get_body(self):
        b = self._get_body()
        if b:
            # Python bug 9133, allows unicode surrogate pairs - which PostgreSQL will
            # later reject..
            if b.find('\udbff\n\udef8'):
                b = b.replace('\udbff\n\udef8', '')

        # Remove postgres specific mail footer - if it's there
        m = self._re_footer.match(b)
        if m:
            b = m.group(1)

        # Sometimes we end up with a trailing \0 when decoding long strings, so
        # replace it if it's there.
        # In fact, replace it everywhere, since it can also turn up in the middle
        # of a text when it's a really broken decoding.
        b = b.replace('\0', '')

        return b

    def _get_body(self):
        # This is where the magic happens - try to figure out what the body
        # of this message should render as.
        hasempty = False

        # First see if this is a single-part message that we can just
        # decode and go.
        b = self.get_payload_as_unicode(self.msg)
        if b:
            return b
        if b == '':
            # We found something, but it was empty. We'll keep looking as
            # there might be something better available, but make a note
            # that empty exists.
            hasempty = True

        # Ok, it's multipart. Find the first part that is text/plain,
        # and use that one. Do this recursively, since we may have something
        # like:
        # multipart/mixed:
        #   multipart/alternative:
        #      text/plain
        #      text/html
        #   application/octet-stream (attachment)
        b = self.recursive_first_plaintext(self.msg)
        if b:
            return b
        if b == '':
            hasempty = True

        # Couldn't find a plaintext. Look for the first HTML in that case.
        # Fallback, but what can we do at this point...
        b = self.recursive_first_plaintext(self.msg, True)
        if b:
            b = self.html_clean(b)
            if b:
                return b
        if b == '' or b is None:
            hasempty = True

        if hasempty:
            log.status('Found empty body in %s' % self.msgid)
            return ''
        raise IgnorableException("Don't know how to read the body from %s" % self.msgid)

    def recursive_first_plaintext(self, container, html_instead=False):
        pl = container.get_payload()
        if isinstance(pl, str):
            # This was not a multipart, but it leaked... Give up!
            return None
        for p in pl:
            if p.get_params() is None:
                # MIME multipart/mixed, but no MIME type on the part
                log.status("Found multipart/mixed in message '%s', but no MIME type on part. Trying text/plain." % self.msgid)
                return self.get_payload_as_unicode(p)
            if p.get_params()[0][0].lower() == 'text/plain':
                # Don't include it if it looks like an attachment
                if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
                    continue
                return self.get_payload_as_unicode(p)
            if html_instead and p.get_params()[0][0].lower() == 'text/html':
                # Don't include it if it looks like an attachment
                if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
                    continue
                return self.get_payload_as_unicode(p)
            if p.is_multipart():
                b = self.recursive_first_plaintext(p, html_instead)
                if b or b == '':
                    return b

        # Yikes, nothing here! Hopefully we'll find something when
        # we continue looping at a higher level.
        return None

    def get_attachments(self):
        """Collect the attachments of ``self.msg``.

        Delegates to ``recursive_get_attachments``.
        """
        # Reset the flag before the scan starts.
        self.attachments_found_first_plaintext = False
        self.recursive_get_attachments(self.msg)

    # Clean a filenames encoding and return it as a unicode string
    def _clean_filename_encoding(self, filename):
        # If this is a header-encoded filename, start by decoding that
        if filename.startswith('=?'):
            decoded, encoding = decode_header(filename)[0]
            return str(decoded, encoding, errors='ignore')

        # If it's already unicode, just return it
        if isinstance(filename, str):
            return filename

        # Anything that's not UTF8, we just get rid of. We can live with
        # filenames slightly mangled in this case.
        return str(filename, 'utf-8', errors='ignore')

    def _extract_filename(self, container):
        # Try to get the filename for an attachment in the container.
        # If the standard library can figure one out, use that one.
        f = container.get_filename()
        if f:
            return self._clean_filename_encoding(f)

        # Failing that, some mailers set Content-Description to the
        # filename
        if 'Content-Description' in container:
            return self._clean_filename_encoding(container['Content-Description'])
        return None

    def recursive_get_attachments(self, container):
        """Append the attachments found in *container* to ``self.attachments``.

        Each attachment is stored as a ``(filename, content_type, payload)``
        tuple.  multipart/mixed and multipart/signed containers are scanned
        recursively; multipart/alternative and every other multipart type
        are not descended into.  Signature parts are ignored.  A text/plain
        part is only considered when it has a ``name`` parameter, an
        ``attachment`` disposition, or when
        ``self.attachments_found_first_plaintext`` is already set.
        """
        # We start recursion in the "multipart" container if any
        if container.get_content_type() == 'multipart/mixed' or container.get_content_type() == 'multipart/signed':
            # Multipart - worth scanning into
            if not container.is_multipart():
                # Wow, this is broken. It's multipart/mixed, but doesn't
                # contain multiple parts.
                # Since we're just looking for attachments, let's just
                # ignore it...
                return
            for p in container.get_payload():
                # Parts without any MIME type information are skipped.
                if p.get_params() is None:
                    continue
                self.recursive_get_attachments(p)
        elif container.get_content_type() == 'multipart/alternative':
            # Alternative is not an attachment (we decide)
            # It's typically plaintext + html
            self.attachments_found_first_plaintext = True
            return
        elif container.is_multipart():
            # Any other kind of multipart container is ignored.
            return
        else:
            # Not a multipart.
            # Exclude specific contenttypes
            if container.get_content_type() == 'application/pgp-signature':
                return
            if container.get_content_type() in ('application/pkcs7-signature', 'application/x-pkcs7-signature'):
                return
            # For now, accept anything not text/plain
            if container.get_content_type() != 'text/plain':
                try:
                    self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
                except AssertionError:
                    # Badly encoded data can throw an exception here, where the python
                    # libraries fail to handle it and enters a cannot-happen path.
                    # In which case we just ignore this attachment.
                    return
                return

            # It's a text/plain, it might be worthwhile.
            # If it has a name, we consider it an attachment
            if not container.get_params():
                return
            for k, v in container.get_params():
                if k == 'name' and v != '':
                    # Yes, it has a name
                    try:
                        self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
                    except AssertionError:
                        # Badly encoded data can throw an exception here, where the python
                        # libraries fail to handle it and enters a cannot-happen path.
                        # In which case we just ignore this attachment.
                        return

                    return

            # If it's content-disposition=attachment, we also want to save it
            if 'Content-Disposition' in container and container['Content-Disposition'].startswith('attachment'):
                try:
                    self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
                except AssertionError:
                    # Badly encoded data can throw an exception here, where the python
                    # libraries fail to handle it and enters a cannot-happen path.
                    # In which case we just ignore this attachment.
                    return

                return

            # If we have already found one text/plain part, make all
            # further text/plain parts attachments
            if self.attachments_found_first_plaintext:
                # However, this will also *always* catch the MIME part added
                # by majordomo with the footer. So if that one is present,
                # we need to explicitly exclude it again.
                try:
                    b = container.get_payload(decode=True)
                except AssertionError:
                    # Badly encoded data can throw an exception here, where the python
                    # libraries fail to handle it and enters a cannot-happen path.
                    # In which case we just ignore this attachment.
                    return

                # NOTE(review): get_payload(decode=True) normally returns
                # bytes, so this isinstance(b, str) test looks like it can
                # never succeed and extra text/plain parts would never be
                # stored - confirm whether that is intended.
                if isinstance(b, str) and not self._re_footer.match(b):
                    # We know there is no name for this one
                    self.attachments.append((None, container.get_content_type(), b))
                return

            # Ok, so this was a plaintext that we ignored. Set the flag
            # that we have now ignored one, so we'll make the next one
            # an attachment.
            self.attachments_found_first_plaintext = True
            # No name, and text/plain, so ignore it

    # Matches a message id wrapped in angle brackets, allowing surrounding
    # whitespace. Raw string: \s must reach the regex engine unchanged.
    re_msgid = re.compile(r'^\s*<(.*)>\s*')

    def clean_messageid(self, messageid, ignorebroken=False):
        m = self.re_msgid.match(messageid)
        if not m:
            if ignorebroken:
                log.status("Could not parse messageid '%s', ignoring it" % messageid)
                return None
            raise IgnorableException("Could not parse message id '%s'" % messageid)
        return m.groups(1)[0].replace(' ', '')

#    _date_multi_re = re.compile(' \((\w+\s\w+(\s+\w+)*|)\)$')
    # Now using [^\s] instead of \w, to work with japanese chars
    _date_multi_re = re.compile(' \(([^\s]+\s[^\s]+(\s+[^\s]+)*|)\)$')
    _date_multi_re2 = re.compile(' ([\+-]\d{4}) \([^)]+\)$')
    _date_multiminus_re = re.compile(' -(-\d+)$')
    _date_offsetnoplus_re = re.compile(' (\d{4})$')

    def forgiving_date_decode(self, d):
        """Parse the date string *d*, tolerating many malformed variants.

        Known-bad timezone spellings are rewritten before the string is
        handed to ``dateutil.parser.parse`` in fuzzy mode.

        Returns:
            A timezone-aware ``datetime``; results without a timezone are
            tagged as UTC, and offsets of 16 hours or more are converted
            to a UTC timestamp.

        Raises:
            IgnorableException: *d* is empty or cannot be parsed.
        """
        if d.strip() == '':
            raise IgnorableException("Failed to parse empty date")

        # Strange timezones requiring manual adjustments. Applied in order,
        # each one only when the string (still) ends with the bad suffix.
        suffix_fixes = (
            ('-7700 (EST)', 'EST'),
            ('+6700 (EST)', 'EST'),
            ('+-4-30', '+0430'),
            ('+1.00', '+0100'),
            ('+-100', '+0100'),
            ('+500', '+0500'),
            ('-500', '-0500'),
            ('-700', '-0700'),
            ('-800', '-0800'),
            ('+05-30', '+0530'),
            ('+0-900', '-0900'),
            ('Mexico/General', 'CDT'),
            ('Pacific Daylight Time', 'PDT'),
            (' ZE2', ' +0200'),
        )
        for broken, fixed in suffix_fixes:
            if d.endswith(broken):
                d = d.replace(broken, fixed)

        # Fixes that apply anywhere in the string. replace() is a no-op when
        # the text is absent, so no str.find() test is needed (find()
        # returns -1, which is true, when nothing is found).
        d = d.replace('-Juin-', '-Jun-')
        d = d.replace('-Juil-', '-Jul-')
        d = d.replace(' 0 (GMT)', ' +0000')

        # A doubled minus sign in front of the trailing offset.
        d = self._date_multiminus_re.sub(' \\1', d)

        # A trailing four digit offset without a sign.
        d = self._date_offsetnoplus_re.sub('+\\1', d)

        # We have a number of dates in the format
        # "<full datespace> +0200 (MET DST)"
        # or similar. The problem coming from the space within the
        # parenthesis, or if the contents of the parenthesis is
        # completely empty
        d = self._date_multi_re.sub('', d)

        # If the spec is instead
        # "<full datespace> +0200 (...)"
        # of any kind, we can just remove what's in the (), because the
        # parser is just going to rely on the fixed offset anyway.
        d = self._date_multi_re2.sub(' \\1', d)

        try:
            dp = dateutil.parser.parse(d, fuzzy=True)

            # Some offsets are >16 hours, which postgresql will not
            # (for good reasons) accept
            offset = dp.utcoffset()
            if offset and abs(offset.days * (24 * 60 * 60) + offset.seconds) > 60 * 60 * 16 - 1:
                # Convert it to a UTC timestamp using Python. It will give
                # us the right time, but the wrong timezone. Should be
                # enough...
                dp = datetime.datetime(*dp.utctimetuple()[:6])
            if not dp.tzinfo:
                dp = dp.replace(tzinfo=datetime.timezone.utc)
            return dp
        except Exception as e:
            raise IgnorableException("Failed to parse date '%s': %s" % (d, e))

    def _maybe_decode(self, s, charset):
        if isinstance(s, str):
            return s.strip(' ')
        return str(s, charset and self.clean_charset(charset) or 'us-ascii', errors='ignore').strip(' ')

    # Workaround for broken quoting in some MUAs (see below)
    _re_mailworkaround = re.compile('"(=\?[^\?]+\?[QB]\?[^\?]+\?=)"', re.IGNORECASE)

    def _decode_mime_header(self, hdr, email_workaround):
        if hdr is None:
            return None

        # Per http://bugs.python.org/issue504152 (and lots of testing), it seems
        # we must get rid of the sequence \n\t at least in the header. If we
        # do this *before* doing any MIME decoding, we should be safe against
        # anybody *actually* putting that sequence in the header (since we
        # won't match the encoded contents)
        hdr = hdr.replace("\n\t", " ")

        # In at least some cases, at least gmail (and possibly other MUAs)
        # incorrectly put double quotes in the name/email field even when
        # it's encoded. That's not allowed - they have to be escaped - but
        # since there's a fair amount of those, we apply a regex to get
        # rid of them.
        m = self._re_mailworkaround.search(hdr)
        if m:
            hdr = self._re_mailworkaround.sub(r'\1', hdr)

        try:
            return " ".join([self._maybe_decode(s, charset) for s, charset in decode_header(hdr)])
        except HeaderParseError as e:
            # Parser error is typically someone specifying an encoding,
            # but then not actually using that encoding. We'll do the best
            # we can, which is cut it down to ascii and ignore errors
            return str(hdr, 'us-ascii', errors='ignore').strip(' ')

    def decode_mime_header(self, hdr, email_workaround=False):
        """Decode a header value to text with NUL characters removed.

        ``Header`` instances are first flattened with ``encode()``. An empty
        or missing result is returned as ``''``.

        Raises:
            IgnorableException: if decoding fails with a ``LookupError`` or
                a ``ValueError``.
        """
        try:
            if isinstance(hdr, Header):
                hdr = hdr.encode()
            decoded = self._decode_mime_header(hdr, email_workaround)
        except (LookupError, ValueError) as err:
            raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, err))

        if not decoded:
            return ''
        return decoded.replace("\0", "")

    def get_mandatory(self, fieldname):
        """Return the value of ``fieldname`` from ``self.msg``.

        Raises:
            IgnorableException: if the lookup raises an ordinary exception
                or yields ``None``.
        """
        try:
            value = self.msg[fieldname]
        except Exception:
            # Only ordinary errors count as "missing"; KeyboardInterrupt and
            # SystemExit are deliberately left to propagate.
            value = None
        if value is None:
            raise IgnorableException("Mandatory field '%s' is missing" % fieldname)
        return value

    def get_optional(self, fieldname):
        """Return ``self.msg[fieldname]``, or ``''`` if the lookup fails.

        Whatever the lookup yields is returned unchanged. Only ordinary
        exceptions are turned into ``''``; KeyboardInterrupt and SystemExit
        propagate.
        """
        try:
            return self.msg[fieldname]
        except Exception:
            return ''

    def html_clean(self, html):
        """Tidy ``html`` and reduce it to text.

        The document is first run through ``tidylib.tidy_document``. If
        that reports anything in its error output, the report is printed
        (together with ``self.msgid``) and ``None`` is returned. Otherwise
        the tidied markup is fed to ``HTMLCleaner`` and its ``get_text()``
        result is returned; any exception during that step also yields
        ``None``.
        """
        tidy_options = {
            'drop-proprietary-attributes': 1,
            'alt-text': '',
            'hide-comments': 1,
            'output-xhtml': 1,
            'show-body-only': 1,
            'clean': 1,
            'char-encoding': 'utf8',
            'show-warnings': 0,
            'show-info': 0,
        }

        # Step one: normalise the markup with tidy
        tidied, tidy_errors = tidylib.tidy_document(html, options=tidy_options)
        if tidy_errors:
            print("HTML tidy failed for %s!" % self.msgid)
            print(tidy_errors)
            return None

        # Step two: extract the text from the tidied markup
        try:
            stripper = HTMLCleaner()
            stripper.feed(tidied)
            return stripper.get_text()
        except Exception:
            # The html could not be parsed, so it cannot be cleaned either;
            # give up on this document.
            return None
Ejemplo n.º 48
0
    logger.exception('Wrong login/password!')
    sys.exit()

# Getting all unseen mail

result, data = mail.search(None, 'unseen')
if len(data[0]) == 0:
    logger.info('No unseen mails!')
    logger.info(mail.logout())
    logger.info(s.quit())
else:
    ids = data[0].split()  # getting unseen letters id list
    msgs = []
    for x in ids:
        result, data = mail.fetch(x, 'RFC822')
        parser = BytesParser()
        msg = parser.parsebytes(data[0][1])
        # changing "To" and "From" fields in header
        msg.__delitem__('To')
        msg.__setitem__('To', smtp_send_to_header)
        msg.__delitem__('From')
        msg.__setitem__('From', smtp_login)
        msgs.append(msg)
    logger.info('Have %d new letters', len(ids))
    logger.info('Close imap protocol')
    logger.info(mail.logout())

    # sending messages

    i = 0
    for msg in msgs:
Ejemplo n.º 49
0
 def __init__(self):
     """Create a BytesParser using the compat32 policy and keep it as self.parser."""
     self.parser = BytesParser(policy=compat32)
Ejemplo n.º 50
0
from email import policy
from email.parser import BytesParser


# Read the complete raw message from stdin before fd 0 is re-pointed below.
raw = sys.stdin.buffer.read()

# When stdin is not a terminal, replace fd 0 with the controlling terminal
# (presumably so that later prompts can read from the user).
if not os.isatty(0):
    try:
        fd = os.open('/dev/tty', os.O_RDONLY)
    except OSError:
        # os.open() signals failure by raising, never by returning a
        # negative descriptor.
        sys.stderr.write('Unable to open an input tty.\n')
        sys.exit(-1)
    os.dup2(fd, 0)
    os.close(fd)

msg = BytesParser(policy=policy.default).parsebytes(raw)

# We can extract the richest alternative in order to display it:
richest = msg.get_body()
partfiles = {}
if richest['content-type'].maintype == 'text':
    if richest['content-type'].subtype == 'plain':
        for line in richest.get_content().splitlines():
            print(line)
        sys.exit()
    elif richest['content-type'].subtype == 'html':
        body = richest
    else:
        print("Don't know how to display {}".format(richest.get_content_type()))
        sys.exit()
elif richest['content-type'].content_type == 'multipart/related':
import os
import sys
import tempfile
import mimetypes
import webbrowser

# Import the email modules we'll need
from email import policy
from email.parser import BytesParser

# An imaginary module that would make this work and be safe.
from imaginary import magic_html_parser

# In a real program you'd get the filename from the arguments.
# The context manager closes the file once the message has been parsed.
with open('outgoing.msg', 'rb') as fp:
    msg = BytesParser(policy=policy.default).parse(fp)

# Now the header items can be accessed as a dictionary, and any non-ASCII will
# be converted to unicode:
print('To:', msg['to'])
print('From:', msg['from'])
print('Subject:', msg['subject'])

# If we want to print a preview of the message content, we can extract whatever
# the least formatted payload is and print the first three lines.  Of course,
# if the message has no plain text part printing the first three lines of html
# is probably useless, but this is just a conceptual example.
simplest = msg.get_body(preferencelist=('plain', 'html'))
print()
print(''.join(simplest.get_content().splitlines(keepends=True)[:3]))

ans = input("View full message?")
Ejemplo n.º 52
-1
def process_email(raw_email):
    """Parse a raw incoming email and store it as a reply ``Message``.

    The plain-text body is decoded using its declared charset, falling back
    to chardet's guess and finally to UTF-8 with replacement characters.
    The ``To`` header is scanned for an address of the form
    ``local+<80 alphanumeric chars>@domain``; the token identifies the
    message being replied to and its author.

    Args:
        raw_email: The complete message as bytes.

    Raises:
        NoTokenFoundException: if no ``To`` address carries a token.
    """
    msg = BytesParser(policy=policy.default).parsebytes(raw_email)
    # NOTE(review): get_body() returns None when the message has no
    # text/plain part, which makes the next line fail - confirm whether such
    # mails can reach this function.
    body = msg.get_body(preferencelist=['plain'])
    content = body.get_payload(decode=True)

    charset = body.get_content_charset()
    if not charset:
        charset = chardet.detect(content)['encoding']
    if charset:
        content = content.decode(charset)
    else:
        # chardet could not guess an encoding (it reports None); decode
        # leniently instead of failing on decode(None).
        content = content.decode('utf-8', errors='replace')

    # Raw string so that "\+" is a regex escape, not an invalid string escape
    regex = re.compile(r'^[^+@]+\+(?P<token>[a-zA-Z0-9]{80})@[^@]+$')

    for addr in msg.get('To', '').split(','):
        m = regex.match(addr.strip())
        if m:
            break
    else:
        # No recipient address matched
        raise NoTokenFoundException

    token = m.group('token')

    # Try the current token scheme first, then fall back to the old one
    try:
        in_reply_to, author = process_new_token(token)
    except InvalidTokenException:
        in_reply_to, author = process_old_token(token)

    subject = msg.get('Subject', '')

    Message.objects.create(thread=in_reply_to.thread, in_reply_to=in_reply_to, author=author, subject=subject, content=content)