Beispiel #1
0
def extract_urls(mail_parts, defanged_urls=False):
    """Gather the distinct URLs found in the body parts of a parsed mail.

    Args:
        mail_parts: Iterable of mappings. Each one is indexed with
            ``'is_body'`` (falsy, or a content-type string) and, for
            HTML / plain-text bodies, ``'decoded_body'``.
        defanged_urls: Passed through unchanged to ``_extract_urls``.

    Returns:
        A list of the unique values produced by ``_extract_urls``; the
        order is unspecified because they are de-duplicated via a set.
    """
    # Content-type prefix -> value of the ``html`` flag for _extract_urls.
    body_kinds = (
        ('text/html', True),
        ('text/plain', False),
    )

    found = set()
    for part in mail_parts:
        content_type = part['is_body']
        if not content_type:
            # Not a body part: nothing to scan.
            continue
        for prefix, is_html in body_kinds:
            if content_type.startswith(prefix):
                found.update(
                    _extract_urls(
                        part['decoded_body'],
                        html=is_html,
                        defanged_urls=defanged_urls,
                    )
                )

    return list(found)
Beispiel #2
0
def _decode_part_payload(part):
    """Return the payload of a mail part as text.

    The charset declared on the part (its ``charset`` attribute, if any)
    is tried first, then UTF-8, and finally latin-1 as a last resort.
    """
    payload = part.get_payload()
    if not isinstance(payload, bytes):
        # Already text (or missing); there is nothing to decode.
        return payload or ''

    declared = getattr(part, 'charset', None)
    for codec in (declared, 'utf-8'):
        if not codec:
            continue
        try:
            return payload.decode(codec)
        except (UnicodeDecodeError, LookupError):
            # Wrong or unknown charset: fall through to the next candidate.
            continue

    # latin-1 maps every byte value, so this final attempt cannot fail.
    return payload.decode('latin-1')


def from_string(data):
    """Parse a raw e-mail and extract its body, URLs and e-mail addresses.

    The HTML part is preferred; if there is none the plain-text part is
    used. Only when the message has neither are its parts recorded as
    attachments, and the returned message body is then empty.

    Args:
        data: Raw message, handed to ``pyzmail.PyzMessage.factory``.

    Returns:
        A dict with the keys ``'message'`` (decoded body text or ``''``),
        ``'urls'`` and ``'email_addresses'`` (lists), ``'headers'`` (the
        result of ``parse_headers``) and ``'attachments'`` (a list of
        ``{'type': ..., 'attachment': ...}`` dicts).
    """
    message = pyzmail.PyzMessage.factory(data)
    headers = parse_headers(message)

    # TODO: forwarded mails (inline or as .eml attachments, possibly several)
    # and document attachments that accompany a body are not handled yet.

    urls = set()
    email_addresses = set()
    attachments = []
    body = ''

    if message.html_part:
        body = _decode_part_payload(message.html_part)
        urls = _extract_urls(body, html=True)
        email_addresses = _extract_email_addresses(body, html=True)
    elif message.text_part:
        body = _decode_part_payload(message.text_part)
        urls = _extract_urls(body)
        email_addresses = _extract_email_addresses(body)
    else:
        # get_payload() only returns a list of sub-messages for multipart
        # mails; for anything else it is a string, so treat the message
        # itself as the single part instead of iterating its characters.
        if message.is_multipart():
            parts = message.get_payload()
        else:
            parts = [message]
        attachments = [
            {
                'type': part.get_content_type(),
                'attachment': str(part),
            }
            for part in parts
        ]

    return {
        'message': body,
        'urls': list(urls),
        'email_addresses': list(email_addresses),
        'headers': headers,
        'attachments': attachments,
    }