コード例 #1
0
ファイル: mail_extractor.py プロジェクト: ptal/people-link
def make_person_schema(mailFile, outputDir, person_db):
  """Store the sender and the mailbox owner of one mail in ``person_db``.

  The message is parsed from the binary file object ``mailFile``. The
  ``From`` person and the ``Delivered-To`` person are each passed to
  ``update_db``; the latter is then handed to ``link_people`` together with
  the ``To`` and ``Cc`` header values. ``outputDir`` is accepted but not
  used by this function.
  """
  message = BytesParser().parse(mailFile)

  # The sender is registered first.
  sender_name, sender_addr = get_info_from_mail_field(message['from'])
  sender = Person(sender_name, sender_addr)
  update_db(person_db, sender)

  # "me" is whoever the message was delivered to.
  owner_name, owner_addr = get_info_from_mail_field(message['Delivered-To'])
  me = Person(owner_name, owner_addr)

  # These headers may carry additional addresses for the same person.
  for header in ('X-Original-To', 'Resent-From'):
    _, alias = get_info_from_mail_field(message[header])
    if alias:
      me.addEmail(alias)

  update_db(person_db, me)

  # Relate "me" to the To recipients, then to the Cc recipients.
  for header in ('to', 'cc'):
    link_people(person_db, me, message.get_all(header, []))
コード例 #2
0
def make_person_schema(mailFile, outputDir, person_db):
    """Register the people named in one mail message.

    ``mailFile`` is a binary file object holding the message. The sender
    (``From``) and the recipient the mail was delivered to (``Delivered-To``)
    are both written to ``person_db`` through ``update_db``; the recipient is
    then passed to ``link_people`` with the ``To`` and ``Cc`` header values.
    ``outputDir`` is part of the signature but is not read here.
    """
    parsed = BytesParser().parse(mailFile)

    # Sender of the message.
    from_name, from_addr = get_info_from_mail_field(parsed['from'])
    update_db(person_db, Person(from_name, from_addr))

    # The person the message was delivered to, i.e. ourselves.
    self_name, self_addr = get_info_from_mail_field(parsed['Delivered-To'])
    me = Person(self_name, self_addr)

    # Collect further addresses of ours from the forwarding headers.
    extra_headers = ('X-Original-To', 'Resent-From')
    for field_name in extra_headers:
        extra_addr = get_info_from_mail_field(parsed[field_name])[1]
        if extra_addr:
            me.addEmail(extra_addr)

    update_db(person_db, me)

    # Build the To relations first, then the Cc relations.
    to_values = parsed.get_all('to', [])
    link_people(person_db, me, to_values)
    cc_values = parsed.get_all('cc', [])
    link_people(person_db, me, cc_values)
コード例 #3
0
def test_prepend_headerfields_as_header_objs():
    """Header values that are not plain ``str`` objects are still handled."""
    raw_message = 'Subject: föö'.encode('utf-8')
    msg = BytesParser(policy=compat32).parsebytes(raw_message)

    # Under compat32 the non-ASCII subject does not come back as a str.
    first_subject = msg.get_all("Subject")[0]
    assert not isinstance(first_subject, str)

    new_fields = [("To", "foo"), ("From", "bar")]
    expected_items = [
        ('To', 'foo'),
        ('From', 'bar'),
        ('Subject', '=?unknown-8bit?b?ZsO2w7Y=?='),
    ]
    result = pgp.prepend_header_fields(msg, new_fields)
    assert result.items() == expected_items
コード例 #4
0
def test_prepend_headerfields_encoded():
    """Raw non-ASCII header text survives prepending new header fields."""
    raw_message = 'Subject: föö'.encode('utf-8')
    msg = BytesParser(policy=default_policy).parsebytes(raw_message)

    # With this policy the subject is exposed as decoded text.
    first_subject = msg.get_all("Subject")[0]
    assert first_subject == "föö"

    new_fields = [("To", "foo"), ("From", "bar")]
    expected_items = [
        ('To', 'foo'),
        ('From', 'bar'),
        ('Subject', 'föö'),
    ]
    result = pgp.prepend_header_fields(msg, new_fields)
    assert result.items() == expected_items
コード例 #5
0
ファイル: parser.py プロジェクト: ww9rivers/c9r
    def __call__(self, content):
        '''Parse an email message in "content", given as bytes or as text.

        /content/       Standard encoded email message content (bytes or str).

        Returns the parsed message as a dict with the keys subject, date,
        received, body, html, from, to, cc, bcc and attachments. "body" and
        "html" hold the concatenated text/plain and text/html parts as str,
        or None when the message contains no such part.
        '''
        if isinstance(content, bytes):
            msgobj = BytesParser().parsebytes(content)
        else:
            msgobj = StrParser().parse(StringIO(content))
        subject = parse_header('Subject', msgobj)
        date = parse_header('Date', msgobj)
        received = []
        for hop in (msgobj.get_all('Received') or []):
            # The split result is consumed as alternating key/value tokens
            # (odd indexes are keys, even indexes from 2 on are values);
            # presumably re_received has one capturing group -- confirm.
            lx = self.re_received.split(hop)
            fields = dict(zip(lx[1::2], [x.strip() for x in lx[2::2]]))
            stamp = fields.get(';')
            if stamp:
                fields['time'] = parse_time(stamp)
            received.append(fields)
        fromaddr = parse_addr(msgobj, 'From')
        if date:
            date = date.replace(',', '')
        logger.debug('Parsing message: Date={0}, Subject={1}'.format(date, subject))
        #-------- Parsing attachments:
        attachments = []
        body = None
        html = None
        for part in msgobj.walk():
            attachment = parse_attachment(part)
            if attachment:
                attachments.append(attachment)
                continue
            # Not an attachment: only text/plain and text/html are kept.
            content_type = part.get_content_type()
            if content_type not in ('text/plain', 'text/html'):
                logger.debug('Ignored: Content_type "{0}" in message "{1}" from {2}, Date={3}'.format(content_type, subject, fromaddr, date))
                continue
            raw = part.get_payload(decode=True) or b''
            charset = part.get_content_charset() or 'ascii'
            try:
                text = str(raw, charset, 'replace')
            except LookupError:
                # Unknown charset label: degrade to ASCII with replacement
                # characters instead of aborting the whole parse.
                text = str(raw, 'ascii', 'replace')
            # The text is kept as str. Re-encoding it to bytes and calling
            # str() on the result would store the "b'...'" repr instead.
            if content_type == 'text/plain':
                body = text if body is None else body + text
            else:
                html = text if html is None else html + text
        return {
            'subject' : subject,
            'date' : date,
            'received': received,
            'body' : body,
            'html' : html,
            'from' : fromaddr,
            'to' : parse_addr(msgobj, 'To'),
            'cc' : parse_addr(msgobj, 'CC'),
            'bcc' : parse_addr(msgobj, 'BCC'),
            'attachments': attachments
            }
コード例 #6
0
def email_analysis(filename, exclude_private_ip):
    """Print a JSON report about the e-mail stored in ``filename``.

    The report is a dict with a single ``"data"`` list holding one entry
    with the file name, the From/Sender/Subject/X-Originating-IP headers,
    the attachment names, the relay hops found in the ``Received`` headers
    (full text and IP addresses) and the URLs/domains found in the text
    parts.

    Args:
        filename: Path of the e-mail file, opened in binary mode.
        exclude_private_ip: When true, private addresses are left out of
            ``relay_ip``.

    Nothing is printed when the parsed message is falsy.
    """
    urlList = []
    domainList = []
    hopList = []
    hopListIP = []
    attachList = []
    data = {"data": []}

    with open(filename, "rb") as fp:
        msg = BytesParser(policy=policy.default).parse(fp)

    if not msg:
        return

    # Identify each url or attachment reported in the eMail body
    extractor = None
    for part in msg.walk():
        if part.get_content_type() in ("text/plain", "text/html"):
            # One extractor is created on first use and shared by all parts.
            if extractor is None:
                extractor = URLExtract()
            urlList.extend(extractor.find_urls(part.get_content()))
        elif part.get_filename():
            attachList.append(part.get_filename())

    # Identify each domain reported in the eMail body
    for url in urlList:
        analyzeddomain = tldcache(url).registered_domain
        if analyzeddomain:
            domainList.append(analyzeddomain)

    # Remove Duplicate
    urlList = list(set(urlList))
    domainList = list(set(domainList))

    # A sender obfuscation technique involves entering two e-mails. Only the
    # last one is the real one, so the last address found is reported.
    mail_from = ""
    if msg["From"]:
        found = re.findall(
            r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}", msg["From"],
            re.IGNORECASE)
        # A From header without any address yields no match; keep "" then
        # instead of indexing an empty list.
        if found:
            mail_from = found[-1]

    mail_sender = msg["Sender"] or ""
    mail_subject = msg["Subject"] or ""
    mail_xorigip = msg["X-Originating-IP"] or ""

    entry = {
        "Filename": os.path.basename(filename),
        "From": mail_from,
        "Sender": mail_sender,
        "Subject": mail_subject,
        "X-Originating-IP": mail_xorigip,
        "attachments": [],
        "relay_full": [],
        "relay_ip": [],
        "urls": [],
        "domains": []
    }
    data["data"].append(entry)

    # Identify each relay, oldest hop first
    received = msg.get_all("Received")
    if received:
        received.reverse()
        for line in received:
            hops = re.findall(
                r"from\s+(.*?)\s+by(.*?)(?:(?:with|via)(.*?)(?:id|$)|id|$)",
                line, re.DOTALL | re.X)
            for hop in hops:

                ipv4_address = re.findall(r"[0-9]+(?:\.[0-9]+){3}", hop[0],
                                          re.DOTALL | re.X)

                # https://gist.github.com/dfee/6ed3a4b05cfe7a6faf40a2102408d5d8
                ipv6_address = re.findall(
                    r"(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,4}:[^\s:](?:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))|(?:::(?:ffff(?::0{1,4}){0,1}:){0,1}[^\s:](?:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))|(?:fe80:(?::(?:(?:[0-9a-fA-F]){1,4})){0,4}%[0-9a-zA-Z]{1,})|(?::(?:(?::(?:(?:[0-9a-fA-F]){1,4})){1,7}|:))|(?:(?:(?:[0-9a-fA-F]){1,4}):(?:(?::(?:(?:[0-9a-fA-F]){1,4})){1,6}))|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,2}(?::(?:(?:[0-9a-fA-F]){1,4})){1,5})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,3}(?::(?:(?:[0-9a-fA-F]){1,4})){1,4})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,4}(?::(?:(?:[0-9a-fA-F]){1,4})){1,3})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,5}(?::(?:(?:[0-9a-fA-F]){1,4})){1,2})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,6}:(?:(?:[0-9a-fA-F]){1,4}))|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,7}:)|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){7,7}(?:(?:[0-9a-fA-F]){1,4}))",
                    hop[0], re.DOTALL | re.X)

                # Only the first IPv4 and the first IPv6 match of a hop are
                # considered.
                for candidates in (ipv4_address, ipv6_address):
                    if not candidates:
                        continue
                    try:
                        parsed_ip = ipaddress.ip_address(candidates[0])
                    except ValueError:
                        # The regexes are looser than ipaddress (they accept
                        # e.g. "999.1.1.1"); skip text that is not an address.
                        continue
                    if parsed_ip.is_private and exclude_private_ip:
                        continue
                    hopListIP.append(candidates[0])

                if hop[0]:
                    hopList.append(hop[0])

    # Each non-empty list is stored as a single index -> value dictionary.
    if attachList:
        entry["attachments"].append(dict(enumerate(attachList)))

    if hopList:
        entry["relay_full"].append(dict(enumerate(hopList)))

    if hopListIP:
        entry["relay_ip"].append(dict(enumerate(hopListIP)))

    if urlList:
        entry["urls"].append(dict(enumerate(urlList)))
        entry["domains"].append(dict(enumerate(domainList)))

    print(json.dumps(data, indent=4))
コード例 #7
0
def email_analysis(filename, exclude_private_ip, check_spf):
    """Print a JSON report about the e-mail stored in ``filename``.

    The report contains the addresses found in the main headers, the relay
    hops taken from the ``Received`` headers, the URLs and registered
    domains found in the text parts, the hashes of the attachments and,
    optionally, an SPF verdict. Fields without data stay ``None``.

    Args:
        filename: Path of the e-mail file, opened in binary mode.
        exclude_private_ip: When true, ``relay_ip`` lists only the
            non-private relay addresses.
        check_spf: When true, the non-private relay addresses are checked
            with ``spf.check2`` against the From address; ``spf`` becomes
            True on the first "pass".

    Nothing is printed when the parsed message is falsy.
    """
    urlList = []
    hopList = []
    hopListIP = []
    domainList = []
    attachmentsList = []
    hopListIPnoPrivate = []

    resultmeioc = {
        "filename": os.path.basename(filename),
        "from": None,
        "sender": None,
        "x-sender": None,
        "to": None,
        "cc": None,
        "bcc": None,
        "envelope-to": None,
        "delivered-to": None,
        "subject": None,
        "x-originating-ip": None,
        "relay_full": None,
        "relay_ip": None,
        "spf": None,
        "urls": None,
        "domains": None,
        "attachments": None
    }

    with open(filename, "rb") as fp:
        msg = BytesParser(policy=policy.default).parse(fp)

    if not msg:
        return

    email_re = re.compile(
        r"[A-Za-z0-9.!#$%&'*+\/=?^_`{|}~\-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}",
        re.IGNORECASE)

    def find_addresses(header_name):
        # All e-mail addresses in the named header, [] when it is missing.
        value = msg[header_name]
        return email_re.findall(value) if value else []

    def record_relay_ips(candidates):
        # Store every candidate that ipaddress accepts as a relay address.
        for candidate in candidates:
            try:
                parsed_ip = ipaddress.ip_address(candidate)
            except ValueError:
                # The regexes are looser than ipaddress (they accept e.g.
                # "999.1.1.1"); skip text that is not an address.
                continue
            hopListIP.append(candidate)
            if not parsed_ip.is_private:
                hopListIPnoPrivate.append(candidate)

    #
    # Header analysis
    #

    # A sender obfuscation technique involves entering two e-mails. Only the
    # last one is the real one, so the last address found is reported.
    mail_from = find_addresses("From")
    if mail_from:
        resultmeioc["from"] = mail_from[-1]

    for header_name, key in (("Sender", "sender"), ("X-Sender", "x-sender")):
        found = find_addresses(header_name)
        if found:
            resultmeioc[key] = found[-1]

    for header_name, key in (("To", "to"), ("Cc", "cc"),
                             ("Envelope-to", "envelope-to")):
        found = find_addresses(header_name)
        if found:
            # Remove possible duplicates and create a numbered dictionary
            resultmeioc[key] = dict(enumerate(set(found)))

    if msg["Bcc"]:
        resultmeioc["bcc"] = msg["Bcc"]

    if msg["Delivered-To"]:
        resultmeioc["delivered-to"] = msg["Delivered-To"]

    if msg["X-Originating-IP"]:
        # Usually the IP is in square brackets, remove them if present.
        resultmeioc["x-originating-ip"] = msg["X-Originating-IP"].replace(
            "[", "").replace("]", "")

    if msg["Subject"]:
        resultmeioc["subject"] = msg["Subject"]

    # Identify each relay, oldest hop first
    received = msg.get_all("Received")
    if received:
        received.reverse()
        for line in received:
            hops = re.findall(
                r"from\s+(.*?)\s+by(.*?)(?:(?:with|via)(.*?)(?:id|$)|id|$)",
                line, re.DOTALL | re.X)
            for hop in hops:

                ipv4_address = re.findall(r"[0-9]+(?:\.[0-9]+){3}", hop[0],
                                          re.DOTALL | re.X)

                # https://gist.github.com/dfee/6ed3a4b05cfe7a6faf40a2102408d5d8
                ipv6_address = re.findall(
                    r"(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,4}:[^\s:](?:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))|(?:::(?:ffff(?::0{1,4}){0,1}:){0,1}[^\s:](?:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))|(?:fe80:(?::(?:(?:[0-9a-fA-F]){1,4})){0,4}%[0-9a-zA-Z]{1,})|(?::(?:(?::(?:(?:[0-9a-fA-F]){1,4})){1,7}|:))|(?:(?:(?:[0-9a-fA-F]){1,4}):(?:(?::(?:(?:[0-9a-fA-F]){1,4})){1,6}))|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,2}(?::(?:(?:[0-9a-fA-F]){1,4})){1,5})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,3}(?::(?:(?:[0-9a-fA-F]){1,4})){1,4})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,4}(?::(?:(?:[0-9a-fA-F]){1,4})){1,3})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,5}(?::(?:(?:[0-9a-fA-F]){1,4})){1,2})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,6}:(?:(?:[0-9a-fA-F]){1,4}))|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,7}:)|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){7,7}(?:(?:[0-9a-fA-F]){1,4}))",
                    hop[0], re.DOTALL | re.X)

                record_relay_ips(ipv4_address)
                # NOTE(review): the previous condition ended in
                # `and not "6::"`, which is always false and dropped every
                # IPv6 relay. It presumably meant to discard the spurious
                # "6::" match -- confirm the intended filter.
                record_relay_ips(ip for ip in ipv6_address if ip != "6::")

                if hop[0]:
                    hopList.append(hop[0])

    if hopList:
        resultmeioc["relay_full"] = dict(enumerate(hopList))

    if hopListIP:
        if exclude_private_ip:
            resultmeioc["relay_ip"] = dict(enumerate(hopListIPnoPrivate))
        else:
            resultmeioc["relay_ip"] = dict(enumerate(hopListIP))

    #
    # Body analysis
    #
    for part in msg.walk():
        content_type = part.get_content_type()

        if content_type == "text/plain":
            # https://gist.github.com/dperini/729294
            urlList.extend(
                re.findall(
                    r"(?:(?:(?:https?|ftp):)?\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?[a-z0-9\u00a1-\uffff]\.)+(?:[a-z\u00a1-\uffff]{2,}\.?))(?::\d{2,5})?(?:[/?#]\S*)?",
                    part.get_content(),
                    re.UNICODE | re.IGNORECASE | re.MULTILINE))

        if content_type == "text/html":
            # Best effort: a body with incorrect or unencoded HTML must not
            # abort the whole analysis, so failures here are ignored.
            try:
                soup = BeautifulSoup(part.get_content(), "html.parser")
                for tag in soup.find_all("a", href=True):
                    urlList.append(tag.get("href"))
            except Exception:
                pass

        attachment_name = part.get_filename()
        if attachment_name:
            # Decode once and hash the same bytes three times.
            payload = part.get_payload(decode=True)
            if payload:
                attachmentsList.append({
                    "filename": attachment_name,
                    "MD5": hashlib.md5(payload).hexdigest(),
                    "SHA1": hashlib.sha1(payload).hexdigest(),
                    "SHA256": hashlib.sha256(payload).hexdigest()
                })

    # Identify each domain reported in the eMail body
    for url in urlList:
        analyzeddomain = tldcache(url).registered_domain
        if analyzeddomain:
            domainList.append(analyzeddomain)

    # Remove Duplicate
    urlList = list(set(urlList))
    domainList = list(set(domainList))

    if urlList:
        resultmeioc["urls"] = dict(enumerate(urlList))
        resultmeioc["domains"] = dict(enumerate(domainList))

    if attachmentsList:
        resultmeioc["attachments"] = attachmentsList

    #
    # Verify the SPF record if requested
    #
    if check_spf:
        testspf = False
        if mail_from:
            sender = mail_from[-1]
            sender_domain = sender.split("@")[1]
            for ip in hopListIPnoPrivate:
                # A failing lookup counts as "no pass" for this relay.
                try:
                    resultspf = spf.check2(ip, sender, sender_domain)[0]
                except Exception:
                    continue
                if resultspf == "pass":
                    testspf = True
                    break

        resultmeioc["spf"] = testspf

    print(json.dumps(resultmeioc, indent=4))
コード例 #8
0
def get_eml_subject(eml_file):
    """Return the first Subject header of the message stored at ``eml_file``.

    The file is opened in binary mode and parsed with the default e-mail
    policy. Returns None when the message has no Subject header.
    """
    with open(eml_file, 'rb') as fp:
        msg = BytesParser(policy=policy.default).parse(fp)
    # get_all() returns None when the header is absent, so guard before
    # indexing.
    subjects = msg.get_all('Subject')
    return subjects[0] if subjects else None